From f2b91d8d385d2cef3a1e3b3846f2dde4a6720c43 Mon Sep 17 00:00:00 2001 From: Zhang Zhen Date: Wed, 15 Apr 2015 16:12:51 -0700 Subject: vfs: delete vfs_readdir function declaration vfs_readdir() was replaced by iterate_dir() in commit 5c0ba4e0762e ("[readdir] introduce iterate_dir() and dir_context"). Signed-off-by: Zhang Zhen Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index d502e5436c84..60733bdc74b4 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2684,7 +2684,6 @@ void inode_sub_bytes(struct inode *inode, loff_t bytes); loff_t inode_get_bytes(struct inode *inode); void inode_set_bytes(struct inode *inode, loff_t bytes); -extern int vfs_readdir(struct file *, filldir_t, void *); extern int iterate_dir(struct file *, struct dir_context *); extern int vfs_stat(const char __user *, struct kstat *); -- cgit v1.2.3 From bdddbcd45fd191a0213e6d2a032eb55d18bd1fc0 Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Wed, 15 Apr 2015 16:12:54 -0700 Subject: mm/oom_kill.c: fix typo in comment Alter 'taks' -> 'task' Signed-off-by: Yaowei Bai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 52628c819bf7..2b665da1b3c9 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -408,7 +408,7 @@ bool oom_killer_disabled __read_mostly; static DECLARE_RWSEM(oom_sem); /** - * mark_tsk_oom_victim - marks the given taks as OOM victim. + * mark_tsk_oom_victim - marks the given task as OOM victim. * @tsk: task to mark * * Has to be called with oom_sem taken for read and never after -- cgit v1.2.3 From d7e4a2ea51c1b0ac0b2d53f5ab4a2e878b1c4aec Mon Sep 17 00:00:00 2001 From: Zhang Zhen Date: Wed, 15 Apr 2015 16:12:57 -0700 Subject: mm: refactor zone_movable_is_highmem() All callers of zone_movable_is_highmem are under #ifdef CONFIG_HIGHMEM, so the else branch return 0 is not needed. Signed-off-by: Zhang Zhen Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 2782df47101e..54d74f6eb233 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -842,16 +842,16 @@ static inline int populated_zone(struct zone *zone) extern int movable_zone; +#ifdef CONFIG_HIGHMEM static inline int zone_movable_is_highmem(void) { -#if defined(CONFIG_HIGHMEM) && defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP return movable_zone == ZONE_HIGHMEM; -#elif defined(CONFIG_HIGHMEM) - return (ZONE_MOVABLE - 1) == ZONE_HIGHMEM; #else - return 0; + return (ZONE_MOVABLE - 1) == ZONE_HIGHMEM; #endif } +#endif static inline int is_highmem_idx(enum zone_type idx) { -- cgit v1.2.3 From adbe427b92d18cf3f801e82e9cd664fbe31cea3c Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 15 Apr 2015 16:13:00 -0700 Subject: memcg: zap mem_cgroup_lookup() mem_cgroup_lookup() is a wrapper around mem_cgroup_from_id(), which checks that id != 0 before issuing the function call. Today, there is no point in this additional check apart from optimization, because there is no css with id <= 0, so that css_from_id, called by mem_cgroup_from_id, will return NULL for any id <= 0. 
Since mem_cgroup_from_id is only called from mem_cgroup_lookup, let us zap mem_cgroup_lookup, substituting calls to it with mem_cgroup_from_id and moving the check if id > 0 to css_from_id. Signed-off-by: Vladimir Davydov Acked-by: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 2 +- mm/memcontrol.c | 24 ++++++++---------------- 2 files changed, 9 insertions(+), 17 deletions(-) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a220fdb66568..59aa339a245c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5451,7 +5451,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) { WARN_ON_ONCE(!rcu_read_lock_held()); - return idr_find(&ss->css_idr, id); + return id > 0 ? idr_find(&ss->css_idr, id) : NULL; } #ifdef CONFIG_CGROUP_DEBUG diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c3f09b2dda5f..0a06628470cc 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -460,6 +460,12 @@ static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) return memcg->css.id; } +/* + * A helper function to get mem_cgroup from ID. must be called under + * rcu_read_lock(). The caller is responsible for calling + * css_tryget_online() if the mem_cgroup is used for charging. (dropping + * refcnt from swap can be called against removed memcg.) + */ static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) { struct cgroup_subsys_state *css; @@ -2348,20 +2354,6 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) css_put_many(&memcg->css, nr_pages); } -/* - * A helper function to get mem_cgroup from ID. must be called under - * rcu_read_lock(). The caller is responsible for calling - * css_tryget_online() if the mem_cgroup is used for charging. (dropping - * refcnt from swap can be called against removed memcg.) - */ -static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) -{ - /* ID 0 is unused ID */ - if (!id) - return NULL; - return mem_cgroup_from_id(id); -} - /* * try_get_mem_cgroup_from_page - look up page's memcg association * @page: the page @@ -2388,7 +2380,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) ent.val = page_private(page); id = lookup_swap_cgroup_id(ent); rcu_read_lock(); - memcg = mem_cgroup_lookup(id); + memcg = mem_cgroup_from_id(id); if (memcg && !css_tryget_online(&memcg->css)) memcg = NULL; rcu_read_unlock(); @@ -5869,7 +5861,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry) id = swap_cgroup_record(entry, 0); rcu_read_lock(); - memcg = mem_cgroup_lookup(id); + memcg = mem_cgroup_from_id(id); if (memcg) { if (!mem_cgroup_is_root(memcg)) page_counter_uncharge(&memcg->memsw, 1); -- cgit v1.2.3 From 2564f683d1a6cb76893199ce68e6484725621e49 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 15 Apr 2015 16:13:03 -0700 Subject: memcg: remove obsolete comment Low and high watermarks, as they defined in the TODO to the mem_cgroup struct, have already been implemented by Johannes, so remove the stale comment. Signed-off-by: Vladimir Davydov Cc: Johannes Weiner Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0a06628470cc..74a9641d8f9f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -259,11 +259,6 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); * page cache and RSS per cgroup. 
We would eventually like to provide * statistics based on the statistics developed by Rik Van Riel for clock-pro, * to help the administrator determine what knobs to tune. - * - * TODO: Add a water mark for the memory controller. Reclaim will begin when - * we hit the water mark. May be even add a low water mark, such that - * no reclaim occurs from a cgroup at it's low water mark, this is - * a feature that will be implemented much later in the future. */ struct mem_cgroup { struct cgroup_subsys_state css; -- cgit v1.2.3 From 64d37a2baf5e5c0f1009c0ef290a9027de721d66 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 15 Apr 2015 16:13:05 -0700 Subject: mm/memory-failure.c: define page types for action_result() in one place This cleanup patch moves all strings passed to action_result() into a single array action_page_type so that a reader can easily find which kind of action results are possible. And this patch also fixes the odd lines to be printed out, like "unknown page state page" or "free buddy, 2nd try page". [akpm@linux-foundation.org: rename messages, per David] [akpm@linux-foundation.org: s/DIRTY_UNEVICTABLE_LRU/CLEAN_UNEVICTABLE_LRU/, per Andi] Signed-off-by: Naoya Horiguchi Reviewed-by: Andi Kleen Cc: Tony Luck Cc: "Xie XiuQi" Cc: Steven Rostedt Cc: Chen Gong Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 108 +++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 77 insertions(+), 31 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index d487f8dc6d39..5fd8931d8c31 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -521,6 +521,52 @@ static const char *action_name[] = { [RECOVERED] = "Recovered", }; +enum action_page_type { + MSG_KERNEL, + MSG_KERNEL_HIGH_ORDER, + MSG_SLAB, + MSG_DIFFERENT_COMPOUND, + MSG_POISONED_HUGE, + MSG_HUGE, + MSG_FREE_HUGE, + MSG_UNMAP_FAILED, + MSG_DIRTY_SWAPCACHE, + MSG_CLEAN_SWAPCACHE, + MSG_DIRTY_MLOCKED_LRU, + MSG_CLEAN_MLOCKED_LRU, + MSG_DIRTY_UNEVICTABLE_LRU, + MSG_CLEAN_UNEVICTABLE_LRU, + MSG_DIRTY_LRU, + MSG_CLEAN_LRU, + MSG_TRUNCATED_LRU, + MSG_BUDDY, + MSG_BUDDY_2ND, + MSG_UNKNOWN, +}; + +static const char * const action_page_types[] = { + [MSG_KERNEL] = "reserved kernel page", + [MSG_KERNEL_HIGH_ORDER] = "high-order kernel page", + [MSG_SLAB] = "kernel slab page", + [MSG_DIFFERENT_COMPOUND] = "different compound page after locking", + [MSG_POISONED_HUGE] = "huge page already hardware poisoned", + [MSG_HUGE] = "huge page", + [MSG_FREE_HUGE] = "free huge page", + [MSG_UNMAP_FAILED] = "unmapping failed page", + [MSG_DIRTY_SWAPCACHE] = "dirty swapcache page", + [MSG_CLEAN_SWAPCACHE] = "clean swapcache page", + [MSG_DIRTY_MLOCKED_LRU] = "dirty mlocked LRU page", + [MSG_CLEAN_MLOCKED_LRU] = "clean mlocked LRU page", + [MSG_DIRTY_UNEVICTABLE_LRU] = "dirty unevictable LRU page", + [MSG_CLEAN_UNEVICTABLE_LRU] = "clean unevictable LRU page", + [MSG_DIRTY_LRU] = "dirty LRU page", + [MSG_CLEAN_LRU] = "clean LRU page", + [MSG_TRUNCATED_LRU] = "already truncated LRU page", + [MSG_BUDDY] = "free buddy page", + [MSG_BUDDY_2ND] = "free buddy page (2nd try)", + [MSG_UNKNOWN] = "unknown page", +}; + /* * XXX: It is possible that a page is isolated from LRU cache, * and then kept in swap cache or failed to remove from page cache.
@@ -777,10 +823,10 @@ static int me_huge_page(struct page *p, unsigned long pfn) static struct page_state { unsigned long mask; unsigned long res; - char *msg; + enum action_page_type type; int (*action)(struct page *p, unsigned long pfn); } error_states[] = { - { reserved, reserved, "reserved kernel", me_kernel }, + { reserved, reserved, MSG_KERNEL, me_kernel }, /* * free pages are specially detected outside this table: * PG_buddy pages only make a small fraction of all free pages. @@ -791,31 +837,31 @@ static struct page_state { * currently unused objects without touching them. But just * treat it as standard kernel for now. */ - { slab, slab, "kernel slab", me_kernel }, + { slab, slab, MSG_SLAB, me_kernel }, #ifdef CONFIG_PAGEFLAGS_EXTENDED - { head, head, "huge", me_huge_page }, - { tail, tail, "huge", me_huge_page }, + { head, head, MSG_HUGE, me_huge_page }, + { tail, tail, MSG_HUGE, me_huge_page }, #else - { compound, compound, "huge", me_huge_page }, + { compound, compound, MSG_HUGE, me_huge_page }, #endif - { sc|dirty, sc|dirty, "dirty swapcache", me_swapcache_dirty }, - { sc|dirty, sc, "clean swapcache", me_swapcache_clean }, + { sc|dirty, sc|dirty, MSG_DIRTY_SWAPCACHE, me_swapcache_dirty }, + { sc|dirty, sc, MSG_CLEAN_SWAPCACHE, me_swapcache_clean }, - { mlock|dirty, mlock|dirty, "dirty mlocked LRU", me_pagecache_dirty }, - { mlock|dirty, mlock, "clean mlocked LRU", me_pagecache_clean }, + { mlock|dirty, mlock|dirty, MSG_DIRTY_MLOCKED_LRU, me_pagecache_dirty }, + { mlock|dirty, mlock, MSG_CLEAN_MLOCKED_LRU, me_pagecache_clean }, - { unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty }, - { unevict|dirty, unevict, "clean unevictable LRU", me_pagecache_clean }, + { unevict|dirty, unevict|dirty, MSG_DIRTY_UNEVICTABLE_LRU, me_pagecache_dirty }, + { unevict|dirty, unevict, MSG_CLEAN_UNEVICTABLE_LRU, me_pagecache_clean }, - { lru|dirty, lru|dirty, "dirty LRU", me_pagecache_dirty }, - { lru|dirty, lru, "clean LRU", me_pagecache_clean }, + { lru|dirty, lru|dirty, MSG_DIRTY_LRU, me_pagecache_dirty }, + { lru|dirty, lru, MSG_CLEAN_LRU, me_pagecache_clean }, /* * Catchall entry: must be at end. */ - { 0, 0, "unknown page state", me_unknown }, + { 0, 0, MSG_UNKNOWN, me_unknown }, }; #undef dirty @@ -835,10 +881,10 @@ static struct page_state { * "Dirty/Clean" indication is not 100% accurate due to the possibility of * setting PG_dirty outside page lock. See also comment above set_page_dirty(). 
*/ -static void action_result(unsigned long pfn, char *msg, int result) +static void action_result(unsigned long pfn, enum action_page_type type, int result) { - pr_err("MCE %#lx: %s page recovery: %s\n", - pfn, msg, action_name[result]); + pr_err("MCE %#lx: recovery action for %s: %s\n", + pfn, action_page_types[type], action_name[result]); } static int page_action(struct page_state *ps, struct page *p, @@ -854,11 +900,11 @@ static int page_action(struct page_state *ps, struct page *p, count--; if (count != 0) { printk(KERN_ERR - "MCE %#lx: %s page still referenced by %d users\n", - pfn, ps->msg, count); + "MCE %#lx: %s still referenced by %d users\n", + pfn, action_page_types[ps->type], count); result = FAILED; } - action_result(pfn, ps->msg, result); + action_result(pfn, ps->type, result); /* Could do more checks here if page looks ok */ /* @@ -1106,7 +1152,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) if (!(flags & MF_COUNT_INCREASED) && !get_page_unless_zero(hpage)) { if (is_free_buddy_page(p)) { - action_result(pfn, "free buddy", DELAYED); + action_result(pfn, MSG_BUDDY, DELAYED); return 0; } else if (PageHuge(hpage)) { /* @@ -1123,12 +1169,12 @@ int memory_failure(unsigned long pfn, int trapno, int flags) } set_page_hwpoison_huge_page(hpage); res = dequeue_hwpoisoned_huge_page(hpage); - action_result(pfn, "free huge", + action_result(pfn, MSG_FREE_HUGE, res ? IGNORED : DELAYED); unlock_page(hpage); return res; } else { - action_result(pfn, "high order kernel", IGNORED); + action_result(pfn, MSG_KERNEL_HIGH_ORDER, IGNORED); return -EBUSY; } } @@ -1150,9 +1196,10 @@ int memory_failure(unsigned long pfn, int trapno, int flags) */ if (is_free_buddy_page(p)) { if (flags & MF_COUNT_INCREASED) - action_result(pfn, "free buddy", DELAYED); + action_result(pfn, MSG_BUDDY, DELAYED); else - action_result(pfn, "free buddy, 2nd try", DELAYED); + action_result(pfn, MSG_BUDDY_2ND, + DELAYED); return 0; } } @@ -1165,7 +1212,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) * If this happens just bail out. */ if (compound_head(p) != hpage) { - action_result(pfn, "different compound page after locking", IGNORED); + action_result(pfn, MSG_DIFFERENT_COMPOUND, IGNORED); res = -EBUSY; goto out; } @@ -1205,8 +1252,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) * on the head page to show that the hugepage is hwpoisoned */ if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) { - action_result(pfn, "hugepage already hardware poisoned", - IGNORED); + action_result(pfn, MSG_POISONED_HUGE, IGNORED); unlock_page(hpage); put_page(hpage); return 0; @@ -1235,7 +1281,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) */ if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage) != SWAP_SUCCESS) { - action_result(pfn, "unmapping failed", IGNORED); + action_result(pfn, MSG_UNMAP_FAILED, IGNORED); res = -EBUSY; goto out; } @@ -1244,7 +1290,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) * Torn down by someone else? */ if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) { - action_result(pfn, "already truncated LRU", IGNORED); + action_result(pfn, MSG_TRUNCATED_LRU, IGNORED); res = -EBUSY; goto out; } -- cgit v1.2.3 From e8c6158fef15a1532bd5242a0cd88565eedabe61 Mon Sep 17 00:00:00 2001 From: "Kirill A. 
Shutemov" Date: Wed, 15 Apr 2015 16:13:08 -0700 Subject: mm: consolidate all page-flags helpers in Currently we take a naive approach to page flags on compound pages - we set the flag on the page without consideration if the flag makes sense for tail page or for compound page in general. This patchset try to sort this out by defining per-flag policy on what need to be done if page-flag helper operate on compound page. The last patch in the patchset also sanitizes usege of page->mapping for tail pages. We don't define the meaning of page->mapping for tail pages. Currently it's always NULL, which can be inconsistent with head page and potentially lead to problems. For now I caught one case of illegal usage of page flags or ->mapping: sound subsystem allocates pages with __GFP_COMP and maps them with PTEs. It leads to setting dirty bit on tail pages and access to tail_page's ->mapping. I don't see any bad behaviour caused by this, but worth fixing anyway. This patchset makes more sense if you take my THP refcounting into account: we will see more compound pages mapped with PTEs and we need to define behaviour of flags on compound pages to avoid bugs. This patch (of 16): We have page-flags helper function declarations/definitions spread over several header files. Let's consolidate them in . Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Acked-by: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Vlastimil Babka Cc: Christoph Lameter Cc: Naoya Horiguchi Cc: Steve Capper Cc: "Aneesh Kumar K.V" Cc: Johannes Weiner Cc: Michal Hocko Cc: Jerome Marchand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 7 ---- include/linux/ksm.h | 17 -------- include/linux/mm.h | 81 -------------------------------------- include/linux/page-flags.h | 96 ++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 96 insertions(+), 105 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 7b5785032049..1a782733a420 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -41,8 +41,6 @@ extern int hugetlb_max_hstate __read_mostly; struct hugepage_subpool *hugepage_new_subpool(long nr_blocks); void hugepage_put_subpool(struct hugepage_subpool *spool); -int PageHuge(struct page *page); - void reset_vma_resv_huge_pages(struct vm_area_struct *vma); int hugetlb_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); int hugetlb_overcommit_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); @@ -109,11 +107,6 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, #else /* !CONFIG_HUGETLB_PAGE */ -static inline int PageHuge(struct page *page) -{ - return 0; -} - static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma) { } diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 3be6bb18562d..7ae216a39c9e 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -35,18 +35,6 @@ static inline void ksm_exit(struct mm_struct *mm) __ksm_exit(mm); } -/* - * A KSM page is one of those write-protected "shared pages" or "merged pages" - * which KSM maps into multiple mms, wherever identical anonymous page content - * is found in VM_MERGEABLE vmas. It's a PageAnon page, pointing not to any - * anon_vma, but to that page's node of the stable tree. 
- */ -static inline int PageKsm(struct page *page) -{ - return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) == - (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); -} - static inline struct stable_node *page_stable_node(struct page *page) { return PageKsm(page) ? page_rmapping(page) : NULL; @@ -87,11 +75,6 @@ static inline void ksm_exit(struct mm_struct *mm) { } -static inline int PageKsm(struct page *page) -{ - return 0; -} - #ifdef CONFIG_MMU static inline int ksm_madvise(struct vm_area_struct *vma, unsigned long start, unsigned long end, int advice, unsigned long *vm_flags) diff --git a/include/linux/mm.h b/include/linux/mm.h index 6571dd78e984..fb1fc38b01ce 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -494,15 +494,6 @@ static inline int page_count(struct page *page) return atomic_read(&compound_head(page)->_count); } -#ifdef CONFIG_HUGETLB_PAGE -extern int PageHeadHuge(struct page *page_head); -#else /* CONFIG_HUGETLB_PAGE */ -static inline int PageHeadHuge(struct page *page_head) -{ - return 0; -} -#endif /* CONFIG_HUGETLB_PAGE */ - static inline bool __compound_tail_refcounted(struct page *page) { return !PageSlab(page) && !PageHeadHuge(page); @@ -571,53 +562,6 @@ static inline void init_page_count(struct page *page) atomic_set(&page->_count, 1); } -/* - * PageBuddy() indicate that the page is free and in the buddy system - * (see mm/page_alloc.c). - * - * PAGE_BUDDY_MAPCOUNT_VALUE must be <= -2 but better not too close to - * -2 so that an underflow of the page_mapcount() won't be mistaken - * for a genuine PAGE_BUDDY_MAPCOUNT_VALUE. -128 can be created very - * efficiently by most CPU architectures. - */ -#define PAGE_BUDDY_MAPCOUNT_VALUE (-128) - -static inline int PageBuddy(struct page *page) -{ - return atomic_read(&page->_mapcount) == PAGE_BUDDY_MAPCOUNT_VALUE; -} - -static inline void __SetPageBuddy(struct page *page) -{ - VM_BUG_ON_PAGE(atomic_read(&page->_mapcount) != -1, page); - atomic_set(&page->_mapcount, PAGE_BUDDY_MAPCOUNT_VALUE); -} - -static inline void __ClearPageBuddy(struct page *page) -{ - VM_BUG_ON_PAGE(!PageBuddy(page), page); - atomic_set(&page->_mapcount, -1); -} - -#define PAGE_BALLOON_MAPCOUNT_VALUE (-256) - -static inline int PageBalloon(struct page *page) -{ - return atomic_read(&page->_mapcount) == PAGE_BALLOON_MAPCOUNT_VALUE; -} - -static inline void __SetPageBalloon(struct page *page) -{ - VM_BUG_ON_PAGE(atomic_read(&page->_mapcount) != -1, page); - atomic_set(&page->_mapcount, PAGE_BALLOON_MAPCOUNT_VALUE); -} - -static inline void __ClearPageBalloon(struct page *page) -{ - VM_BUG_ON_PAGE(!PageBalloon(page), page); - atomic_set(&page->_mapcount, -1); -} - void put_page(struct page *page); void put_pages_list(struct list_head *pages); @@ -1006,26 +950,6 @@ void page_address_init(void); #define page_address_init() do { } while(0) #endif -/* - * On an anonymous page mapped into a user virtual memory area, - * page->mapping points to its anon_vma, not to a struct address_space; - * with the PAGE_MAPPING_ANON bit set to distinguish it. See rmap.h. - * - * On an anonymous page in a VM_MERGEABLE area, if CONFIG_KSM is enabled, - * the PAGE_MAPPING_KSM bit may be set along with the PAGE_MAPPING_ANON bit; - * and then page->mapping points, not to an anon_vma, but to a private - * structure which KSM associates with that merged page. See ksm.h. - * - * PAGE_MAPPING_KSM without PAGE_MAPPING_ANON is currently never used. 
- * - * Please note that, confusingly, "page_mapping" refers to the inode - * address_space which maps the page from disk; whereas "page_mapped" - * refers to user virtual address space into which the page is mapped. - */ -#define PAGE_MAPPING_ANON 1 -#define PAGE_MAPPING_KSM 2 -#define PAGE_MAPPING_FLAGS (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM) - extern struct address_space *page_mapping(struct page *page); /* Neutral page->mapping pointer to address_space or anon_vma or other */ @@ -1045,11 +969,6 @@ struct address_space *page_file_mapping(struct page *page) return page->mapping; } -static inline int PageAnon(struct page *page) -{ - return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0; -} - /* * Return the pagecache index of the passed page. Regular pagecache pages * use ->index whereas swapcache pages use ->private diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index c851ff92d5b3..84d10b65cec6 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -289,6 +289,47 @@ PAGEFLAG_FALSE(HWPoison) #define __PG_HWPOISON 0 #endif +/* + * On an anonymous page mapped into a user virtual memory area, + * page->mapping points to its anon_vma, not to a struct address_space; + * with the PAGE_MAPPING_ANON bit set to distinguish it. See rmap.h. + * + * On an anonymous page in a VM_MERGEABLE area, if CONFIG_KSM is enabled, + * the PAGE_MAPPING_KSM bit may be set along with the PAGE_MAPPING_ANON bit; + * and then page->mapping points, not to an anon_vma, but to a private + * structure which KSM associates with that merged page. See ksm.h. + * + * PAGE_MAPPING_KSM without PAGE_MAPPING_ANON is currently never used. + * + * Please note that, confusingly, "page_mapping" refers to the inode + * address_space which maps the page from disk; whereas "page_mapped" + * refers to user virtual address space into which the page is mapped. + */ +#define PAGE_MAPPING_ANON 1 +#define PAGE_MAPPING_KSM 2 +#define PAGE_MAPPING_FLAGS (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM) + +static inline int PageAnon(struct page *page) +{ + return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0; +} + +#ifdef CONFIG_KSM +/* + * A KSM page is one of those write-protected "shared pages" or "merged pages" + * which KSM maps into multiple mms, wherever identical anonymous page content + * is found in VM_MERGEABLE vmas. It's a PageAnon page, pointing not to any + * anon_vma, but to that page's node of the stable tree. + */ +static inline int PageKsm(struct page *page) +{ + return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) == + (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); +} +#else +TESTPAGEFLAG_FALSE(Ksm) +#endif + u64 stable_page_flags(struct page *page); static inline int PageUptodate(struct page *page) @@ -426,6 +467,14 @@ static inline void ClearPageCompound(struct page *page) #endif /* !PAGEFLAGS_EXTENDED */ +#ifdef CONFIG_HUGETLB_PAGE +int PageHuge(struct page *page); +int PageHeadHuge(struct page *page); +#else +TESTPAGEFLAG_FALSE(Huge) +TESTPAGEFLAG_FALSE(HeadHuge) +#endif + #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * PageHuge() only returns true for hugetlbfs pages, but not for @@ -479,6 +528,53 @@ static inline int PageTransTail(struct page *page) } #endif +/* + * PageBuddy() indicate that the page is free and in the buddy system + * (see mm/page_alloc.c). + * + * PAGE_BUDDY_MAPCOUNT_VALUE must be <= -2 but better not too close to + * -2 so that an underflow of the page_mapcount() won't be mistaken + * for a genuine PAGE_BUDDY_MAPCOUNT_VALUE. 
-128 can be created very + * efficiently by most CPU architectures. + */ +#define PAGE_BUDDY_MAPCOUNT_VALUE (-128) + +static inline int PageBuddy(struct page *page) +{ + return atomic_read(&page->_mapcount) == PAGE_BUDDY_MAPCOUNT_VALUE; +} + +static inline void __SetPageBuddy(struct page *page) +{ + VM_BUG_ON_PAGE(atomic_read(&page->_mapcount) != -1, page); + atomic_set(&page->_mapcount, PAGE_BUDDY_MAPCOUNT_VALUE); +} + +static inline void __ClearPageBuddy(struct page *page) +{ + VM_BUG_ON_PAGE(!PageBuddy(page), page); + atomic_set(&page->_mapcount, -1); +} + +#define PAGE_BALLOON_MAPCOUNT_VALUE (-256) + +static inline int PageBalloon(struct page *page) +{ + return atomic_read(&page->_mapcount) == PAGE_BALLOON_MAPCOUNT_VALUE; +} + +static inline void __SetPageBalloon(struct page *page) +{ + VM_BUG_ON_PAGE(atomic_read(&page->_mapcount) != -1, page); + atomic_set(&page->_mapcount, PAGE_BALLOON_MAPCOUNT_VALUE); +} + +static inline void __ClearPageBalloon(struct page *page) +{ + VM_BUG_ON_PAGE(!PageBalloon(page), page); + atomic_set(&page->_mapcount, -1); +} + /* * If network-based swap is enabled, sl*b must keep track of whether pages * were allocated from pfmemalloc reserves. -- cgit v1.2.3 From 8d63d99a5dfbdb997d12dd3c07b2070ca723db3b Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 15 Apr 2015 16:13:12 -0700 Subject: mm: avoid tail page refcounting on non-THP compound pages THP uses tail page refcounting to be able to split huge pages at any time. Tail page refcounting is not needed for other users of compound pages and it's harmful because of overhead. We try to exclude non-THP pages from tail page refcounting using __compound_tail_refcounted() check. It excludes most common non-THP compound pages: SL*B and hugetlb, but it doesn't catch rest of __GFP_COMP users -- drivers. And it's not only about overhead. Drivers might want to use compound pages to get refcounting semantics suitable for mapping high-order pages to userspace. But tail page refcounting breaks it. Tail page refcounting uses ->_mapcount in tail pages to store GUP pins on them. It means GUP pins would affect page_mapcount() for tail pages. It's not a problem for THP, because it never maps tail pages. But unlike THP, drivers map parts of compound pages with PTEs and it makes page_mapcount() be called for tail pages. In particular, GUP pins would shift PSS up and affect /proc/kpagecount for such pages. But, I'm not aware about anything which can lead to crash or other serious misbehaviour. Since currently all THP pages are anonymous and all drivers pages are not, we can fix the __compound_tail_refcounted() check by requiring PageAnon() to enable tail page refcounting. Signed-off-by: Kirill A. 
Shutemov Acked-by: Hugh Dickins Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index fb1fc38b01ce..515ec5045b60 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -496,7 +496,7 @@ static inline int page_count(struct page *page) static inline bool __compound_tail_refcounted(struct page *page) { - return !PageSlab(page) && !PageHeadHuge(page); + return PageAnon(page) && !PageSlab(page) && !PageHeadHuge(page); } /* -- cgit v1.2.3 From b3b3a99c5371e2e96a2c680e6ac20218bddbd422 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 15 Apr 2015 16:13:15 -0700 Subject: mm/migrate: check-before-clear PageSwapCache With the page flag sanitization patchset, an invalid usage of ClearPageSwapCache() is detected in migrate_page_copy(). migrate_page_copy() is shared by both the normal and hugepage (both thp and hugetlb) code paths, so let's check PageSwapCache() and clear it if it's set to avoid misuse of the invalid clear operation. Signed-off-by: Naoya Horiguchi Acked-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/migrate.c b/mm/migrate.c index a65ff72ab739..f53838fe3dfe 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -537,7 +537,8 @@ void migrate_page_copy(struct page *newpage, struct page *page) * Please do not reorder this without considering how mm/ksm.c's * get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache(). */ - ClearPageSwapCache(page); + if (PageSwapCache(page)) + ClearPageSwapCache(page); ClearPagePrivate(page); set_page_private(page, 0); -- cgit v1.2.3 From a4bb3ecdc12a78dc4d0e690d40ec10887b640786 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 15 Apr 2015 16:13:17 -0700 Subject: mm/page-writeback: check-before-clear PageReclaim With the page flag sanitization patchset, an invalid usage of ClearPageReclaim() is detected in set_page_dirty(). This can be called from __unmap_hugepage_range(), so let's check PageReclaim() before trying to clear it to avoid the misuse. Signed-off-by: Naoya Horiguchi Acked-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 0372411f38fc..5daf5568b9e1 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2228,7 +2228,8 @@ int set_page_dirty(struct page *page) * it will confuse readahead and make it restart the size rampup * process. But it's a trivial problem. */ - ClearPageReclaim(page); + if (PageReclaim(page)) + ClearPageReclaim(page); #ifdef CONFIG_BLOCK if (!spd) spd = __set_page_dirty_buffers; -- cgit v1.2.3 From 5bbe3547aa3ba5242366a322a28996872301b703 Mon Sep 17 00:00:00 2001 From: Eric B Munson Date: Wed, 15 Apr 2015 16:13:20 -0700 Subject: mm: allow compaction of unevictable pages Currently, pages which are marked as unevictable are protected from compaction, but not from other types of migration. The POSIX real time extension explicitly states that mlock() will prevent a major page fault, but the spirit of this is that mlock() should give a process the ability to control sources of latency, including minor page faults. However, the mlock manpage only explicitly says that a locked page will not be written to swap and this can cause some confusion.
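For illustration, the usage pattern at stake looks like this (a minimal, hypothetical sketch, not taken from the patch): a latency-sensitive process locks its whole address space up front, expecting no page faults of any kind afterwards; with compaction allowed to migrate unevictable pages, a minor fault can still occur on the next access to a migrated page.

#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	/*
	 * Lock all current and future mappings into RAM; requires a
	 * sufficient RLIMIT_MEMLOCK or CAP_IPC_LOCK.
	 */
	if (mlockall(MCL_CURRENT | MCL_FUTURE) != 0) {
		perror("mlockall");
		return 1;
	}
	/*
	 * Latency-critical work: no major faults can happen now, and no
	 * minor faults either -- unless compaction migrates these
	 * unevictable pages behind our back.
	 */
	return 0;
}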
The compaction code today does not give a developer who wants to avoid swap but wants to have large contiguous areas available any method to achieve this state. This patch introduces a sysctl for controlling compaction behavior with respect to the unevictable lru. Users who demand no page faults after a page is present can set compact_unevictable_allowed to 0 and users who need the large contiguous areas can enable compaction on locked memory by leaving the default value of 1. To illustrate this problem I wrote a quick test program that mmaps a large number of 1MB files filled with random data. These maps are created locked and read only. Then every other mmap is unmapped and I attempt to allocate huge pages to the static huge page pool. When the compact_unevictable_allowed sysctl is 0, I cannot allocate hugepages after fragmenting memory. When the value is set to 1, allocations succeed. Signed-off-by: Eric B Munson Acked-by: Michal Hocko Acked-by: Vlastimil Babka Acked-by: Christoph Lameter Acked-by: David Rientjes Acked-by: Rik van Riel Cc: Vlastimil Babka Cc: Thomas Gleixner Cc: Christoph Lameter Cc: Peter Zijlstra Cc: Mel Gorman Cc: David Rientjes Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 11 +++++++++++ include/linux/compaction.h | 1 + kernel/sysctl.c | 9 +++++++++ mm/compaction.c | 7 +++++++ 4 files changed, 28 insertions(+) diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 902b4574acfb..9832ec52f859 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -21,6 +21,7 @@ Currently, these files are in /proc/sys/vm: - admin_reserve_kbytes - block_dump - compact_memory +- compact_unevictable_allowed - dirty_background_bytes - dirty_background_ratio - dirty_bytes @@ -106,6 +107,16 @@ huge pages although processes will also directly compact memory as required. ============================================================== +compact_unevictable_allowed + +Available only when CONFIG_COMPACTION is set. When set to 1, compaction is +allowed to examine the unevictable lru (mlocked pages) for pages to compact. +This should be used on systems where stalls for minor page faults are an +acceptable trade for large contiguous free memory. Set to 0 to prevent +compaction from moving pages that are unevictable. Default value is 1. 
+ +============================================================== + dirty_background_bytes Contains the amount of dirty memory at which the background kernel diff --git a/include/linux/compaction.h b/include/linux/compaction.h index a014559e4a49..aa8f61cf3a19 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -34,6 +34,7 @@ extern int sysctl_compaction_handler(struct ctl_table *table, int write, extern int sysctl_extfrag_threshold; extern int sysctl_extfrag_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos); +extern int sysctl_compact_unevictable_allowed; extern int fragmentation_index(struct zone *zone, unsigned int order); extern unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8c0eabd41886..42b7fc2860c1 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1335,6 +1335,15 @@ static struct ctl_table vm_table[] = { .extra1 = &min_extfrag_threshold, .extra2 = &max_extfrag_threshold, }, + { + .procname = "compact_unevictable_allowed", + .data = &sysctl_compact_unevictable_allowed, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = &zero, + .extra2 = &one, + }, #endif /* CONFIG_COMPACTION */ { diff --git a/mm/compaction.c b/mm/compaction.c index a18201a8124e..570426edcadf 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1046,6 +1046,12 @@ typedef enum { ISOLATE_SUCCESS, /* Pages isolated, migrate */ } isolate_migrate_t; +/* + * Allow userspace to control policy on scanning the unevictable LRU for + * compactable pages. + */ +int sysctl_compact_unevictable_allowed __read_mostly = 1; + /* * Isolate all pages that can be migrated from the first suitable block, * starting at the block pointed to by the migrate scanner pfn within @@ -1057,6 +1063,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, unsigned long low_pfn, end_pfn; struct page *page; const isolate_mode_t isolate_mode = + (sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) | (cc->mode == MIGRATE_ASYNC ? ISOLATE_ASYNC_MIGRATE : 0); /* -- cgit v1.2.3 From 922c0551a795dccadeb1dadc756d93fe3e303180 Mon Sep 17 00:00:00 2001 From: Eric B Munson Date: Wed, 15 Apr 2015 16:13:23 -0700 Subject: Documentation/vm/unevictable-lru.txt: document interaction between compaction and the unevictable LRU The memory compaction code uses the migration code to do most of the work in compaction. However, the compaction code interacts with the unevictable LRU differently than migration code and this difference should be noted in the documentation. [akpm@linux-foundation.org: identify /proc/sys/vm/compact_unevictable directly] Signed-off-by: Eric B Munson Cc: Michal Hocko Cc: Vlastimil Babka Cc: Christoph Lameter Cc: David Rientjes Cc: Rik van Riel Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/unevictable-lru.txt | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/Documentation/vm/unevictable-lru.txt b/Documentation/vm/unevictable-lru.txt index 86cb4624fc5a..3be0bfc4738d 100644 --- a/Documentation/vm/unevictable-lru.txt +++ b/Documentation/vm/unevictable-lru.txt @@ -22,6 +22,7 @@ CONTENTS - Filtering special vmas. - munlock()/munlockall() system call handling. - Migrating mlocked pages. + - Compacting mlocked pages. - mmap(MAP_LOCKED) system call handling. - munmap()/exit()/exec() system call handling. - try_to_unmap(). 
@@ -450,6 +451,17 @@ list because of a race between munlock and migration, page migration uses the putback_lru_page() function to add migrated pages back to the LRU. +COMPACTING MLOCKED PAGES +------------------------ + +The unevictable LRU can be scanned for compactable regions and the default +behavior is to do so. /proc/sys/vm/compact_unevictable_allowed controls +this behavior (see Documentation/sysctl/vm.txt). Once scanning of the +unevictable LRU is enabled, the work of compaction is mostly handled by +the page migration code and the same work flow as described in MIGRATING +MLOCKED PAGES will apply. + + mmap(MAP_LOCKED) SYSTEM CALL HANDLING ------------------------------------- -- cgit v1.2.3 From cc5993bd7b8cff4a3e37042ee1358d1d5eafa70c Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 15 Apr 2015 16:13:26 -0700 Subject: mm: rename deactivate_page to deactivate_file_page "deactivate_page" was created for file invalidation so it has too specific logic for file-backed pages. So, let's change the name of the function and date to a file-specific one and yield the generic name. Signed-off-by: Minchan Kim Cc: Michal Hocko Cc: Johannes Weiner Cc: Mel Gorman Cc: Rik van Riel Cc: Shaohua Li Cc: Wang, Yalin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 2 +- mm/swap.c | 24 ++++++++++++------------ mm/truncate.c | 2 +- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 7067eca501e2..cee108cbe2d5 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -307,7 +307,7 @@ extern void lru_add_drain(void); extern void lru_add_drain_cpu(int cpu); extern void lru_add_drain_all(void); extern void rotate_reclaimable_page(struct page *page); -extern void deactivate_page(struct page *page); +extern void deactivate_file_page(struct page *page); extern void swap_setup(void); extern void add_page_to_unevictable_list(struct page *page); diff --git a/mm/swap.c b/mm/swap.c index cd3a5e64cea9..e3a4feac9b0e 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -42,7 +42,7 @@ int page_cluster; static DEFINE_PER_CPU(struct pagevec, lru_add_pvec); static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); -static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs); +static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs); /* * This path almost never happens for VM activity - pages are normally @@ -743,7 +743,7 @@ void lru_cache_add_active_or_unevictable(struct page *page, * be write it out by flusher threads as this is much more effective * than the single-page writeout from reclaim. */ -static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec, +static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec, void *arg) { int lru, file; @@ -811,36 +811,36 @@ void lru_add_drain_cpu(int cpu) local_irq_restore(flags); } - pvec = &per_cpu(lru_deactivate_pvecs, cpu); + pvec = &per_cpu(lru_deactivate_file_pvecs, cpu); if (pagevec_count(pvec)) - pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); + pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL); activate_page_drain(cpu); } /** - * deactivate_page - forcefully deactivate a page + * deactivate_file_page - forcefully deactivate a file page * @page: page to deactivate * * This function hints the VM that @page is a good reclaim candidate, * for example if its invalidation fails due to the page being dirty * or under writeback. 
*/ -void deactivate_page(struct page *page) +void deactivate_file_page(struct page *page) { /* - * In a workload with many unevictable page such as mprotect, unevictable - * page deactivation for accelerating reclaim is pointless. + * In a workload with many unevictable page such as mprotect, + * unevictable page deactivation for accelerating reclaim is pointless. */ if (PageUnevictable(page)) return; if (likely(get_page_unless_zero(page))) { - struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs); + struct pagevec *pvec = &get_cpu_var(lru_deactivate_file_pvecs); if (!pagevec_add(pvec, page)) - pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); - put_cpu_var(lru_deactivate_pvecs); + pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL); + put_cpu_var(lru_deactivate_file_pvecs); } } @@ -872,7 +872,7 @@ void lru_add_drain_all(void) if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || - pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || + pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) || need_activate_page_drain(cpu)) { INIT_WORK(work, lru_add_drain_per_cpu); schedule_work_on(cpu, work); diff --git a/mm/truncate.c b/mm/truncate.c index 7a9d8a3cb143..66af9031fae8 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -490,7 +490,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, * of interest and try to speed up its reclaim. */ if (!ret) - deactivate_page(page); + deactivate_file_page(page); count += ret; } pagevec_remove_exceptionals(&pvec); -- cgit v1.2.3 From 3b3636924dfe1e80f9bef2c0dc1207e16f3b078a Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 15 Apr 2015 16:13:29 -0700 Subject: mm, memcg: sync allocation and memcg charge gfp flags for THP memcg currently uses hardcoded GFP_TRANSHUGE gfp flags for all THP charges. THP allocations, however, might be using different flags depending on /sys/kernel/mm/transparent_hugepage/{,khugepaged/}defrag and the current allocation context. The primary difference is that defrag configured to "madvise" value will clear __GFP_WAIT flag from the core gfp mask to make the allocation lighter for all mappings which are not backed by VM_HUGEPAGE vmas. If the memcg charge path ignores this fact we will get a light allocation but a potential memcg reclaim would kill the whole point of the configuration. Fix the mismatch by providing the same gfp mask used for the allocation to the charge functions. This is quite easy for all paths except for the khugepaged kernel thread with !CONFIG_NUMA which is doing a pre-allocation long before the allocated page is used in collapse_huge_page via khugepaged_alloc_page. To prevent cluttering the whole code path from khugepaged_do_scan, we simply return the current flags as per the khugepaged_defrag() value, which might have changed since the preallocation. If somebody changed the value of the knob we would charge differently but this shouldn't happen often and it is definitely not critical because it would only lead to a reduced success rate of one-off THP promotion.
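For reference, the gfp mask derivation being synchronized here boils down to this helper, shown roughly as it looked at the time of this series (a sketch; __GFP_WAIT was later renamed in mainline):

static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
{
	/*
	 * With defrag disabled for the mapping, __GFP_WAIT is cleared so
	 * that neither the THP allocation nor -- after this patch -- the
	 * memcg charge may enter reclaim.
	 */
	return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp;
}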
[akpm@linux-foundation.org: fix weird code layout while we're there] [rientjes@google.com: clean up around alloc_hugepage_gfpmask()] Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Johannes Weiner Acked-by: David Rientjes Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 42 ++++++++++++++++++++---------------------- 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 3afb5cbe1312..4914e1b29fdb 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -708,7 +708,7 @@ static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot) static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, - struct page *page) + struct page *page, gfp_t gfp) { struct mem_cgroup *memcg; pgtable_t pgtable; @@ -716,7 +716,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, VM_BUG_ON_PAGE(!PageCompound(page), page); - if (mem_cgroup_try_charge(page, mm, GFP_TRANSHUGE, &memcg)) + if (mem_cgroup_try_charge(page, mm, gfp, &memcg)) return VM_FAULT_OOM; pgtable = pte_alloc_one(mm, haddr); @@ -822,7 +822,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; } - if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page))) { + if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page, gfp))) { put_page(page); count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; @@ -1080,6 +1080,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long haddr; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ + gfp_t huge_gfp; /* for allocation and charge */ ptl = pmd_lockptr(mm, pmd); VM_BUG_ON_VMA(!vma->anon_vma, vma); @@ -1106,10 +1107,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, alloc: if (transparent_hugepage_enabled(vma) && !transparent_hugepage_debug_cow()) { - gfp_t gfp; - - gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0); - new_page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER); + huge_gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0); + new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER); } else new_page = NULL; @@ -1130,8 +1129,7 @@ alloc: goto out; } - if (unlikely(mem_cgroup_try_charge(new_page, mm, - GFP_TRANSHUGE, &memcg))) { + if (unlikely(mem_cgroup_try_charge(new_page, mm, huge_gfp, &memcg))) { put_page(new_page); if (page) { split_huge_page(page); @@ -2323,19 +2321,13 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) return true; } -static struct page -*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm, +static struct page * +khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, int node) { - gfp_t flags; - VM_BUG_ON_PAGE(*hpage, *hpage); - /* Only allocate from the target node */ - flags = alloc_hugepage_gfpmask(khugepaged_defrag(), __GFP_OTHER_NODE) | - __GFP_THISNODE; - /* * Before allocating the hugepage, release the mmap_sem read lock. 
* The allocation can take potentially a long time if it involves @@ -2344,7 +2336,7 @@ static struct page */ up_read(&mm->mmap_sem); - *hpage = alloc_pages_exact_node(node, flags, HPAGE_PMD_ORDER); + *hpage = alloc_pages_exact_node(node, gfp, HPAGE_PMD_ORDER); if (unlikely(!*hpage)) { count_vm_event(THP_COLLAPSE_ALLOC_FAILED); *hpage = ERR_PTR(-ENOMEM); @@ -2397,13 +2389,14 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) return true; } -static struct page -*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm, +static struct page * +khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, int node) { up_read(&mm->mmap_sem); VM_BUG_ON(!*hpage); + return *hpage; } #endif @@ -2438,16 +2431,21 @@ static void collapse_huge_page(struct mm_struct *mm, struct mem_cgroup *memcg; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ + gfp_t gfp; VM_BUG_ON(address & ~HPAGE_PMD_MASK); + /* Only allocate from the target node */ + gfp = alloc_hugepage_gfpmask(khugepaged_defrag(), __GFP_OTHER_NODE) | + __GFP_THISNODE; + /* release the mmap_sem read lock. */ - new_page = khugepaged_alloc_page(hpage, mm, vma, address, node); + new_page = khugepaged_alloc_page(hpage, gfp, mm, vma, address, node); if (!new_page) return; if (unlikely(mem_cgroup_try_charge(new_page, mm, - GFP_TRANSHUGE, &memcg))) + gfp, &memcg))) return; /* -- cgit v1.2.3 From 195b0c60809ce841e5818b365808e7da3286fd3c Mon Sep 17 00:00:00 2001 From: Gioh Kim Date: Wed, 15 Apr 2015 16:13:33 -0700 Subject: mm/compaction: reset compaction scanner positions When the compaction is activated via /proc/sys/vm/compact_memory it would better scan the whole zone. And some platforms, for instance ARM, have the start_pfn of a zone at zero. Therefore the first try to compact via /proc doesn't work. It needs to reset the compaction scanner position first. Signed-off-by: Gioh Kim Acked-by: Vlastimil Babka Acked-by: David Rientjes Cc: Joonsoo Kim Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mm/compaction.c b/mm/compaction.c index 570426edcadf..e6c4f9475d43 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1605,6 +1605,14 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) INIT_LIST_HEAD(&cc->freepages); INIT_LIST_HEAD(&cc->migratepages); + /* + * When called via /proc/sys/vm/compact_memory + * this makes sure we compact the whole zone regardless of + * cached scanner positions. + */ + if (cc->order == -1) + __reset_isolation_suitable(zone); + if (cc->order == -1 || !compaction_deferred(zone, cc->order)) compact_zone(zone, cc); -- cgit v1.2.3 From c6a918200c4f4ebf74b7e1ae4fea9115c7b297f8 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 15 Apr 2015 16:13:36 -0700 Subject: hugetlbfs: add minimum size tracking fields to subpool structure hugetlbfs allocates huge pages from the global pool as needed. Even if the global pool contains a sufficient number pages for the filesystem size at mount time, those global pages could be grabbed for some other use. As a result, filesystem huge page allocations may fail due to lack of pages. Applications such as a database want to use huge pages for performance reasons. hugetlbfs filesystem semantics with ownership and modes work well to manage access to a pool of huge pages. 
However, the application would like some reasonable assurance that allocations will not fail due to a lack of huge pages. At application startup time, the application would like to configure itself to use a specific number of huge pages. Before starting, the application can check to make sure that enough huge pages exist in the system global pools. However, there are no guarantees that those pages will be available when needed by the application. What the application wants is exclusive use of a subset of huge pages. Add a new hugetlbfs mount option 'min_size=' to indicate that the specified number of pages will be available for use by the filesystem. At mount time, this number of huge pages will be reserved for exclusive use of the filesystem. If there is not a sufficient number of free pages, the mount will fail. As pages are allocated to and freed from the filesystem, the number of reserved pages is adjusted so that the specified minimum is maintained. This patch (of 4): Add a field to the subpool structure to indicate the minimum number of huge pages to always be used by this subpool. This minimum count includes allocated pages as well as reserved pages. If the minimum number of pages for the subpool has not been allocated, pages are reserved up to this minimum. An additional field (rsv_hpages) is used to track the number of pages reserved to meet this minimum size. The hstate pointer in the subpool is convenient to have when reserving and unreserving the pages. Signed-off-by: Mike Kravetz Cc: Davidlohr Bueso Cc: Aneesh Kumar Cc: Joonsoo Kim Cc: Andi Kleen Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 8 +++++++- mm/hugetlb.c | 3 +-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 1a782733a420..d1a77b87408d 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -22,7 +22,13 @@ struct mmu_gather; struct hugepage_subpool { spinlock_t lock; long count; - long max_hpages, used_hpages; + long max_hpages; /* Maximum huge pages or -1 if no maximum. */ + long used_hpages; /* Used count against maximum, includes */ + /* both alloced and reserved pages. */ + struct hstate *hstate; + long min_hpages; /* Minimum huge pages or -1 if no minimum. */ + long rsv_hpages; /* Pages reserved against global pool to */ + /* satisfy minimum size. */ }; struct resv_map { diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 8874c8ad55aa..4494976c2042 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -77,14 +77,13 @@ struct hugepage_subpool *hugepage_new_subpool(long nr_blocks) { struct hugepage_subpool *spool; - spool = kmalloc(sizeof(*spool), GFP_KERNEL); + spool = kzalloc(sizeof(*spool), GFP_KERNEL); if (!spool) return NULL; spin_lock_init(&spool->lock); spool->count = 1; spool->max_hpages = nr_blocks; - spool->used_hpages = 0; return spool; } -- cgit v1.2.3 From 1c5ecae3a93fa1ab51a784d77e9c9ed54e67c65f Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 15 Apr 2015 16:13:39 -0700 Subject: hugetlbfs: add minimum size accounting to subpools The same routines that perform subpool maximum size accounting, hugepage_subpool_get/put_pages(), are modified to also perform minimum size accounting. When a delta value is passed to these routines, calculate how global reservations must be adjusted to maintain the subpool minimum size. The routines now return this global reserve count adjustment.
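As a concrete illustration (hypothetical numbers: a subpool created with a minimum of 10 pages, so min_hpages == 10 and rsv_hpages starts at 10, matching the code added below):

/*
 * hugepage_subpool_get_pages(spool, 3)
 *   delta(3) <= rsv_hpages(10): the request is covered entirely by the
 *   subpool's own reserve; rsv_hpages becomes 7 and 0 is returned (no
 *   additional global reservations are needed).
 * hugepage_subpool_get_pages(spool, 9), afterwards
 *   delta(9) > rsv_hpages(7): rsv_hpages becomes 0 and 9 - 7 = 2 is
 *   returned (only 2 new global reservations must be taken).
 * hugepage_subpool_put_pages(spool, 5), afterwards
 *   rsv_hpages(0) + delta(5) <= min_hpages(10): all 5 pages go back to
 *   maintaining the minimum; rsv_hpages becomes 5 and 0 is returned (no
 *   global reservations are released).
 */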
This global reserve count adjustment is then passed to the global accounting routine hugetlb_acct_memory(). Signed-off-by: Mike Kravetz Cc: Davidlohr Bueso Cc: Aneesh Kumar Cc: Joonsoo Kim Cc: Andi Kleen Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 123 ++++++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 100 insertions(+), 23 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 4494976c2042..499cb72c74b1 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -96,36 +96,89 @@ void hugepage_put_subpool(struct hugepage_subpool *spool) unlock_or_release_subpool(spool); } -static int hugepage_subpool_get_pages(struct hugepage_subpool *spool, +/* + * Subpool accounting for allocating and reserving pages. + * Return -ENOMEM if there are not enough resources to satisfy the + * the request. Otherwise, return the number of pages by which the + * global pools must be adjusted (upward). The returned value may + * only be different than the passed value (delta) in the case where + * a subpool minimum size must be manitained. + */ +static long hugepage_subpool_get_pages(struct hugepage_subpool *spool, long delta) { - int ret = 0; + long ret = delta; if (!spool) - return 0; + return ret; spin_lock(&spool->lock); - if ((spool->used_hpages + delta) <= spool->max_hpages) { - spool->used_hpages += delta; - } else { - ret = -ENOMEM; + + if (spool->max_hpages != -1) { /* maximum size accounting */ + if ((spool->used_hpages + delta) <= spool->max_hpages) + spool->used_hpages += delta; + else { + ret = -ENOMEM; + goto unlock_ret; + } } - spin_unlock(&spool->lock); + if (spool->min_hpages != -1) { /* minimum size accounting */ + if (delta > spool->rsv_hpages) { + /* + * Asking for more reserves than those already taken on + * behalf of subpool. Return difference. + */ + ret = delta - spool->rsv_hpages; + spool->rsv_hpages = 0; + } else { + ret = 0; /* reserves already accounted for */ + spool->rsv_hpages -= delta; + } + } + +unlock_ret: + spin_unlock(&spool->lock); return ret; } -static void hugepage_subpool_put_pages(struct hugepage_subpool *spool, +/* + * Subpool accounting for freeing and unreserving pages. + * Return the number of global page reservations that must be dropped. + * The return value may only be different than the passed value (delta) + * in the case where a subpool minimum size must be maintained. + */ +static long hugepage_subpool_put_pages(struct hugepage_subpool *spool, long delta) { + long ret = delta; + if (!spool) - return; + return delta; spin_lock(&spool->lock); - spool->used_hpages -= delta; - /* If hugetlbfs_put_super couldn't free spool due to - * an outstanding quota reference, free it now. */ + + if (spool->max_hpages != -1) /* maximum size accounting */ + spool->used_hpages -= delta; + + if (spool->min_hpages != -1) { /* minimum size accounting */ + if (spool->rsv_hpages + delta <= spool->min_hpages) + ret = 0; + else + ret = spool->rsv_hpages + delta - spool->min_hpages; + + spool->rsv_hpages += delta; + if (spool->rsv_hpages > spool->min_hpages) + spool->rsv_hpages = spool->min_hpages; + } + + /* + * If hugetlbfs_put_super couldn't free spool due to an outstanding + * quota reference, free it now. 
+ */ unlock_or_release_subpool(spool); + + return ret; } static inline struct hugepage_subpool *subpool_inode(struct inode *inode) @@ -873,6 +926,14 @@ void free_huge_page(struct page *page) restore_reserve = PagePrivate(page); ClearPagePrivate(page); + /* + * A return code of zero implies that the subpool will be under its + * minimum size if the reservation is not restored after page is free. + * Therefore, force restore_reserve operation. + */ + if (hugepage_subpool_put_pages(spool, 1) == 0) + restore_reserve = true; + spin_lock(&hugetlb_lock); hugetlb_cgroup_uncharge_page(hstate_index(h), pages_per_huge_page(h), page); @@ -890,7 +951,6 @@ void free_huge_page(struct page *page) enqueue_huge_page(h, page); } spin_unlock(&hugetlb_lock); - hugepage_subpool_put_pages(spool, 1); } static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) @@ -1385,7 +1445,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, if (chg < 0) return ERR_PTR(-ENOMEM); if (chg || avoid_reserve) - if (hugepage_subpool_get_pages(spool, 1)) + if (hugepage_subpool_get_pages(spool, 1) < 0) return ERR_PTR(-ENOSPC); ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); @@ -2453,6 +2513,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) struct resv_map *resv = vma_resv_map(vma); struct hugepage_subpool *spool = subpool_vma(vma); unsigned long reserve, start, end; + long gbl_reserve; if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER)) return; @@ -2465,8 +2526,12 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) kref_put(&resv->refs, resv_map_release); if (reserve) { - hugetlb_acct_memory(h, -reserve); - hugepage_subpool_put_pages(spool, reserve); + /* + * Decrement reserve counts. The global reserve count may be + * adjusted if the subpool has a minimum size. + */ + gbl_reserve = hugepage_subpool_put_pages(spool, reserve); + hugetlb_acct_memory(h, -gbl_reserve); } } @@ -3446,6 +3511,7 @@ int hugetlb_reserve_pages(struct inode *inode, struct hstate *h = hstate_inode(inode); struct hugepage_subpool *spool = subpool_inode(inode); struct resv_map *resv_map; + long gbl_reserve; /* * Only apply hugepage reservation if asked. At fault time, an @@ -3482,8 +3548,13 @@ int hugetlb_reserve_pages(struct inode *inode, goto out_err; } - /* There must be enough pages in the subpool for the mapping */ - if (hugepage_subpool_get_pages(spool, chg)) { + /* + * There must be enough pages in the subpool for the mapping. If + * the subpool has a minimum size, there may be some global + * reservations already in place (gbl_reserve). + */ + gbl_reserve = hugepage_subpool_get_pages(spool, chg); + if (gbl_reserve < 0) { ret = -ENOSPC; goto out_err; } @@ -3492,9 +3563,10 @@ int hugetlb_reserve_pages(struct inode *inode, * Check enough hugepages are available for the reservation. 
	 * Hand the pages back to the subpool if there are not
	 */
-	ret = hugetlb_acct_memory(h, chg);
+	ret = hugetlb_acct_memory(h, gbl_reserve);
 	if (ret < 0) {
-		hugepage_subpool_put_pages(spool, chg);
+		/* put back original number of pages, chg */
+		(void)hugepage_subpool_put_pages(spool, chg);
 		goto out_err;
 	}

@@ -3524,6 +3596,7 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
 	struct resv_map *resv_map = inode_resv_map(inode);
 	long chg = 0;
 	struct hugepage_subpool *spool = subpool_inode(inode);
+	long gbl_reserve;

 	if (resv_map)
 		chg = region_truncate(resv_map, offset);
@@ -3531,8 +3604,12 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
 	inode->i_blocks -= (blocks_per_huge_page(h) * freed);
 	spin_unlock(&inode->i_lock);

-	hugepage_subpool_put_pages(spool, (chg - freed));
-	hugetlb_acct_memory(h, -(chg - freed));
+	/*
+	 * If the subpool has a minimum size, the number of global
+	 * reservations to be released may be adjusted.
+	 */
+	gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed));
+	hugetlb_acct_memory(h, -gbl_reserve);
 }

 #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
-- 
cgit v1.2.3


From 7ca02d0ae586fe7df59632966a64f3f1a756ef05 Mon Sep 17 00:00:00 2001
From: Mike Kravetz
Date: Wed, 15 Apr 2015 16:13:42 -0700
Subject: hugetlbfs: accept subpool min_size mount option and setup accordingly

Make 'min_size=<value>' be an option when mounting a hugetlbfs. This option takes the same value as the 'size' option. min_size can be specified without specifying size. If both are specified, min_size must be less than or equal to size, else the mount will fail. If min_size is specified, then at mount time an attempt is made to reserve min_size pages. If the reservation fails, the mount fails. At umount time, the reserved pages are released.

Signed-off-by: Mike Kravetz
Cc: Davidlohr Bueso
Cc: Aneesh Kumar
Cc: Joonsoo Kim
Cc: Andi Kleen
Cc: David Rientjes
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/hugetlbfs/inode.c    | 90 ++++++++++++++++++++++++++++++++++++++-----------
 include/linux/hugetlb.h |  3 +-
 mm/hugetlb.c            | 25 +++++++++++---
 3 files changed, 94 insertions(+), 24 deletions(-)

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index db76cec3ce21..3a8f12762821 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -47,9 +47,10 @@ struct hugetlbfs_config {
 	kuid_t   uid;
 	kgid_t   gid;
 	umode_t mode;
-	long	nr_blocks;
+	long	max_hpages;
 	long	nr_inodes;
 	struct hstate *hstate;
+	long	min_hpages;
 };

 struct hugetlbfs_inode_info {
@@ -67,7 +68,7 @@ int sysctl_hugetlb_shm_group;
 enum {
 	Opt_size, Opt_nr_inodes,
 	Opt_mode, Opt_uid, Opt_gid,
-	Opt_pagesize,
+	Opt_pagesize, Opt_min_size,
 	Opt_err,
 };

@@ -78,6 +79,7 @@ static const match_table_t tokens = {
 	{Opt_uid,	"uid=%u"},
 	{Opt_gid,	"gid=%u"},
 	{Opt_pagesize,	"pagesize=%s"},
+	{Opt_min_size,	"min_size=%s"},
 	{Opt_err,	NULL},
 };

@@ -754,14 +756,38 @@ static const struct super_operations hugetlbfs_ops = {
 	.show_options	= generic_show_options,
 };

+enum { NO_SIZE, SIZE_STD, SIZE_PERCENT };
+
+/*
+ * Convert size option passed from command line to number of huge pages
+ * in the pool specified by hstate. Size option could be in bytes
+ * (val_type == SIZE_STD) or percentage of the pool (val_type == SIZE_PERCENT).
+ */
+static long long
+hugetlbfs_size_to_hpages(struct hstate *h, unsigned long long size_opt,
+			 int val_type)
+{
+	if (val_type == NO_SIZE)
+		return -1;
+
+	if (val_type == SIZE_PERCENT) {
+		size_opt <<= huge_page_shift(h);
+		size_opt *= h->max_huge_pages;
+		do_div(size_opt, 100);
+	}
+
+	size_opt >>= huge_page_shift(h);
+	return size_opt;
+}
+
 static int
 hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
 {
 	char *p, *rest;
 	substring_t args[MAX_OPT_ARGS];
 	int option;
-	unsigned long long size = 0;
-	enum { NO_SIZE, SIZE_STD, SIZE_PERCENT } setsize = NO_SIZE;
+	unsigned long long max_size_opt = 0, min_size_opt = 0;
+	int max_val_type = NO_SIZE, min_val_type = NO_SIZE;

 	if (!options)
 		return 0;
@@ -799,10 +825,10 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
 			/* memparse() will accept a K/M/G without a digit */
 			if (!isdigit(*args[0].from))
 				goto bad_val;
-			size = memparse(args[0].from, &rest);
-			setsize = SIZE_STD;
+			max_size_opt = memparse(args[0].from, &rest);
+			max_val_type = SIZE_STD;
 			if (*rest == '%')
-				setsize = SIZE_PERCENT;
+				max_val_type = SIZE_PERCENT;
 			break;
 		}

@@ -825,6 +851,17 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
 			break;
 		}

+		case Opt_min_size: {
+			/* memparse() will accept a K/M/G without a digit */
+			if (!isdigit(*args[0].from))
+				goto bad_val;
+			min_size_opt = memparse(args[0].from, &rest);
+			min_val_type = SIZE_STD;
+			if (*rest == '%')
+				min_val_type = SIZE_PERCENT;
+			break;
+		}
+
 		default:
 			pr_err("Bad mount option: \"%s\"\n", p);
 			return -EINVAL;
@@ -832,15 +869,22 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
 	}

-	/* Do size after hstate is set up */
-	if (setsize > NO_SIZE) {
-		struct hstate *h = pconfig->hstate;
-		if (setsize == SIZE_PERCENT) {
-			size <<= huge_page_shift(h);
-			size *= h->max_huge_pages;
-			do_div(size, 100);
-		}
-		pconfig->nr_blocks = (size >> huge_page_shift(h));
+	/*
+	 * Use huge page pool size (in hstate) to convert the size
+	 * options to number of huge pages. If NO_SIZE, -1 is returned.
+	 */
+	pconfig->max_hpages = hugetlbfs_size_to_hpages(pconfig->hstate,
+						max_size_opt, max_val_type);
+	pconfig->min_hpages = hugetlbfs_size_to_hpages(pconfig->hstate,
+						min_size_opt, min_val_type);
+
+	/*
+	 * If max_size was specified, then min_size must be smaller
+	 */
+	if (max_val_type > NO_SIZE &&
+	    pconfig->min_hpages > pconfig->max_hpages) {
+		pr_err("minimum size can not be greater than maximum size\n");
+		return -EINVAL;
 	}

 	return 0;
@@ -859,12 +903,13 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)

 	save_mount_options(sb, data);

-	config.nr_blocks = -1;			/* No limit on size by default */
+	config.max_hpages = -1;			/* No limit on size by default */
 	config.nr_inodes = -1;			/* No limit on number of inodes by default */
 	config.uid = current_fsuid();
 	config.gid = current_fsgid();
 	config.mode = 0755;
 	config.hstate = &default_hstate;
+	config.min_hpages = -1;			/* No default minimum size */
 	ret = hugetlbfs_parse_options(data, &config);
 	if (ret)
 		return ret;
@@ -878,8 +923,15 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
 	sbinfo->max_inodes = config.nr_inodes;
 	sbinfo->free_inodes = config.nr_inodes;
 	sbinfo->spool = NULL;
-	if (config.nr_blocks != -1) {
-		sbinfo->spool = hugepage_new_subpool(config.nr_blocks);
+	/*
+	 * Allocate and initialize subpool if maximum or minimum size is
+	 * specified. Any needed reservations (for minimum size) are taken
+	 * when the subpool is created.
+	 */
+	if (config.max_hpages != -1 || config.min_hpages != -1) {
+		sbinfo->spool = hugepage_new_subpool(config.hstate,
+						     config.max_hpages,
+						     config.min_hpages);
 		if (!sbinfo->spool)
 			goto out_free;
 	}

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index d1a77b87408d..5713c49a4a5c 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -44,7 +44,8 @@ extern int hugetlb_max_hstate __read_mostly;
 #define for_each_hstate(h) \
 	for ((h) = hstates; (h) < &hstates[hugetlb_max_hstate]; (h)++)

-struct hugepage_subpool *hugepage_new_subpool(long nr_blocks);
+struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
+						long min_hpages);
 void hugepage_put_subpool(struct hugepage_subpool *spool);

 void reset_vma_resv_huge_pages(struct vm_area_struct *vma);

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 499cb72c74b1..995c8d65a95c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -61,6 +61,9 @@ DEFINE_SPINLOCK(hugetlb_lock);
 static int num_fault_mutexes;
 static struct mutex *htlb_fault_mutex_table ____cacheline_aligned_in_smp;

+/* Forward declaration */
+static int hugetlb_acct_memory(struct hstate *h, long delta);
+
 static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
 {
 	bool free = (spool->count == 0) && (spool->used_hpages == 0);
@@ -68,12 +71,18 @@ static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
 	spin_unlock(&spool->lock);

 	/* If no pages are used, and no other handles to the subpool
-	 * remain, free the subpool the subpool remain */
-	if (free)
+	 * remain, give up any reservations based on minimum size and
+	 * free the subpool */
+	if (free) {
+		if (spool->min_hpages != -1)
+			hugetlb_acct_memory(spool->hstate,
+						-spool->min_hpages);
 		kfree(spool);
+	}
 }

-struct hugepage_subpool *hugepage_new_subpool(long nr_blocks)
+struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
+						long min_hpages)
 {
 	struct hugepage_subpool *spool;

@@ -83,7 +92,15 @@ struct hugepage_subpool *hugepage_new_subpool(long nr_blocks)

 	spin_lock_init(&spool->lock);
 	spool->count = 1;
-	spool->max_hpages = nr_blocks;
+	spool->max_hpages = max_hpages;
+	spool->hstate = h;
+	spool->min_hpages = min_hpages;
+
+	if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) {
+		kfree(spool);
+		return NULL;
+	}
+	spool->rsv_hpages = min_hpages;

 	return spool;
 }
-- 
cgit v1.2.3


From 8c9b97033547834404a58ea88da7226ed5167726 Mon Sep 17 00:00:00 2001
From: Mike Kravetz
Date: Wed, 15 Apr 2015 16:13:45 -0700
Subject: hugetlbfs: document min_size mount option and cleanup

Add the min_size mount option to the hugetlbfs documentation. Also, add the missing pagesize option and mention that size can be specified as bytes or a percentage of the huge page pool.
Signed-off-by: Mike Kravetz
Cc: Davidlohr Bueso
Cc: Aneesh Kumar
Cc: Joonsoo Kim
Cc: Andi Kleen
Cc: David Rientjes
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 Documentation/vm/hugetlbpage.txt | 31 ++++++++++++++++++++++---------
 1 file changed, 22 insertions(+), 9 deletions(-)

diff --git a/Documentation/vm/hugetlbpage.txt b/Documentation/vm/hugetlbpage.txt
index f2d3a100fe38..b32b9cd50c0e 100644
--- a/Documentation/vm/hugetlbpage.txt
+++ b/Documentation/vm/hugetlbpage.txt
@@ -267,21 +267,34 @@ call, then it is required that system administrator mount a file system of
 type hugetlbfs:

   mount -t hugetlbfs \
-	-o uid=<value>,gid=<value>,mode=<value>,size=<value>,nr_inodes=<value> \
-	none /mnt/huge
+	-o uid=<value>,gid=<value>,mode=<value>,pagesize=<value>,size=<value>,\
+	min_size=<value>,nr_inodes=<value> none /mnt/huge

 This command mounts a (pseudo) filesystem of type hugetlbfs on the directory
 /mnt/huge. Any files created on /mnt/huge use huge pages. The uid and gid
 options set the owner and group of the root of the file system. By default
 the uid and gid of the current process are taken. The mode option sets the
 mode of root of file system to value & 01777. This value is given in octal.
-By default the value 0755 is picked. The size option sets the maximum value of
-memory (huge pages) allowed for that filesystem (/mnt/huge). The size is
-rounded down to HPAGE_SIZE. The option nr_inodes sets the maximum number of
-inodes that /mnt/huge can use. If the size or nr_inodes option is not
-provided on command line then no limits are set. For size and nr_inodes
-options, you can use [G|g]/[M|m]/[K|k] to represent giga/mega/kilo. For
-example, size=2K has the same meaning as size=2048.
+By default the value 0755 is picked. If the platform supports multiple huge
+page sizes, the pagesize option can be used to specify the huge page size and
+associated pool. pagesize is specified in bytes. If pagesize is not specified
+the platform's default huge page size and associated pool will be used. The
+size option sets the maximum value of memory (huge pages) allowed for that
+filesystem (/mnt/huge). The size option can be specified in bytes, or as a
+percentage of the specified huge page pool (nr_hugepages). The size is
+rounded down to HPAGE_SIZE boundary. The min_size option sets the minimum
+value of memory (huge pages) allowed for the filesystem. min_size can be
+specified in the same way as size, either bytes or a percentage of the
+huge page pool. At mount time, the number of huge pages specified by
+min_size are reserved for use by the filesystem. If there are not enough
+free huge pages available, the mount will fail. As huge pages are allocated
+to the filesystem and freed, the reserve count is adjusted so that the sum
+of allocated and reserved huge pages is always at least min_size. The option
+nr_inodes sets the maximum number of inodes that /mnt/huge can use. If the
+size, min_size or nr_inodes option is not provided on command line then
+no limits are set. For pagesize, size, min_size and nr_inodes options, you
+can use [G|g]/[M|m]/[K|k] to represent giga/mega/kilo. For example, size=2K
+has the same meaning as size=2048.

 While read system calls are supported on files that reside on hugetlb
 file systems, write system calls are not.
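
For illustration, a mount using both limits might look like this (editorial example with hypothetical sizes; min_size must not exceed size):

  mount -t hugetlbfs -o pagesize=2M,size=512M,min_size=256M none /mnt/huge

or, equivalently, expressed as percentages of the huge page pool:

  mount -t hugetlbfs -o size=50%,min_size=25% none /mnt/huge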
-- 
cgit v1.2.3


From 68ac546f265ba36cd4f29c77b3841fb777315581 Mon Sep 17 00:00:00 2001
From: Roman Pen
Date: Wed, 15 Apr 2015 16:13:48 -0700
Subject: mm/vmalloc: fix possible exhaustion of vmalloc space caused by vm_map_ram allocator

Recently I came across high fragmentation of the vm_map_ram allocator: vmap_block has free space, but still new blocks continue to appear. Further investigation showed that certain mapping/unmapping sequences can exhaust vmalloc space. On small 32bit systems that's not a big problem, because purging will be called soon on a first allocation failure (alloc_vmap_area), but on 64bit machines, e.g. x86_64 with 45 bits of vmalloc space, that can be a disaster.

1) I came up with a simple allocation sequence which exhausts virtual space very quickly:

  while (iters) {
          /* Map/unmap big chunk */
          vaddr = vm_map_ram(pages, 16, -1, PAGE_KERNEL);
          vm_unmap_ram(vaddr, 16);

          /* Map/unmap small chunks.
           *
           * -1 for hole, which should be left at the end of each block
           * to keep it partially used, with some free space available */
          for (i = 0; i < (VMAP_BBMAP_BITS - 16) / 8 - 1; i++) {
                  vaddr = vm_map_ram(pages, 8, -1, PAGE_KERNEL);
                  vm_unmap_ram(vaddr, 8);
          }
  }

The idea behind it is simple:

 1. We have to map a big chunk, e.g. 16 pages.

 2. Then we have to occupy the remaining space with smaller chunks, i.e. 8 pages. At the end a small hole should remain, to keep the block in the free list but not let the big chunk occupy the remaining space.

 3. Goto 1 - the allocation request of 16 pages can't be completed (only 8 slots are left free in the block after step #2), a new block will be allocated, and all further requests will lay into the newly allocated block.

To have some measurement numbers for all further tests I set up ftrace and enabled 4 basic calls in a function profile:

        echo vm_map_ram       > /sys/kernel/debug/tracing/set_ftrace_filter;
        echo alloc_vmap_area >> /sys/kernel/debug/tracing/set_ftrace_filter;
        echo vm_unmap_ram    >> /sys/kernel/debug/tracing/set_ftrace_filter;
        echo free_vmap_block >> /sys/kernel/debug/tracing/set_ftrace_filter;

So for this scenario I got these results:

BEFORE (all new blocks are put to the head of a free list)
# cat /sys/kernel/debug/tracing/trace_stat/function0
  Function           Hit       Time           Avg        s^2
  --------           ---       ----           ---        ---
  vm_map_ram         126000    30683.30 us    0.243 us   30819.36 us
  vm_unmap_ram       126000    22003.24 us    0.174 us   340.886 us
  alloc_vmap_area    1000      4132.065 us    4.132 us   0.903 us

AFTER (all new blocks are put to the tail of a free list)
# cat /sys/kernel/debug/tracing/trace_stat/function0
  Function           Hit       Time           Avg        s^2
  --------           ---       ----           ---        ---
  vm_map_ram         126000    28713.13 us    0.227 us   24944.70 us
  vm_unmap_ram       126000    20403.96 us    0.161 us   1429.872 us
  alloc_vmap_area    993       3916.795 us    3.944 us   29.370 us
  free_vmap_block    992       654.157 us     0.659 us   1.273 us

SUMMARY:

The most interesting numbers in those tables are the numbers of block allocations and deallocations: the alloc_vmap_area and free_vmap_block calls, which show that before the change blocks were not freed, and virtual space and physical memory (vmap_block structure allocations, etc.) were consumed. The average time spent in vm_map_ram/vm_unmap_ram became slightly better. That can be explained by the reasonable number of blocks in a free list, which we need to iterate to find a suitable free block.
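
(A side note on reproducing the measurements above: the message shows only the filter setup; on a standard debugfs mount the function profiler itself is switched on with the knob below. This is an editorial assumption about the author's setup, not part of the original message.)

  echo 1 > /sys/kernel/debug/tracing/function_profile_enabled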
2) Another scenario is a random allocation:

  while (iters) {
          /* Randomly take number from a range [1..32/64] */
          nr = rand(1, VMAP_MAX_ALLOC);
          vaddr = vm_map_ram(pages, nr, -1, PAGE_KERNEL);
          vm_unmap_ram(vaddr, nr);
  }

I chose the Mersenne Twister PRNG to generate persistent random state, to guarantee that both runs have the same random sequence. For each vm_map_ram call a random number from [1..32/64] was taken to represent the number of pages to map. I did 10'000 vm_map_ram calls and got these two tables:

BEFORE (all new blocks are put to the head of a free list)
# cat /sys/kernel/debug/tracing/trace_stat/function0
  Function           Hit       Time           Avg        s^2
  --------           ---       ----           ---        ---
  vm_map_ram         10000     10170.01 us    1.017 us   993.609 us
  vm_unmap_ram       10000     5321.823 us    0.532 us   59.789 us
  alloc_vmap_area    420       2150.239 us    5.119 us   3.307 us
  free_vmap_block    37        159.587 us     4.313 us   134.344 us

AFTER (all new blocks are put to the tail of a free list)
# cat /sys/kernel/debug/tracing/trace_stat/function0
  Function           Hit       Time           Avg        s^2
  --------           ---       ----           ---        ---
  vm_map_ram         10000     7745.637 us    0.774 us   395.229 us
  vm_unmap_ram       10000     5460.573 us    0.546 us   67.187 us
  alloc_vmap_area    414       2201.650 us    5.317 us   5.591 us
  free_vmap_block    412       574.421 us     1.394 us   15.138 us

SUMMARY:

The 'BEFORE' table shows that 420 blocks were allocated and only 37 were freed. The remaining 383 blocks are still in a free list, consuming virtual space and physical memory. The 'AFTER' table shows that 414 blocks were allocated and 412 were really freed; 2 blocks remained in a free list. So fragmentation was dramatically reduced. Why? Because when we put a newly allocated block at the head, all further requests will occupy the new block, regardless of the remaining space in other blocks. In this scenario all requests come randomly. Eventually the remaining free space will be less than the requested size, the free list will be iterated, and it is possible that nothing will be found there - finally a new block will be created. So exhaustion in the random scenario happens for the maximum possible allocation size: 32 pages for a 32-bit system and 64 pages for a 64-bit system. Also the average cost of vm_map_ram was reduced from 1.017 us to 0.774 us. Again this can be explained by iteration through a smaller list of free blocks.

3) The next simple scenario is a sequential allocation, where the allocation order is increased for each block.
This scenario forces the allocator to reach the maximum number of partially free blocks in a free list:

  while (iters) {
          /* Populate free list with blocks with remaining space */
          for (order = 0; order <= ilog2(VMAP_MAX_ALLOC); order++) {
                  nr = VMAP_BBMAP_BITS / (1 << order);
                  /* Leave a hole */
                  nr -= 1;

                  for (i = 0; i < nr; i++) {
                          vaddr = vm_map_ram(pages, (1 << order), -1, PAGE_KERNEL);
                          vm_unmap_ram(vaddr, (1 << order));
                  }
          }

          /* Completely occupy blocks from a free list */
          for (order = 0; order <= ilog2(VMAP_MAX_ALLOC); order++) {
                  vaddr = vm_map_ram(pages, (1 << order), -1, PAGE_KERNEL);
                  vm_unmap_ram(vaddr, (1 << order));
          }
  }

The results I got:

BEFORE (all new blocks are put to the head of a free list)
# cat /sys/kernel/debug/tracing/trace_stat/function0
  Function           Hit       Time           Avg        s^2
  --------           ---       ----           ---        ---
  vm_map_ram         2032000   399545.2 us    0.196 us   467123.7 us
  vm_unmap_ram       2032000   363225.7 us    0.178 us   111405.9 us
  alloc_vmap_area    7001      30627.76 us    4.374 us   495.755 us
  free_vmap_block    6993      7011.685 us    1.002 us   159.090 us

AFTER (all new blocks are put to the tail of a free list)
# cat /sys/kernel/debug/tracing/trace_stat/function0
  Function           Hit       Time           Avg        s^2
  --------           ---       ----           ---        ---
  vm_map_ram         2032000   394259.7 us    0.194 us   589395.9 us
  vm_unmap_ram       2032000   292500.7 us    0.143 us   94181.08 us
  alloc_vmap_area    7000      31103.11 us    4.443 us   703.225 us
  free_vmap_block    7000      6750.844 us    0.964 us   119.112 us

SUMMARY:

No surprises here, almost all numbers are the same.

While fixing this fragmentation problem I also made some improvements in the allocation logic of a new vmap block: occupy the block immediately and get rid of the extra search in a free list. Also I replaced the dirty bitmap with min/max dirty range values, to make the logic simpler and slightly faster, since comparing two longs costs less than looping through a bitmap.

This patchset raises several questions:

Q: I think the problem you comment on is already known; I wrote a comment about it: "it could consume lots of address space through fragmentation". Could you tell me about your situation and the reason why it should be avoided?
                                                        Gioh Kim

A: Indeed, there was commit 364376383, which adds an explicit comment about fragmentation. But the fragmentation described in that comment is caused by mixing long-lived and short-lived objects, when a whole block is pinned in memory because some page slots are still in use. But here I am talking about blocks which are free, nobody uses them, and the allocator keeps them alive forever, continuously allocating new blocks.

Q: I think that if you put a newly allocated block at the tail of a free list, the example below would result in enormous performance degradation.

  new block: 1MB (256 pages)

  while (iters--) {
          vm_map_ram(3 or something else not dividable for 256) * 85
          vm_unmap_ram(3) * 85
  }

On every iteration it needs the newly allocated block, which is put at the tail of the free list, so finding it consumes a large amount of time.
                                                        Joonsoo Kim

A: The second patch in the current patchset gets rid of the extra search in a free list, so the new block will be immediately occupied. Also, the scenario above is impossible, because vm_map_ram allocates virtual ranges in orders, i.e. 2^n. I.e. passing 3 to vm_map_ram you will allocate 4 slots in a block, and 256 slots (the capacity of a block) is of course divisible by 4, so the block will be completely occupied.

But there is a worst case which we can achieve: each free block has a hole equal to the order size. The maximum size of an allocation is 64 pages for a 64-bit system (if you try to map more, the original alloc_vmap_area will be called).
So the maximum order is 6. That means the worst case, before the allocator makes a decision to allocate a new block, is to iterate over 7 blocks:

  HEAD
  1st block - has 1 page slot  free (order 0)
  2nd block - has 2 page slots free (order 1)
  3rd block - has 4 page slots free (order 2)
  4th block - has 8 page slots free (order 3)
  5th block - has 16 page slots free (order 4)
  6th block - has 32 page slots free (order 5)
  7th block - has 64 page slots free (order 6)
  TAIL

So the worst scenario on a 64-bit system is that each CPU queue can have 7 blocks in a free list. This can happen if and only if you allocate blocks with increasing order (as I did in the function written in the comment of the first patch). This is a weird and rare case, but it is still possible. Afterwards you will get 7 blocks in a list. All further requests should be placed in a newly allocated block or some free slots should be found in a free list. It does not look dramatically awful.

This patch (of 3):

If a suitable block can't be found, a new block is allocated and put at the head of a free list, so on the next iteration this new block will be found first. That's bad, because old blocks in a free list will not get a chance to be fully used, and thus fragmentation will grow.

Let's consider this simple example:

 #1 We have one block in a free list which is partially used, and where
    only one page is free:

    HEAD |xxxxxxxxx-| TAIL
                   ^
                   free space for 1 page, order 0

 #2 A new allocation request of order 1 (2 pages) comes; a new block is
    allocated since we do not have free space to complete this request.
    The new block is put at the head of the free list:

    HEAD |----------|xxxxxxxxx-| TAIL

 #3 Two pages were occupied in the newly found block:

    HEAD |xx--------|xxxxxxxxx-| TAIL
          ^
          two pages mapped here

 #4 A new allocation request of order 0 (1 page) comes. The block which was
    created in step #2 is located at the beginning of the free list, so it
    will be found first:

    HEAD |xxX-------|xxxxxxxxx-| TAIL
           ^                  ^
           page mapped here, but better to use this hole

It is obvious that it is better to complete the request of step #4 using the old block, where free space is left, because otherwise fragmentation will be greatly increased.

But fragmentation is not the only problem. The worst thing is that I can easily create a scenario where the whole vmalloc space is exhausted by blocks which are not used, but are already dirty and have several free pages.

Let's consider this function, whose execution should be pinned to one CPU:

static void exhaust_virtual_space(struct page *pages[16], int iters)
{
	/* Firstly we have to map a big chunk, e.g. 16 pages.
	 * Then we have to occupy the remaining space with smaller
	 * chunks, i.e. 8 pages. At the end small hole should remain.
	 * So at the end of our allocation sequence block looks like
	 * this:
	 *              XX  big chunk
	 * |XXxxxxxxx-|  x  small chunk
	 *               -  hole, which is enough for a small chunk,
	 *                  but is not enough for a big chunk
	 */
	while (iters--) {
		int i;
		void *vaddr;

		/* Map/unmap big chunk */
		vaddr = vm_map_ram(pages, 16, -1, PAGE_KERNEL);
		vm_unmap_ram(vaddr, 16);

		/* Map/unmap small chunks.
		 *
		 * -1 for hole, which should be left at the end of each block
		 * to keep it partially used, with some free space available */
		for (i = 0; i < (VMAP_BBMAP_BITS - 16) / 8 - 1; i++) {
			vaddr = vm_map_ram(pages, 8, -1, PAGE_KERNEL);
			vm_unmap_ram(vaddr, 8);
		}
	}
}

On every iteration a new block (1MB of vm area in my case) will be allocated and then occupied, without any attempt to resolve a small allocation request using previously allocated blocks in a free list.
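
In short, the before/after behaviour of the free list looks like this (an editorial recap of the scenario above, not part of the original message):

  head insertion:  HEAD |new|B1|B2|...|Bn| TAIL   - "new" shadows the holes
                                                    left in B1..Bn
  tail insertion:  HEAD |B1|B2|...|Bn|new| TAIL   - holes in B1..Bn get a
                                                    chance to be used first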
In the case of random allocation (the size is randomly taken from the range [1..64] in the 64-bit case or [1..32] in the 32-bit case) the situation is the same: new blocks continue to appear if the maximum possible allocation size (32 or 64) is passed to the allocator, because all remaining blocks in a free list do not have enough free space to complete this allocation request.

In summary, if new blocks are put at the head of a free list, virtual space will eventually be exhausted.

In the current patch I simply put a newly allocated block at the tail of a free list, thus reducing fragmentation and giving a chance to resolve allocation requests using older blocks with possible holes left.

Signed-off-by: Roman Pen
Cc: Eric Dumazet
Acked-by: Joonsoo Kim
Cc: David Rientjes
Cc: WANG Chao
Cc: Fabian Frederick
Cc: Christoph Lameter
Cc: Gioh Kim
Cc: Rob Jones
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/vmalloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index a5bbdd3b5d67..84feb5249b12 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -842,7 +842,7 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)

 	vbq = &get_cpu_var(vmap_block_queue);
 	spin_lock(&vbq->lock);
-	list_add_rcu(&vb->free_list, &vbq->free);
+	list_add_tail_rcu(&vb->free_list, &vbq->free);
 	spin_unlock(&vbq->lock);
 	put_cpu_var(vmap_block_queue);
-- 
cgit v1.2.3


From cf725ce274ba026e132c225cb8e5b61973c63403 Mon Sep 17 00:00:00 2001
From: Roman Pen
Date: Wed, 15 Apr 2015 16:13:52 -0700
Subject: mm/vmalloc: occupy newly allocated vmap block just after allocation

The previous implementation allocates a new vmap block and repeats the search for a free block from the very beginning, iterating over the CPU free list. Why can occupying the new block right away be better?

1. Allocation can happen on one CPU, but the search can be done on another CPU. In the worst case we preallocate as many vmap blocks as there are CPUs on the system.

2. In the previous patch I added newly allocated blocks to the tail of the free list, to avoid premature exhaustion of virtual space and to give a chance to occupy blocks which were allocated long ago. Thus, to find a newly allocated block the whole search sequence would have to be repeated, which is not efficient.

In this patch a newly allocated block is occupied right away and the address of the virtual space is returned to the caller, so there is no need to repeat the search sequence; the allocation job is done.

Signed-off-by: Roman Pen
Cc: Andrew Morton
Cc: Eric Dumazet
Acked-by: Joonsoo Kim
Cc: David Rientjes
Cc: WANG Chao
Cc: Fabian Frederick
Cc: Christoph Lameter
Cc: Gioh Kim
Cc: Rob Jones
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/vmalloc.c | 58 +++++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 37 insertions(+), 21 deletions(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 84feb5249b12..21ec16b7e6e1 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -796,13 +796,31 @@ static unsigned long addr_to_vb_idx(unsigned long addr)
 	return addr;
 }

-static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
+static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off)
+{
+	unsigned long addr;
+
+	addr = va_start + (pages_off << PAGE_SHIFT);
+	BUG_ON(addr_to_vb_idx(addr) != addr_to_vb_idx(va_start));
+	return (void *)addr;
+}
+
+/**
+ * new_vmap_block - allocates new vmap_block and occupies 2^order pages in this
+ * block.
Of course pages number can't exceed VMAP_BBMAP_BITS + * @order: how many 2^order pages should be occupied in newly allocated block + * @gfp_mask: flags for the page level allocator + * + * Returns: virtual address in a newly allocated block or ERR_PTR(-errno) + */ +static void *new_vmap_block(unsigned int order, gfp_t gfp_mask) { struct vmap_block_queue *vbq; struct vmap_block *vb; struct vmap_area *va; unsigned long vb_idx; int node, err; + void *vaddr; node = numa_node_id(); @@ -826,9 +844,12 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask) return ERR_PTR(err); } + vaddr = vmap_block_vaddr(va->va_start, 0); spin_lock_init(&vb->lock); vb->va = va; - vb->free = VMAP_BBMAP_BITS; + /* At least something should be left free */ + BUG_ON(VMAP_BBMAP_BITS <= (1UL << order)); + vb->free = VMAP_BBMAP_BITS - (1UL << order); vb->dirty = 0; bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS); INIT_LIST_HEAD(&vb->free_list); @@ -846,7 +867,7 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask) spin_unlock(&vbq->lock); put_cpu_var(vmap_block_queue); - return vb; + return vaddr; } static void free_vmap_block(struct vmap_block *vb) @@ -910,7 +931,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) { struct vmap_block_queue *vbq; struct vmap_block *vb; - unsigned long addr = 0; + void *vaddr = NULL; unsigned int order; BUG_ON(size & ~PAGE_MASK); @@ -925,43 +946,38 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) } order = get_order(size); -again: rcu_read_lock(); vbq = &get_cpu_var(vmap_block_queue); list_for_each_entry_rcu(vb, &vbq->free, free_list) { - int i; + unsigned long pages_off; spin_lock(&vb->lock); - if (vb->free < 1UL << order) - goto next; + if (vb->free < (1UL << order)) { + spin_unlock(&vb->lock); + continue; + } - i = VMAP_BBMAP_BITS - vb->free; - addr = vb->va->va_start + (i << PAGE_SHIFT); - BUG_ON(addr_to_vb_idx(addr) != - addr_to_vb_idx(vb->va->va_start)); + pages_off = VMAP_BBMAP_BITS - vb->free; + vaddr = vmap_block_vaddr(vb->va->va_start, pages_off); vb->free -= 1UL << order; if (vb->free == 0) { spin_lock(&vbq->lock); list_del_rcu(&vb->free_list); spin_unlock(&vbq->lock); } + spin_unlock(&vb->lock); break; -next: - spin_unlock(&vb->lock); } put_cpu_var(vmap_block_queue); rcu_read_unlock(); - if (!addr) { - vb = new_vmap_block(gfp_mask); - if (IS_ERR(vb)) - return vb; - goto again; - } + /* Allocate new block if nothing was found */ + if (!vaddr) + vaddr = new_vmap_block(order, gfp_mask); - return (void *)addr; + return vaddr; } static void vb_free(const void *addr, unsigned long size) -- cgit v1.2.3 From 7d61bfe8fddecad76eb37cc477aab369c5c81ed3 Mon Sep 17 00:00:00 2001 From: Roman Pen Date: Wed, 15 Apr 2015 16:13:55 -0700 Subject: mm/vmalloc: get rid of dirty bitmap inside vmap_block structure In original implementation of vm_map_ram made by Nick Piggin there were two bitmaps: alloc_map and dirty_map. None of them were used as supposed to be: finding a suitable free hole for next allocation in block. vm_map_ram allocates space sequentially in block and on free call marks pages as dirty, so freed space can't be reused anymore. Actually it would be very interesting to know the real meaning of those bitmaps, maybe implementation was incomplete, etc. 
But long ago Zhang Yanfei removed alloc_map in these two commits:

  mm/vmalloc.c: remove dead code in vb_alloc
     3fcd76e8028e0be37b02a2002b4f56755daeda06
  mm/vmalloc.c: remove alloc_map from vmap_block
     b8e748b6c32999f221ea4786557b8e7e6c4e4e7a

In this patch I replaced dirty_map with two range variables: dirty min and max. These variables store the minimum and maximum position of dirty space in a block, since we only need to know the dirty range, not the exact position of dirty pages.

Why was this done? Several reasons: at first glance it seems that the vm_map_ram allocator cares about fragmentation and thus uses bitmaps to find a free hole, but that is not true. To avoid complexity it seems better to use something simple, like min or max range values. Second, the code also becomes simpler, without iteration over a bitmap, just comparing values with the min and max macros. Third, the bitmap occupies up to 1024 bits (4MB is the maximum size of a block); here the whole bitmap is replaced with two longs. Finally, vm_unmap_aliases should be slightly faster and the whole vmap_block structure occupies less memory.

Signed-off-by: Roman Pen
Cc: Zhang Yanfei
Cc: Eric Dumazet
Acked-by: Joonsoo Kim
Cc: David Rientjes
Cc: WANG Chao
Cc: Fabian Frederick
Cc: Christoph Lameter
Cc: Gioh Kim
Cc: Rob Jones
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/vmalloc.c | 35 +++++++++++++++++------------------
 1 file changed, 17 insertions(+), 18 deletions(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 21ec16b7e6e1..2faaa2976447 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -765,7 +765,7 @@ struct vmap_block {
 	spinlock_t lock;
 	struct vmap_area *va;
 	unsigned long free, dirty;
-	DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS);
+	unsigned long dirty_min, dirty_max; /*< dirty range */
 	struct list_head free_list;
 	struct rcu_head rcu_head;
 	struct list_head purge;
@@ -851,7 +851,8 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
 	BUG_ON(VMAP_BBMAP_BITS <= (1UL << order));
 	vb->free = VMAP_BBMAP_BITS - (1UL << order);
 	vb->dirty = 0;
-	bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS);
+	vb->dirty_min = VMAP_BBMAP_BITS;
+	vb->dirty_max = 0;
 	INIT_LIST_HEAD(&vb->free_list);

 	vb_idx = addr_to_vb_idx(va->va_start);
@@ -902,7 +903,8 @@ static void purge_fragmented_blocks(int cpu)
 		if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) {
 			vb->free = 0; /* prevent further allocs after releasing lock */
 			vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */
-			bitmap_fill(vb->dirty_map, VMAP_BBMAP_BITS);
+			vb->dirty_min = 0;
+			vb->dirty_max = VMAP_BBMAP_BITS;
 			spin_lock(&vbq->lock);
 			list_del_rcu(&vb->free_list);
 			spin_unlock(&vbq->lock);
@@ -995,6 +997,7 @@ static void vb_free(const void *addr, unsigned long size)
 	order = get_order(size);

 	offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1);
+	offset >>= PAGE_SHIFT;

 	vb_idx = addr_to_vb_idx((unsigned long)addr);
 	rcu_read_lock();
@@ -1005,7 +1008,10 @@ static void vb_free(const void *addr, unsigned long size)
 	vunmap_page_range((unsigned long)addr, (unsigned long)addr + size);

 	spin_lock(&vb->lock);
-	BUG_ON(bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order));
+
+	/* Expand dirty range */
+	vb->dirty_min = min(vb->dirty_min, offset);
+	vb->dirty_max = max(vb->dirty_max, offset + (1UL << order));

 	vb->dirty += 1UL << order;
 	if (vb->dirty == VMAP_BBMAP_BITS) {
@@ -1044,25 +1050,18 @@ void vm_unmap_aliases(void)

 		rcu_read_lock();
 		list_for_each_entry_rcu(vb, &vbq->free, free_list) {
-			int i, j;
-
 			spin_lock(&vb->lock);
-			i = find_first_bit(vb->dirty_map, VMAP_BBMAP_BITS);
-			if (i < VMAP_BBMAP_BITS) {
+			if (vb->dirty) {
+				unsigned long va_start = vb->va->va_start;
 				unsigned long s, e;

-				j = find_last_bit(vb->dirty_map,
-							VMAP_BBMAP_BITS);
-				j = j + 1; /* need exclusive index */
+				s = va_start + (vb->dirty_min << PAGE_SHIFT);
+				e = va_start + (vb->dirty_max << PAGE_SHIFT);

-				s = vb->va->va_start + (i << PAGE_SHIFT);
-				e = vb->va->va_start + (j << PAGE_SHIFT);
-				flush = 1;
+				start = min(s, start);
+				end   = max(e, end);

-				if (s < start)
-					start = s;
-				if (e > end)
-					end = e;
+				flush = 1;
 			}
 			spin_unlock(&vb->lock);
 		}
-- 
cgit v1.2.3


From 12215182c80c06ab3c69969abba6e4a834493cd8 Mon Sep 17 00:00:00 2001
From: Derek
Date: Wed, 15 Apr 2015 16:13:58 -0700
Subject: mremap should return -ENOMEM when __vm_enough_memory fail

Recently I straced bash behavior in this dd zero pipe to read test, as part of testing under vm.overcommit_memory=2 (OVERCOMMIT_NEVER mode):

  # dd if=/dev/zero | read x

The bash sub shell is calling mremap to reallocate more and more memory until it finally fails with -ENOMEM (as I expected), or is killed by the system OOM killer (which should not happen under OVERCOMMIT_NEVER mode). But the mremap system call actually failed with -EFAULT, which was a surprise to me; I think it is supposed to be -ENOMEM. Then I wrote this piece of C code, and testing confirmed it:

  https://gist.github.com/crquan/326bde37e1ddda8effe5

  $ ./remap
  allocated one page @0x7f686bf71000, (PAGE_SIZE: 4096)
  grabbed 7680512000 bytes of memory (1875125 pages) @ 00007f6690993000.
  mremap failed Bad address (14).

The -EFAULT comes from the branch of the security_vm_enough_memory_mm failure; underneath it calls __vm_enough_memory, which returns only 0 for success or -ENOMEM. So why does vma_to_resize need to return -EFAULT in this case? This sounds like a mistake to me.

Some more digging into git history:

1) Before commit 119f657c7 ("RLIMIT_AS checking fix") in May 1 2005 (pre 2.6.12 days) it was returning -ENOMEM for this failure;

2) but that commit changed it accidentally, to whatever is preserved in the local ret, which happened to be -EFAULT from a previous assignment;

3) then the commit 54f5de709 ("untangling do_mremap(), part 1") code refactoring made it explicitly return -EFAULT, which should be wrong.

Signed-off-by: Derek Che
Acked-by: David Rientjes
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/mremap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/mremap.c b/mm/mremap.c
index 2dc44b1cb1df..1fb0f9069653 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -381,7 +381,7 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
 	if (vma->vm_flags & VM_ACCOUNT) {
 		unsigned long charged = (new_len - old_len) >> PAGE_SHIFT;
 		if (security_vm_enough_memory_mm(mm, charged))
-			goto Efault;
+			goto Enomem;
 		*p = charged;
 	}
-- 
cgit v1.2.3


From 6cd576130b7152d8f225bab9d21a3f6f96c84b47 Mon Sep 17 00:00:00 2001
From: Derek
Date: Wed, 15 Apr 2015 16:14:02 -0700
Subject: mm/mremap.c: clean up goto just return ERR_PTR

As suggested by Kirill, the "goto"s in vma_to_resize aren't necessary; just change them to explicit returns.

Signed-off-by: Derek Che
Suggested-by: "Kirill A.
Shutemov" Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mremap.c | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/mm/mremap.c b/mm/mremap.c index 1fb0f9069653..034e2d360652 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -345,25 +345,25 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, struct vm_area_struct *vma = find_vma(mm, addr); if (!vma || vma->vm_start > addr) - goto Efault; + return ERR_PTR(-EFAULT); if (is_vm_hugetlb_page(vma)) - goto Einval; + return ERR_PTR(-EINVAL); /* We can't remap across vm area boundaries */ if (old_len > vma->vm_end - addr) - goto Efault; + return ERR_PTR(-EFAULT); /* Need to be careful about a growing mapping */ if (new_len > old_len) { unsigned long pgoff; if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) - goto Efault; + return ERR_PTR(-EFAULT); pgoff = (addr - vma->vm_start) >> PAGE_SHIFT; pgoff += vma->vm_pgoff; if (pgoff + (new_len >> PAGE_SHIFT) < pgoff) - goto Einval; + return ERR_PTR(-EINVAL); } if (vma->vm_flags & VM_LOCKED) { @@ -372,29 +372,20 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, lock_limit = rlimit(RLIMIT_MEMLOCK); locked += new_len - old_len; if (locked > lock_limit && !capable(CAP_IPC_LOCK)) - goto Eagain; + return ERR_PTR(-EAGAIN); } if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT)) - goto Enomem; + return ERR_PTR(-ENOMEM); if (vma->vm_flags & VM_ACCOUNT) { unsigned long charged = (new_len - old_len) >> PAGE_SHIFT; if (security_vm_enough_memory_mm(mm, charged)) - goto Enomem; + return ERR_PTR(-ENOMEM); *p = charged; } return vma; - -Efault: /* very odd choice for most of the cases, but... */ - return ERR_PTR(-EFAULT); -Einval: - return ERR_PTR(-EINVAL); -Enomem: - return ERR_PTR(-ENOMEM); -Eagain: - return ERR_PTR(-EAGAIN); } static unsigned long mremap_to(unsigned long addr, unsigned long old_len, -- cgit v1.2.3 From 9d8c47e4bb1c20dbceee437f9fa7d76dafee80a2 Mon Sep 17 00:00:00 2001 From: Jason Low Date: Wed, 15 Apr 2015 16:14:05 -0700 Subject: mm: use READ_ONCE() for non-scalar types Commit 38c5ce936a08 ("mm/gup: Replace ACCESS_ONCE with READ_ONCE") converted ACCESS_ONCE usage in gup_pmd_range() to READ_ONCE, since ACCESS_ONCE doesn't work reliably on non-scalar types. 
This patch also fixes the other ACCESS_ONCE usages in gup_pte_range() and __get_user_pages_fast() in mm/gup.c.

Signed-off-by: Jason Low
Acked-by: Michal Hocko
Acked-by: Davidlohr Bueso
Acked-by: Rik van Riel
Reviewed-by: Christian Borntraeger
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/gup.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/gup.c b/mm/gup.c
index ca7b607ab671..6297f6bccfb1 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1019,7 +1019,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
 		 *
 		 * for an example see gup_get_pte in arch/x86/mm/gup.c
 		 */
-		pte_t pte = ACCESS_ONCE(*ptep);
+		pte_t pte = READ_ONCE(*ptep);
 		struct page *page;

 		/*
@@ -1309,7 +1309,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
 	local_irq_save(flags);
 	pgdp = pgd_offset(mm, addr);
 	do {
-		pgd_t pgd = ACCESS_ONCE(*pgdp);
+		pgd_t pgd = READ_ONCE(*pgdp);

 		next = pgd_addr_end(addr, end);
 		if (pgd_none(pgd))
-- 
cgit v1.2.3


From 4db0c3c2983cc6b7a08a33542af5e14de8a9258c Mon Sep 17 00:00:00 2001
From: Jason Low
Date: Wed, 15 Apr 2015 16:14:08 -0700
Subject: mm: remove rest of ACCESS_ONCE() usages

We converted some of the usages of ACCESS_ONCE to READ_ONCE in the mm/ tree since it doesn't work reliably on non-scalar types. This patch removes the rest of the usages of ACCESS_ONCE and uses the new READ_ONCE API for the read accesses. This makes things cleaner, instead of using separate/multiple sets of APIs.

Signed-off-by: Jason Low
Acked-by: Michal Hocko
Acked-by: Davidlohr Bueso
Acked-by: Rik van Riel
Reviewed-by: Christian Borntraeger
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/huge_memory.c |  4 ++--
 mm/internal.h    |  4 ++--
 mm/ksm.c         | 10 +++++-----
 mm/memcontrol.c  | 18 +++++++++---------
 mm/memory.c      |  2 +-
 mm/mmap.c        |  8 ++++----
 mm/page_alloc.c  |  6 +++---
 mm/rmap.c        |  6 +++---
 mm/slub.c        |  4 ++--
 mm/swap_state.c  |  2 +-
 mm/swapfile.c    |  2 +-
 11 files changed, 33 insertions(+), 33 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 4914e1b29fdb..1db93fbda06a 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -183,7 +183,7 @@ static struct page *get_huge_zero_page(void)
 	struct page *zero_page;
 retry:
 	if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
-		return ACCESS_ONCE(huge_zero_page);
+		return READ_ONCE(huge_zero_page);

 	zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
 			HPAGE_PMD_ORDER);
@@ -202,7 +202,7 @@ retry:
 	/* We take additional reference here. It will be put back by shrinker */
 	atomic_set(&huge_zero_refcount, 2);
 	preempt_enable();
-	return ACCESS_ONCE(huge_zero_page);
+	return READ_ONCE(huge_zero_page);
 }

 static void put_huge_zero_page(void)

diff --git a/mm/internal.h b/mm/internal.h
index edaab69a9c35..a25e359a4039 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -224,13 +224,13 @@ static inline unsigned long page_order(struct page *page)
  * PageBuddy() should be checked first by the caller to minimize race window,
  * and invalid values must be handled gracefully.
  *
- * ACCESS_ONCE is used so that if the caller assigns the result into a local
+ * READ_ONCE is used so that if the caller assigns the result into a local
  * variable and e.g. tests it for valid range before using, the compiler cannot
  * decide to remove the variable and inline the page_private(page) multiple
  * times, potentially observing different values in the tests and the actual
  * use of the result.
*/ -#define page_order_unsafe(page) ACCESS_ONCE(page_private(page)) +#define page_order_unsafe(page) READ_ONCE(page_private(page)) static inline bool is_cow_mapping(vm_flags_t flags) { diff --git a/mm/ksm.c b/mm/ksm.c index 4162dce2eb44..7ee101eaacdf 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -542,7 +542,7 @@ static struct page *get_ksm_page(struct stable_node *stable_node, bool lock_it) expected_mapping = (void *)stable_node + (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); again: - kpfn = ACCESS_ONCE(stable_node->kpfn); + kpfn = READ_ONCE(stable_node->kpfn); page = pfn_to_page(kpfn); /* @@ -551,7 +551,7 @@ again: * but on Alpha we need to be more careful. */ smp_read_barrier_depends(); - if (ACCESS_ONCE(page->mapping) != expected_mapping) + if (READ_ONCE(page->mapping) != expected_mapping) goto stale; /* @@ -577,14 +577,14 @@ again: cpu_relax(); } - if (ACCESS_ONCE(page->mapping) != expected_mapping) { + if (READ_ONCE(page->mapping) != expected_mapping) { put_page(page); goto stale; } if (lock_it) { lock_page(page); - if (ACCESS_ONCE(page->mapping) != expected_mapping) { + if (READ_ONCE(page->mapping) != expected_mapping) { unlock_page(page); put_page(page); goto stale; @@ -600,7 +600,7 @@ stale: * before checking whether node->kpfn has been changed. */ smp_rmb(); - if (ACCESS_ONCE(stable_node->kpfn) != kpfn) + if (READ_ONCE(stable_node->kpfn) != kpfn) goto again; remove_node_from_stable_tree(stable_node); return NULL; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 74a9641d8f9f..14c2f2017e37 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -674,7 +674,7 @@ static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, static unsigned long soft_limit_excess(struct mem_cgroup *memcg) { unsigned long nr_pages = page_counter_read(&memcg->memory); - unsigned long soft_limit = ACCESS_ONCE(memcg->soft_limit); + unsigned long soft_limit = READ_ONCE(memcg->soft_limit); unsigned long excess = 0; if (nr_pages > soft_limit) @@ -1042,7 +1042,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, goto out_unlock; do { - pos = ACCESS_ONCE(iter->position); + pos = READ_ONCE(iter->position); /* * A racing update may change the position and * put the last reference, hence css_tryget(), @@ -1359,13 +1359,13 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) unsigned long limit; count = page_counter_read(&memcg->memory); - limit = ACCESS_ONCE(memcg->memory.limit); + limit = READ_ONCE(memcg->memory.limit); if (count < limit) margin = limit - count; if (do_swap_account) { count = page_counter_read(&memcg->memsw); - limit = ACCESS_ONCE(memcg->memsw.limit); + limit = READ_ONCE(memcg->memsw.limit); if (count <= limit) margin = min(margin, limit - count); } @@ -2637,7 +2637,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep) return cachep; memcg = get_mem_cgroup_from_mm(current->mm); - kmemcg_id = ACCESS_ONCE(memcg->kmemcg_id); + kmemcg_id = READ_ONCE(memcg->kmemcg_id); if (kmemcg_id < 0) goto out; @@ -5007,7 +5007,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, * tunable will only affect upcoming migrations, not the current one. * So we need to save it, and keep it going. 
*/ - move_flags = ACCESS_ONCE(memcg->move_charge_at_immigrate); + move_flags = READ_ONCE(memcg->move_charge_at_immigrate); if (move_flags) { struct mm_struct *mm; struct mem_cgroup *from = mem_cgroup_from_task(p); @@ -5241,7 +5241,7 @@ static u64 memory_current_read(struct cgroup_subsys_state *css, static int memory_low_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - unsigned long low = ACCESS_ONCE(memcg->low); + unsigned long low = READ_ONCE(memcg->low); if (low == PAGE_COUNTER_MAX) seq_puts(m, "max\n"); @@ -5271,7 +5271,7 @@ static ssize_t memory_low_write(struct kernfs_open_file *of, static int memory_high_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - unsigned long high = ACCESS_ONCE(memcg->high); + unsigned long high = READ_ONCE(memcg->high); if (high == PAGE_COUNTER_MAX) seq_puts(m, "max\n"); @@ -5301,7 +5301,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, static int memory_max_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - unsigned long max = ACCESS_ONCE(memcg->memory.limit); + unsigned long max = READ_ONCE(memcg->memory.limit); if (max == PAGE_COUNTER_MAX) seq_puts(m, "max\n"); diff --git a/mm/memory.c b/mm/memory.c index ac20b2a6a0c3..656593f73c8e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2845,7 +2845,7 @@ static void do_fault_around(struct vm_area_struct *vma, unsigned long address, struct vm_fault vmf; int off; - nr_pages = ACCESS_ONCE(fault_around_bytes) >> PAGE_SHIFT; + nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT; mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK; start_addr = max(address & mask, vma->vm_start); diff --git a/mm/mmap.c b/mm/mmap.c index 06a6076c92e5..e65cbe0d64fc 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1133,7 +1133,7 @@ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct * * by another page fault trying to merge _that_. But that's ok: if it * is being set up, that automatically means that it will be a singleton * acceptable for merging, so we can do all of this optimistically. But - * we do that ACCESS_ONCE() to make sure that we never re-load the pointer. + * we do that READ_ONCE() to make sure that we never re-load the pointer. 
* * IOW: that the "list_is_singular()" test on the anon_vma_chain only * matters for the 'stable anon_vma' case (ie the thing we want to avoid @@ -1147,7 +1147,7 @@ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct * static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b) { if (anon_vma_compatible(a, b)) { - struct anon_vma *anon_vma = ACCESS_ONCE(old->anon_vma); + struct anon_vma *anon_vma = READ_ONCE(old->anon_vma); if (anon_vma && list_is_singular(&old->anon_vma_chain)) return anon_vma; @@ -2100,7 +2100,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns actual_size = size; if (size && (vma->vm_flags & (VM_GROWSUP | VM_GROWSDOWN))) actual_size -= PAGE_SIZE; - if (actual_size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur)) + if (actual_size > READ_ONCE(rlim[RLIMIT_STACK].rlim_cur)) return -ENOMEM; /* mlock limit tests */ @@ -2108,7 +2108,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns unsigned long locked; unsigned long limit; locked = mm->locked_vm + grow; - limit = ACCESS_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur); + limit = READ_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur); limit >>= PAGE_SHIFT; if (locked > limit && !capable(CAP_IPC_LOCK)) return -ENOMEM; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1b849500640c..ebffa0e4a9c0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1371,7 +1371,7 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) int to_drain, batch; local_irq_save(flags); - batch = ACCESS_ONCE(pcp->batch); + batch = READ_ONCE(pcp->batch); to_drain = min(pcp->count, batch); if (to_drain > 0) { free_pcppages_bulk(zone, to_drain, pcp); @@ -1570,7 +1570,7 @@ void free_hot_cold_page(struct page *page, bool cold) list_add_tail(&page->lru, &pcp->lists[migratetype]); pcp->count++; if (pcp->count >= pcp->high) { - unsigned long batch = ACCESS_ONCE(pcp->batch); + unsigned long batch = READ_ONCE(pcp->batch); free_pcppages_bulk(zone, batch, pcp); pcp->count -= batch; } @@ -6207,7 +6207,7 @@ void set_pfnblock_flags_mask(struct page *page, unsigned long flags, mask <<= (BITS_PER_LONG - bitidx - 1); flags <<= (BITS_PER_LONG - bitidx - 1); - word = ACCESS_ONCE(bitmap[word_bitidx]); + word = READ_ONCE(bitmap[word_bitidx]); for (;;) { old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags); if (word == old_word) diff --git a/mm/rmap.c b/mm/rmap.c index c161a14b6a8f..24dd3f9fee27 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -456,7 +456,7 @@ struct anon_vma *page_get_anon_vma(struct page *page) unsigned long anon_mapping; rcu_read_lock(); - anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping); + anon_mapping = (unsigned long)READ_ONCE(page->mapping); if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) goto out; if (!page_mapped(page)) @@ -500,14 +500,14 @@ struct anon_vma *page_lock_anon_vma_read(struct page *page) unsigned long anon_mapping; rcu_read_lock(); - anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping); + anon_mapping = (unsigned long)READ_ONCE(page->mapping); if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) goto out; if (!page_mapped(page)) goto out; anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); - root_anon_vma = ACCESS_ONCE(anon_vma->root); + root_anon_vma = READ_ONCE(anon_vma->root); if (down_read_trylock(&root_anon_vma->rwsem)) { /* * If the page is still mapped, then this anon_vma is still diff --git a/mm/slub.c b/mm/slub.c index 
0fdd6c1e1f82..54c0876b43d5 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4277,7 +4277,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, int node; struct page *page; - page = ACCESS_ONCE(c->page); + page = READ_ONCE(c->page); if (!page) continue; @@ -4292,7 +4292,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, total += x; nodes[node] += x; - page = ACCESS_ONCE(c->partial); + page = READ_ONCE(c->partial); if (page) { node = page_to_nid(page); if (flags & SO_TOTAL) diff --git a/mm/swap_state.c b/mm/swap_state.c index 405923f77334..8bc8e66138da 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -390,7 +390,7 @@ static unsigned long swapin_nr_pages(unsigned long offset) unsigned int pages, max_pages, last_ra; static atomic_t last_readahead_pages; - max_pages = 1 << ACCESS_ONCE(page_cluster); + max_pages = 1 << READ_ONCE(page_cluster); if (max_pages <= 1) return 1; diff --git a/mm/swapfile.c b/mm/swapfile.c index 63f55ccb9b26..a7e72103f23b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1312,7 +1312,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, else continue; } - count = ACCESS_ONCE(si->swap_map[i]); + count = READ_ONCE(si->swap_map[i]); if (count && swap_count(count) != SWAP_MAP_BAD) break; } -- cgit v1.2.3 From ee1462458cb543bbcfd379176bbba0d4bd052b7f Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 15 Apr 2015 16:14:11 -0700 Subject: fs, jfs: remove slab object constructor Mempools based on slab caches with object constructors are risky because element allocation can happen either from the slab cache itself, meaning the constructor is properly called before returning, or from the mempool reserve pool, meaning the constructor is not called before returning, depending on the allocation context. For this reason, we should disallow creating mempools based on slab caches that have object constructors. Callers of mempool_alloc() will be responsible for properly initializing the returned element. Then, it doesn't matter if the element came from the slab cache or the mempool reserved pool. The only occurrence of a mempool being based on a slab cache with an object constructor in the tree is in fs/jfs/jfs_metapage.c. Remove it and properly initialize the element in alloc_metapage(). At the same time, META_free is never used, so remove it as well. 
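
The hazard is easiest to see at the allocation site (an editorial sketch of mempool_alloc() behaviour, with the retry/wait logic elided; remove_element() is the actual helper in mm/mempool.c):

	element = pool->alloc(gfp_mask, pool->pool_data);	/* kmem_cache_alloc():
								   ctor has run */
	if (!element)
		element = remove_element(pool);			/* reserve pool: object
								   is returned exactly as
								   it was freed, ctor does
								   NOT run again */

	/* So the element's initial state depends on the allocation context,
	 * and callers must initialize every field themselves, as the jfs
	 * change below now does in alloc_metapage(). */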
Signed-off-by: David Rientjes Acked-by: Dave Kleikamp Cc: Christoph Hellwig Cc: Sebastian Ott Cc: Mikulas Patocka Cc: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/jfs/jfs_metapage.c | 31 ++++++++++++------------------- fs/jfs/jfs_metapage.h | 1 - 2 files changed, 12 insertions(+), 20 deletions(-) diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index 49ba7ff1bbb9..16a0922beb59 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -183,30 +183,23 @@ static inline void remove_metapage(struct page *page, struct metapage *mp) #endif -static void init_once(void *foo) -{ - struct metapage *mp = (struct metapage *)foo; - - mp->lid = 0; - mp->lsn = 0; - mp->flag = 0; - mp->data = NULL; - mp->clsn = 0; - mp->log = NULL; - set_bit(META_free, &mp->flag); - init_waitqueue_head(&mp->wait); -} - static inline struct metapage *alloc_metapage(gfp_t gfp_mask) { - return mempool_alloc(metapage_mempool, gfp_mask); + struct metapage *mp = mempool_alloc(metapage_mempool, gfp_mask); + + if (mp) { + mp->lid = 0; + mp->lsn = 0; + mp->data = NULL; + mp->clsn = 0; + mp->log = NULL; + init_waitqueue_head(&mp->wait); + } + return mp; } static inline void free_metapage(struct metapage *mp) { - mp->flag = 0; - set_bit(META_free, &mp->flag); - mempool_free(mp, metapage_mempool); } @@ -216,7 +209,7 @@ int __init metapage_init(void) * Allocate the metapage structures */ metapage_cache = kmem_cache_create("jfs_mp", sizeof(struct metapage), - 0, 0, init_once); + 0, 0, NULL); if (metapage_cache == NULL) return -ENOMEM; diff --git a/fs/jfs/jfs_metapage.h b/fs/jfs/jfs_metapage.h index a78beda85f68..337e9e51ac06 100644 --- a/fs/jfs/jfs_metapage.h +++ b/fs/jfs/jfs_metapage.h @@ -48,7 +48,6 @@ struct metapage { /* metapage flag */ #define META_locked 0 -#define META_free 1 #define META_dirty 2 #define META_sync 3 #define META_discard 4 -- cgit v1.2.3 From e244c9e66f6197f55f6fbb2d5e70714e262cc595 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 15 Apr 2015 16:14:14 -0700 Subject: mm, mempool: disallow mempools based on slab caches with constructors All occurrences of mempools based on slab caches with object constructors have been removed from the tree, so disallow creating them. We can only dereference mem->ctor in mm/mempool.c, since include/linux/mempool.h cannot include mm/slab.h. So simply note the restriction, just like the comment restricting usage of __GFP_ZERO, and warn on kernels with CONFIG_DEBUG_VM if such a mempool is allocated from. We don't want to incur this check on every element allocation, so use VM_BUG_ON(). Signed-off-by: David Rientjes Cc: Dave Kleikamp Cc: Christoph Hellwig Cc: Sebastian Ott Cc: Mikulas Patocka Cc: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempool.h | 3 ++- mm/mempool.c | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/include/linux/mempool.h b/include/linux/mempool.h index b19b3023c880..69b6951e8fd2 100644 --- a/include/linux/mempool.h +++ b/include/linux/mempool.h @@ -36,7 +36,8 @@ extern void mempool_free(void *element, mempool_t *pool); /* * A mempool_alloc_t and mempool_free_t that get the memory from - * a slab that is passed in through pool_data. + * a slab cache that is passed in through pool_data. + * Note: the slab cache may not have a ctor function. 
*/ void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data); void mempool_free_slab(void *element, void *pool_data); diff --git a/mm/mempool.c b/mm/mempool.c index 949970db2874..b60fb85526ed 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -15,6 +15,7 @@ #include #include #include +#include "slab.h" static void add_element(mempool_t *pool, void *element) { @@ -334,6 +335,7 @@ EXPORT_SYMBOL(mempool_free); void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data) { struct kmem_cache *mem = pool_data; + VM_BUG_ON(mem->ctor); return kmem_cache_alloc(mem, gfp_mask); } EXPORT_SYMBOL(mempool_alloc_slab); -- cgit v1.2.3 From bdfedb76f4f5aa5e37380e3b71adee4a39f30fc6 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 15 Apr 2015 16:14:17 -0700 Subject: mm, mempool: poison elements backed by slab allocator Mempools keep elements in a reserved pool for contexts in which allocation may not be possible. When an element is allocated from the reserved pool, its memory contents is the same as when it was added to the reserved pool. Because of this, elements lack any free poisoning to detect use-after-free errors. This patch adds free poisoning for elements backed by the slab allocator. This is possible because the mempool layer knows the object size of each element. When an element is added to the reserved pool, it is poisoned with POISON_FREE. When it is removed from the reserved pool, the contents are checked for POISON_FREE. If there is a mismatch, a warning is emitted to the kernel log. This is only effective for configs with CONFIG_DEBUG_SLAB or CONFIG_SLUB_DEBUG_ON. [fabio.estevam@freescale.com: use '%zu' for printing 'size_t' variable] [arnd@arndb.de: add missing include] Signed-off-by: David Rientjes Cc: Dave Kleikamp Cc: Christoph Hellwig Cc: Sebastian Ott Cc: Mikulas Patocka Cc: Catalin Marinas Signed-off-by: Fabio Estevam Signed-off-by: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempool.c | 92 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 90 insertions(+), 2 deletions(-) diff --git a/mm/mempool.c b/mm/mempool.c index b60fb85526ed..2884d5bad77e 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -6,10 +6,12 @@ * extreme VM load. * * started by Ingo Molnar, Copyright (C) 2001 + * debugging by David Rientjes, Copyright (C) 2015 */ #include #include +#include #include #include #include @@ -17,16 +19,102 @@ #include #include "slab.h" +#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB_DEBUG_ON) +static void poison_error(mempool_t *pool, void *element, size_t size, + size_t byte) +{ + const int nr = pool->curr_nr; + const int start = max_t(int, byte - (BITS_PER_LONG / 8), 0); + const int end = min_t(int, byte + (BITS_PER_LONG / 8), size); + int i; + + pr_err("BUG: mempool element poison mismatch\n"); + pr_err("Mempool %p size %zu\n", pool, size); + pr_err(" nr=%d @ %p: %s0x", nr, element, start > 0 ? "... " : ""); + for (i = start; i < end; i++) + pr_cont("%x ", *(u8 *)(element + i)); + pr_cont("%s\n", end < size ? "..." : ""); + dump_stack(); +} + +static void __check_element(mempool_t *pool, void *element, size_t size) +{ + u8 *obj = element; + size_t i; + + for (i = 0; i < size; i++) { + u8 exp = (i < size - 1) ? 
POISON_FREE : POISON_END; + + if (obj[i] != exp) { + poison_error(pool, element, size, i); + return; + } + } + memset(obj, POISON_INUSE, size); +} + +static void check_element(mempool_t *pool, void *element) +{ + /* Mempools backed by slab allocator */ + if (pool->free == mempool_free_slab || pool->free == mempool_kfree) + __check_element(pool, element, ksize(element)); + + /* Mempools backed by page allocator */ + if (pool->free == mempool_free_pages) { + int order = (int)(long)pool->pool_data; + void *addr = kmap_atomic((struct page *)element); + + __check_element(pool, addr, 1UL << (PAGE_SHIFT + order)); + kunmap_atomic(addr); + } +} + +static void __poison_element(void *element, size_t size) +{ + u8 *obj = element; + + memset(obj, POISON_FREE, size - 1); + obj[size - 1] = POISON_END; +} + +static void poison_element(mempool_t *pool, void *element) +{ + /* Mempools backed by slab allocator */ + if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) + __poison_element(element, ksize(element)); + + /* Mempools backed by page allocator */ + if (pool->alloc == mempool_alloc_pages) { + int order = (int)(long)pool->pool_data; + void *addr = kmap_atomic((struct page *)element); + + __poison_element(addr, 1UL << (PAGE_SHIFT + order)); + kunmap_atomic(addr); + } +} +#else /* CONFIG_DEBUG_SLAB || CONFIG_SLUB_DEBUG_ON */ +static inline void check_element(mempool_t *pool, void *element) +{ +} +static inline void poison_element(mempool_t *pool, void *element) +{ +} +#endif /* CONFIG_DEBUG_SLAB || CONFIG_SLUB_DEBUG_ON */ + static void add_element(mempool_t *pool, void *element) { BUG_ON(pool->curr_nr >= pool->min_nr); + poison_element(pool, element); pool->elements[pool->curr_nr++] = element; } static void *remove_element(mempool_t *pool) { - BUG_ON(pool->curr_nr <= 0); - return pool->elements[--pool->curr_nr]; + void *element = pool->elements[--pool->curr_nr]; + + BUG_ON(pool->curr_nr < 0); + check_element(pool, element); + return element; } /** -- cgit v1.2.3 From 65ebb64f4d2ce8eba4d0ec82d6cf65022e70e4a1 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 15 Apr 2015 16:14:20 -0700 Subject: thp: handle errors in hugepage_init() properly We miss error handling in a few cases in hugepage_init(). Let's fix that. Signed-off-by: Kirill A. 
Shutemov Cc: Andrea Arcangeli Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1db93fbda06a..c257006749bb 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -67,6 +67,7 @@ static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1; static int khugepaged(void *none); static int khugepaged_slab_init(void); +static void khugepaged_slab_exit(void); #define MM_SLOTS_HASH_BITS 10 static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); @@ -634,13 +635,15 @@ static int __init hugepage_init(void) err = hugepage_init_sysfs(&hugepage_kobj); if (err) - return err; + goto err_sysfs; err = khugepaged_slab_init(); if (err) - goto out; + goto err_slab; - register_shrinker(&huge_zero_page_shrinker); + err = register_shrinker(&huge_zero_page_shrinker); + if (err) + goto err_hzp_shrinker; /* * By default disable transparent hugepages on smaller systems, @@ -650,11 +653,18 @@ static int __init hugepage_init(void) if (totalram_pages < (512 << (20 - PAGE_SHIFT))) transparent_hugepage_flags = 0; - start_khugepaged(); + err = start_khugepaged(); + if (err) + goto err_khugepaged; return 0; -out: +err_khugepaged: + unregister_shrinker(&huge_zero_page_shrinker); +err_hzp_shrinker: + khugepaged_slab_exit(); +err_slab: hugepage_exit_sysfs(hugepage_kobj); +err_sysfs: return err; } subsys_initcall(hugepage_init); @@ -1974,6 +1984,11 @@ static int __init khugepaged_slab_init(void) return 0; } +static void __init khugepaged_slab_exit(void) +{ + kmem_cache_destroy(mm_slot_cache); +} + static inline struct mm_slot *alloc_mm_slot(void) { if (!mm_slot_cache) /* initialization failed */ -- cgit v1.2.3 From ae7efa507dee4813ec4bcdadebeec191e247d0c9 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 15 Apr 2015 16:14:23 -0700 Subject: thp: do not adjust zone water marks if khugepaged is not started set_recommended_min_free_kbytes() adjusts zone water marks to be suitable for khugepaged. We avoid doing this if khugepaged is disabled, but don't catch the case when khugepaged fails to start. Let's address this by checking khugepaged_thread instead of khugepaged_enabled() in set_recommended_min_free_kbytes(). It's NULL if the kernel thread is stopped or failed to start. Signed-off-by: Kirill A. Shutemov Cc: David Rientjes Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index c257006749bb..0af19fffe1df 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -110,7 +110,8 @@ static int set_recommended_min_free_kbytes(void) int nr_zones = 0; unsigned long recommended_min; - if (!khugepaged_enabled()) + /* khugepaged thread has stopped or failed to start */ + if (!khugepaged_thread) return 0; for_each_populated_zone(zone) -- cgit v1.2.3 From 80d6b94bd69a7a49b52bf503ef6a841f43cf5bbb Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 15 Apr 2015 16:14:26 -0700 Subject: mm, doc: cleanup and clarify munmap behavior for hugetlb memory munmap(2) of hugetlb memory requires a length that is hugepage aligned, otherwise it may fail. Add this to the documentation. This also cleans up the documentation and separates it into logical units: one part refers to MAP_HUGETLB and another part refers to requirements for shared memory segments. 
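To illustrate the rule being documented, a minimal userspace sketch (it assumes a 2MB default hugepage size; check /proc/meminfo on a real system):

	#define _GNU_SOURCE
	#include <stdio.h>
	#include <sys/mman.h>

	#define HPAGE_SIZE (2UL * 1024 * 1024)	/* assumed hugepage size */

	int main(void)
	{
		void *p = mmap(NULL, HPAGE_SIZE, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);

		if (p == MAP_FAILED) {
			perror("mmap");
			return 1;
		}
		/*
		 * The munmap() length must be hugepage aligned here; passing
		 * e.g. 4096 may fail with EINVAL because the memory is backed
		 * by a hugetlb page.
		 */
		if (munmap(p, HPAGE_SIZE)) {
			perror("munmap");
			return 1;
		}
		return 0;
	}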
Signed-off-by: David Rientjes Cc: Jonathan Corbet Cc: Davide Libenzi Cc: Luiz Capitulino Cc: Shuah Khan Acked-by: Hugh Dickins Cc: Andrea Arcangeli Cc: Joern Engel Cc: Jianguo Wu Cc: Eric B Munson Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/hugetlbpage.txt | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/Documentation/vm/hugetlbpage.txt b/Documentation/vm/hugetlbpage.txt index b32b9cd50c0e..030977fb8d2d 100644 --- a/Documentation/vm/hugetlbpage.txt +++ b/Documentation/vm/hugetlbpage.txt @@ -302,15 +302,23 @@ file systems, write system calls are not. Regular chown, chgrp, and chmod commands (with right permissions) could be used to change the file attributes on hugetlbfs. -Also, it is important to note that no such mount command is required if the +Also, it is important to note that no such mount command is required if applications are going to use only shmat/shmget system calls or mmap with -MAP_HUGETLB. Users who wish to use hugetlb page via shared memory segment -should be a member of a supplementary group and system admin needs to -configure that gid into /proc/sys/vm/hugetlb_shm_group. It is possible for -same or different applications to use any combination of mmaps and shm* -calls, though the mount of filesystem will be required for using mmap calls -without MAP_HUGETLB. For an example of how to use mmap with MAP_HUGETLB see -map_hugetlb.c. +MAP_HUGETLB. For an example of how to use mmap with MAP_HUGETLB see map_hugetlb +below. + +Users who wish to use hugetlb memory via shared memory segment should be a +member of a supplementary group and system admin needs to configure that gid +into /proc/sys/vm/hugetlb_shm_group. It is possible for same or different +applications to use any combination of mmaps and shm* calls, though the mount of +filesystem will be required for using mmap calls without MAP_HUGETLB. + +Syscalls that operate on memory backed by hugetlb pages only have their lengths +aligned to the native page size of the processor; they will normally fail with +errno set to EINVAL or exclude hugetlb pages that extend beyond the length if +not hugepage aligned. For example, munmap(2) will fail if memory is backed by +a hugetlb page and the length is smaller than the hugepage size. + Examples ======== -- cgit v1.2.3 From 215ba78115f82ad5f8faedac98cc42e572733b8a Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 15 Apr 2015 16:14:29 -0700 Subject: mm, selftests: test return value of munmap for MAP_HUGETLB memory When MAP_HUGETLB memory is unmapped, the length must be hugepage aligned, otherwise it fails with -EINVAL. All tests currently behave correctly, but it's better to explicitly test the return value for completeness and document the requirement, especially if users copy map_hugetlb.c as a sample implementation. 
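When the hugepage size is not known at compile time, one defensive pattern is to round the length up before unmapping (a sketch only; addr, len and hpage_size are hypothetical, with hpage_size assumed to have been parsed from /proc/meminfo):

	#include <stdio.h>
	#include <sys/mman.h>

	/* hpage must be a power of two */
	static size_t hugepage_align_up(size_t len, size_t hpage)
	{
		return (len + hpage - 1) & ~(hpage - 1);
	}

	/* ... */
		if (munmap(addr, hugepage_align_up(len, hpage_size)))
			perror("munmap");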
Signed-off-by: David Rientjes Cc: Jonathan Corbet Cc: Davide Libenzi Cc: Luiz Capitulino Cc: Shuah Khan Cc: Hugh Dickins Cc: Andrea Arcangeli Cc: Joern Engel Cc: Jianguo Wu Cc: Eric B Munson Acked-by: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/powerpc/mm/hugetlb_vs_thp_test.c | 8 ++++++-- tools/testing/selftests/vm/hugetlbfstest.c | 4 +++- tools/testing/selftests/vm/map_hugetlb.c | 6 +++++- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/powerpc/mm/hugetlb_vs_thp_test.c b/tools/testing/selftests/powerpc/mm/hugetlb_vs_thp_test.c index 3d8e5b033e1d..49003674de4f 100644 --- a/tools/testing/selftests/powerpc/mm/hugetlb_vs_thp_test.c +++ b/tools/testing/selftests/powerpc/mm/hugetlb_vs_thp_test.c @@ -21,9 +21,13 @@ static int test_body(void) * Typically the mmap will fail because no huge pages are * allocated on the system. But if there are huge pages * allocated the mmap will succeed. That's fine too, we just - * munmap here before continuing. + * munmap here before continuing. munmap() length of + * MAP_HUGETLB memory must be hugepage aligned. */ - munmap(addr, SIZE); + if (munmap(addr, SIZE)) { + perror("munmap"); + return 1; + } } p = mmap(addr, SIZE, PROT_READ | PROT_WRITE, diff --git a/tools/testing/selftests/vm/hugetlbfstest.c b/tools/testing/selftests/vm/hugetlbfstest.c index ea40ff8c2391..02e1072ec187 100644 --- a/tools/testing/selftests/vm/hugetlbfstest.c +++ b/tools/testing/selftests/vm/hugetlbfstest.c @@ -34,6 +34,7 @@ static void do_mmap(int fd, int extra_flags, int unmap) int *p; int flags = MAP_PRIVATE | MAP_POPULATE | extra_flags; u64 before, after; + int ret; before = read_rss(); p = mmap(NULL, length, PROT_READ | PROT_WRITE, flags, fd, 0); @@ -44,7 +45,8 @@ static void do_mmap(int fd, int extra_flags, int unmap) !"rss didn't grow as expected"); if (!unmap) return; - munmap(p, length); + ret = munmap(p, length); + assert(!ret || !"munmap returned an unexpected error"); after = read_rss(); assert(llabs(after - before) < 0x40000 || !"rss didn't shrink as expected"); diff --git a/tools/testing/selftests/vm/map_hugetlb.c b/tools/testing/selftests/vm/map_hugetlb.c index ac56639dd4a9..addcd6fc1ecc 100644 --- a/tools/testing/selftests/vm/map_hugetlb.c +++ b/tools/testing/selftests/vm/map_hugetlb.c @@ -73,7 +73,11 @@ int main(void) write_bytes(addr); ret = read_bytes(addr); - munmap(addr, LENGTH); + /* munmap() length of MAP_HUGETLB memory must be hugepage aligned */ + if (munmap(addr, LENGTH)) { + perror("munmap"); + exit(1); + } return ret; } -- cgit v1.2.3 From 9fcd145717e6496d0c376acb1a5cc551a716c305 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 15 Apr 2015 16:14:32 -0700 Subject: mm/mmap.c: use while instead of if+goto The creators of the C language gave us the while keyword. Let's use that instead of synthesizing it from if+goto. Made possible by 6597d783397a ("mm/mmap.c: replace find_vma_prepare() with clearer find_vma_links()"). [akpm@linux-foundation.org: fix 80-col overflows] Signed-off-by: Rasmus Villemoes Cc: "Kirill A. 
Shutemov" Cc: Sasha Levin Cc: Cyrill Gorcunov Cc: Roman Gushchin Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index e65cbe0d64fc..bb50cacc3ea5 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1551,11 +1551,10 @@ unsigned long mmap_region(struct file *file, unsigned long addr, /* Clear old maps */ error = -ENOMEM; -munmap_back: - if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { + while (find_vma_links(mm, addr, addr + len, &prev, &rb_link, + &rb_parent)) { if (do_munmap(mm, addr, len)) return -ENOMEM; - goto munmap_back; } /* @@ -1571,7 +1570,8 @@ munmap_back: /* * Can we just expand an old mapping? */ - vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, NULL); + vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, + NULL); if (vma) goto out; @@ -2739,11 +2739,10 @@ static unsigned long do_brk(unsigned long addr, unsigned long len) /* * Clear old maps. this also does some error checking for us */ - munmap_back: - if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { + while (find_vma_links(mm, addr, addr + len, &prev, &rb_link, + &rb_parent)) { if (do_munmap(mm, addr, len)) return -ENOMEM; - goto munmap_back; } /* Check against address space limits *after* clearing old maps... */ -- cgit v1.2.3 From 822fc61367f062d36c5b5a4d517e9bd2b65a741f Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 15 Apr 2015 16:14:35 -0700 Subject: mm: don't call __page_cache_release for hugetlb __put_compound_page() calls __page_cache_release() to do some freeing work, but it's obviously for thps, not for hugetlb. We don't care because PageLRU is always cleared and page->mem_cgroup is always NULL for hugetlb. But it's not correct and has potential risks, so let's make it conditional. Signed-off-by: Naoya Horiguchi Cc: Hugh Dickins Reviewed-by: Michal Hocko Cc: Mel Gorman Cc: Johannes Weiner Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/mm/swap.c b/mm/swap.c index e3a4feac9b0e..a7251a8ed532 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -31,6 +31,7 @@ #include #include #include +#include #include "internal.h" @@ -75,7 +76,14 @@ static void __put_compound_page(struct page *page) { compound_page_dtor *dtor; - __page_cache_release(page); + /* + * __page_cache_release() is supposed to be called for thp, not for + * hugetlb. This is because hugetlb page does never have PageLRU set + * (it's never listed to any LRU lists) and no memcg routines should + * be called for hugetlb (it has a separate hugetlb_cgroup.) + */ + if (!PageHuge(page)) + __page_cache_release(page); dtor = get_compound_page_dtor(page); (*dtor)(page); } -- cgit v1.2.3 From bcc54222309c70ebcb6c69c156fba4a13dee0a3b Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 15 Apr 2015 16:14:38 -0700 Subject: mm: hugetlb: introduce page_huge_active We are not safe from calling isolate_huge_page() on a hugepage concurrently, which can make the victim hugepage in invalid state and results in BUG_ON(). The root problem of this is that we don't have any information on struct page (so easily accessible) about hugepages' activeness. Note that hugepages' activeness means just being linked to hstate->hugepage_activelist, which is not the same as normal pages' activeness represented by PageActive flag. 
Normal pages are isolated by isolate_lru_page() which prechecks PageLRU before isolation, so let's do similarly for hugetlb with a new page_huge_active(). set/clear_page_huge_active() should be called within hugetlb_lock. But hugetlb_cow() and hugetlb_no_page() don't do this, being justified because in these functions set_page_huge_active() is called right after the hugepage is allocated and no other thread tries to isolate it. [akpm@linux-foundation.org: s/PageHugeActive/page_huge_active/, make it return bool] [fengguang.wu@intel.com: set_page_huge_active() can be static] Signed-off-by: Naoya Horiguchi Cc: Hugh Dickins Reviewed-by: Michal Hocko Cc: Mel Gorman Cc: Johannes Weiner Cc: David Rientjes Signed-off-by: Fengguang Wu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 41 ++++++++++++++++++++++++++++++++++++++--- mm/memory-failure.c | 14 ++++++++++++-- 2 files changed, 50 insertions(+), 5 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 995c8d65a95c..05407831016b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -924,6 +924,31 @@ struct hstate *size_to_hstate(unsigned long size) return NULL; } +/* + * Test to determine whether the hugepage is "active/in-use" (i.e. being linked + * to hstate->hugepage_activelist.) + * + * This function can be called for tail pages, but never returns true for them. + */ +bool page_huge_active(struct page *page) +{ + VM_BUG_ON_PAGE(!PageHuge(page), page); + return PageHead(page) && PagePrivate(&page[1]); +} + +/* never called for tail page */ +static void set_page_huge_active(struct page *page) +{ + VM_BUG_ON_PAGE(!PageHeadHuge(page), page); + SetPagePrivate(&page[1]); +} + +static void clear_page_huge_active(struct page *page) +{ + VM_BUG_ON_PAGE(!PageHeadHuge(page), page); + ClearPagePrivate(&page[1]); +} + void free_huge_page(struct page *page) { /* @@ -952,6 +977,7 @@ void free_huge_page(struct page *page) restore_reserve = true; spin_lock(&hugetlb_lock); + clear_page_huge_active(page); hugetlb_cgroup_uncharge_page(hstate_index(h), pages_per_huge_page(h), page); if (restore_reserve) @@ -2972,6 +2998,7 @@ retry_avoidcopy: copy_user_huge_page(new_page, old_page, address, vma, pages_per_huge_page(h)); __SetPageUptodate(new_page); + set_page_huge_active(new_page); mmun_start = address & huge_page_mask(h); mmun_end = mmun_start + huge_page_size(h); @@ -3084,6 +3111,7 @@ retry: } clear_huge_page(page, address, pages_per_huge_page(h)); __SetPageUptodate(page); + set_page_huge_active(page); if (vma->vm_flags & VM_MAYSHARE) { int err; @@ -3913,19 +3941,26 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage) bool isolate_huge_page(struct page *page, struct list_head *list) { + bool ret = true; + VM_BUG_ON_PAGE(!PageHead(page), page); - if (!get_page_unless_zero(page)) - return false; spin_lock(&hugetlb_lock); + if (!page_huge_active(page) || !get_page_unless_zero(page)) { + ret = false; + goto unlock; + } + clear_page_huge_active(page); list_move_tail(&page->lru, list); +unlock: spin_unlock(&hugetlb_lock); - return true; + return ret; } void putback_active_hugepage(struct page *page) { VM_BUG_ON_PAGE(!PageHead(page), page); spin_lock(&hugetlb_lock); + set_page_huge_active(page); list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist); spin_unlock(&hugetlb_lock); put_page(page); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 5fd8931d8c31..d9359b770cd9 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1586,8 +1586,18 @@ static int soft_offline_huge_page(struct page *page, int 
flags) } unlock_page(hpage); - /* Keep page count to indicate a given hugepage is isolated. */ - list_move(&hpage->lru, &pagelist); + ret = isolate_huge_page(hpage, &pagelist); + if (ret) { + /* + * get_any_page() and isolate_huge_page() takes a refcount each, + * so need to drop one here. + */ + put_page(hpage); + } else { + pr_info("soft offline: %#lx hugepage failed to isolate\n", pfn); + return -EBUSY; + } + ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, MIGRATE_SYNC, MR_MEMORY_FAILURE); if (ret) { -- cgit v1.2.3 From 7e1f049efb86bd86c06b80eeac0ef80cdeb8c0e7 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 15 Apr 2015 16:14:41 -0700 Subject: mm: hugetlb: cleanup using page_huge_active() Now we have easy access to hugepages' activeness, so existing helpers to get the information can be cleaned up. [akpm@linux-foundation.org: s/PageHugeActive/page_huge_active/] Signed-off-by: Naoya Horiguchi Cc: Hugh Dickins Reviewed-by: Michal Hocko Cc: Mel Gorman Cc: Johannes Weiner Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 2 -- include/linux/page-flags.h | 7 +++++++ mm/hugetlb.c | 42 +++++------------------------------------- mm/memory_hotplug.c | 2 +- 4 files changed, 13 insertions(+), 40 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 5713c49a4a5c..205026175c42 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -84,7 +84,6 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed); int dequeue_hwpoisoned_huge_page(struct page *page); bool isolate_huge_page(struct page *page, struct list_head *list); void putback_active_hugepage(struct page *page); -bool is_hugepage_active(struct page *page); void free_huge_page(struct page *page); #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE @@ -152,7 +151,6 @@ static inline bool isolate_huge_page(struct page *page, struct list_head *list) return false; } #define putback_active_hugepage(p) do {} while (0) -#define is_hugepage_active(x) false static inline unsigned long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 84d10b65cec6..f34e040b34e9 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -470,11 +470,18 @@ static inline void ClearPageCompound(struct page *page) #ifdef CONFIG_HUGETLB_PAGE int PageHuge(struct page *page); int PageHeadHuge(struct page *page); +bool page_huge_active(struct page *page); #else TESTPAGEFLAG_FALSE(Huge) TESTPAGEFLAG_FALSE(HeadHuge) + +static inline bool page_huge_active(struct page *page) +{ + return 0; +} #endif + #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * PageHuge() only returns true for hugetlbfs pages, but not for diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 05407831016b..271e4432734c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3896,20 +3896,6 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address, #ifdef CONFIG_MEMORY_FAILURE -/* Should be called in hugetlb_lock */ -static int is_hugepage_on_freelist(struct page *hpage) -{ - struct page *page; - struct page *tmp; - struct hstate *h = page_hstate(hpage); - int nid = page_to_nid(hpage); - - list_for_each_entry_safe(page, tmp, &h->hugepage_freelists[nid], lru) - if (page == hpage) - return 1; - return 0; -} - /* * This function is called from memory failure code. * Assume the caller holds page lock of the head page. 
@@ -3921,7 +3907,11 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage) int ret = -EBUSY; spin_lock(&hugetlb_lock); - if (is_hugepage_on_freelist(hpage)) { + /* + * Just checking !page_huge_active is not enough, because that could be + * an isolated/hwpoisoned hugepage (which have >0 refcount). + */ + if (!page_huge_active(hpage) && !page_count(hpage)) { /* * Hwpoisoned hugepage isn't linked to activelist or freelist, * but dangling hpage->lru can trigger list-debug warnings @@ -3965,25 +3955,3 @@ void putback_active_hugepage(struct page *page) spin_unlock(&hugetlb_lock); put_page(page); } - -bool is_hugepage_active(struct page *page) -{ - VM_BUG_ON_PAGE(!PageHuge(page), page); - /* - * This function can be called for a tail page because the caller, - * scan_movable_pages, scans through a given pfn-range which typically - * covers one memory block. In systems using gigantic hugepage (1GB - * for x86_64,) a hugepage is larger than a memory block, and we don't - * support migrating such large hugepages for now, so return false - * when called for tail pages. - */ - if (PageTail(page)) - return false; - /* - * Refcount of a hwpoisoned hugepages is 1, but they are not active, - * so we should return false for them. - */ - if (unlikely(PageHWPoison(page))) - return false; - return page_count(page) > 0; -} diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index e2e8014fb755..457bde530cbe 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1373,7 +1373,7 @@ static unsigned long scan_movable_pages(unsigned long start, unsigned long end) if (PageLRU(page)) return pfn; if (PageHuge(page)) { - if (is_hugepage_active(page)) + if (page_huge_active(page)) return pfn; else pfn = round_up(pfn + 1, -- cgit v1.2.3 From 6a4055bc72a516bae3f607eda3e243d18e66bd49 Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Wed, 15 Apr 2015 16:14:44 -0700 Subject: mm/memblock.c: add debug output for memblock_add() memblock_reserve() calls memblock_reserve_region() which prints debugging information if 'memblock=debug' was passed on the command line. This patch adds the same behaviour, but for the memblock_add() function. 
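With memblock=debug on the kernel command line, each added range is now logged in the same style as memblock_reserve(). Given the format string above, the output would look roughly like this (addresses and caller are invented for illustration):

	memblock_add: [0x0000000100000000-0x000000013fffffff] flags 0x0 early_init_dt_scan_memory+0x154/0x1b0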
[akpm@linux-foundation.org: s/memblock_memory/memblock_add/ in message] Signed-off-by: Alexander Kuleshov Cc: Martin Schwidefsky Cc: Philipp Hachtmann Cc: Fabian Frederick Cc: Catalin Marinas Cc: Emil Medve Cc: Akinobu Mita Cc: Tang Chen Cc: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/mm/memblock.c b/mm/memblock.c index 3f37a0bca5d5..9318b567ed79 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -580,10 +580,24 @@ int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size, return memblock_add_range(&memblock.memory, base, size, nid, 0); } +static int __init_memblock memblock_add_region(phys_addr_t base, + phys_addr_t size, + int nid, + unsigned long flags) +{ + struct memblock_type *_rgn = &memblock.memory; + + memblock_dbg("memblock_add: [%#016llx-%#016llx] flags %#02lx %pF\n", + (unsigned long long)base, + (unsigned long long)base + size - 1, + flags, (void *)_RET_IP_); + + return memblock_add_range(_rgn, base, size, nid, flags); +} + int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) { - return memblock_add_range(&memblock.memory, base, size, - MAX_NUMNODES, 0); + return memblock_add_region(base, size, MAX_NUMNODES, 0); } /** -- cgit v1.2.3 From cdd7875e0c4db5c41e28b645d3bf7d41bd2cbb45 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 15 Apr 2015 16:14:47 -0700 Subject: include/linux/mm.h: simplify flag check Flip the flag test so that it is the simplest. No functional change, just a small readability improvement: No code changed: # arch/x86/kernel/sys_x86_64.o: text data bss dec hex filename 1551 24 0 1575 627 sys_x86_64.o.before 1551 24 0 1575 627 sys_x86_64.o.after md5: 70708d1b1ad35cc891118a69dc1a63f9 sys_x86_64.o.before.asm 70708d1b1ad35cc891118a69dc1a63f9 sys_x86_64.o.after.asm Signed-off-by: Borislav Petkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 515ec5045b60..68c21b2374c4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1894,10 +1894,10 @@ extern unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info); static inline unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info) { - if (!(info->flags & VM_UNMAPPED_AREA_TOPDOWN)) - return unmapped_area(info); - else + if (info->flags & VM_UNMAPPED_AREA_TOPDOWN) return unmapped_area_topdown(info); + else + return unmapped_area(info); } /* truncate.c */ -- cgit v1.2.3 From 99e8ea6cd2210cf2271f922384b483cd83f0f8f3 Mon Sep 17 00:00:00 2001 From: Stefan Strogin Date: Wed, 15 Apr 2015 16:14:50 -0700 Subject: mm: cma: add trace events for CMA allocations and freeings Add trace events for cma_alloc() and cma_release(). The cma_alloc tracepoint is used both for successful and failed allocations; in case of allocation failure, pfn=-1UL is stored and printed. 
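These events can be watched through the standard tracefs interface; the sample line below is illustrative, with made-up values matching the TP_printk format in the header:

	# echo 1 > /sys/kernel/debug/tracing/events/cma/enable
	# cat /sys/kernel/debug/tracing/trace_pipe
	    ... cma_alloc: pfn=9f800 page=ffffea00027e0000 count=64 align=4

A failed allocation appears with pfn=ffffffffffffffff (i.e. -1UL), as noted above.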
Signed-off-by: Stefan Strogin Cc: Ingo Molnar Cc: Steven Rostedt Cc: Joonsoo Kim Cc: Michal Nazarewicz Cc: Marek Szyprowski Cc: Laurent Pinchart Cc: Thierry Reding Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/cma.h | 66 ++++++++++++++++++++++++++++++++++++++++++++++ mm/cma.c | 5 ++++ 2 files changed, 71 insertions(+) create mode 100644 include/trace/events/cma.h diff --git a/include/trace/events/cma.h b/include/trace/events/cma.h new file mode 100644 index 000000000000..d7cd961720a7 --- /dev/null +++ b/include/trace/events/cma.h @@ -0,0 +1,66 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM cma + +#if !defined(_TRACE_CMA_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_CMA_H + +#include +#include + +TRACE_EVENT(cma_alloc, + + TP_PROTO(unsigned long pfn, const struct page *page, + unsigned int count, unsigned int align), + + TP_ARGS(pfn, page, count, align), + + TP_STRUCT__entry( + __field(unsigned long, pfn) + __field(const struct page *, page) + __field(unsigned int, count) + __field(unsigned int, align) + ), + + TP_fast_assign( + __entry->pfn = pfn; + __entry->page = page; + __entry->count = count; + __entry->align = align; + ), + + TP_printk("pfn=%lx page=%p count=%u align=%u", + __entry->pfn, + __entry->page, + __entry->count, + __entry->align) +); + +TRACE_EVENT(cma_release, + + TP_PROTO(unsigned long pfn, const struct page *page, + unsigned int count), + + TP_ARGS(pfn, page, count), + + TP_STRUCT__entry( + __field(unsigned long, pfn) + __field(const struct page *, page) + __field(unsigned int, count) + ), + + TP_fast_assign( + __entry->pfn = pfn; + __entry->page = page; + __entry->count = count; + ), + + TP_printk("pfn=%lx page=%p count=%u", + __entry->pfn, + __entry->page, + __entry->count) +); + +#endif /* _TRACE_CMA_H */ + +/* This part must be outside protection */ +#include diff --git a/mm/cma.c b/mm/cma.c index 47203faaf65e..3a7a67b93394 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -23,6 +23,7 @@ # define DEBUG #endif #endif +#define CREATE_TRACE_POINTS #include #include @@ -34,6 +35,7 @@ #include #include #include +#include #include "cma.h" @@ -414,6 +416,8 @@ struct page *cma_alloc(struct cma *cma, unsigned int count, unsigned int align) start = bitmap_no + mask + 1; } + trace_cma_alloc(page ? pfn : -1UL, page, count, align); + pr_debug("%s(): returned %p\n", __func__, page); return page; } @@ -446,6 +450,7 @@ bool cma_release(struct cma *cma, const struct page *pages, unsigned int count) free_contig_range(pfn, count); cma_clear_bitmap(cma, pfn, count); + trace_cma_release(pfn, pages, count); return true; } -- cgit v1.2.3 From e39155ea11eac6da056b04669d7c9fc612e2065a Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 15 Apr 2015 16:14:53 -0700 Subject: mm: uninline and cleanup page-mapping related helpers Most-used page->mapping helper -- page_mapping() -- has already been uninlined. Let's also uninline page_rmapping() and page_anon_vma(). Depending on configuration, it saves us around 400 bytes in text: text data bss dec hex filename 660318 99254 410000 1169572 11d8a4 mm/built-in.o-before 659854 99254 410000 1169108 11d6d4 mm/built-in.o I also tried to make the code a bit cleaner. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Kirill A. 
Shutemov Cc: Christoph Lameter Cc: Konstantin Khlebnikov Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 8 ++------ include/linux/rmap.h | 8 -------- mm/util.c | 41 ++++++++++++++++++++++++++++++++++++----- 3 files changed, 38 insertions(+), 19 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 68c21b2374c4..0e7bb2194da5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -950,14 +950,10 @@ void page_address_init(void); #define page_address_init() do { } while(0) #endif +extern void *page_rmapping(struct page *page); +extern struct anon_vma *page_anon_vma(struct page *page); extern struct address_space *page_mapping(struct page *page); -/* Neutral page->mapping pointer to address_space or anon_vma or other */ -static inline void *page_rmapping(struct page *page) -{ - return (void *)((unsigned long)page->mapping & ~PAGE_MAPPING_FLAGS); -} - extern struct address_space *__page_file_mapping(struct page *); static inline diff --git a/include/linux/rmap.h b/include/linux/rmap.h index c4c559a45dc8..c89c53a113a8 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -105,14 +105,6 @@ static inline void put_anon_vma(struct anon_vma *anon_vma) __put_anon_vma(anon_vma); } -static inline struct anon_vma *page_anon_vma(struct page *page) -{ - if (((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) != - PAGE_MAPPING_ANON) - return NULL; - return page_rmapping(page); -} - static inline void vma_lock_anon_vma(struct vm_area_struct *vma) { struct anon_vma *anon_vma = vma->anon_vma; diff --git a/mm/util.c b/mm/util.c index 3981ae9d1b15..68ff8a5361e7 100644 --- a/mm/util.c +++ b/mm/util.c @@ -325,9 +325,37 @@ void kvfree(const void *addr) } EXPORT_SYMBOL(kvfree); +static inline void *__page_rmapping(struct page *page) +{ + unsigned long mapping; + + mapping = (unsigned long)page->mapping; + mapping &= ~PAGE_MAPPING_FLAGS; + + return (void *)mapping; +} + +/* Neutral page->mapping pointer to address_space or anon_vma or other */ +void *page_rmapping(struct page *page) +{ + page = compound_head(page); + return __page_rmapping(page); +} + +struct anon_vma *page_anon_vma(struct page *page) +{ + unsigned long mapping; + + page = compound_head(page); + mapping = (unsigned long)page->mapping; + if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) + return NULL; + return __page_rmapping(page); +} + struct address_space *page_mapping(struct page *page) { - struct address_space *mapping = page->mapping; + unsigned long mapping; /* This happens if someone calls flush_dcache_page on slab page */ if (unlikely(PageSlab(page))) @@ -337,10 +365,13 @@ struct address_space *page_mapping(struct page *page) swp_entry_t entry; entry.val = page_private(page); - mapping = swap_address_space(entry); - } else if ((unsigned long)mapping & PAGE_MAPPING_ANON) - mapping = NULL; - return mapping; + return swap_address_space(entry); + } + + mapping = (unsigned long)page->mapping; + if (mapping & PAGE_MAPPING_FLAGS) + return NULL; + return page->mapping; } int overcommit_ratio_handler(struct ctl_table *table, int write, -- cgit v1.2.3 From 79553da293d38d63097278de13e28a3b371f43c1 Mon Sep 17 00:00:00 2001 From: "Kirill A. 
Shutemov" Date: Wed, 15 Apr 2015 16:14:56 -0700 Subject: thp: cleanup khugepaged startup Few trivial cleanups: - no need to call set_recommended_min_free_kbytes() from late_initcall() -- start_khugepaged() calls it; - no need to call set_recommended_min_free_kbytes() from start_khugepaged() if khugepaged is not started; - there isn't much point in running start_khugepaged() if we've just set transparent_hugepage_flags to zero; - start_khugepaged() is misnamed -- it also used to stop the thread; Signed-off-by: Kirill A. Shutemov Cc: David Rientjes Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 0af19fffe1df..078832cf3636 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -110,10 +110,6 @@ static int set_recommended_min_free_kbytes(void) int nr_zones = 0; unsigned long recommended_min; - /* khugepaged thread has stopped to failed to start */ - if (!khugepaged_thread) - return 0; - for_each_populated_zone(zone) nr_zones++; @@ -145,9 +141,8 @@ static int set_recommended_min_free_kbytes(void) setup_per_zone_wmarks(); return 0; } -late_initcall(set_recommended_min_free_kbytes); -static int start_khugepaged(void) +static int start_stop_khugepaged(void) { int err = 0; if (khugepaged_enabled()) { @@ -158,6 +153,7 @@ static int start_khugepaged(void) pr_err("khugepaged: kthread_run(khugepaged) failed\n"); err = PTR_ERR(khugepaged_thread); khugepaged_thread = NULL; + goto fail; } if (!list_empty(&khugepaged_scan.mm_head)) @@ -168,7 +164,7 @@ static int start_khugepaged(void) kthread_stop(khugepaged_thread); khugepaged_thread = NULL; } - +fail: return err; } @@ -302,7 +298,7 @@ static ssize_t enabled_store(struct kobject *kobj, int err; mutex_lock(&khugepaged_mutex); - err = start_khugepaged(); + err = start_stop_khugepaged(); mutex_unlock(&khugepaged_mutex); if (err) @@ -651,10 +647,12 @@ static int __init hugepage_init(void) * where the extra memory used could hurt more than TLB overhead * is likely to save. The admin can still enable it through /sys. */ - if (totalram_pages < (512 << (20 - PAGE_SHIFT))) + if (totalram_pages < (512 << (20 - PAGE_SHIFT))) { transparent_hugepage_flags = 0; + return 0; + } - err = start_khugepaged(); + err = start_stop_khugepaged(); if (err) goto err_khugepaged; -- cgit v1.2.3 From 2e32b947606daa642a28bfb9b57e9702cfc9a8b3 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Wed, 15 Apr 2015 16:14:59 -0700 Subject: mm: cma: add functions to get region pages counters Here are two functions that provide interface to compute/get used size and size of biggest free chunk in cma region. Add that information to debugfs. 
[akpm@linux-foundation.org: move debug code from cma.c into cma_debug.c] [stefan.strogin@gmail.com: move code from cma_get_used() and cma_get_maxchunk() to cma_used_get() and cma_maxchunk_get()] Signed-off-by: Dmitry Safonov Signed-off-by: Stefan Strogin Acked-by: Michal Nazarewicz Cc: Marek Szyprowski Cc: Joonsoo Kim Cc: Pintu Kumar Cc: Weijie Yang Cc: Laurent Pinchart Cc: Vyacheslav Tyrtov Cc: Aleksei Mateosian Signed-off-by: Stefan Strogin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/cma_debug.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/mm/cma_debug.c b/mm/cma_debug.c index 0b377536ccde..a040bfb132c3 100644 --- a/mm/cma_debug.c +++ b/mm/cma_debug.c @@ -33,6 +33,44 @@ static int cma_debugfs_get(void *data, u64 *val) DEFINE_SIMPLE_ATTRIBUTE(cma_debugfs_fops, cma_debugfs_get, NULL, "%llu\n"); +static int cma_used_get(void *data, u64 *val) +{ + struct cma *cma = data; + unsigned long used; + + mutex_lock(&cma->lock); + /* pages counter is smaller than sizeof(int) */ + used = bitmap_weight(cma->bitmap, (int)cma->count); + mutex_unlock(&cma->lock); + *val = (u64)used << cma->order_per_bit; + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(cma_used_fops, cma_used_get, NULL, "%llu\n"); + +static int cma_maxchunk_get(void *data, u64 *val) +{ + struct cma *cma = data; + unsigned long maxchunk = 0; + unsigned long start, end = 0; + + mutex_lock(&cma->lock); + for (;;) { + start = find_next_zero_bit(cma->bitmap, cma->count, end); + if (start >= cma->count) + break; + end = find_next_bit(cma->bitmap, cma->count, start); + maxchunk = max(end - start, maxchunk); + } + mutex_unlock(&cma->lock); + *val = (u64)maxchunk << cma->order_per_bit; + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(cma_maxchunk_fops, cma_maxchunk_get, NULL, "%llu\n"); + static void cma_add_to_cma_mem_list(struct cma *cma, struct cma_mem *mem) { spin_lock(&cma->mem_head_lock); @@ -149,6 +187,8 @@ static void cma_debugfs_add_one(struct cma *cma, int idx) &cma->count, &cma_debugfs_fops); debugfs_create_file("order_per_bit", S_IRUGO, tmp, &cma->order_per_bit, &cma_debugfs_fops); + debugfs_create_file("used", S_IRUGO, tmp, cma, &cma_used_fops); + debugfs_create_file("maxchunk", S_IRUGO, tmp, cma, &cma_maxchunk_fops); u32s = DIV_ROUND_UP(cma_bitmap_maxno(cma), BITS_PER_BYTE * sizeof(u32)); debugfs_create_u32_array("bitmap", S_IRUGO, tmp, (u32*)cma->bitmap, u32s); -- cgit v1.2.3 From bda6d33042a486c8f7b15bf15a80fd07d4eab204 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 15 Apr 2015 16:15:02 -0700 Subject: mm/cma_debug.c: remove blank lines before DEFINE_SIMPLE_ATTRIBUTE() Like EXPORT_SYMBOL(): the positioning communicates that the macro pertains to the immediately preceding function. 
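The resulting layout, sketched with the function body elided:

	static int cma_used_get(void *data, u64 *val)
	{
		/* ... */
		return 0;
	}
	DEFINE_SIMPLE_ATTRIBUTE(cma_used_fops, cma_used_get, NULL, "%llu\n");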
Cc: Dmitry Safonov Cc: Michal Nazarewicz Cc: Stefan Strogin Cc: Marek Szyprowski Cc: Joonsoo Kim Cc: Pintu Kumar Cc: Weijie Yang Cc: Laurent Pinchart Cc: Vyacheslav Tyrtov Cc: Aleksei Mateosian Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/cma_debug.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/mm/cma_debug.c b/mm/cma_debug.c index a040bfb132c3..7621ee34daa0 100644 --- a/mm/cma_debug.c +++ b/mm/cma_debug.c @@ -30,7 +30,6 @@ static int cma_debugfs_get(void *data, u64 *val) return 0; } - DEFINE_SIMPLE_ATTRIBUTE(cma_debugfs_fops, cma_debugfs_get, NULL, "%llu\n"); static int cma_used_get(void *data, u64 *val) @@ -46,7 +45,6 @@ static int cma_used_get(void *data, u64 *val) return 0; } - DEFINE_SIMPLE_ATTRIBUTE(cma_used_fops, cma_used_get, NULL, "%llu\n"); static int cma_maxchunk_get(void *data, u64 *val) @@ -68,7 +66,6 @@ static int cma_maxchunk_get(void *data, u64 *val) return 0; } - DEFINE_SIMPLE_ATTRIBUTE(cma_maxchunk_fops, cma_maxchunk_get, NULL, "%llu\n"); static void cma_add_to_cma_mem_list(struct cma *cma, struct cma_mem *mem) @@ -129,7 +126,6 @@ static int cma_free_write(void *data, u64 val) return cma_free_mem(cma, pages); } - DEFINE_SIMPLE_ATTRIBUTE(cma_free_fops, NULL, cma_free_write, "%llu\n"); static int cma_alloc_mem(struct cma *cma, int count) @@ -162,7 +158,6 @@ static int cma_alloc_write(void *data, u64 val) return cma_alloc_mem(cma, pages); } - DEFINE_SIMPLE_ATTRIBUTE(cma_alloc_fops, NULL, cma_alloc_write, "%llu\n"); static void cma_debugfs_add_one(struct cma *cma, int idx) -- cgit v1.2.3 From 923936157b158f36bd6a3d86496dce82b1a957de Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Wed, 15 Apr 2015 16:15:05 -0700 Subject: mm/mempool.c: kasan: poison mempool elements Mempools keep allocated objects in reserve for situations when ordinary allocation may not be possible. These objects shouldn't be accessed before they leave the pool. This patch poisons elements when they get into the pool and unpoisons them when they leave it. This will let KASan detect use-after-free of mempool elements. 
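The class of bug this makes detectable, as a sketch (hypothetical caller; 'pool' stands for any mempool backed by the slab allocator):

	static void demo_use_after_free(mempool_t *pool)
	{
		char *obj = mempool_alloc(pool, GFP_KERNEL);

		if (!obj)
			return;
		mempool_free(obj, pool);	/* element is now poisoned while parked in the pool */
		obj[0] = 0;			/* use-after-free: KASan reports this write */
	}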
Signed-off-by: Andrey Ryabinin Tested-by: David Rientjes Cc: Catalin Marinas Cc: Dmitry Chernenkov Cc: Dmitry Vyukov Cc: Alexander Potapenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 2 ++ mm/kasan/kasan.c | 13 +++++++++++++ mm/mempool.c | 23 +++++++++++++++++++++++ 3 files changed, 38 insertions(+) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 5bb074431eb0..5486d777b706 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -44,6 +44,7 @@ void kasan_poison_object_data(struct kmem_cache *cache, void *object); void kasan_kmalloc_large(const void *ptr, size_t size); void kasan_kfree_large(const void *ptr); +void kasan_kfree(void *ptr); void kasan_kmalloc(struct kmem_cache *s, const void *object, size_t size); void kasan_krealloc(const void *object, size_t new_size); @@ -71,6 +72,7 @@ static inline void kasan_poison_object_data(struct kmem_cache *cache, static inline void kasan_kmalloc_large(void *ptr, size_t size) {} static inline void kasan_kfree_large(const void *ptr) {} +static inline void kasan_kfree(void *ptr) {} static inline void kasan_kmalloc(struct kmem_cache *s, const void *object, size_t size) {} static inline void kasan_krealloc(const void *object, size_t new_size) {} diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 936d81661c47..6c513a63ea84 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -389,6 +389,19 @@ void kasan_krealloc(const void *object, size_t size) kasan_kmalloc(page->slab_cache, object, size); } +void kasan_kfree(void *ptr) +{ + struct page *page; + + page = virt_to_head_page(ptr); + + if (unlikely(!PageSlab(page))) + kasan_poison_shadow(ptr, PAGE_SIZE << compound_order(page), + KASAN_FREE_PAGE); + else + kasan_slab_free(page->slab_cache, ptr); +} + void kasan_kfree_large(const void *ptr) { struct page *page = virt_to_page(ptr); diff --git a/mm/mempool.c b/mm/mempool.c index 2884d5bad77e..2cc08de8b1db 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -101,10 +102,31 @@ static inline void poison_element(mempool_t *pool, void *element) } #endif /* CONFIG_DEBUG_SLAB || CONFIG_SLUB_DEBUG_ON */ +static void kasan_poison_element(mempool_t *pool, void *element) +{ + if (pool->alloc == mempool_alloc_slab) + kasan_slab_free(pool->pool_data, element); + if (pool->alloc == mempool_kmalloc) + kasan_kfree(element); + if (pool->alloc == mempool_alloc_pages) + kasan_free_pages(element, (unsigned long)pool->pool_data); +} + +static void kasan_unpoison_element(mempool_t *pool, void *element) +{ + if (pool->alloc == mempool_alloc_slab) + kasan_slab_alloc(pool->pool_data, element); + if (pool->alloc == mempool_kmalloc) + kasan_krealloc(element, (size_t)pool->pool_data); + if (pool->alloc == mempool_alloc_pages) + kasan_alloc_pages(element, (unsigned long)pool->pool_data); +} + static void add_element(mempool_t *pool, void *element) { BUG_ON(pool->curr_nr >= pool->min_nr); poison_element(pool, element); + kasan_poison_element(pool, element); pool->elements[pool->curr_nr++] = element; } @@ -114,6 +136,7 @@ static void *remove_element(mempool_t *pool) BUG_ON(pool->curr_nr < 0); check_element(pool, element); + kasan_unpoison_element(pool, element); return element; } -- cgit v1.2.3 From 2682582a6ea118d974c33da64923ae8c687fdd0b Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 15 Apr 2015 16:15:08 -0700 Subject: mm/memory: also print a_ops->readpage in print_bad_pte() A lot of filesystems use generic_file_mmap() 
and filemap_fault(), f_op->mmap and vm_ops->fault aren't enough to identify the filesystem. This prints the file name, vm_ops->fault, f_op->mmap and a_ops->readpage (which is almost always implemented and filesystem-specific). Example: [ 23.676410] BUG: Bad page map in process sh pte:1b7e6025 pmd:19bbd067 [ 23.676887] page:ffffea00006df980 count:4 mapcount:1 mapping:ffff8800196426c0 index:0x97 [ 23.677481] flags: 0x10000000000000c(referenced|uptodate) [ 23.677896] page dumped because: bad pte [ 23.678205] addr:00007f52fcb17000 vm_flags:00000075 anon_vma: (null) mapping:ffff8800196426c0 index:97 [ 23.678922] file:libc-2.19.so fault:filemap_fault mmap:generic_file_readonly_mmap readpage:v9fs_vfs_readpage [akpm@linux-foundation.org: use pr_alert, per Kirill] Signed-off-by: Konstantin Khlebnikov Cc: Sasha Levin Acked-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 656593f73c8e..f9628e568c58 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -690,12 +690,11 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, /* * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y */ - if (vma->vm_ops) - printk(KERN_ALERT "vma->vm_ops->fault: %pSR\n", - vma->vm_ops->fault); - if (vma->vm_file) - printk(KERN_ALERT "vma->vm_file->f_op->mmap: %pSR\n", - vma->vm_file->f_op->mmap); + pr_alert("file:%pD fault:%pf mmap:%pf readpage:%pf\n", + vma->vm_file, + vma->vm_ops ? vma->vm_ops->fault : NULL, + vma->vm_file ? vma->vm_file->f_op->mmap : NULL, + mapping ? mapping->a_ops->readpage : NULL); dump_stack(); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); } -- cgit v1.2.3 From dd9061846a3ba01b0fa45423aaa087e4a69187fa Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 15 Apr 2015 16:15:11 -0700 Subject: mm: new pfn_mkwrite same as page_mkwrite for VM_PFNMAP This will allow FS that uses VM_PFNMAP | VM_MIXEDMAP (no page structs) to get notified when access is a write to a read-only PFN. This can happen if we mmap() a file then first mmap-read from it to page-in a read-only PFN, then we mmap-write to the same page. We need this functionality to fix a DAX bug, where in the scenario above we fail to set ctime/mtime though we modified the file. An xfstest is attached to this patchset that shows the failure and the fix. (A DAX patch will follow) This functionality is extra important for us, because upon dirtying of a pmem page we also want to RDMA the page to a remote cluster node. We define a new pfn_mkwrite and do not reuse page_mkwrite because 1 - The name ;-) 2 - But mainly because it would take a very long and tedious audit of all page_mkwrite functions of VM_MIXEDMAP/VM_PFNMAP users to make sure they do not now CRASH. For example, current DAX code (which this is for) would crash. If we would want to reuse page_mkwrite, we will need to first patch all users, so to not-crash-on-no-page. Then enable this patch. But even if I did that I would not sleep so well at night. Adding a new vector is the safest thing to do, and is not that expensive: an extra pointer in a static function vector per driver. Also, the new vector is better for performance, because otherwise we would call all current kernel vectors just to check-no-page, do nothing, and return. No need to call it from do_shared_fault because do_wp_page is called to change pte permissions anyway. Signed-off-by: Yigal Korman Signed-off-by: Boaz Harrosh Acked-by: Kirill A. 
Shutemov Cc: Matthew Wilcox Cc: Jan Kara Cc: Hugh Dickins Cc: Mel Gorman Cc: Dave Chinner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/Locking | 8 ++++++++ include/linux/mm.h | 3 +++ mm/memory.c | 43 +++++++++++++++++++++++++++++++++++---- 3 files changed, 50 insertions(+), 4 deletions(-) diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index f91926f2f482..8bb8a7ee0f99 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -525,6 +525,7 @@ prototypes: void (*close)(struct vm_area_struct*); int (*fault)(struct vm_area_struct*, struct vm_fault *); int (*page_mkwrite)(struct vm_area_struct *, struct vm_fault *); + int (*pfn_mkwrite)(struct vm_area_struct *, struct vm_fault *); int (*access)(struct vm_area_struct *, unsigned long, void*, int, int); locking rules: @@ -534,6 +535,7 @@ close: yes fault: yes can return with page locked map_pages: yes page_mkwrite: yes can return with page locked +pfn_mkwrite: yes access: yes ->fault() is called when a previously not present pte is about @@ -560,6 +562,12 @@ the page has been truncated, the filesystem should not look up a new page like the ->fault() handler, but simply return with VM_FAULT_NOPAGE, which will cause the VM to retry the fault. + ->pfn_mkwrite() is the same as page_mkwrite but when the pte is +VM_PFNMAP or VM_MIXEDMAP with a page-less entry. Expected return is +VM_FAULT_NOPAGE. Or one of the VM_FAULT_ERROR types. The default behavior +after this call is to make the pte read-write, unless pfn_mkwrite returns +an error. + ->access() is called when get_user_pages() fails in access_process_vm(), typically used to debug a process through /proc/pid/mem or ptrace. This function is needed only for diff --git a/include/linux/mm.h b/include/linux/mm.h index 0e7bb2194da5..8b086070c3a5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -251,6 +251,9 @@ struct vm_operations_struct { * writable, if an error is returned it will cause a SIGBUS */ int (*page_mkwrite)(struct vm_area_struct *vma, struct vm_fault *vmf); + /* same as page_mkwrite when using VM_PFNMAP|VM_MIXEDMAP */ + int (*pfn_mkwrite)(struct vm_area_struct *vma, struct vm_fault *vmf); + /* called by access_process_vm when get_user_pages() fails, typically * for use by special VMAs that can switch between memory and hardware */ diff --git a/mm/memory.c b/mm/memory.c index f9628e568c58..22e037e3364e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2180,6 +2180,42 @@ oom: return VM_FAULT_OOM; } +/* + * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED + * mapping + */ +static int wp_pfn_shared(struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long address, + pte_t *page_table, spinlock_t *ptl, pte_t orig_pte, + pmd_t *pmd) +{ + if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) { + struct vm_fault vmf = { + .page = NULL, + .pgoff = linear_page_index(vma, address), + .virtual_address = (void __user *)(address & PAGE_MASK), + .flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE, + }; + int ret; + + pte_unmap_unlock(page_table, ptl); + ret = vma->vm_ops->pfn_mkwrite(vma, &vmf); + if (ret & VM_FAULT_ERROR) + return ret; + page_table = pte_offset_map_lock(mm, pmd, address, &ptl); + /* + * We might have raced with another page fault while we + * released the pte_offset_map_lock. 
+ */ + if (!pte_same(*page_table, orig_pte)) { + pte_unmap_unlock(page_table, ptl); + return 0; + } + } + return wp_page_reuse(mm, vma, address, page_table, ptl, orig_pte, + NULL, 0, 0); +} + static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, spinlock_t *ptl, pte_t orig_pte, @@ -2258,13 +2294,12 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, * VM_PFNMAP VMA. * * We should not cow pages in a shared writeable mapping. - * Just mark the pages writable as we can't do any dirty - * accounting on raw pfn maps. + * Just mark the pages writable and/or call ops->pfn_mkwrite. */ if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED)) - return wp_page_reuse(mm, vma, address, page_table, ptl, - orig_pte, old_page, 0, 0); + return wp_pfn_shared(mm, vma, address, page_table, ptl, + orig_pte, pmd); pte_unmap_unlock(page_table, ptl); return wp_page_copy(mm, vma, address, page_table, pmd, -- cgit v1.2.3 From 0e3b210ce1722168227cb3bc7746256d0c0afece Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 15 Apr 2015 16:15:14 -0700 Subject: dax: use pfn_mkwrite to update c/mtime + freeze protection From: Yigal Korman [v1] Without this patch, c/mtime is not updated correctly when mmap'ed page is first read from and then written to. A new xfstest is submitted for testing this (generic/080) [v2] Jan Kara has pointed out that if we add the sb_start/end_pagefault pair in the new pfn_mkwrite we are then fixing another bug where: A user could start writing to the page while filesystem is frozen. Signed-off-by: Yigal Korman Signed-off-by: Boaz Harrosh Reviewed-by: Jan Kara Cc: Matthew Wilcox Cc: Dave Chinner Cc: Hugh Dickins Cc: Mel Gorman Cc: Kirill A. Shutemov Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 17 +++++++++++++++++ fs/ext2/file.c | 1 + fs/ext4/file.c | 1 + include/linux/fs.h | 1 + 4 files changed, 20 insertions(+) diff --git a/fs/dax.c b/fs/dax.c index ed1619ec6537..d0bd1f4f81b3 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -463,6 +463,23 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, } EXPORT_SYMBOL_GPL(dax_fault); +/** + * dax_pfn_mkwrite - handle first write to DAX page + * @vma: The virtual memory area where the fault occurred + * @vmf: The description of the fault + * + */ +int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct super_block *sb = file_inode(vma->vm_file)->i_sb; + + sb_start_pagefault(sb); + file_update_time(vma->vm_file); + sb_end_pagefault(sb); + return VM_FAULT_NOPAGE; +} +EXPORT_SYMBOL_GPL(dax_pfn_mkwrite); + /** * dax_zero_page_range - zero a range within a page of a DAX file * @inode: The file being truncated diff --git a/fs/ext2/file.c b/fs/ext2/file.c index e31701713516..866a3ce3f864 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -39,6 +39,7 @@ static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) static const struct vm_operations_struct ext2_dax_vm_ops = { .fault = ext2_dax_fault, .page_mkwrite = ext2_dax_mkwrite, + .pfn_mkwrite = dax_pfn_mkwrite, }; static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 598abbbe6786..aa78c70553f4 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -206,6 +206,7 @@ static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) static const struct vm_operations_struct ext4_dax_vm_ops = { .fault = ext4_dax_fault, .page_mkwrite = 
ext4_dax_mkwrite, + .pfn_mkwrite = dax_pfn_mkwrite, }; #else #define ext4_dax_vm_ops ext4_file_vm_ops diff --git a/include/linux/fs.h b/include/linux/fs.h index 60733bdc74b4..0f696328f218 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2620,6 +2620,7 @@ int dax_clear_blocks(struct inode *, sector_t block, long size); int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t); int dax_truncate_page(struct inode *, loff_t from, get_block_t); int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t); +int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *); #define dax_mkwrite(vma, vmf, gb) dax_fault(vma, vmf, gb) #ifdef CONFIG_BLOCK -- cgit v1.2.3 From be64f884bed729b5d127db6a737155a4e514d286 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 15 Apr 2015 16:15:17 -0700 Subject: dax: unify ext2/4_{dax,}_file_operations The original dax patchset split the ext2/4_file_operations because of the two NULL splice_read/splice_write in the dax case. In the vfs if splice_read/splice_write are NULL we then call default_splice_read/write. What we do here is make generic_file_splice_read aware of IS_DAX() so the original ext2/4_file_operations can be used as is. For write it appears that iter_file_splice_write is just fine. It uses the regular f_op->write(file,..) or new_sync_write(file, ...). Signed-off-by: Boaz Harrosh Reviewed-by: Jan Kara Cc: Dave Chinner Cc: Matthew Wilcox Cc: Hugh Dickins Cc: Mel Gorman Cc: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext2/ext2.h | 1 - fs/ext2/file.c | 18 ------------------ fs/ext2/inode.c | 5 +---- fs/ext2/namei.c | 10 ++-------- fs/ext4/ext4.h | 1 - fs/ext4/file.c | 20 -------------------- fs/ext4/inode.c | 5 +---- fs/ext4/namei.c | 10 ++-------- fs/splice.c | 3 +++ 9 files changed, 9 insertions(+), 64 deletions(-) diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 678f9ab08c48..8d15febd0aa3 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -793,7 +793,6 @@ extern int ext2_fsync(struct file *file, loff_t start, loff_t end, int datasync); extern const struct inode_operations ext2_file_inode_operations; extern const struct file_operations ext2_file_operations; -extern const struct file_operations ext2_dax_file_operations; /* inode.c */ extern const struct address_space_operations ext2_aops; diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 866a3ce3f864..19cac93a65d3 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -109,24 +109,6 @@ const struct file_operations ext2_file_operations = { .splice_write = iter_file_splice_write, }; -#ifdef CONFIG_FS_DAX -const struct file_operations ext2_dax_file_operations = { - .llseek = generic_file_llseek, - .read = new_sync_read, - .write = new_sync_write, - .read_iter = generic_file_read_iter, - .write_iter = generic_file_write_iter, - .unlocked_ioctl = ext2_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = ext2_compat_ioctl, -#endif - .mmap = ext2_file_mmap, - .open = dquot_file_open, - .release = ext2_release_file, - .fsync = ext2_fsync, -}; -#endif - const struct inode_operations ext2_file_inode_operations = { #ifdef CONFIG_EXT2_FS_XATTR .setxattr = generic_setxattr, diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index df9d6afbc5d5..b29eb6747116 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -1388,10 +1388,7 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino) if (S_ISREG(inode->i_mode)) { inode->i_op = &ext2_file_inode_operations; - if (test_opt(inode->i_sb, DAX)) { - inode->i_mapping->a_ops = 
&ext2_aops; - inode->i_fop = &ext2_dax_file_operations; - } else if (test_opt(inode->i_sb, NOBH)) { + if (test_opt(inode->i_sb, NOBH)) { inode->i_mapping->a_ops = &ext2_nobh_aops; inode->i_fop = &ext2_file_operations; } else { diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 148f6e3789ea..ce422931f411 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -104,10 +104,7 @@ static int ext2_create (struct inode * dir, struct dentry * dentry, umode_t mode return PTR_ERR(inode); inode->i_op = &ext2_file_inode_operations; - if (test_opt(inode->i_sb, DAX)) { - inode->i_mapping->a_ops = &ext2_aops; - inode->i_fop = &ext2_dax_file_operations; - } else if (test_opt(inode->i_sb, NOBH)) { + if (test_opt(inode->i_sb, NOBH)) { inode->i_mapping->a_ops = &ext2_nobh_aops; inode->i_fop = &ext2_file_operations; } else { @@ -125,10 +122,7 @@ static int ext2_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) return PTR_ERR(inode); inode->i_op = &ext2_file_inode_operations; - if (test_opt(inode->i_sb, DAX)) { - inode->i_mapping->a_ops = &ext2_aops; - inode->i_fop = &ext2_dax_file_operations; - } else if (test_opt(inode->i_sb, NOBH)) { + if (test_opt(inode->i_sb, NOBH)) { inode->i_mapping->a_ops = &ext2_nobh_aops; inode->i_fop = &ext2_file_operations; } else { diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index f63c3d5805c4..8a3981ea35d8 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2593,7 +2593,6 @@ extern const struct file_operations ext4_dir_operations; /* file.c */ extern const struct inode_operations ext4_file_inode_operations; extern const struct file_operations ext4_file_operations; -extern const struct file_operations ext4_dax_file_operations; extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin); /* inline.c */ diff --git a/fs/ext4/file.c b/fs/ext4/file.c index aa78c70553f4..e6d4280d66be 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -625,26 +625,6 @@ const struct file_operations ext4_file_operations = { .fallocate = ext4_fallocate, }; -#ifdef CONFIG_FS_DAX -const struct file_operations ext4_dax_file_operations = { - .llseek = ext4_llseek, - .read = new_sync_read, - .write = new_sync_write, - .read_iter = generic_file_read_iter, - .write_iter = ext4_file_write_iter, - .unlocked_ioctl = ext4_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = ext4_compat_ioctl, -#endif - .mmap = ext4_file_mmap, - .open = ext4_file_open, - .release = ext4_release_file, - .fsync = ext4_sync_file, - /* Splice not yet supported with DAX */ - .fallocate = ext4_fallocate, -}; -#endif - const struct inode_operations ext4_file_inode_operations = { .setattr = ext4_setattr, .getattr = ext4_getattr, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index a3f451370bef..035b7a06f1c3 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4090,10 +4090,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) if (S_ISREG(inode->i_mode)) { inode->i_op = &ext4_file_inode_operations; - if (test_opt(inode->i_sb, DAX)) - inode->i_fop = &ext4_dax_file_operations; - else - inode->i_fop = &ext4_file_operations; + inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); } else if (S_ISDIR(inode->i_mode)) { inode->i_op = &ext4_dir_inode_operations; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 28fe71a2904c..2291923dae4e 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2235,10 +2235,7 @@ retry: err = PTR_ERR(inode); if (!IS_ERR(inode)) { inode->i_op = &ext4_file_inode_operations; - if (test_opt(inode->i_sb, DAX)) - inode->i_fop = &ext4_dax_file_operations; - else - inode->i_fop = 
&ext4_file_operations; + inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); err = ext4_add_nondir(handle, dentry, inode); if (!err && IS_DIRSYNC(dir)) @@ -2302,10 +2299,7 @@ retry: err = PTR_ERR(inode); if (!IS_ERR(inode)) { inode->i_op = &ext4_file_inode_operations; - if (test_opt(inode->i_sb, DAX)) - inode->i_fop = &ext4_dax_file_operations; - else - inode->i_fop = &ext4_file_operations; + inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); d_tmpfile(dentry, inode); err = ext4_orphan_add(handle, inode); diff --git a/fs/splice.c b/fs/splice.c index 41cbb16299e0..476024bb6546 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -523,6 +523,9 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, loff_t isize, left; int ret; + if (IS_DAX(in->f_mapping->host)) + return default_file_splice_read(in, ppos, pipe, len, flags); + isize = i_size_read(in->f_mapping->host); if (unlikely(*ppos >= isize)) return 0; -- cgit v1.2.3 From 018e9a49a554d915ba945a5faf34c592d65fe575 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 15 Apr 2015 16:15:20 -0700 Subject: mm/compaction.c: fix "suitable_migration_target() unused" warning mm/compaction.c:250:13: warning: 'suitable_migration_target' defined but not used [-Wunused-function] Reported-by: Fengguang Wu Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 45 +++++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index e6c4f9475d43..018f08da99a2 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -391,28 +391,6 @@ static inline bool compact_should_abort(struct compact_control *cc) return false; } -/* Returns true if the page is within a block suitable for migration to */ -static bool suitable_migration_target(struct page *page) -{ - /* If the page is a large free page, then disallow migration */ - if (PageBuddy(page)) { - /* - * We are checking page_order without zone->lock taken. But - * the only small danger is that we skip a potentially suitable - * pageblock, so it's not worth to check order for valid range. - */ - if (page_order_unsafe(page) >= pageblock_order) - return false; - } - - /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ - if (migrate_async_suitable(get_pageblock_migratetype(page))) - return true; - - /* Otherwise skip the block */ - return false; -} - /* * Isolate free pages onto a private freelist. If @strict is true, will abort * returning 0 on any invalid PFNs or non-free pages inside of the pageblock @@ -896,6 +874,29 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn, #endif /* CONFIG_COMPACTION || CONFIG_CMA */ #ifdef CONFIG_COMPACTION + +/* Returns true if the page is within a block suitable for migration to */ +static bool suitable_migration_target(struct page *page) +{ + /* If the page is a large free page, then disallow migration */ + if (PageBuddy(page)) { + /* + * We are checking page_order without zone->lock taken. But + * the only small danger is that we skip a potentially suitable + * pageblock, so it's not worth to check order for valid range. 
+ */ + if (page_order_unsafe(page) >= pageblock_order) + return false; + } + + /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ + if (migrate_async_suitable(get_pageblock_migratetype(page))) + return true; + + /* Otherwise skip the block */ + return false; +} + /* * Based on information in the current compact_control, find blocks * suitable for isolating free pages from and then isolate them. -- cgit v1.2.3 From 2e40e163a25af3bd35d128d3e2e005916de5cce6 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 15 Apr 2015 16:15:23 -0700 Subject: zsmalloc: decouple handle and object Recently, we started to use zram heavily and some issues popped up. 1) external fragmentation I got a report from Juneho Choi that fork failed although there are plenty of free pages in the system. His investigation revealed that zram is one of the culprits causing heavy fragmentation, so there was no contiguous 16K page left for a pgd to fork on ARM. 2) non-movable pages Another problem with zram is that, inherently, users want to use zram as swap on small-memory systems, so they use zram together with CMA to use memory efficiently. Unfortunately, that doesn't work well because zram cannot use CMA's movable pages unless it supports compaction. I got several reports that OOM happened with zram although there is lots of swap space and free space in the CMA area. 3) internal fragmentation zram has started to support a memory limitation feature to limit memory usage, and I sent a patchset (https://lkml.org/lkml/2014/9/21/148) for the VM to be harmonized with zram-swap, stopping anonymous page reclaim once zram has consumed memory up to the limit even though there is free space on the swap. One problem with that direction is that zram has no way to know about the holes that internal fragmentation leaves in the memory space zsmalloc allocated, so zram would regard the swap as full although there is free space in zsmalloc. To solve the issue, zram wants to trigger compaction of zsmalloc before deciding whether it is full or not. This patchset is the first step toward supporting the above issues. For that, it adds an indirect layer between handle and object location and supports manual compaction, solving the third problem first of all. After this patchset is merged, the next step is to make the VM aware of zsmalloc compaction so that generic compaction will move zsmalloc-ed pages automatically at runtime. In my imaginary experiment (ie, high compress ratio data with heavy swap in/out on 8G zram-swap), data is as follows, Before = zram allocated object : 60212066 bytes zram total used: 140103680 bytes ratio: 42.98 percent MemFree: 840192 kB Compaction After = frag ratio after compaction zram allocated object : 60212066 bytes zram total used: 76185600 bytes ratio: 79.03 percent MemFree: 901932 kB Juneho reported the numbers below from his real platform with small aging, so I think the benefit would be bigger on a real system aged for a long time.
- frag_ratio increased 3% (ie, higher is better) - memfree increased about 6MB - In buddy info, Normal 2^3: 4, 2^2: 1: 2^1 increased, Highmem: 2^1 21 increased frag ratio after swap fragment used : 156677 kbytes total: 166092 kbytes frag_ratio : 94 meminfo before compaction MemFree: 83724 kB Node 0, zone Normal 13642 1364 57 10 61 17 9 5 4 0 0 Node 0, zone HighMem 425 29 1 0 0 0 0 0 0 0 0 num_migrated : 23630 compaction done frag ratio after compaction used : 156673 kbytes total: 160564 kbytes frag_ratio : 97 meminfo after compaction MemFree: 89060 kB Node 0, zone Normal 14076 1544 67 14 61 17 9 5 4 0 0 Node 0, zone HighMem 863 50 1 0 0 0 0 0 0 0 0 This patchset adds more logic (about 480 lines) to zsmalloc, but when I tested a heavy swapin/out program, the regression in swapin/out speed was marginal because most of the overhead was caused by compress/decompress and other MM reclaim stuff. This patch (of 7): Currently, a zsmalloc handle encodes the object's location directly, which makes supporting migration hard. This patch decouples handle and object by adding an indirect layer. For that, it allocates the handle dynamically and returns it to the user. The handle is an address obtained from slab allocation, so it's unique, and we can keep the object's location in the memory space allocated for the handle. With that, we can change the object's position without changing the handle itself. Signed-off-by: Minchan Kim Cc: Juneho Choi Cc: Gunho Lee Cc: Luigi Semenzato Cc: Dan Streetman Cc: Seth Jennings Cc: Nitin Gupta Cc: Jerome Marchand Cc: Sergey Senozhatsky Cc: Joonsoo Kim Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zsmalloc.c | 126 +++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 98 insertions(+), 28 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 0dec1fa5f656..6f3cfbf5e237 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -110,6 +110,8 @@ #define ZS_MAX_ZSPAGE_ORDER 2 #define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER) +#define ZS_HANDLE_SIZE (sizeof(unsigned long)) + /* * Object location (, ) is encoded as * as single (unsigned long) handle value. @@ -140,7 +142,8 @@ /* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */ #define ZS_MIN_ALLOC_SIZE \ MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS)) -#define ZS_MAX_ALLOC_SIZE PAGE_SIZE +/* each chunk includes extra space to keep handle */ +#define ZS_MAX_ALLOC_SIZE (PAGE_SIZE + ZS_HANDLE_SIZE) /* * On systems with 4K page size, this gives 255 size classes! There is a @@ -233,14 +236,24 @@ struct size_class { * This must be power of 2 and less than or equal to ZS_ALIGN */ struct link_free { - /* Handle of next free chunk (encodes ) */ - void *next; + union { + /* + * Position of next free chunk (encodes ) + * It's valid for non-allocated object + */ + void *next; + /* + * Handle of allocated object. + */ + unsigned long handle; + }; }; struct zs_pool { char *name; struct size_class **size_class; + struct kmem_cache *handle_cachep; gfp_t flags; /* allocation flags used when growing pool */ atomic_long_t pages_allocated; @@ -269,6 +282,34 @@ struct mapping_area { enum zs_mapmode vm_mm; /* mapping mode */ }; +static int create_handle_cache(struct zs_pool *pool) +{ + pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE, + 0, 0, NULL); + return pool->handle_cachep ? 
0 : 1; +} + +static void destroy_handle_cache(struct zs_pool *pool) +{ + kmem_cache_destroy(pool->handle_cachep); +} + +static unsigned long alloc_handle(struct zs_pool *pool) +{ + return (unsigned long)kmem_cache_alloc(pool->handle_cachep, + pool->flags & ~__GFP_HIGHMEM); +} + +static void free_handle(struct zs_pool *pool, unsigned long handle) +{ + kmem_cache_free(pool->handle_cachep, (void *)handle); +} + +static void record_obj(unsigned long handle, unsigned long obj) +{ + *(unsigned long *)handle = obj; +} + /* zpool driver */ #ifdef CONFIG_ZPOOL @@ -595,13 +636,18 @@ static void *obj_location_to_handle(struct page *page, unsigned long obj_idx) * decoded obj_idx back to its original value since it was adjusted in * obj_location_to_handle(). */ -static void obj_handle_to_location(unsigned long handle, struct page **page, +static void obj_to_location(unsigned long handle, struct page **page, unsigned long *obj_idx) { *page = pfn_to_page(handle >> OBJ_INDEX_BITS); *obj_idx = (handle & OBJ_INDEX_MASK) - 1; } +static unsigned long handle_to_obj(unsigned long handle) +{ + return *(unsigned long *)handle; +} + static unsigned long obj_idx_to_offset(struct page *page, unsigned long obj_idx, int class_size) { @@ -860,12 +906,16 @@ static void __zs_unmap_object(struct mapping_area *area, { int sizes[2]; void *addr; - char *buf = area->vm_buf; + char *buf; /* no write fastpath */ if (area->vm_mm == ZS_MM_RO) goto out; + buf = area->vm_buf + ZS_HANDLE_SIZE; + size -= ZS_HANDLE_SIZE; + off += ZS_HANDLE_SIZE; + sizes[0] = PAGE_SIZE - off; sizes[1] = size - sizes[0]; @@ -1153,13 +1203,14 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, enum zs_mapmode mm) { struct page *page; - unsigned long obj_idx, off; + unsigned long obj, obj_idx, off; unsigned int class_idx; enum fullness_group fg; struct size_class *class; struct mapping_area *area; struct page *pages[2]; + void *ret; BUG_ON(!handle); @@ -1170,7 +1221,8 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, */ BUG_ON(in_interrupt()); - obj_handle_to_location(handle, &page, &obj_idx); + obj = handle_to_obj(handle); + obj_to_location(obj, &page, &obj_idx); get_zspage_mapping(get_first_page(page), &class_idx, &fg); class = pool->size_class[class_idx]; off = obj_idx_to_offset(page, obj_idx, class->size); @@ -1180,7 +1232,8 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, if (off + class->size <= PAGE_SIZE) { /* this object is contained entirely within a page */ area->vm_addr = kmap_atomic(page); - return area->vm_addr + off; + ret = area->vm_addr + off; + goto out; } /* this object spans two pages */ @@ -1188,14 +1241,16 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, pages[1] = get_next_page(page); BUG_ON(!pages[1]); - return __zs_map_object(area, pages, off, class->size); + ret = __zs_map_object(area, pages, off, class->size); +out: + return ret + ZS_HANDLE_SIZE; } EXPORT_SYMBOL_GPL(zs_map_object); void zs_unmap_object(struct zs_pool *pool, unsigned long handle) { struct page *page; - unsigned long obj_idx, off; + unsigned long obj, obj_idx, off; unsigned int class_idx; enum fullness_group fg; @@ -1204,7 +1259,8 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) BUG_ON(!handle); - obj_handle_to_location(handle, &page, &obj_idx); + obj = handle_to_obj(handle); + obj_to_location(obj, &page, &obj_idx); get_zspage_mapping(get_first_page(page), &class_idx, &fg); class = pool->size_class[class_idx]; off = obj_idx_to_offset(page, obj_idx, class->size); @@ -1236,7 
+1292,7 @@ EXPORT_SYMBOL_GPL(zs_unmap_object); */ unsigned long zs_malloc(struct zs_pool *pool, size_t size) { - unsigned long obj; + unsigned long handle, obj; struct link_free *link; struct size_class *class; void *vaddr; @@ -1244,9 +1300,15 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) struct page *first_page, *m_page; unsigned long m_objidx, m_offset; - if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE)) + if (unlikely(!size || (size + ZS_HANDLE_SIZE) > ZS_MAX_ALLOC_SIZE)) + return 0; + + handle = alloc_handle(pool); + if (!handle) return 0; + /* extra space in chunk to keep the handle */ + size += ZS_HANDLE_SIZE; class = pool->size_class[get_size_class_index(size)]; spin_lock(&class->lock); @@ -1255,8 +1317,10 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) if (!first_page) { spin_unlock(&class->lock); first_page = alloc_zspage(class, pool->flags); - if (unlikely(!first_page)) + if (unlikely(!first_page)) { + free_handle(pool, handle); return 0; + } set_zspage_mapping(first_page, class->index, ZS_EMPTY); atomic_long_add(class->pages_per_zspage, @@ -1268,40 +1332,45 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) } obj = (unsigned long)first_page->freelist; - obj_handle_to_location(obj, &m_page, &m_objidx); + obj_to_location(obj, &m_page, &m_objidx); m_offset = obj_idx_to_offset(m_page, m_objidx, class->size); vaddr = kmap_atomic(m_page); link = (struct link_free *)vaddr + m_offset / sizeof(*link); first_page->freelist = link->next; - memset(link, POISON_INUSE, sizeof(*link)); + + /* record handle in the header of allocated chunk */ + link->handle = handle; kunmap_atomic(vaddr); first_page->inuse++; zs_stat_inc(class, OBJ_USED, 1); /* Now move the zspage to another fullness group, if required */ fix_fullness_group(pool, first_page); + record_obj(handle, obj); spin_unlock(&class->lock); - return obj; + return handle; } EXPORT_SYMBOL_GPL(zs_malloc); -void zs_free(struct zs_pool *pool, unsigned long obj) +void zs_free(struct zs_pool *pool, unsigned long handle) { struct link_free *link; struct page *first_page, *f_page; - unsigned long f_objidx, f_offset; + unsigned long obj, f_objidx, f_offset; void *vaddr; int class_idx; struct size_class *class; enum fullness_group fullness; - if (unlikely(!obj)) + if (unlikely(!handle)) return; - obj_handle_to_location(obj, &f_page, &f_objidx); + obj = handle_to_obj(handle); + free_handle(pool, handle); + obj_to_location(obj, &f_page, &f_objidx); first_page = get_first_page(f_page); get_zspage_mapping(first_page, &class_idx, &fullness); @@ -1355,20 +1424,20 @@ struct zs_pool *zs_create_pool(char *name, gfp_t flags) if (!pool) return NULL; - pool->name = kstrdup(name, GFP_KERNEL); - if (!pool->name) { - kfree(pool); - return NULL; - } - pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *), GFP_KERNEL); if (!pool->size_class) { - kfree(pool->name); kfree(pool); return NULL; } + pool->name = kstrdup(name, GFP_KERNEL); + if (!pool->name) + goto err; + + if (create_handle_cache(pool)) + goto err; + /* * Iterate reversly, because, size of size_class that we want to use * for merging should be larger or equal to current size. 
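/*
 * (Editorial sketch, not part of the patch: with the handle cache above,
 * the new indirection amounts to one extra word and one extra dereference.
 * Using the names this diff introduces - record_obj(), handle_to_obj() -
 * the lifecycle is roughly:
 *
 *     handle = (unsigned long)kmem_cache_alloc(pool->handle_cachep, flags);
 *     record_obj(handle, obj);        // *(unsigned long *)handle = obj
 *     ...
 *     obj = handle_to_obj(handle);    // read the current location back
 *
 * Callers only ever hold the handle, so zsmalloc can later rewrite the
 * word it points to when the object moves.)
 */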
@@ -1450,6 +1519,7 @@ void zs_destroy_pool(struct zs_pool *pool) kfree(class); } + destroy_handle_cache(pool); kfree(pool->size_class); kfree(pool->name); kfree(pool); -- cgit v1.2.3 From c78062612fb525430b775a0bef4d3cc07e512da0 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 15 Apr 2015 16:15:26 -0700 Subject: zsmalloc: factor out obj_[malloc|free] In a later patch, migration needs parts of the code in zs_malloc and zs_free, so this patch factors them out. Signed-off-by: Minchan Kim Cc: Juneho Choi Cc: Gunho Lee Cc: Luigi Semenzato Cc: Dan Streetman Cc: Seth Jennings Cc: Nitin Gupta Cc: Jerome Marchand Cc: Sergey Senozhatsky Cc: Joonsoo Kim Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zsmalloc.c | 98 ++++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 60 insertions(+), 38 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 6f3cfbf5e237..55b171016f4f 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -525,11 +525,10 @@ static void remove_zspage(struct page *page, struct size_class *class, * page from the freelist of the old fullness group to that of the new * fullness group. */ -static enum fullness_group fix_fullness_group(struct zs_pool *pool, +static enum fullness_group fix_fullness_group(struct size_class *class, struct page *page) { int class_idx; - struct size_class *class; enum fullness_group currfg, newfg; BUG_ON(!is_first_page(page)); @@ -539,7 +538,6 @@ static enum fullness_group fix_fullness_group(struct zs_pool *pool, if (newfg == currfg) goto out; - class = pool->size_class[class_idx]; remove_zspage(page, class, currfg); insert_zspage(page, class, newfg); set_zspage_mapping(page, class_idx, newfg); @@ -1281,6 +1279,33 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) } EXPORT_SYMBOL_GPL(zs_unmap_object); +static unsigned long obj_malloc(struct page *first_page, + struct size_class *class, unsigned long handle) +{ + unsigned long obj; + struct link_free *link; + + struct page *m_page; + unsigned long m_objidx, m_offset; + void *vaddr; + + obj = (unsigned long)first_page->freelist; + obj_to_location(obj, &m_page, &m_objidx); + m_offset = obj_idx_to_offset(m_page, m_objidx, class->size); + + vaddr = kmap_atomic(m_page); + link = (struct link_free *)vaddr + m_offset / sizeof(*link); + first_page->freelist = link->next; + /* record handle in the header of allocated chunk */ + link->handle = handle; + kunmap_atomic(vaddr); + first_page->inuse++; + zs_stat_inc(class, OBJ_USED, 1); + + return obj; +} + + /** * zs_malloc - Allocate block of given size from pool.
* @pool: pool to allocate from @@ -1293,12 +1318,8 @@ EXPORT_SYMBOL_GPL(zs_unmap_object); unsigned long zs_malloc(struct zs_pool *pool, size_t size) { unsigned long handle, obj; - struct link_free *link; struct size_class *class; - void *vaddr; - - struct page *first_page, *m_page; - unsigned long m_objidx, m_offset; + struct page *first_page; if (unlikely(!size || (size + ZS_HANDLE_SIZE) > ZS_MAX_ALLOC_SIZE)) return 0; @@ -1331,22 +1352,9 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) class->size, class->pages_per_zspage)); } - obj = (unsigned long)first_page->freelist; - obj_to_location(obj, &m_page, &m_objidx); - m_offset = obj_idx_to_offset(m_page, m_objidx, class->size); - - vaddr = kmap_atomic(m_page); - link = (struct link_free *)vaddr + m_offset / sizeof(*link); - first_page->freelist = link->next; - - /* record handle in the header of allocated chunk */ - link->handle = handle; - kunmap_atomic(vaddr); - - first_page->inuse++; - zs_stat_inc(class, OBJ_USED, 1); + obj = obj_malloc(first_page, class, handle); /* Now move the zspage to another fullness group, if required */ - fix_fullness_group(pool, first_page); + fix_fullness_group(class, first_page); record_obj(handle, obj); spin_unlock(&class->lock); @@ -1354,46 +1362,60 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) } EXPORT_SYMBOL_GPL(zs_malloc); -void zs_free(struct zs_pool *pool, unsigned long handle) +static void obj_free(struct zs_pool *pool, struct size_class *class, + unsigned long obj) { struct link_free *link; struct page *first_page, *f_page; - unsigned long obj, f_objidx, f_offset; + unsigned long f_objidx, f_offset; void *vaddr; - int class_idx; - struct size_class *class; enum fullness_group fullness; - if (unlikely(!handle)) - return; + BUG_ON(!obj); - obj = handle_to_obj(handle); - free_handle(pool, handle); obj_to_location(obj, &f_page, &f_objidx); first_page = get_first_page(f_page); get_zspage_mapping(first_page, &class_idx, &fullness); - class = pool->size_class[class_idx]; f_offset = obj_idx_to_offset(f_page, f_objidx, class->size); - spin_lock(&class->lock); + vaddr = kmap_atomic(f_page); /* Insert this object in containing zspage's freelist */ - vaddr = kmap_atomic(f_page); link = (struct link_free *)(vaddr + f_offset); link->next = first_page->freelist; kunmap_atomic(vaddr); first_page->freelist = (void *)obj; - first_page->inuse--; - fullness = fix_fullness_group(pool, first_page); + zs_stat_dec(class, OBJ_USED, 1); +} + +void zs_free(struct zs_pool *pool, unsigned long handle) +{ + struct page *first_page, *f_page; + unsigned long obj, f_objidx; + int class_idx; + struct size_class *class; + enum fullness_group fullness; + + if (unlikely(!handle)) + return; + + obj = handle_to_obj(handle); + free_handle(pool, handle); + obj_to_location(obj, &f_page, &f_objidx); + first_page = get_first_page(f_page); + + get_zspage_mapping(first_page, &class_idx, &fullness); + class = pool->size_class[class_idx]; + + spin_lock(&class->lock); + obj_free(pool, class, obj); + fullness = fix_fullness_group(class, first_page); if (fullness == ZS_EMPTY) zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage( class->size, class->pages_per_zspage)); - spin_unlock(&class->lock); if (fullness == ZS_EMPTY) { -- cgit v1.2.3 From 312fcae227037619dc858c9ccd362c7b847730a2 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 15 Apr 2015 16:15:30 -0700 Subject: zsmalloc: support compaction This patch provides core functions for migration of zsmalloc. Migration policy is simple, as follows.
for each size class { while { src_page = get zs_page from ZS_ALMOST_EMPTY if (!src_page) break; dst_page = get zs_page from ZS_ALMOST_FULL if (!dst_page) dst_page = get zs_page from ZS_ALMOST_EMPTY if (!dst_page) break; migrate(from src_page, to dst_page); } } For migration, we need to identify which objects in a zspage are allocated so we can migrate them out. We could find out by iterating over the free objects in a zspage, because the first_page of a zspage keeps the free objects in a singly-linked list, but that is not efficient. Instead, this patch adds a tag (ie, OBJ_ALLOCATED_TAG) in the header of each object (ie, the handle) so we can easily check whether an object is allocated. This patch adds another status bit in the handle to synchronize between user access through zs_map_object and migration. During migration, we cannot move objects a user is using, due to data coherency between the old object and the new object. [akpm@linux-foundation.org: zsmalloc.c needs sched.h for cond_resched()] Signed-off-by: Minchan Kim Cc: Juneho Choi Cc: Gunho Lee Cc: Luigi Semenzato Cc: Dan Streetman Cc: Seth Jennings Cc: Nitin Gupta Cc: Jerome Marchand Cc: Sergey Senozhatsky Cc: Joonsoo Kim Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/zsmalloc.h | 1 + mm/zsmalloc.c | 378 ++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 360 insertions(+), 19 deletions(-) diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h index 3283c6a55425..1338190b5478 100644 --- a/include/linux/zsmalloc.h +++ b/include/linux/zsmalloc.h @@ -47,5 +47,6 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, void zs_unmap_object(struct zs_pool *pool, unsigned long handle); unsigned long zs_get_total_pages(struct zs_pool *pool); +unsigned long zs_compact(struct zs_pool *pool); #endif diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 55b171016f4f..c4ae608dc725 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -78,6 +78,7 @@ #include #include +#include #include #include #include @@ -135,7 +136,26 @@ #endif #endif #define _PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT) -#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS) + +/* + * Memory for allocating for handle keeps object position by + * encoding and the encoded value has a room + * in least bit(ie, look at obj_to_location). + * We use the bit to synchronize between object access by + * user and migration. + */ +#define HANDLE_PIN_BIT 0 + +/* + * Head in allocated object should have OBJ_ALLOCATED_TAG + * to identify the object was allocated or not. + * It's okay to add the status bit in the least bit because + * header keeps handle which is 4byte-aligned address so we + * have room for two bit at least. + */ +#define OBJ_ALLOCATED_TAG 1 +#define OBJ_TAG_BITS 1 +#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS) #define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1) #define MAX(a, b) ((a) >= (b) ? (a) : (b)) @@ -610,35 +630,35 @@ static struct page *get_next_page(struct page *page) /* * Encode as a single handle value. - * On hardware platforms with physical memory starting at 0x0 the pfn - * could be 0 so we ensure that the handle will never be 0 by adjusting the - * encoded obj_idx value before encoding. + * We use the least bit of handle for tagging.
*/ -static void *obj_location_to_handle(struct page *page, unsigned long obj_idx) +static void *location_to_obj(struct page *page, unsigned long obj_idx) { - unsigned long handle; + unsigned long obj; if (!page) { BUG_ON(obj_idx); return NULL; } - handle = page_to_pfn(page) << OBJ_INDEX_BITS; - handle |= ((obj_idx + 1) & OBJ_INDEX_MASK); + obj = page_to_pfn(page) << OBJ_INDEX_BITS; + obj |= ((obj_idx) & OBJ_INDEX_MASK); + obj <<= OBJ_TAG_BITS; - return (void *)handle; + return (void *)obj; } /* * Decode pair from the given object handle. We adjust the * decoded obj_idx back to its original value since it was adjusted in - * obj_location_to_handle(). + * location_to_obj(). */ -static void obj_to_location(unsigned long handle, struct page **page, +static void obj_to_location(unsigned long obj, struct page **page, unsigned long *obj_idx) { - *page = pfn_to_page(handle >> OBJ_INDEX_BITS); - *obj_idx = (handle & OBJ_INDEX_MASK) - 1; + obj >>= OBJ_TAG_BITS; + *page = pfn_to_page(obj >> OBJ_INDEX_BITS); + *obj_idx = (obj & OBJ_INDEX_MASK); } static unsigned long handle_to_obj(unsigned long handle) @@ -646,6 +666,11 @@ static unsigned long handle_to_obj(unsigned long handle) return *(unsigned long *)handle; } +unsigned long obj_to_head(void *obj) +{ + return *(unsigned long *)obj; +} + static unsigned long obj_idx_to_offset(struct page *page, unsigned long obj_idx, int class_size) { @@ -657,6 +682,25 @@ static unsigned long obj_idx_to_offset(struct page *page, return off + obj_idx * class_size; } +static inline int trypin_tag(unsigned long handle) +{ + unsigned long *ptr = (unsigned long *)handle; + + return !test_and_set_bit_lock(HANDLE_PIN_BIT, ptr); +} + +static void pin_tag(unsigned long handle) +{ + while (!trypin_tag(handle)); +} + +static void unpin_tag(unsigned long handle) +{ + unsigned long *ptr = (unsigned long *)handle; + + clear_bit_unlock(HANDLE_PIN_BIT, ptr); +} + static void reset_page(struct page *page) { clear_bit(PG_private, &page->flags); @@ -718,7 +762,7 @@ static void init_zspage(struct page *first_page, struct size_class *class) link = (struct link_free *)vaddr + off / sizeof(*link); while ((off += class->size) < PAGE_SIZE) { - link->next = obj_location_to_handle(page, i++); + link->next = location_to_obj(page, i++); link += class->size / sizeof(*link); } @@ -728,7 +772,7 @@ static void init_zspage(struct page *first_page, struct size_class *class) * page (if present) */ next_page = get_next_page(page); - link->next = obj_location_to_handle(next_page, 0); + link->next = location_to_obj(next_page, 0); kunmap_atomic(vaddr); page = next_page; off %= PAGE_SIZE; @@ -782,7 +826,7 @@ static struct page *alloc_zspage(struct size_class *class, gfp_t flags) init_zspage(first_page, class); - first_page->freelist = obj_location_to_handle(first_page, 0); + first_page->freelist = location_to_obj(first_page, 0); /* Maximum number of objects we can store in this zspage */ first_page->objects = class->pages_per_zspage * PAGE_SIZE / class->size; @@ -1017,6 +1061,13 @@ static bool can_merge(struct size_class *prev, int size, int pages_per_zspage) return true; } +static bool zspage_full(struct page *page) +{ + BUG_ON(!is_first_page(page)); + + return page->inuse == page->objects; +} + #ifdef CONFIG_ZSMALLOC_STAT static inline void zs_stat_inc(struct size_class *class, @@ -1219,6 +1270,9 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, */ BUG_ON(in_interrupt()); + /* From now on, migration cannot move the object */ + pin_tag(handle); + obj = handle_to_obj(handle); 
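/*
 * (Editorial sketch, not part of the patch: after pin_tag() above, the
 * lookup is a two-step decode, roughly:
 *
 *     obj = *(unsigned long *)handle;            // handle_to_obj()
 *     obj >>= OBJ_TAG_BITS;                      // drop the tag bit
 *     page = pfn_to_page(obj >> OBJ_INDEX_BITS); // obj_to_location()
 *     obj_idx = obj & OBJ_INDEX_MASK;
 *
 * Holding HANDLE_PIN_BIT keeps migration from rewriting the handle's
 * word while the object is mapped.)
 */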
obj_to_location(obj, &page, &obj_idx); get_zspage_mapping(get_first_page(page), &class_idx, &fg); @@ -1276,6 +1330,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) __zs_unmap_object(area, pages, off, class->size); } put_cpu_var(zs_map_area); + unpin_tag(handle); } EXPORT_SYMBOL_GPL(zs_unmap_object); @@ -1289,6 +1344,7 @@ static unsigned long obj_malloc(struct page *first_page, unsigned long m_objidx, m_offset; void *vaddr; + handle |= OBJ_ALLOCATED_TAG; obj = (unsigned long)first_page->freelist; obj_to_location(obj, &m_page, &m_objidx); m_offset = obj_idx_to_offset(m_page, m_objidx, class->size); @@ -1374,6 +1430,7 @@ static void obj_free(struct zs_pool *pool, struct size_class *class, BUG_ON(!obj); + obj &= ~OBJ_ALLOCATED_TAG; obj_to_location(obj, &f_page, &f_objidx); first_page = get_first_page(f_page); @@ -1402,8 +1459,8 @@ void zs_free(struct zs_pool *pool, unsigned long handle) if (unlikely(!handle)) return; + pin_tag(handle); obj = handle_to_obj(handle); - free_handle(pool, handle); obj_to_location(obj, &f_page, &f_objidx); first_page = get_first_page(f_page); @@ -1413,18 +1470,301 @@ void zs_free(struct zs_pool *pool, unsigned long handle) spin_lock(&class->lock); obj_free(pool, class, obj); fullness = fix_fullness_group(class, first_page); - if (fullness == ZS_EMPTY) + if (fullness == ZS_EMPTY) { zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage( class->size, class->pages_per_zspage)); + atomic_long_sub(class->pages_per_zspage, + &pool->pages_allocated); + free_zspage(first_page); + } spin_unlock(&class->lock); + unpin_tag(handle); + + free_handle(pool, handle); +} +EXPORT_SYMBOL_GPL(zs_free); + +static void zs_object_copy(unsigned long src, unsigned long dst, + struct size_class *class) +{ + struct page *s_page, *d_page; + unsigned long s_objidx, d_objidx; + unsigned long s_off, d_off; + void *s_addr, *d_addr; + int s_size, d_size, size; + int written = 0; + + s_size = d_size = class->size; + + obj_to_location(src, &s_page, &s_objidx); + obj_to_location(dst, &d_page, &d_objidx); + + s_off = obj_idx_to_offset(s_page, s_objidx, class->size); + d_off = obj_idx_to_offset(d_page, d_objidx, class->size); + + if (s_off + class->size > PAGE_SIZE) + s_size = PAGE_SIZE - s_off; + + if (d_off + class->size > PAGE_SIZE) + d_size = PAGE_SIZE - d_off; + + s_addr = kmap_atomic(s_page); + d_addr = kmap_atomic(d_page); + + while (1) { + size = min(s_size, d_size); + memcpy(d_addr + d_off, s_addr + s_off, size); + written += size; + + if (written == class->size) + break; + + if (s_off + size >= PAGE_SIZE) { + kunmap_atomic(d_addr); + kunmap_atomic(s_addr); + s_page = get_next_page(s_page); + BUG_ON(!s_page); + s_addr = kmap_atomic(s_page); + d_addr = kmap_atomic(d_page); + s_size = class->size - written; + s_off = 0; + } else { + s_off += size; + s_size -= size; + } + + if (d_off + size >= PAGE_SIZE) { + kunmap_atomic(d_addr); + d_page = get_next_page(d_page); + BUG_ON(!d_page); + d_addr = kmap_atomic(d_page); + d_size = class->size - written; + d_off = 0; + } else { + d_off += size; + d_size -= size; + } + } + + kunmap_atomic(d_addr); + kunmap_atomic(s_addr); +} + +/* + * Find alloced object in zspage from index object and + * return handle. 
+ */ +static unsigned long find_alloced_obj(struct page *page, int index, + struct size_class *class) +{ + unsigned long head; + int offset = 0; + unsigned long handle = 0; + void *addr = kmap_atomic(page); + + if (!is_first_page(page)) + offset = page->index; + offset += class->size * index; + + while (offset < PAGE_SIZE) { + head = obj_to_head(addr + offset); + if (head & OBJ_ALLOCATED_TAG) { + handle = head & ~OBJ_ALLOCATED_TAG; + if (trypin_tag(handle)) + break; + handle = 0; + } + + offset += class->size; + index++; + } + + kunmap_atomic(addr); + return handle; +} + +struct zs_compact_control { + /* Source page for migration which could be a subpage of zspage. */ + struct page *s_page; + /* Destination page for migration which should be a first page + * of zspage. */ + struct page *d_page; + /* Starting object index within @s_page which used for live object + * in the subpage. */ + int index; + /* how many of objects are migrated */ + int nr_migrated; +}; + +static int migrate_zspage(struct zs_pool *pool, struct size_class *class, + struct zs_compact_control *cc) +{ + unsigned long used_obj, free_obj; + unsigned long handle; + struct page *s_page = cc->s_page; + struct page *d_page = cc->d_page; + unsigned long index = cc->index; + int nr_migrated = 0; + int ret = 0; + + while (1) { + handle = find_alloced_obj(s_page, index, class); + if (!handle) { + s_page = get_next_page(s_page); + if (!s_page) + break; + index = 0; + continue; + } + + /* Stop if there is no more space */ + if (zspage_full(d_page)) { + unpin_tag(handle); + ret = -ENOMEM; + break; + } + used_obj = handle_to_obj(handle); + free_obj = obj_malloc(d_page, class, handle); + zs_object_copy(used_obj, free_obj, class); + index++; + record_obj(handle, free_obj); + unpin_tag(handle); + obj_free(pool, class, used_obj); + nr_migrated++; + } + + /* Remember last position in this iteration */ + cc->s_page = s_page; + cc->index = index; + cc->nr_migrated = nr_migrated; + + return ret; +} + +static struct page *alloc_target_page(struct size_class *class) +{ + int i; + struct page *page; + + for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) { + page = class->fullness_list[i]; + if (page) { + remove_zspage(page, class, i); + break; + } + } + + return page; +} + +static void putback_zspage(struct zs_pool *pool, struct size_class *class, + struct page *first_page) +{ + int class_idx; + enum fullness_group fullness; + + BUG_ON(!is_first_page(first_page)); + + get_zspage_mapping(first_page, &class_idx, &fullness); + insert_zspage(first_page, class, fullness); + fullness = fix_fullness_group(class, first_page); if (fullness == ZS_EMPTY) { + zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage( + class->size, class->pages_per_zspage)); atomic_long_sub(class->pages_per_zspage, &pool->pages_allocated); + free_zspage(first_page); } } -EXPORT_SYMBOL_GPL(zs_free); + +static struct page *isolate_source_page(struct size_class *class) +{ + struct page *page; + + page = class->fullness_list[ZS_ALMOST_EMPTY]; + if (page) + remove_zspage(page, class, ZS_ALMOST_EMPTY); + + return page; +} + +static unsigned long __zs_compact(struct zs_pool *pool, + struct size_class *class) +{ + int nr_to_migrate; + struct zs_compact_control cc; + struct page *src_page; + struct page *dst_page = NULL; + unsigned long nr_total_migrated = 0; + + cond_resched(); + + spin_lock(&class->lock); + while ((src_page = isolate_source_page(class))) { + + BUG_ON(!is_first_page(src_page)); + + /* The goal is to migrate all live objects in source page */ + nr_to_migrate = 
src_page->inuse; + cc.index = 0; + cc.s_page = src_page; + + while ((dst_page = alloc_target_page(class))) { + cc.d_page = dst_page; + /* + * If there is no more space in dst_page, try to + * allocate another zspage. + */ + if (!migrate_zspage(pool, class, &cc)) + break; + + putback_zspage(pool, class, dst_page); + nr_total_migrated += cc.nr_migrated; + nr_to_migrate -= cc.nr_migrated; + } + + /* Stop if we couldn't find slot */ + if (dst_page == NULL) + break; + + putback_zspage(pool, class, dst_page); + putback_zspage(pool, class, src_page); + spin_unlock(&class->lock); + nr_total_migrated += cc.nr_migrated; + cond_resched(); + spin_lock(&class->lock); + } + + if (src_page) + putback_zspage(pool, class, src_page); + + spin_unlock(&class->lock); + + return nr_total_migrated; +} + +unsigned long zs_compact(struct zs_pool *pool) +{ + int i; + unsigned long nr_migrated = 0; + struct size_class *class; + + for (i = zs_size_classes - 1; i >= 0; i--) { + class = pool->size_class[i]; + if (!class) + continue; + if (class->index != i) + continue; + nr_migrated += __zs_compact(pool, class); + } + + synchronize_rcu(); + + return nr_migrated; +} +EXPORT_SYMBOL_GPL(zs_compact); /** * zs_create_pool - Creates an allocation pool to work from. -- cgit v1.2.3 From d3d07c92ff69f784bb8c3279fa87678bfa2f7f6f Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 15 Apr 2015 16:15:33 -0700 Subject: zsmalloc: adjust ZS_ALMOST_FULL Currently, zsmalloc regards a zspage as ZS_ALMOST_EMPTY if the zspage has under 1/4 used objects (ie, fullness_threshold_frac). That can result in loose packing, since zsmalloc migrates only ZS_ALMOST_EMPTY zspages out. This patch changes the rule so that zsmalloc marks a zspage with more than 3/4 of its objects used as ZS_ALMOST_FULL, allowing tighter packing. Signed-off-by: Minchan Kim Cc: Juneho Choi Cc: Gunho Lee Cc: Luigi Semenzato Cc: Dan Streetman Cc: Seth Jennings Cc: Nitin Gupta Cc: Jerome Marchand Cc: Sergey Senozhatsky Cc: Joonsoo Kim Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zsmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index c4ae608dc725..3bd8b0a16462 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -480,7 +480,7 @@ static enum fullness_group get_fullness_group(struct page *page) fg = ZS_EMPTY; else if (inuse == max_objects) fg = ZS_FULL; - else if (inuse <= max_objects / fullness_threshold_frac) + else if (inuse <= 3 * max_objects / fullness_threshold_frac) fg = ZS_ALMOST_EMPTY; else fg = ZS_ALMOST_FULL; -- cgit v1.2.3 From 4e3ba87845420e0bfa21e6c4f7f81897aed38f8c Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 15 Apr 2015 16:15:36 -0700 Subject: zram: support compaction Now that zsmalloc supports compaction, zram can use it. As a first step, this patch exports a compact knob via sysfs so the user can trigger compaction via "echo 1 > /sys/block/zram0/compact".
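For illustration, a minimal userspace sketch (hypothetical, not part of this patch) that triggers the knob the same way the echo above does; note that compact_store() in the diff below does not parse the written value, so any successful write triggers zs_compact():

#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	/* Any successful write to the sysfs node triggers compaction. */
	int fd = open("/sys/block/zram0/compact", O_WRONLY);

	if (fd < 0)
		return 1;
	if (write(fd, "1", 1) != 1) {
		close(fd);
		return 1;
	}
	close(fd);
	return 0;
}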
Signed-off-by: Minchan Kim Cc: Juneho Choi Cc: Gunho Lee Cc: Luigi Semenzato Cc: Dan Streetman Cc: Seth Jennings Cc: Nitin Gupta Cc: Jerome Marchand Cc: Sergey Senozhatsky Cc: Joonsoo Kim Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/ABI/testing/sysfs-block-zram | 15 +++++++++++++++ drivers/block/zram/zram_drv.c | 25 +++++++++++++++++++++++++ drivers/block/zram/zram_drv.h | 1 + 3 files changed, 41 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-block-zram b/Documentation/ABI/testing/sysfs-block-zram index a6148eaf91e5..bede9028a5a0 100644 --- a/Documentation/ABI/testing/sysfs-block-zram +++ b/Documentation/ABI/testing/sysfs-block-zram @@ -141,3 +141,18 @@ Description: amount of memory ZRAM can use to store the compressed data. The limit could be changed in run time and "0" means disable the limit. No limit is the initial state. Unit: bytes + +What: /sys/block/zram/compact +Date: August 2015 +Contact: Minchan Kim +Description: + The compact file is write-only and trigger compaction for + allocator zrm uses. The allocator moves some objects so that + it could free fragment space. + +What: /sys/block/zram/num_migrated +Date: August 2015 +Contact: Minchan Kim +Description: + The compact file is read-only and shows how many object + migrated by compaction. diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 871bd3550cb0..1626961fdb2f 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -63,6 +63,27 @@ static inline struct zram *dev_to_zram(struct device *dev) return (struct zram *)dev_to_disk(dev)->private_data; } +static ssize_t compact_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + unsigned long nr_migrated; + struct zram *zram = dev_to_zram(dev); + struct zram_meta *meta; + + down_read(&zram->init_lock); + if (!init_done(zram)) { + up_read(&zram->init_lock); + return -EINVAL; + } + + meta = zram->meta; + nr_migrated = zs_compact(meta->mem_pool); + atomic64_add(nr_migrated, &zram->stats.num_migrated); + up_read(&zram->init_lock); + + return len; +} + static ssize_t disksize_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -1017,6 +1038,7 @@ static const struct block_device_operations zram_devops = { .owner = THIS_MODULE }; +static DEVICE_ATTR_WO(compact); static DEVICE_ATTR_RW(disksize); static DEVICE_ATTR_RO(initstate); static DEVICE_ATTR_WO(reset); @@ -1035,6 +1057,7 @@ ZRAM_ATTR_RO(invalid_io); ZRAM_ATTR_RO(notify_free); ZRAM_ATTR_RO(zero_pages); ZRAM_ATTR_RO(compr_data_size); +ZRAM_ATTR_RO(num_migrated); static struct attribute *zram_disk_attrs[] = { &dev_attr_disksize.attr, @@ -1044,6 +1067,8 @@ static struct attribute *zram_disk_attrs[] = { &dev_attr_num_writes.attr, &dev_attr_failed_reads.attr, &dev_attr_failed_writes.attr, + &dev_attr_num_migrated.attr, + &dev_attr_compact.attr, &dev_attr_invalid_io.attr, &dev_attr_notify_free.attr, &dev_attr_zero_pages.attr, diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 17056e589146..570c598f4ce9 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -84,6 +84,7 @@ struct zram_stats { atomic64_t compr_data_size; /* compressed size of pages stored */ atomic64_t num_reads; /* failed + successful */ atomic64_t num_writes; /* --do-- */ + atomic64_t num_migrated; /* no. 
of migrated object */ atomic64_t failed_reads; /* can happen when memory is too low */ atomic64_t failed_writes; /* can happen when memory is too low */ atomic64_t invalid_io; /* non-page-aligned I/O requests */ -- cgit v1.2.3 From 7b60a68529b0d827d26ea3426c2addd071bff789 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 15 Apr 2015 16:15:39 -0700 Subject: zsmalloc: record handle in page->private for huge object We store the handle in the header of each allocated object, so it increases the size of each object by sizeof(unsigned long). If zram stores 4096 bytes to zsmalloc (ie, bad compression), zsmalloc needs the 4104B class to make room for the handle. However, the 4104B class has 1-pages_per_zspage, so the size wasted by internal fragmentation is 8192 - 4104, which is terrible. So this patch records the handle in page->private for such huge objects (ie, pages_per_zspage == 1 && maxobj_per_zspage == 1) instead of in the header of each object, so we can use the 4096B class, not the 4104B class. Signed-off-by: Minchan Kim Cc: Juneho Choi Cc: Gunho Lee Cc: Luigi Semenzato Cc: Dan Streetman Cc: Seth Jennings Cc: Nitin Gupta Cc: Jerome Marchand Cc: Sergey Senozhatsky Cc: Joonsoo Kim Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zsmalloc.c | 54 ++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 42 insertions(+), 12 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 3bd8b0a16462..95771c75f2e9 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -57,6 +57,8 @@ * * page->private (union with page->first_page): refers to the * component page after the first page + * If the page is first_page for huge object, it stores handle. + * Look at size_class->huge. * page->freelist: points to the first free object in zspage. * Free objects are linked together using in-place * metadata. @@ -163,7 +165,7 @@ #define ZS_MIN_ALLOC_SIZE \ MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS)) /* each chunk includes extra space to keep handle */ -#define ZS_MAX_ALLOC_SIZE (PAGE_SIZE + ZS_HANDLE_SIZE) +#define ZS_MAX_ALLOC_SIZE PAGE_SIZE /* * On systems with 4K page size, this gives 255 size classes!
There is a @@ -239,6 +241,8 @@ struct size_class { /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */ int pages_per_zspage; + /* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */ + bool huge; #ifdef CONFIG_ZSMALLOC_STAT struct zs_size_stat stats; @@ -300,6 +304,7 @@ struct mapping_area { #endif char *vm_addr; /* address of kmap_atomic()'ed pages */ enum zs_mapmode vm_mm; /* mapping mode */ + bool huge; }; static int create_handle_cache(struct zs_pool *pool) @@ -457,7 +462,7 @@ static int get_size_class_index(int size) idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE, ZS_SIZE_CLASS_DELTA); - return idx; + return min(zs_size_classes - 1, idx); } /* @@ -666,9 +671,14 @@ static unsigned long handle_to_obj(unsigned long handle) return *(unsigned long *)handle; } -unsigned long obj_to_head(void *obj) +static unsigned long obj_to_head(struct size_class *class, struct page *page, + void *obj) { - return *(unsigned long *)obj; + if (class->huge) { + VM_BUG_ON(!is_first_page(page)); + return *(unsigned long *)page_private(page); + } else + return *(unsigned long *)obj; } static unsigned long obj_idx_to_offset(struct page *page, @@ -954,9 +964,12 @@ static void __zs_unmap_object(struct mapping_area *area, if (area->vm_mm == ZS_MM_RO) goto out; - buf = area->vm_buf + ZS_HANDLE_SIZE; - size -= ZS_HANDLE_SIZE; - off += ZS_HANDLE_SIZE; + buf = area->vm_buf; + if (!area->huge) { + buf = buf + ZS_HANDLE_SIZE; + size -= ZS_HANDLE_SIZE; + off += ZS_HANDLE_SIZE; + } sizes[0] = PAGE_SIZE - off; sizes[1] = size - sizes[0]; @@ -1295,7 +1308,10 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, ret = __zs_map_object(area, pages, off, class->size); out: - return ret + ZS_HANDLE_SIZE; + if (!class->huge) + ret += ZS_HANDLE_SIZE; + + return ret; } EXPORT_SYMBOL_GPL(zs_map_object); @@ -1352,8 +1368,12 @@ static unsigned long obj_malloc(struct page *first_page, vaddr = kmap_atomic(m_page); link = (struct link_free *)vaddr + m_offset / sizeof(*link); first_page->freelist = link->next; - /* record handle in the header of allocated chunk */ - link->handle = handle; + if (!class->huge) + /* record handle in the header of allocated chunk */ + link->handle = handle; + else + /* record handle in first_page->private */ + set_page_private(first_page, handle); kunmap_atomic(vaddr); first_page->inuse++; zs_stat_inc(class, OBJ_USED, 1); @@ -1377,7 +1397,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) struct size_class *class; struct page *first_page; - if (unlikely(!size || (size + ZS_HANDLE_SIZE) > ZS_MAX_ALLOC_SIZE)) + if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE)) return 0; handle = alloc_handle(pool); @@ -1387,6 +1407,11 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) /* extra space in chunk to keep the handle */ size += ZS_HANDLE_SIZE; class = pool->size_class[get_size_class_index(size)]; + /* In huge class size, we store the handle into first_page->private */ + if (class->huge) { + size -= ZS_HANDLE_SIZE; + class = pool->size_class[get_size_class_index(size)]; + } spin_lock(&class->lock); first_page = find_get_zspage(class); @@ -1442,6 +1467,8 @@ static void obj_free(struct zs_pool *pool, struct size_class *class, /* Insert this object in containing zspage's freelist */ link = (struct link_free *)(vaddr + f_offset); link->next = first_page->freelist; + if (class->huge) + set_page_private(first_page, 0); kunmap_atomic(vaddr); first_page->freelist = (void *)obj; first_page->inuse--; @@ -1567,7 +1594,7 @@ static unsigned long find_alloced_obj(struct page 
*page, int index, offset += class->size * index; while (offset < PAGE_SIZE) { - head = obj_to_head(addr + offset); + head = obj_to_head(class, page, addr + offset); if (head & OBJ_ALLOCATED_TAG) { handle = head & ~OBJ_ALLOCATED_TAG; if (trypin_tag(handle)) @@ -1837,6 +1864,9 @@ struct zs_pool *zs_create_pool(char *name, gfp_t flags) class->size = size; class->index = i; class->pages_per_zspage = pages_per_zspage; + if (pages_per_zspage == 1 && + get_maxobj_per_zspage(size, pages_per_zspage) == 1) + class->huge = true; spin_lock_init(&class->lock); pool->size_class[i] = class; -- cgit v1.2.3 From 248ca1b053c82fa22427d22b33ac51a24c88a86d Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 15 Apr 2015 16:15:42 -0700 Subject: zsmalloc: add fullness into stat While investigating compaction, fullness information for each size class is helpful for understanding how well compaction works. With it, we can see more clearly how compaction behaves on each size class. Signed-off-by: Minchan Kim Cc: Juneho Choi Cc: Gunho Lee Cc: Luigi Semenzato Cc: Dan Streetman Cc: Seth Jennings Cc: Nitin Gupta Cc: Jerome Marchand Cc: Sergey Senozhatsky Cc: Joonsoo Kim Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zsmalloc.c | 349 +++++++++++++++++++++++++++++++--------------------------- 1 file changed, 184 insertions(+), 165 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 95771c75f2e9..461243e14d3e 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -197,6 +197,8 @@ enum fullness_group { enum zs_stat_type { OBJ_ALLOCATED, OBJ_USED, + CLASS_ALMOST_FULL, + CLASS_ALMOST_EMPTY, NR_ZS_STAT_TYPE, }; @@ -412,6 +414,11 @@ static struct zpool_driver zs_zpool_driver = { MODULE_ALIAS("zpool-zsmalloc"); #endif /* CONFIG_ZPOOL */ +static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage) +{ + return pages_per_zspage * PAGE_SIZE / size; +} + /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ static DEFINE_PER_CPU(struct mapping_area, zs_map_area); @@ -465,6 +472,179 @@ static int get_size_class_index(int size) return min(zs_size_classes - 1, idx); } +#ifdef CONFIG_ZSMALLOC_STAT + +static inline void zs_stat_inc(struct size_class *class, + enum zs_stat_type type, unsigned long cnt) +{ + class->stats.objs[type] += cnt; +} + +static inline void zs_stat_dec(struct size_class *class, + enum zs_stat_type type, unsigned long cnt) +{ + class->stats.objs[type] -= cnt; +} + +static inline unsigned long zs_stat_get(struct size_class *class, + enum zs_stat_type type) +{ + return class->stats.objs[type]; +} + +static int __init zs_stat_init(void) +{ + if (!debugfs_initialized()) + return -ENODEV; + + zs_stat_root = debugfs_create_dir("zsmalloc", NULL); + if (!zs_stat_root) + return -ENOMEM; + + return 0; +} + +static void __exit zs_stat_exit(void) +{ + debugfs_remove_recursive(zs_stat_root); +} + +static int zs_stats_size_show(struct seq_file *s, void *v) +{ + int i; + struct zs_pool *pool = s->private; + struct size_class *class; + int objs_per_zspage; + unsigned long class_almost_full, class_almost_empty; + unsigned long obj_allocated, obj_used, pages_used; + unsigned long total_class_almost_full = 0, total_class_almost_empty = 0; + unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0; + + seq_printf(s, " %5s %5s %11s %12s %13s %10s %10s %16s\n", + "class", "size", "almost_full", "almost_empty", + "obj_allocated", "obj_used", "pages_used", + "pages_per_zspage"); + + for (i = 0; i < zs_size_classes; i++) { + class =
pool->size_class[i]; + + if (class->index != i) + continue; + + spin_lock(&class->lock); + class_almost_full = zs_stat_get(class, CLASS_ALMOST_FULL); + class_almost_empty = zs_stat_get(class, CLASS_ALMOST_EMPTY); + obj_allocated = zs_stat_get(class, OBJ_ALLOCATED); + obj_used = zs_stat_get(class, OBJ_USED); + spin_unlock(&class->lock); + + objs_per_zspage = get_maxobj_per_zspage(class->size, + class->pages_per_zspage); + pages_used = obj_allocated / objs_per_zspage * + class->pages_per_zspage; + + seq_printf(s, " %5u %5u %11lu %12lu %13lu %10lu %10lu %16d\n", + i, class->size, class_almost_full, class_almost_empty, + obj_allocated, obj_used, pages_used, + class->pages_per_zspage); + + total_class_almost_full += class_almost_full; + total_class_almost_empty += class_almost_empty; + total_objs += obj_allocated; + total_used_objs += obj_used; + total_pages += pages_used; + } + + seq_puts(s, "\n"); + seq_printf(s, " %5s %5s %11lu %12lu %13lu %10lu %10lu\n", + "Total", "", total_class_almost_full, + total_class_almost_empty, total_objs, + total_used_objs, total_pages); + + return 0; +} + +static int zs_stats_size_open(struct inode *inode, struct file *file) +{ + return single_open(file, zs_stats_size_show, inode->i_private); +} + +static const struct file_operations zs_stat_size_ops = { + .open = zs_stats_size_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int zs_pool_stat_create(char *name, struct zs_pool *pool) +{ + struct dentry *entry; + + if (!zs_stat_root) + return -ENODEV; + + entry = debugfs_create_dir(name, zs_stat_root); + if (!entry) { + pr_warn("debugfs dir <%s> creation failed\n", name); + return -ENOMEM; + } + pool->stat_dentry = entry; + + entry = debugfs_create_file("classes", S_IFREG | S_IRUGO, + pool->stat_dentry, pool, &zs_stat_size_ops); + if (!entry) { + pr_warn("%s: debugfs file entry <%s> creation failed\n", + name, "classes"); + return -ENOMEM; + } + + return 0; +} + +static void zs_pool_stat_destroy(struct zs_pool *pool) +{ + debugfs_remove_recursive(pool->stat_dentry); +} + +#else /* CONFIG_ZSMALLOC_STAT */ + +static inline void zs_stat_inc(struct size_class *class, + enum zs_stat_type type, unsigned long cnt) +{ +} + +static inline void zs_stat_dec(struct size_class *class, + enum zs_stat_type type, unsigned long cnt) +{ +} + +static inline unsigned long zs_stat_get(struct size_class *class, + enum zs_stat_type type) +{ + return 0; +} + +static int __init zs_stat_init(void) +{ + return 0; +} + +static void __exit zs_stat_exit(void) +{ +} + +static inline int zs_pool_stat_create(char *name, struct zs_pool *pool) +{ + return 0; +} + +static inline void zs_pool_stat_destroy(struct zs_pool *pool) +{ +} + +#endif + + /* * For each size class, zspages are divided into different groups * depending on how "full" they are. This was done so that we could @@ -514,6 +694,8 @@ static void insert_zspage(struct page *page, struct size_class *class, list_add_tail(&page->lru, &(*head)->lru); *head = page; + zs_stat_inc(class, fullness == ZS_ALMOST_EMPTY ? + CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1); } /* @@ -539,6 +721,8 @@ static void remove_zspage(struct page *page, struct size_class *class, struct page, lru); list_del_init(&page->lru); + zs_stat_dec(class, fullness == ZS_ALMOST_EMPTY ? 
+ CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1); } /* @@ -1057,11 +1241,6 @@ static void init_zs_size_classes(void) zs_size_classes = nr; } -static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage) -{ - return pages_per_zspage * PAGE_SIZE / size; -} - static bool can_merge(struct size_class *prev, int size, int pages_per_zspage) { if (prev->pages_per_zspage != pages_per_zspage) @@ -1081,166 +1260,6 @@ static bool zspage_full(struct page *page) return page->inuse == page->objects; } -#ifdef CONFIG_ZSMALLOC_STAT - -static inline void zs_stat_inc(struct size_class *class, - enum zs_stat_type type, unsigned long cnt) -{ - class->stats.objs[type] += cnt; -} - -static inline void zs_stat_dec(struct size_class *class, - enum zs_stat_type type, unsigned long cnt) -{ - class->stats.objs[type] -= cnt; -} - -static inline unsigned long zs_stat_get(struct size_class *class, - enum zs_stat_type type) -{ - return class->stats.objs[type]; -} - -static int __init zs_stat_init(void) -{ - if (!debugfs_initialized()) - return -ENODEV; - - zs_stat_root = debugfs_create_dir("zsmalloc", NULL); - if (!zs_stat_root) - return -ENOMEM; - - return 0; -} - -static void __exit zs_stat_exit(void) -{ - debugfs_remove_recursive(zs_stat_root); -} - -static int zs_stats_size_show(struct seq_file *s, void *v) -{ - int i; - struct zs_pool *pool = s->private; - struct size_class *class; - int objs_per_zspage; - unsigned long obj_allocated, obj_used, pages_used; - unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0; - - seq_printf(s, " %5s %5s %13s %10s %10s\n", "class", "size", - "obj_allocated", "obj_used", "pages_used"); - - for (i = 0; i < zs_size_classes; i++) { - class = pool->size_class[i]; - - if (class->index != i) - continue; - - spin_lock(&class->lock); - obj_allocated = zs_stat_get(class, OBJ_ALLOCATED); - obj_used = zs_stat_get(class, OBJ_USED); - spin_unlock(&class->lock); - - objs_per_zspage = get_maxobj_per_zspage(class->size, - class->pages_per_zspage); - pages_used = obj_allocated / objs_per_zspage * - class->pages_per_zspage; - - seq_printf(s, " %5u %5u %10lu %10lu %10lu\n", i, - class->size, obj_allocated, obj_used, pages_used); - - total_objs += obj_allocated; - total_used_objs += obj_used; - total_pages += pages_used; - } - - seq_puts(s, "\n"); - seq_printf(s, " %5s %5s %10lu %10lu %10lu\n", "Total", "", - total_objs, total_used_objs, total_pages); - - return 0; -} - -static int zs_stats_size_open(struct inode *inode, struct file *file) -{ - return single_open(file, zs_stats_size_show, inode->i_private); -} - -static const struct file_operations zs_stat_size_ops = { - .open = zs_stats_size_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int zs_pool_stat_create(char *name, struct zs_pool *pool) -{ - struct dentry *entry; - - if (!zs_stat_root) - return -ENODEV; - - entry = debugfs_create_dir(name, zs_stat_root); - if (!entry) { - pr_warn("debugfs dir <%s> creation failed\n", name); - return -ENOMEM; - } - pool->stat_dentry = entry; - - entry = debugfs_create_file("obj_in_classes", S_IFREG | S_IRUGO, - pool->stat_dentry, pool, &zs_stat_size_ops); - if (!entry) { - pr_warn("%s: debugfs file entry <%s> creation failed\n", - name, "obj_in_classes"); - return -ENOMEM; - } - - return 0; -} - -static void zs_pool_stat_destroy(struct zs_pool *pool) -{ - debugfs_remove_recursive(pool->stat_dentry); -} - -#else /* CONFIG_ZSMALLOC_STAT */ - -static inline void zs_stat_inc(struct size_class *class, - enum zs_stat_type type, unsigned long cnt) -{ 
-} - -static inline void zs_stat_dec(struct size_class *class, - enum zs_stat_type type, unsigned long cnt) -{ -} - -static inline unsigned long zs_stat_get(struct size_class *class, - enum zs_stat_type type) -{ - return 0; -} - -static int __init zs_stat_init(void) -{ - return 0; -} - -static void __exit zs_stat_exit(void) -{ -} - -static inline int zs_pool_stat_create(char *name, struct zs_pool *pool) -{ - return 0; -} - -static inline void zs_pool_stat_destroy(struct zs_pool *pool) -{ -} - -#endif - unsigned long zs_get_total_pages(struct zs_pool *pool) { return atomic_long_read(&pool->pages_allocated); -- cgit v1.2.3 From d02be50dba649b4246e0c1c4b7cb5d8a8d49de9a Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 15 Apr 2015 16:15:46 -0700 Subject: zsmalloc: zsmalloc documentation Create a zsmalloc doc which explains the design concept and stat information. Signed-off-by: Minchan Kim Cc: Juneho Choi Cc: Gunho Lee Cc: Luigi Semenzato Cc: Dan Streetman Cc: Seth Jennings Cc: Nitin Gupta Cc: Jerome Marchand Cc: Sergey Senozhatsky Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/zsmalloc.txt | 70 +++++++++++++++++++++++++++++++++++++++++++ MAINTAINERS | 1 + mm/zsmalloc.c | 29 ------------------ 3 files changed, 71 insertions(+), 29 deletions(-) create mode 100644 Documentation/vm/zsmalloc.txt diff --git a/Documentation/vm/zsmalloc.txt b/Documentation/vm/zsmalloc.txt new file mode 100644 index 000000000000..64ed63c4f69d --- /dev/null +++ b/Documentation/vm/zsmalloc.txt @@ -0,0 +1,70 @@ +zsmalloc +-------- + +This allocator is designed for use with zram. Thus, the allocator is +supposed to work well under low memory conditions. In particular, it +never attempts higher order page allocation which is very likely to +fail under memory pressure. On the other hand, if we just use single +(0-order) pages, it would suffer from very high fragmentation -- +any object of size PAGE_SIZE/2 or larger would occupy an entire page. +This was one of the major issues with its predecessor (xvmalloc). + +To overcome these issues, zsmalloc allocates a bunch of 0-order pages +and links them together using various 'struct page' fields. These linked +pages act as a single higher-order page i.e. an object can span 0-order +page boundaries. The code refers to these linked pages as a single entity +called zspage. + +For simplicity, zsmalloc can only allocate objects of size up to PAGE_SIZE +since this satisfies the requirements of all its current users (in the +worst case, page is incompressible and is thus stored "as-is" i.e. in +uncompressed form). For allocation requests larger than this size, failure +is returned (see zs_malloc). + +Additionally, zs_malloc() does not return a dereferenceable pointer. +Instead, it returns an opaque handle (unsigned long) which encodes actual +location of the allocated object. The reason for this indirection is that +zsmalloc does not keep zspages permanently mapped since that would cause +issues on 32-bit systems where the VA region for kernel space mappings +is very small. So, before using the allocated memory, the object has to +be mapped using zs_map_object() to get a usable pointer and subsequently +unmapped using zs_unmap_object(). + +stat +---- + +With CONFIG_ZSMALLOC_STAT, we can see zsmalloc internal information via +/sys/kernel/debug/zsmalloc/. Here is a sample of stat output: + +# cat /sys/kernel/debug/zsmalloc/zram0/classes + + class size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage + .. + ..
+ 9 176 0 1 186 129 8 4 + 10 192 1 0 2880 2872 135 3 + 11 208 0 1 819 795 42 2 + 12 224 0 1 219 159 12 4 + .. + .. + + +class: index +size: object size zspage stores +almost_empty: the number of ZS_ALMOST_EMPTY zspages (see below) +almost_full: the number of ZS_ALMOST_FULL zspages (see below) +obj_allocated: the number of objects allocated +obj_used: the number of objects allocated to the user +pages_used: the number of pages allocated for the class +pages_per_zspage: the number of 0-order pages to make a zspage + +We assign a zspage to ZS_ALMOST_EMPTY fullness group when: + n <= N / f, where +n = number of allocated objects +N = total number of objects zspage can store +f = fullness_threshold_frac (i.e., 4 at the moment) + +Similarly, we assign zspage to: + ZS_ALMOST_FULL when n > N / f + ZS_EMPTY when n == 0 + ZS_FULL when n == N diff --git a/MAINTAINERS b/MAINTAINERS index 6ee1e79ea16b..190981382853 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10972,6 +10972,7 @@ L: linux-mm@kvack.org S: Maintained F: mm/zsmalloc.c F: include/linux/zsmalloc.h +F: Documentation/vm/zsmalloc.txt ZSWAP COMPRESSED SWAP CACHING M: Seth Jennings diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 461243e14d3e..1833fc9e09cb 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -12,35 +12,6 @@ */ /* - * This allocator is designed for use with zram. Thus, the allocator is - * supposed to work well under low memory conditions. In particular, it - * never attempts higher order page allocation which is very likely to - * fail under memory pressure. On the other hand, if we just use single - * (0-order) pages, it would suffer from very high fragmentation -- - * any object of size PAGE_SIZE/2 or larger would occupy an entire page. - * This was one of the major issues with its predecessor (xvmalloc). - * - * To overcome these issues, zsmalloc allocates a bunch of 0-order pages - * and links them together using various 'struct page' fields. These linked - * pages act as a single higher-order page i.e. an object can span 0-order - * page boundaries. The code refers to these linked pages as a single entity - * called zspage. - * - * For simplicity, zsmalloc can only allocate objects of size up to PAGE_SIZE - * since this satisfies the requirements of all its current users (in the - * worst case, page is incompressible and is thus stored "as-is" i.e. in - * uncompressed form). For allocation requests larger than this size, failure - * is returned (see zs_malloc). - * - * Additionally, zs_malloc() does not return a dereferenceable pointer. - * Instead, it returns an opaque handle (unsigned long) which encodes actual - * location of the allocated object. The reason for this indirection is that - * zsmalloc does not keep zspages permanently mapped since that would cause - * issues on 32-bit systems where the VA region for kernel space mappings - * is very small. So, before using the allocating memory, the object has to - * be mapped using zs_map_object() to get a usable pointer and subsequently - * unmapped using zs_unmap_object(). - * * Following is how we use various fields and flags of underlying * struct page(s) to form a zspage.
* -- cgit v1.2.3 From 888fa374e625f3ee8f3cc2efebf52939d3bb6da1 Mon Sep 17 00:00:00 2001 From: Yinghao Xie Date: Wed, 15 Apr 2015 16:15:49 -0700 Subject: mm/zsmalloc.c: fix comment for get_pages_per_zspage Signed-off-by: Yinghao Xie Suggested-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zsmalloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 1833fc9e09cb..7a48e5568d46 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -731,7 +731,8 @@ out: * to form a zspage for each size class. This is important * to reduce wastage due to unusable space left at end of * each zspage which is given as: - * wastage = Zp - Zp % size_class + * wastage = Zp % class_size + * usage = Zp - wastage * where Zp = zspage size = k * PAGE_SIZE where k = 1, 2, ... * * For example, for size class of 3/8 * PAGE_SIZE, we should -- cgit v1.2.3 From 10447b60bee52f026bdbc5fe2aca52d0492fc91d Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 15 Apr 2015 16:15:52 -0700 Subject: zram: remove `num_migrated' device attr This patch introduces rework to zram stats. We have per-stat sysfs nodes, and it makes things a bit hard to use in user space: it doesn't give an immediate stats 'snapshot', it requires user space to use more syscalls - open, read, close for every stat file, with appropriate error checks on every step, etc. First, zram now accounts block layer statistics, available in /sys/block/zram/stat and /proc/diskstats files. So some new stats are available (see Documentation/block/stat.txt), besides, zram's activities now can be monitored by sysstat's iostat or similar tools. Example: cat /sys/block/zram0/stat 248 0 1984 0 251029 0 2008232 5120 0 5116 5116 Second, group currently exported on per-stat basis nodes into two categories (files): -- zram/io_stat accumulates device's IO stats, that are not accounted by block layer, and contains: failed_reads failed_writes invalid_io notify_free Example: cat /sys/block/zram0/io_stat 0 0 0 652572 -- zram/mm_stat accumulates zram mm stats and contains: orig_data_size compr_data_size mem_used_total mem_limit mem_used_max zero_pages num_migrated Example: cat /sys/block/zram0/mm_stat 434634752 270288572 279158784 0 579895296 15060 0 per-stat sysfs nodes are now considered to be deprecated and we plan to remove them (and clean up some of the existing stat code) in two years (as of now, there is no warning printed to syslog about deprecated stats being used). User space is advised to use the above mentioned 3 files. This patch (of 7): Remove sysfs `num_migrated' attribute. We are moving away from per-stat device attrs towards 3 stat files that will accumulate io and mm stats in a format similar to block layer statistics in /sys/block//stat. That will be easier to use in user space, and reduce the number of syscalls needed to read zram device statistics. `num_migrated' will return back in zram/mm_stat file. 
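As an illustration of the consumption model this series moves towards, here is a minimal user-space sketch (not part of the patch): one open/read/close per statistics snapshot instead of one syscall triple per attribute. It assumes the seven-field mm_stat layout described later in this series, and the device name zram0 is only an example.

#include <stdio.h>

/* Read a zram mm stats snapshot with a single open/read/close. */
int main(void)
{
	unsigned long long orig, compr, used, limit, max_used, zero, migrated;
	FILE *f = fopen("/sys/block/zram0/mm_stat", "r");

	if (!f)
		return 1;
	if (fscanf(f, "%llu %llu %llu %llu %llu %llu %llu",
		   &orig, &compr, &used, &limit, &max_used,
		   &zero, &migrated) == 7)
		printf("compression ratio: %.2f\n",
		       compr ? (double)orig / compr : 0.0);
	fclose(f);
	return 0;
}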
Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/ABI/testing/sysfs-block-zram | 7 ------- drivers/block/zram/zram_drv.c | 2 -- 2 files changed, 9 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-block-zram b/Documentation/ABI/testing/sysfs-block-zram index bede9028a5a0..91ad7071b9e8 100644 --- a/Documentation/ABI/testing/sysfs-block-zram +++ b/Documentation/ABI/testing/sysfs-block-zram @@ -149,10 +149,3 @@ Description: The compact file is write-only and trigger compaction for allocator zrm uses. The allocator moves some objects so that it could free fragment space. - -What: /sys/block/zram/num_migrated -Date: August 2015 -Contact: Minchan Kim -Description: - The compact file is read-only and shows how many object - migrated by compaction. diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 1626961fdb2f..f416e3ce6392 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1057,7 +1057,6 @@ ZRAM_ATTR_RO(invalid_io); ZRAM_ATTR_RO(notify_free); ZRAM_ATTR_RO(zero_pages); ZRAM_ATTR_RO(compr_data_size); -ZRAM_ATTR_RO(num_migrated); static struct attribute *zram_disk_attrs[] = { &dev_attr_disksize.attr, @@ -1067,7 +1066,6 @@ static struct attribute *zram_disk_attrs[] = { &dev_attr_num_writes.attr, &dev_attr_failed_reads.attr, &dev_attr_failed_writes.attr, - &dev_attr_num_migrated.attr, &dev_attr_compact.attr, &dev_attr_invalid_io.attr, &dev_attr_notify_free.attr, -- cgit v1.2.3 From c72c6160d967ed26a0b136dbab337f821d233509 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 15 Apr 2015 16:15:55 -0700 Subject: zram: move compact_store() to sysfs functions area A cosmetic change. We have a new code layout and keep zram per-device sysfs store and show functions in one place. Move compact_store() to that handlers block to conform to current layout. 
Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index f416e3ce6392..871bd3550cb0 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -63,27 +63,6 @@ static inline struct zram *dev_to_zram(struct device *dev) return (struct zram *)dev_to_disk(dev)->private_data; } -static ssize_t compact_store(struct device *dev, - struct device_attribute *attr, const char *buf, size_t len) -{ - unsigned long nr_migrated; - struct zram *zram = dev_to_zram(dev); - struct zram_meta *meta; - - down_read(&zram->init_lock); - if (!init_done(zram)) { - up_read(&zram->init_lock); - return -EINVAL; - } - - meta = zram->meta; - nr_migrated = zs_compact(meta->mem_pool); - atomic64_add(nr_migrated, &zram->stats.num_migrated); - up_read(&zram->init_lock); - - return len; -} - static ssize_t disksize_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -1038,7 +1017,6 @@ static const struct block_device_operations zram_devops = { .owner = THIS_MODULE }; -static DEVICE_ATTR_WO(compact); static DEVICE_ATTR_RW(disksize); static DEVICE_ATTR_RO(initstate); static DEVICE_ATTR_WO(reset); @@ -1066,7 +1044,6 @@ static struct attribute *zram_disk_attrs[] = { &dev_attr_num_writes.attr, &dev_attr_failed_reads.attr, &dev_attr_failed_writes.attr, - &dev_attr_compact.attr, &dev_attr_invalid_io.attr, &dev_attr_notify_free.attr, &dev_attr_zero_pages.attr, -- cgit v1.2.3 From 8811a9421b325b06a2456ae1b8fe23e838cbfe33 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 15 Apr 2015 16:15:57 -0700 Subject: zram: use generic start/end io accounting Use bio generic_start_io_acct() and generic_end_io_acct() to account device's block layer statistics. This will let users to monitor zram activities using sysstat and similar packages/tools. Apart from the usual per-stat sysfs attr, zram IO stats are now also available in '/sys/block/zram/stat' and '/proc/diskstats' files. We will slowly get rid of per-stat sysfs files. Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 871bd3550cb0..35dafd9a0350 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -670,8 +670,12 @@ out: static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, int offset, int rw) { + unsigned long start_time = jiffies; int ret; + generic_start_io_acct(rw, bvec->bv_len >> SECTOR_SHIFT, + &zram->disk->part0); + if (rw == READ) { atomic64_inc(&zram->stats.num_reads); ret = zram_bvec_read(zram, bvec, index, offset); @@ -680,6 +684,8 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, ret = zram_bvec_write(zram, bvec, index, offset); } + generic_end_io_acct(rw, &zram->disk->part0, start_time); + if (unlikely(ret)) { if (rw == READ) atomic64_inc(&zram->stats.failed_reads); -- cgit v1.2.3 From 77ba015f9d5c584226a634753e9b318cb272cd41 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 15 Apr 2015 16:16:00 -0700 Subject: zram: describe device attrs in documentation Briefly describe exported device stat attrs in zram documentation. 
We will eventually get rid of per-stat sysfs nodes and, thus, clean up Documentation/ABI/testing/sysfs-block-zram file, which is the only source of information about device sysfs nodes. Add a `num_migrated' description: since there is no independent `num_migrated' sysfs node (and no corresponding sysfs-block-zram entry), it will be exported via the zram/mm_stat file. At this point we can provide a minimal description, because sysfs-block-zram still contains detailed information. Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/blockdev/zram.txt | 48 +++++++++++++++++++++++++++++------------ 1 file changed, 34 insertions(+), 14 deletions(-) diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt index 7fcf9c6592ec..971765ed5ac1 100644 --- a/Documentation/blockdev/zram.txt +++ b/Documentation/blockdev/zram.txt @@ -98,20 +98,40 @@ size of the disk when not in use so a huge zram is wasteful. mount /dev/zram1 /tmp 7) Stats: - Per-device statistics are exported as various nodes under - /sys/block/zram/ - disksize - num_reads - num_writes - failed_reads - failed_writes - invalid_io - notify_free - zero_pages - orig_data_size - compr_data_size - mem_used_total - mem_used_max +Per-device statistics are exported as various nodes under /sys/block/zram/ + +A brief description of exported device attributes. For more details please +read Documentation/ABI/testing/sysfs-block-zram. + +Name access description +---- ------ ----------- +disksize RW show and set the device's disk size +initstate RO shows the initialization state of the device +reset WO trigger device reset +num_reads RO the number of reads +failed_reads RO the number of failed reads +num_writes RO the number of writes +failed_writes RO the number of failed writes +invalid_io RO the number of non-page-size-aligned I/O requests +max_comp_streams RW the number of possible concurrent compress operations +comp_algorithm RW show and change the compression algorithm +notify_free RO the number of notifications to free pages (either + slot free notifications or REQ_DISCARD requests) +zero_pages RO the number of zero filled pages written to this disk +orig_data_size RO uncompressed size of data stored in this disk +compr_data_size RO compressed size of data stored in this disk +mem_used_total RO the amount of memory allocated for this disk +mem_used_max RW the maximum amount of memory zram has consumed to + store compressed data +mem_limit RW the maximum amount of memory ZRAM can use to store + the compressed data +num_migrated RO the number of objects migrated by compaction + + +File /sys/block/zram/stat + +Represents block layer statistics. Read Documentation/block/stat.txt for +details. 8) Deactivate: swapoff /dev/zram0 -- cgit v1.2.3 From 2f6a3bed7347ee94fe57b3501fddaa646a26d890 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 15 Apr 2015 16:16:03 -0700 Subject: zram: export new 'io_stat' sysfs attrs Per-device `zram/io_stat' file provides accumulated I/O statistics of particular zram device in a format similar to block layer statistics.
The file consists of a single line and represents the following stats (separated by whitespace): failed_reads failed_writes invalid_io notify_free Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/ABI/testing/sysfs-block-zram | 9 +++++++++ Documentation/blockdev/zram.txt | 11 +++++++++++ drivers/block/zram/zram_drv.c | 20 ++++++++++++++++++++ 3 files changed, 40 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-block-zram b/Documentation/ABI/testing/sysfs-block-zram index 91ad7071b9e8..a7f622f9bcf6 100644 --- a/Documentation/ABI/testing/sysfs-block-zram +++ b/Documentation/ABI/testing/sysfs-block-zram @@ -149,3 +149,12 @@ Description: The compact file is write-only and trigger compaction for allocator zrm uses. The allocator moves some objects so that it could free fragment space. + +What: /sys/block/zram/io_stat +Date: August 2015 +Contact: Sergey Senozhatsky +Description: + The io_stat file is read-only and accumulates device's I/O + statistics not accounted by block layer. For example, + failed_reads, failed_writes, etc. File format is similar to + block layer statistics file format. diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt index 971765ed5ac1..9610be3e9d36 100644 --- a/Documentation/blockdev/zram.txt +++ b/Documentation/blockdev/zram.txt @@ -133,6 +133,17 @@ File /sys/block/zram/stat Represents block layer statistics. Read Documentation/block/stat.txt for details. +File /sys/block/zram/io_stat + +The stat file represents device's I/O statistics not accounted by block +layer and, thus, not available in zram/stat file. It consists of a +single line of text and contains the following stats separated by +whitespace: + failed_reads + failed_writes + invalid_io + notify_free + 8) Deactivate: swapoff /dev/zram0 umount /dev/zram1 diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 35dafd9a0350..1b63a717bdbb 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1033,6 +1033,25 @@ static DEVICE_ATTR_RW(mem_used_max); static DEVICE_ATTR_RW(max_comp_streams); static DEVICE_ATTR_RW(comp_algorithm); +static ssize_t io_stat_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct zram *zram = dev_to_zram(dev); + ssize_t ret; + + down_read(&zram->init_lock); + ret = scnprintf(buf, PAGE_SIZE, + "%8llu %8llu %8llu %8llu\n", + (u64)atomic64_read(&zram->stats.failed_reads), + (u64)atomic64_read(&zram->stats.failed_writes), + (u64)atomic64_read(&zram->stats.invalid_io), + (u64)atomic64_read(&zram->stats.notify_free)); + up_read(&zram->init_lock); + + return ret; +} + +static DEVICE_ATTR_RO(io_stat); ZRAM_ATTR_RO(num_reads); ZRAM_ATTR_RO(num_writes); ZRAM_ATTR_RO(failed_reads); @@ -1060,6 +1079,7 @@ static struct attribute *zram_disk_attrs[] = { &dev_attr_mem_used_max.attr, &dev_attr_max_comp_streams.attr, &dev_attr_comp_algorithm.attr, + &dev_attr_io_stat.attr, NULL, }; -- cgit v1.2.3 From 4f2109f60881585dc04fa0b5657a60556576625c Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 15 Apr 2015 16:16:06 -0700 Subject: zram: export new 'mm_stat' sysfs attrs Per-device `zram/mm_stat' file provides mm statistics of a particular zram device in a format similar to block layer statistics. 
The file consists of a single line and represents the following stats (separated by whitespace): orig_data_size compr_data_size mem_used_total mem_limit mem_used_max zero_pages num_migrated Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/ABI/testing/sysfs-block-zram | 8 ++++++++ Documentation/blockdev/zram.txt | 12 ++++++++++++ drivers/block/zram/zram_drv.c | 31 ++++++++++++++++++++++++++++++ 3 files changed, 51 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-block-zram b/Documentation/ABI/testing/sysfs-block-zram index a7f622f9bcf6..2e69e83bf510 100644 --- a/Documentation/ABI/testing/sysfs-block-zram +++ b/Documentation/ABI/testing/sysfs-block-zram @@ -158,3 +158,11 @@ Description: statistics not accounted by block layer. For example, failed_reads, failed_writes, etc. File format is similar to block layer statistics file format. + +What: /sys/block/zram/mm_stat +Date: August 2015 +Contact: Sergey Senozhatsky +Description: + The mm_stat file is read-only and represents device's mm + statistics (orig_data_size, compr_data_size, etc.) in a format + similar to block layer statistics file format. diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt index 9610be3e9d36..7920f4026d36 100644 --- a/Documentation/blockdev/zram.txt +++ b/Documentation/blockdev/zram.txt @@ -144,6 +144,18 @@ whitespace: invalid_io notify_free +File /sys/block/zram/mm_stat + +The stat file represents device's mm statistics. It consists of a single +line of text and contains the following stats separated by whitespace: + orig_data_size + compr_data_size + mem_used_total + mem_limit + mem_used_max + zero_pages + num_migrated + 8) Deactivate: swapoff /dev/zram0 umount /dev/zram1 diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 1b63a717bdbb..c94a1a98e301 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1051,7 +1051,37 @@ static ssize_t io_stat_show(struct device *dev, return ret; } +static ssize_t mm_stat_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct zram *zram = dev_to_zram(dev); + u64 orig_size, mem_used = 0; + long max_used; + ssize_t ret; + + down_read(&zram->init_lock); + if (init_done(zram)) + mem_used = zs_get_total_pages(zram->meta->mem_pool); + + orig_size = atomic64_read(&zram->stats.pages_stored); + max_used = atomic_long_read(&zram->stats.max_used_pages); + + ret = scnprintf(buf, PAGE_SIZE, + "%8llu %8llu %8llu %8lu %8ld %8llu %8llu\n", + orig_size << PAGE_SHIFT, + (u64)atomic64_read(&zram->stats.compr_data_size), + mem_used << PAGE_SHIFT, + zram->limit_pages << PAGE_SHIFT, + max_used << PAGE_SHIFT, + (u64)atomic64_read(&zram->stats.zero_pages), + (u64)atomic64_read(&zram->stats.num_migrated)); + up_read(&zram->init_lock); + + return ret; +} + static DEVICE_ATTR_RO(io_stat); +static DEVICE_ATTR_RO(mm_stat); ZRAM_ATTR_RO(num_reads); ZRAM_ATTR_RO(num_writes); ZRAM_ATTR_RO(failed_reads); @@ -1080,6 +1110,7 @@ static struct attribute *zram_disk_attrs[] = { &dev_attr_max_comp_streams.attr, &dev_attr_comp_algorithm.attr, &dev_attr_io_stat.attr, + &dev_attr_mm_stat.attr, NULL, }; -- cgit v1.2.3 From 8f7d282c717acaae25245c61b6b60e8995ec4ef4 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 15 Apr 2015 16:16:09 -0700 Subject: zram: deprecate zram attrs sysfs nodes Add Documentation/ABI/obsolete/sysfs-block-zram file and list obsolete and deprecated attributes there. 
The patch also adds additional information to zram documentation and describes the basic strategy: - the existing RW nodes will be downgraded to WO nodes (in 4.11) - deprecated RO sysfs nodes will eventually be removed (in 4.11) Users will be additionally notified about deprecated attr usage by pr_warn_once() (added to every deprecated attr _show()), as suggested by Minchan Kim. User space is advised to use zram/stat, zram/io_stat and zram/mm_stat files. Signed-off-by: Sergey Senozhatsky Reported-by: Minchan Kim Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/ABI/obsolete/sysfs-block-zram | 119 ++++++++++++++++++++++++++++ Documentation/blockdev/zram.txt | 16 ++++ drivers/block/zram/zram_drv.c | 15 ++++ 3 files changed, 150 insertions(+) create mode 100644 Documentation/ABI/obsolete/sysfs-block-zram diff --git a/Documentation/ABI/obsolete/sysfs-block-zram b/Documentation/ABI/obsolete/sysfs-block-zram new file mode 100644 index 000000000000..720ea92cfb2e --- /dev/null +++ b/Documentation/ABI/obsolete/sysfs-block-zram @@ -0,0 +1,119 @@ +What: /sys/block/zram/num_reads +Date: August 2015 +Contact: Sergey Senozhatsky +Description: + The num_reads file is read-only and specifies the number of + reads (failed or successful) done on this device. + Now accessible via zram/stat node. + +What: /sys/block/zram/num_writes +Date: August 2015 +Contact: Sergey Senozhatsky +Description: + The num_writes file is read-only and specifies the number of + writes (failed or successful) done on this device. + Now accessible via zram/stat node. + +What: /sys/block/zram/invalid_io +Date: August 2015 +Contact: Sergey Senozhatsky +Description: + The invalid_io file is read-only and specifies the number of + non-page-size-aligned I/O requests issued to this device. + Now accessible via zram/io_stat node. + +What: /sys/block/zram/failed_reads +Date: August 2015 +Contact: Sergey Senozhatsky +Description: + The failed_reads file is read-only and specifies the number of + failed reads happened on this device. + Now accessible via zram/io_stat node. + +What: /sys/block/zram/failed_writes +Date: August 2015 +Contact: Sergey Senozhatsky +Description: + The failed_writes file is read-only and specifies the number of + failed writes happened on this device. + Now accessible via zram/io_stat node. + +What: /sys/block/zram/notify_free +Date: August 2015 +Contact: Sergey Senozhatsky +Description: + The notify_free file is read-only. Depending on device usage + scenario it may account a) the number of pages freed because + of swap slot free notifications or b) the number of pages freed + because of REQ_DISCARD requests sent by bio. The former ones + are sent to a swap block device when a swap slot is freed, which + implies that this disk is being used as a swap disk. The latter + ones are sent by filesystem mounted with discard option, + whenever some data blocks are getting discarded. + Now accessible via zram/io_stat node. + +What: /sys/block/zram/zero_pages +Date: August 2015 +Contact: Sergey Senozhatsky +Description: + The zero_pages file is read-only and specifies number of zero + filled pages written to this disk. No memory is allocated for + such pages. + Now accessible via zram/mm_stat node. + +What: /sys/block/zram/orig_data_size +Date: August 2015 +Contact: Sergey Senozhatsky +Description: + The orig_data_size file is read-only and specifies uncompressed + size of data stored in this disk. This excludes zero-filled + pages (zero_pages) since no memory is allocated for them. 
+ Unit: bytes + Now accessible via zram/mm_stat node. + +What: /sys/block/zram/compr_data_size +Date: August 2015 +Contact: Sergey Senozhatsky +Description: + The compr_data_size file is read-only and specifies compressed + size of data stored in this disk. So, compression ratio can be + calculated using orig_data_size and this statistic. + Unit: bytes + Now accessible via zram/mm_stat node. + +What: /sys/block/zram/mem_used_total +Date: August 2015 +Contact: Sergey Senozhatsky +Description: + The mem_used_total file is read-only and specifies the amount + of memory, including allocator fragmentation and metadata + overhead, allocated for this disk. So, allocator space + efficiency can be calculated using compr_data_size and this + statistic. + Unit: bytes + Now accessible via zram/mm_stat node. + +What: /sys/block/zram/mem_used_max +Date: August 2015 +Contact: Sergey Senozhatsky +Description: + The mem_used_max file is read/write and specifies the maximum + amount of memory zram has consumed to store compressed data. + For resetting the value, you should write "0". Otherwise, + you could see -EINVAL. + Unit: bytes + Downgraded to write-only node: so it's possible to set new + value only; its current value is stored in zram/mm_stat + node. + +What: /sys/block/zram/mem_limit +Date: August 2015 +Contact: Sergey Senozhatsky +Description: + The mem_limit file is read/write and specifies the maximum + amount of memory ZRAM can use to store the compressed data. + The limit could be changed in run time and "0" means disable + the limit. No limit is the initial state. Unit: bytes + Downgraded to write-only node: so it's possible to set new + value only; its current value is stored in zram/mm_stat + node. diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt index 7920f4026d36..48a183e29988 100644 --- a/Documentation/blockdev/zram.txt +++ b/Documentation/blockdev/zram.txt @@ -128,6 +128,22 @@ mem_limit RW the maximum amount of memory ZRAM can use to store num_migrated RO the number of objects migrated by compaction +WARNING +======= +per-stat sysfs attributes are considered to be deprecated. +The basic strategy is: +-- the existing RW nodes will be downgraded to WO nodes (in linux 4.11) +-- deprecated RO sysfs nodes will eventually be removed (in linux 4.11) + +The list of deprecated attributes can be found here: +Documentation/ABI/obsolete/sysfs-block-zram + +Basically, every attribute that has its own read accessible sysfs node +(e.g. num_reads) *AND* is accessible via one of the stat files (zram/stat +or zram/io_stat or zram/mm_stat) is considered to be deprecated. + +User space is advised to use the following files to read the device statistics. + File /sys/block/zram/stat Represents block layer statistics. Read Documentation/block/stat.txt for diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index c94a1a98e301..4491787095a0 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -43,11 +43,22 @@ static const char *default_compressor = "lzo"; /* Module params (documentation at end) */ static unsigned int num_devices = 1; +static inline void deprecated_attr_warn(const char *name) +{ + pr_warn_once("%d (%s) Attribute %s (and others) will be removed.
%s\n", + task_pid_nr(current), + current->comm, + name, + "See zram documentation."); +} + #define ZRAM_ATTR_RO(name) \ static ssize_t name##_show(struct device *d, \ struct device_attribute *attr, char *b) \ { \ struct zram *zram = dev_to_zram(d); \ + \ + deprecated_attr_warn(__stringify(name)); \ return scnprintf(b, PAGE_SIZE, "%llu\n", \ (u64)atomic64_read(&zram->stats.name)); \ } \ @@ -89,6 +100,7 @@ static ssize_t orig_data_size_show(struct device *dev, { struct zram *zram = dev_to_zram(dev); + deprecated_attr_warn("orig_data_size"); return scnprintf(buf, PAGE_SIZE, "%llu\n", (u64)(atomic64_read(&zram->stats.pages_stored)) << PAGE_SHIFT); } @@ -99,6 +111,7 @@ static ssize_t mem_used_total_show(struct device *dev, u64 val = 0; struct zram *zram = dev_to_zram(dev); + deprecated_attr_warn("mem_used_total"); down_read(&zram->init_lock); if (init_done(zram)) { struct zram_meta *meta = zram->meta; @@ -128,6 +141,7 @@ static ssize_t mem_limit_show(struct device *dev, u64 val; struct zram *zram = dev_to_zram(dev); + deprecated_attr_warn("mem_limit"); down_read(&zram->init_lock); val = zram->limit_pages; up_read(&zram->init_lock); @@ -159,6 +173,7 @@ static ssize_t mem_used_max_show(struct device *dev, u64 val = 0; struct zram *zram = dev_to_zram(dev); + deprecated_attr_warn("mem_used_max"); down_read(&zram->init_lock); if (init_done(zram)) val = atomic_long_read(&zram->stats.max_used_pages); -- cgit v1.2.3 From 1ec7cfb13acb8047ae5baafb43d2cd6b64ac85b9 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 15 Apr 2015 16:16:12 -0700 Subject: zsmalloc: remove synchronize_rcu from zs_compact() Do not synchronize rcu in zs_compact(). Neither zsmalloc nor zram uses RCU. Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zsmalloc.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 7a48e5568d46..8705a010e2d3 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1778,8 +1778,6 @@ unsigned long zs_compact(struct zs_pool *pool) nr_migrated += __zs_compact(pool, class); } - synchronize_rcu(); - return nr_migrated; } EXPORT_SYMBOL_GPL(zs_compact); -- cgit v1.2.3 From 495819ead5ad02174208994ca610852a7791a2f2 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 15 Apr 2015 16:16:15 -0700 Subject: zsmalloc: micro-optimize zs_object_copy() A micro-optimization. Avoid additional branching and reduce (a bit) register pressure (e.g. s_off += size; d_off += size; may be calculated twice: first for the >= PAGE_SIZE check and later for the offset update in the "else" clause).
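The transformation is easier to see in isolation. Below is a minimal sketch of the pattern (a hypothetical page-offset helper, not the actual zs_object_copy() code): advancing the offset unconditionally and testing the already-updated value computes the sum once and empties the else branch.

#include <stddef.h>

#define SKETCH_PAGE_SIZE 4096

/* Before: off + size is evaluated for the test and again for the update. */
static size_t advance_before(size_t off, size_t size)
{
	if (off + size >= SKETCH_PAGE_SIZE)
		off = 0;		/* crossed a page boundary: restart */
	else
		off += size;		/* same page: just advance */
	return off;
}

/* After: advance unconditionally, then test the updated offset. */
static size_t advance_after(size_t off, size_t size)
{
	off += size;
	if (off >= SKETCH_PAGE_SIZE)
		off = 0;
	return off;
}

/* The two variants agree for any offset within a page. */
int main(void)
{
	size_t off;

	for (off = 0; off < SKETCH_PAGE_SIZE; off++)
		if (advance_before(off, 640) != advance_after(off, 640))
			return 1;
	return 0;
}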
scripts/bloat-o-meter shows some improvement add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-10 (-10) function old new delta zs_object_copy 550 540 -10 Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zsmalloc.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 8705a010e2d3..a9a9ff233a13 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1537,7 +1537,12 @@ static void zs_object_copy(unsigned long src, unsigned long dst, if (written == class->size) break; - if (s_off + size >= PAGE_SIZE) { + s_off += size; + s_size -= size; + d_off += size; + d_size -= size; + + if (s_off >= PAGE_SIZE) { kunmap_atomic(d_addr); kunmap_atomic(s_addr); s_page = get_next_page(s_page); @@ -1546,21 +1551,15 @@ static void zs_object_copy(unsigned long src, unsigned long dst, d_addr = kmap_atomic(d_page); s_size = class->size - written; s_off = 0; - } else { - s_off += size; - s_size -= size; } - if (d_off + size >= PAGE_SIZE) { + if (d_off >= PAGE_SIZE) { kunmap_atomic(d_addr); d_page = get_next_page(d_page); BUG_ON(!d_page); d_addr = kmap_atomic(d_page); d_size = class->size - written; d_off = 0; - } else { - d_off += size; - d_size -= size; } } -- cgit v1.2.3 From 839373e645d12613308d9148041c4bd967bce8d5 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 15 Apr 2015 16:16:18 -0700 Subject: zsmalloc: remove unnecessary insertion/removal of zspage in compaction In putback_zspage(), we don't need to remove and re-insert a zspage in the size_class's zspage list just to fix its fullness group. We can set the fullness group directly, without reinsertion, and save some instructions. Reported-by: Heesub Shin Signed-off-by: Minchan Kim Cc: Nitin Gupta Cc: Sergey Senozhatsky Cc: Dan Streetman Cc: Seth Jennings Cc: Ganesh Mahendran Cc: Luigi Semenzato Cc: Gunho Lee Cc: Juneho Choi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zsmalloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index a9a9ff233a13..ded3672295d7 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1678,14 +1678,14 @@ static struct page *alloc_target_page(struct size_class *class) static void putback_zspage(struct zs_pool *pool, struct size_class *class, struct page *first_page) { - int class_idx; enum fullness_group fullness; BUG_ON(!is_first_page(first_page)); - get_zspage_mapping(first_page, &class_idx, &fullness); + fullness = get_fullness_group(first_page); insert_zspage(first_page, class, fullness); - fullness = fix_fullness_group(class, first_page); + set_zspage_mapping(first_page, class->index, fullness); + if (fullness == ZS_EMPTY) { zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage( class->size, class->pages_per_zspage)); -- cgit v1.2.3 From 81da9b13f73653bf5f38c63af8029fc459198ac0 Mon Sep 17 00:00:00 2001 From: Heesub Shin Date: Wed, 15 Apr 2015 16:16:21 -0700 Subject: zsmalloc: fix fatal corruption due to wrong size class selection There is no point in overriding the size class below. It causes fatal corruption on the next chunk on the 3264-bytes size class, which is the last size class that is not huge. For example, if the requested size was exactly 3264 bytes, current zsmalloc allocates and returns a chunk from the size class of 3264 bytes, not 4096. User access to this chunk may overwrite the head of the next adjacent chunk.
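The arithmetic can be sketched in isolation (hypothetical constants assuming 4 KB pages, an 8-byte handle and 16-byte class spacing from a 32-byte minimum; the real values depend on configuration, and the real allocator also merges classes):

#include <stdio.h>

#define SK_PAGE_SIZE	4096
#define SK_HANDLE_SIZE	8
#define SK_MIN_ALLOC	32
#define SK_CLASS_DELTA	16

/* Round a size up to its class, like get_size_class_index() does. */
static int class_size(int size)
{
	int idx = (size - SK_MIN_ALLOC + SK_CLASS_DELTA - 1) / SK_CLASS_DELTA;
	return SK_MIN_ALLOC + idx * SK_CLASS_DELTA;
}

int main(void)
{
	int req = 3264;			/* user request */

	/* Correct: 3264 + 8 = 3272 rounds up to the 3280 (huge) class. */
	printf("correct class:    %d\n", class_size(req + SK_HANDLE_SIZE));

	/* Buggy override: re-selecting for 3264 lands in the 3264 class,
	 * which is not huge, so the inline handle makes the object spill
	 * 8 bytes into the next chunk. */
	printf("overridden class: %d\n", class_size(req));
	return 0;
}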
Here is the panic log captured when freelist was corrupted due to this: Kernel BUG at ffffffc00030659c [verbose debug info unavailable] Internal error: Oops - BUG: 96000006 [#1] PREEMPT SMP Modules linked in: exynos-snapshot: core register saved(CPU:5) CPUMERRSR: 0000000000000000, L2MERRSR: 0000000000000000 exynos-snapshot: context saved(CPU:5) exynos-snapshot: item - log_kevents is disabled CPU: 5 PID: 898 Comm: kswapd0 Not tainted 3.10.61-4497415-eng #1 task: ffffffc0b8783d80 ti: ffffffc0b71e8000 task.ti: ffffffc0b71e8000 PC is at obj_idx_to_offset+0x0/0x1c LR is at obj_malloc+0x44/0xe8 pc : [] lr : [] pstate: a0000045 sp : ffffffc0b71eb790 x29: ffffffc0b71eb790 x28: ffffffc00204c000 x27: 000000000001d96f x26: 0000000000000000 x25: ffffffc098cc3500 x24: ffffffc0a13f2810 x23: ffffffc098cc3501 x22: ffffffc0a13f2800 x21: 000011e1a02006e3 x20: ffffffc0a13f2800 x19: ffffffbc02a7e000 x18: 0000000000000000 x17: 0000000000000000 x16: 0000000000000feb x15: 0000000000000000 x14: 00000000a01003e3 x13: 0000000000000020 x12: fffffffffffffff0 x11: ffffffc08b264000 x10: 00000000e3a01004 x9 : ffffffc08b263fea x8 : ffffffc0b1e611c0 x7 : ffffffc000307d24 x6 : 0000000000000000 x5 : 0000000000000038 x4 : 000000000000011e x3 : ffffffbc00003e90 x2 : 0000000000000cc0 x1 : 00000000d0100371 x0 : ffffffbc00003e90 Reported-by: Sooyong Suk Signed-off-by: Heesub Shin Tested-by: Sooyong Suk Acked-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zsmalloc.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index ded3672295d7..e24f7ccc5865 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1398,11 +1398,6 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) /* extra space in chunk to keep the handle */ size += ZS_HANDLE_SIZE; class = pool->size_class[get_size_class_index(size)]; - /* In huge class size, we store the handle into first_page->private */ - if (class->huge) { - size -= ZS_HANDLE_SIZE; - class = pool->size_class[get_size_class_index(size)]; - } spin_lock(&class->lock); first_page = find_get_zspage(class); -- cgit v1.2.3 From 160a117f0864871ae1bab26554a985a1d2861afd Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 15 Apr 2015 16:16:24 -0700 Subject: zsmalloc: remove extra cond_resched() in __zs_compact Do not perform cond_resched() before the busy compaction loop in __zs_compact(), because this loop does it when needed. Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Nitin Gupta Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zsmalloc.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index e24f7ccc5865..08bd7a3d464a 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1711,8 +1711,6 @@ static unsigned long __zs_compact(struct zs_pool *pool, struct page *dst_page = NULL; unsigned long nr_total_migrated = 0; - cond_resched(); - spin_lock(&class->lock); while ((src_page = isolate_source_page(class))) { -- cgit v1.2.3 From 201c7b72f0bf38d7f31fd229a01de035d0f10cd1 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Wed, 15 Apr 2015 16:16:27 -0700 Subject: zram: fix error return code Return a negative error code on failure. A simplified version of the semantic match that finds this problem is as follows: (http://coccinelle.lip6.fr/) // @@ identifier ret; expression e1,e2; @@ ( if (\(ret < 0\|ret != 0\)) { ... return ret; } | ret = 0 ) ... when != ret = e1 when != &ret *if(...) { ... 
when != ret = e2 when forall return ret; } // Signed-off-by: Julia Lawall Cc: Minchan Kim Cc: Nitin Gupta Acked-by: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 4491787095a0..c94386aa563d 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1154,6 +1154,7 @@ static int create_device(struct zram *zram, int device_id) if (!zram->disk) { pr_warn("Error allocating disk structure for device %d\n", device_id); + ret = -ENOMEM; goto out_free_queue; } -- cgit v1.2.3 From e4bc33245124db69b74a6d853ac76c2976f472d5 Mon Sep 17 00:00:00 2001 From: Chen Hanxiao Date: Wed, 15 Apr 2015 16:16:30 -0700 Subject: /proc/PID/status: show all sets of pid according to ns If an issue occurs inside a container guest, the host user cannot tell which process is in trouble just from the guest pid: users of the container guest only know the pid inside the container. This is an obstacle for troubleshooting. This patch adds four fields: NStgid, NSpid, NSpgid and NSsid: a) In init_pid_ns, nothing changed; b) In one pidns, it will tell the pid inside containers: NStgid: 21776 5 1 NSpid: 21776 5 1 NSpgid: 21776 5 1 NSsid: 21729 1 0 ** Process id is 21776 in level 0, 5 in level 1, 1 in level 2. c) If the pidns is nested, the output depends on which pidns you are in. NStgid: 5 1 NSpid: 5 1 NSpgid: 5 1 NSsid: 1 0 ** Views from level 1 [akpm@linux-foundation.org: add CONFIG_PID_NS ifdef] Signed-off-by: Chen Hanxiao Acked-by: Serge Hallyn Acked-by: "Eric W. Biederman" Tested-by: Serge Hallyn Tested-by: Nathan Scott Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/array.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/fs/proc/array.c b/fs/proc/array.c index 1295a00ca316..a4490c0a4644 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -188,6 +188,24 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, from_kgid_munged(user_ns, GROUP_AT(group_info, g))); put_cred(cred); +#ifdef CONFIG_PID_NS + seq_puts(m, "\nNStgid:"); + for (g = ns->level; g <= pid->level; g++) + seq_printf(m, "\t%d", + task_tgid_nr_ns(p, pid->numbers[g].ns)); + seq_puts(m, "\nNSpid:"); + for (g = ns->level; g <= pid->level; g++) + seq_printf(m, "\t%d", + task_pid_nr_ns(p, pid->numbers[g].ns)); + seq_puts(m, "\nNSpgid:"); + for (g = ns->level; g <= pid->level; g++) + seq_printf(m, "\t%d", + task_pgrp_nr_ns(p, pid->numbers[g].ns)); + seq_puts(m, "\nNSsid:"); + for (g = ns->level; g <= pid->level; g++) + seq_printf(m, "\t%d", + task_session_nr_ns(p, pid->numbers[g].ns)); +#endif seq_putc(m, '\n'); } -- cgit v1.2.3 From 23f40a94d860449f39f00c3350bf850d15983e63 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 15 Apr 2015 16:16:33 -0700 Subject: include/linux: remove empty conditionals Commit 607ca46e97a1 ("UAPI: (Scripted) Disintegrate include/linux") left behind some empty conditional blocks. Since they are useless and may cause a reader to wonder whether something is missing, remove them.
Signed-off-by: Rasmus Villemoes Cc: Geert Uytterhoeven Cc: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/a.out.h | 67 --------------------------------------------------- include/linux/types.h | 6 ----- 2 files changed, 73 deletions(-) diff --git a/include/linux/a.out.h b/include/linux/a.out.h index 220f14338895..ee884168989f 100644 --- a/include/linux/a.out.h +++ b/include/linux/a.out.h @@ -4,44 +4,6 @@ #include #ifndef __ASSEMBLY__ -#if defined (M_OLDSUN2) -#else -#endif -#if defined (M_68010) -#else -#endif -#if defined (M_68020) -#else -#endif -#if defined (M_SPARC) -#else -#endif -#if !defined (N_MAGIC) -#endif -#if !defined (N_BADMAG) -#endif -#if !defined (N_TXTOFF) -#endif -#if !defined (N_DATOFF) -#endif -#if !defined (N_TRELOFF) -#endif -#if !defined (N_DRELOFF) -#endif -#if !defined (N_SYMOFF) -#endif -#if !defined (N_STROFF) -#endif -#if !defined (N_TXTADDR) -#endif -#if defined(vax) || defined(hp300) || defined(pyr) -#endif -#ifdef sony -#endif /* Sony. */ -#ifdef is68k -#endif -#if defined(m68k) && defined(PORTAR) -#endif #ifdef linux #include #if defined(__i386__) || defined(__mc68000__) @@ -51,34 +13,5 @@ #endif #endif #endif -#ifndef N_DATADDR -#endif -#if !defined (N_BSSADDR) -#endif -#if !defined (N_NLIST_DECLARED) -#endif /* no N_NLIST_DECLARED. */ -#if !defined (N_UNDF) -#endif -#if !defined (N_ABS) -#endif -#if !defined (N_TEXT) -#endif -#if !defined (N_DATA) -#endif -#if !defined (N_BSS) -#endif -#if !defined (N_FN) -#endif -#if !defined (N_EXT) -#endif -#if !defined (N_TYPE) -#endif -#if !defined (N_STAB) -#endif -#if !defined (N_RELOCATION_INFO_DECLARED) -#ifdef NS32K -#else -#endif -#endif /* no N_RELOCATION_INFO_DECLARED. */ #endif /*__ASSEMBLY__ */ #endif /* __A_OUT_GNU_H__ */ diff --git a/include/linux/types.h b/include/linux/types.h index 6747247e3f9f..59698be03490 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -146,12 +146,6 @@ typedef u64 dma_addr_t; typedef u32 dma_addr_t; #endif /* dma_addr_t */ -#ifdef __CHECKER__ -#else -#endif -#ifdef __CHECK_ENDIAN__ -#else -#endif typedef unsigned __bitwise__ gfp_t; typedef unsigned __bitwise__ fmode_t; typedef unsigned __bitwise__ oom_flags_t; -- cgit v1.2.3 From 946e87981942552e526aca9cb6204f02a6c847cb Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 15 Apr 2015 16:16:36 -0700 Subject: paride: fix the "verbose" module param The verbose module parameter can be set to 2 for extremely verbose messages so the type should be int instead of bool. Signed-off-by: Dan Carpenter Cc: Tim Waugh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/paride/pg.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/paride/pg.c b/drivers/block/paride/pg.c index 2ce3dfd7e6b9..876d0c3eaf58 100644 --- a/drivers/block/paride/pg.c +++ b/drivers/block/paride/pg.c @@ -137,7 +137,7 @@ */ -static bool verbose = 0; +static int verbose; static int major = PG_MAJOR; static char *name = PG_NAME; static int disable = 0; @@ -168,7 +168,7 @@ enum {D_PRT, D_PRO, D_UNI, D_MOD, D_SLV, D_DLY}; #include -module_param(verbose, bool, 0644); +module_param(verbose, int, 0644); module_param(major, int, 0); module_param(name, charp, 0); module_param_array(drive0, int, NULL, 0); -- cgit v1.2.3 From c79574abe2baddf569532e7e430e4977771dd25c Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 15 Apr 2015 16:16:39 -0700 Subject: lib/test-hexdump.c: fix initconst confusion const char *...[] is not const, but an array of pointer to const. 
So these arrays cannot be __initconst, but must be __initdata. This fixes section conflicts with LTO. Signed-off-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test-hexdump.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/test-hexdump.c b/lib/test-hexdump.c index daf29a390a89..9846ff7428b3 100644 --- a/lib/test-hexdump.c +++ b/lib/test-hexdump.c @@ -18,26 +18,26 @@ static const unsigned char data_b[] = { static const unsigned char data_a[] = ".2.{....p..$}.4...1.....L...C..."; -static const char *test_data_1_le[] __initconst = { +static const char * const test_data_1_le[] __initconst = { "be", "32", "db", "7b", "0a", "18", "93", "b2", "70", "ba", "c4", "24", "7d", "83", "34", "9b", "a6", "9c", "31", "ad", "9c", "0f", "ac", "e9", "4c", "d1", "19", "99", "43", "b1", "af", "0c", }; -static const char *test_data_2_le[] __initconst = { +static const char *test_data_2_le[] __initdata = { "32be", "7bdb", "180a", "b293", "ba70", "24c4", "837d", "9b34", "9ca6", "ad31", "0f9c", "e9ac", "d14c", "9919", "b143", "0caf", }; -static const char *test_data_4_le[] __initconst = { +static const char *test_data_4_le[] __initdata = { "7bdb32be", "b293180a", "24c4ba70", "9b34837d", "ad319ca6", "e9ac0f9c", "9919d14c", "0cafb143", }; -static const char *test_data_8_le[] __initconst = { +static const char *test_data_8_le[] __initdata = { "b293180a7bdb32be", "9b34837d24c4ba70", "e9ac0f9cad319ca6", "0cafb1439919d14c", }; -- cgit v1.2.3 From 2813893f8b197a14f1e1ddb04d99bce46817c84a Mon Sep 17 00:00:00 2001 From: Iulia Manda Date: Wed, 15 Apr 2015 16:16:41 -0700 Subject: kernel: conditionally support non-root users, groups and capabilities There are a lot of embedded systems that run most or all of their functionality in init, running as root:root. For these systems, supporting multiple users is not necessary. This patch adds a new symbol, CONFIG_MULTIUSER, that makes support for non-root users, non-root groups, and capabilities optional. It is enabled under the CONFIG_EXPERT menu. When this symbol is not defined, UID and GID are zero in any possible case and processes always have all capabilities. The following syscalls are compiled out: setuid, setregid, setgid, setreuid, setresuid, getresuid, setresgid, getresgid, setgroups, getgroups, setfsuid, setfsgid, capget, capset. Also, groups.c is compiled out completely. In kernel/capability.c, the capable() function was moved in order to avoid adding two ifdef blocks. This change saves about 25 KB on a defconfig build. The most minimal kernels have total text sizes in the high hundreds of kB rather than low MB. (The 25k goes down a bit with allnoconfig, but not that much.) The kernel was booted in Qemu. All the common functionality works. Adding users/groups is not possible, failing with -ENOSYS. Bloat-o-meter output: add/remove: 7/87 grow/shrink: 19/397 up/down: 1675/-26325 (-24650) [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Iulia Manda Reviewed-by: Josh Triplett Acked-by: Geert Uytterhoeven Tested-by: Paul E. McKenney Reviewed-by: Paul E.
McKenney Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/Kconfig | 1 + drivers/staging/lustre/lustre/Kconfig | 1 + fs/nfs/Kconfig | 2 +- fs/nfsd/Kconfig | 1 + include/linux/capability.h | 29 +++++++++++++++++++++++++++++ include/linux/cred.h | 23 +++++++++++++++++++---- include/linux/uidgid.h | 12 ++++++++++++ init/Kconfig | 19 ++++++++++++++++++- kernel/Makefile | 4 +++- kernel/capability.c | 35 +++++++++++++++++++---------------- kernel/cred.c | 3 +++ kernel/groups.c | 3 --- kernel/sys.c | 2 ++ kernel/sys_ni.c | 14 ++++++++++++++ net/sunrpc/Kconfig | 2 ++ security/Kconfig | 1 + 16 files changed, 126 insertions(+), 26 deletions(-) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index a5ced5c3c1e0..de2726a487b0 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -328,6 +328,7 @@ config COMPAT select COMPAT_BINFMT_ELF if BINFMT_ELF select ARCH_WANT_OLD_COMPAT_IPC select COMPAT_OLD_SIGACTION + depends on MULTIUSER help Select this option if you want to enable your system kernel to handle system-calls from ELF binaries for 31 bit ESA. This option diff --git a/drivers/staging/lustre/lustre/Kconfig b/drivers/staging/lustre/lustre/Kconfig index 6725467ef4d0..62c7bba75274 100644 --- a/drivers/staging/lustre/lustre/Kconfig +++ b/drivers/staging/lustre/lustre/Kconfig @@ -10,6 +10,7 @@ config LUSTRE_FS select CRYPTO_SHA1 select CRYPTO_SHA256 select CRYPTO_SHA512 + depends on MULTIUSER help This option enables Lustre file system client support. Choose Y here if you want to access a Lustre file system cluster. To compile diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index c7abc10279af..f31fd0dd92c6 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -1,6 +1,6 @@ config NFS_FS tristate "NFS client support" - depends on INET && FILE_LOCKING + depends on INET && FILE_LOCKING && MULTIUSER select LOCKD select SUNRPC select NFS_ACL_SUPPORT if NFS_V3_ACL diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index 683bf718aead..fc2d108f5272 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -6,6 +6,7 @@ config NFSD select SUNRPC select EXPORTFS select NFS_ACL_SUPPORT if NFSD_V2_ACL + depends on MULTIUSER help Choose Y here if you want to allow other computers to access files residing on this system using Sun's Network File System diff --git a/include/linux/capability.h b/include/linux/capability.h index aa93e5ef594c..af9f0b9e80e6 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -205,6 +205,7 @@ static inline kernel_cap_t cap_raise_nfsd_set(const kernel_cap_t a, cap_intersect(permitted, __cap_nfsd_set)); } +#ifdef CONFIG_MULTIUSER extern bool has_capability(struct task_struct *t, int cap); extern bool has_ns_capability(struct task_struct *t, struct user_namespace *ns, int cap); @@ -213,6 +214,34 @@ extern bool has_ns_capability_noaudit(struct task_struct *t, struct user_namespace *ns, int cap); extern bool capable(int cap); extern bool ns_capable(struct user_namespace *ns, int cap); +#else +static inline bool has_capability(struct task_struct *t, int cap) +{ + return true; +} +static inline bool has_ns_capability(struct task_struct *t, + struct user_namespace *ns, int cap) +{ + return true; +} +static inline bool has_capability_noaudit(struct task_struct *t, int cap) +{ + return true; +} +static inline bool has_ns_capability_noaudit(struct task_struct *t, + struct user_namespace *ns, int cap) +{ + return true; +} +static inline bool capable(int cap) +{ + return true; +} +static inline bool ns_capable(struct user_namespace *ns, int cap) +{ + return true; 
+} +#endif /* CONFIG_MULTIUSER */ extern bool capable_wrt_inode_uidgid(const struct inode *inode, int cap); extern bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap); diff --git a/include/linux/cred.h b/include/linux/cred.h index 2fb2ca2127ed..8b6c083e68a7 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -62,9 +62,27 @@ do { \ groups_free(group_info); \ } while (0) -extern struct group_info *groups_alloc(int); extern struct group_info init_groups; +#ifdef CONFIG_MULTIUSER +extern struct group_info *groups_alloc(int); extern void groups_free(struct group_info *); + +extern int in_group_p(kgid_t); +extern int in_egroup_p(kgid_t); +#else +static inline void groups_free(struct group_info *group_info) +{ +} + +static inline int in_group_p(kgid_t grp) +{ + return 1; +} +static inline int in_egroup_p(kgid_t grp) +{ + return 1; +} +#endif extern int set_current_groups(struct group_info *); extern void set_groups(struct cred *, struct group_info *); extern int groups_search(const struct group_info *, kgid_t); @@ -74,9 +92,6 @@ extern bool may_setgroups(void); #define GROUP_AT(gi, i) \ ((gi)->blocks[(i) / NGROUPS_PER_BLOCK][(i) % NGROUPS_PER_BLOCK]) -extern int in_group_p(kgid_t); -extern int in_egroup_p(kgid_t); - /* * The security context of a task * diff --git a/include/linux/uidgid.h b/include/linux/uidgid.h index 2d1f9b627f91..0ee05da38899 100644 --- a/include/linux/uidgid.h +++ b/include/linux/uidgid.h @@ -29,6 +29,7 @@ typedef struct { #define KUIDT_INIT(value) (kuid_t){ value } #define KGIDT_INIT(value) (kgid_t){ value } +#ifdef CONFIG_MULTIUSER static inline uid_t __kuid_val(kuid_t uid) { return uid.val; @@ -38,6 +39,17 @@ static inline gid_t __kgid_val(kgid_t gid) { return gid.val; } +#else +static inline uid_t __kuid_val(kuid_t uid) +{ + return 0; +} + +static inline gid_t __kgid_val(kgid_t gid) +{ + return 0; +} +#endif #define GLOBAL_ROOT_UID KUIDT_INIT(0) #define GLOBAL_ROOT_GID KGIDT_INIT(0) diff --git a/init/Kconfig b/init/Kconfig index a905b7301e10..3b9df1aa35db 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -394,6 +394,7 @@ endchoice config BSD_PROCESS_ACCT bool "BSD Process Accounting" + depends on MULTIUSER help If you say Y here, a user level program will be able to instruct the kernel (via a special system call) to write process accounting @@ -420,6 +421,7 @@ config BSD_PROCESS_ACCT_V3 config TASKSTATS bool "Export task/process statistics through netlink" depends on NET + depends on MULTIUSER default n help Export selected statistics for tasks/processes through the @@ -1160,6 +1162,7 @@ config CHECKPOINT_RESTORE menuconfig NAMESPACES bool "Namespaces support" if EXPERT + depends on MULTIUSER default !EXPERT help Provides the way to make tasks work with different objects using @@ -1356,11 +1359,25 @@ menuconfig EXPERT config UID16 bool "Enable 16-bit UID system calls" if EXPERT - depends on HAVE_UID16 + depends on HAVE_UID16 && MULTIUSER default y help This enables the legacy 16-bit UID syscall wrappers. +config MULTIUSER + bool "Multiple users, groups and capabilities support" if EXPERT + default y + help + This option enables support for non-root users, groups and + capabilities. + + If you say N here, all processes will run with UID 0, GID 0, and all + possible capabilities. Saying N here also compiles out support for + system calls related to UIDs, GIDs, and capabilities, such as setuid, + setgid, and capset. + + If unsure, say Y here. 
+ config SGETMASK_SYSCALL bool "sgetmask/ssetmask syscalls support" if EXPERT def_bool PARISC || MN10300 || BLACKFIN || M68K || PPC || MIPS || X86 || SPARC || CRIS || MICROBLAZE || SUPERH diff --git a/kernel/Makefile b/kernel/Makefile index 1408b3353a3c..0f8f8b0bc1bf 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -9,7 +9,9 @@ obj-y = fork.o exec_domain.o panic.o \ extable.o params.o \ kthread.o sys_ni.o nsproxy.o \ notifier.o ksysfs.o cred.o reboot.o \ - async.o range.o groups.o smpboot.o + async.o range.o smpboot.o + +obj-$(CONFIG_MULTIUSER) += groups.o ifdef CONFIG_FUNCTION_TRACER # Do not trace debug files and internal ftrace files diff --git a/kernel/capability.c b/kernel/capability.c index 989f5bfc57dc..45432b54d5c6 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -35,6 +35,7 @@ static int __init file_caps_disable(char *str) } __setup("no_file_caps", file_caps_disable); +#ifdef CONFIG_MULTIUSER /* * More recent versions of libcap are available from: * @@ -386,6 +387,24 @@ bool ns_capable(struct user_namespace *ns, int cap) } EXPORT_SYMBOL(ns_capable); + +/** + * capable - Determine if the current task has a superior capability in effect + * @cap: The capability to be tested for + * + * Return true if the current task has the given superior capability currently + * available for use, false if not. + * + * This sets PF_SUPERPRIV on the task if the capability is available on the + * assumption that it's about to be used. + */ +bool capable(int cap) +{ + return ns_capable(&init_user_ns, cap); +} +EXPORT_SYMBOL(capable); +#endif /* CONFIG_MULTIUSER */ + /** * file_ns_capable - Determine if the file's opener had a capability in effect * @file: The file we want to check @@ -411,22 +430,6 @@ bool file_ns_capable(const struct file *file, struct user_namespace *ns, } EXPORT_SYMBOL(file_ns_capable); -/** - * capable - Determine if the current task has a superior capability in effect - * @cap: The capability to be tested for - * - * Return true if the current task has the given superior capability currently - * available for use, false if not. - * - * This sets PF_SUPERPRIV on the task if the capability is available on the - * assumption that it's about to be used. - */ -bool capable(int cap) -{ - return ns_capable(&init_user_ns, cap); -} -EXPORT_SYMBOL(capable); - /** * capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped * @inode: The inode in question diff --git a/kernel/cred.c b/kernel/cred.c index e0573a43c7df..ec1c07667ec1 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -29,6 +29,9 @@ static struct kmem_cache *cred_jar; +/* init to 2 - one for init_task, one to ensure it is never freed */ +struct group_info init_groups = { .usage = ATOMIC_INIT(2) }; + /* * The initial credentials for the initial task */ diff --git a/kernel/groups.c b/kernel/groups.c index 664411f171b5..74d431d25251 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -9,9 +9,6 @@ #include #include -/* init to 2 - one for init_task, one to ensure it is never freed */ -struct group_info init_groups = { .usage = ATOMIC_INIT(2) }; - struct group_info *groups_alloc(int gidsetsize) { struct group_info *group_info; diff --git a/kernel/sys.c b/kernel/sys.c index a03d9cd23ed7..3be344902316 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -325,6 +325,7 @@ out_unlock: * SMP: There are not races, the GIDs are checked only by filesystem * operations (as far as semantic preservation is concerned). 
*/ +#ifdef CONFIG_MULTIUSER SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) { struct user_namespace *ns = current_user_ns(); @@ -815,6 +816,7 @@ change_okay: commit_creds(new); return old_fsgid; } +#endif /* CONFIG_MULTIUSER */ /** * sys_getpid - return the thread group id of the current process diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 5adcb0ae3a58..7995ef5868d8 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -159,6 +159,20 @@ cond_syscall(sys_uselib); cond_syscall(sys_fadvise64); cond_syscall(sys_fadvise64_64); cond_syscall(sys_madvise); +cond_syscall(sys_setuid); +cond_syscall(sys_setregid); +cond_syscall(sys_setgid); +cond_syscall(sys_setreuid); +cond_syscall(sys_setresuid); +cond_syscall(sys_getresuid); +cond_syscall(sys_setresgid); +cond_syscall(sys_getresgid); +cond_syscall(sys_setgroups); +cond_syscall(sys_getgroups); +cond_syscall(sys_setfsuid); +cond_syscall(sys_setfsgid); +cond_syscall(sys_capget); +cond_syscall(sys_capset); /* arch-specific weak syscall entries */ cond_syscall(sys_pciconfig_read); diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig index fb78117b896c..9068e72aa73c 100644 --- a/net/sunrpc/Kconfig +++ b/net/sunrpc/Kconfig @@ -1,9 +1,11 @@ config SUNRPC tristate + depends on MULTIUSER config SUNRPC_GSS tristate select OID_REGISTRY + depends on MULTIUSER config SUNRPC_BACKCHANNEL bool diff --git a/security/Kconfig b/security/Kconfig index beb86b500adf..bf4ec46474b6 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -21,6 +21,7 @@ config SECURITY_DMESG_RESTRICT config SECURITY bool "Enable different security models" depends on SYSFS + depends on MULTIUSER help This allows you to choose different security modules to be configured into your kernel. -- cgit v1.2.3 From 96831c0a6738f88f89e7012f4df0a747514af0a0 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Wed, 15 Apr 2015 16:16:44 -0700 Subject: kernel/resource.c: remove deprecated __check_region() and friends All users of __check_region(), check_region(), and check_mem_region() are gone. We got rid of the last user in v4.0-rc1. Remove them. 
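[Editorial note, not part of this patch: the race-free replacement for the removed helpers is to request the region outright and handle failure, roughly:

	if (!request_region(base, len, "mydrv"))	/* base, len and "mydrv" are made-up placeholders */
		return -EBUSY;
	/* ... exclusive use of the I/O ports ... */
	release_region(base, len);

This closes the check-then-request window that made check_region() racy in the first place.]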
bloat-o-meter on x86_64 shows: add/remove: 0/3 grow/shrink: 0/0 up/down: 0/-102 (-102) function old new delta __kstrtab___check_region 15 - -15 __ksymtab___check_region 16 - -16 __check_region 71 - -71 Signed-off-by: Jakub Sitnicki Cc: Bjorn Helgaas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ioport.h | 8 -------- kernel/resource.c | 32 -------------------------------- 2 files changed, 40 deletions(-) diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 2c5250222278..388e3ae94f7a 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -196,10 +196,8 @@ extern struct resource * __request_region(struct resource *, /* Compatibility cruft */ #define release_region(start,n) __release_region(&ioport_resource, (start), (n)) -#define check_mem_region(start,n) __check_region(&iomem_resource, (start), (n)) #define release_mem_region(start,n) __release_region(&iomem_resource, (start), (n)) -extern int __check_region(struct resource *, resource_size_t, resource_size_t); extern void __release_region(struct resource *, resource_size_t, resource_size_t); #ifdef CONFIG_MEMORY_HOTREMOVE @@ -207,12 +205,6 @@ extern int release_mem_region_adjustable(struct resource *, resource_size_t, resource_size_t); #endif -static inline int __deprecated check_region(resource_size_t s, - resource_size_t n) -{ - return __check_region(&ioport_resource, s, n); -} - /* Wrappers for managed devices */ struct device; diff --git a/kernel/resource.c b/kernel/resource.c index 19f2357dfda3..90552aab5f2d 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1034,8 +1034,6 @@ resource_size_t resource_alignment(struct resource *res) * * request_region creates a new busy region. * - * check_region returns non-zero if the area is already busy. - * * release_region releases a matching busy region. */ @@ -1097,36 +1095,6 @@ struct resource * __request_region(struct resource *parent, } EXPORT_SYMBOL(__request_region); -/** - * __check_region - check if a resource region is busy or free - * @parent: parent resource descriptor - * @start: resource start address - * @n: resource region size - * - * Returns 0 if the region is free at the moment it is checked, - * returns %-EBUSY if the region is busy. - * - * NOTE: - * This function is deprecated because its use is racy. - * Even if it returns 0, a subsequent call to request_region() - * may fail because another driver etc. just allocated the region. - * Do NOT use it. It will be removed from the kernel. - */ -int __check_region(struct resource *parent, resource_size_t start, - resource_size_t n) -{ - struct resource * res; - - res = __request_region(parent, start, n, "check-region", 0); - if (!res) - return -EBUSY; - - release_resource(res); - free_resource(res); - return 0; -} -EXPORT_SYMBOL(__check_region); - /** * __release_region - release a previously reserved resource region * @parent: parent resource descriptor -- cgit v1.2.3 From 972fae6993cbbb934345011664dc703c0891dda3 Mon Sep 17 00:00:00 2001 From: Aaron Tomlin Date: Wed, 15 Apr 2015 16:16:47 -0700 Subject: kernel/hung_task.c: change hung_task.c to use for_each_process_thread() In check_hung_uninterruptible_tasks() avoid the use of deprecated while_each_thread(). The "max_count" logic will prevent a livelock - see commit 0c740d0a ("introduce for_each_thread() to replace the buggy while_each_thread()"). Having said this let's use for_each_process_thread(). 
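[Editorial sketch of the idiom, not taken from the patch; g and t are struct task_struct pointers and the walk is done under RCU:

	rcu_read_lock();
	for_each_process_thread(g, t) {
		/* g walks group leaders, t walks every thread */
	}
	rcu_read_unlock();

This is the drop-in replacement for the deprecated do_each_thread()/while_each_thread() pair.]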
Signed-off-by: Aaron Tomlin Acked-by: Oleg Nesterov Cc: David Rientjes Cc: Dave Wysochanski Cc: Aaron Tomlin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/hung_task.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 06db12434d72..e0f90c2b57aa 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -169,7 +169,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) return; rcu_read_lock(); - do_each_thread(g, t) { + for_each_process_thread(g, t) { if (!max_count--) goto unlock; if (!--batch_count) { @@ -180,7 +180,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ if (t->state == TASK_UNINTERRUPTIBLE) check_hung_task(t, timeout); - } while_each_thread(g, t); + } unlock: rcu_read_unlock(); } -- cgit v1.2.3 From 7975a9b7323d5a47e5b651bac810f3271e7156df Mon Sep 17 00:00:00 2001 From: Joel Stanley Date: Wed, 15 Apr 2015 16:16:50 -0700 Subject: drivers/sbus/char/envctrl.c: ignore orderly_poweroff return value orderly_poweroff() unconditionally returns 0, so remove the dead code that checks the return value. A future patch will change the return type to void. Signed-off-by: Joel Stanley Acked-by: David S. Miller Cc: Fabian Frederick Cc: Benjamin Herrenschmidt Cc: Michael Ellerman Cc: Rusty Russell Cc: Jeremy Kerr Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/sbus/char/bbc_envctrl.c | 3 +-- drivers/sbus/char/envctrl.c | 7 +------ 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/drivers/sbus/char/bbc_envctrl.c b/drivers/sbus/char/bbc_envctrl.c index 0787b9756165..228c782d6433 100644 --- a/drivers/sbus/char/bbc_envctrl.c +++ b/drivers/sbus/char/bbc_envctrl.c @@ -160,8 +160,7 @@ static void do_envctrl_shutdown(struct bbc_cpu_temperature *tp) printk(KERN_CRIT "kenvctrld: Shutting down the system now.\n"); shutting_down = 1; - if (orderly_poweroff(true) < 0) - printk(KERN_CRIT "envctrl: shutdown execution failed\n"); + orderly_poweroff(true); } #define WARN_INTERVAL (30 * HZ) diff --git a/drivers/sbus/char/envctrl.c b/drivers/sbus/char/envctrl.c index e244cf3d9ec8..5609b602c54d 100644 --- a/drivers/sbus/char/envctrl.c +++ b/drivers/sbus/char/envctrl.c @@ -970,18 +970,13 @@ static struct i2c_child_t *envctrl_get_i2c_child(unsigned char mon_type) static void envctrl_do_shutdown(void) { static int inprog = 0; - int ret; if (inprog != 0) return; inprog = 1; printk(KERN_CRIT "kenvctrld: WARNING: Shutting down the system now.\n"); - ret = orderly_poweroff(true); - if (ret < 0) { - printk(KERN_CRIT "kenvctrld: WARNING: system shutdown failed!\n"); - inprog = 0; /* unlikely to succeed, but we could try again */ - } + orderly_poweroff(true); } static struct task_struct *kenvctrld_task; -- cgit v1.2.3 From 7a54f46b301cfab8a0d7365aa186545f8b98f22e Mon Sep 17 00:00:00 2001 From: Joel Stanley Date: Wed, 15 Apr 2015 16:16:53 -0700 Subject: kernel/reboot.c: add orderly_reboot for graceful reboot The kernel has orderly_poweroff which allows the kernel to initiate a graceful shutdown of userspace, by running /sbin/poweroff. This adds orderly_reboot that will cause userspace to shut itself down by calling /sbin/reboot. This will be used for shutdown initiated by a system controller on platforms that do not use ACPI. orderly_reboot() should be used when the system wants to allow userspace to gracefully shut itself down. 
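[Illustrative call site, not from this patch: the function takes no arguments and returns void, so a platform event handler can simply do

	orderly_reboot();

and return; a real caller is wired up in the following powernv patch.]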
For cases where the system may imminently catch on fire, the existing emergency_restart() provides an immediate reboot without involving userspace. Signed-off-by: Joel Stanley Cc: Fabian Frederick Cc: Benjamin Herrenschmidt Cc: Michael Ellerman Cc: Rusty Russell Cc: Jeremy Kerr Cc: David S. Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/reboot.h | 3 ++- kernel/reboot.c | 53 +++++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 50 insertions(+), 6 deletions(-) diff --git a/include/linux/reboot.h b/include/linux/reboot.h index 67fc8fcdc4b0..a7ff409f386d 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -70,7 +70,8 @@ void ctrl_alt_del(void); #define POWEROFF_CMD_PATH_LEN 256 extern char poweroff_cmd[POWEROFF_CMD_PATH_LEN]; -extern int orderly_poweroff(bool force); +extern void orderly_poweroff(bool force); +extern void orderly_reboot(void); /* * Emergency restart, callable from an interrupt handler. diff --git a/kernel/reboot.c b/kernel/reboot.c index 5925f5ae8dff..d20c85d9f8c0 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -387,8 +387,9 @@ void ctrl_alt_del(void) } char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; +static const char reboot_cmd[] = "/sbin/reboot"; -static int __orderly_poweroff(bool force) +static int run_cmd(const char *cmd) { char **argv; static char *envp[] = { @@ -397,8 +398,7 @@ static int __orderly_poweroff(bool force) NULL }; int ret; - - argv = argv_split(GFP_KERNEL, poweroff_cmd, NULL); + argv = argv_split(GFP_KERNEL, cmd, NULL); if (argv) { ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); argv_free(argv); @@ -406,8 +406,33 @@ static int __orderly_poweroff(bool force) ret = -ENOMEM; } + return ret; +} + +static int __orderly_reboot(void) +{ + int ret; + + ret = run_cmd(reboot_cmd); + + if (ret) { + pr_warn("Failed to start orderly reboot: forcing the issue\n"); + emergency_sync(); + kernel_restart(NULL); + } + + return ret; +} + +static int __orderly_poweroff(bool force) +{ + int ret; + + ret = run_cmd(poweroff_cmd); + if (ret && force) { pr_warn("Failed to start orderly shutdown: forcing the issue\n"); + /* * I guess this should try to kick off some daemon to sync and * poweroff asap. Or not even bother syncing if we're doing an @@ -436,15 +461,33 @@ static DECLARE_WORK(poweroff_work, poweroff_work_func); * This may be called from any context to trigger a system shutdown. * If the orderly shutdown fails, it will force an immediate shutdown. */ -int orderly_poweroff(bool force) +void orderly_poweroff(bool force) { if (force) /* do not override the pending "true" */ poweroff_force = true; schedule_work(&poweroff_work); - return 0; } EXPORT_SYMBOL_GPL(orderly_poweroff); +static void reboot_work_func(struct work_struct *work) +{ + __orderly_reboot(); +} + +static DECLARE_WORK(reboot_work, reboot_work_func); + +/** + * orderly_reboot - Trigger an orderly system reboot + * + * This may be called from any context to trigger a system reboot. + * If the orderly reboot fails, it will force an immediate reboot. + */ +void orderly_reboot(void) +{ + schedule_work(&reboot_work); +} +EXPORT_SYMBOL_GPL(orderly_reboot); + static int __init reboot_setup(char *str) { for (;;) { -- cgit v1.2.3 From e243304d0a51f5b27a0606c4e5c2ebba6854d20f Mon Sep 17 00:00:00 2001 From: Joel Stanley Date: Wed, 15 Apr 2015 16:16:56 -0700 Subject: powerpc/powernv: reboot when requested by firmware Use orderly_reboot so userspace will shut itself down via the reboot path.
This is required for graceful reboot initiated by the BMC, such as when a user uses ipmitool to issue a 'chassis power cycle' command. Signed-off-by: Joel Stanley Acked-by: Michael Ellerman Cc: Fabian Frederick Cc: Benjamin Herrenschmidt Cc: Rusty Russell Cc: Jeremy Kerr Cc: David S. Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/platforms/powernv/opal-power.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/powernv/opal-power.c b/arch/powerpc/platforms/powernv/opal-power.c index 48bf5b080bcf..ac46c2c24f99 100644 --- a/arch/powerpc/platforms/powernv/opal-power.c +++ b/arch/powerpc/platforms/powernv/opal-power.c @@ -29,8 +29,9 @@ static int opal_power_control_event(struct notifier_block *nb, switch (type) { case SOFT_REBOOT: - /* Fall through. The service processor is responsible for - * bringing the machine back up */ + pr_info("OPAL: reboot requested\n"); + orderly_reboot(); + break; case SOFT_OFF: pr_info("OPAL: poweroff requested\n"); orderly_poweroff(true); -- cgit v1.2.3 From 7b1460eccad0621e0c48f52bbedeb7adddc55ac9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 15 Apr 2015 16:16:59 -0700 Subject: printk: comment pr_cont() stating it is only to continue a line KERN_CONT is nicely commented in kern_levels.h, but pr_cont() is now used more often, and it lacks the comment stating what it is used for. It can be confused as continuing the log level, but that is not its purpose. Its purpose is to continue a line that had no newline enclosed. This should be documented by pr_cont() as well. Signed-off-by: Steven Rostedt Acked-by: Borislav Petkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/printk.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/printk.h b/include/linux/printk.h index baa3f97d8ce8..9b30871c9149 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -255,6 +255,11 @@ extern asmlinkage void dump_stack(void) __cold; printk(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__) #define pr_info(fmt, ...) \ printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) +/* + * Like KERN_CONT, pr_cont() should only be used when continuing + * a line with no newline ('\n') enclosed. Otherwise it defaults + * back to KERN_DEFAULT. + */ #define pr_cont(fmt, ...) \ printk(KERN_CONT fmt, ##__VA_ARGS__) -- cgit v1.2.3 From 51be17dfff753e72872f096ce1e654734f2fec50 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 15 Apr 2015 16:17:02 -0700 Subject: lib/vsprintf.c: eliminate some branches Since FORMAT_TYPE_INT is simply 1 more than FORMAT_TYPE_UINT, and similarly for BYTE/UBYTE, SHORT/USHORT, LONG/ULONG, we can eliminate a few instructions by making SIGN have the value 1 instead of 2, and then use arithmetic instead of branches for computing the right spec->type. It's a little hacky, but certainly in the same spirit as SMALL needing to have the value 0x20. 
For example, for the spec->qualifier == 'l' case, gcc now generates 75e: 0f b6 53 01 movzbl 0x1(%rbx),%edx 762: 83 e2 01 and $0x1,%edx 765: 83 c2 09 add $0x9,%edx 768: 88 13 mov %dl,(%rbx) instead of 763: 0f b6 53 01 movzbl 0x1(%rbx),%edx 767: 83 e2 02 and $0x2,%edx 76a: 80 fa 01 cmp $0x1,%dl 76d: 19 d2 sbb %edx,%edx 76f: 83 c2 0a add $0xa,%edx 772: 88 13 mov %dl,(%rbx) Signed-off-by: Rasmus Villemoes Cc: Peter Zijlstra Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/vsprintf.c | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/lib/vsprintf.c b/lib/vsprintf.c index b235c96167d3..318d583fe862 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -340,8 +340,8 @@ int num_to_str(char *buf, int size, unsigned long long num) return len; } -#define ZEROPAD 1 /* pad with zero */ -#define SIGN 2 /* unsigned/signed long */ +#define SIGN 1 /* unsigned/signed, must be 1 */ +#define ZEROPAD 2 /* pad with zero */ #define PLUS 4 /* show plus */ #define SPACE 8 /* space if plus */ #define LEFT 16 /* left justified */ @@ -447,7 +447,7 @@ char *number(char *buf, char *end, unsigned long long num, spec.precision = i; /* leading space padding */ spec.field_width -= spec.precision; - if (!(spec.flags & (ZEROPAD+LEFT))) { + if (!(spec.flags & (ZEROPAD | LEFT))) { while (--spec.field_width >= 0) { if (buf < end) *buf = ' '; @@ -1738,29 +1738,21 @@ qualifier: if (spec->qualifier == 'L') spec->type = FORMAT_TYPE_LONG_LONG; else if (spec->qualifier == 'l') { - if (spec->flags & SIGN) - spec->type = FORMAT_TYPE_LONG; - else - spec->type = FORMAT_TYPE_ULONG; + BUILD_BUG_ON(FORMAT_TYPE_ULONG + SIGN != FORMAT_TYPE_LONG); + spec->type = FORMAT_TYPE_ULONG + (spec->flags & SIGN); } else if (_tolower(spec->qualifier) == 'z') { spec->type = FORMAT_TYPE_SIZE_T; } else if (spec->qualifier == 't') { spec->type = FORMAT_TYPE_PTRDIFF; } else if (spec->qualifier == 'H') { - if (spec->flags & SIGN) - spec->type = FORMAT_TYPE_BYTE; - else - spec->type = FORMAT_TYPE_UBYTE; + BUILD_BUG_ON(FORMAT_TYPE_UBYTE + SIGN != FORMAT_TYPE_BYTE); + spec->type = FORMAT_TYPE_UBYTE + (spec->flags & SIGN); } else if (spec->qualifier == 'h') { - if (spec->flags & SIGN) - spec->type = FORMAT_TYPE_SHORT; - else - spec->type = FORMAT_TYPE_USHORT; + BUILD_BUG_ON(FORMAT_TYPE_USHORT + SIGN != FORMAT_TYPE_SHORT); + spec->type = FORMAT_TYPE_USHORT + (spec->flags & SIGN); } else { - if (spec->flags & SIGN) - spec->type = FORMAT_TYPE_INT; - else - spec->type = FORMAT_TYPE_UINT; + BUILD_BUG_ON(FORMAT_TYPE_UINT + SIGN != FORMAT_TYPE_INT); + spec->type = FORMAT_TYPE_UINT + (spec->flags & SIGN); } return ++fmt - start; -- cgit v1.2.3 From e26c12c777118d65aeb97eb1651338e7decae80e Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 15 Apr 2015 16:17:05 -0700 Subject: lib/vsprintf.c: reduce stack use in number() At least since the initial git commit, when base was passed as a separate parameter, number() has only been called with bases 8, 10 and 16. I'm guessing that 66 was to accommodate 64 binary digits (0/1), a sign and a '\0', but the buffer is only used for the actual digits. Octal digits carry 3 bits of information, so 24 is enough. Spell that 3*sizeof(num) so one less place needs to be changed should long long ever be 128 bits. Also remove the commented-out code that would handle an arbitrary base.
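[Spelling out the arithmetic, as an editorial aside: num is an unsigned long long, so sizeof(num) == 8 and the array becomes tmp[24]; a 64-bit value needs at most 64/3 rounded up = 22 octal digits, at most 20 decimal digits and 16 hex digits, so 24 bytes cover every supported base with room to spare.]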
Signed-off-by: Rasmus Villemoes Cc: Peter Zijlstra Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/vsprintf.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 318d583fe862..98f1ce907d3d 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -386,7 +386,7 @@ char *number(char *buf, char *end, unsigned long long num, /* we are called with base 8, 10 or 16, only, thus don't need "G..." */ static const char digits[16] = "0123456789ABCDEF"; /* "GHIJKLMNOPQRSTUVWXYZ"; */ - char tmp[66]; + char tmp[3 * sizeof(num)]; char sign; char locase; int need_pfx = ((spec.flags & SPECIAL) && spec.base != 10); @@ -423,11 +423,6 @@ char *number(char *buf, char *end, unsigned long long num, i = 0; if (num < spec.base) tmp[i++] = digits[num] | locase; - /* Generic code, for any base: - else do { - tmp[i++] = (digits[do_div(num,base)] | locase); - } while (num != 0); - */ else if (spec.base != 10) { /* 8 or 16 */ int mask = spec.base - 1; int shift = 3; -- cgit v1.2.3 From 3ea8d440a86b85c63c2bb7f73988626e682db5f0 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 15 Apr 2015 16:17:08 -0700 Subject: lib/vsprintf.c: eliminate duplicate hex string array gcc doesn't merge or overlap const char[] objects with identical contents (probably language lawyers would also insist that these things have different addresses), but there's no reason to have the string "0123456789ABCDEF" occur in multiple places. hex_asc_upper is declared in kernel.h and defined in lib/hexdump.c, which is unconditionally compiled in. Signed-off-by: Rasmus Villemoes Cc: Peter Zijlstra Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/vsprintf.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 98f1ce907d3d..7a299d43987a 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -383,9 +383,6 @@ static noinline_for_stack char *number(char *buf, char *end, unsigned long long num, struct printf_spec spec) { - /* we are called with base 8, 10 or 16, only, thus don't need "G..." */ - static const char digits[16] = "0123456789ABCDEF"; /* "GHIJKLMNOPQRSTUVWXYZ"; */ - char tmp[3 * sizeof(num)]; char sign; char locase; @@ -422,7 +419,7 @@ char *number(char *buf, char *end, unsigned long long num, /* generate full string in tmp[], in reverse order */ i = 0; if (num < spec.base) - tmp[i++] = digits[num] | locase; + tmp[i++] = hex_asc_upper[num] | locase; else if (spec.base != 10) { /* 8 or 16 */ int mask = spec.base - 1; int shift = 3; @@ -430,7 +427,7 @@ char *number(char *buf, char *end, unsigned long long num, if (spec.base == 16) shift = 4; do { - tmp[i++] = (digits[((unsigned char)num) & mask] | locase); + tmp[i++] = (hex_asc_upper[((unsigned char)num) & mask] | locase); num >>= shift; } while (num); } else { /* base 10 */ -- cgit v1.2.3 From d1c1b12137fff14363d0cf45c8b7a9ec5cd4578b Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 15 Apr 2015 16:17:11 -0700 Subject: lib/vsprintf.c: another small hack Making ZEROPAD == '0'-' ', we can eliminate a few more instructions. 
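[Editorial note on why this works: in ASCII ' ' is 0x20 and '0' is 0x30, so ZEROPAD must be 0x10 == 16; then

	char c = ' ' + (spec.flags & ZEROPAD);

yields '0' (0x20 + 0x10) when the flag is set and ' ' (0x20 + 0) when it is not; an add replaces the branch, exactly as the BUILD_BUG_ON added by the patch asserts.]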
Signed-off-by: Rasmus Villemoes Cc: Peter Zijlstra Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/vsprintf.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 7a299d43987a..2753f9261115 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -341,10 +341,10 @@ int num_to_str(char *buf, int size, unsigned long long num) } #define SIGN 1 /* unsigned/signed, must be 1 */ -#define ZEROPAD 2 /* pad with zero */ +#define LEFT 2 /* left justified */ #define PLUS 4 /* show plus */ #define SPACE 8 /* space if plus */ -#define LEFT 16 /* left justified */ +#define ZEROPAD 16 /* pad with zero, must be 16 == '0' - ' ' */ #define SMALL 32 /* use lowercase in hex (must be 32 == 0x20) */ #define SPECIAL 64 /* prefix hex with "0x", octal with "0" */ @@ -467,7 +467,8 @@ char *number(char *buf, char *end, unsigned long long num, } /* zero or space padding */ if (!(spec.flags & LEFT)) { - char c = (spec.flags & ZEROPAD) ? '0' : ' '; + char c = ' ' + (spec.flags & ZEROPAD); + BUILD_BUG_ON(' ' + ZEROPAD != '0'); while (--spec.field_width >= 0) { if (buf < end) *buf = c; -- cgit v1.2.3 From 7330660ed2e8d8d4c65b90cea62d8f1ed49c0104 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 15 Apr 2015 16:17:14 -0700 Subject: lib/vsprintf: document %p parameters passed by reference This patch series improves the documentation for printk() formats, and adds support for printing clocks. The latter has always been a hassle if you wanted to support both the common and legacy clock frameworks. - '%pC' and '%pCn' print the name (Common Clock Framework) or address (legacy clock framework) of a clock, - '%pCr' prints the current clock rate. This patch (of 3): Make sure all %p extensions that take parameters by references are documented to do so. Signed-off-by: Geert Uytterhoeven Cc: Jonathan Corbet Cc: Mike Turquette Cc: Stephen Boyd Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/printk-formats.txt | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/Documentation/printk-formats.txt b/Documentation/printk-formats.txt index 5a615c14f75d..71438f3eb0c0 100644 --- a/Documentation/printk-formats.txt +++ b/Documentation/printk-formats.txt @@ -54,6 +54,7 @@ Struct Resources: For printing struct resources. The 'R' and 'r' specifiers result in a printed resource with ('R') or without ('r') a decoded flags member. + Passed by reference. Physical addresses types phys_addr_t: @@ -132,6 +133,8 @@ MAC/FDDI addresses: specifier to use reversed byte order suitable for visual interpretation of Bluetooth addresses which are in the little endian order. + Passed by reference. + IPv4 addresses: %pI4 1.2.3.4 @@ -146,6 +149,8 @@ IPv4 addresses: host, network, big or little endian order addresses respectively. Where no specifier is provided the default network/big endian order is used. + Passed by reference. + IPv6 addresses: %pI6 0001:0002:0003:0004:0005:0006:0007:0008 @@ -160,6 +165,8 @@ IPv6 addresses: print a compressed IPv6 address as described by http://tools.ietf.org/html/rfc5952 + Passed by reference. + IPv4/IPv6 addresses (generic, with port, flowinfo, scope): %pIS 1.2.3.4 or 0001:0002:0003:0004:0005:0006:0007:0008 @@ -186,6 +193,8 @@ IPv4/IPv6 addresses (generic, with port, flowinfo, scope): specifiers can be used as well and are ignored in case of an IPv6 address. + Passed by reference. 
+ Further examples: %pISfc 1.2.3.4 or [1:2:3:4:5:6:7:8]/123456789 @@ -207,6 +216,8 @@ UUID/GUID addresses: Where no additional specifiers are used the default little endian order with lower case hex characters will be printed. + Passed by reference. + dentry names: %pd{,2,3,4} %pD{,2,3,4} @@ -216,6 +227,8 @@ dentry names: equivalent of %s dentry->d_name.name we used to use, %pd prints n last components. %pD does the same thing for struct file. + Passed by reference. + struct va_format: %pV @@ -231,6 +244,8 @@ struct va_format: Do not use this feature without some mechanism to verify the correctness of the format string and va_list arguments. + Passed by reference. + u64 SHOULD be printed with %llu/%llx: printk("%llu", u64_var); -- cgit v1.2.3 From e8a7ba5f5c8c0c94c0c7f1bcd53c0289560c7446 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 15 Apr 2015 16:17:17 -0700 Subject: lib/vsprintf: Move integer format types to the top Move the format types for 64-bit integers and configurable size integers to the top, so they're next to the other integer format types. While at it, add the missing format types for s32 and u32. Signed-off-by: Geert Uytterhoeven Cc: Jonathan Corbet Cc: Mike Turquette Cc: Stephen Boyd Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/printk-formats.txt | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/Documentation/printk-formats.txt b/Documentation/printk-formats.txt index 71438f3eb0c0..56804e40cb18 100644 --- a/Documentation/printk-formats.txt +++ b/Documentation/printk-formats.txt @@ -8,6 +8,21 @@ If variable is of Type, use printk format specifier: unsigned long long %llu or %llx size_t %zu or %zx ssize_t %zd or %zx + s32 %d or %x + u32 %u or %x + s64 %lld or %llx + u64 %llu or %llx + +If is dependent on a config option for its size (e.g., sector_t, +blkcnt_t) or is architecture-dependent for its size (e.g., tcflag_t), use a +format specifier of its largest possible type and explicitly cast to it. +Example: + + printk("test: sector number/total blocks: %llu/%llu\n", + (unsigned long long)sector, (unsigned long long)blockcount); + +Reminder: sizeof() result is of type size_t. + Raw pointer value SHOULD be printed with %p. The kernel supports the following extended format specifiers for pointer types: @@ -246,23 +261,6 @@ struct va_format: Passed by reference. -u64 SHOULD be printed with %llu/%llx: - - printk("%llu", u64_var); - -s64 SHOULD be printed with %lld/%llx: - - printk("%lld", s64_var); - -If is dependent on a config option for its size (e.g., sector_t, -blkcnt_t) or is architecture-dependent for its size (e.g., tcflag_t), use a -format specifier of its largest possible type and explicitly cast to it. -Example: - - printk("test: sector number/total blocks: %llu/%llu\n", - (unsigned long long)sector, (unsigned long long)blockcount); - -Reminder: sizeof() result is of type size_t. Thank you for your cooperation and attention. -- cgit v1.2.3 From 900cca2944254edd2d54dc181629314d3177a4af Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 15 Apr 2015 16:17:20 -0700 Subject: lib/vsprintf: add %pC{,n,r} format specifiers for clocks Add format specifiers for printing struct clk: - '%pC' or '%pCn': name (Common Clock Framework) or address (legacy clock framework) of the clock, - '%pCr': rate of the clock. 
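[Illustrative usage, not from the patch itself; "myclk" is a made-up struct clk pointer:

	pr_info("core clock %pC running at %pCr Hz\n", myclk, myclk);

On a Common Clock Framework kernel this would print e.g. "core clock pll1 running at 1560000000 Hz".]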
[akpm@linux-foundation.org: omit code if !CONFIG_HAVE_CLK] Signed-off-by: Geert Uytterhoeven Cc: Jonathan Corbet Cc: Mike Turquette Cc: Stephen Boyd Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/printk-formats.txt | 12 ++++++++++++ lib/vsprintf.c | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+) diff --git a/Documentation/printk-formats.txt b/Documentation/printk-formats.txt index 56804e40cb18..cb6a596072bb 100644 --- a/Documentation/printk-formats.txt +++ b/Documentation/printk-formats.txt @@ -261,6 +261,18 @@ struct va_format: Passed by reference. +struct clk: + + %pC pll1 + %pCn pll1 + %pCr 1560000000 + + For printing struct clk structures. '%pC' and '%pCn' print the name + (Common Clock Framework) or address (legacy clock framework) of the + structure; '%pCr' prints the current clock rate. + + Passed by reference. + Thank you for your cooperation and attention. diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 2753f9261115..3ab8c9cf3980 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -17,6 +17,7 @@ */ #include +#include #include /* for KSYM_SYMBOL_LEN */ #include #include @@ -1315,6 +1316,30 @@ char *address_val(char *buf, char *end, const void *addr, return number(buf, end, num, spec); } +static noinline_for_stack +char *clock(char *buf, char *end, struct clk *clk, struct printf_spec spec, + const char *fmt) +{ + if (!IS_ENABLED(CONFIG_HAVE_CLK) || !clk) + return string(buf, end, NULL, spec); + + switch (fmt[1]) { + case 'r': + return number(buf, end, clk_get_rate(clk), spec); + + case 'n': + default: +#ifdef CONFIG_COMMON_CLK + return string(buf, end, __clk_get_name(clk), spec); +#else + spec.base = 16; + spec.field_width = sizeof(unsigned long) * 2 + 2; + spec.flags |= SPECIAL | SMALL | ZEROPAD; + return number(buf, end, (unsigned long)clk, spec); +#endif + } +} + int kptr_restrict __read_mostly; /* @@ -1397,6 +1422,11 @@ int kptr_restrict __read_mostly; * (default assumed to be phys_addr_t, passed by reference) * - 'd[234]' For a dentry name (optionally 2-4 last components) * - 'D[234]' Same as 'd' but for a struct file + * - 'C' For a clock, it prints the name (Common Clock Framework) or address + * (legacy clock framework) of the clock + * - 'Cn' For a clock, it prints the name (Common Clock Framework) or address + * (legacy clock framework) of the clock + * - 'Cr' For a clock, it prints the current rate of the clock * * Note: The difference between 'S' and 'F' is that on ia64 and ppc64 * function pointers are really function descriptors, which contain a @@ -1541,6 +1571,8 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr, return address_val(buf, end, ptr, spec, fmt); case 'd': return dentry_name(buf, end, ptr, spec, fmt); + case 'C': + return clock(buf, end, ptr, spec, fmt); case 'D': return dentry_name(buf, end, ((const struct file *)ptr)->f_path.dentry, @@ -1785,6 +1817,11 @@ qualifier: * %*pE[achnops] print an escaped buffer * %*ph[CDN] a variable-length hex string with a separator (supports up to 64 * bytes of the input) + * %pC output the name (Common Clock Framework) or address (legacy clock + * framework) of a clock + * %pCn output the name (Common Clock Framework) or address (legacy clock + * framework) of a clock + * %pCr output the current rate of a clock * %n is ignored * * ** Please update Documentation/printk-formats.txt when making changes ** -- cgit v1.2.3 From 9c98f2359600386efd1c6adfabc5a04118a79798 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 15 Apr 
2015 16:17:23 -0700 Subject: lib/vsprintf.c: fix potential NULL deref in hex_string The helper hex_string() is broken in two ways. First, it increments buf only while there is room to print, rather than unconditionally, so callers such as kasprintf() that try to probe for the correct amount of storage to allocate will get a return value that is too small. But even worse, kasprintf() (and likely anyone else trying to find the size of the result) passes NULL for buf and 0 for size, so we also have end == NULL. But this means that the end-1 in hex_string() is (char*)-1, so buf < end-1 is true and we get a NULL pointer deref. I double-checked this with a trivial kernel module that just did a kasprintf(GFP_KERNEL, "%14ph", "CrashBoomBang"). Nobody seems to be using %ph with kasprintf, but we might as well fix it before it hits someone. Signed-off-by: Rasmus Villemoes Acked-by: Andy Shevchenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/vsprintf.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 3ab8c9cf3980..4da1e7aaf9d5 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -777,11 +777,19 @@ char *hex_string(char *buf, char *end, u8 *addr, struct printf_spec spec, if (spec.field_width > 0) len = min_t(int, spec.field_width, 64); - for (i = 0; i < len && buf < end - 1; i++) { - buf = hex_byte_pack(buf, addr[i]); + for (i = 0; i < len; ++i) { + if (buf < end) + *buf = hex_asc_hi(addr[i]); + ++buf; + if (buf < end) + *buf = hex_asc_lo(addr[i]); + ++buf; - if (buf < end && separator && i != len - 1) - *buf++ = separator; + if (separator && i != len - 1) { + if (buf < end) + *buf = separator; + ++buf; + } } return buf; -- cgit v1.2.3 From 3aeddc7d665e41b1ba193f5c427ca52086d085ae Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 15 Apr 2015 16:17:25 -0700 Subject: lib/string_helpers.c: refactor string_escape_mem When printf is given the format specifier %pE, it needs a way of obtaining the total output size that would be generated if the buffer was large enough, and string_escape_mem doesn't easily provide that. This is a refactoring of string_escape_mem in preparation for changing its external API to provide that information. The somewhat ugly early returns and subsequent seemingly redundant conditionals are to make the following patch touch as little as possible in string_helpers.c while still preserving the current behaviour of never outputting partial escape sequences. That behaviour must also change for %pE to work as one expects from every other printf specifier.
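[Editorial sketch of the emit pattern this refactor introduces, using a hypothetical character c:

	if (out < end)
		*out = c;	/* write only while room remains */
	++out;			/* but always advance the cursor */

The final cursor minus the start then equals the would-be output size, the same convention vsnprintf() itself follows; the next patch builds the new API contract on exactly this.]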
Signed-off-by: Rasmus Villemoes Acked-by: Andy Shevchenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/string_helpers.c | 208 ++++++++++++++++++++++++++------------------------- 1 file changed, 105 insertions(+), 103 deletions(-) diff --git a/lib/string_helpers.c b/lib/string_helpers.c index 8f8c4417f228..9c48ddad0f0d 100644 --- a/lib/string_helpers.c +++ b/lib/string_helpers.c @@ -239,29 +239,21 @@ int string_unescape(char *src, char *dst, size_t size, unsigned int flags) } EXPORT_SYMBOL(string_unescape); -static int escape_passthrough(unsigned char c, char **dst, size_t *osz) +static bool escape_passthrough(unsigned char c, char **dst, char *end) { char *out = *dst; - if (*osz < 1) - return -ENOMEM; - - *out++ = c; - - *dst = out; - *osz -= 1; - - return 1; + if (out < end) + *out = c; + *dst = out + 1; + return true; } -static int escape_space(unsigned char c, char **dst, size_t *osz) +static bool escape_space(unsigned char c, char **dst, char *end) { char *out = *dst; unsigned char to; - if (*osz < 2) - return -ENOMEM; - switch (c) { case '\n': to = 'n'; @@ -279,26 +271,30 @@ static int escape_space(unsigned char c, char **dst, size_t *osz) to = 'f'; break; default: - return 0; + return false; } - *out++ = '\\'; - *out++ = to; + if (out + 2 > end) { + *dst = out + 2; + return true; + } - *dst = out; - *osz -= 2; + if (out < end) + *out = '\\'; + ++out; + if (out < end) + *out = to; + ++out; - return 1; + *dst = out; + return true; } -static int escape_special(unsigned char c, char **dst, size_t *osz) +static bool escape_special(unsigned char c, char **dst, char *end) { char *out = *dst; unsigned char to; - if (*osz < 2) - return -ENOMEM; - switch (c) { case '\\': to = '\\'; @@ -310,71 +306,98 @@ static int escape_special(unsigned char c, char **dst, size_t *osz) to = 'e'; break; default: - return 0; + return false; } - *out++ = '\\'; - *out++ = to; + if (out + 2 > end) { + *dst = out + 2; + return true; + } - *dst = out; - *osz -= 2; + if (out < end) + *out = '\\'; + ++out; + if (out < end) + *out = to; + ++out; - return 1; + *dst = out; + return true; } -static int escape_null(unsigned char c, char **dst, size_t *osz) +static bool escape_null(unsigned char c, char **dst, char *end) { char *out = *dst; - if (*osz < 2) - return -ENOMEM; - if (c) - return 0; + return false; - *out++ = '\\'; - *out++ = '0'; + if (out + 2 > end) { + *dst = out + 2; + return true; + } - *dst = out; - *osz -= 2; + if (out < end) + *out = '\\'; + ++out; + if (out < end) + *out = '0'; + ++out; - return 1; + *dst = out; + return true; } -static int escape_octal(unsigned char c, char **dst, size_t *osz) +static bool escape_octal(unsigned char c, char **dst, char *end) { char *out = *dst; - if (*osz < 4) - return -ENOMEM; + if (out + 4 > end) { + *dst = out + 4; + return true; + } - *out++ = '\\'; - *out++ = ((c >> 6) & 0x07) + '0'; - *out++ = ((c >> 3) & 0x07) + '0'; - *out++ = ((c >> 0) & 0x07) + '0'; + if (out < end) + *out = '\\'; + ++out; + if (out < end) + *out = ((c >> 6) & 0x07) + '0'; + ++out; + if (out < end) + *out = ((c >> 3) & 0x07) + '0'; + ++out; + if (out < end) + *out = ((c >> 0) & 0x07) + '0'; + ++out; *dst = out; - *osz -= 4; - - return 1; + return true; } -static int escape_hex(unsigned char c, char **dst, size_t *osz) +static bool escape_hex(unsigned char c, char **dst, char *end) { char *out = *dst; - if (*osz < 4) - return -ENOMEM; + if (out + 4 > end) { + *dst = out + 4; + return true; + } - *out++ = '\\'; - *out++ = 'x'; - *out++ = hex_asc_hi(c); - *out++ = 
hex_asc_lo(c); + if (out < end) + *out = '\\'; + ++out; + if (out < end) + *out = 'x'; + ++out; + if (out < end) + *out = hex_asc_hi(c); + ++out; + if (out < end) + *out = hex_asc_lo(c); + ++out; *dst = out; - *osz -= 4; - - return 1; + return true; } /** @@ -436,9 +459,10 @@ static int escape_hex(unsigned char c, char **dst, size_t *osz) int string_escape_mem(const char *src, size_t isz, char **dst, size_t osz, unsigned int flags, const char *esc) { - char *out = *dst, *p = out; + char *p = *dst; + char *end = p + osz; bool is_dict = esc && *esc; - int ret = 0; + int ret; while (isz--) { unsigned char c = *src++; @@ -458,55 +482,33 @@ int string_escape_mem(const char *src, size_t isz, char **dst, size_t osz, (is_dict && !strchr(esc, c))) { /* do nothing */ } else { - if (flags & ESCAPE_SPACE) { - ret = escape_space(c, &p, &osz); - if (ret < 0) - break; - if (ret > 0) - continue; - } - - if (flags & ESCAPE_SPECIAL) { - ret = escape_special(c, &p, &osz); - if (ret < 0) - break; - if (ret > 0) - continue; - } - - if (flags & ESCAPE_NULL) { - ret = escape_null(c, &p, &osz); - if (ret < 0) - break; - if (ret > 0) - continue; - } + if (flags & ESCAPE_SPACE && escape_space(c, &p, end)) + continue; + + if (flags & ESCAPE_SPECIAL && escape_special(c, &p, end)) + continue; + + if (flags & ESCAPE_NULL && escape_null(c, &p, end)) + continue; /* ESCAPE_OCTAL and ESCAPE_HEX always go last */ - if (flags & ESCAPE_OCTAL) { - ret = escape_octal(c, &p, &osz); - if (ret < 0) - break; + if (flags & ESCAPE_OCTAL && escape_octal(c, &p, end)) continue; - } - if (flags & ESCAPE_HEX) { - ret = escape_hex(c, &p, &osz); - if (ret < 0) - break; + + if (flags & ESCAPE_HEX && escape_hex(c, &p, end)) continue; - } } - ret = escape_passthrough(c, &p, &osz); - if (ret < 0) - break; + escape_passthrough(c, &p, end); } - *dst = p; - - if (ret < 0) - return ret; + if (p > end) { + *dst = end; + return -ENOMEM; + } - return p - out; + ret = p - *dst; + *dst = p; + return ret; } EXPORT_SYMBOL(string_escape_mem); -- cgit v1.2.3 From 41416f2330112d29f2cfa337bfc7e672bf0c2768 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 15 Apr 2015 16:17:28 -0700 Subject: lib/string_helpers.c: change semantics of string_escape_mem The current semantics of string_escape_mem are inadequate for one of its current users, vsnprintf(). If that is to honour its contract, it must know how much space would be needed for the entire escaped buffer, and string_escape_mem provides no way of obtaining that (short of allocating a large enough buffer (~4 times input string) to let it play with, and that's definitely a big no-no inside vsnprintf). So change the semantics for string_escape_mem to be more snprintf-like: Return the size of the output that would be generated if the destination buffer was big enough, but of course still only write to the part of dst it is allowed to, and (contrary to snprintf) don't do '\0'-termination. It is then up to the caller to detect whether output was truncated and to append a '\0' if desired. Also, we must output partial escape sequences, otherwise a call such as snprintf(buf, 3, "%1pE", "\123") would cause printf to write a \0 to buf[2] but leaving buf[0] and buf[1] with whatever they previously contained. This also fixes a bug in the escaped_string() helper function, which used to unconditionally pass a length of "end-buf" to string_escape_mem(); since the latter doesn't check osz for being insanely large, it would happily write to dst. 
For example, kasprintf(GFP_KERNEL, "something and then %pE", ...); is an easy way to trigger an oops. In test-string_helpers.c, the -ENOMEM test is replaced with testing for getting the expected return value even if the buffer is too small. We also ensure that nothing is written (by relying on a NULL pointer deref) if the output size is 0 by passing NULL - this has to work for kasprintf("%pE") to work. In net/sunrpc/cache.c, I think qword_add still has the same semantics. Someone should definitely double-check this. In fs/proc/array.c, I made the minimum possible change, but longer-term it should stop poking around in seq_file internals. [andriy.shevchenko@linux.intel.com: simplify qword_add] [andriy.shevchenko@linux.intel.com: add missed curly braces] Signed-off-by: Rasmus Villemoes Acked-by: Andy Shevchenko Signed-off-by: Andy Shevchenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/array.c | 4 ++-- include/linux/string_helpers.h | 8 +++---- lib/string_helpers.c | 49 ++++++------------------------------------ lib/test-string_helpers.c | 40 +++++++++++++++++----------------- lib/vsprintf.c | 8 +++++-- net/sunrpc/cache.c | 8 ++++--- 6 files changed, 44 insertions(+), 73 deletions(-) diff --git a/fs/proc/array.c b/fs/proc/array.c index a4490c0a4644..13f047ad08e4 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -99,8 +99,8 @@ static inline void task_name(struct seq_file *m, struct task_struct *p) buf = m->buf + m->count; /* Ignore error for now */ - string_escape_str(tcomm, &buf, m->size - m->count, - ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\"); + buf += string_escape_str(tcomm, buf, m->size - m->count, + ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\"); m->count = buf - m->buf; seq_putc(m, '\n'); diff --git a/include/linux/string_helpers.h b/include/linux/string_helpers.h index 657571817260..0991913f4953 100644 --- a/include/linux/string_helpers.h +++ b/include/linux/string_helpers.h @@ -47,22 +47,22 @@ static inline int string_unescape_any_inplace(char *buf) #define ESCAPE_ANY_NP (ESCAPE_ANY | ESCAPE_NP) #define ESCAPE_HEX 0x20 -int string_escape_mem(const char *src, size_t isz, char **dst, size_t osz, +int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz, unsigned int flags, const char *esc); static inline int string_escape_mem_any_np(const char *src, size_t isz, - char **dst, size_t osz, const char *esc) + char *dst, size_t osz, const char *esc) { return string_escape_mem(src, isz, dst, osz, ESCAPE_ANY_NP, esc); } -static inline int string_escape_str(const char *src, char **dst, size_t sz, +static inline int string_escape_str(const char *src, char *dst, size_t sz, unsigned int flags, const char *esc) { return string_escape_mem(src, strlen(src), dst, sz, flags, esc); } -static inline int string_escape_str_any_np(const char *src, char **dst, +static inline int string_escape_str_any_np(const char *src, char *dst, size_t sz, const char *esc) { return string_escape_str(src, dst, sz, ESCAPE_ANY_NP, esc); diff --git a/lib/string_helpers.c b/lib/string_helpers.c index 9c48ddad0f0d..1826c7407258 100644 --- a/lib/string_helpers.c +++ b/lib/string_helpers.c @@ -274,11 +274,6 @@ static bool escape_space(unsigned char c, char **dst, char *end) return false; } - if (out + 2 > end) { - *dst = out + 2; - return true; - } - if (out < end) *out = '\\'; ++out; @@ -309,11 +304,6 @@ static bool escape_special(unsigned char c, char **dst, char *end) return false; } - if (out + 2 > end) { - *dst = out + 2; - return true; - } - if (out < end) *out = '\\'; ++out; @@ -332,11 
+322,6 @@ static bool escape_null(unsigned char c, char **dst, char *end) if (c) return false; - if (out + 2 > end) { - *dst = out + 2; - return true; - } - if (out < end) *out = '\\'; ++out; @@ -352,11 +337,6 @@ static bool escape_octal(unsigned char c, char **dst, char *end) { char *out = *dst; - if (out + 4 > end) { - *dst = out + 4; - return true; - } - if (out < end) *out = '\\'; ++out; @@ -378,11 +358,6 @@ static bool escape_hex(unsigned char c, char **dst, char *end) { char *out = *dst; - if (out + 4 > end) { - *dst = out + 4; - return true; - } - if (out < end) *out = '\\'; ++out; @@ -449,20 +424,17 @@ static bool escape_hex(unsigned char c, char **dst, char *end) * it if needs. * * Return: - * The amount of the characters processed to the destination buffer, or - * %-ENOMEM if the size of buffer is not enough to put an escaped character is - * returned. - * - * Even in the case of error @dst pointer will be updated to point to the byte - * after the last processed character. + * The total size of the escaped output that would be generated for + * the given input and flags. To check whether the output was + * truncated, compare the return value to osz. There is room left in + * dst for a '\0' terminator if and only if ret < osz. */ -int string_escape_mem(const char *src, size_t isz, char **dst, size_t osz, +int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz, unsigned int flags, const char *esc) { - char *p = *dst; + char *p = dst; char *end = p + osz; bool is_dict = esc && *esc; - int ret; while (isz--) { unsigned char c = *src++; @@ -502,13 +474,6 @@ int string_escape_mem(const char *src, size_t isz, char **dst, size_t osz, escape_passthrough(c, &p, end); } - if (p > end) { - *dst = end; - return -ENOMEM; - } - - ret = p - *dst; - *dst = p; - return ret; + return p - dst; } EXPORT_SYMBOL(string_escape_mem); diff --git a/lib/test-string_helpers.c b/lib/test-string_helpers.c index ab0d30e1e18f..8e376efd88a4 100644 --- a/lib/test-string_helpers.c +++ b/lib/test-string_helpers.c @@ -260,16 +260,28 @@ static __init const char *test_string_find_match(const struct test_string_2 *s2, return NULL; } +static __init void +test_string_escape_overflow(const char *in, int p, unsigned int flags, const char *esc, + int q_test, const char *name) +{ + int q_real; + + q_real = string_escape_mem(in, p, NULL, 0, flags, esc); + if (q_real != q_test) + pr_warn("Test '%s' failed: flags = %u, osz = 0, expected %d, got %d\n", + name, flags, q_test, q_real); +} + static __init void test_string_escape(const char *name, const struct test_string_2 *s2, unsigned int flags, const char *esc) { - int q_real = 512; - char *out_test = kmalloc(q_real, GFP_KERNEL); - char *out_real = kmalloc(q_real, GFP_KERNEL); + size_t out_size = 512; + char *out_test = kmalloc(out_size, GFP_KERNEL); + char *out_real = kmalloc(out_size, GFP_KERNEL); char *in = kmalloc(256, GFP_KERNEL); - char *buf = out_real; int p = 0, q_test = 0; + int q_real; if (!out_test || !out_real || !in) goto out; @@ -301,29 +313,19 @@ static __init void test_string_escape(const char *name, q_test += len; } - q_real = string_escape_mem(in, p, &buf, q_real, flags, esc); + q_real = string_escape_mem(in, p, out_real, out_size, flags, esc); test_string_check_buf(name, flags, in, p, out_real, q_real, out_test, q_test); + + test_string_escape_overflow(in, p, flags, esc, q_test, name); + out: kfree(in); kfree(out_real); kfree(out_test); } -static __init void test_string_escape_nomem(void) -{ - char *in = "\eb \\C\007\"\x90\r]"; - char 
out[64], *buf = out; - int rc = -ENOMEM, ret; - - ret = string_escape_str_any_np(in, &buf, strlen(in), NULL); - if (ret == rc) - return; - - pr_err("Test 'escape nomem' failed: got %d instead of %d\n", ret, rc); -} - static int __init test_string_helpers_init(void) { unsigned int i; @@ -342,8 +344,6 @@ static int __init test_string_helpers_init(void) for (i = 0; i < (ESCAPE_ANY_NP | ESCAPE_HEX) + 1; i++) test_string_escape("escape 1", escape1, i, TEST_STRING_2_DICT_1); - test_string_escape_nomem(); - return -EINVAL; } module_init(test_string_helpers_init); diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 4da1e7aaf9d5..3a1e0843f9a2 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -1235,8 +1235,12 @@ char *escaped_string(char *buf, char *end, u8 *addr, struct printf_spec spec, len = spec.field_width < 0 ? 1 : spec.field_width; - /* Ignore the error. We print as many characters as we can */ - string_escape_mem(addr, len, &buf, end - buf, flags, NULL); + /* + * string_escape_mem() writes as many characters as it can to + * the given buffer, and returns the total size of the output + * had the buffer been big enough. + */ + buf += string_escape_mem(addr, len, buf, buf < end ? end - buf : 0, flags, NULL); return buf; } diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 5199bb1a017e..2928afffbb81 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -1072,10 +1072,12 @@ void qword_add(char **bpp, int *lp, char *str) if (len < 0) return; - ret = string_escape_str(str, &bp, len, ESCAPE_OCTAL, "\\ \n\t"); - if (ret < 0 || ret == len) + ret = string_escape_str(str, bp, len, ESCAPE_OCTAL, "\\ \n\t"); + if (ret >= len) { + bp += len; len = -1; - else { + } else { + bp += ret; len -= ret; *bp++ = ' '; len--; -- cgit v1.2.3 From 49e7d9df9046590ab25788cb2a0cbcf071942e0d Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 15 Apr 2015 16:17:31 -0700 Subject: MAINTAINERS: Use tabs consistently Consistently use a single tab after the "specifier:" type. Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 70 ++++++++++++++++++++++++++++++------------------------------- 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 190981382853..87b2644a78fa 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -625,16 +625,16 @@ F: drivers/iommu/amd_iommu*.[ch] F: include/linux/amd-iommu.h AMD KFD -M: Oded Gabbay -L: dri-devel@lists.freedesktop.org -T: git git://people.freedesktop.org/~gabbayo/linux.git -S: Supported -F: drivers/gpu/drm/amd/amdkfd/ +M: Oded Gabbay +L: dri-devel@lists.freedesktop.org +T: git git://people.freedesktop.org/~gabbayo/linux.git +S: Supported +F: drivers/gpu/drm/amd/amdkfd/ F: drivers/gpu/drm/amd/include/cik_structs.h F: drivers/gpu/drm/amd/include/kgd_kfd_interface.h -F: drivers/gpu/drm/radeon/radeon_kfd.c -F: drivers/gpu/drm/radeon/radeon_kfd.h -F: include/uapi/linux/kfd_ioctl.h +F: drivers/gpu/drm/radeon/radeon_kfd.c +F: drivers/gpu/drm/radeon/radeon_kfd.h +F: include/uapi/linux/kfd_ioctl.h AMD MICROCODE UPDATE SUPPORT M: Borislav Petkov @@ -1967,10 +1967,10 @@ F: Documentation/filesystems/befs.txt F: fs/befs/ BECKHOFF CX5020 ETHERCAT MASTER DRIVER -M: Dariusz Marcinkiewicz -L: netdev@vger.kernel.org -S: Maintained -F: drivers/net/ethernet/ec_bhf.c +M: Dariusz Marcinkiewicz +L: netdev@vger.kernel.org +S: Maintained +F: drivers/net/ethernet/ec_bhf.c BFS FILE SYSTEM M: "Tigran A. 
Aivazian" @@ -2895,11 +2895,11 @@ S: Supported F: drivers/net/ethernet/chelsio/cxgb3/ CXGB3 ISCSI DRIVER (CXGB3I) -M: Karen Xie -L: linux-scsi@vger.kernel.org -W: http://www.chelsio.com -S: Supported -F: drivers/scsi/cxgbi/cxgb3i +M: Karen Xie +L: linux-scsi@vger.kernel.org +W: http://www.chelsio.com +S: Supported +F: drivers/scsi/cxgbi/cxgb3i CXGB3 IWARP RNIC DRIVER (IW_CXGB3) M: Steve Wise @@ -2916,11 +2916,11 @@ S: Supported F: drivers/net/ethernet/chelsio/cxgb4/ CXGB4 ISCSI DRIVER (CXGB4I) -M: Karen Xie -L: linux-scsi@vger.kernel.org -W: http://www.chelsio.com -S: Supported -F: drivers/scsi/cxgbi/cxgb4i +M: Karen Xie +L: linux-scsi@vger.kernel.org +W: http://www.chelsio.com +S: Supported +F: drivers/scsi/cxgbi/cxgb4i CXGB4 IWARP RNIC DRIVER (IW_CXGB4) M: Steve Wise @@ -5222,7 +5222,7 @@ F: arch/x86/kernel/tboot.c INTEL WIRELESS WIMAX CONNECTION 2400 M: Inaky Perez-Gonzalez M: linux-wimax@intel.com -L: wimax@linuxwimax.org (subscribers-only) +L: wimax@linuxwimax.org (subscribers-only) S: Supported W: http://linuxwimax.org F: Documentation/wimax/README.i2400m @@ -5918,7 +5918,7 @@ F: arch/powerpc/platforms/512x/ F: arch/powerpc/platforms/52xx/ LINUX FOR POWERPC EMBEDDED PPC4XX -M: Alistair Popple +M: Alistair Popple M: Matt Porter W: http://www.penguinppc.org/ L: linuxppc-dev@lists.ozlabs.org @@ -6391,7 +6391,7 @@ S: Supported F: drivers/watchdog/mena21_wdt.c MEN CHAMELEON BUS (mcb) -M: Johannes Thumshirn +M: Johannes Thumshirn S: Supported F: drivers/mcb/ F: include/linux/mcb.h @@ -7947,10 +7947,10 @@ L: rtc-linux@googlegroups.com S: Maintained QAT DRIVER -M: Tadeusz Struk -L: qat-linux@intel.com -S: Supported -F: drivers/crypto/qat/ +M: Tadeusz Struk +L: qat-linux@intel.com +S: Supported +F: drivers/crypto/qat/ QIB DRIVER M: Mike Marciniszyn @@ -10120,11 +10120,11 @@ F: include/linux/cdrom.h F: include/uapi/linux/cdrom.h UNISYS S-PAR DRIVERS -M: Benjamin Romer -M: David Kershner -L: sparmaintainer@unisys.com (Unisys internal) -S: Supported -F: drivers/staging/unisys/ +M: Benjamin Romer +M: David Kershner +L: sparmaintainer@unisys.com (Unisys internal) +S: Supported +F: drivers/staging/unisys/ UNIVERSAL FLASH STORAGE HOST CONTROLLER DRIVER M: Vinayak Holikatti @@ -10681,7 +10681,7 @@ F: drivers/media/rc/winbond-cir.c WIMAX STACK M: Inaky Perez-Gonzalez M: linux-wimax@intel.com -L: wimax@linuxwimax.org (subscribers-only) +L: wimax@linuxwimax.org (subscribers-only) S: Supported W: http://linuxwimax.org F: Documentation/wimax/README.wimax -- cgit v1.2.3 From 3ca7bf8756a0426e642446ae35df31a29a1b1108 Mon Sep 17 00:00:00 2001 From: Ricardo Ribalda Delgado Date: Wed, 15 Apr 2015 16:17:34 -0700 Subject: CREDITS: add Ricardo Ribalda Delgado Add personal details to CREDITS file. 
Signed-off-by: Ricardo Ribalda Delgado Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- CREDITS | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/CREDITS b/CREDITS index 843e17647f3b..a0060bd054b5 100644 --- a/CREDITS +++ b/CREDITS @@ -3008,6 +3008,19 @@ W: http://www.qsl.net/dl1bke/ D: Generic Z8530 driver, AX.25 DAMA slave implementation D: Several AX.25 hacks +N: Ricardo Ribalda Delgado +E: ricardo.ribalda@gmail.com +W: http://ribalda.com +D: PLX USB338x driver +D: PCA9634 driver +D: Option GTM671WFS +D: Fintek F81216A +D: Various kernel hacks +S: Qtechnology A/S +S: Valby Langgade 142 +S: 2500 Valby +S: Denmark + N: Francois-Rene Rideau E: fare@tunes.org W: http://www.tunes.org/~fare -- cgit v1.2.3 From 01c7e5a50cc16174b7ccfdbcf32e9bb136f277ba Mon Sep 17 00:00:00 2001 From: Ricardo Ribalda Delgado Date: Wed, 15 Apr 2015 16:17:37 -0700 Subject: .mailmap: add Ricardo Ribalda My work and home computers had different settings in their mail clients, so some contributions appear as Ricardo Ribalda, others as Ricardo Ribalda Delgado (and one as just Ricardo). Signed-off-by: Ricardo Ribalda Delgado Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .mailmap | 1 + 1 file changed, 1 insertion(+) diff --git a/.mailmap b/.mailmap index 0d971cfb0772..6287004040e7 100644 --- a/.mailmap +++ b/.mailmap @@ -100,6 +100,7 @@ Rajesh Shah Ralf Baechle Ralf Wildenhues Rémi Denis-Courmont +Ricardo Ribalda Delgado Rudolf Marek Rui Saraiva Sachin P Sant -- cgit v1.2.3 From 8a72ed6fa7e89c5ecd68803cd1160c35b079ea3b Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 15 Apr 2015 16:17:39 -0700 Subject: MAINTAINERS: CREDITS: remove Stefano Brivio from B43 This email address isn't working anymore. Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- CREDITS | 4 ++++ MAINTAINERS | 4 +--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/CREDITS b/CREDITS index a0060bd054b5..2ef5dceef324 100644 --- a/CREDITS +++ b/CREDITS @@ -508,6 +508,10 @@ E: paul@paulbristow.net W: http://paulbristow.net/linux/idefloppy.html D: Maintainer of IDE/ATAPI floppy driver +N: Stefano Brivio +E: stefano.brivio@polimi.it +D: Broadcom B43 driver + N: Dominik Brodowski E: linux@brodo.de W: http://www.brodo.de/ diff --git a/MAINTAINERS b/MAINTAINERS index 87b2644a78fa..61740322d465 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1915,16 +1915,14 @@ S: Maintained F: drivers/media/radio/radio-aztech* B43 WIRELESS DRIVER -M: Stefano Brivio L: linux-wireless@vger.kernel.org L: b43-dev@lists.infradead.org W: http://wireless.kernel.org/en/users/Drivers/b43 -S: Maintained +S: Odd Fixes F: drivers/net/wireless/b43/ B43LEGACY WIRELESS DRIVER M: Larry Finger -M: Stefano Brivio L: linux-wireless@vger.kernel.org L: b43-dev@lists.infradead.org W: http://wireless.kernel.org/en/users/Drivers/b43 -- cgit v1.2.3 From 89c1e79eb302349fcaf0697bc9116a4ff16bfeb0 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 15 Apr 2015 16:17:42 -0700 Subject: linux/bitmap.h: improve BITMAP_{LAST,FIRST}_WORD_MASK The macro BITMAP_LAST_WORD_MASK can be implemented without a conditional, which will generally lead to slightly better generated code (221 bytes saved for allmodconfig without GCOV_KERNEL, ~2k with GCOV_KERNEL). As a small bonus, this also ensures that the nbits parameter is expanded exactly once. In BITMAP_FIRST_WORD_MASK, if start is signed, gcc is technically allowed to assume it is positive (or divisible by BITS_PER_LONG), and hence just do the simple mask.
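To see why the branch-free form is equivalent, a worked sketch (assuming BITS_PER_LONG == 64; the 32-bit arithmetic is analogous):

  /* BITMAP_LAST_WORD_MASK(nbits) == ~0UL >> (-(nbits) & 63)
   *
   *   nbits = 3:   -3 & 63 == 61  ->  ~0UL >> 61 == 0x7   (low 3 bits)
   *   nbits = 64:  -64 & 63 == 0  ->  ~0UL >> 0  == ~0UL  (whole word)
   *
   * The old conditional existed only to special-case multiples of
   * BITS_PER_LONG; the modular negation handles them for free, and
   * nbits is now expanded exactly once.
   */

Whether gcc actually exploits that signed-arithmetic latitude is a separate question.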
It doesn't seem to use this, and even on an architecture like x86 where the shift only depends on the lower 5 or 6 bits, and these bits are not affected by the signedness of the expression, gcc still generates code to compute the C99 mandated value of start % BITS_PER_LONG. So just use a mask explicitly, also for consistency with BITMAP_LAST_WORD_MASK. Signed-off-by: Rasmus Villemoes Cc: Tejun Heo Reviewed-by: George Spelvin Cc: Yury Norov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitmap.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index dbfbf4990005..be4fa5ddf36c 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -172,12 +172,8 @@ extern unsigned int bitmap_ord_to_pos(const unsigned long *bitmap, unsigned int extern int bitmap_print_to_pagebuf(bool list, char *buf, const unsigned long *maskp, int nmaskbits); -#define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) % BITS_PER_LONG)) -#define BITMAP_LAST_WORD_MASK(nbits) \ -( \ - ((nbits) % BITS_PER_LONG) ? \ - (1UL<<((nbits) % BITS_PER_LONG))-1 : ~0UL \ -) +#define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1))) +#define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1))) #define small_const_nbits(nbits) \ (__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG) -- cgit v1.2.3 From 3ac62bc0602794dc36aad77a7ef5772e989b2e22 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 15 Apr 2015 16:17:45 -0700 Subject: x86: mtrr: if: remove use of seq_printf return value The seq_printf return value, because it's frequently misused, will eventually be converted to void. See: commit 1f33c41c03da ("seq_file: Rename seq_overflow() to seq_has_overflowed() and make public") Signed-off-by: Joe Perches Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/mtrr/if.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c index a041e094b8b9..d76f13d6d8d6 100644 --- a/arch/x86/kernel/cpu/mtrr/if.c +++ b/arch/x86/kernel/cpu/mtrr/if.c @@ -404,11 +404,10 @@ static const struct file_operations mtrr_fops = { static int mtrr_seq_show(struct seq_file *seq, void *offset) { char factor; - int i, max, len; + int i, max; mtrr_type type; unsigned long base, size; - len = 0; max = num_var_ranges; for (i = 0; i < max; i++) { mtrr_if->get(i, &base, &size, &type); @@ -425,11 +424,10 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset) size >>= 20 - PAGE_SHIFT; } /* Base can be > 32bit */ - len += seq_printf(seq, "reg%02i: base=0x%06lx000 " - "(%5luMB), size=%5lu%cB, count=%d: %s\n", - i, base, base >> (20 - PAGE_SHIFT), size, - factor, mtrr_usage_table[i], - mtrr_attrib_to_str(type)); + seq_printf(seq, "reg%02i: base=0x%06lx000 (%5luMB), size=%5lu%cB, count=%d: %s\n", + i, base, base >> (20 - PAGE_SHIFT), + size, factor, + mtrr_usage_table[i], mtrr_attrib_to_str(type)); } return 0; } -- cgit v1.2.3 From 9f6a240e8b08d3fa711c2b615e7ea901cf59e590 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 15 Apr 2015 16:17:48 -0700 Subject: power: wakeup: remove use of seq_printf return value The seq_printf return value, because it's frequently misused, will eventually be converted to void. See: commit 1f33c41c03da ("seq_file: Rename seq_overflow() to seq_has_overflowed() and make public") Signed-off-by: Joe Perches Cc: "Rafael J. 
Wysocki" Cc: Pavel Machek Cc: Len Brown Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/power/wakeup.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c index aab7158d2afe..77262009f89d 100644 --- a/drivers/base/power/wakeup.c +++ b/drivers/base/power/wakeup.c @@ -843,7 +843,6 @@ static int print_wakeup_source_stats(struct seq_file *m, unsigned long active_count; ktime_t active_time; ktime_t prevent_sleep_time; - int ret; spin_lock_irqsave(&ws->lock, flags); @@ -866,17 +865,16 @@ static int print_wakeup_source_stats(struct seq_file *m, active_time = ktime_set(0, 0); } - ret = seq_printf(m, "%-12s\t%lu\t\t%lu\t\t%lu\t\t%lu\t\t" - "%lld\t\t%lld\t\t%lld\t\t%lld\t\t%lld\n", - ws->name, active_count, ws->event_count, - ws->wakeup_count, ws->expire_count, - ktime_to_ms(active_time), ktime_to_ms(total_time), - ktime_to_ms(max_time), ktime_to_ms(ws->last_time), - ktime_to_ms(prevent_sleep_time)); + seq_printf(m, "%-12s\t%lu\t\t%lu\t\t%lu\t\t%lu\t\t%lld\t\t%lld\t\t%lld\t\t%lld\t\t%lld\n", + ws->name, active_count, ws->event_count, + ws->wakeup_count, ws->expire_count, + ktime_to_ms(active_time), ktime_to_ms(total_time), + ktime_to_ms(max_time), ktime_to_ms(ws->last_time), + ktime_to_ms(prevent_sleep_time)); spin_unlock_irqrestore(&ws->lock, flags); - return ret; + return 0; } /** -- cgit v1.2.3 From 4395eb1f16cc55406fe3de4546134fc61253a06b Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 15 Apr 2015 16:17:51 -0700 Subject: rtc: remove use of seq_printf return value The seq_printf return value, because it's frequently misused, will eventually be converted to void. See: commit 1f33c41c03da ("seq_file: Rename seq_overflow() to seq_has_overflowed() and make public") Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-cmos.c | 36 +++++++++++++++++++----------------- drivers/rtc/rtc-ds1305.c | 6 +++--- drivers/rtc/rtc-mrst.c | 16 +++++++++------- drivers/rtc/rtc-tegra.c | 4 +++- 4 files changed, 34 insertions(+), 28 deletions(-) diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c index 5b2e76159b41..87647f459198 100644 --- a/drivers/rtc/rtc-cmos.c +++ b/drivers/rtc/rtc-cmos.c @@ -459,23 +459,25 @@ static int cmos_procfs(struct device *dev, struct seq_file *seq) /* NOTE: at least ICH6 reports battery status using a different * (non-RTC) bit; and SQWE is ignored on many current systems. */ - return seq_printf(seq, - "periodic_IRQ\t: %s\n" - "update_IRQ\t: %s\n" - "HPET_emulated\t: %s\n" - // "square_wave\t: %s\n" - "BCD\t\t: %s\n" - "DST_enable\t: %s\n" - "periodic_freq\t: %d\n" - "batt_status\t: %s\n", - (rtc_control & RTC_PIE) ? "yes" : "no", - (rtc_control & RTC_UIE) ? "yes" : "no", - is_hpet_enabled() ? "yes" : "no", - // (rtc_control & RTC_SQWE) ? "yes" : "no", - (rtc_control & RTC_DM_BINARY) ? "no" : "yes", - (rtc_control & RTC_DST_EN) ? "yes" : "no", - cmos->rtc->irq_freq, - (valid & RTC_VRT) ? "okay" : "dead"); + seq_printf(seq, + "periodic_IRQ\t: %s\n" + "update_IRQ\t: %s\n" + "HPET_emulated\t: %s\n" + // "square_wave\t: %s\n" + "BCD\t\t: %s\n" + "DST_enable\t: %s\n" + "periodic_freq\t: %d\n" + "batt_status\t: %s\n", + (rtc_control & RTC_PIE) ? "yes" : "no", + (rtc_control & RTC_UIE) ? "yes" : "no", + is_hpet_enabled() ? "yes" : "no", + // (rtc_control & RTC_SQWE) ? "yes" : "no", + (rtc_control & RTC_DM_BINARY) ? "no" : "yes", + (rtc_control & RTC_DST_EN) ? 
"yes" : "no", + cmos->rtc->irq_freq, + (valid & RTC_VRT) ? "okay" : "dead"); + + return 0; } #else diff --git a/drivers/rtc/rtc-ds1305.c b/drivers/rtc/rtc-ds1305.c index 129add77065d..12b07158a366 100644 --- a/drivers/rtc/rtc-ds1305.c +++ b/drivers/rtc/rtc-ds1305.c @@ -434,9 +434,9 @@ static int ds1305_proc(struct device *dev, struct seq_file *seq) } done: - return seq_printf(seq, - "trickle_charge\t: %s%s\n", - diodes, resistors); + seq_printf(seq, "trickle_charge\t: %s%s\n", diodes, resistors); + + return 0; } #else diff --git a/drivers/rtc/rtc-mrst.c b/drivers/rtc/rtc-mrst.c index 3a6fd3a8a2ec..548ea6f6f384 100644 --- a/drivers/rtc/rtc-mrst.c +++ b/drivers/rtc/rtc-mrst.c @@ -277,13 +277,15 @@ static int mrst_procfs(struct device *dev, struct seq_file *seq) valid = vrtc_cmos_read(RTC_VALID); spin_unlock_irq(&rtc_lock); - return seq_printf(seq, - "periodic_IRQ\t: %s\n" - "alarm\t\t: %s\n" - "BCD\t\t: no\n" - "periodic_freq\t: daily (not adjustable)\n", - (rtc_control & RTC_PIE) ? "on" : "off", - (rtc_control & RTC_AIE) ? "on" : "off"); + seq_printf(seq, + "periodic_IRQ\t: %s\n" + "alarm\t\t: %s\n" + "BCD\t\t: no\n" + "periodic_freq\t: daily (not adjustable)\n", + (rtc_control & RTC_PIE) ? "on" : "off", + (rtc_control & RTC_AIE) ? "on" : "off"); + + return 0; } #else diff --git a/drivers/rtc/rtc-tegra.c b/drivers/rtc/rtc-tegra.c index d948277057d8..60232bd366ef 100644 --- a/drivers/rtc/rtc-tegra.c +++ b/drivers/rtc/rtc-tegra.c @@ -261,7 +261,9 @@ static int tegra_rtc_proc(struct device *dev, struct seq_file *seq) if (!dev || !dev->driver) return 0; - return seq_printf(seq, "name\t\t: %s\n", dev_name(dev)); + seq_printf(seq, "name\t\t: %s\n", dev_name(dev)); + + return 0; } static irqreturn_t tegra_rtc_irq_handler(int irq, void *data) -- cgit v1.2.3 From 7f032d6ef6154868a2a5d5f6b2c3f8587292196c Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 15 Apr 2015 16:17:54 -0700 Subject: ipc: remove use of seq_printf return value The seq_printf return value, because it's frequently misused, will eventually be converted to void. 
See: commit 1f33c41c03da ("seq_file: Rename seq_overflow() to seq_has_overflowed() and make public") Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/msg.c | 34 ++++++++++++++++++---------------- ipc/sem.c | 26 ++++++++++++++------------ ipc/shm.c | 42 ++++++++++++++++++++++-------------------- ipc/util.c | 6 ++++-- 4 files changed, 58 insertions(+), 50 deletions(-) diff --git a/ipc/msg.c b/ipc/msg.c index a7261d5cbc89..2b6fdbb9e0e9 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -1015,22 +1015,24 @@ static int sysvipc_msg_proc_show(struct seq_file *s, void *it) struct user_namespace *user_ns = seq_user_ns(s); struct msg_queue *msq = it; - return seq_printf(s, - "%10d %10d %4o %10lu %10lu %5u %5u %5u %5u %5u %5u %10lu %10lu %10lu\n", - msq->q_perm.key, - msq->q_perm.id, - msq->q_perm.mode, - msq->q_cbytes, - msq->q_qnum, - msq->q_lspid, - msq->q_lrpid, - from_kuid_munged(user_ns, msq->q_perm.uid), - from_kgid_munged(user_ns, msq->q_perm.gid), - from_kuid_munged(user_ns, msq->q_perm.cuid), - from_kgid_munged(user_ns, msq->q_perm.cgid), - msq->q_stime, - msq->q_rtime, - msq->q_ctime); + seq_printf(s, + "%10d %10d %4o %10lu %10lu %5u %5u %5u %5u %5u %5u %10lu %10lu %10lu\n", + msq->q_perm.key, + msq->q_perm.id, + msq->q_perm.mode, + msq->q_cbytes, + msq->q_qnum, + msq->q_lspid, + msq->q_lrpid, + from_kuid_munged(user_ns, msq->q_perm.uid), + from_kgid_munged(user_ns, msq->q_perm.gid), + from_kuid_munged(user_ns, msq->q_perm.cuid), + from_kgid_munged(user_ns, msq->q_perm.cgid), + msq->q_stime, + msq->q_rtime, + msq->q_ctime); + + return 0; } #endif diff --git a/ipc/sem.c b/ipc/sem.c index 92842113c6a9..d1a6edd17eba 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -2170,17 +2170,19 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it) sem_otime = get_semotime(sma); - return seq_printf(s, - "%10d %10d %4o %10u %5u %5u %5u %5u %10lu %10lu\n", - sma->sem_perm.key, - sma->sem_perm.id, - sma->sem_perm.mode, - sma->sem_nsems, - from_kuid_munged(user_ns, sma->sem_perm.uid), - from_kgid_munged(user_ns, sma->sem_perm.gid), - from_kuid_munged(user_ns, sma->sem_perm.cuid), - from_kgid_munged(user_ns, sma->sem_perm.cgid), - sem_otime, - sma->sem_ctime); + seq_printf(s, + "%10d %10d %4o %10u %5u %5u %5u %5u %10lu %10lu\n", + sma->sem_perm.key, + sma->sem_perm.id, + sma->sem_perm.mode, + sma->sem_nsems, + from_kuid_munged(user_ns, sma->sem_perm.uid), + from_kgid_munged(user_ns, sma->sem_perm.gid), + from_kuid_munged(user_ns, sma->sem_perm.cuid), + from_kgid_munged(user_ns, sma->sem_perm.cgid), + sem_otime, + sma->sem_ctime); + + return 0; } #endif diff --git a/ipc/shm.c b/ipc/shm.c index 19633b4a2350..d280a74af2ef 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -1342,25 +1342,27 @@ static int sysvipc_shm_proc_show(struct seq_file *s, void *it) #define SIZE_SPEC "%21lu" #endif - return seq_printf(s, - "%10d %10d %4o " SIZE_SPEC " %5u %5u " - "%5lu %5u %5u %5u %5u %10lu %10lu %10lu " - SIZE_SPEC " " SIZE_SPEC "\n", - shp->shm_perm.key, - shp->shm_perm.id, - shp->shm_perm.mode, - shp->shm_segsz, - shp->shm_cprid, - shp->shm_lprid, - shp->shm_nattch, - from_kuid_munged(user_ns, shp->shm_perm.uid), - from_kgid_munged(user_ns, shp->shm_perm.gid), - from_kuid_munged(user_ns, shp->shm_perm.cuid), - from_kgid_munged(user_ns, shp->shm_perm.cgid), - shp->shm_atim, - shp->shm_dtim, - shp->shm_ctim, - rss * PAGE_SIZE, - swp * PAGE_SIZE); + seq_printf(s, + "%10d %10d %4o " SIZE_SPEC " %5u %5u " + "%5lu %5u %5u %5u %5u %10lu %10lu %10lu " + SIZE_SPEC " " SIZE_SPEC "\n", + 
shp->shm_perm.key, + shp->shm_perm.id, + shp->shm_perm.mode, + shp->shm_segsz, + shp->shm_cprid, + shp->shm_lprid, + shp->shm_nattch, + from_kuid_munged(user_ns, shp->shm_perm.uid), + from_kgid_munged(user_ns, shp->shm_perm.gid), + from_kuid_munged(user_ns, shp->shm_perm.cuid), + from_kgid_munged(user_ns, shp->shm_perm.cgid), + shp->shm_atim, + shp->shm_dtim, + shp->shm_ctim, + rss * PAGE_SIZE, + swp * PAGE_SIZE); + + return 0; } #endif diff --git a/ipc/util.c b/ipc/util.c index 106bed0378ab..ff3323ef8d8b 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -837,8 +837,10 @@ static int sysvipc_proc_show(struct seq_file *s, void *it) struct ipc_proc_iter *iter = s->private; struct ipc_proc_iface *iface = iter->iface; - if (it == SEQ_START_TOKEN) - return seq_puts(s, iface->header); + if (it == SEQ_START_TOKEN) { + seq_puts(s, iface->header); + return 0; + } return iface->show(s, it); } -- cgit v1.2.3 From 81f0cd97aaf20b0df97ec888d90aa46f5279282b Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 15 Apr 2015 16:17:57 -0700 Subject: microblaze: mb: remove use of seq_printf return value The seq_printf return value, because it's frequently misused, will eventually be converted to void. See: commit 1f33c41c03da ("seq_file: Rename seq_overflow() to seq_has_overflowed() and make public") Signed-off-by: Joe Perches Acked-by: Michal Simek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/microblaze/kernel/cpu/mb.c | 149 ++++++++++++++++++++-------------------- 1 file changed, 73 insertions(+), 76 deletions(-) diff --git a/arch/microblaze/kernel/cpu/mb.c b/arch/microblaze/kernel/cpu/mb.c index 7b5dca7ed39d..9581d194d9e4 100644 --- a/arch/microblaze/kernel/cpu/mb.c +++ b/arch/microblaze/kernel/cpu/mb.c @@ -27,7 +27,6 @@ static int show_cpuinfo(struct seq_file *m, void *v) { - int count = 0; char *fpga_family = "Unknown"; char *cpu_ver = "Unknown"; int i; @@ -48,91 +47,89 @@ static int show_cpuinfo(struct seq_file *m, void *v) } } - count = seq_printf(m, - "CPU-Family: MicroBlaze\n" - "FPGA-Arch: %s\n" - "CPU-Ver: %s, %s endian\n" - "CPU-MHz: %d.%02d\n" - "BogoMips: %lu.%02lu\n", - fpga_family, - cpu_ver, - cpuinfo.endian ? "little" : "big", - cpuinfo.cpu_clock_freq / - 1000000, - cpuinfo.cpu_clock_freq % - 1000000, - loops_per_jiffy / (500000 / HZ), - (loops_per_jiffy / (5000 / HZ)) % 100); - - count += seq_printf(m, - "HW:\n Shift:\t\t%s\n" - " MSR:\t\t%s\n" - " PCMP:\t\t%s\n" - " DIV:\t\t%s\n", - (cpuinfo.use_instr & PVR0_USE_BARREL_MASK) ? "yes" : "no", - (cpuinfo.use_instr & PVR2_USE_MSR_INSTR) ? "yes" : "no", - (cpuinfo.use_instr & PVR2_USE_PCMP_INSTR) ? "yes" : "no", - (cpuinfo.use_instr & PVR0_USE_DIV_MASK) ? "yes" : "no"); - - count += seq_printf(m, - " MMU:\t\t%x\n", - cpuinfo.mmu); - - count += seq_printf(m, - " MUL:\t\t%s\n" - " FPU:\t\t%s\n", - (cpuinfo.use_mult & PVR2_USE_MUL64_MASK) ? "v2" : - (cpuinfo.use_mult & PVR0_USE_HW_MUL_MASK) ? "v1" : "no", - (cpuinfo.use_fpu & PVR2_USE_FPU2_MASK) ? "v2" : - (cpuinfo.use_fpu & PVR0_USE_FPU_MASK) ? "v1" : "no"); - - count += seq_printf(m, - " Exc:\t\t%s%s%s%s%s%s%s%s\n", - (cpuinfo.use_exc & PVR2_OPCODE_0x0_ILL_MASK) ? "op0x0 " : "", - (cpuinfo.use_exc & PVR2_UNALIGNED_EXC_MASK) ? "unal " : "", - (cpuinfo.use_exc & PVR2_ILL_OPCODE_EXC_MASK) ? "ill " : "", - (cpuinfo.use_exc & PVR2_IOPB_BUS_EXC_MASK) ? "iopb " : "", - (cpuinfo.use_exc & PVR2_DOPB_BUS_EXC_MASK) ? "dopb " : "", - (cpuinfo.use_exc & PVR2_DIV_ZERO_EXC_MASK) ? "zero " : "", - (cpuinfo.use_exc & PVR2_FPU_EXC_MASK) ? "fpu " : "", - (cpuinfo.use_exc & PVR2_USE_FSL_EXC) ? 
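A side effect of the conversion worth noting: strings with no format specifiers drop to seq_puts(), as in this pair adapted from the hunk below:

  seq_printf(m, "Icache:\t\tno\n");  /* old: parses a format string for nothing */
  seq_puts(m, "Icache:\t\tno\n");    /* new: straight copy, no format parsing */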
"fsl " : ""); - - count += seq_printf(m, - "Stream-insns:\t%sprivileged\n", - cpuinfo.mmu_privins ? "un" : ""); + seq_printf(m, + "CPU-Family: MicroBlaze\n" + "FPGA-Arch: %s\n" + "CPU-Ver: %s, %s endian\n" + "CPU-MHz: %d.%02d\n" + "BogoMips: %lu.%02lu\n", + fpga_family, + cpu_ver, + cpuinfo.endian ? "little" : "big", + cpuinfo.cpu_clock_freq / 1000000, + cpuinfo.cpu_clock_freq % 1000000, + loops_per_jiffy / (500000 / HZ), + (loops_per_jiffy / (5000 / HZ)) % 100); + + seq_printf(m, + "HW:\n Shift:\t\t%s\n" + " MSR:\t\t%s\n" + " PCMP:\t\t%s\n" + " DIV:\t\t%s\n", + (cpuinfo.use_instr & PVR0_USE_BARREL_MASK) ? "yes" : "no", + (cpuinfo.use_instr & PVR2_USE_MSR_INSTR) ? "yes" : "no", + (cpuinfo.use_instr & PVR2_USE_PCMP_INSTR) ? "yes" : "no", + (cpuinfo.use_instr & PVR0_USE_DIV_MASK) ? "yes" : "no"); + + seq_printf(m, " MMU:\t\t%x\n", cpuinfo.mmu); + + seq_printf(m, + " MUL:\t\t%s\n" + " FPU:\t\t%s\n", + (cpuinfo.use_mult & PVR2_USE_MUL64_MASK) ? "v2" : + (cpuinfo.use_mult & PVR0_USE_HW_MUL_MASK) ? "v1" : "no", + (cpuinfo.use_fpu & PVR2_USE_FPU2_MASK) ? "v2" : + (cpuinfo.use_fpu & PVR0_USE_FPU_MASK) ? "v1" : "no"); + + seq_printf(m, + " Exc:\t\t%s%s%s%s%s%s%s%s\n", + (cpuinfo.use_exc & PVR2_OPCODE_0x0_ILL_MASK) ? "op0x0 " : "", + (cpuinfo.use_exc & PVR2_UNALIGNED_EXC_MASK) ? "unal " : "", + (cpuinfo.use_exc & PVR2_ILL_OPCODE_EXC_MASK) ? "ill " : "", + (cpuinfo.use_exc & PVR2_IOPB_BUS_EXC_MASK) ? "iopb " : "", + (cpuinfo.use_exc & PVR2_DOPB_BUS_EXC_MASK) ? "dopb " : "", + (cpuinfo.use_exc & PVR2_DIV_ZERO_EXC_MASK) ? "zero " : "", + (cpuinfo.use_exc & PVR2_FPU_EXC_MASK) ? "fpu " : "", + (cpuinfo.use_exc & PVR2_USE_FSL_EXC) ? "fsl " : ""); + + seq_printf(m, + "Stream-insns:\t%sprivileged\n", + cpuinfo.mmu_privins ? "un" : ""); if (cpuinfo.use_icache) - count += seq_printf(m, - "Icache:\t\t%ukB\tline length:\t%dB\n", - cpuinfo.icache_size >> 10, - cpuinfo.icache_line_length); + seq_printf(m, + "Icache:\t\t%ukB\tline length:\t%dB\n", + cpuinfo.icache_size >> 10, + cpuinfo.icache_line_length); else - count += seq_printf(m, "Icache:\t\tno\n"); + seq_puts(m, "Icache:\t\tno\n"); if (cpuinfo.use_dcache) { - count += seq_printf(m, - "Dcache:\t\t%ukB\tline length:\t%dB\n", - cpuinfo.dcache_size >> 10, - cpuinfo.dcache_line_length); - seq_printf(m, "Dcache-Policy:\t"); + seq_printf(m, + "Dcache:\t\t%ukB\tline length:\t%dB\n", + cpuinfo.dcache_size >> 10, + cpuinfo.dcache_line_length); + seq_puts(m, "Dcache-Policy:\t"); if (cpuinfo.dcache_wb) - count += seq_printf(m, "write-back\n"); + seq_puts(m, "write-back\n"); else - count += seq_printf(m, "write-through\n"); - } else - count += seq_printf(m, "Dcache:\t\tno\n"); + seq_puts(m, "write-through\n"); + } else { + seq_puts(m, "Dcache:\t\tno\n"); + } + + seq_printf(m, + "HW-Debug:\t%s\n", + cpuinfo.hw_debug ? "yes" : "no"); - count += seq_printf(m, - "HW-Debug:\t%s\n", - cpuinfo.hw_debug ? "yes" : "no"); + seq_printf(m, + "PVR-USR1:\t%02x\n" + "PVR-USR2:\t%08x\n", + cpuinfo.pvr_user1, + cpuinfo.pvr_user2); - count += seq_printf(m, - "PVR-USR1:\t%02x\n" - "PVR-USR2:\t%08x\n", - cpuinfo.pvr_user1, - cpuinfo.pvr_user2); + seq_printf(m, "Page size:\t%lu\n", PAGE_SIZE); - count += seq_printf(m, "Page size:\t%lu\n", PAGE_SIZE); return 0; } -- cgit v1.2.3 From 4122669e5266a2af8a84a499cf7821786ed7d2cf Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 15 Apr 2015 16:18:00 -0700 Subject: nios2: cpuinfo: remove use of seq_printf return value The seq_printf return value, because it's frequently misused, will eventually be converted to void. 
See: commit 1f33c41c03da ("seq_file: Rename seq_overflow() to seq_has_overflowed() and make public") Signed-off-by: Joe Perches Cc: Ley Foon Tan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/nios2/kernel/cpuinfo.c | 77 ++++++++++++++++++++++----------------------- 1 file changed, 38 insertions(+), 39 deletions(-) diff --git a/arch/nios2/kernel/cpuinfo.c b/arch/nios2/kernel/cpuinfo.c index a223691dff4f..1d96de0bd4aa 100644 --- a/arch/nios2/kernel/cpuinfo.c +++ b/arch/nios2/kernel/cpuinfo.c @@ -126,47 +126,46 @@ void __init setup_cpuinfo(void) */ static int show_cpuinfo(struct seq_file *m, void *v) { - int count = 0; const u32 clockfreq = cpuinfo.cpu_clock_freq; - count = seq_printf(m, - "CPU:\t\tNios II/%s\n" - "MMU:\t\t%s\n" - "FPU:\t\tnone\n" - "Clocking:\t%u.%02u MHz\n" - "BogoMips:\t%lu.%02lu\n" - "Calibration:\t%lu loops\n", - cpuinfo.cpu_impl, - cpuinfo.mmu ? "present" : "none", - clockfreq / 1000000, (clockfreq / 100000) % 10, - (loops_per_jiffy * HZ) / 500000, - ((loops_per_jiffy * HZ) / 5000) % 100, - (loops_per_jiffy * HZ)); - - count += seq_printf(m, - "HW:\n" - " MUL:\t\t%s\n" - " MULX:\t\t%s\n" - " DIV:\t\t%s\n", - cpuinfo.has_mul ? "yes" : "no", - cpuinfo.has_mulx ? "yes" : "no", - cpuinfo.has_div ? "yes" : "no"); - - count += seq_printf(m, - "Icache:\t\t%ukB, line length: %u\n", - cpuinfo.icache_size >> 10, - cpuinfo.icache_line_size); - - count += seq_printf(m, - "Dcache:\t\t%ukB, line length: %u\n", - cpuinfo.dcache_size >> 10, - cpuinfo.dcache_line_size); - - count += seq_printf(m, - "TLB:\t\t%u ways, %u entries, %u PID bits\n", - cpuinfo.tlb_num_ways, - cpuinfo.tlb_num_entries, - cpuinfo.tlb_pid_num_bits); + seq_printf(m, + "CPU:\t\tNios II/%s\n" + "MMU:\t\t%s\n" + "FPU:\t\tnone\n" + "Clocking:\t%u.%02u MHz\n" + "BogoMips:\t%lu.%02lu\n" + "Calibration:\t%lu loops\n", + cpuinfo.cpu_impl, + cpuinfo.mmu ? "present" : "none", + clockfreq / 1000000, (clockfreq / 100000) % 10, + (loops_per_jiffy * HZ) / 500000, + ((loops_per_jiffy * HZ) / 5000) % 100, + (loops_per_jiffy * HZ)); + + seq_printf(m, + "HW:\n" + " MUL:\t\t%s\n" + " MULX:\t\t%s\n" + " DIV:\t\t%s\n", + cpuinfo.has_mul ? "yes" : "no", + cpuinfo.has_mulx ? "yes" : "no", + cpuinfo.has_div ? "yes" : "no"); + + seq_printf(m, + "Icache:\t\t%ukB, line length: %u\n", + cpuinfo.icache_size >> 10, + cpuinfo.icache_line_size); + + seq_printf(m, + "Dcache:\t\t%ukB, line length: %u\n", + cpuinfo.dcache_size >> 10, + cpuinfo.dcache_line_size); + + seq_printf(m, + "TLB:\t\t%u ways, %u entries, %u PID bits\n", + cpuinfo.tlb_num_ways, + cpuinfo.tlb_num_entries, + cpuinfo.tlb_pid_num_bits); return 0; } -- cgit v1.2.3 From cd2b2937c6ae7f8d562d7e08e06da70e778d0323 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 15 Apr 2015 16:18:02 -0700 Subject: ARM: plat-pxa: remove use of seq_printf return value The seq_printf return value, because it's frequently misused, (as it is here, it doesn't return # of chars emitted) will eventually be converted to void. 
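The parenthetical is the key point: this code accumulated the return value as if it were a character count, which it never was. A sketch of the broken pattern (n is a placeholder):

  /* seq_printf() returned 0 on success or -1 on overflow - never the
   * number of characters emitted - so the accumulator was meaningless.
   */
  int pos = 0;
  pos += seq_printf(s, "DMA engine status\n");        /* adds 0 or -1 */
  pos += seq_printf(s, "\tChannel number: %d\n", n);  /* not a length */
  return pos;  /* 0 or a small negative value, never a byte count */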
See: commit 1f33c41c03da ("seq_file: Rename seq_overflow() to seq_has_overflowed() and make public") Signed-off-by: Joe Perches Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/plat-pxa/dma.c | 111 +++++++++++++++++++++++------------------------- 1 file changed, 53 insertions(+), 58 deletions(-) diff --git a/arch/arm/plat-pxa/dma.c b/arch/arm/plat-pxa/dma.c index 054fc5a1a11c..d92f07f6ecfb 100644 --- a/arch/arm/plat-pxa/dma.c +++ b/arch/arm/plat-pxa/dma.c @@ -51,19 +51,19 @@ static struct dentry *dbgfs_root, *dbgfs_state, **dbgfs_chan; static int dbg_show_requester_chan(struct seq_file *s, void *p) { - int pos = 0; int chan = (int)s->private; int i; u32 drcmr; - pos += seq_printf(s, "DMA channel %d requesters list :\n", chan); + seq_printf(s, "DMA channel %d requesters list :\n", chan); for (i = 0; i < DMA_MAX_REQUESTERS; i++) { drcmr = DRCMR(i); if ((drcmr & DRCMR_CHLNUM) == chan) - pos += seq_printf(s, "\tRequester %d (MAPVLD=%d)\n", i, - !!(drcmr & DRCMR_MAPVLD)); + seq_printf(s, "\tRequester %d (MAPVLD=%d)\n", + i, !!(drcmr & DRCMR_MAPVLD)); } - return pos; + + return 0; } static inline int dbg_burst_from_dcmd(u32 dcmd) @@ -83,7 +83,6 @@ static int is_phys_valid(unsigned long addr) static int dbg_show_descriptors(struct seq_file *s, void *p) { - int pos = 0; int chan = (int)s->private; int i, max_show = 20, burst, width; u32 dcmd; @@ -94,44 +93,43 @@ static int dbg_show_descriptors(struct seq_file *s, void *p) spin_lock_irqsave(&dma_channels[chan].lock, flags); phys_desc = DDADR(chan); - pos += seq_printf(s, "DMA channel %d descriptors :\n", chan); - pos += seq_printf(s, "[%03d] First descriptor unknown\n", 0); + seq_printf(s, "DMA channel %d descriptors :\n", chan); + seq_printf(s, "[%03d] First descriptor unknown\n", 0); for (i = 1; i < max_show && is_phys_valid(phys_desc); i++) { desc = phys_to_virt(phys_desc); dcmd = desc->dcmd; burst = dbg_burst_from_dcmd(dcmd); width = (1 << ((dcmd >> 14) & 0x3)) >> 1; - pos += seq_printf(s, "[%03d] Desc at %08lx(virt %p)\n", - i, phys_desc, desc); - pos += seq_printf(s, "\tDDADR = %08x\n", desc->ddadr); - pos += seq_printf(s, "\tDSADR = %08x\n", desc->dsadr); - pos += seq_printf(s, "\tDTADR = %08x\n", desc->dtadr); - pos += seq_printf(s, "\tDCMD = %08x (%s%s%s%s%s%s%sburst=%d" - " width=%d len=%d)\n", - dcmd, - DCMD_STR(INCSRCADDR), DCMD_STR(INCTRGADDR), - DCMD_STR(FLOWSRC), DCMD_STR(FLOWTRG), - DCMD_STR(STARTIRQEN), DCMD_STR(ENDIRQEN), - DCMD_STR(ENDIAN), burst, width, - dcmd & DCMD_LENGTH); + seq_printf(s, "[%03d] Desc at %08lx(virt %p)\n", + i, phys_desc, desc); + seq_printf(s, "\tDDADR = %08x\n", desc->ddadr); + seq_printf(s, "\tDSADR = %08x\n", desc->dsadr); + seq_printf(s, "\tDTADR = %08x\n", desc->dtadr); + seq_printf(s, "\tDCMD = %08x (%s%s%s%s%s%s%sburst=%d width=%d len=%d)\n", + dcmd, + DCMD_STR(INCSRCADDR), DCMD_STR(INCTRGADDR), + DCMD_STR(FLOWSRC), DCMD_STR(FLOWTRG), + DCMD_STR(STARTIRQEN), DCMD_STR(ENDIRQEN), + DCMD_STR(ENDIAN), burst, width, + dcmd & DCMD_LENGTH); phys_desc = desc->ddadr; } if (i == max_show) - pos += seq_printf(s, "[%03d] Desc at %08lx ... max display reached\n", - i, phys_desc); + seq_printf(s, "[%03d] Desc at %08lx ... max display reached\n", + i, phys_desc); else - pos += seq_printf(s, "[%03d] Desc at %08lx is %s\n", - i, phys_desc, phys_desc == DDADR_STOP ? - "DDADR_STOP" : "invalid"); + seq_printf(s, "[%03d] Desc at %08lx is %s\n", + i, phys_desc, phys_desc == DDADR_STOP ? 
+ "DDADR_STOP" : "invalid"); spin_unlock_irqrestore(&dma_channels[chan].lock, flags); - return pos; + + return 0; } static int dbg_show_chan_state(struct seq_file *s, void *p) { - int pos = 0; int chan = (int)s->private; u32 dcsr, dcmd; int burst, width; @@ -142,42 +140,39 @@ static int dbg_show_chan_state(struct seq_file *s, void *p) burst = dbg_burst_from_dcmd(dcmd); width = (1 << ((dcmd >> 14) & 0x3)) >> 1; - pos += seq_printf(s, "DMA channel %d\n", chan); - pos += seq_printf(s, "\tPriority : %s\n", - str_prio[dma_channels[chan].prio]); - pos += seq_printf(s, "\tUnaligned transfer bit: %s\n", - DALGN & (1 << chan) ? "yes" : "no"); - pos += seq_printf(s, "\tDCSR = %08x (%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s)\n", - dcsr, DCSR_STR(RUN), DCSR_STR(NODESC), - DCSR_STR(STOPIRQEN), DCSR_STR(EORIRQEN), - DCSR_STR(EORJMPEN), DCSR_STR(EORSTOPEN), - DCSR_STR(SETCMPST), DCSR_STR(CLRCMPST), - DCSR_STR(CMPST), DCSR_STR(EORINTR), DCSR_STR(REQPEND), - DCSR_STR(STOPSTATE), DCSR_STR(ENDINTR), - DCSR_STR(STARTINTR), DCSR_STR(BUSERR)); - - pos += seq_printf(s, "\tDCMD = %08x (%s%s%s%s%s%s%sburst=%d width=%d" - " len=%d)\n", - dcmd, - DCMD_STR(INCSRCADDR), DCMD_STR(INCTRGADDR), - DCMD_STR(FLOWSRC), DCMD_STR(FLOWTRG), - DCMD_STR(STARTIRQEN), DCMD_STR(ENDIRQEN), - DCMD_STR(ENDIAN), burst, width, dcmd & DCMD_LENGTH); - pos += seq_printf(s, "\tDSADR = %08x\n", DSADR(chan)); - pos += seq_printf(s, "\tDTADR = %08x\n", DTADR(chan)); - pos += seq_printf(s, "\tDDADR = %08x\n", DDADR(chan)); - return pos; + seq_printf(s, "DMA channel %d\n", chan); + seq_printf(s, "\tPriority : %s\n", str_prio[dma_channels[chan].prio]); + seq_printf(s, "\tUnaligned transfer bit: %s\n", + DALGN & (1 << chan) ? "yes" : "no"); + seq_printf(s, "\tDCSR = %08x (%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s)\n", + dcsr, DCSR_STR(RUN), DCSR_STR(NODESC), + DCSR_STR(STOPIRQEN), DCSR_STR(EORIRQEN), + DCSR_STR(EORJMPEN), DCSR_STR(EORSTOPEN), + DCSR_STR(SETCMPST), DCSR_STR(CLRCMPST), + DCSR_STR(CMPST), DCSR_STR(EORINTR), DCSR_STR(REQPEND), + DCSR_STR(STOPSTATE), DCSR_STR(ENDINTR), + DCSR_STR(STARTINTR), DCSR_STR(BUSERR)); + + seq_printf(s, "\tDCMD = %08x (%s%s%s%s%s%s%sburst=%d width=%d len=%d)\n", + dcmd, + DCMD_STR(INCSRCADDR), DCMD_STR(INCTRGADDR), + DCMD_STR(FLOWSRC), DCMD_STR(FLOWTRG), + DCMD_STR(STARTIRQEN), DCMD_STR(ENDIRQEN), + DCMD_STR(ENDIAN), burst, width, dcmd & DCMD_LENGTH); + seq_printf(s, "\tDSADR = %08x\n", DSADR(chan)); + seq_printf(s, "\tDTADR = %08x\n", DTADR(chan)); + seq_printf(s, "\tDDADR = %08x\n", DDADR(chan)); + + return 0; } static int dbg_show_state(struct seq_file *s, void *p) { - int pos = 0; - /* basic device status */ - pos += seq_printf(s, "DMA engine status\n"); - pos += seq_printf(s, "\tChannel number: %d\n", num_dma_channels); + seq_puts(s, "DMA engine status\n"); + seq_printf(s, "\tChannel number: %d\n", num_dma_channels); - return pos; + return 0; } #define DBGFS_FUNC_DECL(name) \ -- cgit v1.2.3 From 58a1aa7c83f4ee73d96efd50c4c16ebbeb64622e Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 15 Apr 2015 16:18:05 -0700 Subject: openrisc: remove use of seq_printf return value The seq_printf return value, because it's frequently misused, will eventually be converted to void. 
See: commit 1f33c41c03da ("seq_file: Rename seq_overflow() to seq_has_overflowed() and make public") Signed-off-by: Joe Perches Cc: Jonas Bonn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/openrisc/kernel/setup.c | 50 +++++++++++++++++++++++--------------------- 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/arch/openrisc/kernel/setup.c b/arch/openrisc/kernel/setup.c index 4fc7ccc0a2cf..b4ed8b36e078 100644 --- a/arch/openrisc/kernel/setup.c +++ b/arch/openrisc/kernel/setup.c @@ -329,30 +329,32 @@ static int show_cpuinfo(struct seq_file *m, void *v) version = (vr & SPR_VR_VER) >> 24; revision = vr & SPR_VR_REV; - return seq_printf(m, - "cpu\t\t: OpenRISC-%x\n" - "revision\t: %d\n" - "frequency\t: %ld\n" - "dcache size\t: %d bytes\n" - "dcache block size\t: %d bytes\n" - "icache size\t: %d bytes\n" - "icache block size\t: %d bytes\n" - "immu\t\t: %d entries, %lu ways\n" - "dmmu\t\t: %d entries, %lu ways\n" - "bogomips\t: %lu.%02lu\n", - version, - revision, - loops_per_jiffy * HZ, - cpuinfo.dcache_size, - cpuinfo.dcache_block_size, - cpuinfo.icache_size, - cpuinfo.icache_block_size, - 1 << ((mfspr(SPR_DMMUCFGR) & SPR_DMMUCFGR_NTS) >> 2), - 1 + (mfspr(SPR_DMMUCFGR) & SPR_DMMUCFGR_NTW), - 1 << ((mfspr(SPR_IMMUCFGR) & SPR_IMMUCFGR_NTS) >> 2), - 1 + (mfspr(SPR_IMMUCFGR) & SPR_IMMUCFGR_NTW), - (loops_per_jiffy * HZ) / 500000, - ((loops_per_jiffy * HZ) / 5000) % 100); + seq_printf(m, + "cpu\t\t: OpenRISC-%x\n" + "revision\t: %d\n" + "frequency\t: %ld\n" + "dcache size\t: %d bytes\n" + "dcache block size\t: %d bytes\n" + "icache size\t: %d bytes\n" + "icache block size\t: %d bytes\n" + "immu\t\t: %d entries, %lu ways\n" + "dmmu\t\t: %d entries, %lu ways\n" + "bogomips\t: %lu.%02lu\n", + version, + revision, + loops_per_jiffy * HZ, + cpuinfo.dcache_size, + cpuinfo.dcache_block_size, + cpuinfo.icache_size, + cpuinfo.icache_block_size, + 1 << ((mfspr(SPR_DMMUCFGR) & SPR_DMMUCFGR_NTS) >> 2), + 1 + (mfspr(SPR_DMMUCFGR) & SPR_DMMUCFGR_NTW), + 1 << ((mfspr(SPR_IMMUCFGR) & SPR_IMMUCFGR_NTS) >> 2), + 1 + (mfspr(SPR_IMMUCFGR) & SPR_IMMUCFGR_NTW), + (loops_per_jiffy * HZ) / 500000, + ((loops_per_jiffy * HZ) / 5000) % 100); + + return 0; } static void *c_start(struct seq_file *m, loff_t * pos) -- cgit v1.2.3 From 1336d4221dee528b5a0e92d2b1322790c4df1b53 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 15 Apr 2015 16:18:08 -0700 Subject: cris: remove use of seq_printf return value The seq_printf return value, because it's frequently misused, will eventually be converted to void. 
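Where a callback tested the return value to bail out on seq_file overflow - as the cris fasttimer code further down did - the idiomatic replacement queries the seq_file instead (fmt and val are placeholders):

  /* Before: overloads the return value as an overflow signal. */
  if (seq_printf(m, fmt, val) < 0)
          return 0;

  /* After: emit unconditionally, then ask the seq_file itself. */
  seq_printf(m, fmt, val);
  if (seq_has_overflowed(m))
          return 0;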
See: commit 1f33c41c03da ("seq_file: Rename seq_overflow() to seq_has_overflowed() and make public") Signed-off-by: Joe Perches Acked-by: Jesper Nilsson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/cris/arch-v10/kernel/setup.c | 58 ++++++++++++++++++------------------ arch/cris/arch-v32/kernel/setup.c | 62 ++++++++++++++++++++------------------- 2 files changed, 62 insertions(+), 58 deletions(-) diff --git a/arch/cris/arch-v10/kernel/setup.c b/arch/cris/arch-v10/kernel/setup.c index 4f96d71b5154..7ab31f1c7540 100644 --- a/arch/cris/arch-v10/kernel/setup.c +++ b/arch/cris/arch-v10/kernel/setup.c @@ -63,35 +63,37 @@ int show_cpuinfo(struct seq_file *m, void *v) else info = &cpu_info[revision]; - return seq_printf(m, - "processor\t: 0\n" - "cpu\t\t: CRIS\n" - "cpu revision\t: %lu\n" - "cpu model\t: %s\n" - "cache size\t: %d kB\n" - "fpu\t\t: %s\n" - "mmu\t\t: %s\n" - "mmu DMA bug\t: %s\n" - "ethernet\t: %s Mbps\n" - "token ring\t: %s\n" - "scsi\t\t: %s\n" - "ata\t\t: %s\n" - "usb\t\t: %s\n" - "bogomips\t: %lu.%02lu\n", + seq_printf(m, + "processor\t: 0\n" + "cpu\t\t: CRIS\n" + "cpu revision\t: %lu\n" + "cpu model\t: %s\n" + "cache size\t: %d kB\n" + "fpu\t\t: %s\n" + "mmu\t\t: %s\n" + "mmu DMA bug\t: %s\n" + "ethernet\t: %s Mbps\n" + "token ring\t: %s\n" + "scsi\t\t: %s\n" + "ata\t\t: %s\n" + "usb\t\t: %s\n" + "bogomips\t: %lu.%02lu\n", - revision, - info->model, - info->cache, - info->flags & HAS_FPU ? "yes" : "no", - info->flags & HAS_MMU ? "yes" : "no", - info->flags & HAS_MMU_BUG ? "yes" : "no", - info->flags & HAS_ETHERNET100 ? "10/100" : "10", - info->flags & HAS_TOKENRING ? "4/16 Mbps" : "no", - info->flags & HAS_SCSI ? "yes" : "no", - info->flags & HAS_ATA ? "yes" : "no", - info->flags & HAS_USB ? "yes" : "no", - (loops_per_jiffy * HZ + 500) / 500000, - ((loops_per_jiffy * HZ + 500) / 5000) % 100); + revision, + info->model, + info->cache, + info->flags & HAS_FPU ? "yes" : "no", + info->flags & HAS_MMU ? "yes" : "no", + info->flags & HAS_MMU_BUG ? "yes" : "no", + info->flags & HAS_ETHERNET100 ? "10/100" : "10", + info->flags & HAS_TOKENRING ? "4/16 Mbps" : "no", + info->flags & HAS_SCSI ? "yes" : "no", + info->flags & HAS_ATA ? "yes" : "no", + info->flags & HAS_USB ? "yes" : "no", + (loops_per_jiffy * HZ + 500) / 500000, + ((loops_per_jiffy * HZ + 500) / 5000) % 100); + + return 0; } #endif /* CONFIG_PROC_FS */ diff --git a/arch/cris/arch-v32/kernel/setup.c b/arch/cris/arch-v32/kernel/setup.c index 61e10ae65296..81715c683baf 100644 --- a/arch/cris/arch-v32/kernel/setup.c +++ b/arch/cris/arch-v32/kernel/setup.c @@ -77,36 +77,38 @@ int show_cpuinfo(struct seq_file *m, void *v) } } - return seq_printf(m, - "processor\t: %d\n" - "cpu\t\t: CRIS\n" - "cpu revision\t: %lu\n" - "cpu model\t: %s\n" - "cache size\t: %d KB\n" - "fpu\t\t: %s\n" - "mmu\t\t: %s\n" - "mmu DMA bug\t: %s\n" - "ethernet\t: %s Mbps\n" - "token ring\t: %s\n" - "scsi\t\t: %s\n" - "ata\t\t: %s\n" - "usb\t\t: %s\n" - "bogomips\t: %lu.%02lu\n\n", - - cpu, - revision, - info->cpu_model, - info->cache_size, - info->flags & HAS_FPU ? "yes" : "no", - info->flags & HAS_MMU ? "yes" : "no", - info->flags & HAS_MMU_BUG ? "yes" : "no", - info->flags & HAS_ETHERNET100 ? "10/100" : "10", - info->flags & HAS_TOKENRING ? "4/16 Mbps" : "no", - info->flags & HAS_SCSI ? "yes" : "no", - info->flags & HAS_ATA ? "yes" : "no", - info->flags & HAS_USB ? 
"yes" : "no", - (loops_per_jiffy * HZ + 500) / 500000, - ((loops_per_jiffy * HZ + 500) / 5000) % 100); + seq_printf(m, + "processor\t: %d\n" + "cpu\t\t: CRIS\n" + "cpu revision\t: %lu\n" + "cpu model\t: %s\n" + "cache size\t: %d KB\n" + "fpu\t\t: %s\n" + "mmu\t\t: %s\n" + "mmu DMA bug\t: %s\n" + "ethernet\t: %s Mbps\n" + "token ring\t: %s\n" + "scsi\t\t: %s\n" + "ata\t\t: %s\n" + "usb\t\t: %s\n" + "bogomips\t: %lu.%02lu\n\n", + + cpu, + revision, + info->cpu_model, + info->cache_size, + info->flags & HAS_FPU ? "yes" : "no", + info->flags & HAS_MMU ? "yes" : "no", + info->flags & HAS_MMU_BUG ? "yes" : "no", + info->flags & HAS_ETHERNET100 ? "10/100" : "10", + info->flags & HAS_TOKENRING ? "4/16 Mbps" : "no", + info->flags & HAS_SCSI ? "yes" : "no", + info->flags & HAS_ATA ? "yes" : "no", + info->flags & HAS_USB ? "yes" : "no", + (loops_per_jiffy * HZ + 500) / 500000, + ((loops_per_jiffy * HZ + 500) / 5000) % 100); + + return 0; } #endif /* CONFIG_PROC_FS */ -- cgit v1.2.3 From dc640a8813c0015e5a620d41e47df94c9879749d Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 15 Apr 2015 16:18:11 -0700 Subject: cris fasttimer: remove use of seq_printf return value The seq_printf return value, because it's frequently misused, will eventually be converted to void. See: commit 1f33c41c03da ("seq_file: Rename seq_overflow() to seq_has_overflowed() and make public") Miscellanea: o Coalesce formats, realign arguments Signed-off-by: Joe Perches Cc: Mikael Starvik Cc: Jesper Nilsson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/cris/arch-v10/kernel/fasttimer.c | 85 ++++++++++++++++------------------- arch/cris/arch-v32/kernel/fasttimer.c | 85 ++++++++++++++++------------------- 2 files changed, 78 insertions(+), 92 deletions(-) diff --git a/arch/cris/arch-v10/kernel/fasttimer.c b/arch/cris/arch-v10/kernel/fasttimer.c index 48a59afbeeb1..e9298739d72e 100644 --- a/arch/cris/arch-v10/kernel/fasttimer.c +++ b/arch/cris/arch-v10/kernel/fasttimer.c @@ -527,7 +527,8 @@ static int proc_fasttimer_show(struct seq_file *m, void *v) i = debug_log_cnt; while (i != end_i || debug_log_cnt_wrapped) { - if (seq_printf(m, debug_log_string[i], debug_log_value[i]) < 0) + seq_printf(m, debug_log_string[i], debug_log_value[i]); + if (seq_has_overflowed(m)) return 0; i = (i+1) % DEBUG_LOG_MAX; } @@ -542,24 +543,22 @@ static int proc_fasttimer_show(struct seq_file *m, void *v) int cur = (fast_timers_started - i - 1) % NUM_TIMER_STATS; #if 1 //ndef FAST_TIMER_LOG - seq_printf(m, "div: %i freq: %i delay: %i" - "\n", + seq_printf(m, "div: %i freq: %i delay: %i\n", timer_div_settings[cur], timer_freq_settings[cur], timer_delay_settings[cur]); #endif #ifdef FAST_TIMER_LOG t = &timer_started_log[cur]; - if (seq_printf(m, "%-14s s: %6lu.%06lu e: %6lu.%06lu " - "d: %6li us data: 0x%08lX" - "\n", - t->name, - (unsigned long)t->tv_set.tv_jiff, - (unsigned long)t->tv_set.tv_usec, - (unsigned long)t->tv_expires.tv_jiff, - (unsigned long)t->tv_expires.tv_usec, - t->delay_us, - t->data) < 0) + seq_printf(m, "%-14s s: %6lu.%06lu e: %6lu.%06lu d: %6li us data: 0x%08lX\n", + t->name, + (unsigned long)t->tv_set.tv_jiff, + (unsigned long)t->tv_set.tv_usec, + (unsigned long)t->tv_expires.tv_jiff, + (unsigned long)t->tv_expires.tv_usec, + t->delay_us, + t->data); + if (seq_has_overflowed(m)) return 0; #endif } @@ -571,16 +570,15 @@ static int proc_fasttimer_show(struct seq_file *m, void *v) seq_printf(m, "Timers added: %i\n", fast_timers_added); for (i = 0; i < num_to_show; i++) { t = &timer_added_log[(fast_timers_added - i - 
1) % NUM_TIMER_STATS]; - if (seq_printf(m, "%-14s s: %6lu.%06lu e: %6lu.%06lu " - "d: %6li us data: 0x%08lX" - "\n", - t->name, - (unsigned long)t->tv_set.tv_jiff, - (unsigned long)t->tv_set.tv_usec, - (unsigned long)t->tv_expires.tv_jiff, - (unsigned long)t->tv_expires.tv_usec, - t->delay_us, - t->data) < 0) + seq_printf(m, "%-14s s: %6lu.%06lu e: %6lu.%06lu d: %6li us data: 0x%08lX\n", + t->name, + (unsigned long)t->tv_set.tv_jiff, + (unsigned long)t->tv_set.tv_usec, + (unsigned long)t->tv_expires.tv_jiff, + (unsigned long)t->tv_expires.tv_usec, + t->delay_us, + t->data); + if (seq_has_overflowed(m)) return 0; } seq_putc(m, '\n'); @@ -590,16 +588,15 @@ static int proc_fasttimer_show(struct seq_file *m, void *v) seq_printf(m, "Timers expired: %i\n", fast_timers_expired); for (i = 0; i < num_to_show; i++) { t = &timer_expired_log[(fast_timers_expired - i - 1) % NUM_TIMER_STATS]; - if (seq_printf(m, "%-14s s: %6lu.%06lu e: %6lu.%06lu " - "d: %6li us data: 0x%08lX" - "\n", - t->name, - (unsigned long)t->tv_set.tv_jiff, - (unsigned long)t->tv_set.tv_usec, - (unsigned long)t->tv_expires.tv_jiff, - (unsigned long)t->tv_expires.tv_usec, - t->delay_us, - t->data) < 0) + seq_printf(m, "%-14s s: %6lu.%06lu e: %6lu.%06lu d: %6li us data: 0x%08lX\n", + t->name, + (unsigned long)t->tv_set.tv_jiff, + (unsigned long)t->tv_set.tv_usec, + (unsigned long)t->tv_expires.tv_jiff, + (unsigned long)t->tv_expires.tv_usec, + t->delay_us, + t->data); + if (seq_has_overflowed(m)) return 0; } seq_putc(m, '\n'); @@ -611,19 +608,15 @@ static int proc_fasttimer_show(struct seq_file *m, void *v) while (t) { nextt = t->next; local_irq_restore(flags); - if (seq_printf(m, "%-14s s: %6lu.%06lu e: %6lu.%06lu " - "d: %6li us data: 0x%08lX" -/* " func: 0x%08lX" */ - "\n", - t->name, - (unsigned long)t->tv_set.tv_jiff, - (unsigned long)t->tv_set.tv_usec, - (unsigned long)t->tv_expires.tv_jiff, - (unsigned long)t->tv_expires.tv_usec, - t->delay_us, - t->data -/* , t->function */ - ) < 0) + seq_printf(m, "%-14s s: %6lu.%06lu e: %6lu.%06lu d: %6li us data: 0x%08lX\n", + t->name, + (unsigned long)t->tv_set.tv_jiff, + (unsigned long)t->tv_set.tv_usec, + (unsigned long)t->tv_expires.tv_jiff, + (unsigned long)t->tv_expires.tv_usec, + t->delay_us, + t->data); + if (seq_has_overflowed(m)) return 0; local_irq_save(flags); if (t->next != nextt) diff --git a/arch/cris/arch-v32/kernel/fasttimer.c b/arch/cris/arch-v32/kernel/fasttimer.c index b130c2c5fdd8..5c84dbb99f30 100644 --- a/arch/cris/arch-v32/kernel/fasttimer.c +++ b/arch/cris/arch-v32/kernel/fasttimer.c @@ -501,7 +501,8 @@ static int proc_fasttimer_show(struct seq_file *m, void *v) i = debug_log_cnt; while ((i != end_i || debug_log_cnt_wrapped)) { - if (seq_printf(m, debug_log_string[i], debug_log_value[i]) < 0) + seq_printf(m, debug_log_string[i], debug_log_value[i]); + if (seq_has_overflowed(m)) return 0; i = (i+1) % DEBUG_LOG_MAX; } @@ -516,23 +517,21 @@ static int proc_fasttimer_show(struct seq_file *m, void *v) int cur = (fast_timers_started - i - 1) % NUM_TIMER_STATS; #if 1 //ndef FAST_TIMER_LOG - seq_printf(m, "div: %i delay: %i" - "\n", + seq_printf(m, "div: %i delay: %i\n", timer_div_settings[cur], timer_delay_settings[cur]); #endif #ifdef FAST_TIMER_LOG t = &timer_started_log[cur]; - if (seq_printf(m, "%-14s s: %6lu.%06lu e: %6lu.%06lu " - "d: %6li us data: 0x%08lX" - "\n", - t->name, - (unsigned long)t->tv_set.tv_jiff, - (unsigned long)t->tv_set.tv_usec, - (unsigned long)t->tv_expires.tv_jiff, - (unsigned long)t->tv_expires.tv_usec, - t->delay_us, - t->data) < 0) + 
seq_printf(m, "%-14s s: %6lu.%06lu e: %6lu.%06lu d: %6li us data: 0x%08lX\n", + t->name, + (unsigned long)t->tv_set.tv_jiff, + (unsigned long)t->tv_set.tv_usec, + (unsigned long)t->tv_expires.tv_jiff, + (unsigned long)t->tv_expires.tv_usec, + t->delay_us, + t->data); + if (seq_has_overflowed(m)) return 0; #endif } @@ -544,16 +543,15 @@ static int proc_fasttimer_show(struct seq_file *m, void *v) seq_printf(m, "Timers added: %i\n", fast_timers_added); for (i = 0; i < num_to_show; i++) { t = &timer_added_log[(fast_timers_added - i - 1) % NUM_TIMER_STATS]; - if (seq_printf(m, "%-14s s: %6lu.%06lu e: %6lu.%06lu " - "d: %6li us data: 0x%08lX" - "\n", - t->name, - (unsigned long)t->tv_set.tv_jiff, - (unsigned long)t->tv_set.tv_usec, - (unsigned long)t->tv_expires.tv_jiff, - (unsigned long)t->tv_expires.tv_usec, - t->delay_us, - t->data) < 0) + seq_printf(m, "%-14s s: %6lu.%06lu e: %6lu.%06lu d: %6li us data: 0x%08lX\n", + t->name, + (unsigned long)t->tv_set.tv_jiff, + (unsigned long)t->tv_set.tv_usec, + (unsigned long)t->tv_expires.tv_jiff, + (unsigned long)t->tv_expires.tv_usec, + t->delay_us, + t->data); + if (seq_has_overflowed(m)) return 0; } seq_putc(m, '\n'); @@ -563,16 +561,15 @@ static int proc_fasttimer_show(struct seq_file *m, void *v) seq_printf(m, "Timers expired: %i\n", fast_timers_expired); for (i = 0; i < num_to_show; i++){ t = &timer_expired_log[(fast_timers_expired - i - 1) % NUM_TIMER_STATS]; - if (seq_printf(m, "%-14s s: %6lu.%06lu e: %6lu.%06lu " - "d: %6li us data: 0x%08lX" - "\n", - t->name, - (unsigned long)t->tv_set.tv_jiff, - (unsigned long)t->tv_set.tv_usec, - (unsigned long)t->tv_expires.tv_jiff, - (unsigned long)t->tv_expires.tv_usec, - t->delay_us, - t->data) < 0) + seq_printf(m, "%-14s s: %6lu.%06lu e: %6lu.%06lu d: %6li us data: 0x%08lX\n", + t->name, + (unsigned long)t->tv_set.tv_jiff, + (unsigned long)t->tv_set.tv_usec, + (unsigned long)t->tv_expires.tv_jiff, + (unsigned long)t->tv_expires.tv_usec, + t->delay_us, + t->data); + if (seq_has_overflowed(m)) return 0; } seq_putc(m, '\n'); @@ -584,19 +581,15 @@ static int proc_fasttimer_show(struct seq_file *m, void *v) while (t != NULL){ nextt = t->next; local_irq_restore(flags); - if (seq_printf(m, "%-14s s: %6lu.%06lu e: %6lu.%06lu " - "d: %6li us data: 0x%08lX" -/* " func: 0x%08lX" */ - "\n", - t->name, - (unsigned long)t->tv_set.tv_jiff, - (unsigned long)t->tv_set.tv_usec, - (unsigned long)t->tv_expires.tv_jiff, - (unsigned long)t->tv_expires.tv_usec, - t->delay_us, - t->data -/* , t->function */ - ) < 0) + seq_printf(m, "%-14s s: %6lu.%06lu e: %6lu.%06lu d: %6li us data: 0x%08lX\n", + t->name, + (unsigned long)t->tv_set.tv_jiff, + (unsigned long)t->tv_set.tv_usec, + (unsigned long)t->tv_expires.tv_jiff, + (unsigned long)t->tv_expires.tv_usec, + t->delay_us, + t->data); + if (seq_has_overflowed(m)) return 0; local_irq_save(flags); if (t->next != nextt) -- cgit v1.2.3 From c2f0b61d8969adf0dfb11aea7b700740fde6420b Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 15 Apr 2015 16:18:14 -0700 Subject: s390: remove use of seq_printf return value The seq_printf return value, because it's frequently misused, will eventually be converted to void. 
See: commit 1f33c41c03da ("seq_file: Rename seq_overflow() to seq_has_overflowed() and make public") Signed-off-by: Joe Perches Acked-by: Sebastian Ott Cc: Gerald Schaefer Cc: Peter Oberparleiter Cc: Martin Schwidefsky Cc: Heiko Carstens Signed-off-by: Linus Torvalds --- arch/s390/pci/pci_debug.c | 6 ++++-- drivers/s390/cio/blacklist.c | 12 +++++++----- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/arch/s390/pci/pci_debug.c b/arch/s390/pci/pci_debug.c index 3229a2e570df..c22d4402ae45 100644 --- a/arch/s390/pci/pci_debug.c +++ b/arch/s390/pci/pci_debug.c @@ -45,8 +45,10 @@ static int pci_perf_show(struct seq_file *m, void *v) if (!zdev) return 0; - if (!zdev->fmb) - return seq_printf(m, "FMB statistics disabled\n"); + if (!zdev->fmb) { + seq_puts(m, "FMB statistics disabled\n"); + return 0; + } /* header */ seq_printf(m, "FMB @ %p\n", zdev->fmb); diff --git a/drivers/s390/cio/blacklist.c b/drivers/s390/cio/blacklist.c index b3f791b2c1f8..20314aad7ab7 100644 --- a/drivers/s390/cio/blacklist.c +++ b/drivers/s390/cio/blacklist.c @@ -330,18 +330,20 @@ cio_ignore_proc_seq_show(struct seq_file *s, void *it) if (!iter->in_range) { /* First device in range. */ if ((iter->devno == __MAX_SUBCHANNEL) || - !is_blacklisted(iter->ssid, iter->devno + 1)) + !is_blacklisted(iter->ssid, iter->devno + 1)) { /* Singular device. */ - return seq_printf(s, "0.%x.%04x\n", - iter->ssid, iter->devno); + seq_printf(s, "0.%x.%04x\n", iter->ssid, iter->devno); + return 0; + } iter->in_range = 1; - return seq_printf(s, "0.%x.%04x-", iter->ssid, iter->devno); + seq_printf(s, "0.%x.%04x-", iter->ssid, iter->devno); + return 0; } if ((iter->devno == __MAX_SUBCHANNEL) || !is_blacklisted(iter->ssid, iter->devno + 1)) { /* Last device in range. */ iter->in_range = 0; - return seq_printf(s, "0.%x.%04x\n", iter->ssid, iter->devno); + seq_printf(s, "0.%x.%04x\n", iter->ssid, iter->devno); } return 0; } -- cgit v1.2.3 From 25ce319167b517a913a2ba9fc80da8330dbc3249 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 15 Apr 2015 16:18:17 -0700 Subject: proc: remove use of seq_printf return value The seq_printf return value, because it's frequently misused, will eventually be converted to void. 
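The proc and cgroup patches in this stretch of the series mostly hit the tail-position variant, where a ->show() method's final statement returned seq_printf()'s value directly. A sketch of the before/after, modeled loosely on children_seq_show() below; demo_pid_show() is a hypothetical name:

    #include <linux/seq_file.h>

    /* Hypothetical single-record ->show() method. */
    static int demo_pid_show(struct seq_file *seq, void *v)
    {
            /*
             * Old: return seq_printf(seq, "%d ", *(int *)v);
             * That forwarded seq_printf()'s 0-or--1 result as the
             * ->show() return code, conflating "buffer full, core
             * will retry" with a real error.
             */
            seq_printf(seq, "%d ", *(int *)v);

            return 0;
    }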
See: commit 1f33c41c03da ("seq_file: Rename seq_overflow() to seq_has_overflowed() and make public") Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/array.c | 4 ++- fs/proc/base.c | 82 +++++++++++++++++++++++++++++++++------------------------ 2 files changed, 50 insertions(+), 36 deletions(-) diff --git a/fs/proc/array.c b/fs/proc/array.c index 13f047ad08e4..fd02a9ebfc30 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -632,7 +632,9 @@ static int children_seq_show(struct seq_file *seq, void *v) pid_t pid; pid = pid_nr_ns(v, inode->i_sb->s_fs_info); - return seq_printf(seq, "%d ", pid); + seq_printf(seq, "%d ", pid); + + return 0; } static void *children_seq_start(struct seq_file *seq, loff_t *pos) diff --git a/fs/proc/base.c b/fs/proc/base.c index 3f3d7aeb0712..7a3b82f986dd 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -238,13 +238,15 @@ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns, wchan = get_wchan(task); - if (lookup_symbol_name(wchan, symname) < 0) + if (lookup_symbol_name(wchan, symname) < 0) { if (!ptrace_may_access(task, PTRACE_MODE_READ)) return 0; - else - return seq_printf(m, "%lu", wchan); - else - return seq_printf(m, "%s", symname); + seq_printf(m, "%lu", wchan); + } else { + seq_printf(m, "%s", symname); + } + + return 0; } #endif /* CONFIG_KALLSYMS */ @@ -309,10 +311,12 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns, static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { - return seq_printf(m, "%llu %llu %lu\n", - (unsigned long long)task->se.sum_exec_runtime, - (unsigned long long)task->sched_info.run_delay, - task->sched_info.pcount); + seq_printf(m, "%llu %llu %lu\n", + (unsigned long long)task->se.sum_exec_runtime, + (unsigned long long)task->sched_info.run_delay, + task->sched_info.pcount); + + return 0; } #endif @@ -387,7 +391,9 @@ static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns, points = oom_badness(task, NULL, NULL, totalpages) * 1000 / totalpages; read_unlock(&tasklist_lock); - return seq_printf(m, "%lu\n", points); + seq_printf(m, "%lu\n", points); + + return 0; } struct limit_names { @@ -432,15 +438,15 @@ static int proc_pid_limits(struct seq_file *m, struct pid_namespace *ns, * print the file header */ seq_printf(m, "%-25s %-20s %-20s %-10s\n", - "Limit", "Soft Limit", "Hard Limit", "Units"); + "Limit", "Soft Limit", "Hard Limit", "Units"); for (i = 0; i < RLIM_NLIMITS; i++) { if (rlim[i].rlim_cur == RLIM_INFINITY) seq_printf(m, "%-25s %-20s ", - lnames[i].name, "unlimited"); + lnames[i].name, "unlimited"); else seq_printf(m, "%-25s %-20lu ", - lnames[i].name, rlim[i].rlim_cur); + lnames[i].name, rlim[i].rlim_cur); if (rlim[i].rlim_max == RLIM_INFINITY) seq_printf(m, "%-20s ", "unlimited"); @@ -462,7 +468,9 @@ static int proc_pid_syscall(struct seq_file *m, struct pid_namespace *ns, { long nr; unsigned long args[6], sp, pc; - int res = lock_trace(task); + int res; + + res = lock_trace(task); if (res) return res; @@ -477,7 +485,8 @@ static int proc_pid_syscall(struct seq_file *m, struct pid_namespace *ns, args[0], args[1], args[2], args[3], args[4], args[5], sp, pc); unlock_trace(task); - return res; + + return 0; } #endif /* CONFIG_HAVE_ARCH_TRACEHOOK */ @@ -2002,12 +2011,13 @@ static int show_timer(struct seq_file *m, void *v) notify = timer->it_sigev_notify; seq_printf(m, "ID: %d\n", timer->it_id); - seq_printf(m, "signal: %d/%p\n", timer->sigq->info.si_signo, - 
timer->sigq->info.si_value.sival_ptr); + seq_printf(m, "signal: %d/%p\n", + timer->sigq->info.si_signo, + timer->sigq->info.si_value.sival_ptr); seq_printf(m, "notify: %s/%s.%d\n", - nstr[notify & ~SIGEV_THREAD_ID], - (notify & SIGEV_THREAD_ID) ? "tid" : "pid", - pid_nr_ns(timer->it_pid, tp->ns)); + nstr[notify & ~SIGEV_THREAD_ID], + (notify & SIGEV_THREAD_ID) ? "tid" : "pid", + pid_nr_ns(timer->it_pid, tp->ns)); seq_printf(m, "ClockID: %d\n", timer->it_clock); return 0; @@ -2352,21 +2362,23 @@ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int wh unlock_task_sighand(task, &flags); } - result = seq_printf(m, - "rchar: %llu\n" - "wchar: %llu\n" - "syscr: %llu\n" - "syscw: %llu\n" - "read_bytes: %llu\n" - "write_bytes: %llu\n" - "cancelled_write_bytes: %llu\n", - (unsigned long long)acct.rchar, - (unsigned long long)acct.wchar, - (unsigned long long)acct.syscr, - (unsigned long long)acct.syscw, - (unsigned long long)acct.read_bytes, - (unsigned long long)acct.write_bytes, - (unsigned long long)acct.cancelled_write_bytes); + seq_printf(m, + "rchar: %llu\n" + "wchar: %llu\n" + "syscr: %llu\n" + "syscw: %llu\n" + "read_bytes: %llu\n" + "write_bytes: %llu\n" + "cancelled_write_bytes: %llu\n", + (unsigned long long)acct.rchar, + (unsigned long long)acct.wchar, + (unsigned long long)acct.syscr, + (unsigned long long)acct.syscw, + (unsigned long long)acct.read_bytes, + (unsigned long long)acct.write_bytes, + (unsigned long long)acct.cancelled_write_bytes); + result = 0; + out_unlock: mutex_unlock(&task->signal->cred_guard_mutex); return result; -- cgit v1.2.3 From 94ff212d096d96afea29e41585b0d8ce41087c0f Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 15 Apr 2015 16:18:20 -0700 Subject: cgroup: remove use of seq_printf return value The seq_printf return value, because it's frequently misused, will eventually be converted to void. See: commit 1f33c41c03da ("seq_file: Rename seq_overflow() to seq_has_overflowed() and make public") Signed-off-by: Joe Perches Acked-by: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 59aa339a245c..469dd547770c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4196,7 +4196,9 @@ static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) static int cgroup_pidlist_show(struct seq_file *s, void *v) { - return seq_printf(s, "%d\n", *(int *)v); + seq_printf(s, "%d\n", *(int *)v); + + return 0; } static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, -- cgit v1.2.3 From 962e3707d9fb16bcf66ec5e5ebcea5248b9c2ab3 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 15 Apr 2015 16:18:22 -0700 Subject: tracing: remove use of seq_printf return value The seq_printf return value, because it's frequently misused, will eventually be converted to void. 
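The tracing patch here takes the obvious next step, noted in the Miscellanea item below: once no caller consumes the value, a helper whose only return was seq_printf()'s result can itself become void. An illustrative sketch; the demo_* names and the fixed-size table are stand-ins, not the patch's own identifiers:

    #include <linux/seq_file.h>

    /* Stand-in for the patch's stack_dump_trace[] table. */
    static unsigned long demo_stack_trace[8];

    static void demo_lookup_stack(struct seq_file *m, long i)
    {
            /* %pS prints the symbolic name for a kernel address. */
            seq_printf(m, "%pS\n", (void *)demo_stack_trace[i]);
    }

    static int demo_stack_show(struct seq_file *m, void *v)
    {
            demo_lookup_stack(m, *(long *)v);
            return 0;
    }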
See: commit 1f33c41c03da ("seq_file: Rename seq_overflow() to seq_has_overflowed() and make public") Miscellanea: o Remove unused return value from trace_lookup_stack Signed-off-by: Joe Perches Acked-by: Steven Rostedt Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/trace/trace_stack.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index c3e4fcfddd45..3f34496244e9 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -327,11 +327,11 @@ static void t_stop(struct seq_file *m, void *p) local_irq_enable(); } -static int trace_lookup_stack(struct seq_file *m, long i) +static void trace_lookup_stack(struct seq_file *m, long i) { unsigned long addr = stack_dump_trace[i]; - return seq_printf(m, "%pS\n", (void *)addr); + seq_printf(m, "%pS\n", (void *)addr); } static void print_disabled(struct seq_file *m) -- cgit v1.2.3 From d50f8f8d91a7dcb6110c4600072c62fc44b65572 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 15 Apr 2015 16:18:25 -0700 Subject: lru_cache: remove use of seq_printf return value The seq_printf return value, because it's frequently misused, will eventually be converted to void. See: commit 1f33c41c03da ("seq_file: Rename seq_overflow() to seq_has_overflowed() and make public") Signed-off-by: Joe Perches Cc: Lars Ellenberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/lru_cache.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/lib/lru_cache.c b/lib/lru_cache.c index 852c81e3ba9a..028f5d996eef 100644 --- a/lib/lru_cache.c +++ b/lib/lru_cache.c @@ -247,10 +247,11 @@ size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc) * progress) and "changed", when this in fact lead to an successful * update of the cache. */ - return seq_printf(seq, "\t%s: used:%u/%u " - "hits:%lu misses:%lu starving:%lu locked:%lu changed:%lu\n", - lc->name, lc->used, lc->nr_elements, - lc->hits, lc->misses, lc->starving, lc->locked, lc->changed); + seq_printf(seq, "\t%s: used:%u/%u hits:%lu misses:%lu starving:%lu locked:%lu changed:%lu\n", + lc->name, lc->used, lc->nr_elements, + lc->hits, lc->misses, lc->starving, lc->locked, lc->changed); + + return 0; } static struct hlist_head *lc_hash_slot(struct lru_cache *lc, unsigned int enr) -- cgit v1.2.3 From e693d73c20ffdb06840c9378f367bad849ac0d5d Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 15 Apr 2015 16:18:28 -0700 Subject: parisc: remove use of seq_printf return value The seq_printf return value, because it's frequently misused, will eventually be converted to void. See: commit 1f33c41c03da ("seq_file: Rename seq_overflow() to seq_has_overflowed() and make public") Signed-off-by: Joe Perches Cc: "James E.J. 
Bottomley" Cc: Helge Deller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/parisc/ccio-dma.c | 54 ++++++++++++++--------------- drivers/parisc/sba_iommu.c | 86 ++++++++++++++++++++++------------------------ 2 files changed, 68 insertions(+), 72 deletions(-) diff --git a/drivers/parisc/ccio-dma.c b/drivers/parisc/ccio-dma.c index 8b490d77054f..6bc16809c504 100644 --- a/drivers/parisc/ccio-dma.c +++ b/drivers/parisc/ccio-dma.c @@ -1021,7 +1021,6 @@ static struct hppa_dma_ops ccio_ops = { #ifdef CONFIG_PROC_FS static int ccio_proc_info(struct seq_file *m, void *p) { - int len = 0; struct ioc *ioc = ioc_list; while (ioc != NULL) { @@ -1031,22 +1030,22 @@ static int ccio_proc_info(struct seq_file *m, void *p) int j; #endif - len += seq_printf(m, "%s\n", ioc->name); + seq_printf(m, "%s\n", ioc->name); - len += seq_printf(m, "Cujo 2.0 bug : %s\n", - (ioc->cujo20_bug ? "yes" : "no")); + seq_printf(m, "Cujo 2.0 bug : %s\n", + (ioc->cujo20_bug ? "yes" : "no")); - len += seq_printf(m, "IO PDIR size : %d bytes (%d entries)\n", - total_pages * 8, total_pages); + seq_printf(m, "IO PDIR size : %d bytes (%d entries)\n", + total_pages * 8, total_pages); #ifdef CCIO_COLLECT_STATS - len += seq_printf(m, "IO PDIR entries : %ld free %ld used (%d%%)\n", - total_pages - ioc->used_pages, ioc->used_pages, - (int)(ioc->used_pages * 100 / total_pages)); + seq_printf(m, "IO PDIR entries : %ld free %ld used (%d%%)\n", + total_pages - ioc->used_pages, ioc->used_pages, + (int)(ioc->used_pages * 100 / total_pages)); #endif - len += seq_printf(m, "Resource bitmap : %d bytes (%d pages)\n", - ioc->res_size, total_pages); + seq_printf(m, "Resource bitmap : %d bytes (%d pages)\n", + ioc->res_size, total_pages); #ifdef CCIO_COLLECT_STATS min = max = ioc->avg_search[0]; @@ -1058,26 +1057,26 @@ static int ccio_proc_info(struct seq_file *m, void *p) min = ioc->avg_search[j]; } avg /= CCIO_SEARCH_SAMPLE; - len += seq_printf(m, " Bitmap search : %ld/%ld/%ld (min/avg/max CPU Cycles)\n", - min, avg, max); + seq_printf(m, " Bitmap search : %ld/%ld/%ld (min/avg/max CPU Cycles)\n", + min, avg, max); - len += seq_printf(m, "pci_map_single(): %8ld calls %8ld pages (avg %d/1000)\n", - ioc->msingle_calls, ioc->msingle_pages, - (int)((ioc->msingle_pages * 1000)/ioc->msingle_calls)); + seq_printf(m, "pci_map_single(): %8ld calls %8ld pages (avg %d/1000)\n", + ioc->msingle_calls, ioc->msingle_pages, + (int)((ioc->msingle_pages * 1000)/ioc->msingle_calls)); /* KLUGE - unmap_sg calls unmap_single for each mapped page */ min = ioc->usingle_calls - ioc->usg_calls; max = ioc->usingle_pages - ioc->usg_pages; - len += seq_printf(m, "pci_unmap_single: %8ld calls %8ld pages (avg %d/1000)\n", - min, max, (int)((max * 1000)/min)); + seq_printf(m, "pci_unmap_single: %8ld calls %8ld pages (avg %d/1000)\n", + min, max, (int)((max * 1000)/min)); - len += seq_printf(m, "pci_map_sg() : %8ld calls %8ld pages (avg %d/1000)\n", - ioc->msg_calls, ioc->msg_pages, - (int)((ioc->msg_pages * 1000)/ioc->msg_calls)); + seq_printf(m, "pci_map_sg() : %8ld calls %8ld pages (avg %d/1000)\n", + ioc->msg_calls, ioc->msg_pages, + (int)((ioc->msg_pages * 1000)/ioc->msg_calls)); - len += seq_printf(m, "pci_unmap_sg() : %8ld calls %8ld pages (avg %d/1000)\n\n\n", - ioc->usg_calls, ioc->usg_pages, - (int)((ioc->usg_pages * 1000)/ioc->usg_calls)); + seq_printf(m, "pci_unmap_sg() : %8ld calls %8ld pages (avg %d/1000)\n\n\n", + ioc->usg_calls, ioc->usg_pages, + (int)((ioc->usg_pages * 1000)/ioc->usg_calls)); #endif /* CCIO_COLLECT_STATS */ ioc = ioc->next; @@ 
-1101,7 +1100,6 @@ static const struct file_operations ccio_proc_info_fops = { static int ccio_proc_bitmap_info(struct seq_file *m, void *p) { - int len = 0; struct ioc *ioc = ioc_list; while (ioc != NULL) { @@ -1110,11 +1108,11 @@ static int ccio_proc_bitmap_info(struct seq_file *m, void *p) for (j = 0; j < (ioc->res_size / sizeof(u32)); j++) { if ((j & 7) == 0) - len += seq_puts(m, "\n "); - len += seq_printf(m, "%08x", *res_ptr); + seq_puts(m, "\n "); + seq_printf(m, "%08x", *res_ptr); res_ptr++; } - len += seq_puts(m, "\n\n"); + seq_puts(m, "\n\n"); ioc = ioc->next; break; /* XXX - remove me */ } diff --git a/drivers/parisc/sba_iommu.c b/drivers/parisc/sba_iommu.c index 1ff1b67e8b27..f07471264689 100644 --- a/drivers/parisc/sba_iommu.c +++ b/drivers/parisc/sba_iommu.c @@ -1774,37 +1774,35 @@ static int sba_proc_info(struct seq_file *m, void *p) #ifdef SBA_COLLECT_STATS unsigned long avg = 0, min, max; #endif - int i, len = 0; - - len += seq_printf(m, "%s rev %d.%d\n", - sba_dev->name, - (sba_dev->hw_rev & 0x7) + 1, - (sba_dev->hw_rev & 0x18) >> 3 - ); - len += seq_printf(m, "IO PDIR size : %d bytes (%d entries)\n", - (int) ((ioc->res_size << 3) * sizeof(u64)), /* 8 bits/byte */ - total_pages); - - len += seq_printf(m, "Resource bitmap : %d bytes (%d pages)\n", - ioc->res_size, ioc->res_size << 3); /* 8 bits per byte */ - - len += seq_printf(m, "LMMIO_BASE/MASK/ROUTE %08x %08x %08x\n", - READ_REG32(sba_dev->sba_hpa + LMMIO_DIST_BASE), - READ_REG32(sba_dev->sba_hpa + LMMIO_DIST_MASK), - READ_REG32(sba_dev->sba_hpa + LMMIO_DIST_ROUTE) - ); + int i; + + seq_printf(m, "%s rev %d.%d\n", + sba_dev->name, + (sba_dev->hw_rev & 0x7) + 1, + (sba_dev->hw_rev & 0x18) >> 3); + seq_printf(m, "IO PDIR size : %d bytes (%d entries)\n", + (int)((ioc->res_size << 3) * sizeof(u64)), /* 8 bits/byte */ + total_pages); + + seq_printf(m, "Resource bitmap : %d bytes (%d pages)\n", + ioc->res_size, ioc->res_size << 3); /* 8 bits per byte */ + + seq_printf(m, "LMMIO_BASE/MASK/ROUTE %08x %08x %08x\n", + READ_REG32(sba_dev->sba_hpa + LMMIO_DIST_BASE), + READ_REG32(sba_dev->sba_hpa + LMMIO_DIST_MASK), + READ_REG32(sba_dev->sba_hpa + LMMIO_DIST_ROUTE)); for (i=0; i<4; i++) - len += seq_printf(m, "DIR%d_BASE/MASK/ROUTE %08x %08x %08x\n", i, - READ_REG32(sba_dev->sba_hpa + LMMIO_DIRECT0_BASE + i*0x18), - READ_REG32(sba_dev->sba_hpa + LMMIO_DIRECT0_MASK + i*0x18), - READ_REG32(sba_dev->sba_hpa + LMMIO_DIRECT0_ROUTE + i*0x18) - ); + seq_printf(m, "DIR%d_BASE/MASK/ROUTE %08x %08x %08x\n", + i, + READ_REG32(sba_dev->sba_hpa + LMMIO_DIRECT0_BASE + i*0x18), + READ_REG32(sba_dev->sba_hpa + LMMIO_DIRECT0_MASK + i*0x18), + READ_REG32(sba_dev->sba_hpa + LMMIO_DIRECT0_ROUTE + i*0x18)); #ifdef SBA_COLLECT_STATS - len += seq_printf(m, "IO PDIR entries : %ld free %ld used (%d%%)\n", - total_pages - ioc->used_pages, ioc->used_pages, - (int) (ioc->used_pages * 100 / total_pages)); + seq_printf(m, "IO PDIR entries : %ld free %ld used (%d%%)\n", + total_pages - ioc->used_pages, ioc->used_pages, + (int)(ioc->used_pages * 100 / total_pages)); min = max = ioc->avg_search[0]; for (i = 0; i < SBA_SEARCH_SAMPLE; i++) { @@ -1813,26 +1811,26 @@ static int sba_proc_info(struct seq_file *m, void *p) if (ioc->avg_search[i] < min) min = ioc->avg_search[i]; } avg /= SBA_SEARCH_SAMPLE; - len += seq_printf(m, " Bitmap search : %ld/%ld/%ld (min/avg/max CPU Cycles)\n", - min, avg, max); + seq_printf(m, " Bitmap search : %ld/%ld/%ld (min/avg/max CPU Cycles)\n", + min, avg, max); - len += seq_printf(m, "pci_map_single(): %12ld calls %12ld pages (avg 
%d/1000)\n", - ioc->msingle_calls, ioc->msingle_pages, - (int) ((ioc->msingle_pages * 1000)/ioc->msingle_calls)); + seq_printf(m, "pci_map_single(): %12ld calls %12ld pages (avg %d/1000)\n", + ioc->msingle_calls, ioc->msingle_pages, + (int)((ioc->msingle_pages * 1000)/ioc->msingle_calls)); /* KLUGE - unmap_sg calls unmap_single for each mapped page */ min = ioc->usingle_calls; max = ioc->usingle_pages - ioc->usg_pages; - len += seq_printf(m, "pci_unmap_single: %12ld calls %12ld pages (avg %d/1000)\n", - min, max, (int) ((max * 1000)/min)); + seq_printf(m, "pci_unmap_single: %12ld calls %12ld pages (avg %d/1000)\n", + min, max, (int)((max * 1000)/min)); - len += seq_printf(m, "pci_map_sg() : %12ld calls %12ld pages (avg %d/1000)\n", - ioc->msg_calls, ioc->msg_pages, - (int) ((ioc->msg_pages * 1000)/ioc->msg_calls)); + seq_printf(m, "pci_map_sg() : %12ld calls %12ld pages (avg %d/1000)\n", + ioc->msg_calls, ioc->msg_pages, + (int)((ioc->msg_pages * 1000)/ioc->msg_calls)); - len += seq_printf(m, "pci_unmap_sg() : %12ld calls %12ld pages (avg %d/1000)\n", - ioc->usg_calls, ioc->usg_pages, - (int) ((ioc->usg_pages * 1000)/ioc->usg_calls)); + seq_printf(m, "pci_unmap_sg() : %12ld calls %12ld pages (avg %d/1000)\n", + ioc->usg_calls, ioc->usg_pages, + (int)((ioc->usg_pages * 1000)/ioc->usg_calls)); #endif return 0; @@ -1858,14 +1856,14 @@ sba_proc_bitmap_info(struct seq_file *m, void *p) struct sba_device *sba_dev = sba_list; struct ioc *ioc = &sba_dev->ioc[0]; /* FIXME: Multi-IOC support! */ unsigned int *res_ptr = (unsigned int *)ioc->res_map; - int i, len = 0; + int i; for (i = 0; i < (ioc->res_size/sizeof(unsigned int)); ++i, ++res_ptr) { if ((i & 7) == 0) - len += seq_printf(m, "\n "); - len += seq_printf(m, " %08x", *res_ptr); + seq_puts(m, "\n "); + seq_printf(m, " %08x", *res_ptr); } - len += seq_printf(m, "\n"); + seq_putc(m, '\n'); return 0; } -- cgit v1.2.3
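Taken together, the parisc hunks above also retire the most misleading variant in the series: accumulating seq_printf()'s return value in a len counter as if it were a printf-style byte count. It never was one (the pre-conversion seq_printf() returned 0 on success and -1 on overflow), so the sum carried no information and was discarded anyway. The same hunks switch constant strings and single characters to seq_puts() and seq_putc(), which skip format parsing. A condensed, illustrative sketch in the post-series style; demo_bitmap_show() and its parameters are hypothetical, modeled loosely on the bitmap dumpers above:

    #include <linux/seq_file.h>
    #include <linux/types.h>

    /* Dump a resource bitmap, eight words per row, with no
     * return-value bookkeeping. */
    static int demo_bitmap_show(struct seq_file *m, const u32 *res_map,
                                size_t nwords)
    {
            size_t i;

            for (i = 0; i < nwords; i++) {
                    if ((i & 7) == 0)
                            seq_puts(m, "\n   "); /* was: len += seq_printf(...) */
                    seq_printf(m, " %08x", res_map[i]);
            }
            seq_putc(m, '\n');

            return 0;
    }

A helper in this style would normally be invoked from a single_open()-style ->show() callback, which likewise just returns 0 and leaves overflow handling to the seq_file core.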