From 7bc98b97ed5dfe710025414de771baa674998892 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 16 Dec 2009 12:19:57 +0100 Subject: HWPOISON: Add Andi Kleen as hwpoison maintainer to MAINTAINERS Signed-off-by: Andi Kleen --- MAINTAINERS | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 0a32c3ec6b1..99a0e513cd7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2377,6 +2377,15 @@ W: http://www.kernel.org/pub/linux/kernel/people/fseidel/hdaps/ S: Maintained F: drivers/hwmon/hdaps.c +HWPOISON MEMORY FAILURE HANDLING +M: Andi Kleen +L: linux-mm@kvack.org +L: linux-kernel@vger.kernel.org +T: git git://git.kernel.org/pub/scm/linux/kernel/git/ak/linux-mce-2.6.git hwpoison +S: Maintained +F: mm/memory-failure.c +F: mm/hwpoison-inject.c + HYPERVISOR VIRTUAL CONSOLE DRIVER L: linuxppc-dev@ozlabs.org S: Odd Fixes -- cgit v1.2.3 From 588f9ce6ca61ecb4663ee6ef2f75d2d96c73151e Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 16 Dec 2009 12:19:57 +0100 Subject: HWPOISON: Be more aggressive at freeing non LRU caches shake_page handles more types of page caches than lru_drain_all() - per cpu page allocator pages - per CPU LRU Stops early when the page became free. Used in followon patches. Signed-off-by: Andi Kleen --- include/linux/mm.h | 1 + mm/memory-failure.c | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index 9d65ae4ba0e..68c84bb2ad3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1335,6 +1335,7 @@ extern void memory_failure(unsigned long pfn, int trapno); extern int __memory_failure(unsigned long pfn, int trapno, int ref); extern int sysctl_memory_failure_early_kill; extern int sysctl_memory_failure_recovery; +extern void shake_page(struct page *p); extern atomic_long_t mce_bad_pages; #endif /* __KERNEL__ */ diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 50d4f8d7024..38fcbb22eab 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -82,6 +82,28 @@ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno, return ret; } +/* + * When a unknown page type is encountered drain as many buffers as possible + * in the hope to turn the page into a LRU or free page, which we can handle. + */ +void shake_page(struct page *p) +{ + if (!PageSlab(p)) { + lru_add_drain_all(); + if (PageLRU(p)) + return; + drain_all_pages(); + if (PageLRU(p) || is_free_buddy_page(p)) + return; + } + /* + * Could call shrink_slab here (which would also + * shrink other caches). Unfortunately that might + * also access the corrupted page, which could be fatal. + */ +} +EXPORT_SYMBOL_GPL(shake_page); + /* * Kill all processes that have a poisoned page mapped and then isolate * the page. -- cgit v1.2.3 From 0e9052eb98a9986ec0669d030604f7a68f6df638 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 16 Dec 2009 12:19:57 +0100 Subject: page-types: add standard GPL license header Signed-off-by: Wu Fengguang Signed-off-by: Andi Kleen --- Documentation/vm/page-types.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/Documentation/vm/page-types.c b/Documentation/vm/page-types.c index 7a7d9bab32e..66e9358e214 100644 --- a/Documentation/vm/page-types.c +++ b/Documentation/vm/page-types.c @@ -1,11 +1,22 @@ /* * page-types: Tool for querying page flags * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; version 2. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should find a copy of v2 of the GNU General Public License somewhere on + * your Linux system; if not, write to the Free Software Foundation, Inc., 59 + * Temple Place, Suite 330, Boston, MA 02111-1307 USA. + * * Copyright (C) 2009 Intel corporation * * Authors: Wu Fengguang - * - * Released under the General Public License (GPL). */ #define _LARGEFILE64_SOURCE -- cgit v1.2.3 From 9b9a29ecd75e310f75a9243e1c3538ad34598fcb Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 16 Dec 2009 12:19:57 +0100 Subject: HWPOISON: remove the anonymous entry (PG_swapbacked && !PG_lru) pages should not happen. Better to treat them as unknown pages. Signed-off-by: Wu Fengguang Signed-off-by: Andi Kleen --- mm/memory-failure.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 38fcbb22eab..745f61082ce 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -609,7 +609,6 @@ static struct page_state { { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty }, { lru|dirty, lru, "clean LRU", me_pagecache_clean }, - { swapbacked, swapbacked, "anonymous", me_pagecache_clean }, /* * Catchall entry: must be at end. -- cgit v1.2.3 From a7560fc80f33cab33176ee78f146df22b28e3338 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 16 Dec 2009 12:19:57 +0100 Subject: HWPOISON: return ENXIO on invalid page number Use a different errno than the usual EIO for invalid page numbers. This is mainly for better reporting for the injector. This also avoids calling action_result() with invalid pfn. Signed-off-by: Wu Fengguang Signed-off-by: Andi Kleen --- mm/memory-failure.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 745f61082ce..275f4e2df8a 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -618,13 +618,11 @@ static struct page_state { static void action_result(unsigned long pfn, char *msg, int result) { - struct page *page = NULL; - if (pfn_valid(pfn)) - page = pfn_to_page(pfn); + struct page *page = pfn_to_page(pfn); printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n", pfn, - page && PageDirty(page) ? "dirty " : "", + PageDirty(page) ? "dirty " : "", msg, action_name[result]); } @@ -750,8 +748,10 @@ int __memory_failure(unsigned long pfn, int trapno, int ref) panic("Memory failure from trap %d on page %lx", trapno, pfn); if (!pfn_valid(pfn)) { - action_result(pfn, "memory outside kernel control", IGNORED); - return -EIO; + printk(KERN_ERR + "MCE %#lx: memory outside kernel control\n", + pfn); + return -ENXIO; } p = pfn_to_page(pfn); -- cgit v1.2.3 From bd1ce5f91f545730df4af492f774d9d32f5da3cb Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 16 Dec 2009 12:19:57 +0100 Subject: HWPOISON: avoid grabbing the page count multiple times during madvise injection If page is double referenced in madvise_hwpoison() and __memory_failure(), remove_mapping() will fail because it expects page_count=2. Fix it by not grabbing extra page count in __memory_failure(). Signed-off-by: Wu Fengguang Signed-off-by: Andi Kleen --- mm/madvise.c | 1 - mm/memory-failure.c | 8 ++++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index 35b1479b7c9..18970aec0d2 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -238,7 +238,6 @@ static int madvise_hwpoison(unsigned long start, unsigned long end) page_to_pfn(p), start); /* Ignore return value for now */ __memory_failure(page_to_pfn(p), 0, 1); - put_page(p); } return ret; } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 275f4e2df8a..4253e14fa70 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -627,7 +627,7 @@ static void action_result(unsigned long pfn, char *msg, int result) } static int page_action(struct page_state *ps, struct page *p, - unsigned long pfn, int ref) + unsigned long pfn) { int result; int count; @@ -635,7 +635,7 @@ static int page_action(struct page_state *ps, struct page *p, result = ps->action(p, pfn); action_result(pfn, ps->msg, result); - count = page_count(p) - 1 - ref; + count = page_count(p) - 1; if (count != 0) printk(KERN_ERR "MCE %#lx: %s page still referenced by %d users\n", @@ -773,7 +773,7 @@ int __memory_failure(unsigned long pfn, int trapno, int ref) * In fact it's dangerous to directly bump up page count from 0, * that may make page_freeze_refs()/page_unfreeze_refs() mismatch. */ - if (!get_page_unless_zero(compound_head(p))) { + if (!ref && !get_page_unless_zero(compound_head(p))) { action_result(pfn, "free or high order kernel", IGNORED); return PageBuddy(compound_head(p)) ? 0 : -EBUSY; } @@ -821,7 +821,7 @@ int __memory_failure(unsigned long pfn, int trapno, int ref) res = -EBUSY; for (ps = error_states;; ps++) { if (((p->flags | lru_flag)& ps->mask) == ps->res) { - res = page_action(ps, p, pfn, ref); + res = page_action(ps, p, pfn); break; } } -- cgit v1.2.3 From 82ba011b9041dd31c15e4f63797b08aa0a288e61 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 16 Dec 2009 12:19:57 +0100 Subject: HWPOISON: Turn ref argument into flags argument Now that "ref" is just a boolean turn it into a flags argument. First step is only a single flag that makes the code's intention more clear, but more may follow. Signed-off-by: Andi Kleen --- include/linux/mm.h | 5 ++++- mm/madvise.c | 2 +- mm/memory-failure.c | 5 +++-- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 68c84bb2ad3..135e19198cd 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1331,8 +1331,11 @@ extern int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim, size_t size); extern void refund_locked_memory(struct mm_struct *mm, size_t size); +enum mf_flags { + MF_COUNT_INCREASED = 1 << 0, +}; extern void memory_failure(unsigned long pfn, int trapno); -extern int __memory_failure(unsigned long pfn, int trapno, int ref); +extern int __memory_failure(unsigned long pfn, int trapno, int flags); extern int sysctl_memory_failure_early_kill; extern int sysctl_memory_failure_recovery; extern void shake_page(struct page *p); diff --git a/mm/madvise.c b/mm/madvise.c index 18970aec0d2..6ca34f0cd4a 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -237,7 +237,7 @@ static int madvise_hwpoison(unsigned long start, unsigned long end) printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n", page_to_pfn(p), start); /* Ignore return value for now */ - __memory_failure(page_to_pfn(p), 0, 1); + __memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED); } return ret; } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 4253e14fa70..3338c443272 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -737,7 +737,7 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn, ret != SWAP_SUCCESS, pfn); } -int __memory_failure(unsigned long pfn, int trapno, int ref) +int __memory_failure(unsigned long pfn, int trapno, int flags) { unsigned long lru_flag; struct page_state *ps; @@ -773,7 +773,8 @@ int __memory_failure(unsigned long pfn, int trapno, int ref) * In fact it's dangerous to directly bump up page count from 0, * that may make page_freeze_refs()/page_unfreeze_refs() mismatch. */ - if (!ref && !get_page_unless_zero(compound_head(p))) { + if (!(flags & MF_COUNT_INCREASED) && + !get_page_unless_zero(compound_head(p))) { action_result(pfn, "free or high order kernel", IGNORED); return PageBuddy(compound_head(p)) ? 0 : -EBUSY; } -- cgit v1.2.3 From 1668bfd5be9d8a52536c4865000fbbe065a3613b Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 16 Dec 2009 12:19:58 +0100 Subject: HWPOISON: abort on failed unmap Don't try to isolate a still mapped page. Otherwise we will hit the BUG_ON(page_mapped(page)) in __remove_from_page_cache(). Signed-off-by: Wu Fengguang Signed-off-by: Andi Kleen --- mm/memory-failure.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 3338c443272..b62287db87a 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -655,7 +655,7 @@ static int page_action(struct page_state *ps, struct page *p, * Do all that is necessary to remove user space mappings. Unmap * the pages and send SIGBUS to the processes if the data was dirty. */ -static void hwpoison_user_mappings(struct page *p, unsigned long pfn, +static int hwpoison_user_mappings(struct page *p, unsigned long pfn, int trapno) { enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; @@ -665,15 +665,18 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn, int i; int kill = 1; - if (PageReserved(p) || PageCompound(p) || PageSlab(p) || PageKsm(p)) - return; + if (PageReserved(p) || PageSlab(p)) + return SWAP_SUCCESS; /* * This check implies we don't kill processes if their pages * are in the swap cache early. Those are always late kills. */ if (!page_mapped(p)) - return; + return SWAP_SUCCESS; + + if (PageCompound(p) || PageKsm(p)) + return SWAP_FAIL; if (PageSwapCache(p)) { printk(KERN_ERR @@ -735,6 +738,8 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn, */ kill_procs_ao(&tokill, !!PageDirty(p), trapno, ret != SWAP_SUCCESS, pfn); + + return ret; } int __memory_failure(unsigned long pfn, int trapno, int flags) @@ -807,8 +812,13 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) /* * Now take care of user space mappings. + * Abort on fail: __remove_from_page_cache() assumes unmapped page. */ - hwpoison_user_mappings(p, pfn, trapno); + if (hwpoison_user_mappings(p, pfn, trapno) != SWAP_SUCCESS) { + printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn); + res = -EBUSY; + goto out; + } /* * Torn down by someone else? -- cgit v1.2.3 From db0480b3a61bd6ad86ead3b8bbad094ab0996932 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 16 Dec 2009 12:19:58 +0100 Subject: HWPOISON: comment the possible set_page_dirty() race Signed-off-by: Wu Fengguang Signed-off-by: Andi Kleen --- mm/memory-failure.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index b62287db87a..dc47415a551 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -687,6 +687,8 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, /* * Propagate the dirty bit from PTEs to struct page first, because we * need this to decide if we should kill or just drop the page. + * XXX: the dirty test could be racy: set_page_dirty() may not always + * be called inside page lock (it's recommended but not enforced). */ mapping = page_mapping(p); if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) { -- cgit v1.2.3 From 71f72525dfaaec012e23089c73331654ea7b12d3 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 16 Dec 2009 12:19:58 +0100 Subject: HWPOISON: comment dirty swapcache pages AK: Improve comment Signed-off-by: Wu Fengguang Signed-off-by: Andi Kleen --- mm/memory.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/memory.c b/mm/memory.c index a54b2c49844..db09106ed44 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2553,6 +2553,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, ret = VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); } else if (PageHWPoison(page)) { + /* + * hwpoisoned dirty swapcache pages are kept for killing + * owner processes (which may be unknown at hwpoison time) + */ ret = VM_FAULT_HWPOISON; delayacct_clear_flag(DELAYACCT_PF_SWAPIN); goto out_release; -- cgit v1.2.3 From dc2a1cbf7d862e9d0abea1d1b4c8712dfbb5a398 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 16 Dec 2009 12:19:58 +0100 Subject: HWPOISON: introduce delete_from_lru_cache() Introduce delete_from_lru_cache() to - clear PG_active, PG_unevictable to avoid complains at unpoison time - move the isolate_lru_page() call back to the handlers instead of the entrance of __memory_failure(), this is more hwpoison filter friendly Signed-off-by: Wu Fengguang Signed-off-by: Andi Kleen --- mm/memory-failure.c | 45 +++++++++++++++++++++++++++++++++++++-------- 1 file changed, 37 insertions(+), 8 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index dc47415a551..9a285f8cdbe 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -349,6 +349,30 @@ static const char *action_name[] = { [RECOVERED] = "Recovered", }; +/* + * XXX: It is possible that a page is isolated from LRU cache, + * and then kept in swap cache or failed to remove from page cache. + * The page count will stop it from being freed by unpoison. + * Stress tests should be aware of this memory leak problem. + */ +static int delete_from_lru_cache(struct page *p) +{ + if (!isolate_lru_page(p)) { + /* + * Clear sensible page flags, so that the buddy system won't + * complain when the page is unpoison-and-freed. + */ + ClearPageActive(p); + ClearPageUnevictable(p); + /* + * drop the page count elevated by isolate_lru_page() + */ + page_cache_release(p); + return 0; + } + return -EIO; +} + /* * Error hit kernel page. * Do nothing, try to be lucky and not touch this instead. For a few cases we @@ -393,6 +417,8 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) int ret = FAILED; struct address_space *mapping; + delete_from_lru_cache(p); + /* * For anonymous pages we're done the only reference left * should be the one m_f() holds. @@ -522,14 +548,20 @@ static int me_swapcache_dirty(struct page *p, unsigned long pfn) /* Trigger EIO in shmem: */ ClearPageUptodate(p); - return DELAYED; + if (!delete_from_lru_cache(p)) + return DELAYED; + else + return FAILED; } static int me_swapcache_clean(struct page *p, unsigned long pfn) { delete_from_swap_cache(p); - return RECOVERED; + if (!delete_from_lru_cache(p)) + return RECOVERED; + else + return FAILED; } /* @@ -746,7 +778,6 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, int __memory_failure(unsigned long pfn, int trapno, int flags) { - unsigned long lru_flag; struct page_state *ps; struct page *p; int res; @@ -796,13 +827,11 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) */ if (!PageLRU(p)) lru_add_drain_all(); - lru_flag = p->flags & lru; - if (isolate_lru_page(p)) { + if (!PageLRU(p)) { action_result(pfn, "non LRU", IGNORED); put_page(p); return -EBUSY; } - page_cache_release(p); /* * Lock the page and wait for writeback to finish. @@ -825,7 +854,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) /* * Torn down by someone else? */ - if ((lru_flag & lru) && !PageSwapCache(p) && p->mapping == NULL) { + if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) { action_result(pfn, "already truncated LRU", IGNORED); res = 0; goto out; @@ -833,7 +862,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) res = -EBUSY; for (ps = error_states;; ps++) { - if (((p->flags | lru_flag)& ps->mask) == ps->res) { + if ((p->flags & ps->mask) == ps->res) { res = page_action(ps, p, pfn); break; } -- cgit v1.2.3 From 95d01fc664b9476e0d18e3d745bb209a42a33588 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 16 Dec 2009 12:19:58 +0100 Subject: HWPOISON: remove the free buddy page handler The buddy page has already be handled in the very beginning. So remove redundant code. Signed-off-by: Wu Fengguang Signed-off-by: Andi Kleen --- mm/memory-failure.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 9a285f8cdbe..676ab394200 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -400,14 +400,6 @@ static int me_unknown(struct page *p, unsigned long pfn) return FAILED; } -/* - * Free memory - */ -static int me_free(struct page *p, unsigned long pfn) -{ - return DELAYED; -} - /* * Clean (or cleaned) page cache page. */ @@ -604,7 +596,6 @@ static int me_huge_page(struct page *p, unsigned long pfn) #define tail (1UL << PG_tail) #define compound (1UL << PG_compound) #define slab (1UL << PG_slab) -#define buddy (1UL << PG_buddy) #define reserved (1UL << PG_reserved) static struct page_state { @@ -614,7 +605,10 @@ static struct page_state { int (*action)(struct page *p, unsigned long pfn); } error_states[] = { { reserved, reserved, "reserved kernel", me_ignore }, - { buddy, buddy, "free kernel", me_free }, + /* + * free pages are specially detected outside this table: + * PG_buddy pages only make a small fraction of all free pages. + */ /* * Could in theory check if slab page is free or if we can drop -- cgit v1.2.3 From 8d22ba1b74aa9420b6032d856446564fb21f8090 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 16 Dec 2009 12:19:58 +0100 Subject: HWPOISON: detect free buddy pages explicitly Most free pages in the buddy system have no PG_buddy set. Introduce is_free_buddy_page() for detecting them reliably. CC: Nick Piggin CC: Mel Gorman Signed-off-by: Wu Fengguang Signed-off-by: Andi Kleen --- mm/internal.h | 3 +++ mm/memory-failure.c | 9 +++++++-- mm/page_alloc.c | 21 +++++++++++++++++++++ 3 files changed, 31 insertions(+), 2 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 4fe67a162cb..49b2ff776b7 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -50,6 +50,9 @@ extern void putback_lru_page(struct page *page); */ extern void __free_pages_bootmem(struct page *page, unsigned int order); extern void prep_compound_page(struct page *page, unsigned long order); +#ifdef CONFIG_MEMORY_FAILURE +extern bool is_free_buddy_page(struct page *page); +#endif /* diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 676ab394200..5055b940df5 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -807,8 +807,13 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) */ if (!(flags & MF_COUNT_INCREASED) && !get_page_unless_zero(compound_head(p))) { - action_result(pfn, "free or high order kernel", IGNORED); - return PageBuddy(compound_head(p)) ? 0 : -EBUSY; + if (is_free_buddy_page(p)) { + action_result(pfn, "free buddy", DELAYED); + return 0; + } else { + action_result(pfn, "high order kernel", IGNORED); + return -EBUSY; + } } /* diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 59d2e88fb47..6867b4d391f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5081,3 +5081,24 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) spin_unlock_irqrestore(&zone->lock, flags); } #endif + +#ifdef CONFIG_MEMORY_FAILURE +bool is_free_buddy_page(struct page *page) +{ + struct zone *zone = page_zone(page); + unsigned long pfn = page_to_pfn(page); + unsigned long flags; + int order; + + spin_lock_irqsave(&zone->lock, flags); + for (order = 0; order < MAX_ORDER; order++) { + struct page *page_head = page - (pfn & ((1 << order) - 1)); + + if (PageBuddy(page_head) && page_order(page_head) >= order) + break; + } + spin_unlock_irqrestore(&zone->lock, flags); + + return order < MAX_ORDER; +} +#endif -- cgit v1.2.3 From 847ce401df392b0704369fd3f75df614ac1414b4 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 16 Dec 2009 12:19:58 +0100 Subject: HWPOISON: Add unpoisoning support The unpoisoning interface is useful for stress testing tools to reclaim poisoned pages (to prevent OOM) There is no hardware level unpoisioning, so this cannot be used for real memory errors, only for software injected errors. Note that it may leak pages silently - those who have been removed from LRU cache, but not isolated from page cache/swap cache at hwpoison time. Especially the stress test of dirty swap cache pages shall reboot system before exhausting memory. AK: Fix comments, add documentation, add printks, rename symbol Signed-off-by: Wu Fengguang Signed-off-by: Andi Kleen --- Documentation/vm/hwpoison.txt | 16 ++++++++-- include/linux/mm.h | 1 + include/linux/page-flags.h | 2 +- mm/hwpoison-inject.c | 36 +++++++++++++++++++---- mm/memory-failure.c | 68 +++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 114 insertions(+), 9 deletions(-) diff --git a/Documentation/vm/hwpoison.txt b/Documentation/vm/hwpoison.txt index 3ffadf8da61..f047e75acb2 100644 --- a/Documentation/vm/hwpoison.txt +++ b/Documentation/vm/hwpoison.txt @@ -98,10 +98,22 @@ madvise(MADV_POISON, ....) hwpoison-inject module through debugfs - /sys/debug/hwpoison/corrupt-pfn -Inject hwpoison fault at PFN echoed into this file +/sys/debug/hwpoison/ +corrupt-pfn + +Inject hwpoison fault at PFN echoed into this file. + +unpoison-pfn + +Software-unpoison page at PFN echoed into this file. This +way a page can be reused again. +This only works for Linux injected failures, not for real +memory failures. + +Note these injection interfaces are not stable and might change between +kernel versions Architecture specific MCE injector diff --git a/include/linux/mm.h b/include/linux/mm.h index 135e19198cd..8cdb941fc7b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1336,6 +1336,7 @@ enum mf_flags { }; extern void memory_failure(unsigned long pfn, int trapno); extern int __memory_failure(unsigned long pfn, int trapno, int flags); +extern int unpoison_memory(unsigned long pfn); extern int sysctl_memory_failure_early_kill; extern int sysctl_memory_failure_recovery; extern void shake_page(struct page *p); diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 49e907bd067..f9df6308af9 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -275,7 +275,7 @@ PAGEFLAG_FALSE(Uncached) #ifdef CONFIG_MEMORY_FAILURE PAGEFLAG(HWPoison, hwpoison) -TESTSETFLAG(HWPoison, hwpoison) +TESTSCFLAG(HWPoison, hwpoison) #define __PG_HWPOISON (1UL << PG_hwpoison) #else PAGEFLAG_FALSE(HWPoison) diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index e1d85137f08..6e35e563bf5 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -4,7 +4,7 @@ #include #include -static struct dentry *hwpoison_dir, *corrupt_pfn; +static struct dentry *hwpoison_dir; static int hwpoison_inject(void *data, u64 val) { @@ -14,7 +14,16 @@ static int hwpoison_inject(void *data, u64 val) return __memory_failure(val, 18, 0); } +static int hwpoison_unpoison(void *data, u64 val) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + return unpoison_memory(val); +} + DEFINE_SIMPLE_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n"); +DEFINE_SIMPLE_ATTRIBUTE(unpoison_fops, NULL, hwpoison_unpoison, "%lli\n"); static void pfn_inject_exit(void) { @@ -24,16 +33,31 @@ static void pfn_inject_exit(void) static int pfn_inject_init(void) { + struct dentry *dentry; + hwpoison_dir = debugfs_create_dir("hwpoison", NULL); if (hwpoison_dir == NULL) return -ENOMEM; - corrupt_pfn = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir, + + /* + * Note that the below poison/unpoison interfaces do not involve + * hardware status change, hence do not require hardware support. + * They are mainly for testing hwpoison in software level. + */ + dentry = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir, NULL, &hwpoison_fops); - if (corrupt_pfn == NULL) { - pfn_inject_exit(); - return -ENOMEM; - } + if (!dentry) + goto fail; + + dentry = debugfs_create_file("unpoison-pfn", 0600, hwpoison_dir, + NULL, &unpoison_fops); + if (!dentry) + goto fail; + return 0; +fail: + pfn_inject_exit(); + return -ENOMEM; } module_init(pfn_inject_init); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 5055b940df5..ed6e91c87a5 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -838,6 +838,16 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) * and in many cases impossible, so we just avoid it here. */ lock_page_nosync(p); + + /* + * unpoison always clear PG_hwpoison inside page lock + */ + if (!PageHWPoison(p)) { + action_result(pfn, "unpoisoned", IGNORED); + res = 0; + goto out; + } + wait_on_page_writeback(p); /* @@ -893,3 +903,61 @@ void memory_failure(unsigned long pfn, int trapno) { __memory_failure(pfn, trapno, 0); } + +/** + * unpoison_memory - Unpoison a previously poisoned page + * @pfn: Page number of the to be unpoisoned page + * + * Software-unpoison a page that has been poisoned by + * memory_failure() earlier. + * + * This is only done on the software-level, so it only works + * for linux injected failures, not real hardware failures + * + * Returns 0 for success, otherwise -errno. + */ +int unpoison_memory(unsigned long pfn) +{ + struct page *page; + struct page *p; + int freeit = 0; + + if (!pfn_valid(pfn)) + return -ENXIO; + + p = pfn_to_page(pfn); + page = compound_head(p); + + if (!PageHWPoison(p)) { + pr_debug("MCE: Page was already unpoisoned %#lx\n", pfn); + return 0; + } + + if (!get_page_unless_zero(page)) { + if (TestClearPageHWPoison(p)) + atomic_long_dec(&mce_bad_pages); + pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn); + return 0; + } + + lock_page_nosync(page); + /* + * This test is racy because PG_hwpoison is set outside of page lock. + * That's acceptable because that won't trigger kernel panic. Instead, + * the PG_hwpoison page will be caught and isolated on the entrance to + * the free buddy page pool. + */ + if (TestClearPageHWPoison(p)) { + pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn); + atomic_long_dec(&mce_bad_pages); + freeit = 1; + } + unlock_page(page); + + put_page(page); + if (freeit) + put_page(page); + + return 0; +} +EXPORT_SYMBOL(unpoison_memory); -- cgit v1.2.3 From d95ea51e3a7e9ee051d19f1dd283ca61d1aa5ec6 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 16 Dec 2009 12:19:58 +0100 Subject: HWPOISON: make semantics of IGNORED/DELAYED clear Change semantics for - IGNORED: not handled; it may well be _unsafe_ - DELAYED: to be handled later; it is _safe_ With this change, - IGNORED/FAILED mean (maybe) Error - DELAYED/RECOVERED mean Success Signed-off-by: Wu Fengguang Signed-off-by: Andi Kleen --- mm/memory-failure.c | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index ed6e91c87a5..fd1ac1537f0 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -336,16 +336,16 @@ static void collect_procs(struct page *page, struct list_head *tokill) */ enum outcome { - FAILED, /* Error handling failed */ + IGNORED, /* Error: cannot be handled */ + FAILED, /* Error: handling failed */ DELAYED, /* Will be handled later */ - IGNORED, /* Error safely ignored */ RECOVERED, /* Successfully recovered */ }; static const char *action_name[] = { + [IGNORED] = "Ignored", [FAILED] = "Failed", [DELAYED] = "Delayed", - [IGNORED] = "Ignored", [RECOVERED] = "Recovered", }; @@ -379,14 +379,6 @@ static int delete_from_lru_cache(struct page *p) * could be more sophisticated. */ static int me_kernel(struct page *p, unsigned long pfn) -{ - return DELAYED; -} - -/* - * Already poisoned page. - */ -static int me_ignore(struct page *p, unsigned long pfn) { return IGNORED; } @@ -604,7 +596,7 @@ static struct page_state { char *msg; int (*action)(struct page *p, unsigned long pfn); } error_states[] = { - { reserved, reserved, "reserved kernel", me_ignore }, + { reserved, reserved, "reserved kernel", me_kernel }, /* * free pages are specially detected outside this table: * PG_buddy pages only make a small fraction of all free pages. @@ -788,7 +780,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) p = pfn_to_page(pfn); if (TestSetPageHWPoison(p)) { - action_result(pfn, "already hardware poisoned", IGNORED); + printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn); return 0; } @@ -843,7 +835,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) * unpoison always clear PG_hwpoison inside page lock */ if (!PageHWPoison(p)) { - action_result(pfn, "unpoisoned", IGNORED); + printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn); res = 0; goto out; } @@ -865,7 +857,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) */ if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) { action_result(pfn, "already truncated LRU", IGNORED); - res = 0; + res = -EBUSY; goto out; } -- cgit v1.2.3 From 138ce286eb6ee6d39ca4fb50516e93adaf6b605f Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 16 Dec 2009 12:19:58 +0100 Subject: HWPOISON: return 0 to indicate success reliably Return 0 to indicate success, when - action result is RECOVERED or DELAYED - no extra page reference Note that dirty swapcache pages are kept in swapcache, so can have one more reference count. Signed-off-by: Wu Fengguang Signed-off-by: Andi Kleen --- mm/memory-failure.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index fd1ac1537f0..edeaf2319e7 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -654,17 +654,21 @@ static int page_action(struct page_state *ps, struct page *p, action_result(pfn, ps->msg, result); count = page_count(p) - 1; - if (count != 0) + if (ps->action == me_swapcache_dirty && result == DELAYED) + count--; + if (count != 0) { printk(KERN_ERR "MCE %#lx: %s page still referenced by %d users\n", pfn, ps->msg, count); + result = FAILED; + } /* Could do more checks here if page looks ok */ /* * Could adjust zone counters here to correct for the missing page. */ - return result == RECOVERED ? 0 : -EBUSY; + return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY; } #define N_UNMAP_TRIES 5 -- cgit v1.2.3 From 7c116f2b0dbac4a1dd051c7a5e8cef37701cafd4 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 16 Dec 2009 12:19:59 +0100 Subject: HWPOISON: add fs/device filters Filesystem data/metadata present the most tricky-to-isolate pages. It requires careful code review and stress testing to get them right. The fs/device filter helps to target the stress tests to some specific filesystem pages. The filter condition is block device's major/minor numbers: - corrupt-filter-dev-major - corrupt-filter-dev-minor When specified (non -1), only page cache pages that belong to that device will be poisoned. The filters are checked reliably on the locked and refcounted page. Haicheng: clear PG_hwpoison and drop bad page count if filter not OK AK: Add documentation CC: Haicheng Li CC: Nick Piggin Signed-off-by: Wu Fengguang Signed-off-by: Andi Kleen --- Documentation/vm/hwpoison.txt | 7 ++++++ mm/hwpoison-inject.c | 11 ++++++++++ mm/internal.h | 3 +++ mm/memory-failure.c | 51 +++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 72 insertions(+) diff --git a/Documentation/vm/hwpoison.txt b/Documentation/vm/hwpoison.txt index f047e75acb2..fdf58046432 100644 --- a/Documentation/vm/hwpoison.txt +++ b/Documentation/vm/hwpoison.txt @@ -115,6 +115,13 @@ memory failures. Note these injection interfaces are not stable and might change between kernel versions +corrupt-filter-dev-major +corrupt-filter-dev-minor + +Only handle memory failures to pages associated with the file system defined +by block device major/minor. -1U is the wildcard value. +This should be only used for testing with artificial injection. + Architecture specific MCE injector x86 has mce-inject, mce-test diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index 6e35e563bf5..ac692a9b766 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -3,6 +3,7 @@ #include #include #include +#include "internal.h" static struct dentry *hwpoison_dir; @@ -54,6 +55,16 @@ static int pfn_inject_init(void) if (!dentry) goto fail; + dentry = debugfs_create_u32("corrupt-filter-dev-major", 0600, + hwpoison_dir, &hwpoison_filter_dev_major); + if (!dentry) + goto fail; + + dentry = debugfs_create_u32("corrupt-filter-dev-minor", 0600, + hwpoison_dir, &hwpoison_filter_dev_minor); + if (!dentry) + goto fail; + return 0; fail: pfn_inject_exit(); diff --git a/mm/internal.h b/mm/internal.h index 49b2ff776b7..814da335f05 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -250,3 +250,6 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, #define ZONE_RECLAIM_SOME 0 #define ZONE_RECLAIM_SUCCESS 1 #endif + +extern u32 hwpoison_filter_dev_major; +extern u32 hwpoison_filter_dev_minor; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index edeaf2319e7..82ac73436d0 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -48,6 +48,50 @@ int sysctl_memory_failure_recovery __read_mostly = 1; atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0); +u32 hwpoison_filter_dev_major = ~0U; +u32 hwpoison_filter_dev_minor = ~0U; +EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major); +EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor); + +static int hwpoison_filter_dev(struct page *p) +{ + struct address_space *mapping; + dev_t dev; + + if (hwpoison_filter_dev_major == ~0U && + hwpoison_filter_dev_minor == ~0U) + return 0; + + /* + * page_mapping() does not accept slab page + */ + if (PageSlab(p)) + return -EINVAL; + + mapping = page_mapping(p); + if (mapping == NULL || mapping->host == NULL) + return -EINVAL; + + dev = mapping->host->i_sb->s_dev; + if (hwpoison_filter_dev_major != ~0U && + hwpoison_filter_dev_major != MAJOR(dev)) + return -EINVAL; + if (hwpoison_filter_dev_minor != ~0U && + hwpoison_filter_dev_minor != MINOR(dev)) + return -EINVAL; + + return 0; +} + +int hwpoison_filter(struct page *p) +{ + if (hwpoison_filter_dev(p)) + return -EINVAL; + + return 0; +} +EXPORT_SYMBOL_GPL(hwpoison_filter); + /* * Send all the processes who have the page mapped an ``action optional'' * signal. @@ -843,6 +887,13 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) res = 0; goto out; } + if (hwpoison_filter(p)) { + if (TestClearPageHWPoison(p)) + atomic_long_dec(&mce_bad_pages); + unlock_page(p); + put_page(p); + return 0; + } wait_on_page_writeback(p); -- cgit v1.2.3 From 31d3d3484f9bd263925ecaa341500ac2df3a5d9b Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 16 Dec 2009 12:19:59 +0100 Subject: HWPOISON: limit hwpoison injector to known page types __memory_failure()'s workflow is set PG_hwpoison //... unset PG_hwpoison if didn't pass hwpoison filter That could kill unrelated process if it happens to page fault on the page with the (temporary) PG_hwpoison. The race should be big enough to appear in stress tests. Fix it by grabbing the page and checking filter at inject time. This also avoids the very noisy "Injecting memory failure..." messages. - we don't touch madvise() based injection, because the filters are generally not necessary for it. - if we want to apply the filters to h/w aided injection, we'd better to rearrange the logic in __memory_failure() instead of this patch. AK: fix documentation, use drain all, cleanups CC: Haicheng Li Signed-off-by: Wu Fengguang Signed-off-by: Andi Kleen --- Documentation/vm/hwpoison.txt | 3 ++- mm/hwpoison-inject.c | 41 +++++++++++++++++++++++++++++++++++++++-- mm/internal.h | 2 ++ 3 files changed, 43 insertions(+), 3 deletions(-) diff --git a/Documentation/vm/hwpoison.txt b/Documentation/vm/hwpoison.txt index fdf58046432..4ef7bb30d15 100644 --- a/Documentation/vm/hwpoison.txt +++ b/Documentation/vm/hwpoison.txt @@ -103,7 +103,8 @@ hwpoison-inject module through debugfs corrupt-pfn -Inject hwpoison fault at PFN echoed into this file. +Inject hwpoison fault at PFN echoed into this file. This does +some early filtering to avoid corrupted unintended pages in test suites. unpoison-pfn diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index ac692a9b766..2b6b3200fa6 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -3,16 +3,53 @@ #include #include #include +#include +#include #include "internal.h" static struct dentry *hwpoison_dir; static int hwpoison_inject(void *data, u64 val) { + unsigned long pfn = val; + struct page *p; + int err; + if (!capable(CAP_SYS_ADMIN)) return -EPERM; - printk(KERN_INFO "Injecting memory failure at pfn %Lx\n", val); - return __memory_failure(val, 18, 0); + + if (!pfn_valid(pfn)) + return -ENXIO; + + p = pfn_to_page(pfn); + /* + * This implies unable to support free buddy pages. + */ + if (!get_page_unless_zero(p)) + return 0; + + if (!PageLRU(p)) + shake_page(p); + /* + * This implies unable to support non-LRU pages. + */ + if (!PageLRU(p)) + return 0; + + /* + * do a racy check with elevated page count, to make sure PG_hwpoison + * will only be set for the targeted owner (or on a free page). + * We temporarily take page lock for try_get_mem_cgroup_from_page(). + * __memory_failure() will redo the check reliably inside page lock. + */ + lock_page(p); + err = hwpoison_filter(p); + unlock_page(p); + if (err) + return 0; + + printk(KERN_INFO "Injecting memory failure at pfn %lx\n", pfn); + return __memory_failure(pfn, 18, MF_COUNT_INCREASED); } static int hwpoison_unpoison(void *data, u64 val) diff --git a/mm/internal.h b/mm/internal.h index 814da335f05..04bbce8b8ba 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -251,5 +251,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, #define ZONE_RECLAIM_SUCCESS 1 #endif +extern int hwpoison_filter(struct page *p); + extern u32 hwpoison_filter_dev_major; extern u32 hwpoison_filter_dev_minor; -- cgit v1.2.3 From 1a9b5b7fe0c5dad8a635288882d36785dea742f9 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 16 Dec 2009 12:19:59 +0100 Subject: mm: export stable page flags Rename get_uflags() to stable_page_flags() and make it a global function for use in the hwpoison page flags filter, which need to compare user page flags with the value provided by user space. Also move KPF_* to kernel-page-flags.h for use by user space tools. Acked-by: Matt Mackall Signed-off-by: Andi Kleen CC: Nick Piggin CC: Christoph Lameter Signed-off-by: Wu Fengguang Signed-off-by: Andi Kleen --- fs/proc/page.c | 45 +++----------------------------------- include/linux/kernel-page-flags.h | 46 +++++++++++++++++++++++++++++++++++++++ include/linux/page-flags.h | 2 ++ 3 files changed, 51 insertions(+), 42 deletions(-) create mode 100644 include/linux/kernel-page-flags.h diff --git a/fs/proc/page.c b/fs/proc/page.c index 5033ce0d254..180cf5a0bd6 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include "internal.h" @@ -71,52 +72,12 @@ static const struct file_operations proc_kpagecount_operations = { * physical page flags. */ -/* These macros are used to decouple internal flags from exported ones */ - -#define KPF_LOCKED 0 -#define KPF_ERROR 1 -#define KPF_REFERENCED 2 -#define KPF_UPTODATE 3 -#define KPF_DIRTY 4 -#define KPF_LRU 5 -#define KPF_ACTIVE 6 -#define KPF_SLAB 7 -#define KPF_WRITEBACK 8 -#define KPF_RECLAIM 9 -#define KPF_BUDDY 10 - -/* 11-20: new additions in 2.6.31 */ -#define KPF_MMAP 11 -#define KPF_ANON 12 -#define KPF_SWAPCACHE 13 -#define KPF_SWAPBACKED 14 -#define KPF_COMPOUND_HEAD 15 -#define KPF_COMPOUND_TAIL 16 -#define KPF_HUGE 17 -#define KPF_UNEVICTABLE 18 -#define KPF_HWPOISON 19 -#define KPF_NOPAGE 20 - -#define KPF_KSM 21 - -/* kernel hacking assistances - * WARNING: subject to change, never rely on them! - */ -#define KPF_RESERVED 32 -#define KPF_MLOCKED 33 -#define KPF_MAPPEDTODISK 34 -#define KPF_PRIVATE 35 -#define KPF_PRIVATE_2 36 -#define KPF_OWNER_PRIVATE 37 -#define KPF_ARCH 38 -#define KPF_UNCACHED 39 - static inline u64 kpf_copy_bit(u64 kflags, int ubit, int kbit) { return ((kflags >> kbit) & 1) << ubit; } -static u64 get_uflags(struct page *page) +u64 stable_page_flags(struct page *page) { u64 k; u64 u; @@ -219,7 +180,7 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf, else ppage = NULL; - if (put_user(get_uflags(ppage), out)) { + if (put_user(stable_page_flags(ppage), out)) { ret = -EFAULT; break; } diff --git a/include/linux/kernel-page-flags.h b/include/linux/kernel-page-flags.h new file mode 100644 index 00000000000..bd92a89f4b0 --- /dev/null +++ b/include/linux/kernel-page-flags.h @@ -0,0 +1,46 @@ +#ifndef LINUX_KERNEL_PAGE_FLAGS_H +#define LINUX_KERNEL_PAGE_FLAGS_H + +/* + * Stable page flag bits exported to user space + */ + +#define KPF_LOCKED 0 +#define KPF_ERROR 1 +#define KPF_REFERENCED 2 +#define KPF_UPTODATE 3 +#define KPF_DIRTY 4 +#define KPF_LRU 5 +#define KPF_ACTIVE 6 +#define KPF_SLAB 7 +#define KPF_WRITEBACK 8 +#define KPF_RECLAIM 9 +#define KPF_BUDDY 10 + +/* 11-20: new additions in 2.6.31 */ +#define KPF_MMAP 11 +#define KPF_ANON 12 +#define KPF_SWAPCACHE 13 +#define KPF_SWAPBACKED 14 +#define KPF_COMPOUND_HEAD 15 +#define KPF_COMPOUND_TAIL 16 +#define KPF_HUGE 17 +#define KPF_UNEVICTABLE 18 +#define KPF_HWPOISON 19 +#define KPF_NOPAGE 20 + +#define KPF_KSM 21 + +/* kernel hacking assistances + * WARNING: subject to change, never rely on them! + */ +#define KPF_RESERVED 32 +#define KPF_MLOCKED 33 +#define KPF_MAPPEDTODISK 34 +#define KPF_PRIVATE 35 +#define KPF_PRIVATE_2 36 +#define KPF_OWNER_PRIVATE 37 +#define KPF_ARCH 38 +#define KPF_UNCACHED 39 + +#endif /* LINUX_KERNEL_PAGE_FLAGS_H */ diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index f9df6308af9..feee2ba8d06 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -282,6 +282,8 @@ PAGEFLAG_FALSE(HWPoison) #define __PG_HWPOISON 0 #endif +u64 stable_page_flags(struct page *page); + static inline int PageUptodate(struct page *page) { int ret = test_bit(PG_uptodate, &(page)->flags); -- cgit v1.2.3 From 478c5ffc0b50527bd2390f2daa46cc16276b8413 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 16 Dec 2009 12:19:59 +0100 Subject: HWPOISON: add page flags filter When specified, only poison pages if ((page_flags & mask) == value). - corrupt-filter-flags-mask - corrupt-filter-flags-value This allows stress testing of many kinds of pages. Strictly speaking, the buddy pages requires taking zone lock, to avoid setting PG_hwpoison on a "was buddy but now allocated to someone" page. However we can just do nothing because we set PG_locked in the beginning, this prevents the page allocator from allocating it to someone. (It will BUG() on the unexpected PG_locked, which is fine for hwpoison testing.) [AK: Add select PROC_PAGE_MONITOR to satisfy dependency] CC: Nick Piggin Signed-off-by: Wu Fengguang Signed-off-by: Andi Kleen --- Documentation/vm/hwpoison.txt | 10 ++++++++++ mm/Kconfig | 1 + mm/hwpoison-inject.c | 10 ++++++++++ mm/internal.h | 2 ++ mm/memory-failure.c | 20 ++++++++++++++++++++ 5 files changed, 43 insertions(+) diff --git a/Documentation/vm/hwpoison.txt b/Documentation/vm/hwpoison.txt index 4ef7bb30d15..f454d3cd4d6 100644 --- a/Documentation/vm/hwpoison.txt +++ b/Documentation/vm/hwpoison.txt @@ -123,6 +123,16 @@ Only handle memory failures to pages associated with the file system defined by block device major/minor. -1U is the wildcard value. This should be only used for testing with artificial injection. + +corrupt-filter-flags-mask +corrupt-filter-flags-value + +When specified, only poison pages if ((page_flags & mask) == value). +This allows stress testing of many kinds of pages. The page_flags +are the same as in /proc/kpageflags. The flag bits are defined in +include/linux/kernel-page-flags.h and documented in +Documentation/vm/pagemap.txt + Architecture specific MCE injector x86 has mce-inject, mce-test diff --git a/mm/Kconfig b/mm/Kconfig index 2310984591e..8cea7fde06e 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -253,6 +253,7 @@ config MEMORY_FAILURE config HWPOISON_INJECT tristate "Poison pages injector" depends on MEMORY_FAILURE && DEBUG_KERNEL + select PROC_PAGE_MONITOR config NOMMU_INITIAL_TRIM_EXCESS int "Turn on mmap() excess space trimming before booting" diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index 2b6b3200fa6..c4dfd89f654 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -102,6 +102,16 @@ static int pfn_inject_init(void) if (!dentry) goto fail; + dentry = debugfs_create_u64("corrupt-filter-flags-mask", 0600, + hwpoison_dir, &hwpoison_filter_flags_mask); + if (!dentry) + goto fail; + + dentry = debugfs_create_u64("corrupt-filter-flags-value", 0600, + hwpoison_dir, &hwpoison_filter_flags_value); + if (!dentry) + goto fail; + return 0; fail: pfn_inject_exit(); diff --git a/mm/internal.h b/mm/internal.h index 04bbce8b8ba..b2027c73119 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -255,3 +255,5 @@ extern int hwpoison_filter(struct page *p); extern u32 hwpoison_filter_dev_major; extern u32 hwpoison_filter_dev_minor; +extern u64 hwpoison_filter_flags_mask; +extern u64 hwpoison_filter_flags_value; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 82ac73436d0..22d2b2028e5 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -50,8 +51,12 @@ atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0); u32 hwpoison_filter_dev_major = ~0U; u32 hwpoison_filter_dev_minor = ~0U; +u64 hwpoison_filter_flags_mask; +u64 hwpoison_filter_flags_value; EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major); EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor); +EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask); +EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value); static int hwpoison_filter_dev(struct page *p) { @@ -83,11 +88,26 @@ static int hwpoison_filter_dev(struct page *p) return 0; } +static int hwpoison_filter_flags(struct page *p) +{ + if (!hwpoison_filter_flags_mask) + return 0; + + if ((stable_page_flags(p) & hwpoison_filter_flags_mask) == + hwpoison_filter_flags_value) + return 0; + else + return -EINVAL; +} + int hwpoison_filter(struct page *p) { if (hwpoison_filter_dev(p)) return -EINVAL; + if (hwpoison_filter_flags(p)) + return -EINVAL; + return 0; } EXPORT_SYMBOL_GPL(hwpoison_filter); -- cgit v1.2.3 From e42d9d5d47961fb5db0be65b56dd52fe7b2421f1 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 16 Dec 2009 12:19:59 +0100 Subject: memcg: rename and export try_get_mem_cgroup_from_page() So that the hwpoison injector can get mem_cgroup for arbitrary page and thus know whether it is owned by some mem_cgroup task(s). [AK: Merged with latest git tree] CC: KOSAKI Motohiro CC: Hugh Dickins CC: Daisuke Nishimura CC: Balbir Singh Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Wu Fengguang Signed-off-by: Andi Kleen --- include/linux/memcontrol.h | 6 ++++++ mm/memcontrol.c | 11 ++++------- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index bf9213b2db8..fc9bae82ac4 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -68,6 +68,7 @@ extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, extern void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask); int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem); +extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page); extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); static inline @@ -189,6 +190,11 @@ mem_cgroup_move_lists(struct page *page, enum lru_list from, enum lru_list to) { } +static inline struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) +{ + return NULL; +} + static inline int mm_match_cgroup(struct mm_struct *mm, struct mem_cgroup *mem) { return 1; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e0c2066495e..b5ac61ce734 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1379,25 +1379,22 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) return container_of(css, struct mem_cgroup, css); } -static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page) +struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) { - struct mem_cgroup *mem; + struct mem_cgroup *mem = NULL; struct page_cgroup *pc; unsigned short id; swp_entry_t ent; VM_BUG_ON(!PageLocked(page)); - if (!PageSwapCache(page)) - return NULL; - pc = lookup_page_cgroup(page); lock_page_cgroup(pc); if (PageCgroupUsed(pc)) { mem = pc->mem_cgroup; if (mem && !css_tryget(&mem->css)) mem = NULL; - } else { + } else if (PageSwapCache(page)) { ent.val = page_private(page); id = lookup_swap_cgroup(ent); rcu_read_lock(); @@ -1743,7 +1740,7 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, */ if (!PageSwapCache(page)) goto charge_cur_mm; - mem = try_get_mem_cgroup_from_swapcache(page); + mem = try_get_mem_cgroup_from_page(page); if (!mem) goto charge_cur_mm; *ptr = mem; -- cgit v1.2.3 From d324236b3333e87c8825b35f2104184734020d35 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 16 Dec 2009 12:19:59 +0100 Subject: memcg: add accessor to mem_cgroup.css So that an outside user can free the reference count grabbed by try_get_mem_cgroup_from_page(). CC: KOSAKI Motohiro CC: Hugh Dickins CC: Daisuke Nishimura CC: Balbir Singh Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Wu Fengguang Signed-off-by: Andi Kleen --- include/linux/memcontrol.h | 7 +++++++ mm/memcontrol.c | 5 +++++ 2 files changed, 12 insertions(+) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index fc9bae82ac4..2c30a1116d8 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -81,6 +81,8 @@ int mm_match_cgroup(const struct mm_struct *mm, const struct mem_cgroup *cgroup) return cgroup == mem; } +extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem); + extern int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr); extern void mem_cgroup_end_migration(struct mem_cgroup *mem, @@ -206,6 +208,11 @@ static inline int task_in_mem_cgroup(struct task_struct *task, return 1; } +static inline struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem) +{ + return NULL; +} + static inline int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b5ac61ce734..9eee80d6d49 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -282,6 +282,11 @@ mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) return &mem->info.nodeinfo[nid]->zoneinfo[zid]; } +struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem) +{ + return &mem->css; +} + static struct mem_cgroup_per_zone * page_cgroup_zoneinfo(struct page_cgroup *pc) { -- cgit v1.2.3 From 4fd466eb46a6a917c317a87fb94bfc7252a0f7ed Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 16 Dec 2009 12:19:59 +0100 Subject: HWPOISON: add memory cgroup filter The hwpoison test suite need to inject hwpoison to a collection of selected task pages, and must not touch pages not owned by them and thus kill important system processes such as init. (But it's OK to mis-hwpoison free/unowned pages as well as shared clean pages. Mis-hwpoison of shared dirty pages will kill all tasks, so the test suite will target all or non of such tasks in the first place.) The memory cgroup serves this purpose well. We can put the target processes under the control of a memory cgroup, and tell the hwpoison injection code to only kill pages associated with some active memory cgroup. The prerequisite for doing hwpoison stress tests with mem_cgroup is, the mem_cgroup code tracks task pages _accurately_ (unless page is locked). Which we believe is/should be true. The benefits are simplification of hwpoison injector code. Also the mem_cgroup code will automatically be tested by hwpoison test cases. The alternative interfaces pin-pfn/unpin-pfn can also delegate the (process and page flags) filtering functions reliably to user space. However prototype implementation shows that this scheme adds more complexity than we wanted. Example test case: mkdir /cgroup/hwpoison usemem -m 100 -s 1000 & echo `jobs -p` > /cgroup/hwpoison/tasks memcg_ino=$(ls -id /cgroup/hwpoison | cut -f1 -d' ') echo $memcg_ino > /debug/hwpoison/corrupt-filter-memcg page-types -p `pidof init` --hwpoison # shall do nothing page-types -p `pidof usemem` --hwpoison # poison its pages [AK: Fix documentation] [Add fix for problem noticed by Li Zefan ; dentry in the css could be NULL] CC: KOSAKI Motohiro CC: Hugh Dickins CC: Daisuke Nishimura CC: Balbir Singh CC: KAMEZAWA Hiroyuki CC: Li Zefan CC: Paul Menage CC: Nick Piggin CC: Andi Kleen Signed-off-by: Wu Fengguang Signed-off-by: Andi Kleen --- Documentation/vm/hwpoison.txt | 16 +++++++++++++++ mm/hwpoison-inject.c | 7 +++++++ mm/internal.h | 1 + mm/memory-failure.c | 46 +++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 70 insertions(+) diff --git a/Documentation/vm/hwpoison.txt b/Documentation/vm/hwpoison.txt index f454d3cd4d6..989e5afe740 100644 --- a/Documentation/vm/hwpoison.txt +++ b/Documentation/vm/hwpoison.txt @@ -123,6 +123,22 @@ Only handle memory failures to pages associated with the file system defined by block device major/minor. -1U is the wildcard value. This should be only used for testing with artificial injection. +corrupt-filter-memcg + +Limit injection to pages owned by memgroup. Specified by inode number +of the memcg. + +Example: + mkdir /cgroup/hwpoison + + usemem -m 100 -s 1000 & + echo `jobs -p` > /cgroup/hwpoison/tasks + + memcg_ino=$(ls -id /cgroup/hwpoison | cut -f1 -d' ') + echo $memcg_ino > /debug/hwpoison/corrupt-filter-memcg + + page-types -p `pidof init` --hwpoison # shall do nothing + page-types -p `pidof usemem` --hwpoison # poison its pages corrupt-filter-flags-mask corrupt-filter-flags-value diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index c4dfd89f654..c838735ac31 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -112,6 +112,13 @@ static int pfn_inject_init(void) if (!dentry) goto fail; +#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP + dentry = debugfs_create_u64("corrupt-filter-memcg", 0600, + hwpoison_dir, &hwpoison_filter_memcg); + if (!dentry) + goto fail; +#endif + return 0; fail: pfn_inject_exit(); diff --git a/mm/internal.h b/mm/internal.h index b2027c73119..5a6761bea6a 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -257,3 +257,4 @@ extern u32 hwpoison_filter_dev_major; extern u32 hwpoison_filter_dev_minor; extern u64 hwpoison_filter_flags_mask; extern u64 hwpoison_filter_flags_value; +extern u64 hwpoison_filter_memcg; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 22d2b2028e5..117ef159846 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -100,6 +100,49 @@ static int hwpoison_filter_flags(struct page *p) return -EINVAL; } +/* + * This allows stress tests to limit test scope to a collection of tasks + * by putting them under some memcg. This prevents killing unrelated/important + * processes such as /sbin/init. Note that the target task may share clean + * pages with init (eg. libc text), which is harmless. If the target task + * share _dirty_ pages with another task B, the test scheme must make sure B + * is also included in the memcg. At last, due to race conditions this filter + * can only guarantee that the page either belongs to the memcg tasks, or is + * a freed page. + */ +#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP +u64 hwpoison_filter_memcg; +EXPORT_SYMBOL_GPL(hwpoison_filter_memcg); +static int hwpoison_filter_task(struct page *p) +{ + struct mem_cgroup *mem; + struct cgroup_subsys_state *css; + unsigned long ino; + + if (!hwpoison_filter_memcg) + return 0; + + mem = try_get_mem_cgroup_from_page(p); + if (!mem) + return -EINVAL; + + css = mem_cgroup_css(mem); + /* root_mem_cgroup has NULL dentries */ + if (!css->cgroup->dentry) + return -EINVAL; + + ino = css->cgroup->dentry->d_inode->i_ino; + css_put(css); + + if (ino != hwpoison_filter_memcg) + return -EINVAL; + + return 0; +} +#else +static int hwpoison_filter_task(struct page *p) { return 0; } +#endif + int hwpoison_filter(struct page *p) { if (hwpoison_filter_dev(p)) @@ -108,6 +151,9 @@ int hwpoison_filter(struct page *p) if (hwpoison_filter_flags(p)) return -EINVAL; + if (hwpoison_filter_task(p)) + return -EINVAL; + return 0; } EXPORT_SYMBOL_GPL(hwpoison_filter); -- cgit v1.2.3 From 1bfe5febe34d2be2120803c10720e179186357c9 Mon Sep 17 00:00:00 2001 From: Haicheng Li Date: Wed, 16 Dec 2009 12:19:59 +0100 Subject: HWPOISON: add an interface to switch off/on all the page filters In some use cases, user doesn't need extra filtering. E.g. user program can inject errors through madvise syscall to its own pages, however it might not know what the page state exactly is or which inode the page belongs to. So introduce an one-off interface "corrupt-filter-enable". Echo 0 to switch off page filters, and echo 1 to switch on the filters. [AK: changed default to 0] Signed-off-by: Haicheng Li Signed-off-by: Wu Fengguang Signed-off-by: Andi Kleen --- mm/hwpoison-inject.c | 5 +++++ mm/internal.h | 1 + mm/memory-failure.c | 5 +++++ 3 files changed, 11 insertions(+) diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index c838735ac31..c597f46ac18 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -92,6 +92,11 @@ static int pfn_inject_init(void) if (!dentry) goto fail; + dentry = debugfs_create_u32("corrupt-filter-enable", 0600, + hwpoison_dir, &hwpoison_filter_enable); + if (!dentry) + goto fail; + dentry = debugfs_create_u32("corrupt-filter-dev-major", 0600, hwpoison_dir, &hwpoison_filter_dev_major); if (!dentry) diff --git a/mm/internal.h b/mm/internal.h index 5a6761bea6a..6a697bb97fc 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -258,3 +258,4 @@ extern u32 hwpoison_filter_dev_minor; extern u64 hwpoison_filter_flags_mask; extern u64 hwpoison_filter_flags_value; extern u64 hwpoison_filter_memcg; +extern u32 hwpoison_filter_enable; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 117ef159846..2d5f1223bf4 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -49,10 +49,12 @@ int sysctl_memory_failure_recovery __read_mostly = 1; atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0); +u32 hwpoison_filter_enable = 0; u32 hwpoison_filter_dev_major = ~0U; u32 hwpoison_filter_dev_minor = ~0U; u64 hwpoison_filter_flags_mask; u64 hwpoison_filter_flags_value; +EXPORT_SYMBOL_GPL(hwpoison_filter_enable); EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major); EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor); EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask); @@ -145,6 +147,9 @@ static int hwpoison_filter_task(struct page *p) { return 0; } int hwpoison_filter(struct page *p) { + if (!hwpoison_filter_enable) + return 0; + if (hwpoison_filter_dev(p)) return -EINVAL; -- cgit v1.2.3 From d15f107d97bd74c74d8f5144843d372666ddbdac Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 16 Dec 2009 12:20:00 +0100 Subject: HWPOISON: Use get_user_page_fast in hwpoison madvise The previous version didn't take the mmap_sem before calling gup(), which is racy. Use get_user_pages_fast() instead which doesn't need any locks. This is also faster of course, but then it doesn't really matter because this is just a testing path. Based on report from Nick Piggin. Cc: npiggin@suse.de Signed-off-by: Andi Kleen --- mm/madvise.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index 6ca34f0cd4a..7964e36ba91 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -230,8 +230,7 @@ static int madvise_hwpoison(unsigned long start, unsigned long end) return -EPERM; for (; start < end; start += PAGE_SIZE) { struct page *p; - int ret = get_user_pages(current, current->mm, start, 1, - 0, 0, &p, NULL); + int ret = get_user_pages_fast(start, 1, 0, &p); if (ret != 1) return ret; printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n", -- cgit v1.2.3 From 413f9efbc513d330f00352bb7cba060a729999d3 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 16 Dec 2009 12:20:00 +0100 Subject: HWPOISON: mention HWPoison in Kconfig entry Signed-off-by: Andi Kleen --- mm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/Kconfig b/mm/Kconfig index 8cea7fde06e..43ea8c3a2bb 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -251,7 +251,7 @@ config MEMORY_FAILURE special hardware support and typically ECC memory. config HWPOISON_INJECT - tristate "Poison pages injector" + tristate "HWPoison pages injector" depends on MEMORY_FAILURE && DEBUG_KERNEL select PROC_PAGE_MONITOR -- cgit v1.2.3 From fe194d3e100dea323d7b2de96d3b44d0c067ba7a Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 16 Dec 2009 12:20:00 +0100 Subject: HWPOISON: Use correct name for MADV_HWPOISON in documentation Signed-off-by: Andi Kleen --- Documentation/vm/hwpoison.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/vm/hwpoison.txt b/Documentation/vm/hwpoison.txt index 989e5afe740..12f9ba20ccb 100644 --- a/Documentation/vm/hwpoison.txt +++ b/Documentation/vm/hwpoison.txt @@ -92,7 +92,7 @@ PR_MCE_KILL_GET Testing: -madvise(MADV_POISON, ....) +madvise(MADV_HWPOISON, ....) (as root) Poison a page in the process for testing -- cgit v1.2.3 From 0474a60ec704324577782b1057d05b574388d552 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 16 Dec 2009 12:20:00 +0100 Subject: HWPOISON: Use new shake_page in memory_failure shake_page handles more types of page caches than the much simpler lru_add_drain_all: - slab (quite inefficiently for now) - any other caches with a shrinker callback - per cpu page allocator pages - per CPU LRU Use this call to try to turn pages into free or LRU pages. Then handle the case of the page becoming free after drain everything. Signed-off-by: Andi Kleen --- mm/memory-failure.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 2d5f1223bf4..ded1d387b4c 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -936,8 +936,15 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) * walked by the page reclaim code, however that's not a big loss. */ if (!PageLRU(p)) - lru_add_drain_all(); + shake_page(p); if (!PageLRU(p)) { + /* + * shake_page could have turned it free. + */ + if (is_free_buddy_page(p)) { + action_result(pfn, "free buddy, 2nd try", DELAYED); + return 0; + } action_result(pfn, "non LRU", IGNORED); put_page(p); return -EBUSY; -- cgit v1.2.3 From 2326c467df4ff814dc07cf1bdaa1e6e0a9c9f21c Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 16 Dec 2009 12:20:00 +0100 Subject: HWPOISON: Undefine short-hand macros after use to avoid namespace conflict Signed-off-by: Andi Kleen --- mm/memory-failure.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index ded1d387b4c..b5c3b6bd511 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -749,6 +749,19 @@ static struct page_state { { 0, 0, "unknown page state", me_unknown }, }; +#undef dirty +#undef sc +#undef unevict +#undef mlock +#undef writeback +#undef lru +#undef swapbacked +#undef head +#undef tail +#undef compound +#undef slab +#undef reserved + static void action_result(unsigned long pfn, char *msg, int result) { struct page *page = pfn_to_page(pfn); -- cgit v1.2.3 From facb6011f3993947283fa15d039dacb4ad140230 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 16 Dec 2009 12:20:00 +0100 Subject: HWPOISON: Add soft page offline support This is a simpler, gentler variant of memory_failure() for soft page offlining controlled from user space. It doesn't kill anything, just tries to invalidate and if that doesn't work migrate the page away. This is useful for predictive failure analysis, where a page has a high rate of corrected errors, but hasn't gone bad yet. Instead it can be offlined early and avoided. The offlining is controlled from sysfs, including a new generic entry point for hard page offlining for symmetry too. We use the page isolate facility to prevent re-allocation race. Normally this is only used by memory hotplug. To avoid races with memory allocation I am using lock_system_sleep(). This avoids the situation where memory hotplug is about to isolate a page range and then hwpoison undoes that work. This is a big hammer currently, but the simplest solution currently. When the page is not free or LRU we try to free pages from slab and other caches. The slab freeing is currently quite dumb and does not try to focus on the specific slab cache which might own the page. This could be potentially improved later. Thanks to Fengguang Wu and Haicheng Li for some fixes. [Added fix from Andrew Morton to adapt to new migrate_pages prototype] Signed-off-by: Andi Kleen --- .../ABI/testing/sysfs-memory-page-offline | 44 +++++ drivers/base/memory.c | 61 +++++++ include/linux/mm.h | 3 +- mm/hwpoison-inject.c | 2 +- mm/memory-failure.c | 194 ++++++++++++++++++++- 5 files changed, 297 insertions(+), 7 deletions(-) create mode 100644 Documentation/ABI/testing/sysfs-memory-page-offline diff --git a/Documentation/ABI/testing/sysfs-memory-page-offline b/Documentation/ABI/testing/sysfs-memory-page-offline new file mode 100644 index 00000000000..e14703f12fd --- /dev/null +++ b/Documentation/ABI/testing/sysfs-memory-page-offline @@ -0,0 +1,44 @@ +What: /sys/devices/system/memory/soft_offline_page +Date: Sep 2009 +KernelVersion: 2.6.33 +Contact: andi@firstfloor.org +Description: + Soft-offline the memory page containing the physical address + written into this file. Input is a hex number specifying the + physical address of the page. The kernel will then attempt + to soft-offline it, by moving the contents elsewhere or + dropping it if possible. The kernel will then be placed + on the bad page list and never be reused. + + The offlining is done in kernel specific granuality. + Normally it's the base page size of the kernel, but + this might change. + + The page must be still accessible, not poisoned. The + kernel will never kill anything for this, but rather + fail the offline. Return value is the size of the + number, or a error when the offlining failed. Reading + the file is not allowed. + +What: /sys/devices/system/memory/hard_offline_page +Date: Sep 2009 +KernelVersion: 2.6.33 +Contact: andi@firstfloor.org +Description: + Hard-offline the memory page containing the physical + address written into this file. Input is a hex number + specifying the physical address of the page. The + kernel will then attempt to hard-offline the page, by + trying to drop the page or killing any owner or + triggering IO errors if needed. Note this may kill + any processes owning the page. The kernel will avoid + to access this page assuming it's poisoned by the + hardware. + + The offlining is done in kernel specific granuality. + Normally it's the base page size of the kernel, but + this might change. + + Return value is the size of the number, or a error when + the offlining failed. + Reading the file is not allowed. diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 989429cfed8..c4c8f2e1dd1 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -341,6 +341,64 @@ static inline int memory_probe_init(void) } #endif +#ifdef CONFIG_MEMORY_FAILURE +/* + * Support for offlining pages of memory + */ + +/* Soft offline a page */ +static ssize_t +store_soft_offline_page(struct class *class, const char *buf, size_t count) +{ + int ret; + u64 pfn; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (strict_strtoull(buf, 0, &pfn) < 0) + return -EINVAL; + pfn >>= PAGE_SHIFT; + if (!pfn_valid(pfn)) + return -ENXIO; + ret = soft_offline_page(pfn_to_page(pfn), 0); + return ret == 0 ? count : ret; +} + +/* Forcibly offline a page, including killing processes. */ +static ssize_t +store_hard_offline_page(struct class *class, const char *buf, size_t count) +{ + int ret; + u64 pfn; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (strict_strtoull(buf, 0, &pfn) < 0) + return -EINVAL; + pfn >>= PAGE_SHIFT; + ret = __memory_failure(pfn, 0, 0); + return ret ? ret : count; +} + +static CLASS_ATTR(soft_offline_page, 0644, NULL, store_soft_offline_page); +static CLASS_ATTR(hard_offline_page, 0644, NULL, store_hard_offline_page); + +static __init int memory_fail_init(void) +{ + int err; + + err = sysfs_create_file(&memory_sysdev_class.kset.kobj, + &class_attr_soft_offline_page.attr); + if (!err) + err = sysfs_create_file(&memory_sysdev_class.kset.kobj, + &class_attr_hard_offline_page.attr); + return err; +} +#else +static inline int memory_fail_init(void) +{ + return 0; +} +#endif + /* * Note that phys_device is optional. It is here to allow for * differentiation between which *physical* devices each @@ -471,6 +529,9 @@ int __init memory_dev_init(void) } err = memory_probe_init(); + if (!ret) + ret = err; + err = memory_fail_init(); if (!ret) ret = err; err = block_size_init(); diff --git a/include/linux/mm.h b/include/linux/mm.h index 8cdb941fc7b..849b4a61bd8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1339,8 +1339,9 @@ extern int __memory_failure(unsigned long pfn, int trapno, int flags); extern int unpoison_memory(unsigned long pfn); extern int sysctl_memory_failure_early_kill; extern int sysctl_memory_failure_recovery; -extern void shake_page(struct page *p); +extern void shake_page(struct page *p, int access); extern atomic_long_t mce_bad_pages; +extern int soft_offline_page(struct page *page, int flags); #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index c597f46ac18..a77fe3f9e21 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -29,7 +29,7 @@ static int hwpoison_inject(void *data, u64 val) return 0; if (!PageLRU(p)) - shake_page(p); + shake_page(p, 0); /* * This implies unable to support non-LRU pages. */ diff --git a/mm/memory-failure.c b/mm/memory-failure.c index b5c3b6bd511..bcce2875583 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -41,6 +41,9 @@ #include #include #include +#include +#include +#include #include "internal.h" int sysctl_memory_failure_early_kill __read_mostly = 0; @@ -201,7 +204,7 @@ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno, * When a unknown page type is encountered drain as many buffers as possible * in the hope to turn the page into a LRU or free page, which we can handle. */ -void shake_page(struct page *p) +void shake_page(struct page *p, int access) { if (!PageSlab(p)) { lru_add_drain_all(); @@ -211,11 +214,19 @@ void shake_page(struct page *p) if (PageLRU(p) || is_free_buddy_page(p)) return; } + /* - * Could call shrink_slab here (which would also - * shrink other caches). Unfortunately that might - * also access the corrupted page, which could be fatal. + * Only all shrink_slab here (which would also + * shrink other caches) if access is not potentially fatal. */ + if (access) { + int nr; + do { + nr = shrink_slab(1000, GFP_KERNEL, 1000); + if (page_count(p) == 0) + break; + } while (nr > 10); + } } EXPORT_SYMBOL_GPL(shake_page); @@ -949,7 +960,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) * walked by the page reclaim code, however that's not a big loss. */ if (!PageLRU(p)) - shake_page(p); + shake_page(p, 0); if (!PageLRU(p)) { /* * shake_page could have turned it free. @@ -1099,3 +1110,176 @@ int unpoison_memory(unsigned long pfn) return 0; } EXPORT_SYMBOL(unpoison_memory); + +static struct page *new_page(struct page *p, unsigned long private, int **x) +{ + return alloc_pages(GFP_HIGHUSER_MOVABLE, 0); +} + +/* + * Safely get reference count of an arbitrary page. + * Returns 0 for a free page, -EIO for a zero refcount page + * that is not free, and 1 for any other page type. + * For 1 the page is returned with increased page count, otherwise not. + */ +static int get_any_page(struct page *p, unsigned long pfn, int flags) +{ + int ret; + + if (flags & MF_COUNT_INCREASED) + return 1; + + /* + * The lock_system_sleep prevents a race with memory hotplug, + * because the isolation assumes there's only a single user. + * This is a big hammer, a better would be nicer. + */ + lock_system_sleep(); + + /* + * Isolate the page, so that it doesn't get reallocated if it + * was free. + */ + set_migratetype_isolate(p); + if (!get_page_unless_zero(compound_head(p))) { + if (is_free_buddy_page(p)) { + pr_debug("get_any_page: %#lx free buddy page\n", pfn); + /* Set hwpoison bit while page is still isolated */ + SetPageHWPoison(p); + ret = 0; + } else { + pr_debug("get_any_page: %#lx: unknown zero refcount page type %lx\n", + pfn, p->flags); + ret = -EIO; + } + } else { + /* Not a free page */ + ret = 1; + } + unset_migratetype_isolate(p); + unlock_system_sleep(); + return ret; +} + +/** + * soft_offline_page - Soft offline a page. + * @page: page to offline + * @flags: flags. Same as memory_failure(). + * + * Returns 0 on success, otherwise negated errno. + * + * Soft offline a page, by migration or invalidation, + * without killing anything. This is for the case when + * a page is not corrupted yet (so it's still valid to access), + * but has had a number of corrected errors and is better taken + * out. + * + * The actual policy on when to do that is maintained by + * user space. + * + * This should never impact any application or cause data loss, + * however it might take some time. + * + * This is not a 100% solution for all memory, but tries to be + * ``good enough'' for the majority of memory. + */ +int soft_offline_page(struct page *page, int flags) +{ + int ret; + unsigned long pfn = page_to_pfn(page); + + ret = get_any_page(page, pfn, flags); + if (ret < 0) + return ret; + if (ret == 0) + goto done; + + /* + * Page cache page we can handle? + */ + if (!PageLRU(page)) { + /* + * Try to free it. + */ + put_page(page); + shake_page(page, 1); + + /* + * Did it turn free? + */ + ret = get_any_page(page, pfn, 0); + if (ret < 0) + return ret; + if (ret == 0) + goto done; + } + if (!PageLRU(page)) { + pr_debug("soft_offline: %#lx: unknown non LRU page type %lx\n", + pfn, page->flags); + return -EIO; + } + + lock_page(page); + wait_on_page_writeback(page); + + /* + * Synchronized using the page lock with memory_failure() + */ + if (PageHWPoison(page)) { + unlock_page(page); + put_page(page); + pr_debug("soft offline: %#lx page already poisoned\n", pfn); + return -EBUSY; + } + + /* + * Try to invalidate first. This should work for + * non dirty unmapped page cache pages. + */ + ret = invalidate_inode_page(page); + unlock_page(page); + + /* + * Drop count because page migration doesn't like raised + * counts. The page could get re-allocated, but if it becomes + * LRU the isolation will just fail. + * RED-PEN would be better to keep it isolated here, but we + * would need to fix isolation locking first. + */ + put_page(page); + if (ret == 1) { + ret = 0; + pr_debug("soft_offline: %#lx: invalidated\n", pfn); + goto done; + } + + /* + * Simple invalidation didn't work. + * Try to migrate to a new page instead. migrate.c + * handles a large number of cases for us. + */ + ret = isolate_lru_page(page); + if (!ret) { + LIST_HEAD(pagelist); + + list_add(&page->lru, &pagelist); + ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); + if (ret) { + pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", + pfn, ret, page->flags); + if (ret > 0) + ret = -EIO; + } + } else { + pr_debug("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", + pfn, ret, page_count(page), page->flags); + } + if (ret) + return ret; + +done: + atomic_long_add(1, &mce_bad_pages); + SetPageHWPoison(page); + /* keep elevated page count for bad page */ + return ret; +} -- cgit v1.2.3 From afcf938ee0aac4ef95b1a23bac704c6fbeb26de6 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 16 Dec 2009 12:20:00 +0100 Subject: HWPOISON: Add a madvise() injector for soft page offlining Process based injection is much easier to handle for test programs, who can first bring a page into a specific state and then test. So add a new MADV_SOFT_OFFLINE to soft offline a page, similar to the existing hard offline injector. Signed-off-by: Andi Kleen --- include/asm-generic/mman-common.h | 1 + mm/madvise.c | 15 ++++++++++++--- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/include/asm-generic/mman-common.h b/include/asm-generic/mman-common.h index 20111265afd..3da9e2742fa 100644 --- a/include/asm-generic/mman-common.h +++ b/include/asm-generic/mman-common.h @@ -40,6 +40,7 @@ #define MADV_DONTFORK 10 /* don't inherit across fork */ #define MADV_DOFORK 11 /* do inherit across fork */ #define MADV_HWPOISON 100 /* poison a page for testing */ +#define MADV_SOFT_OFFLINE 101 /* soft offline page for testing */ #define MADV_MERGEABLE 12 /* KSM may merge identical pages */ #define MADV_UNMERGEABLE 13 /* KSM may not merge identical pages */ diff --git a/mm/madvise.c b/mm/madvise.c index 7964e36ba91..319528b8db7 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -222,7 +223,7 @@ static long madvise_remove(struct vm_area_struct *vma, /* * Error injection support for memory error handling. */ -static int madvise_hwpoison(unsigned long start, unsigned long end) +static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end) { int ret = 0; @@ -233,6 +234,14 @@ static int madvise_hwpoison(unsigned long start, unsigned long end) int ret = get_user_pages_fast(start, 1, 0, &p); if (ret != 1) return ret; + if (bhv == MADV_SOFT_OFFLINE) { + printk(KERN_INFO "Soft offlining page %lx at %lx\n", + page_to_pfn(p), start); + ret = soft_offline_page(p, MF_COUNT_INCREASED); + if (ret) + break; + continue; + } printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n", page_to_pfn(p), start); /* Ignore return value for now */ @@ -333,8 +342,8 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) size_t len; #ifdef CONFIG_MEMORY_FAILURE - if (behavior == MADV_HWPOISON) - return madvise_hwpoison(start, start+len_in); + if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE) + return madvise_hwpoison(behavior, start, start+len_in); #endif if (!madvise_behavior_valid(behavior)) return error; -- cgit v1.2.3 From 0d57eb8dfcb92e3dd928d792f4ed2b2fec680bb7 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 16 Dec 2009 12:20:01 +0100 Subject: HWPOISON: Don't do early filtering if filter is disabled Signed-off-by: Andi Kleen --- mm/hwpoison-inject.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index a77fe3f9e21..10ea71905c1 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -18,6 +18,8 @@ static int hwpoison_inject(void *data, u64 val) if (!capable(CAP_SYS_ADMIN)) return -EPERM; + if (!hwpoison_filter_enable) + goto inject; if (!pfn_valid(pfn)) return -ENXIO; @@ -48,6 +50,7 @@ static int hwpoison_inject(void *data, u64 val) if (err) return 0; +inject: printk(KERN_INFO "Injecting memory failure at pfn %lx\n", pfn); return __memory_failure(pfn, 18, MF_COUNT_INCREASED); } -- cgit v1.2.3 From 12686d153abff397fa0927c620d5a3de84910b72 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 16 Dec 2009 12:20:01 +0100 Subject: HWPOISON: Try to allocate migration page on the same node Signed-off-by: Andi Kleen --- mm/memory-failure.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index bcce2875583..006430b972a 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1113,7 +1113,8 @@ EXPORT_SYMBOL(unpoison_memory); static struct page *new_page(struct page *p, unsigned long private, int **x) { - return alloc_pages(GFP_HIGHUSER_MOVABLE, 0); + int nid = page_to_nid(p); + return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0); } /* -- cgit v1.2.3 From f2c03debdfb387fa2e35cac6382779072b8b9209 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 16 Dec 2009 12:20:01 +0100 Subject: HWPOISON: Remove stray phrase in a comment Better to have complete sentences. Signed-off-by: Andi Kleen --- mm/memory-failure.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 006430b972a..6a0466ed5bf 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -325,7 +325,6 @@ static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno, * In case something went wrong with munmapping * make sure the process doesn't catch the * signal and then access the memory. Just kill it. - * the signal handlers */ if (fail || tk->addr_valid == 0) { printk(KERN_ERR -- cgit v1.2.3