From d466f2fcb32cd97fd586bfa33f5dba3ac78aadb0 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 16 Sep 2009 11:50:03 +0200 Subject: HWPOISON: Add page flag for poisoned pages Hardware poisoned pages need special handling in the VM and shouldn't be touched again. This requires a new page flag. Define it here. The page flags wars seem to be over, so it shouldn't be a problem to get a new one. v2: Add TestSetHWPoison (suggested by Johannes Weiner) Acked-by: Christoph Lameter Signed-off-by: Andi Kleen --- include/linux/page-flags.h | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 2b87acfc5f8..9bc5fd9fdbf 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -51,6 +51,9 @@ * PG_buddy is set to indicate that the page is free and in the buddy system * (see mm/page_alloc.c). * + * PG_hwpoison indicates that a page got corrupted in hardware and contains + * data with incorrect ECC bits that triggered a machine check. Accessing is + * not safe since it may cause another machine check. Don't touch! */ /* @@ -101,6 +104,9 @@ enum pageflags { #endif #ifdef CONFIG_ARCH_USES_PG_UNCACHED PG_uncached, /* Page has been mapped as uncached */ +#endif +#ifdef CONFIG_MEMORY_FAILURE + PG_hwpoison, /* hardware poisoned page. Don't touch */ #endif __NR_PAGEFLAGS, @@ -263,6 +269,15 @@ PAGEFLAG(Uncached, uncached) PAGEFLAG_FALSE(Uncached) #endif +#ifdef CONFIG_MEMORY_FAILURE +PAGEFLAG(HWPoison, hwpoison) +TESTSETFLAG(HWPoison, hwpoison) +#define __PG_HWPOISON (1UL << PG_hwpoison) +#else +PAGEFLAG_FALSE(HWPoison) +#define __PG_HWPOISON 0 +#endif + static inline int PageUptodate(struct page *page) { int ret = test_bit(PG_uptodate, &(page)->flags); @@ -387,7 +402,7 @@ static inline void __ClearPageTail(struct page *page) 1 << PG_private | 1 << PG_private_2 | \ 1 << PG_buddy | 1 << PG_writeback | 1 << PG_reserved | \ 1 << PG_slab | 1 << PG_swapcache | 1 << PG_active | \ - 1 << PG_unevictable | __PG_MLOCKED) + 1 << PG_unevictable | __PG_MLOCKED | __PG_HWPOISON) /* * Flags checked when a page is prepped for return by the page allocator. -- cgit v1.2.3 From 10be22dfe1e6ad978269dc275147e0ed049187bb Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 16 Sep 2009 11:50:04 +0200 Subject: HWPOISON: Export some rmap vma locking to outside world Needed for later patch that walks rmap entries on its own. This used to be very frowned upon, but memory-failure.c does some rather specialized rmap walking and rmap has been stable for quite some time, so I think it's ok now to export it. Signed-off-by: Andi Kleen --- include/linux/rmap.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index bf116d0dbf2..8dff2ffab82 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -112,6 +112,12 @@ int page_mkclean(struct page *); */ int try_to_munlock(struct page *); +/* + * Called by memory-failure.c to kill processes. + */ +struct anon_vma *page_lock_anon_vma(struct page *page); +void page_unlock_anon_vma(struct anon_vma *anon_vma); + #else /* !CONFIG_MMU */ #define anon_vma_init() do {} while (0) -- cgit v1.2.3 From a7420aa54dbf699a5a05feba3c859b6baaa3938c Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 16 Sep 2009 11:50:05 +0200 Subject: HWPOISON: Add support for poison swap entries v2 Memory migration uses special swap entry types to trigger special actions on page faults. Extend this mechanism to also support poisoned swap entries, to trigger poison handling on page faults. This allows follow-on patches to prevent processes from faulting in poisoned pages again. v2: Fix overflow in MAX_SWAPFILES (Fengguang Wu) v3: Better overflow fix (Hidehiro Kawai) Signed-off-by: Andi Kleen --- include/linux/swap.h | 34 ++++++++++++++++++++++++++++------ include/linux/swapops.h | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 7c15334f3ff..f077e454c65 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -34,15 +34,37 @@ static inline int current_is_kswapd(void) * the type/offset into the pte as 5/27 as well. */ #define MAX_SWAPFILES_SHIFT 5 -#ifndef CONFIG_MIGRATION -#define MAX_SWAPFILES (1 << MAX_SWAPFILES_SHIFT) + +/* + * Use some of the swap files numbers for other purposes. This + * is a convenient way to hook into the VM to trigger special + * actions on faults. + */ + +/* + * NUMA node memory migration support + */ +#ifdef CONFIG_MIGRATION +#define SWP_MIGRATION_NUM 2 +#define SWP_MIGRATION_READ (MAX_SWAPFILES + SWP_HWPOISON_NUM) +#define SWP_MIGRATION_WRITE (MAX_SWAPFILES + SWP_HWPOISON_NUM + 1) #else -/* Use last two entries for page migration swap entries */ -#define MAX_SWAPFILES ((1 << MAX_SWAPFILES_SHIFT)-2) -#define SWP_MIGRATION_READ MAX_SWAPFILES -#define SWP_MIGRATION_WRITE (MAX_SWAPFILES + 1) +#define SWP_MIGRATION_NUM 0 #endif +/* + * Handling of hardware poisoned pages with memory corruption. + */ +#ifdef CONFIG_MEMORY_FAILURE +#define SWP_HWPOISON_NUM 1 +#define SWP_HWPOISON MAX_SWAPFILES +#else +#define SWP_HWPOISON_NUM 0 +#endif + +#define MAX_SWAPFILES \ + ((1 << MAX_SWAPFILES_SHIFT) - SWP_MIGRATION_NUM - SWP_HWPOISON_NUM) + /* * Magic header for a swap area. The first part of the union is * what the swap magic looks like for the old (limited to 128MB) diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 6ec39ab27b4..cd42e30b7c6 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -131,3 +131,41 @@ static inline int is_write_migration_entry(swp_entry_t entry) #endif +#ifdef CONFIG_MEMORY_FAILURE +/* + * Support for hardware poisoned pages + */ +static inline swp_entry_t make_hwpoison_entry(struct page *page) +{ + BUG_ON(!PageLocked(page)); + return swp_entry(SWP_HWPOISON, page_to_pfn(page)); +} + +static inline int is_hwpoison_entry(swp_entry_t entry) +{ + return swp_type(entry) == SWP_HWPOISON; +} +#else + +static inline swp_entry_t make_hwpoison_entry(struct page *page) +{ + return swp_entry(0, 0); +} + +static inline int is_hwpoison_entry(swp_entry_t swp) +{ + return 0; +} +#endif + +#if defined(CONFIG_MEMORY_FAILURE) || defined(CONFIG_MIGRATION) +static inline int non_swap_entry(swp_entry_t entry) +{ + return swp_type(entry) >= MAX_SWAPFILES; +} +#else +static inline int non_swap_entry(swp_entry_t entry) +{ + return 0; +} +#endif -- cgit v1.2.3 From d1737fdbec7f90edc52dd0c5c3767457f28e78d8 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 16 Sep 2009 11:50:06 +0200 Subject: HWPOISON: Add basic support for poisoned pages in fault handler v3 - Add a new VM_FAULT_HWPOISON error code to handle_mm_fault. Right now architectures have to explicitely enable poison page support, so this is forward compatible to all architectures. They only need to add it when they enable poison page support. - Add poison page handling in swap in fault code v2: Add missing delayacct_clear_flag (Hidehiro Kawai) v3: Really use delayacct_clear_flag (Hidehiro Kawai) Signed-off-by: Andi Kleen --- include/linux/mm.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 9a72cc78e6b..082b68cb5ff 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -685,11 +685,12 @@ static inline int page_mapped(struct page *page) #define VM_FAULT_SIGBUS 0x0002 #define VM_FAULT_MAJOR 0x0004 #define VM_FAULT_WRITE 0x0008 /* Special case for get_user_pages */ +#define VM_FAULT_HWPOISON 0x0010 /* Hit poisoned page */ #define VM_FAULT_NOPAGE 0x0100 /* ->fault installed the pte, not return page */ #define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */ -#define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS) +#define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_HWPOISON) /* * Can be called by the pagefault handler when it gets a VM_FAULT_OOM. -- cgit v1.2.3 From 14fa31b89c5ae79e4131da41761378a6df674352 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 16 Sep 2009 11:50:10 +0200 Subject: HWPOISON: Use bitmask/action code for try_to_unmap behaviour try_to_unmap currently has multiple modi (migration, munlock, normal unmap) which are selected by magic flag variables. The logic is not very straight forward, because each of these flag change multiple behaviours (e.g. migration turns off aging, not only sets up migration ptes etc.) Also the different flags interact in magic ways. A later patch in this series adds another mode to try_to_unmap, so this becomes quickly unmanageable. Replace the different flags with a action code (migration, munlock, munmap) and some additional flags as modifiers (ignore mlock, ignore aging). This makes the logic more straight forward and allows easier extension to new behaviours. Change all the caller to declare what they want to do. This patch is supposed to be a nop in behaviour. If anyone can prove it is not that would be a bug. Cc: Lee.Schermerhorn@hp.com Cc: npiggin@suse.de Signed-off-by: Andi Kleen --- include/linux/rmap.h | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 8dff2ffab82..4c4a2d4d289 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -85,7 +85,18 @@ static inline void page_dup_rmap(struct page *page, struct vm_area_struct *vma, */ int page_referenced(struct page *, int is_locked, struct mem_cgroup *cnt, unsigned long *vm_flags); -int try_to_unmap(struct page *, int ignore_refs); +enum ttu_flags { + TTU_UNMAP = 0, /* unmap mode */ + TTU_MIGRATION = 1, /* migration mode */ + TTU_MUNLOCK = 2, /* munlock mode */ + TTU_ACTION_MASK = 0xff, + + TTU_IGNORE_MLOCK = (1 << 8), /* ignore mlock */ + TTU_IGNORE_ACCESS = (1 << 9), /* don't age */ +}; +#define TTU_ACTION(x) ((x) & TTU_ACTION_MASK) + +int try_to_unmap(struct page *, enum ttu_flags flags); /* * Called from mm/filemap_xip.c to unmap empty zero page -- cgit v1.2.3 From 888b9f7c58ebe8303bad817cd554df887a683957 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 16 Sep 2009 11:50:11 +0200 Subject: HWPOISON: Handle hardware poisoned pages in try_to_unmap When a page has the poison bit set replace the PTE with a poison entry. This causes the right error handling to be done later when a process runs into it. v2: add a new flag to not do that (needed for the memory-failure handler later) (Fengguang) v3: remove unnecessary is_migration_entry() test (Fengguang, Minchan) Reviewed-by: Minchan Kim Reviewed-by: Wu Fengguang Signed-off-by: Andi Kleen --- include/linux/rmap.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 4c4a2d4d289..ce989f1fc2e 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -93,6 +93,7 @@ enum ttu_flags { TTU_IGNORE_MLOCK = (1 << 8), /* ignore mlock */ TTU_IGNORE_ACCESS = (1 << 9), /* don't age */ + TTU_IGNORE_HWPOISON = (1 << 10),/* corrupted page is recoverable */ }; #define TTU_ACTION(x) ((x) & TTU_ACTION_MASK) -- cgit v1.2.3 From 750b4987b0cd4d408e54cb83a80a067cbe690feb Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 16 Sep 2009 11:50:12 +0200 Subject: HWPOISON: Refactor truncate to allow direct truncating of page v2 Extract out truncate_inode_page() out of the truncate path so that it can be used by memory-failure.c [AK: description, headers, fix typos] v2: Some white space changes from Fengguang Wu Signed-off-by: Andi Kleen --- include/linux/mm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 082b68cb5ff..8cbc0aafd5b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -794,6 +794,8 @@ static inline void unmap_shared_mapping_range(struct address_space *mapping, extern int vmtruncate(struct inode * inode, loff_t offset); extern int vmtruncate_range(struct inode * inode, loff_t offset, loff_t end); +int truncate_inode_page(struct address_space *mapping, struct page *page); + #ifdef CONFIG_MMU extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags); -- cgit v1.2.3 From 83f786680aec8d030184f7ced1a0a3dd8ac81764 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 16 Sep 2009 11:50:13 +0200 Subject: HWPOISON: Add invalidate_inode_page Add a simple way to invalidate a single page This is just a refactoring of the truncate.c code. Originally from Fengguang, modified by Andi Kleen. Signed-off-by: Andi Kleen --- include/linux/mm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 8cbc0aafd5b..b05bbde0296 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -796,6 +796,8 @@ extern int vmtruncate_range(struct inode * inode, loff_t offset, loff_t end); int truncate_inode_page(struct address_space *mapping, struct page *page); +int invalidate_inode_page(struct page *page); + #ifdef CONFIG_MMU extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags); -- cgit v1.2.3 From 257187362123f15d9d1e09918cf87cebbea4e786 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 16 Sep 2009 11:50:13 +0200 Subject: HWPOISON: Define a new error_remove_page address space op for async truncation Truncating metadata pages is not safe right now before we haven't audited all file systems. To enable truncation only for data address space define a new address_space callback error_remove_page. This is used for memory_failure.c memory error handling. This can be then set to truncate_inode_page() This patch just defines the new operation and adds documentation. Callers and users come in followon patches. Signed-off-by: Andi Kleen --- include/linux/fs.h | 1 + include/linux/mm.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index b21cf6b9c80..4f47afd3764 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -595,6 +595,7 @@ struct address_space_operations { int (*launder_page) (struct page *); int (*is_partially_uptodate) (struct page *, read_descriptor_t *, unsigned long); + int (*error_remove_page)(struct address_space *, struct page *); }; /* diff --git a/include/linux/mm.h b/include/linux/mm.h index b05bbde0296..a16018f7d61 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -795,6 +795,7 @@ extern int vmtruncate(struct inode * inode, loff_t offset); extern int vmtruncate_range(struct inode * inode, loff_t offset, loff_t end); int truncate_inode_page(struct address_space *mapping, struct page *page); +int generic_error_remove_page(struct address_space *mapping, struct page *page); int invalidate_inode_page(struct page *page); -- cgit v1.2.3 From 4db96cf077aa938b11fe7ac79ecc9b29ec00fbab Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 16 Sep 2009 11:50:14 +0200 Subject: HWPOISON: Add PR_MCE_KILL prctl to control early kill behaviour per process This allows processes to override their early/late kill behaviour on hardware memory errors. Typically applications which are memory error aware is better of with early kill (see the error as soon as possible), all others with late kill (only see the error when the error is really impacting execution) There's a global sysctl, but this way an application can set its specific policy. We're using two bits, one to signify that the process stated its intention and that I also made the prctl future proof by enforcing the unused arguments are 0. The state is inherited to children. Note this makes us officially run out of process flags on 32bit, but the next patch can easily add another field. Manpage patch will be supplied separately. Signed-off-by: Andi Kleen --- include/linux/prctl.h | 2 ++ include/linux/sched.h | 2 ++ 2 files changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/prctl.h b/include/linux/prctl.h index b00df4c79c6..3dc303197e6 100644 --- a/include/linux/prctl.h +++ b/include/linux/prctl.h @@ -88,4 +88,6 @@ #define PR_TASK_PERF_COUNTERS_DISABLE 31 #define PR_TASK_PERF_COUNTERS_ENABLE 32 +#define PR_MCE_KILL 33 + #endif /* _LINUX_PRCTL_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index f3d74bd04d1..29eae73c951 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1687,6 +1687,7 @@ extern cputime_t task_gtime(struct task_struct *p); #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ #define PF_VCPU 0x00000010 /* I'm a virtual CPU */ #define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */ +#define PF_MCE_PROCESS 0x00000080 /* process policy on mce errors */ #define PF_SUPERPRIV 0x00000100 /* used super-user privileges */ #define PF_DUMPCORE 0x00000200 /* dumped core */ #define PF_SIGNALED 0x00000400 /* killed by a signal */ @@ -1706,6 +1707,7 @@ extern cputime_t task_gtime(struct task_struct *p); #define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */ #define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */ #define PF_THREAD_BOUND 0x04000000 /* Thread bound to specific cpu */ +#define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */ #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezeable */ -- cgit v1.2.3 From 6a46079cf57a7f7758e8b926980a4f852f89b34d Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 16 Sep 2009 11:50:15 +0200 Subject: HWPOISON: The high level memory error handler in the VM v7 Add the high level memory handler that poisons pages that got corrupted by hardware (typically by a two bit flip in a DIMM or a cache) on the Linux level. The goal is to prevent everyone from accessing these pages in the future. This done at the VM level by marking a page hwpoisoned and doing the appropriate action based on the type of page it is. The code that does this is portable and lives in mm/memory-failure.c To quote the overview comment: High level machine check handler. Handles pages reported by the hardware as being corrupted usually due to a 2bit ECC memory or cache failure. This focuses on pages detected as corrupted in the background. When the current CPU tries to consume corruption the currently running process can just be killed directly instead. This implies that if the error cannot be handled for some reason it's safe to just ignore it because no corruption has been consumed yet. Instead when that happens another machine check will happen. Handles page cache pages in various states. The tricky part here is that we can access any page asynchronous to other VM users, because memory failures could happen anytime and anywhere, possibly violating some of their assumptions. This is why this code has to be extremely careful. Generally it tries to use normal locking rules, as in get the standard locks, even if that means the error handling takes potentially a long time. Some of the operations here are somewhat inefficient and have non linear algorithmic complexity, because the data structures have not been optimized for this case. This is in particular the case for the mapping from a vma to a process. Since this case is expected to be rare we hope we can get away with this. There are in principle two strategies to kill processes on poison: - just unmap the data and wait for an actual reference before killing - kill as soon as corruption is detected. Both have advantages and disadvantages and should be used in different situations. Right now both are implemented and can be switched with a new sysctl vm.memory_failure_early_kill The default is early kill. The patch does some rmap data structure walking on its own to collect processes to kill. This is unusual because normally all rmap data structure knowledge is in rmap.c only. I put it here for now to keep everything together and rmap knowledge has been seeping out anyways Includes contributions from Johannes Weiner, Chris Mason, Fengguang Wu, Nick Piggin (who did a lot of great work) and others. Cc: npiggin@suse.de Cc: riel@redhat.com Signed-off-by: Andi Kleen Acked-by: Rik van Riel Reviewed-by: Hidehiro Kawai --- include/linux/mm.h | 7 +++++++ include/linux/rmap.h | 1 + 2 files changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index a16018f7d61..1ffca03f34b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1309,5 +1309,12 @@ void vmemmap_populate_print_last(void); extern int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim, size_t size); extern void refund_locked_memory(struct mm_struct *mm, size_t size); + +extern void memory_failure(unsigned long pfn, int trapno); +extern int __memory_failure(unsigned long pfn, int trapno, int ref); +extern int sysctl_memory_failure_early_kill; +extern int sysctl_memory_failure_recovery; +extern atomic_long_t mce_bad_pages; + #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/include/linux/rmap.h b/include/linux/rmap.h index ce989f1fc2e..3c1004e5074 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -129,6 +129,7 @@ int try_to_munlock(struct page *); */ struct anon_vma *page_lock_anon_vma(struct page *page); void page_unlock_anon_vma(struct anon_vma *anon_vma); +int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma); #else /* !CONFIG_MMU */ -- cgit v1.2.3