From bab5e2d6522bc3cb892c1e8aaafecab05bed9d85 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 6 Aug 2014 16:03:22 -0700 Subject: kernel/auditfilter.c: replace count*size kmalloc by kcalloc kcalloc manages count*sizeof overflow. Signed-off-by: Fabian Frederick Cc: Eric Paris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/auditfilter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 8e9bc9c3dbb7..c447cd9848d1 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -106,7 +106,7 @@ static inline struct audit_entry *audit_init_entry(u32 field_count) if (unlikely(!entry)) return NULL; - fields = kzalloc(sizeof(*fields) * field_count, GFP_KERNEL); + fields = kcalloc(field_count, sizeof(*fields), GFP_KERNEL); if (unlikely(!fields)) { kfree(entry); return NULL; @@ -160,7 +160,7 @@ static __u32 *classes[AUDIT_SYSCALL_CLASSES]; int __init audit_register_class(int class, unsigned *list) { - __u32 *p = kzalloc(AUDIT_BITMASK_SIZE * sizeof(__u32), GFP_KERNEL); + __u32 *p = kcalloc(AUDIT_BITMASK_SIZE, sizeof(__u32), GFP_KERNEL); if (!p) return -ENOMEM; while (*list != ~0U) { -- cgit v1.2.3 From 3e584064840de14856451ea9ce443c5d4f7e31ff Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 6 Aug 2014 16:03:24 -0700 Subject: fs/fscache: make ctl_table static fscache_sysctls and fscache_sysctls_root are only used in main.c. Signed-off-by: Fabian Frederick Cc: David Howells Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fscache/main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/fscache/main.c b/fs/fscache/main.c index a31b83c5cbd9..b39d487ccfb0 100644 --- a/fs/fscache/main.c +++ b/fs/fscache/main.c @@ -67,7 +67,7 @@ static int fscache_max_active_sysctl(struct ctl_table *table, int write, return ret; } -struct ctl_table fscache_sysctls[] = { +static struct ctl_table fscache_sysctls[] = { { .procname = "object_max_active", .data = &fscache_object_max_active, @@ -87,7 +87,7 @@ struct ctl_table fscache_sysctls[] = { {} }; -struct ctl_table fscache_sysctls_root[] = { +static struct ctl_table fscache_sysctls_root[] = { { .procname = "fscache", .mode = 0555, -- cgit v1.2.3 From 8ba8fa917093510cdcb4ec8ff8b9603e1b525658 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 6 Aug 2014 16:03:26 -0700 Subject: fsnotify: rename event handling functions Rename fsnotify_add_notify_event() to fsnotify_add_event() since the "notify" part is redundant. Rename fsnotify_remove_notify_event() and fsnotify_peek_notify_event() to fsnotify_remove_first_event() and fsnotify_peek_first_event() respectively, since the "notify" part is redundant and they really look at the first event in the queue.
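For orientation, the queue API after the rename is used like this (a simplified sketch modeled on get_one_event() in the diffs below; event_is_wanted() is a hypothetical predicate):

	/* caller holds group->notification_mutex */
	if (fsnotify_notify_queue_is_empty(group))
		return NULL;

	/* look at the head of the queue without dequeueing it */
	event = fsnotify_peek_first_event(group);

	if (event_is_wanted(event))
		/* dequeue it; the caller must destroy it afterwards */
		return fsnotify_remove_first_event(group);
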
[akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Jan Kara Cc: Eric Paris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/fanotify/fanotify.c | 2 +- fs/notify/fanotify/fanotify_user.c | 2 +- fs/notify/inotify/inotify_fsnotify.c | 2 +- fs/notify/inotify/inotify_user.c | 4 ++-- fs/notify/notification.c | 19 ++++++++++--------- include/linux/fsnotify_backend.h | 12 ++++++------ 6 files changed, 21 insertions(+), 20 deletions(-) diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index ee9cb3795c2b..fdeb36b70c65 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -210,7 +210,7 @@ static int fanotify_handle_event(struct fsnotify_group *group, return -ENOMEM; fsn_event = &event->fse; - ret = fsnotify_add_notify_event(group, fsn_event, fanotify_merge); + ret = fsnotify_add_event(group, fsn_event, fanotify_merge); if (ret) { /* Permission events shouldn't be merged */ BUG_ON(ret == 1 && mask & FAN_ALL_PERM_EVENTS); diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 3fdc8a3e1134..fbf2210823ab 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -66,7 +66,7 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group, /* held the notification_mutex the whole time, so this is the * same event we peeked above */ - return fsnotify_remove_notify_event(group); + return fsnotify_remove_first_event(group); } static int create_fd(struct fsnotify_group *group, diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index 43ab1e1a07a2..0f88bc0b4e6c 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -108,7 +108,7 @@ int inotify_handle_event(struct fsnotify_group *group, if (len) strcpy(event->name, file_name); - ret = fsnotify_add_notify_event(group, fsn_event, inotify_merge); + ret = fsnotify_add_event(group, fsn_event, inotify_merge); if (ret) { /* Our event wasn't used in the end. Free it. */ fsnotify_destroy_event(group, fsn_event); diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index cc423a30a0c8..daf76652fe58 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -149,7 +149,7 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group, if (fsnotify_notify_queue_is_empty(group)) return NULL; - event = fsnotify_peek_notify_event(group); + event = fsnotify_peek_first_event(group); pr_debug("%s: group=%p event=%p\n", __func__, group, event); @@ -159,7 +159,7 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group, /* held the notification_mutex the whole time, so this is the * same event we peeked above */ - fsnotify_remove_notify_event(group); + fsnotify_remove_first_event(group); return event; } diff --git a/fs/notify/notification.c b/fs/notify/notification.c index 1e58402171a5..1d394220acbe 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -83,10 +83,10 @@ void fsnotify_destroy_event(struct fsnotify_group *group, * added to the queue, 1 if the event was merged with some other queued event, * 2 if the queue of events has overflown. 
*/ -int fsnotify_add_notify_event(struct fsnotify_group *group, - struct fsnotify_event *event, - int (*merge)(struct list_head *, - struct fsnotify_event *)) +int fsnotify_add_event(struct fsnotify_group *group, + struct fsnotify_event *event, + int (*merge)(struct list_head *, + struct fsnotify_event *)) { int ret = 0; struct list_head *list = &group->notification_list; @@ -128,7 +128,7 @@ queue: * Remove and return the first event from the notification list. It is the * responsibility of the caller to destroy the obtained event */ -struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group) +struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group) { struct fsnotify_event *event; @@ -140,7 +140,7 @@ struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group struct fsnotify_event, list); /* * We need to init list head for the case of overflow event so that - * check in fsnotify_add_notify_events() works + * check in fsnotify_add_event() works */ list_del_init(&event->list); group->q_len--; @@ -149,9 +149,10 @@ struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group } /* - * This will not remove the event, that must be done with fsnotify_remove_notify_event() + * This will not remove the event, that must be done with + * fsnotify_remove_first_event() */ -struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group) +struct fsnotify_event *fsnotify_peek_first_event(struct fsnotify_group *group) { BUG_ON(!mutex_is_locked(&group->notification_mutex)); @@ -169,7 +170,7 @@ void fsnotify_flush_notify(struct fsnotify_group *group) mutex_lock(&group->notification_mutex); while (!fsnotify_notify_queue_is_empty(group)) { - event = fsnotify_remove_notify_event(group); + event = fsnotify_remove_first_event(group); fsnotify_destroy_event(group, event); } mutex_unlock(&group->notification_mutex); diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index fc7718c6bd3e..a6e899943363 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -322,16 +322,16 @@ extern int fsnotify_fasync(int fd, struct file *file, int on); extern void fsnotify_destroy_event(struct fsnotify_group *group, struct fsnotify_event *event); /* attach the event to the group notification queue */ -extern int fsnotify_add_notify_event(struct fsnotify_group *group, - struct fsnotify_event *event, - int (*merge)(struct list_head *, - struct fsnotify_event *)); +extern int fsnotify_add_event(struct fsnotify_group *group, + struct fsnotify_event *event, + int (*merge)(struct list_head *, + struct fsnotify_event *)); /* true if the group notification queue is empty */ extern bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group); /* return, but do not dequeue the first event on the notification queue */ -extern struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group); +extern struct fsnotify_event *fsnotify_peek_first_event(struct fsnotify_group *group); /* return AND dequeue the first event on the notification queue */ -extern struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group); +extern struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group); /* functions used to manipulate the marks attached to inodes */ -- cgit v1.2.3 From 5838d4442bd5971687b72221736222637e03140d Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 6 Aug 2014 16:03:28 -0700 Subject: fanotify: fix double free of pending 
permission events Commit 85816794240b ("fanotify: Fix use after free for permission events") introduced a double free issue for permission events which are pending in a group's notification queue while the group is being destroyed. These events are freed from fanotify_handle_event() but they are not removed from the group's notification queue and thus they get freed again from fsnotify_flush_notify(). Fix the problem by removing permission events from the notification queue before freeing them if we skip processing the access response. Also expand comments in fanotify_release() to explain group shutdown in detail. Fixes: 85816794240b9659e66e4d9b0df7c6e814e5f603 Signed-off-by: Jan Kara Reported-by: Douglas Leeder Tested-by: Douglas Leeder Reported-by: Heinrich Schuchard Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/fanotify/fanotify.c | 9 ++++++++- fs/notify/fanotify/fanotify_user.c | 12 ++++++++++++ fs/notify/notification.c | 18 +++++++++++++++++- include/linux/fsnotify_backend.h | 2 ++ 4 files changed, 39 insertions(+), 2 deletions(-) diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index fdeb36b70c65..30d3addfad75 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -70,8 +70,15 @@ static int fanotify_get_response(struct fsnotify_group *group, wait_event(group->fanotify_data.access_waitq, event->response || atomic_read(&group->fanotify_data.bypass_perm)); - if (!event->response) /* bypass_perm set */ + if (!event->response) { /* bypass_perm set */ + /* + * Event was canceled because group is being destroyed. Remove + * it from group's event list because we are responsible for + * freeing the permission event. + */ + fsnotify_remove_event(group, &event->fae.fse); return 0; + } /* userspace responded, convert to something usable */ switch (event->response) { diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index fbf2210823ab..b13992a41bd9 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -359,6 +359,11 @@ static int fanotify_release(struct inode *ignored, struct file *file) #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS struct fanotify_perm_event_info *event, *next; + /* + * There may still be new events arriving in the notification queue + * but since userspace cannot use the fanotify fd anymore, no event can + * enter or leave access_list by now. + */ spin_lock(&group->fanotify_data.access_lock); atomic_inc(&group->fanotify_data.bypass_perm); @@ -373,6 +378,13 @@ static int fanotify_release(struct inode *ignored, struct file *file) } spin_unlock(&group->fanotify_data.access_lock); + /* + * Since bypass_perm is set, newly queued events will not wait for + * access response. Wake up the already sleeping ones now. + * synchronize_srcu() in fsnotify_destroy_group() will wait for all + * processes sleeping in fanotify_handle_event() waiting for access + * response and thus also for all permission events to be freed. + */ wake_up(&group->fanotify_data.access_waitq); #endif diff --git a/fs/notify/notification.c b/fs/notify/notification.c index 1d394220acbe..a95d8e037aeb 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -73,7 +73,8 @@ void fsnotify_destroy_event(struct fsnotify_group *group, /* Overflow events are per-group and we don't want to free them */ if (!event || event->mask == FS_Q_OVERFLOW) return; - + /* If the event is still queued, we have a problem... 
*/ + WARN_ON(!list_empty(&event->list)); group->ops->free_event(event); } @@ -124,6 +125,21 @@ queue: return ret; } +/* + * Remove @event from group's notification queue. It is the responsibility of + * the caller to destroy the event. + */ +void fsnotify_remove_event(struct fsnotify_group *group, + struct fsnotify_event *event) +{ + mutex_lock(&group->notification_mutex); + if (!list_empty(&event->list)) { + list_del_init(&event->list); + group->q_len--; + } + mutex_unlock(&group->notification_mutex); +} + /* * Remove and return the first event from the notification list. It is the * responsibility of the caller to destroy the obtained event diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index a6e899943363..ca060d7c4fa6 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -326,6 +326,8 @@ extern int fsnotify_add_event(struct fsnotify_group *group, struct fsnotify_event *event, int (*merge)(struct list_head *, struct fsnotify_event *)); +/* Remove the passed event from the group's notification queue */ +extern void fsnotify_remove_event(struct fsnotify_group *group, struct fsnotify_event *event); /* true if the group notification queue is empty */ extern bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group); /* return, but do not dequeue the first event on the notification queue */ -- cgit v1.2.3 From 1332429b305044aa75163399ae960c6535828ce6 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 6 Aug 2014 16:03:31 -0700 Subject: ./Makefile: explain stack-protector-strong CONFIG logic This adds a hopefully helpful comment above the (seemingly weird) compiler flag selection logic. Signed-off-by: Kees Cook Suggested-by: Andrew Morton Cc: Andi Kleen Cc: Randy Dunlap Cc: Michal Marek Cc: Michal Hocko Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Makefile | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/Makefile b/Makefile index d0901b46b4bf..af2f2fcfe01c 100644 --- a/Makefile +++ b/Makefile @@ -636,6 +636,22 @@ KBUILD_CFLAGS += $(call cc-option,-Wframe-larger-than=${CONFIG_FRAME_WARN}) endif # Handle stack protector mode. +# +# Since kbuild can potentially perform two passes (first with the old +# .config values and then with updated .config values), we cannot error out +# if a desired compiler option is unsupported. If we were to error, kbuild +# could never get to the second pass and actually notice that we changed +# the option to something that was supported. +# +# Additionally, we don't want to fall back and/or silently change which compiler +# flags will be used, since that leads to producing kernels with different +# security feature characteristics depending on the compiler used. ("But I +# selected CC_STACKPROTECTOR_STRONG! Why did it build with _REGULAR?!") +# +# The middle ground is to warn here so that the failed option is obvious, but +# to let the build fail with bad compiler flags so that we can't produce a +# kernel when there is a CONFIG and compiler mismatch. +# ifdef CONFIG_CC_STACKPROTECTOR_REGULAR stackp-flag := -fstack-protector ifeq ($(call cc-option, $(stackp-flag)),) -- cgit v1.2.3 From 2122da26c3511b308fe9027fc98d0c9b62035d0f Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 6 Aug 2014 16:03:33 -0700 Subject: fs/logfs/readwrite.c: kernel-doc warning fixes s/-/:/ in the kernel-doc parameter descriptions and fix stale variable names.
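The s/-/:/ refers to kernel-doc parameter syntax: descriptions must be written as "@name: text", not "@name - text". A minimal sketch with a made-up function:

	/**
	 * frob_widget - one-line summary (frob_widget is purely illustrative)
	 * @dev: device to operate on
	 * @len: buffer length in bytes
	 *
	 * Writing "@dev - device to operate on" instead of "@dev: ..." is
	 * what produces the kernel-doc warnings fixed here.
	 *
	 * Return: 0 on success or a negative errno.
	 */
	static int frob_widget(struct device *dev, size_t len);
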
Signed-off-by: Fabian Frederick Cc: Joern Engel Cc: Prasad Joshi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/logfs/readwrite.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c index 48140315f627..380d86e1ab45 100644 --- a/fs/logfs/readwrite.c +++ b/fs/logfs/readwrite.c @@ -1019,11 +1019,11 @@ static int __logfs_is_valid_block(struct inode *inode, u64 bix, u64 ofs) /** * logfs_is_valid_block - check whether this block is still valid * - * @sb - superblock - * @ofs - block physical offset - * @ino - block inode number - * @bix - block index - * @level - block level + * @sb: superblock + * @ofs: block physical offset + * @ino: block inode number + * @bix: block index + * @gc_level: block level * * Returns 0 if the block is invalid, 1 if it is valid and 2 if it will * become invalid once the journal is written. @@ -2226,10 +2226,9 @@ void btree_write_block(struct logfs_block *block) * * @inode: parent inode (ifile or directory) * @buf: object to write (inode or dentry) - * @n: object size - * @_pos: object number (file position in blocks/objects) + * @count: object size + * @bix: block index * @flags: write flags - * @lock: 0 if write lock is already taken, 1 otherwise * @shadow_tree: shadow below this inode * * FIXME: All caller of this put a 200-300 byte variable on the stack, -- cgit v1.2.3 From 2502722ddedc4f0a110646134d7937287c904c68 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 6 Aug 2014 16:03:35 -0700 Subject: ntfs: kernel-doc warning fixes cached_page and lru_pvec were removed from ntfs_attr_extend_initialized in commit 2ec93b0bf35f ("ntfs: clean up ntfs_attr_extend_initialized"). lru_pvec has been removed from __ntfs_grab_cache_pages in commit 4c99000ac47c ("ntfs: use add_to_page_cache_lru()"). Signed-off-by: Fabian Frederick Acked-by: Anton Altaparmakov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ntfs/file.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 5c9e2c81cb11..f5ec1ce7a532 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -74,8 +74,6 @@ static int ntfs_file_open(struct inode *vi, struct file *filp) * ntfs_attr_extend_initialized - extend the initialized size of an attribute * @ni: ntfs inode of the attribute to extend * @new_init_size: requested new initialized size in bytes - * @cached_page: store any allocated but unused page here - * @lru_pvec: lru-buffering pagevec of the caller * * Extend the initialized size of an attribute described by the ntfs inode @ni * to @new_init_size bytes. This involves zeroing any non-sparse space between @@ -395,7 +393,6 @@ static inline void ntfs_fault_in_pages_readable_iovec(const struct iovec *iov, * @nr_pages: number of page cache pages to obtain * @pages: array of pages in which to return the obtained page cache pages * @cached_page: allocated but as yet unused page - * @lru_pvec: lru-buffering pagevec of caller * * Obtain @nr_pages locked page cache pages from the mapping @mapping and * starting at index @index. -- cgit v1.2.3 From 8cdea0602c0df194dc52af0bb47bd8791db681f7 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Wed, 6 Aug 2014 16:03:37 -0700 Subject: score, ptrace: remove unused macros 'COUNTER' and other macros of this kind are too generic, and can easily conflict with macros from other modules. At present they are unused, so it is safe to simply remove them. 
And the related warning (allmodconfig with score): CC [M] drivers/md/raid1.o In file included from drivers/md/raid1.c:42:0: drivers/md/bitmap.h:93:0: warning: "COUNTER" redefined #define COUNTER(x) (((bitmap_counter_t) x) & COUNTER_MAX) ^ In file included from ./arch/score/include/asm/ptrace.h:4:0, from include/linux/sched.h:31, from include/linux/blkdev.h:4, from drivers/md/raid1.c:36: ./arch/score/include/uapi/asm/ptrace.h:13:0: note: this is the location of the previous definition #define COUNTER 38 Signed-off-by: Chen Gang Signed-off-by: David Rientjes Cc: Lennox Wu Cc: Guenter Roeck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/score/include/uapi/asm/ptrace.h | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/arch/score/include/uapi/asm/ptrace.h b/arch/score/include/uapi/asm/ptrace.h index f59771a3f127..5c5e794058be 100644 --- a/arch/score/include/uapi/asm/ptrace.h +++ b/arch/score/include/uapi/asm/ptrace.h @@ -4,17 +4,6 @@ #define PTRACE_GETREGS 12 #define PTRACE_SETREGS 13 -#define PC 32 -#define CONDITION 33 -#define ECR 34 -#define EMA 35 -#define CEH 36 -#define CEL 37 -#define COUNTER 38 -#define LDCR 39 -#define STCR 40 -#define PSR 41 - #define SINGLESTEP16_INSN 0x7006 #define SINGLESTEP32_INSN 0x840C8000 #define BREAKPOINT16_INSN 0x7002 /* work on SPG300 */ -- cgit v1.2.3 From 455f9726244cb5de2b6a99f540743ba155633cc8 Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Wed, 6 Aug 2014 16:03:39 -0700 Subject: sh: remove CPU_SUBTYPE_SH7764 The symbol is an orphan, get rid of it. Submitted by Richard a few months ago as "[PATCH 21/28] Remove CPU_SUBTYPE_SH7764". [pebolle@tiscali.nl: re-added dropped ||] Signed-off-by: Richard Weinberger Signed-off-by: Paul Bolle Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sh/drivers/dma/Kconfig | 5 ++--- arch/sh/include/cpu-sh4/cpu/dma-register.h | 1 - arch/sh/include/cpu-sh4a/cpu/dma.h | 3 +-- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/arch/sh/drivers/dma/Kconfig b/arch/sh/drivers/dma/Kconfig index cfd5b90a8628..78bc97b1d027 100644 --- a/arch/sh/drivers/dma/Kconfig +++ b/arch/sh/drivers/dma/Kconfig @@ -12,9 +12,8 @@ config SH_DMA_IRQ_MULTI default y if CPU_SUBTYPE_SH7750 || CPU_SUBTYPE_SH7751 || \ CPU_SUBTYPE_SH7750S || CPU_SUBTYPE_SH7750R || \ CPU_SUBTYPE_SH7751R || CPU_SUBTYPE_SH7091 || \ - CPU_SUBTYPE_SH7763 || CPU_SUBTYPE_SH7764 || \ - CPU_SUBTYPE_SH7780 || CPU_SUBTYPE_SH7785 || \ - CPU_SUBTYPE_SH7760 + CPU_SUBTYPE_SH7763 || CPU_SUBTYPE_SH7780 || \ + CPU_SUBTYPE_SH7785 || CPU_SUBTYPE_SH7760 config SH_DMA_API depends on SH_DMA diff --git a/arch/sh/include/cpu-sh4/cpu/dma-register.h b/arch/sh/include/cpu-sh4/cpu/dma-register.h index 02788b6a03b7..9cd81e54056a 100644 --- a/arch/sh/include/cpu-sh4/cpu/dma-register.h +++ b/arch/sh/include/cpu-sh4/cpu/dma-register.h @@ -32,7 +32,6 @@ #define CHCR_TS_HIGH_SHIFT (20 - 2) /* 2 bits for shifted low TS */ #elif defined(CONFIG_CPU_SUBTYPE_SH7757) || \ defined(CONFIG_CPU_SUBTYPE_SH7763) || \ - defined(CONFIG_CPU_SUBTYPE_SH7764) || \ defined(CONFIG_CPU_SUBTYPE_SH7780) || \ defined(CONFIG_CPU_SUBTYPE_SH7785) #define CHCR_TS_LOW_MASK 0x00000018 diff --git a/arch/sh/include/cpu-sh4a/cpu/dma.h b/arch/sh/include/cpu-sh4a/cpu/dma.h index 89afb650ce25..8ceccceae844 100644 --- a/arch/sh/include/cpu-sh4a/cpu/dma.h +++ b/arch/sh/include/cpu-sh4a/cpu/dma.h @@ -14,8 +14,7 @@ #define DMTE4_IRQ evt2irq(0xb80) #define DMAE0_IRQ evt2irq(0xbc0) /* DMA Error IRQ*/ #define SH_DMAC_BASE0 0xFE008020 -#elif defined(CONFIG_CPU_SUBTYPE_SH7763) || \ - 
defined(CONFIG_CPU_SUBTYPE_SH7764) +#elif defined(CONFIG_CPU_SUBTYPE_SH7763) #define DMTE0_IRQ evt2irq(0x640) #define DMTE4_IRQ evt2irq(0x780) #define DMAE0_IRQ evt2irq(0x6c0) -- cgit v1.2.3 From 88f9802c4f37dd6be588feae24f74e26aacb1779 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 6 Aug 2014 16:03:41 -0700 Subject: arch/sh/mm/asids-debugfs.c: use PTR_ERR_OR_ZERO Replace IS_ERR/PTR_ERR. Signed-off-by: Fabian Frederick Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sh/mm/asids-debugfs.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/sh/mm/asids-debugfs.c b/arch/sh/mm/asids-debugfs.c index 74c03ecc4871..ecfc6b0c1da1 100644 --- a/arch/sh/mm/asids-debugfs.c +++ b/arch/sh/mm/asids-debugfs.c @@ -67,10 +67,8 @@ static int __init asids_debugfs_init(void) NULL, &asids_debugfs_fops); if (!asids_dentry) return -ENOMEM; - if (IS_ERR(asids_dentry)) - return PTR_ERR(asids_dentry); - return 0; + return PTR_ERR_OR_ZERO(asids_dentry); } module_init(asids_debugfs_init); -- cgit v1.2.3 From 7c0368211de94c1bf646f8de11c9b0dee9d417d9 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 6 Aug 2014 16:03:43 -0700 Subject: arch/sh/kernel/time.c: use PTR_ERR_OR_ZERO Replace IS_ERR/PTR_ERR. Signed-off-by: Fabian Frederick Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sh/kernel/time.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/sh/kernel/time.c b/arch/sh/kernel/time.c index 552c8fcf9416..d6d0a986c6e9 100644 --- a/arch/sh/kernel/time.c +++ b/arch/sh/kernel/time.c @@ -80,10 +80,8 @@ static int __init rtc_generic_init(void) return -ENODEV; pdev = platform_device_register_simple("rtc-generic", -1, NULL, 0); - if (IS_ERR(pdev)) - return PTR_ERR(pdev); - return 0; + return PTR_ERR_OR_ZERO(pdev); } module_init(rtc_generic_init); -- cgit v1.2.3 From d72849c35b655f8d140fe4bf72c9ce969d131a96 Mon Sep 17 00:00:00 2001 From: Daniel Palmer Date: Wed, 6 Aug 2014 16:03:45 -0700 Subject: sh: SH7724 clock fixes Fix the device name for the CMT. 
Add clocks called usb0 and usb1 so that r8a66597_hcd works again on the ecovec24 board Signed-off-by: Daniel Palmer Cc: Kuninori Morimoto Cc: Richard Weinberger Cc: Paul Bolle Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sh/kernel/cpu/sh4a/clock-sh7724.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7724.c b/arch/sh/kernel/cpu/sh4a/clock-sh7724.c index f579dd528198..c187b9579c21 100644 --- a/arch/sh/kernel/cpu/sh4a/clock-sh7724.c +++ b/arch/sh/kernel/cpu/sh4a/clock-sh7724.c @@ -307,7 +307,7 @@ static struct clk_lookup lookups[] = { CLKDEV_ICK_ID("fck", "sh-tmu.0", &mstp_clks[HWBLK_TMU0]), CLKDEV_ICK_ID("fck", "sh-tmu.1", &mstp_clks[HWBLK_TMU1]), - CLKDEV_ICK_ID("fck", "sh-cmt-16.0", &mstp_clks[HWBLK_CMT]), + CLKDEV_ICK_ID("fck", "sh-cmt-32.0", &mstp_clks[HWBLK_CMT]), CLKDEV_DEV_ID("sh-wdt.0", &mstp_clks[HWBLK_RWDT]), CLKDEV_DEV_ID("sh-dma-engine.1", &mstp_clks[HWBLK_DMAC1]), @@ -332,6 +332,8 @@ static struct clk_lookup lookups[] = { CLKDEV_CON_ID("tsif0", &mstp_clks[HWBLK_TSIF]), CLKDEV_DEV_ID("renesas_usbhs.1", &mstp_clks[HWBLK_USB1]), CLKDEV_DEV_ID("renesas_usbhs.0", &mstp_clks[HWBLK_USB0]), + CLKDEV_CON_ID("usb1", &mstp_clks[HWBLK_USB1]), + CLKDEV_CON_ID("usb0", &mstp_clks[HWBLK_USB0]), CLKDEV_CON_ID("2dg0", &mstp_clks[HWBLK_2DG]), CLKDEV_DEV_ID("sh_mobile_sdhi.0", &mstp_clks[HWBLK_SDHI0]), CLKDEV_DEV_ID("sh_mobile_sdhi.1", &mstp_clks[HWBLK_SDHI1]), -- cgit v1.2.3 From e04aca4a769e16cf4f9b8a4bd3e761711640dc46 Mon Sep 17 00:00:00 2001 From: Pranith Kumar Date: Wed, 6 Aug 2014 16:03:48 -0700 Subject: sh: fix build error by adding generic ioport_{map/unmap}() Fix build error as reported by Geert Uytterhoeven here: http://kisskb.ellerman.id.au/kisskb/buildresult/11607865/ The error happens when CONFIG_HAS_IOPORT_MAP=n because of which there are missing definitions of ioport_map/unmap(). Fix this build error by adding these prototypes. Signed-off-by: Pranith Kumar Reported-by: Geert Uytterhoeven Cc: [3.16+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sh/include/asm/io_noioport.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/sh/include/asm/io_noioport.h b/arch/sh/include/asm/io_noioport.h index 4d48f1436a63..c727e6ddf69e 100644 --- a/arch/sh/include/asm/io_noioport.h +++ b/arch/sh/include/asm/io_noioport.h @@ -34,6 +34,17 @@ static inline void outl(unsigned int x, unsigned long port) BUG(); } +static inline void __iomem *ioport_map(unsigned long port, unsigned int size) +{ + BUG(); + return NULL; +} + +static inline void ioport_unmap(void __iomem *addr) +{ + BUG(); +} + #define inb_p(addr) inb(addr) #define inw_p(addr) inw(addr) #define inl_p(addr) inl(addr) -- cgit v1.2.3 From 14694888db2c1f7ba1f2e6172df45813b4db7880 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 6 Aug 2014 16:03:50 -0700 Subject: fs/squashfs/file_direct.c: replace count*size kmalloc by kmalloc_array kmalloc_array() manages count*sizeof overflow. 
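The rationale, as a sketch (illustrative, not the patched code verbatim): an open-coded count * size multiplication can wrap around, so a huge count silently yields a too-small allocation, while the checked helpers fail cleanly.

	/* unchecked: pages * sizeof(void *) can overflow on 32-bit */
	page = kmalloc(sizeof(void *) * pages, GFP_KERNEL);

	/* checked: returns NULL if the multiplication would overflow */
	page = kmalloc_array(pages, sizeof(void *), GFP_KERNEL);

	/* kcalloc(n, size, flags) adds zero-initialisation on top */
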
Signed-off-by: Fabian Frederick Cc: Phillip Lougher Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/squashfs/file_direct.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c index 62a0de6632e1..43e7a7eddac0 100644 --- a/fs/squashfs/file_direct.c +++ b/fs/squashfs/file_direct.c @@ -44,7 +44,7 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize) pages = end_index - start_index + 1; - page = kmalloc(sizeof(void *) * pages, GFP_KERNEL); + page = kmalloc_array(pages, sizeof(void *), GFP_KERNEL); if (page == NULL) return res; -- cgit v1.2.3 From c811f5f41e37d16b2839988426ffde78d2273d48 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 6 Aug 2014 16:03:52 -0700 Subject: fs/squashfs/super.c: logging cleanup - Convert printk to pr_foo() - Add pr_fmt for future logging entries - Coalesce formats Signed-off-by: Fabian Frederick Cc: Phillip Lougher Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/squashfs/super.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 031c8d67fd51..5056babe00df 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -27,6 +27,8 @@ * the filesystem. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -448,8 +450,7 @@ static int __init init_squashfs_fs(void) return err; } - printk(KERN_INFO "squashfs: version 4.0 (2009/01/31) " - "Phillip Lougher\n"); + pr_info("version 4.0 (2009/01/31) Phillip Lougher\n"); return 0; } -- cgit v1.2.3 From 981035b47d7da8ba7c153ed431bf515f593853d8 Mon Sep 17 00:00:00 2001 From: Yingtai Xie Date: Wed, 6 Aug 2014 16:03:54 -0700 Subject: ocfs2: correctly check the return value of ocfs2_search_extent_list ocfs2_search_extent_list may return -1, so we should check the return value in ocfs2_split_and_insert; otherwise it may cause an array index out of bounds. Also, ocfs2_search_extent_list can only return a value less than el->l_next_free_rec, so checking whether it is equal to or larger than le16_to_cpu(el->l_next_free_rec) is meaningless. 
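Put differently (a simplified sketch of the pattern, not the exact ocfs2 error handling): the old guard tested a condition that can never be true, while the actual failure value went straight into an array index.

	index = ocfs2_search_extent_list(el, cpos);
	/*
	 * "index >= le16_to_cpu(el->l_next_free_rec)" can never fire: the
	 * search returns either -1 or a valid slot below that bound.
	 */
	if (index == -1) {
		/* extent vanished; bail out instead of indexing l_recs[-1] */
		return -EROFS;
	}
	rec = &el->l_recs[index];
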
Signed-off-by: Yingtai Xie Signed-off-by: Joseph Qi Cc: Joel Becker Cc: Mark Fasheh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/alloc.c | 15 ++++++++++++--- fs/ocfs2/move_extents.c | 2 +- fs/ocfs2/refcounttree.c | 2 +- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 9d8fcf2f3b94..a93bf9892256 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -4961,6 +4961,15 @@ leftright: el = path_leaf_el(path); split_index = ocfs2_search_extent_list(el, cpos); + if (split_index == -1) { + ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), + "Owner %llu has an extent at cpos %u " + "which can no longer be found.\n", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + cpos); + ret = -EROFS; + goto out; + } goto leftright; } out: @@ -5135,7 +5144,7 @@ int ocfs2_change_extent_flag(handle_t *handle, el = path_leaf_el(left_path); index = ocfs2_search_extent_list(el, cpos); - if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { + if (index == -1) { ocfs2_error(sb, "Owner %llu has an extent at cpos %u which can no " "longer be found.\n", @@ -5491,7 +5500,7 @@ int ocfs2_remove_extent(handle_t *handle, el = path_leaf_el(path); index = ocfs2_search_extent_list(el, cpos); - if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { + if (index == -1) { ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), "Owner %llu has an extent at cpos %u which can no " "longer be found.\n", @@ -5557,7 +5566,7 @@ int ocfs2_remove_extent(handle_t *handle, el = path_leaf_el(path); index = ocfs2_search_extent_list(el, cpos); - if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { + if (index == -1) { ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), "Owner %llu: split at cpos %u lost record.", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 599eb4c4c8be..6219aaadeb08 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -98,7 +98,7 @@ static int __ocfs2_move_extent(handle_t *handle, el = path_leaf_el(path); index = ocfs2_search_extent_list(el, cpos); - if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { + if (index == -1) { ocfs2_error(inode->i_sb, "Inode %llu has an extent at cpos %u which can no " "longer be found.\n", diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 636aab69ead5..d81f6e2a97f5 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -3109,7 +3109,7 @@ static int ocfs2_clear_ext_refcount(handle_t *handle, el = path_leaf_el(path); index = ocfs2_search_extent_list(el, cpos); - if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { + if (index == -1) { ocfs2_error(sb, "Inode %llu has an extent at cpos %u which can no " "longer be found.\n", -- cgit v1.2.3 From 7567c148835f7ac16edf662d65816640cf8e20bb Mon Sep 17 00:00:00 2001 From: Xue jiufei Date: Wed, 6 Aug 2014 16:03:56 -0700 Subject: ocfs2: remove conversion of total_backoff in dlm_join_domain() The unit of total_backoff is msecs not jiffies, so no need to do the conversion. Otherwise, the join timeout is not 90 sec. 
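The unit mismatch is easiest to see with concrete numbers (a sketch assuming DLM_JOIN_TIMEOUT_MSECS is 90000, per the 90-second timeout mentioned above):

	/* before: total_backoff is in msecs, the threshold in jiffies */
	if (total_backoff > msecs_to_jiffies(DLM_JOIN_TIMEOUT_MSECS))
		status = -ERESTARTSYS;	/* at HZ=250 the threshold is 22500,
					   i.e. a ~22.5 second timeout */

	/* after: both sides in msecs, so the timeout is the intended 90s */
	if (total_backoff > DLM_JOIN_TIMEOUT_MSECS)
		status = -ERESTARTSYS;
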
Signed-off-by: Yiwen Jiang Signed-off-by: joyce.xue Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmdomain.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 39efc5057a36..3fcf205ee900 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -1923,12 +1923,11 @@ static int dlm_join_domain(struct dlm_ctxt *dlm) goto bail; } - if (total_backoff > - msecs_to_jiffies(DLM_JOIN_TIMEOUT_MSECS)) { + if (total_backoff > DLM_JOIN_TIMEOUT_MSECS) { status = -ERESTARTSYS; mlog(ML_NOTICE, "Timed out joining dlm domain " "%s after %u msecs\n", dlm->name, - jiffies_to_msecs(total_backoff)); + total_backoff); goto bail; } -- cgit v1.2.3 From bba1cb17d9adb8c074f138c1cc039fb2ee8c964c Mon Sep 17 00:00:00 2001 From: Tariq Saeed Date: Wed, 6 Aug 2014 16:03:58 -0700 Subject: ocfs2: race between umount and unfinished remastering during recovery Orabug: 19074140 When umount is issued during recovery on the new master that has not finished remastering locks, it triggers BUG() in dlm_send_mig_lockres_msg(). Here is the situation: 1) node A has a lock on resource X mastered by node B. 2) node B dies -> node A sets the recovering flag for res X. 3) node C becomes the new master for resources owned by the dead node and is remastering locks of the dead node but has not finished the remastering process yet. 4) umount is issued on node C. 5) During processing of umount, ignoring unfinished recovery, node C attempts to migrate resource X to node A. 6) node A finds res X in DLM_LOCK_RES_RECOVERING state, considers it a logic error and sends back -EFAULT. 7) node C asserts BUG() upon seeing the EFAULT response from node A. The fix is to delay migrating res X until remastering is finished, at which point the recovering flag will be cleared on both A and C. Signed-off-by: Tariq Saeed Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmmaster.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 82abf0cc9a12..3ec906ef5d9a 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -2405,6 +2405,10 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm, if (res->state & DLM_LOCK_RES_MIGRATING) return 0; + /* delay migration when the lockres is in RECOVERING state */ + if (res->state & DLM_LOCK_RES_RECOVERING) + return 0; + if (res->owner != dlm->node_num) return 0; -- cgit v1.2.3 From 1b7f8ba603d3bb2f5bbb65df981ae0d0abb80ee2 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 6 Aug 2014 16:04:01 -0700 Subject: fs/ocfs2/slot_map.c: replace count*size kzalloc by kcalloc kcalloc manages count*sizeof overflow. 
Signed-off-by: Fabian Frederick Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/slot_map.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index 1424c151cccc..a88b2a4fcc85 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -382,7 +382,7 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb, trace_ocfs2_map_slot_buffers(bytes, si->si_blocks); - si->si_bh = kzalloc(sizeof(struct buffer_head *) * si->si_blocks, + si->si_bh = kcalloc(si->si_blocks, sizeof(struct buffer_head *), GFP_KERNEL); if (!si->si_bh) { status = -ENOMEM; -- cgit v1.2.3 From 656c3b79f782a235413087168b61ff279034d860 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 6 Aug 2014 16:04:03 -0700 Subject: kernel/watchdog.c: convert printk/pr_warning to pr_foo() Replace some obsolete functions. Signed-off-by: Fabian Frederick Cc: Frederic Weisbecker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index c3319bd1b040..51b29e9d2ba6 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -260,9 +260,11 @@ static void watchdog_overflow_callback(struct perf_event *event, return; if (hardlockup_panic) - panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu); + panic("Watchdog detected hard LOCKUP on cpu %d", + this_cpu); else - WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu); + WARN(1, "Watchdog detected hard LOCKUP on cpu %d", + this_cpu); __this_cpu_write(hard_watchdog_warn, true); return; @@ -345,7 +347,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) } } - printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", + pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! 
[%s:%d]\n", smp_processor_id(), duration, current->comm, task_pid_nr(current)); print_modules(); @@ -484,7 +486,7 @@ static int watchdog_nmi_enable(unsigned int cpu) if (PTR_ERR(event) == -EOPNOTSUPP) pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu); else if (PTR_ERR(event) == -ENOENT) - pr_warning("disabled (cpu%i): hardware events not enabled\n", + pr_warn("disabled (cpu%i): hardware events not enabled\n", cpu); else pr_err("disabled (cpu%i): unable to create perf event: %ld\n", -- cgit v1.2.3 From 1536cb39338aff16b0e30cc6708da03b268337f7 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 6 Aug 2014 16:04:05 -0700 Subject: mm/slab.c: add __init to init_lock_keys init_lock_keys is only called by __init kmem_cache_init_late Signed-off-by: Fabian Frederick Acked-by: Christoph Lameter Acked-by: David Rientjes Cc: Joonsoo Kim Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index 3070b929a1bf..18ac44b7558d 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -569,7 +569,7 @@ static inline void on_slab_lock_classes(struct kmem_cache *cachep) on_slab_lock_classes_node(cachep, node); } -static inline void init_lock_keys(void) +static inline void __init init_lock_keys(void) { int node; @@ -577,7 +577,7 @@ static inline void init_lock_keys(void) init_node_lock_keys(node); } #else -static void init_node_lock_keys(int q) +static void __init init_node_lock_keys(int q) { } -- cgit v1.2.3 From 44c5356fb460053112ab87c9601df1605054edca Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 6 Aug 2014 16:04:07 -0700 Subject: slab common: add functions for kmem_cache_node access The patchset provides two new functions in mm/slab.h and modifies SLAB and SLUB to use these. The kmem_cache_node structure is shared between both allocators and the use of common accessors will allow us to move more code into slab_common.c in the future. This patch (of 3): These functions allow to eliminate repeatedly used code in both SLAB and SLUB and also allow for the insertion of debugging code that may be needed in the development process. Signed-off-by: Christoph Lameter Cc: Pekka Enberg Acked-by: David Rientjes Acked-by: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.h | 17 ++++++++++++++++- mm/slub.c | 5 ----- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/mm/slab.h b/mm/slab.h index 961a3fb1f5a2..3f9766e393a3 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -262,7 +262,7 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) } #endif - +#ifndef CONFIG_SLOB /* * The slab lists for all objects. */ @@ -294,5 +294,20 @@ struct kmem_cache_node { }; +static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) +{ + return s->node[node]; +} + +/* + * Iterator over all nodes. 
The body will be executed for each node that has + * a kmem_cache_node structure allocated (which is true for all online nodes) + */ +#define for_each_kmem_cache_node(__s, __node, __n) \ + for (__node = 0; __n = get_node(__s, __node), __node < nr_node_ids; __node++) \ + if (__n) + +#endif + void *slab_next(struct seq_file *m, void *p, loff_t *pos); void slab_stop(struct seq_file *m, void *p); diff --git a/mm/slub.c b/mm/slub.c index 73004808537e..2569802aa7cc 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -233,11 +233,6 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si) * Core slab cache functions *******************************************************************/ -static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) -{ - return s->node[node]; -} - /* Verify that a pointer has an address that is valid within a slab page */ static inline int check_valid_pointer(struct kmem_cache *s, struct page *page, const void *object) -- cgit v1.2.3 From fa45dc254bcf740852752effa35387be684947f8 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 6 Aug 2014 16:04:09 -0700 Subject: slub: use new node functions Make use of the new node functions in mm/slab.h to reduce code size and simplify. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Christoph Lameter Cc: Christoph Lameter Cc: Pekka Enberg Acked-by: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 78 ++++++++++++++++++++++++--------------------------------------- 1 file changed, 29 insertions(+), 49 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 2569802aa7cc..3918cd62a4b2 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2157,6 +2157,7 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) static DEFINE_RATELIMIT_STATE(slub_oom_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); int node; + struct kmem_cache_node *n; if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs)) return; @@ -2171,15 +2172,11 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) pr_warn(" %s debugging increased min order, use slub_debug=O to disable.\n", s->name); - for_each_online_node(node) { - struct kmem_cache_node *n = get_node(s, node); + for_each_kmem_cache_node(s, node, n) { unsigned long nr_slabs; unsigned long nr_objs; unsigned long nr_free; - if (!n) - continue; - nr_free = count_partial(n, count_free); nr_slabs = node_nr_slabs(n); nr_objs = node_nr_objs(n); @@ -2923,13 +2920,10 @@ static void early_kmem_cache_node_alloc(int node) static void free_kmem_cache_nodes(struct kmem_cache *s) { int node; + struct kmem_cache_node *n; - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = s->node[node]; - - if (n) - kmem_cache_free(kmem_cache_node, n); - + for_each_kmem_cache_node(s, node, n) { + kmem_cache_free(kmem_cache_node, n); s->node[node] = NULL; } } @@ -3217,12 +3211,11 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) static inline int kmem_cache_close(struct kmem_cache *s) { int node; + struct kmem_cache_node *n; flush_all(s); /* Attempt to free all objects */ - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = get_node(s, node); - + for_each_kmem_cache_node(s, node, n) { free_partial(s, n); if (n->nr_partial || slabs_node(s, node)) return 1; @@ -3407,9 +3400,7 @@ int __kmem_cache_shrink(struct kmem_cache *s) return -ENOMEM; flush_all(s); - for_each_node_state(node, N_NORMAL_MEMORY) { - n = get_node(s, node); - + 
for_each_kmem_cache_node(s, node, n) { if (!n->nr_partial) continue; @@ -3581,6 +3572,7 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache) { int node; struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); + struct kmem_cache_node *n; memcpy(s, static_cache, kmem_cache->object_size); @@ -3590,19 +3582,16 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache) * IPIs around. */ __flush_cpu_slab(s, smp_processor_id()); - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = get_node(s, node); + for_each_kmem_cache_node(s, node, n) { struct page *p; - if (n) { - list_for_each_entry(p, &n->partial, lru) - p->slab_cache = s; + list_for_each_entry(p, &n->partial, lru) + p->slab_cache = s; #ifdef CONFIG_SLUB_DEBUG - list_for_each_entry(p, &n->full, lru) - p->slab_cache = s; + list_for_each_entry(p, &n->full, lru) + p->slab_cache = s; #endif - } } list_add(&s->list, &slab_caches); return s; @@ -3955,16 +3944,14 @@ static long validate_slab_cache(struct kmem_cache *s) unsigned long count = 0; unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * sizeof(unsigned long), GFP_KERNEL); + struct kmem_cache_node *n; if (!map) return -ENOMEM; flush_all(s); - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = get_node(s, node); - + for_each_kmem_cache_node(s, node, n) count += validate_slab_node(s, n, map); - } kfree(map); return count; } @@ -4118,6 +4105,7 @@ static int list_locations(struct kmem_cache *s, char *buf, int node; unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * sizeof(unsigned long), GFP_KERNEL); + struct kmem_cache_node *n; if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), GFP_TEMPORARY)) { @@ -4127,8 +4115,7 @@ static int list_locations(struct kmem_cache *s, char *buf, /* Push back cpu slabs */ flush_all(s); - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = get_node(s, node); + for_each_kmem_cache_node(s, node, n) { unsigned long flags; struct page *page; @@ -4327,8 +4314,9 @@ static ssize_t show_slab_objects(struct kmem_cache *s, get_online_mems(); #ifdef CONFIG_SLUB_DEBUG if (flags & SO_ALL) { - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = get_node(s, node); + struct kmem_cache_node *n; + + for_each_kmem_cache_node(s, node, n) { if (flags & SO_TOTAL) x = atomic_long_read(&n->total_objects); @@ -4344,9 +4332,9 @@ static ssize_t show_slab_objects(struct kmem_cache *s, } else #endif if (flags & SO_PARTIAL) { - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = get_node(s, node); + struct kmem_cache_node *n; + for_each_kmem_cache_node(s, node, n) { if (flags & SO_TOTAL) x = count_partial(n, count_total); else if (flags & SO_OBJECTS) @@ -4359,7 +4347,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, } x = sprintf(buf, "%lu", total); #ifdef CONFIG_NUMA - for_each_node_state(node, N_NORMAL_MEMORY) + for (node = 0; node < nr_node_ids; node++) if (nodes[node]) x += sprintf(buf + x, " N%d=%lu", node, nodes[node]); @@ -4373,16 +4361,12 @@ static ssize_t show_slab_objects(struct kmem_cache *s, static int any_slab_objects(struct kmem_cache *s) { int node; + struct kmem_cache_node *n; - for_each_online_node(node) { - struct kmem_cache_node *n = get_node(s, node); - - if (!n) - continue; - + for_each_kmem_cache_node(s, node, n) if (atomic_long_read(&n->total_objects)) return 1; - } + return 0; } #endif @@ -5337,13 +5321,9 @@ void get_slabinfo(struct kmem_cache 
*s, struct slabinfo *sinfo) unsigned long nr_objs = 0; unsigned long nr_free = 0; int node; + struct kmem_cache_node *n; - for_each_online_node(node) { - struct kmem_cache_node *n = get_node(s, node); - - if (!n) - continue; - + for_each_kmem_cache_node(s, node, n) { nr_slabs += node_nr_slabs(n); nr_objs += node_nr_objs(n); nr_free += count_partial(n, count_free); -- cgit v1.2.3 From 18bf854117c6caa4d0083bd42411895163467cb9 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 6 Aug 2014 16:04:11 -0700 Subject: slab: use get_node() and kmem_cache_node() functions Use the two functions to simplify the code avoiding numerous explicit checks coded checking for a certain node to be online. Get rid of various repeated calculations of kmem_cache_node structures. [akpm@linux-foundation.org: fix build] Signed-off-by: Christoph Lameter Cc: Christoph Lameter Cc: Pekka Enberg Acked-by: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 173 +++++++++++++++++++++++++++++--------------------------------- 1 file changed, 80 insertions(+), 93 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index 18ac44b7558d..66b3ffbb890d 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -267,7 +267,7 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent) #define MAKE_LIST(cachep, listp, slab, nodeid) \ do { \ INIT_LIST_HEAD(listp); \ - list_splice(&(cachep->node[nodeid]->slab), listp); \ + list_splice(&get_node(cachep, nodeid)->slab, listp); \ } while (0) #define MAKE_ALL_LISTS(cachep, ptr, nodeid) \ @@ -488,16 +488,11 @@ static struct lock_class_key debugobj_alc_key; static void slab_set_lock_classes(struct kmem_cache *cachep, struct lock_class_key *l3_key, struct lock_class_key *alc_key, - int q) + struct kmem_cache_node *n) { struct array_cache **alc; - struct kmem_cache_node *n; int r; - n = cachep->node[q]; - if (!n) - return; - lockdep_set_class(&n->list_lock, l3_key); alc = n->alien; /* @@ -515,17 +510,19 @@ static void slab_set_lock_classes(struct kmem_cache *cachep, } } -static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node) +static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, + struct kmem_cache_node *n) { - slab_set_lock_classes(cachep, &debugobj_l3_key, &debugobj_alc_key, node); + slab_set_lock_classes(cachep, &debugobj_l3_key, &debugobj_alc_key, n); } static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep) { int node; + struct kmem_cache_node *n; - for_each_online_node(node) - slab_set_debugobj_lock_classes_node(cachep, node); + for_each_kmem_cache_node(cachep, node, n) + slab_set_debugobj_lock_classes_node(cachep, n); } static void init_node_lock_keys(int q) @@ -542,31 +539,30 @@ static void init_node_lock_keys(int q) if (!cache) continue; - n = cache->node[q]; + n = get_node(cache, q); if (!n || OFF_SLAB(cache)) continue; slab_set_lock_classes(cache, &on_slab_l3_key, - &on_slab_alc_key, q); + &on_slab_alc_key, n); } } -static void on_slab_lock_classes_node(struct kmem_cache *cachep, int q) +static void on_slab_lock_classes_node(struct kmem_cache *cachep, + struct kmem_cache_node *n) { - if (!cachep->node[q]) - return; - slab_set_lock_classes(cachep, &on_slab_l3_key, - &on_slab_alc_key, q); + &on_slab_alc_key, n); } static inline void on_slab_lock_classes(struct kmem_cache *cachep) { int node; + struct kmem_cache_node *n; VM_BUG_ON(OFF_SLAB(cachep)); - for_each_node(node) - on_slab_lock_classes_node(cachep, node); + for_each_kmem_cache_node(cachep, node, n) + 
on_slab_lock_classes_node(cachep, n); } static inline void __init init_lock_keys(void) @@ -589,11 +585,13 @@ static inline void on_slab_lock_classes(struct kmem_cache *cachep) { } -static inline void on_slab_lock_classes_node(struct kmem_cache *cachep, int node) +static inline void on_slab_lock_classes_node(struct kmem_cache *cachep, + struct kmem_cache_node *n) { } -static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node) +static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, + struct kmem_cache_node *n) { } @@ -826,7 +824,7 @@ static inline bool is_slab_pfmemalloc(struct page *page) static void recheck_pfmemalloc_active(struct kmem_cache *cachep, struct array_cache *ac) { - struct kmem_cache_node *n = cachep->node[numa_mem_id()]; + struct kmem_cache_node *n = get_node(cachep, numa_mem_id()); struct page *page; unsigned long flags; @@ -881,7 +879,7 @@ static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac, * If there are empty slabs on the slabs_free list and we are * being forced to refill the cache, mark this one !pfmemalloc. */ - n = cachep->node[numa_mem_id()]; + n = get_node(cachep, numa_mem_id()); if (!list_empty(&n->slabs_free) && force_refill) { struct page *page = virt_to_head_page(objp); ClearPageSlabPfmemalloc(page); @@ -1031,7 +1029,7 @@ static void free_alien_cache(struct array_cache **ac_ptr) static void __drain_alien_cache(struct kmem_cache *cachep, struct array_cache *ac, int node) { - struct kmem_cache_node *n = cachep->node[node]; + struct kmem_cache_node *n = get_node(cachep, node); if (ac->avail) { spin_lock(&n->list_lock); @@ -1099,7 +1097,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) if (likely(nodeid == node)) return 0; - n = cachep->node[node]; + n = get_node(cachep, node); STATS_INC_NODEFREES(cachep); if (n->alien && n->alien[nodeid]) { alien = n->alien[nodeid]; @@ -1111,9 +1109,10 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) ac_put_obj(cachep, alien, objp); spin_unlock(&alien->lock); } else { - spin_lock(&(cachep->node[nodeid])->list_lock); + n = get_node(cachep, nodeid); + spin_lock(&n->list_lock); free_block(cachep, &objp, 1, nodeid); - spin_unlock(&(cachep->node[nodeid])->list_lock); + spin_unlock(&n->list_lock); } return 1; } @@ -1140,7 +1139,8 @@ static int init_cache_node_node(int node) * begin anything. Make sure some other cpu on this * node has not already allocated this */ - if (!cachep->node[node]) { + n = get_node(cachep, node); + if (!n) { n = kmalloc_node(memsize, GFP_KERNEL, node); if (!n) return -ENOMEM; @@ -1156,11 +1156,11 @@ static int init_cache_node_node(int node) cachep->node[node] = n; } - spin_lock_irq(&cachep->node[node]->list_lock); - cachep->node[node]->free_limit = + spin_lock_irq(&n->list_lock); + n->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount + cachep->num; - spin_unlock_irq(&cachep->node[node]->list_lock); + spin_unlock_irq(&n->list_lock); } return 0; } @@ -1186,7 +1186,7 @@ static void cpuup_canceled(long cpu) /* cpu is dead; no one can alloc from it. */ nc = cachep->array[cpu]; cachep->array[cpu] = NULL; - n = cachep->node[node]; + n = get_node(cachep, node); if (!n) goto free_array_cache; @@ -1229,7 +1229,7 @@ free_array_cache: * shrink each nodelist to its limit. 
*/ list_for_each_entry(cachep, &slab_caches, list) { - n = cachep->node[node]; + n = get_node(cachep, node); if (!n) continue; drain_freelist(cachep, n, slabs_tofree(cachep, n)); @@ -1284,7 +1284,7 @@ static int cpuup_prepare(long cpu) } } cachep->array[cpu] = nc; - n = cachep->node[node]; + n = get_node(cachep, node); BUG_ON(!n); spin_lock_irq(&n->list_lock); @@ -1306,10 +1306,10 @@ static int cpuup_prepare(long cpu) kfree(shared); free_alien_cache(alien); if (cachep->flags & SLAB_DEBUG_OBJECTS) - slab_set_debugobj_lock_classes_node(cachep, node); + slab_set_debugobj_lock_classes_node(cachep, n); else if (!OFF_SLAB(cachep) && !(cachep->flags & SLAB_DESTROY_BY_RCU)) - on_slab_lock_classes_node(cachep, node); + on_slab_lock_classes_node(cachep, n); } init_node_lock_keys(node); @@ -1395,7 +1395,7 @@ static int __meminit drain_cache_node_node(int node) list_for_each_entry(cachep, &slab_caches, list) { struct kmem_cache_node *n; - n = cachep->node[node]; + n = get_node(cachep, node); if (!n) continue; @@ -1690,14 +1690,10 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) printk(KERN_WARNING " cache: %s, object size: %d, order: %d\n", cachep->name, cachep->size, cachep->gfporder); - for_each_online_node(node) { + for_each_kmem_cache_node(cachep, node, n) { unsigned long active_objs = 0, num_objs = 0, free_objects = 0; unsigned long active_slabs = 0, num_slabs = 0; - n = cachep->node[node]; - if (!n) - continue; - spin_lock_irqsave(&n->list_lock, flags); list_for_each_entry(page, &n->slabs_full, lru) { active_objs += cachep->num; @@ -2434,7 +2430,7 @@ static void check_spinlock_acquired(struct kmem_cache *cachep) { #ifdef CONFIG_SMP check_irq_off(); - assert_spin_locked(&cachep->node[numa_mem_id()]->list_lock); + assert_spin_locked(&get_node(cachep, numa_mem_id())->list_lock); #endif } @@ -2442,7 +2438,7 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node) { #ifdef CONFIG_SMP check_irq_off(); - assert_spin_locked(&cachep->node[node]->list_lock); + assert_spin_locked(&get_node(cachep, node)->list_lock); #endif } @@ -2462,12 +2458,14 @@ static void do_drain(void *arg) struct kmem_cache *cachep = arg; struct array_cache *ac; int node = numa_mem_id(); + struct kmem_cache_node *n; check_irq_off(); ac = cpu_cache_get(cachep); - spin_lock(&cachep->node[node]->list_lock); + n = get_node(cachep, node); + spin_lock(&n->list_lock); free_block(cachep, ac->entry, ac->avail, node); - spin_unlock(&cachep->node[node]->list_lock); + spin_unlock(&n->list_lock); ac->avail = 0; } @@ -2478,17 +2476,12 @@ static void drain_cpu_caches(struct kmem_cache *cachep) on_each_cpu(do_drain, cachep, 1); check_irq_on(); - for_each_online_node(node) { - n = cachep->node[node]; - if (n && n->alien) + for_each_kmem_cache_node(cachep, node, n) + if (n->alien) drain_alien_cache(cachep, n->alien); - } - for_each_online_node(node) { - n = cachep->node[node]; - if (n) - drain_array(cachep, n, n->shared, 1, node); - } + for_each_kmem_cache_node(cachep, node, n) + drain_array(cachep, n, n->shared, 1, node); } /* @@ -2534,17 +2527,14 @@ out: int __kmem_cache_shrink(struct kmem_cache *cachep) { - int ret = 0, i = 0; + int ret = 0; + int node; struct kmem_cache_node *n; drain_cpu_caches(cachep); check_irq_on(); - for_each_online_node(i) { - n = cachep->node[i]; - if (!n) - continue; - + for_each_kmem_cache_node(cachep, node, n) { drain_freelist(cachep, n, slabs_tofree(cachep, n)); ret += !list_empty(&n->slabs_full) || @@ -2566,13 +2556,11 @@ int __kmem_cache_shutdown(struct kmem_cache 
*cachep) kfree(cachep->array[i]); /* NUMA: free the node structures */ - for_each_online_node(i) { - n = cachep->node[i]; - if (n) { - kfree(n->shared); - free_alien_cache(n->alien); - kfree(n); - } + for_each_kmem_cache_node(cachep, i, n) { + kfree(n->shared); + free_alien_cache(n->alien); + kfree(n); + cachep->node[i] = NULL; } return 0; } @@ -2751,7 +2739,7 @@ static int cache_grow(struct kmem_cache *cachep, /* Take the node list lock to change the colour_next on this node */ check_irq_off(); - n = cachep->node[nodeid]; + n = get_node(cachep, nodeid); spin_lock(&n->list_lock); /* Get colour for the slab, and cal the next value. */ @@ -2920,7 +2908,7 @@ retry: */ batchcount = BATCHREFILL_LIMIT; } - n = cachep->node[node]; + n = get_node(cachep, node); BUG_ON(ac->avail > 0 || !n); spin_lock(&n->list_lock); @@ -3169,8 +3157,8 @@ retry: nid = zone_to_nid(zone); if (cpuset_zone_allowed_hardwall(zone, flags) && - cache->node[nid] && - cache->node[nid]->free_objects) { + get_node(cache, nid) && + get_node(cache, nid)->free_objects) { obj = ____cache_alloc_node(cache, flags | GFP_THISNODE, nid); if (obj) @@ -3233,7 +3221,7 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int x; VM_BUG_ON(nodeid > num_online_nodes()); - n = cachep->node[nodeid]; + n = get_node(cachep, nodeid); BUG_ON(!n); retry: @@ -3304,7 +3292,7 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, if (nodeid == NUMA_NO_NODE) nodeid = slab_node; - if (unlikely(!cachep->node[nodeid])) { + if (unlikely(!get_node(cachep, nodeid))) { /* Node not bootstrapped yet */ ptr = fallback_alloc(cachep, flags); goto out; @@ -3420,7 +3408,7 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, objp = objpp[i]; page = virt_to_head_page(objp); - n = cachep->node[node]; + n = get_node(cachep, node); list_del(&page->lru); check_spinlock_acquired_node(cachep, node); slab_put_obj(cachep, page, objp, node); @@ -3462,7 +3450,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) BUG_ON(!batchcount || batchcount > ac->avail); #endif check_irq_off(); - n = cachep->node[node]; + n = get_node(cachep, node); spin_lock(&n->list_lock); if (n->shared) { struct array_cache *shared_array = n->shared; @@ -3775,7 +3763,7 @@ static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp) } } - n = cachep->node[node]; + n = get_node(cachep, node); if (n) { struct array_cache *shared = n->shared; @@ -3820,9 +3808,8 @@ fail: /* Cache is not active yet. Roll back what we did */ node--; while (node >= 0) { - if (cachep->node[node]) { - n = cachep->node[node]; - + n = get_node(cachep, node); + if (n) { kfree(n->shared); free_alien_cache(n->alien); kfree(n); @@ -3884,11 +3871,17 @@ static int __do_tune_cpucache(struct kmem_cache *cachep, int limit, for_each_online_cpu(i) { struct array_cache *ccold = new->new[i]; + int node; + struct kmem_cache_node *n; + if (!ccold) continue; - spin_lock_irq(&cachep->node[cpu_to_mem(i)]->list_lock); - free_block(cachep, ccold->entry, ccold->avail, cpu_to_mem(i)); - spin_unlock_irq(&cachep->node[cpu_to_mem(i)]->list_lock); + + node = cpu_to_mem(i); + n = get_node(cachep, node); + spin_lock_irq(&n->list_lock); + free_block(cachep, ccold->entry, ccold->avail, node); + spin_unlock_irq(&n->list_lock); kfree(ccold); } kfree(new); @@ -4048,7 +4041,7 @@ static void cache_reap(struct work_struct *w) * have established with reasonable certainty that * we can do some work if the lock was obtained. 
*/ - n = searchp->node[node]; + n = get_node(searchp, node); reap_alien(searchp, n); @@ -4100,10 +4093,7 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) active_objs = 0; num_slabs = 0; - for_each_online_node(node) { - n = cachep->node[node]; - if (!n) - continue; + for_each_kmem_cache_node(cachep, node, n) { check_irq_on(); spin_lock_irq(&n->list_lock); @@ -4328,10 +4318,7 @@ static int leaks_show(struct seq_file *m, void *p) x[1] = 0; - for_each_online_node(node) { - n = cachep->node[node]; - if (!n) - continue; + for_each_kmem_cache_node(cachep, node, n) { check_irq_on(); spin_lock_irq(&n->list_lock); -- cgit v1.2.3 From 5240ab4076bd3815473f2f2991741acc698f8b58 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Wed, 6 Aug 2014 16:04:14 -0700 Subject: mm: slab.h: wrap the whole file with guarding macro The guarding section: #ifndef MM_SLAB_H #define MM_SLAB_H ... #endif currently doesn't cover the whole of mm/slab.h. It seems this was done unintentionally. Wrap the whole file by moving the closing #endif to the end of it. Signed-off-by: Andrey Ryabinin Acked-by: Christoph Lameter Acked-by: David Rientjes Reviewed-by: Vladimir Davydov Cc: Pekka Enberg Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/slab.h b/mm/slab.h index 3f9766e393a3..3822b65edcc2 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -260,7 +260,6 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) WARN_ON_ONCE(1); return s; } -#endif #ifndef CONFIG_SLOB /* @@ -311,3 +310,5 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) void *slab_next(struct seq_file *m, void *p, loff_t *pos); void slab_stop(struct seq_file *m, void *p); + +#endif /* MM_SLAB_H */ -- cgit v1.2.3 From c07b8183cbb86d34007e5a3935e0ec89f5bb83c6 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 6 Aug 2014 16:04:16 -0700 Subject: mm, slub: mark resiliency_test as init text resiliency_test() is only called during bootstrap, so it may be moved to init.text and freed after boot. Signed-off-by: David Rientjes Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/slub.c b/mm/slub.c index 3918cd62a4b2..2d61503efb92 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4187,7 +4187,7 @@ static int list_locations(struct kmem_cache *s, char *buf, #endif #ifdef SLUB_RESILIENCY_TEST -static void resiliency_test(void) +static void __init resiliency_test(void) { u8 *p; -- cgit v1.2.3 From 02e72cc61713185013d958baba508288ba2a0157 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Wed, 6 Aug 2014 16:04:18 -0700 Subject: mm: slub: SLUB_DEBUG=n: use the same alloc/free hooks as for SLUB_DEBUG=y There are two versions of the alloc/free hooks now - one for CONFIG_SLUB_DEBUG=y and another for CONFIG_SLUB_DEBUG=n. I see no reason why calls to other debugging subsystems (LOCKDEP, DEBUG_ATOMIC_SLEEP, KMEMCHECK and FAILSLAB) are hidden under SLUB_DEBUG. All these features should work regardless of the SLUB_DEBUG config, as all of them already have their own Kconfig options. This also fixes failslab for the CONFIG_SLUB_DEBUG=n configuration. It simply did not work before, because the should_failslab() call was in a hook hidden under "#ifdef CONFIG_SLUB_DEBUG #else". Note: There is one concealed change in the allocation path for SLUB_DEBUG=n with all other debugging features disabled.
The might_sleep_if() call can generate some code even if DEBUG_ATOMIC_SLEEP=n. For PREEMPT_VOLUNTARY=y might_sleep() inserts _cond_resched() call, but I think it should be ok. Signed-off-by: Andrey Ryabinin Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 97 ++++++++++++++++++++++++--------------------------------------- 1 file changed, 36 insertions(+), 61 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 2d61503efb92..92d8139c556d 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -939,60 +939,6 @@ static void trace(struct kmem_cache *s, struct page *page, void *object, } } -/* - * Hooks for other subsystems that check memory allocations. In a typical - * production configuration these hooks all should produce no code at all. - */ -static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) -{ - kmemleak_alloc(ptr, size, 1, flags); -} - -static inline void kfree_hook(const void *x) -{ - kmemleak_free(x); -} - -static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) -{ - flags &= gfp_allowed_mask; - lockdep_trace_alloc(flags); - might_sleep_if(flags & __GFP_WAIT); - - return should_failslab(s->object_size, flags, s->flags); -} - -static inline void slab_post_alloc_hook(struct kmem_cache *s, - gfp_t flags, void *object) -{ - flags &= gfp_allowed_mask; - kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); - kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags); -} - -static inline void slab_free_hook(struct kmem_cache *s, void *x) -{ - kmemleak_free_recursive(x, s->flags); - - /* - * Trouble is that we may no longer disable interrupts in the fast path - * So in order to make the debug calls that expect irqs to be - * disabled we need to disable interrupts temporarily. - */ -#if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP) - { - unsigned long flags; - - local_irq_save(flags); - kmemcheck_slab_free(s, x, s->object_size); - debug_check_no_locks_freed(x, s->object_size); - local_irq_restore(flags); - } -#endif - if (!(s->flags & SLAB_DEBUG_OBJECTS)) - debug_check_no_obj_freed(x, s->object_size); -} - /* * Tracking of fully allocated slabs for debugging purposes. */ @@ -1277,6 +1223,12 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node, static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) {} +#endif /* CONFIG_SLUB_DEBUG */ + +/* + * Hooks for other subsystems that check memory allocations. In a typical + * production configuration these hooks all should produce no code at all. 
+ */ static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) { kmemleak_alloc(ptr, size, 1, flags); } @@ -1288,21 +1240,44 @@ static inline void kfree_hook(const void *x) } static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) - { return 0; } +{ + flags &= gfp_allowed_mask; + lockdep_trace_alloc(flags); + might_sleep_if(flags & __GFP_WAIT); -static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, - void *object) + return should_failslab(s->object_size, flags, s->flags); +} + +static inline void slab_post_alloc_hook(struct kmem_cache *s, + gfp_t flags, void *object) { - kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, - flags & gfp_allowed_mask); + flags &= gfp_allowed_mask; + kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); + kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags); } static inline void slab_free_hook(struct kmem_cache *s, void *x) { kmemleak_free_recursive(x, s->flags); -} -#endif /* CONFIG_SLUB_DEBUG */ + /* + * Trouble is that we may no longer disable interrupts in the fast path + * So in order to make the debug calls that expect irqs to be + * disabled we need to disable interrupts temporarily. + */ +#if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP) + { + unsigned long flags; + + local_irq_save(flags); + kmemcheck_slab_free(s, x, s->object_size); + debug_check_no_locks_freed(x, s->object_size); + local_irq_restore(flags); + } +#endif + if (!(s->flags & SLAB_DEBUG_OBJECTS)) + debug_check_no_obj_freed(x, s->object_size); +} /* * Slab allocation and freeing -- cgit v1.2.3 From 8a9c61d4381c5e5007cc68e023940b18fa0808d7 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 6 Aug 2014 16:04:20 -0700 Subject: slab: add unlikely macro to help compiler This patchset does some cleanup and tries to remove lockdep annotation. Patches 1~2 are just really minor improvements. Patches 3~9 are for clean-up and for removing the lockdep annotation. There are two cases where lockdep annotation is needed in SLAB: 1) holding two node locks 2) holding two array cache (alien cache) locks I looked at the code and found that we can avoid these cases without any negative effect. 1) occurs if freeing an object produces a new free slab and we decide to destroy it. Although we don't need to hold the lock while destroying a slab, the current code does so. Destroying a slab without holding the lock helps reduce lock contention. To do this, I change the implementation so that the new free slab is destroyed after releasing the lock. 2) occurs in a similar situation. When we free an object from a non-local node, we put it into the alien cache while holding the alien cache lock. If the alien cache is full, we try to flush it to the proper node cache, and at this point a new free slab could be created. Destroying it then begins, and we may free a metadata object which comes from another node. In this case, we need another node's alien cache lock to free the object. This forces us to hold two array cache locks, and then we need the lockdep annotation even though they are always different locks and a deadlock is not possible. To prevent this situation, I use the same approach as in 1). In this way, we can avoid cases 1) and 2), and can then remove the lockdep annotation. As the diffstat shows, this makes the SLAB code much simpler. This patch (of 9): slab_should_failslab() is called on every allocation, so optimizing it is reasonable. We normally don't allocate from kmem_cache; it is only used when a new kmem_cache is created, so this is a very rare case.
Therefore, add the unlikely macro to help compiler optimization. Signed-off-by: Joonsoo Kim Acked-by: David Rientjes Acked-by: Christoph Lameter Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/slab.c b/mm/slab.c index 66b3ffbb890d..7d07942b9804 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3048,7 +3048,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags) { - if (cachep == kmem_cache) + if (unlikely(cachep == kmem_cache)) return false; return should_failslab(cachep->object_size, flags, cachep->flags); -- cgit v1.2.3 From 25c063fbd5512eb7190bf5af88351109aededb3f Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 6 Aug 2014 16:04:22 -0700 Subject: slab: move up code to get kmem_cache_node in free_block() node isn't changed, so we don't need to retrieve this structure every time we move an object. The compiler may already do this optimization, but making it explicit is better. Signed-off-by: Joonsoo Kim Acked-by: Christoph Lameter Cc: Pekka Enberg Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index 7d07942b9804..205632c94a6a 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3398,7 +3398,7 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, int node) { int i; - struct kmem_cache_node *n; + struct kmem_cache_node *n = get_node(cachep, node); for (i = 0; i < nr_objects; i++) { void *objp; @@ -3408,7 +3408,6 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, objp = objpp[i]; page = virt_to_head_page(objp); - n = get_node(cachep, node); list_del(&page->lru); check_spinlock_acquired_node(cachep, node); slab_put_obj(cachep, page, objp, node); -- cgit v1.2.3 From 97654dfa20caa5e6c1b0a4af715aabaf5d070d69 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 6 Aug 2014 16:04:25 -0700 Subject: slab: defer slab_destroy in free_block() In free_block(), if freeing an object produces a new free slab and the number of free_objects exceeds free_limit, we start to destroy this new free slab while holding the kmem_cache node lock. Holding the lock there is unnecessary and, generally, holding a lock for as short a time as possible is a good thing. I never measured the performance effect of this, but we'd be better off not holding the lock longer than needed. Commented by Christoph: This is also good because kmem_cache_free is no longer called while holding the node lock. So we avoid one case of recursion.
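For illustration, the pattern this patch introduces at each call site can be condensed as follows (a simplified sketch of the post-patch flow, not a verbatim excerpt; free_objects_deferred() is a made-up wrapper name):

	/*
	 * Collect slabs that became fully free while holding the node
	 * lock; destroy them only after the lock has been dropped.
	 */
	static void free_objects_deferred(struct kmem_cache *cachep,
					  void **objpp, int nr, int node)
	{
		struct kmem_cache_node *n = get_node(cachep, node);
		LIST_HEAD(list);	/* detached free slabs */

		spin_lock(&n->list_lock);
		free_block(cachep, objpp, nr, node, &list);
		spin_unlock(&n->list_lock);

		slabs_destroy(cachep, &list);	/* no lock held here */
	}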
Signed-off-by: Joonsoo Kim Acked-by: Christoph Lameter Cc: Pekka Enberg Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 60 +++++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 41 insertions(+), 19 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index 205632c94a6a..f6ad8d335be7 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -242,7 +242,8 @@ static struct kmem_cache_node __initdata init_kmem_cache_node[NUM_INIT_LISTS]; static int drain_freelist(struct kmem_cache *cache, struct kmem_cache_node *n, int tofree); static void free_block(struct kmem_cache *cachep, void **objpp, int len, - int node); + int node, struct list_head *list); +static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list); static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp); static void cache_reap(struct work_struct *unused); @@ -1030,6 +1031,7 @@ static void __drain_alien_cache(struct kmem_cache *cachep, struct array_cache *ac, int node) { struct kmem_cache_node *n = get_node(cachep, node); + LIST_HEAD(list); if (ac->avail) { spin_lock(&n->list_lock); @@ -1041,9 +1043,10 @@ static void __drain_alien_cache(struct kmem_cache *cachep, if (n->shared) transfer_objects(n->shared, ac, ac->limit); - free_block(cachep, ac->entry, ac->avail, node); + free_block(cachep, ac->entry, ac->avail, node, &list); ac->avail = 0; spin_unlock(&n->list_lock); + slabs_destroy(cachep, &list); } } @@ -1087,6 +1090,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) struct kmem_cache_node *n; struct array_cache *alien = NULL; int node; + LIST_HEAD(list); node = numa_mem_id(); @@ -1111,8 +1115,9 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) } else { n = get_node(cachep, nodeid); spin_lock(&n->list_lock); - free_block(cachep, &objp, 1, nodeid); + free_block(cachep, &objp, 1, nodeid, &list); spin_unlock(&n->list_lock); + slabs_destroy(cachep, &list); } return 1; } @@ -1182,6 +1187,7 @@ static void cpuup_canceled(long cpu) struct array_cache *nc; struct array_cache *shared; struct array_cache **alien; + LIST_HEAD(list); /* cpu is dead; no one can alloc from it. 
*/ nc = cachep->array[cpu]; @@ -1196,7 +1202,7 @@ static void cpuup_canceled(long cpu) /* Free limit for this kmem_cache_node */ n->free_limit -= cachep->batchcount; if (nc) - free_block(cachep, nc->entry, nc->avail, node); + free_block(cachep, nc->entry, nc->avail, node, &list); if (!cpumask_empty(mask)) { spin_unlock_irq(&n->list_lock); @@ -1206,7 +1212,7 @@ static void cpuup_canceled(long cpu) shared = n->shared; if (shared) { free_block(cachep, shared->entry, - shared->avail, node); + shared->avail, node, &list); n->shared = NULL; } @@ -1221,6 +1227,7 @@ static void cpuup_canceled(long cpu) free_alien_cache(alien); } free_array_cache: + slabs_destroy(cachep, &list); kfree(nc); } /* @@ -2056,6 +2063,16 @@ static void slab_destroy(struct kmem_cache *cachep, struct page *page) kmem_cache_free(cachep->freelist_cache, freelist); } +static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list) +{ + struct page *page, *n; + + list_for_each_entry_safe(page, n, list, lru) { + list_del(&page->lru); + slab_destroy(cachep, page); + } +} + /** * calculate_slab_order - calculate size (page order) of slabs * @cachep: pointer to the cache that is being created @@ -2459,13 +2476,15 @@ static void do_drain(void *arg) struct array_cache *ac; int node = numa_mem_id(); struct kmem_cache_node *n; + LIST_HEAD(list); check_irq_off(); ac = cpu_cache_get(cachep); n = get_node(cachep, node); spin_lock(&n->list_lock); - free_block(cachep, ac->entry, ac->avail, node); + free_block(cachep, ac->entry, ac->avail, node, &list); spin_unlock(&n->list_lock); + slabs_destroy(cachep, &list); ac->avail = 0; } @@ -3393,9 +3412,10 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller) /* * Caller needs to acquire correct kmem_cache_node's list_lock + * @list: List of detached free slabs should be freed by caller */ -static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, - int node) +static void free_block(struct kmem_cache *cachep, void **objpp, + int nr_objects, int node, struct list_head *list) { int i; struct kmem_cache_node *n = get_node(cachep, node); @@ -3418,13 +3438,7 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, if (page->active == 0) { if (n->free_objects > n->free_limit) { n->free_objects -= cachep->num; - /* No need to drop any previously held - * lock here, even if we have a off-slab slab - * descriptor it is guaranteed to come from - * a different cache, refer to comments before - * alloc_slabmgmt. 
- */ - slab_destroy(cachep, page); + list_add_tail(&page->lru, list); } else { list_add(&page->lru, &n->slabs_free); } @@ -3443,6 +3457,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) int batchcount; struct kmem_cache_node *n; int node = numa_mem_id(); + LIST_HEAD(list); batchcount = ac->batchcount; #if DEBUG @@ -3464,7 +3479,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) } } - free_block(cachep, ac->entry, batchcount, node); + free_block(cachep, ac->entry, batchcount, node, &list); free_done: #if STATS { @@ -3485,6 +3500,7 @@ free_done: } #endif spin_unlock(&n->list_lock); + slabs_destroy(cachep, &list); ac->avail -= batchcount; memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail); } @@ -3765,12 +3781,13 @@ static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp) n = get_node(cachep, node); if (n) { struct array_cache *shared = n->shared; + LIST_HEAD(list); spin_lock_irq(&n->list_lock); if (shared) free_block(cachep, shared->entry, - shared->avail, node); + shared->avail, node, &list); n->shared = new_shared; if (!n->alien) { @@ -3780,6 +3797,7 @@ static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp) n->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount + cachep->num; spin_unlock_irq(&n->list_lock); + slabs_destroy(cachep, &list); kfree(shared); free_alien_cache(new_alien); continue; @@ -3869,6 +3887,7 @@ static int __do_tune_cpucache(struct kmem_cache *cachep, int limit, cachep->shared = shared; for_each_online_cpu(i) { + LIST_HEAD(list); struct array_cache *ccold = new->new[i]; int node; struct kmem_cache_node *n; @@ -3879,8 +3898,9 @@ static int __do_tune_cpucache(struct kmem_cache *cachep, int limit, node = cpu_to_mem(i); n = get_node(cachep, node); spin_lock_irq(&n->list_lock); - free_block(cachep, ccold->entry, ccold->avail, node); + free_block(cachep, ccold->entry, ccold->avail, node, &list); spin_unlock_irq(&n->list_lock); + slabs_destroy(cachep, &list); kfree(ccold); } kfree(new); @@ -3988,6 +4008,7 @@ skip_setup: static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n, struct array_cache *ac, int force, int node) { + LIST_HEAD(list); int tofree; if (!ac || !ac->avail) @@ -4000,12 +4021,13 @@ static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n, tofree = force ? ac->avail : (ac->limit + 4) / 5; if (tofree > ac->avail) tofree = (ac->avail + 1) / 2; - free_block(cachep, ac->entry, tofree, node); + free_block(cachep, ac->entry, tofree, node, &list); ac->avail -= tofree; memmove(ac->entry, &(ac->entry[tofree]), sizeof(void *) * ac->avail); } spin_unlock_irq(&n->list_lock); + slabs_destroy(cachep, &list); } } -- cgit v1.2.3 From 1fe00d50a9e81150de5000490b87ed227525cf09 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 6 Aug 2014 16:04:27 -0700 Subject: slab: factor out initialization of array cache Factor out initialization of array cache to use it in following patch. 
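Condensed, the refactor splits allocation from setup so that a later patch can reuse the setup for a different wrapper type; roughly (a cleaned-up rendering of the flattened diff that follows, not new code):

	static void init_arraycache(struct array_cache *ac, int limit, int batch)
	{
		/* entries are not valid references for kmemleak */
		kmemleak_no_scan(ac);
		if (ac) {
			ac->avail = 0;
			ac->limit = limit;
			ac->batchcount = batch;
			ac->touched = 0;
			spin_lock_init(&ac->lock);
		}
	}

	static struct array_cache *alloc_arraycache(int node, int entries,
						    int batchcount, gfp_t gfp)
	{
		int memsize = sizeof(void *) * entries + sizeof(struct array_cache);
		struct array_cache *ac = kmalloc_node(memsize, gfp, node);

		init_arraycache(ac, entries, batchcount);
		return ac;
	}

The following patch then calls init_arraycache(&alc->ac, ...) on the array_cache embedded in the new alien_cache type.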
Signed-off-by: Joonsoo Kim Acked-by: Christoph Lameter Cc: Pekka Enberg Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index f6ad8d335be7..8d9a0fff160d 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -791,13 +791,8 @@ static void start_cpu_timer(int cpu) } } -static struct array_cache *alloc_arraycache(int node, int entries, - int batchcount, gfp_t gfp) +static void init_arraycache(struct array_cache *ac, int limit, int batch) { - int memsize = sizeof(void *) * entries + sizeof(struct array_cache); - struct array_cache *nc = NULL; - - nc = kmalloc_node(memsize, gfp, node); /* * The array_cache structures contain pointers to free object. * However, when such objects are allocated or transferred to another @@ -805,15 +800,25 @@ static struct array_cache *alloc_arraycache(int node, int entries, * valid references during a kmemleak scan. Therefore, kmemleak must * not scan such objects. */ - kmemleak_no_scan(nc); - if (nc) { - nc->avail = 0; - nc->limit = entries; - nc->batchcount = batchcount; - nc->touched = 0; - spin_lock_init(&nc->lock); + kmemleak_no_scan(ac); + if (ac) { + ac->avail = 0; + ac->limit = limit; + ac->batchcount = batch; + ac->touched = 0; + spin_lock_init(&ac->lock); } - return nc; +} + +static struct array_cache *alloc_arraycache(int node, int entries, + int batchcount, gfp_t gfp) +{ + int memsize = sizeof(void *) * entries + sizeof(struct array_cache); + struct array_cache *ac = NULL; + + ac = kmalloc_node(memsize, gfp, node); + init_arraycache(ac, entries, batchcount); + return ac; } static inline bool is_slab_pfmemalloc(struct page *page) -- cgit v1.2.3 From c8522a3a5832b843570a3315674f5a3575958a51 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 6 Aug 2014 16:04:29 -0700 Subject: slab: introduce alien_cache Currently, we use array_cache for the alien cache. Although the two are mostly similar, there is one difference: the need for a spinlock. We don't need a spinlock for array_cache itself, but to use array_cache for the alien cache, the array_cache structure has to carry one. This is needless overhead, so removing it would be better. This patch prepares for that by introducing alien_cache and using it. In the following patch, we remove the spinlock from array_cache.
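The resulting layout, condensed from the diff below (together with the spinlock removal from array_cache that lands two patches later in this series):

	struct array_cache {
		unsigned int avail;
		unsigned int limit;
		unsigned int batchcount;
		unsigned int touched;
		void *entry[];		/* free objects, allocated inline */
	};

	struct alien_cache {
		spinlock_t lock;	/* serializes remote-node frees */
		struct array_cache ac;
	};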
Signed-off-by: Joonsoo Kim Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 108 ++++++++++++++++++++++++++++++++++++++------------------------ mm/slab.h | 2 +- 2 files changed, 68 insertions(+), 42 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index 8d9a0fff160d..de91d6f3a2a4 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -203,6 +203,11 @@ struct array_cache { */ }; +struct alien_cache { + spinlock_t lock; + struct array_cache ac; +}; + #define SLAB_OBJ_PFMEMALLOC 1 static inline bool is_obj_pfmemalloc(void *objp) { @@ -491,7 +496,7 @@ static void slab_set_lock_classes(struct kmem_cache *cachep, struct lock_class_key *l3_key, struct lock_class_key *alc_key, struct kmem_cache_node *n) { - struct array_cache **alc; + struct alien_cache **alc; int r; lockdep_set_class(&n->list_lock, l3_key); @@ -507,7 +512,7 @@ static void slab_set_lock_classes(struct kmem_cache *cachep, return; for_each_node(r) { if (alc[r]) - lockdep_set_class(&alc[r]->lock, alc_key); + lockdep_set_class(&(alc[r]->ac.lock), alc_key); } } @@ -965,12 +970,13 @@ static int transfer_objects(struct array_cache *to, #define drain_alien_cache(cachep, alien) do { } while (0) #define reap_alien(cachep, n) do { } while (0) -static inline struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) +static inline struct alien_cache **alloc_alien_cache(int node, + int limit, gfp_t gfp) { - return (struct array_cache **)BAD_ALIEN_MAGIC; + return (struct alien_cache **)BAD_ALIEN_MAGIC; } -static inline void free_alien_cache(struct array_cache **ac_ptr) +static inline void free_alien_cache(struct alien_cache **ac_ptr) { } @@ -996,40 +1002,52 @@ static inline void *____cache_alloc_node(struct kmem_cache *cachep, static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int); static void *alternate_node_alloc(struct kmem_cache *, gfp_t); -static struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) +static struct alien_cache *__alloc_alien_cache(int node, int entries, + int batch, gfp_t gfp) +{ + int memsize = sizeof(void *) * entries + sizeof(struct alien_cache); + struct alien_cache *alc = NULL; + + alc = kmalloc_node(memsize, gfp, node); + init_arraycache(&alc->ac, entries, batch); + return alc; +} + +static struct alien_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) { - struct array_cache **ac_ptr; + struct alien_cache **alc_ptr; int memsize = sizeof(void *) * nr_node_ids; int i; if (limit > 1) limit = 12; - ac_ptr = kzalloc_node(memsize, gfp, node); - if (ac_ptr) { - for_each_node(i) { - if (i == node || !node_online(i)) - continue; - ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp); - if (!ac_ptr[i]) { - for (i--; i >= 0; i--) - kfree(ac_ptr[i]); - kfree(ac_ptr); - return NULL; - } + alc_ptr = kzalloc_node(memsize, gfp, node); + if (!alc_ptr) + return NULL; + + for_each_node(i) { + if (i == node || !node_online(i)) + continue; + alc_ptr[i] = __alloc_alien_cache(node, limit, 0xbaadf00d, gfp); + if (!alc_ptr[i]) { + for (i--; i >= 0; i--) + kfree(alc_ptr[i]); + kfree(alc_ptr); + return NULL; } } - return ac_ptr; + return alc_ptr; } -static void free_alien_cache(struct array_cache **ac_ptr) +static void free_alien_cache(struct alien_cache **alc_ptr) { int i; - if (!ac_ptr) + if (!alc_ptr) return; for_each_node(i) - kfree(ac_ptr[i]); - kfree(ac_ptr); + kfree(alc_ptr[i]); + kfree(alc_ptr); } static void __drain_alien_cache(struct kmem_cache *cachep, @@ -1063,25 +1081,31 @@ static void reap_alien(struct 
kmem_cache *cachep, struct kmem_cache_node *n) int node = __this_cpu_read(slab_reap_node); if (n->alien) { - struct array_cache *ac = n->alien[node]; - - if (ac && ac->avail && spin_trylock_irq(&ac->lock)) { - __drain_alien_cache(cachep, ac, node); - spin_unlock_irq(&ac->lock); + struct alien_cache *alc = n->alien[node]; + struct array_cache *ac; + + if (alc) { + ac = &alc->ac; + if (ac->avail && spin_trylock_irq(&ac->lock)) { + __drain_alien_cache(cachep, ac, node); + spin_unlock_irq(&ac->lock); + } } } } static void drain_alien_cache(struct kmem_cache *cachep, - struct array_cache **alien) + struct alien_cache **alien) { int i = 0; + struct alien_cache *alc; struct array_cache *ac; unsigned long flags; for_each_online_node(i) { - ac = alien[i]; - if (ac) { + alc = alien[i]; + if (alc) { + ac = &alc->ac; spin_lock_irqsave(&ac->lock, flags); __drain_alien_cache(cachep, ac, i); spin_unlock_irqrestore(&ac->lock, flags); @@ -1093,7 +1117,8 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) { int nodeid = page_to_nid(virt_to_page(objp)); struct kmem_cache_node *n; - struct array_cache *alien = NULL; + struct alien_cache *alien = NULL; + struct array_cache *ac; int node; LIST_HEAD(list); @@ -1110,13 +1135,14 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) STATS_INC_NODEFREES(cachep); if (n->alien && n->alien[nodeid]) { alien = n->alien[nodeid]; - spin_lock(&alien->lock); - if (unlikely(alien->avail == alien->limit)) { + ac = &alien->ac; + spin_lock(&ac->lock); + if (unlikely(ac->avail == ac->limit)) { STATS_INC_ACOVERFLOW(cachep); - __drain_alien_cache(cachep, alien, nodeid); + __drain_alien_cache(cachep, ac, nodeid); } - ac_put_obj(cachep, alien, objp); - spin_unlock(&alien->lock); + ac_put_obj(cachep, ac, objp); + spin_unlock(&ac->lock); } else { n = get_node(cachep, nodeid); spin_lock(&n->list_lock); @@ -1191,7 +1217,7 @@ static void cpuup_canceled(long cpu) list_for_each_entry(cachep, &slab_caches, list) { struct array_cache *nc; struct array_cache *shared; - struct array_cache **alien; + struct alien_cache **alien; LIST_HEAD(list); /* cpu is dead; no one can alloc from it. 
*/ @@ -1272,7 +1298,7 @@ static int cpuup_prepare(long cpu) list_for_each_entry(cachep, &slab_caches, list) { struct array_cache *nc; struct array_cache *shared = NULL; - struct array_cache **alien = NULL; + struct alien_cache **alien = NULL; nc = alloc_arraycache(node, cachep->limit, cachep->batchcount, GFP_KERNEL); @@ -3762,7 +3788,7 @@ static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp) int node; struct kmem_cache_node *n; struct array_cache *new_shared; - struct array_cache **new_alien = NULL; + struct alien_cache **new_alien = NULL; for_each_online_node(node) { diff --git a/mm/slab.h b/mm/slab.h index 3822b65edcc2..928823e17e58 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -276,7 +276,7 @@ struct kmem_cache_node { unsigned int free_limit; unsigned int colour_next; /* Per-node cache coloring */ struct array_cache *shared; /* shared per node */ - struct array_cache **alien; /* on other nodes */ + struct alien_cache **alien; /* on other nodes */ unsigned long next_reap; /* updated without locking */ int free_touched; /* updated without locking */ #endif -- cgit v1.2.3 From 49dfc304ba241b315068023962004542c5118103 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 6 Aug 2014 16:04:31 -0700 Subject: slab: use the lock on alien_cache, instead of the lock on array_cache Now, we have separate alien_cache structure, so it'd be better to hold the lock on alien_cache while manipulating alien_cache. After that, we don't need the lock on array_cache, so remove it. Signed-off-by: Joonsoo Kim Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index de91d6f3a2a4..e4ce73c32a7a 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -191,7 +191,6 @@ struct array_cache { unsigned int limit; unsigned int batchcount; unsigned int touched; - spinlock_t lock; void *entry[]; /* * Must have this definition in here for the proper * alignment of array_cache. 
Also simplifies accessing @@ -512,7 +511,7 @@ static void slab_set_lock_classes(struct kmem_cache *cachep, return; for_each_node(r) { if (alc[r]) - lockdep_set_class(&(alc[r]->ac.lock), alc_key); + lockdep_set_class(&(alc[r]->lock), alc_key); } } @@ -811,7 +810,6 @@ static void init_arraycache(struct array_cache *ac, int limit, int batch) ac->limit = limit; ac->batchcount = batch; ac->touched = 0; - spin_lock_init(&ac->lock); } } @@ -1010,6 +1008,7 @@ static struct alien_cache *__alloc_alien_cache(int node, int entries, alc = kmalloc_node(memsize, gfp, node); init_arraycache(&alc->ac, entries, batch); + spin_lock_init(&alc->lock); return alc; } @@ -1086,9 +1085,9 @@ static void reap_alien(struct kmem_cache *cachep, struct kmem_cache_node *n) if (alc) { ac = &alc->ac; - if (ac->avail && spin_trylock_irq(&ac->lock)) { + if (ac->avail && spin_trylock_irq(&alc->lock)) { __drain_alien_cache(cachep, ac, node); - spin_unlock_irq(&ac->lock); + spin_unlock_irq(&alc->lock); } } } @@ -1106,9 +1105,9 @@ static void drain_alien_cache(struct kmem_cache *cachep, alc = alien[i]; if (alc) { ac = &alc->ac; - spin_lock_irqsave(&ac->lock, flags); + spin_lock_irqsave(&alc->lock, flags); __drain_alien_cache(cachep, ac, i); - spin_unlock_irqrestore(&ac->lock, flags); + spin_unlock_irqrestore(&alc->lock, flags); } } } @@ -1136,13 +1135,13 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) if (n->alien && n->alien[nodeid]) { alien = n->alien[nodeid]; ac = &alien->ac; - spin_lock(&ac->lock); + spin_lock(&alien->lock); if (unlikely(ac->avail == ac->limit)) { STATS_INC_ACOVERFLOW(cachep); __drain_alien_cache(cachep, ac, nodeid); } ac_put_obj(cachep, ac, objp); - spin_unlock(&ac->lock); + spin_unlock(&alien->lock); } else { n = get_node(cachep, nodeid); spin_lock(&n->list_lock); @@ -1613,10 +1612,6 @@ void __init kmem_cache_init(void) memcpy(ptr, cpu_cache_get(kmem_cache), sizeof(struct arraycache_init)); - /* - * Do not assume that spinlocks can be initialized via memcpy: - */ - spin_lock_init(&ptr->lock); kmem_cache->array[smp_processor_id()] = ptr; @@ -1626,10 +1621,6 @@ void __init kmem_cache_init(void) != &initarray_generic.cache); memcpy(ptr, cpu_cache_get(kmalloc_caches[INDEX_AC]), sizeof(struct arraycache_init)); - /* - * Do not assume that spinlocks can be initialized via memcpy: - */ - spin_lock_init(&ptr->lock); kmalloc_caches[INDEX_AC]->array[smp_processor_id()] = ptr; } -- cgit v1.2.3 From 833b706cc8b7b555e18d3426e9616bd066883a7a Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 6 Aug 2014 16:04:33 -0700 Subject: slab: destroy a slab without holding any alien cache lock I haven't heard that this alien cache lock is contended, but to reduce chance of contention would be better generally. And with this change, we can simplify complex lockdep annotation in slab code. In the following patch, it will be implemented. 
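Condensed, the alien free path after this patch looks roughly like this (a simplified sketch of the diff below; the _sketch suffix is ours, and statistics and fallback handling are trimmed):

	static int cache_free_alien_sketch(struct kmem_cache *cachep, void *objp,
					   struct alien_cache *alien, int nodeid)
	{
		struct array_cache *ac = &alien->ac;
		LIST_HEAD(list);

		spin_lock(&alien->lock);
		if (unlikely(ac->avail == ac->limit))
			__drain_alien_cache(cachep, ac, nodeid, &list);
		ac_put_obj(cachep, ac, objp);
		spin_unlock(&alien->lock);

		/* slab_destroy() now runs with no alien lock held */
		slabs_destroy(cachep, &list);
		return 1;
	}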
Signed-off-by: Joonsoo Kim Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index e4ce73c32a7a..e4dc0896b891 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1050,10 +1050,10 @@ static void free_alien_cache(struct alien_cache **alc_ptr) } static void __drain_alien_cache(struct kmem_cache *cachep, - struct array_cache *ac, int node) + struct array_cache *ac, int node, + struct list_head *list) { struct kmem_cache_node *n = get_node(cachep, node); - LIST_HEAD(list); if (ac->avail) { spin_lock(&n->list_lock); @@ -1065,10 +1065,9 @@ static void __drain_alien_cache(struct kmem_cache *cachep, if (n->shared) transfer_objects(n->shared, ac, ac->limit); - free_block(cachep, ac->entry, ac->avail, node, &list); + free_block(cachep, ac->entry, ac->avail, node, list); ac->avail = 0; spin_unlock(&n->list_lock); - slabs_destroy(cachep, &list); } } @@ -1086,8 +1085,11 @@ static void reap_alien(struct kmem_cache *cachep, struct kmem_cache_node *n) if (alc) { ac = &alc->ac; if (ac->avail && spin_trylock_irq(&alc->lock)) { - __drain_alien_cache(cachep, ac, node); + LIST_HEAD(list); + + __drain_alien_cache(cachep, ac, node, &list); spin_unlock_irq(&alc->lock); + slabs_destroy(cachep, &list); } } } @@ -1104,10 +1106,13 @@ static void drain_alien_cache(struct kmem_cache *cachep, for_each_online_node(i) { alc = alien[i]; if (alc) { + LIST_HEAD(list); + ac = &alc->ac; spin_lock_irqsave(&alc->lock, flags); - __drain_alien_cache(cachep, ac, i); + __drain_alien_cache(cachep, ac, i, &list); spin_unlock_irqrestore(&alc->lock, flags); + slabs_destroy(cachep, &list); } } } @@ -1138,10 +1143,11 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) spin_lock(&alien->lock); if (unlikely(ac->avail == ac->limit)) { STATS_INC_ACOVERFLOW(cachep); - __drain_alien_cache(cachep, ac, nodeid); + __drain_alien_cache(cachep, ac, nodeid, &list); } ac_put_obj(cachep, ac, objp); spin_unlock(&alien->lock); + slabs_destroy(cachep, &list); } else { n = get_node(cachep, nodeid); spin_lock(&n->list_lock); -- cgit v1.2.3 From 367f7f2f45e7f601bcf87aeffb0c81e6d26e53df Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 6 Aug 2014 16:04:35 -0700 Subject: slab: remove a useless lockdep annotation Now there is no code that holds two locks simultaneously, since we don't call slab_destroy() while holding any lock. So the lockdep annotation is useless now. Remove it. v2: don't remove BAD_ALIEN_MAGIC in this patch. It will be removed in the following patch. Signed-off-by: Joonsoo Kim Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 153 -------------------------------------------------------------- 1 file changed, 153 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index e4dc0896b891..630c85469164 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -472,139 +472,6 @@ static struct kmem_cache kmem_cache_boot = { #define BAD_ALIEN_MAGIC 0x01020304ul -#ifdef CONFIG_LOCKDEP - -/* - * Slab sometimes uses the kmalloc slabs to store the slab headers - * for other slabs "off slab". - * The locking for this is tricky in that it nests within the locks - * of all other slabs in a few places; to deal with this special - * locking we put on-slab caches into a separate lock-class. - * - * We set lock class for alien array caches which are up during init.
- * The lock annotation will be lost if all cpus of a node goes down and - * then comes back up during hotplug - */ -static struct lock_class_key on_slab_l3_key; -static struct lock_class_key on_slab_alc_key; - -static struct lock_class_key debugobj_l3_key; -static struct lock_class_key debugobj_alc_key; - -static void slab_set_lock_classes(struct kmem_cache *cachep, - struct lock_class_key *l3_key, struct lock_class_key *alc_key, - struct kmem_cache_node *n) -{ - struct alien_cache **alc; - int r; - - lockdep_set_class(&n->list_lock, l3_key); - alc = n->alien; - /* - * FIXME: This check for BAD_ALIEN_MAGIC - * should go away when common slab code is taught to - * work even without alien caches. - * Currently, non NUMA code returns BAD_ALIEN_MAGIC - * for alloc_alien_cache, - */ - if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC) - return; - for_each_node(r) { - if (alc[r]) - lockdep_set_class(&(alc[r]->lock), alc_key); - } -} - -static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, - struct kmem_cache_node *n) -{ - slab_set_lock_classes(cachep, &debugobj_l3_key, &debugobj_alc_key, n); -} - -static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep) -{ - int node; - struct kmem_cache_node *n; - - for_each_kmem_cache_node(cachep, node, n) - slab_set_debugobj_lock_classes_node(cachep, n); -} - -static void init_node_lock_keys(int q) -{ - int i; - - if (slab_state < UP) - return; - - for (i = 1; i <= KMALLOC_SHIFT_HIGH; i++) { - struct kmem_cache_node *n; - struct kmem_cache *cache = kmalloc_caches[i]; - - if (!cache) - continue; - - n = get_node(cache, q); - if (!n || OFF_SLAB(cache)) - continue; - - slab_set_lock_classes(cache, &on_slab_l3_key, - &on_slab_alc_key, n); - } -} - -static void on_slab_lock_classes_node(struct kmem_cache *cachep, - struct kmem_cache_node *n) -{ - slab_set_lock_classes(cachep, &on_slab_l3_key, - &on_slab_alc_key, n); -} - -static inline void on_slab_lock_classes(struct kmem_cache *cachep) -{ - int node; - struct kmem_cache_node *n; - - VM_BUG_ON(OFF_SLAB(cachep)); - for_each_kmem_cache_node(cachep, node, n) - on_slab_lock_classes_node(cachep, n); -} - -static inline void __init init_lock_keys(void) -{ - int node; - - for_each_node(node) - init_node_lock_keys(node); -} -#else -static void __init init_node_lock_keys(int q) -{ -} - -static inline void init_lock_keys(void) -{ -} - -static inline void on_slab_lock_classes(struct kmem_cache *cachep) -{ -} - -static inline void on_slab_lock_classes_node(struct kmem_cache *cachep, - struct kmem_cache_node *n) -{ -} - -static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, - struct kmem_cache_node *n) -{ -} - -static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep) -{ -} -#endif - static DEFINE_PER_CPU(struct delayed_work, slab_reap_work); static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) @@ -1348,13 +1215,7 @@ static int cpuup_prepare(long cpu) spin_unlock_irq(&n->list_lock); kfree(shared); free_alien_cache(alien); - if (cachep->flags & SLAB_DEBUG_OBJECTS) - slab_set_debugobj_lock_classes_node(cachep, n); - else if (!OFF_SLAB(cachep) && - !(cachep->flags & SLAB_DESTROY_BY_RCU)) - on_slab_lock_classes_node(cachep, n); } - init_node_lock_keys(node); return 0; bad: @@ -1663,9 +1524,6 @@ void __init kmem_cache_init_late(void) BUG(); mutex_unlock(&slab_mutex); - /* Annotate slab for lockdep -- annotate the malloc caches */ - init_lock_keys(); - /* Done! 
*/ slab_state = FULL; @@ -2446,17 +2304,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) return err; } - if (flags & SLAB_DEBUG_OBJECTS) { - /* - * Would deadlock through slab_destroy()->call_rcu()-> - * debug_object_activate()->kmem_cache_alloc(). - */ - WARN_ON_ONCE(flags & SLAB_DESTROY_BY_RCU); - - slab_set_debugobj_lock_classes(cachep); - } else if (!OFF_SLAB(cachep) && !(flags & SLAB_DESTROY_BY_RCU)) - on_slab_lock_classes(cachep); - return 0; } -- cgit v1.2.3 From a640616822b2c3a8009b0600f20c4a76ea8a0025 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 6 Aug 2014 16:04:38 -0700 Subject: slab: remove BAD_ALIEN_MAGIC BAD_ALIEN_MAGIC value isn't used anymore. So remove it. Signed-off-by: Joonsoo Kim Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index 630c85469164..42a9eddb61cd 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -470,8 +470,6 @@ static struct kmem_cache kmem_cache_boot = { .name = "kmem_cache", }; -#define BAD_ALIEN_MAGIC 0x01020304ul - static DEFINE_PER_CPU(struct delayed_work, slab_reap_work); static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) @@ -838,7 +836,7 @@ static int transfer_objects(struct array_cache *to, static inline struct alien_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) { - return (struct alien_cache **)BAD_ALIEN_MAGIC; + return NULL; } static inline void free_alien_cache(struct alien_cache **ac_ptr) -- cgit v1.2.3 From 5e804789673114c616816f8387169790afe376b5 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 6 Aug 2014 16:04:40 -0700 Subject: slab: change int to size_t for representing allocation size It is better to represent allocation size in size_t rather than int. So change it. 
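To see why, consider a standalone userspace illustration (hypothetical numbers, not kernel code): sizeof() already yields size_t, so the arithmetic happens in size_t, but storing the result in an int truncates it for large counts on 64-bit systems.

	#include <stddef.h>
	#include <stdio.h>

	int main(void)
	{
		long entries = 1L << 29;	/* hypothetical, absurdly large count */
		int as_int = sizeof(void *) * entries;		/* 4 GiB truncated */
		size_t as_size_t = sizeof(void *) * entries;	/* kept intact */

		/* On a typical LP64 system: "as int: 0, as size_t: 4294967296" */
		printf("as int: %d, as size_t: %zu\n", as_int, as_size_t);
		return 0;
	}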
Signed-off-by: Joonsoo Kim Suggested-by: Andrew Morton Cc: Christoph Lameter Reviewed-by: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index 42a9eddb61cd..1351725f7936 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -681,7 +681,7 @@ static void init_arraycache(struct array_cache *ac, int limit, int batch) static struct array_cache *alloc_arraycache(int node, int entries, int batchcount, gfp_t gfp) { - int memsize = sizeof(void *) * entries + sizeof(struct array_cache); + size_t memsize = sizeof(void *) * entries + sizeof(struct array_cache); struct array_cache *ac = NULL; ac = kmalloc_node(memsize, gfp, node); @@ -868,7 +868,7 @@ static void *alternate_node_alloc(struct kmem_cache *, gfp_t); static struct alien_cache *__alloc_alien_cache(int node, int entries, int batch, gfp_t gfp) { - int memsize = sizeof(void *) * entries + sizeof(struct alien_cache); + size_t memsize = sizeof(void *) * entries + sizeof(struct alien_cache); struct alien_cache *alc = NULL; alc = kmalloc_node(memsize, gfp, node); @@ -880,7 +880,7 @@ static struct alien_cache *__alloc_alien_cache(int node, int entries, static struct alien_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) { struct alien_cache **alc_ptr; - int memsize = sizeof(void *) * nr_node_ids; + size_t memsize = sizeof(void *) * nr_node_ids; int i; if (limit > 1) @@ -1037,7 +1037,7 @@ static int init_cache_node_node(int node) { struct kmem_cache *cachep; struct kmem_cache_node *n; - const int memsize = sizeof(struct kmem_cache_node); + const size_t memsize = sizeof(struct kmem_cache_node); list_for_each_entry(cachep, &slab_caches, list) { /* -- cgit v1.2.3 From 54266640709a24c9844245d0d9f36b9cb1f31326 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Wed, 6 Aug 2014 16:04:42 -0700 Subject: slub: avoid duplicate creation on the first object When a kmem_cache is created with a ctor, each object in the kmem_cache is initialized before it is ready to use. In the slub implementation, however, the first object was initialized twice. This patch removes the duplicate initialization of the first object. This fixes commit 7656c72b ("SLUB: add macros for scanning objects in a slab").
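The off-by-one is easy to demonstrate outside the kernel (a standalone sketch that mimics the shape of the pre-patch loop; names and sizes are illustrative, not kernel code):

	#include <stdio.h>

	#define SIZE 32		/* hypothetical object size */
	#define NOBJ 4

	static int setups[NOBJ];

	static void setup_object(char *base, char *p)
	{
		setups[(p - base) / SIZE]++;	/* count per-object setups */
	}

	int main(void)
	{
		char slab[NOBJ * SIZE];
		char *start = slab, *p, *last = start;

		/* pre-patch loop shape: 'last' lags 'p' by one iteration,
		 * so the first object is visited twice */
		for (p = start; p < start + NOBJ * SIZE; p += SIZE) {
			setup_object(start, last);
			last = p;
		}
		setup_object(start, last);	/* final object */

		for (int i = 0; i < NOBJ; i++)
			printf("object %d set up %d times\n", i, setups[i]);
		return 0;
	}

Running it prints "object 0 set up 2 times" while every other object is set up once; the new for_each_object_idx() loop in the diff below sets each object up exactly once and points the last freepointer at NULL directly.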
Signed-off-by: Wei Yang Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 92d8139c556d..1f1f838326a0 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -283,6 +283,10 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\ __p += (__s)->size) +#define for_each_object_idx(__p, __idx, __s, __addr, __objects) \ + for (__p = (__addr), __idx = 1; __idx <= __objects;\ + __p += (__s)->size, __idx++) + /* Determine object index from a given position */ static inline int slab_index(void *p, struct kmem_cache *s, void *addr) { @@ -1379,9 +1383,9 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) { struct page *page; void *start; - void *last; void *p; int order; + int idx; BUG_ON(flags & GFP_SLAB_BUG_MASK); @@ -1402,14 +1406,13 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) if (unlikely(s->flags & SLAB_POISON)) memset(start, POISON_INUSE, PAGE_SIZE << order); - last = start; - for_each_object(p, s, start, page->objects) { - setup_object(s, page, last); - set_freepointer(s, last, p); - last = p; + for_each_object_idx(p, idx, s, start, page->objects) { + setup_object(s, page, p); + if (likely(idx < page->objects)) + set_freepointer(s, p, p + s->size); + else + set_freepointer(s, p, NULL); } - setup_object(s, page, last); - set_freepointer(s, last, NULL); page->freelist = start; page->inuse = page->objects; -- cgit v1.2.3 From 928cec9cd6db53a68f54bc9ef1c54c674ba1c6bb Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Wed, 6 Aug 2014 16:04:44 -0700 Subject: mm: move slab related stuff from util.c to slab_common.c Functions krealloc(), __krealloc(), kzfree() belongs to slab API, so should be placed in slab_common.c Also move slab allocator's tracepoints defenitions to slab_common.c No functional changes here. Signed-off-by: Andrey Ryabinin Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab_common.c | 101 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ mm/util.c | 102 ------------------------------------------------------- 2 files changed, 101 insertions(+), 102 deletions(-) diff --git a/mm/slab_common.c b/mm/slab_common.c index d31c4bacc6a2..d319502b2403 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -19,6 +19,8 @@ #include #include #include + +#define CREATE_TRACE_POINTS #include #include "slab.h" @@ -787,3 +789,102 @@ static int __init slab_proc_init(void) } module_init(slab_proc_init); #endif /* CONFIG_SLABINFO */ + +static __always_inline void *__do_krealloc(const void *p, size_t new_size, + gfp_t flags) +{ + void *ret; + size_t ks = 0; + + if (p) + ks = ksize(p); + + if (ks >= new_size) + return (void *)p; + + ret = kmalloc_track_caller(new_size, flags); + if (ret && p) + memcpy(ret, p, ks); + + return ret; +} + +/** + * __krealloc - like krealloc() but don't free @p. + * @p: object to reallocate memory for. + * @new_size: how many bytes of memory are required. + * @flags: the type of memory to allocate. + * + * This function is like krealloc() except it never frees the originally + * allocated buffer. Use this if you don't want to free the buffer immediately + * like, for example, with RCU. 
+ */ +void *__krealloc(const void *p, size_t new_size, gfp_t flags) +{ + if (unlikely(!new_size)) + return ZERO_SIZE_PTR; + + return __do_krealloc(p, new_size, flags); + +} +EXPORT_SYMBOL(__krealloc); + +/** + * krealloc - reallocate memory. The contents will remain unchanged. + * @p: object to reallocate memory for. + * @new_size: how many bytes of memory are required. + * @flags: the type of memory to allocate. + * + * The contents of the object pointed to are preserved up to the + * lesser of the new and old sizes. If @p is %NULL, krealloc() + * behaves exactly like kmalloc(). If @new_size is 0 and @p is not a + * %NULL pointer, the object pointed to is freed. + */ +void *krealloc(const void *p, size_t new_size, gfp_t flags) +{ + void *ret; + + if (unlikely(!new_size)) { + kfree(p); + return ZERO_SIZE_PTR; + } + + ret = __do_krealloc(p, new_size, flags); + if (ret && p != ret) + kfree(p); + + return ret; +} +EXPORT_SYMBOL(krealloc); + +/** + * kzfree - like kfree but zero memory + * @p: object to free memory of + * + * The memory of the object @p points to is zeroed before freed. + * If @p is %NULL, kzfree() does nothing. + * + * Note: this function zeroes the whole allocated buffer which can be a good + * deal bigger than the requested buffer size passed to kmalloc(). So be + * careful when using this function in performance sensitive code. + */ +void kzfree(const void *p) +{ + size_t ks; + void *mem = (void *)p; + + if (unlikely(ZERO_OR_NULL_PTR(mem))) + return; + ks = ksize(mem); + memset(mem, 0, ks); + kfree(mem); +} +EXPORT_SYMBOL(kzfree); + +/* Tracepoints definitions. */ +EXPORT_TRACEPOINT_SYMBOL(kmalloc); +EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); +EXPORT_TRACEPOINT_SYMBOL(kmalloc_node); +EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc_node); +EXPORT_TRACEPOINT_SYMBOL(kfree); +EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free); diff --git a/mm/util.c b/mm/util.c index d5ea733c5082..7b6608df2ee8 100644 --- a/mm/util.c +++ b/mm/util.c @@ -16,9 +16,6 @@ #include "internal.h" -#define CREATE_TRACE_POINTS -#include - /** * kstrdup - allocate space for and copy an existing string * @s: the string to duplicate @@ -112,97 +109,6 @@ void *memdup_user(const void __user *src, size_t len) } EXPORT_SYMBOL(memdup_user); -static __always_inline void *__do_krealloc(const void *p, size_t new_size, - gfp_t flags) -{ - void *ret; - size_t ks = 0; - - if (p) - ks = ksize(p); - - if (ks >= new_size) - return (void *)p; - - ret = kmalloc_track_caller(new_size, flags); - if (ret && p) - memcpy(ret, p, ks); - - return ret; -} - -/** - * __krealloc - like krealloc() but don't free @p. - * @p: object to reallocate memory for. - * @new_size: how many bytes of memory are required. - * @flags: the type of memory to allocate. - * - * This function is like krealloc() except it never frees the originally - * allocated buffer. Use this if you don't want to free the buffer immediately - * like, for example, with RCU. - */ -void *__krealloc(const void *p, size_t new_size, gfp_t flags) -{ - if (unlikely(!new_size)) - return ZERO_SIZE_PTR; - - return __do_krealloc(p, new_size, flags); - -} -EXPORT_SYMBOL(__krealloc); - -/** - * krealloc - reallocate memory. The contents will remain unchanged. - * @p: object to reallocate memory for. - * @new_size: how many bytes of memory are required. - * @flags: the type of memory to allocate. - * - * The contents of the object pointed to are preserved up to the - * lesser of the new and old sizes. If @p is %NULL, krealloc() - * behaves exactly like kmalloc(). 
If @new_size is 0 and @p is not a - * %NULL pointer, the object pointed to is freed. - */ -void *krealloc(const void *p, size_t new_size, gfp_t flags) -{ - void *ret; - - if (unlikely(!new_size)) { - kfree(p); - return ZERO_SIZE_PTR; - } - - ret = __do_krealloc(p, new_size, flags); - if (ret && p != ret) - kfree(p); - - return ret; -} -EXPORT_SYMBOL(krealloc); - -/** - * kzfree - like kfree but zero memory - * @p: object to free memory of - * - * The memory of the object @p points to is zeroed before freed. - * If @p is %NULL, kzfree() does nothing. - * - * Note: this function zeroes the whole allocated buffer which can be a good - * deal bigger than the requested buffer size passed to kmalloc(). So be - * careful when using this function in performance sensitive code. - */ -void kzfree(const void *p) -{ - size_t ks; - void *mem = (void *)p; - - if (unlikely(ZERO_OR_NULL_PTR(mem))) - return; - ks = ksize(mem); - memset(mem, 0, ks); - kfree(mem); -} -EXPORT_SYMBOL(kzfree); - /* * strndup_user - duplicate an existing string from user space * @s: The string to duplicate @@ -504,11 +410,3 @@ out_mm: out: return res; } - -/* Tracepoints definitions. */ -EXPORT_TRACEPOINT_SYMBOL(kmalloc); -EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); -EXPORT_TRACEPOINT_SYMBOL(kmalloc_node); -EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc_node); -EXPORT_TRACEPOINT_SYMBOL(kfree); -EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free); -- cgit v1.2.3 From 8a7d9b4306258e092afaae3c663661d22bf91f5c Mon Sep 17 00:00:00 2001 From: Wang Sheng-Hui Date: Wed, 6 Aug 2014 16:04:46 -0700 Subject: mm/slab.c: fix comments Current struct kmem_cache has no 'lock' field, and slab page is managed by struct kmem_cache_node, which has 'list_lock' field. Clean up the related comment. Signed-off-by: Wang Sheng-Hui Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index 1351725f7936..2e60bf3dedbb 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1611,7 +1611,8 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) } /* - * Interface to system's page allocator. No need to hold the cache-lock. + * Interface to system's page allocator. No need to hold the + * kmem_cache_node ->list_lock. * * If we requested dmaable memory, we will get it. Even if we * did not request dmaable memory, we might get it, but that @@ -1913,9 +1914,9 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, * @cachep: cache pointer being destroyed * @page: page pointer being destroyed * - * Destroy all the objs in a slab, and release the mem back to the system. - * Before calling the slab must have been unlinked from the cache. The - * cache-lock is not held/needed. + * Destroy all the objs in a slab page, and release the mem back to the system. + * Before calling the slab page must have been unlinked from the cache. The + * kmem_cache_node ->list_lock is not held/needed. */ static void slab_destroy(struct kmem_cache *cachep, struct page *page) { -- cgit v1.2.3 From 0aa9a13d80bae1bb24956f6e3e2662b7242e0b41 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 6 Aug 2014 16:04:48 -0700 Subject: mm, slub: fix some indenting in cmpxchg_double_slab() The return statement goes with the cmpxchg_double() condition so it needs to be indented another tab. 
Also these days the fashion is to line function parameters up, and it looks nicer that way because then the "freelist_new" is not at the same indent level as the "return 1;". Signed-off-by: Dan Carpenter Signed-off-by: Pekka Enberg Signed-off-by: David Rientjes Cc: Joonsoo Kim Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 1f1f838326a0..d9aadbfe7c29 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -381,9 +381,9 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) if (s->flags & __CMPXCHG_DOUBLE) { if (cmpxchg_double(&page->freelist, &page->counters, - freelist_old, counters_old, - freelist_new, counters_new)) - return 1; + freelist_old, counters_old, + freelist_new, counters_new)) + return 1; } else #endif { @@ -417,9 +417,9 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) if (s->flags & __CMPXCHG_DOUBLE) { if (cmpxchg_double(&page->freelist, &page->counters, - freelist_old, counters_old, - freelist_new, counters_new)) - return 1; + freelist_old, counters_old, + freelist_new, counters_new)) + return 1; } else #endif { -- cgit v1.2.3 From 4307c14f3c77bc0cb0facfc9c67c7872505aaedf Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Wed, 6 Aug 2014 16:04:51 -0700 Subject: slab: fix the alias count (via sysfs) of slab cache We mark some slab caches (e.g. kmem_cache_node) as unmergeable by setting refcount to -1, so their alias count should be 0, not refcount-1; correct it here. Signed-off-by: Gu Zheng Acked-by: David Rientjes Cc: Christoph Lameter Cc: Pekka Enberg Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/slub.c b/mm/slub.c index d9aadbfe7c29..9b861b90cde1 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4466,7 +4466,7 @@ SLAB_ATTR_RO(ctor); static ssize_t aliases_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", s->refcount - 1); + return sprintf(buf, "%d\n", s->refcount < 0 ? 0 : s->refcount - 1); } SLAB_ATTR_RO(aliases); -- cgit v1.2.3 From c42e5715617232563f0cf9f231d86b5133c4487e Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 6 Aug 2014 16:04:53 -0700 Subject: slab: convert last use of __FUNCTION__ to __func__ Just about all of these have been converted to __func__, so convert the last use. Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/slab.h b/mm/slab.h index 928823e17e58..0e0fdd365840 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -256,7 +256,7 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) return cachep; pr_err("%s: Wrong slab cache. %s but object is from %s\n", - __FUNCTION__, cachep->name, s->name); + __func__, cachep->name, s->name); WARN_ON_ONCE(1); return s; } -- cgit v1.2.3 From 3e2faa085448d5c478ebc9d5f6cb4d822467f4d7 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 6 Aug 2014 16:04:55 -0700 Subject: mm/readahead.c: remove unused file_ra_state from count_history_pages count_history_pages() only calls page_cache_prev_hole(), under rcu_read_lock(), on the address_space mapping. There's no need for the file_ra_state argument here; a sketch of the resulting helper follows.
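As a reference for the result, here is a plausible reconstruction of count_history_pages() after the change (only the signature and the call site appear in the hunks below, so this body is inferred from the description above, not copied from the tree):

	static pgoff_t count_history_pages(struct address_space *mapping,
					   pgoff_t offset, unsigned long max)
	{
		pgoff_t head;

		rcu_read_lock();
		head = page_cache_prev_hole(mapping, offset - 1, max);
		rcu_read_unlock();

		/* distance back to the previous hole = number of history pages */
		return offset - 1 - head;
	}

Nothing in this body touches readahead state, which is exactly why the file_ra_state parameter can be dropped.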
Signed-off-by: Fabian Frederick Acked-by: Fengguang Wu Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/readahead.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/readahead.c b/mm/readahead.c index 0ca36a7770b1..17b9172ec37f 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -326,7 +326,6 @@ static unsigned long get_next_ra_size(struct file_ra_state *ra, * - thrashing threshold in memory tight systems */ static pgoff_t count_history_pages(struct address_space *mapping, - struct file_ra_state *ra, pgoff_t offset, unsigned long max) { pgoff_t head; @@ -349,7 +348,7 @@ static int try_context_readahead(struct address_space *mapping, { pgoff_t size; - size = count_history_pages(mapping, ra, offset, max); + size = count_history_pages(mapping, offset, max); /* * not enough history pages: -- cgit v1.2.3 From f276540441d255e2f87b37411c4fb75b0eca1606 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 6 Aug 2014 16:04:57 -0700 Subject: mm/memory_hotplug.c: add __meminit to grow_zone_span/grow_pgdat_span grow_zone_span and grow_pgdat_span are only called by __meminit __add_zone Signed-off-by: Fabian Frederick Cc: Toshi Kani Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 469bbf505f85..3557e8c9e8de 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -284,8 +284,8 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat) } #endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */ -static void grow_zone_span(struct zone *zone, unsigned long start_pfn, - unsigned long end_pfn) +static void __meminit grow_zone_span(struct zone *zone, unsigned long start_pfn, + unsigned long end_pfn) { unsigned long old_zone_end_pfn; @@ -427,8 +427,8 @@ out_fail: return -1; } -static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, - unsigned long end_pfn) +static void __meminit grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, + unsigned long end_pfn) { unsigned long old_pgdat_end_pfn = pgdat_end_pfn(pgdat); -- cgit v1.2.3 From e19318116048d5fbdb8d230d6d37625834b503cd Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 6 Aug 2014 16:04:59 -0700 Subject: mm/page_alloc.c: add __meminit to alloc_pages_exact_nid() alloc_pages_exact_nid() is only called by __meminit alloc_page_cgroup() Signed-off-by: Fabian Frederick Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 2 +- mm/page_alloc.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 6eb1fb37de9a..5e7219dc0fae 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -360,7 +360,7 @@ extern unsigned long get_zeroed_page(gfp_t gfp_mask); void *alloc_pages_exact(size_t size, gfp_t gfp_mask); void free_pages_exact(void *virt, size_t size); /* This is different from alloc_pages_exact_node !!! 
*/ -void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask); +void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask); #define __get_free_page(gfp_mask) \ __get_free_pages((gfp_mask), 0) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ef44ad736ca1..fd4322cc096d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2962,7 +2962,7 @@ EXPORT_SYMBOL(alloc_pages_exact); * Note this is not alloc_pages_exact_node() which allocates on a specific node, * but is not exact. */ -void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) +void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) { unsigned order = get_order(size); struct page *p = alloc_pages_node(nid, gfp_mask, order); -- cgit v1.2.3 From b95b4e1ed92a203f4bdfc55f53d6e9c2773e3b6d Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 6 Aug 2014 16:05:01 -0700 Subject: mm/page_alloc.c: unexport alloc_pages_exact_nid() It is only called by mm/page_cgroup.c which cannot be modular. Reported-by: David Rientjes Cc: Fabian Frederick Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fd4322cc096d..8c4f1b220ab9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2970,7 +2970,6 @@ void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) return NULL; return make_alloc_exact((unsigned long)page_address(p), order, size); } -EXPORT_SYMBOL(alloc_pages_exact_nid); /** * free_pages_exact - release memory allocated via alloc_pages_exact() -- cgit v1.2.3 From 2cfb3665e864755400dc57b6ceee2ebb6b382910 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 6 Aug 2014 16:05:03 -0700 Subject: include/linux/memblock.h: add __init to memblock_set_bottom_up() memblock_set_bottom_up() is only called by __init cmdline_parse_movable_node() and __init numa_init(). Signed-off-by: Fabian Frederick Reviewed-by: Tang Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/memblock.h b/include/linux/memblock.h index b660e05b63d4..e8cc45307f8f 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -249,7 +249,7 @@ phys_addr_t memblock_alloc(phys_addr_t size, phys_addr_t align); /* * Set the allocation direction to bottom-up or top-down. */ -static inline void memblock_set_bottom_up(bool enable) +static inline void __init memblock_set_bottom_up(bool enable) { memblock.bottom_up = enable; } @@ -264,7 +264,7 @@ static inline bool memblock_bottom_up(void) return memblock.bottom_up; } #else -static inline void memblock_set_bottom_up(bool enable) {} +static inline void __init memblock_set_bottom_up(bool enable) {} static inline bool memblock_bottom_up(void) { return false; } #endif -- cgit v1.2.3 From 474750aba88817c53f39424e5567b8e4acc4b39b Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 6 Aug 2014 16:05:06 -0700 Subject: vmalloc: use rcu list iterator to reduce vmap_area_lock contention Richard Yao reported a month ago that his system has trouble with vmap_area_lock contention during performance analysis by /proc/meminfo. Andrew asked why the analysis polls /proc/meminfo so heavily, but got no answer. https://lkml.org/lkml/2014/4/10/416 Although I'm not sure whether this is the right usage, there is a solution that reduces vmap_area_lock contention with no side effects.
The fix is simply to use an RCU list iterator in get_vmalloc_info(). RCU can be used in this function because the full RCU protocol is already respected by writers, since Nick Piggin's commit db64fe02258f1 ("mm: rewrite vmap layer") back in linux-2.6.28. Specifically: insertions use list_add_rcu(), deletions use list_del_rcu() and kfree_rcu(). Note the rb tree is not used from the RCU reader (it would not be safe); only the vmap_area_list has full RCU protection. Note that __purge_vmap_area_lazy() already uses this rcu protection. rcu_read_lock(); list_for_each_entry_rcu(va, &vmap_area_list, list) { if (va->flags & VM_LAZY_FREE) { if (va->va_start < *start) *start = va->va_start; if (va->va_end > *end) *end = va->va_end; nr += (va->va_end - va->va_start) >> PAGE_SHIFT; list_add_tail(&va->purge_list, &valist); va->flags |= VM_LAZY_FREEING; va->flags &= ~VM_LAZY_FREE; } } rcu_read_unlock(); Peter: : While rcu list traversal over the vmap_area_list is safe, this may : arrive at different results than the spinlocked version. The rcu list : traversal version will not be a 'snapshot' of a single, valid instant : of the entire vmap_area_list, but rather a potential amalgam of : different list states. Joonsoo: : Yes, you are right, but I don't think that we should be strict here. : Meminfo is already not a 'snapshot' at specific time. While we try to get : certain stats, the other stats can change. And, although we may arrive at : different results than the spinlocked version, the difference would not be : large and would not make serious side-effect. [edumazet@google.com: add more commit description] Signed-off-by: Joonsoo Kim Reported-by: Richard Yao Acked-by: Eric Dumazet Cc: Peter Hurley Cc: Zhang Yanfei Cc: Johannes Weiner Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index f64632b67196..fdbb116ee669 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2690,14 +2690,14 @@ void get_vmalloc_info(struct vmalloc_info *vmi) prev_end = VMALLOC_START; - spin_lock(&vmap_area_lock); + rcu_read_lock(); if (list_empty(&vmap_area_list)) { vmi->largest_chunk = VMALLOC_TOTAL; goto out; } - list_for_each_entry(va, &vmap_area_list, list) { + list_for_each_entry_rcu(va, &vmap_area_list, list) { unsigned long addr = va->va_start; /* @@ -2724,7 +2724,7 @@ void get_vmalloc_info(struct vmalloc_info *vmi) vmi->largest_chunk = VMALLOC_END - prev_end; out: - spin_unlock(&vmap_area_lock); + rcu_read_unlock(); } #endif -- cgit v1.2.3 From c0d73261f5c1355a35b8b40e871d31578ce0c044 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 6 Aug 2014 16:05:08 -0700 Subject: mm/memory.c: use entry = ACCESS_ONCE(*pte) in handle_pte_fault() Use ACCESS_ONCE() in handle_pte_fault() when getting the entry or orig_pte upon which all subsequent decisions and pte_same() tests will be made. I have no evidence that its lack is responsible for the mm/filemap.c:202 BUG_ON(page_mapped(page)) in __delete_from_page_cache() found by trinity, and I am not optimistic that it will fix it. But I have found no other explanation, and ACCESS_ONCE() here will surely not hurt. If gcc does re-access the pte before passing it down, then that would be disastrous for correct page fault handling, and certainly could explain the page_mapped() BUGs seen (concurrent fault causing page to be mapped in a second time on top of itself: mapcount 2 for a single pte). Signed-off-by: Hugh Dickins Cc: Sasha Levin Cc: Linus Torvalds Cc: "Kirill A.
Shutemov" Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index 8b44f765b645..06ff0720d75a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3181,7 +3181,7 @@ static int handle_pte_fault(struct mm_struct *mm, pte_t entry; spinlock_t *ptl; - entry = *pte; + entry = ACCESS_ONCE(*pte); if (!pte_present(entry)) { if (pte_none(entry)) { if (vma->vm_ops) { -- cgit v1.2.3 From 1f6a6cc82ed305c09385753c80bb7b3bc9eea864 Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Wed, 6 Aug 2014 16:05:11 -0700 Subject: mem-hotplug: avoid illegal state prefixed with legal state when changing state of memory_block We use the following command to online a memory_block: echo online|online_kernel|online_movable > /sys/devices/system/memory/memoryXXX/state But, if we do the following: echo online_fhsjkghfkd > /sys/devices/system/memory/memoryXXX/state the block will also be onlined. This is because the following code in store_mem_state() does not compare the whole string, but only the prefix of the string. store_mem_state() { ...... 328 if (!strncmp(buf, "online_kernel", min_t(int, count, 13))) Here, only the first 13 letters of the string are compared. If we give "online_kernelXXXXXX", it will be recognized as online_kernel, which is incorrect. 329 online_type = ONLINE_KERNEL; 330 else if (!strncmp(buf, "online_movable", min_t(int, count, 14))) We have the same problem here, 331 online_type = ONLINE_MOVABLE; 332 else if (!strncmp(buf, "online", min_t(int, count, 6))) here, (this one is more problematic: if we give online_movalbe, a typo of online_movable, it will be recognized as online without the author noticing), 333 online_type = ONLINE_KEEP; 334 else if (!strncmp(buf, "offline", min_t(int, count, 7))) and here. 335 online_type = -1; 336 else { 337 ret = -EINVAL; 338 goto err; 339 } ...... } This patch fixes this problem by using sysfs_streq() to compare the whole string. Signed-off-by: Tang Chen Reported-by: Hu Tao Cc: Greg Kroah-Hartman Cc: Lai Jiangshan Cc: Yasuaki Ishimatsu Cc: Gu Zheng Acked-by: Toshi Kani Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 89f752dd8465..c6707dfb5284 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -315,13 +315,13 @@ store_mem_state(struct device *dev, if (ret) return ret; - if (!strncmp(buf, "online_kernel", min_t(int, count, 13))) + if (sysfs_streq(buf, "online_kernel")) online_type = ONLINE_KERNEL; - else if (!strncmp(buf, "online_movable", min_t(int, count, 14))) + else if (sysfs_streq(buf, "online_movable")) online_type = ONLINE_MOVABLE; - else if (!strncmp(buf, "online", min_t(int, count, 6))) + else if (sysfs_streq(buf, "online")) online_type = ONLINE_KEEP; - else if (!strncmp(buf, "offline", min_t(int, count, 7))) + else if (sysfs_streq(buf, "offline")) online_type = -1; else { ret = -EINVAL; goto err; } -- cgit v1.2.3 From 4f7c6b49c45a398d72763d1f0e64ddff8b3653c7 Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Wed, 6 Aug 2014 16:05:13 -0700 Subject: mem-hotplug: introduce MMOP_OFFLINE to replace the hard coding -1 In store_mem_state(), we have: ... 334 else if (!strncmp(buf, "offline", min_t(int, count, 7))) 335 online_type = -1; ... 355 case -1: 356 ret = device_offline(&mem->dev); 357 break; ... Here, "offline" is hard coded as -1.
This patch does the following renaming: ONLINE_KEEP -> MMOP_ONLINE_KEEP ONLINE_KERNEL -> MMOP_ONLINE_KERNEL ONLINE_MOVABLE -> MMOP_ONLINE_MOVABLE and introduces MMOP_OFFLINE = -1 to avoid hard coding. Signed-off-by: Tang Chen Cc: Hu Tao Cc: Greg Kroah-Hartman Cc: Lai Jiangshan Cc: Yasuaki Ishimatsu Cc: Gu Zheng Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 18 +++++++++--------- include/linux/memory_hotplug.h | 9 +++++---- mm/memory_hotplug.c | 9 ++++++--- 3 files changed, 20 insertions(+), 16 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index c6707dfb5284..7c60ed27e711 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -284,7 +284,7 @@ static int memory_subsys_online(struct device *dev) * attribute and need to set the online_type. */ if (mem->online_type < 0) - mem->online_type = ONLINE_KEEP; + mem->online_type = MMOP_ONLINE_KEEP; ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE); @@ -316,22 +316,22 @@ store_mem_state(struct device *dev, return ret; if (sysfs_streq(buf, "online_kernel")) - online_type = ONLINE_KERNEL; + online_type = MMOP_ONLINE_KERNEL; else if (sysfs_streq(buf, "online_movable")) - online_type = ONLINE_MOVABLE; + online_type = MMOP_ONLINE_MOVABLE; else if (sysfs_streq(buf, "online")) - online_type = ONLINE_KEEP; + online_type = MMOP_ONLINE_KEEP; else if (sysfs_streq(buf, "offline")) - online_type = -1; + online_type = MMOP_OFFLINE; else { ret = -EINVAL; goto err; } switch (online_type) { - case ONLINE_KERNEL: - case ONLINE_MOVABLE: - case ONLINE_KEEP: + case MMOP_ONLINE_KERNEL: + case MMOP_ONLINE_MOVABLE: + case MMOP_ONLINE_KEEP: /* * mem->online_type is not protected so there can be a * race here. However, when racing online, the first @@ -342,7 +342,7 @@ store_mem_state(struct device *dev, mem->online_type = online_type; ret = device_online(&mem->dev); break; - case -1: + case MMOP_OFFLINE: ret = device_offline(&mem->dev); break; default: diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 010d125bffbf..79dd9eca054f 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -26,11 +26,12 @@ enum { MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE = NODE_INFO, }; -/* Types for control the zone type of onlined memory */ +/* Types for control the zone type of onlined and offlined memory */ enum { - ONLINE_KEEP, - ONLINE_KERNEL, - ONLINE_MOVABLE, + MMOP_OFFLINE = -1, + MMOP_ONLINE_KEEP, + MMOP_ONLINE_KERNEL, + MMOP_ONLINE_MOVABLE, }; /* diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 3557e8c9e8de..a3797d3fd8a4 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -977,15 +977,18 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ zone = page_zone(pfn_to_page(pfn)); ret = -EINVAL; - if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) && + if ((zone_idx(zone) > ZONE_NORMAL || + online_type == MMOP_ONLINE_MOVABLE) && !can_online_high_movable(zone)) goto out; - if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) { + if (online_type == MMOP_ONLINE_KERNEL && + zone_idx(zone) == ZONE_MOVABLE) { if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) goto out; } - if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) { + if (online_type == MMOP_ONLINE_MOVABLE && + zone_idx(zone) == ZONE_MOVABLE - 1) { if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) goto out; } -- cgit v1.2.3 From 7be12fc9f8dcae7f4c9647d495bd64fc4ffb6836 Mon Sep 
17 00:00:00 2001 From: Michal Nazarewicz Date: Wed, 6 Aug 2014 16:05:15 -0700 Subject: mm: page_alloc: simplify drain_zone_pages by using min() Instead of open-coding taking the minimum of two values, just use the min() macro. That is what it is there for. While changing the function, also change the type of the batch local variable to match the type of per_cpu_pages::batch (which is int). Signed-off-by: Michal Nazarewicz Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8c4f1b220ab9..c1c6cb78e5ca 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1257,15 +1257,11 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) { unsigned long flags; - int to_drain; - unsigned long batch; + int to_drain, batch; local_irq_save(flags); batch = ACCESS_ONCE(pcp->batch); - if (pcp->count >= batch) - to_drain = batch; - else - to_drain = pcp->count; + to_drain = min(pcp->count, batch); if (to_drain > 0) { free_pcppages_bulk(zone, to_drain, pcp); pcp->count -= to_drain; -- cgit v1.2.3 From bc7f84c0e67c0ca90b6d0e95cc293ed5d8ad30c4 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 6 Aug 2014 16:05:17 -0700 Subject: mm/internal.h: use nth_page Use nth_page() instead of open-coding pfn_to_page(page_to_pfn(base) + offset). Signed-off-by: Fabian Frederick Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/internal.h b/mm/internal.h index 7f22a11fcc66..a1b651b11c5f 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -247,7 +247,7 @@ static inline void mlock_migrate_page(struct page *new, struct page *old) { } static inline struct page *mem_map_offset(struct page *base, int offset) { if (unlikely(offset >= MAX_ORDER_NR_PAGES)) - return pfn_to_page(page_to_pfn(base) + offset); + return nth_page(base, offset); return base + offset; } -- cgit v1.2.3 From 3162bbd7e65b9cc57b660796dd3409807bfc9070 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 6 Aug 2014 16:05:19 -0700 Subject: DMA, CMA: separate core CMA management codes from DMA APIs To prepare for future generalization work on the CMA area management code, we need to separate the core CMA management code from the DMA APIs. We will extend these core functions to cover the requirements of PPC KVM's CMA area management functionality in the following patches. This separation means we do not have to touch the DMA APIs while extending the core functions.
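The split follows a common layering pattern: the exported API keeps its signature and its context lookup, while a static, double-underscore-prefixed core helper carries the actual logic. A hypothetical, self-contained sketch of that shape (all names here are invented for illustration; they are not the kernel symbols):

	#include <errno.h>

	struct area { unsigned long base_pfn, free_pages; };

	static struct area default_area = { .base_pfn = 0x1000, .free_pages = 64 };

	/* core helper: pure area bookkeeping, knows nothing about devices */
	static int __area_reserve(struct area *a, unsigned long pages)
	{
		if (!a || pages > a->free_pages)
			return -ENOMEM;
		a->free_pages -= pages;	/* toy accounting only */
		return 0;
	}

	/* exported wrapper: resolves which area to use, applies API policy */
	int api_reserve(struct area *dev_area, unsigned long pages)
	{
		struct area *a = dev_area ? dev_area : &default_area;

		return __area_reserve(a, pages);
	}

In the patch below, dma_contiguous_reserve_area(), dma_alloc_from_contiguous() and dma_release_from_contiguous() become exactly such wrappers around __dma_contiguous_reserve_area(), __dma_alloc_from_contiguous() and __dma_release_from_contiguous(), which take a struct cma directly.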
Signed-off-by: Joonsoo Kim Acked-by: Michal Nazarewicz Reviewed-by: Aneesh Kumar K.V Cc: Alexander Graf Cc: Aneesh Kumar K.V Cc: Gleb Natapov Cc: Minchan Kim Cc: Paolo Bonzini Cc: Zhang Yanfei Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Acked-by: Marek Szyprowski Tested-by: Marek Szyprowski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/dma-contiguous.c | 125 ++++++++++++++++++++++++++---------------- 1 file changed, 77 insertions(+), 48 deletions(-) diff --git a/drivers/base/dma-contiguous.c b/drivers/base/dma-contiguous.c index 6467c919c509..9021762227a7 100644 --- a/drivers/base/dma-contiguous.c +++ b/drivers/base/dma-contiguous.c @@ -213,26 +213,9 @@ static int __init cma_init_reserved_areas(void) } core_initcall(cma_init_reserved_areas); -/** - * dma_contiguous_reserve_area() - reserve custom contiguous area - * @size: Size of the reserved area (in bytes), - * @base: Base address of the reserved area optional, use 0 for any - * @limit: End address of the reserved memory (optional, 0 for any). - * @res_cma: Pointer to store the created cma region. - * @fixed: hint about where to place the reserved area - * - * This function reserves memory from early allocator. It should be - * called by arch specific code once the early allocator (memblock or bootmem) - * has been activated and all other subsystems have already allocated/reserved - * memory. This function allows to create custom reserved areas for specific - * devices. - * - * If @fixed is true, reserve contiguous area at exactly @base. If false, - * reserve in range from @base to @limit. - */ -int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base, - phys_addr_t limit, struct cma **res_cma, - bool fixed) +static int __init __dma_contiguous_reserve_area(phys_addr_t size, + phys_addr_t base, phys_addr_t limit, + struct cma **res_cma, bool fixed) { struct cma *cma = &cma_areas[cma_area_count]; phys_addr_t alignment; @@ -286,15 +269,47 @@ int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base, pr_info("CMA: reserved %ld MiB at %08lx\n", (unsigned long)size / SZ_1M, (unsigned long)base); - - /* Architecture specific contiguous memory fixup. */ - dma_contiguous_early_fixup(base, size); return 0; + err: pr_err("CMA: failed to reserve %ld MiB\n", (unsigned long)size / SZ_1M); return ret; } +/** + * dma_contiguous_reserve_area() - reserve custom contiguous area + * @size: Size of the reserved area (in bytes), + * @base: Base address of the reserved area optional, use 0 for any + * @limit: End address of the reserved memory (optional, 0 for any). + * @res_cma: Pointer to store the created cma region. + * @fixed: hint about where to place the reserved area + * + * This function reserves memory from early allocator. It should be + * called by arch specific code once the early allocator (memblock or bootmem) + * has been activated and all other subsystems have already allocated/reserved + * memory. This function allows to create custom reserved areas for specific + * devices. + * + * If @fixed is true, reserve contiguous area at exactly @base. If false, + * reserve in range from @base to @limit. + */ +int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base, + phys_addr_t limit, struct cma **res_cma, + bool fixed) +{ + int ret; + + ret = __dma_contiguous_reserve_area(size, base, limit, res_cma, fixed); + if (ret) + return ret; + + /* Architecture specific contiguous memory fixup. 
*/ + dma_contiguous_early_fixup(PFN_PHYS((*res_cma)->base_pfn), + (*res_cma)->count << PAGE_SHIFT); + + return 0; +} + static void clear_cma_bitmap(struct cma *cma, unsigned long pfn, int count) { mutex_lock(&cma->lock); @@ -302,31 +317,16 @@ static void clear_cma_bitmap(struct cma *cma, unsigned long pfn, int count) mutex_unlock(&cma->lock); } -/** - * dma_alloc_from_contiguous() - allocate pages from contiguous area - * @dev: Pointer to device for which the allocation is performed. - * @count: Requested number of pages. - * @align: Requested alignment of pages (in PAGE_SIZE order). - * - * This function allocates memory buffer for specified device. It uses - * device specific contiguous memory area if available or the default - * global one. Requires architecture specific dev_get_cma_area() helper - * function. - */ -struct page *dma_alloc_from_contiguous(struct device *dev, int count, +static struct page *__dma_alloc_from_contiguous(struct cma *cma, int count, unsigned int align) { unsigned long mask, pfn, pageno, start = 0; - struct cma *cma = dev_get_cma_area(dev); struct page *page = NULL; int ret; if (!cma || !cma->count) return NULL; - if (align > CONFIG_CMA_ALIGNMENT) - align = CONFIG_CMA_ALIGNMENT; - pr_debug("%s(cma %p, count %d, align %d)\n", __func__, (void *)cma, count, align); @@ -375,19 +375,30 @@ struct page *dma_alloc_from_contiguous(struct device *dev, int count, } /** - * dma_release_from_contiguous() - release allocated pages - * @dev: Pointer to device for which the pages were allocated. - * @pages: Allocated pages. - * @count: Number of allocated pages. + * dma_alloc_from_contiguous() - allocate pages from contiguous area + * @dev: Pointer to device for which the allocation is performed. + * @count: Requested number of pages. + * @align: Requested alignment of pages (in PAGE_SIZE order). * - * This function releases memory allocated by dma_alloc_from_contiguous(). - * It returns false when provided pages do not belong to contiguous area and - * true otherwise. + * This function allocates memory buffer for specified device. It uses + * device specific contiguous memory area if available or the default + * global one. Requires architecture specific dev_get_cma_area() helper + * function. */ -bool dma_release_from_contiguous(struct device *dev, struct page *pages, - int count) +struct page *dma_alloc_from_contiguous(struct device *dev, int count, + unsigned int align) { struct cma *cma = dev_get_cma_area(dev); + + if (align > CONFIG_CMA_ALIGNMENT) + align = CONFIG_CMA_ALIGNMENT; + + return __dma_alloc_from_contiguous(cma, count, align); +} + +static bool __dma_release_from_contiguous(struct cma *cma, struct page *pages, + int count) +{ unsigned long pfn; if (!cma || !pages) @@ -407,3 +418,21 @@ bool dma_release_from_contiguous(struct device *dev, struct page *pages, return true; } + +/** + * dma_release_from_contiguous() - release allocated pages + * @dev: Pointer to device for which the pages were allocated. + * @pages: Allocated pages. + * @count: Number of allocated pages. + * + * This function releases memory allocated by dma_alloc_from_contiguous(). + * It returns false when provided pages do not belong to contiguous area and + * true otherwise. 
+ */ +bool dma_release_from_contiguous(struct device *dev, struct page *pages, + int count) +{ + struct cma *cma = dev_get_cma_area(dev); + + return __dma_release_from_contiguous(cma, pages, count); +} -- cgit v1.2.3 From a15bc0b89e8812d0db297bc771a85812c4fa83c1 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 6 Aug 2014 16:05:21 -0700 Subject: DMA, CMA: support alignment constraint on CMA region PPC KVM's CMA area management needs alignment constraint on CMA region. So support it to prepare generalization of CMA area management functionality. Additionally, add some comments which tell us why alignment constraint is needed on CMA region. Signed-off-by: Joonsoo Kim Acked-by: Michal Nazarewicz Reviewed-by: Aneesh Kumar K.V Cc: Alexander Graf Cc: Aneesh Kumar K.V Cc: Gleb Natapov Acked-by: Marek Szyprowski Tested-by: Marek Szyprowski Cc: Minchan Kim Cc: Paolo Bonzini Cc: Zhang Yanfei Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/dma-contiguous.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/drivers/base/dma-contiguous.c b/drivers/base/dma-contiguous.c index 9021762227a7..5f62c284072c 100644 --- a/drivers/base/dma-contiguous.c +++ b/drivers/base/dma-contiguous.c @@ -32,6 +32,7 @@ #include #include #include +#include struct cma { unsigned long base_pfn; @@ -215,17 +216,16 @@ core_initcall(cma_init_reserved_areas); static int __init __dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base, phys_addr_t limit, + phys_addr_t alignment, struct cma **res_cma, bool fixed) { struct cma *cma = &cma_areas[cma_area_count]; - phys_addr_t alignment; int ret = 0; - pr_debug("%s(size %lx, base %08lx, limit %08lx)\n", __func__, - (unsigned long)size, (unsigned long)base, - (unsigned long)limit); + pr_debug("%s(size %lx, base %08lx, limit %08lx alignment %08lx)\n", + __func__, (unsigned long)size, (unsigned long)base, + (unsigned long)limit, (unsigned long)alignment); - /* Sanity checks */ if (cma_area_count == ARRAY_SIZE(cma_areas)) { pr_err("Not enough slots for CMA reserved regions!\n"); return -ENOSPC; @@ -234,8 +234,17 @@ static int __init __dma_contiguous_reserve_area(phys_addr_t size, if (!size) return -EINVAL; - /* Sanitise input arguments */ - alignment = PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order); + if (alignment && !is_power_of_2(alignment)) + return -EINVAL; + + /* + * Sanitise input arguments. + * Pages both ends in CMA area could be merged into adjacent unmovable + * migratetype page by page allocator's buddy algorithm. In the case, + * you couldn't get a contiguous memory, which is not what we want. 
+ */ + alignment = max(alignment, + (phys_addr_t)PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order)); base = ALIGN(base, alignment); size = ALIGN(size, alignment); limit &= ~(alignment - 1); @@ -299,7 +308,8 @@ int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base, { int ret; - ret = __dma_contiguous_reserve_area(size, base, limit, res_cma, fixed); + ret = __dma_contiguous_reserve_area(size, base, limit, 0, + res_cma, fixed); if (ret) return ret; -- cgit v1.2.3 From e0bdb37d95dd44086159607e571fd70f6b62dc2d Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 6 Aug 2014 16:05:23 -0700 Subject: DMA, CMA: support arbitrary bitmap granularity PPC KVM's CMA area management requires arbitrary bitmap granularity, since they want to reserve very large memory and manage this region with bitmap that one bit for several pages to reduce management overheads. So support arbitrary bitmap granularity for following generalization. [akpm@linux-foundation.org: s/1/1UL/] Signed-off-by: Joonsoo Kim Acked-by: Michal Nazarewicz Acked-by: Zhang Yanfei Acked-by: Minchan Kim Reviewed-by: Aneesh Kumar K.V Cc: Alexander Graf Cc: Aneesh Kumar K.V Cc: Gleb Natapov Acked-by: Marek Szyprowski Tested-by: Marek Szyprowski Cc: Paolo Bonzini Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/dma-contiguous.c | 77 +++++++++++++++++++++++++++++-------------- 1 file changed, 53 insertions(+), 24 deletions(-) diff --git a/drivers/base/dma-contiguous.c b/drivers/base/dma-contiguous.c index 5f62c284072c..ad8a85bf852f 100644 --- a/drivers/base/dma-contiguous.c +++ b/drivers/base/dma-contiguous.c @@ -38,6 +38,7 @@ struct cma { unsigned long base_pfn; unsigned long count; unsigned long *bitmap; + unsigned int order_per_bit; /* Order of pages represented by one bit */ struct mutex lock; }; @@ -157,9 +158,37 @@ void __init dma_contiguous_reserve(phys_addr_t limit) static DEFINE_MUTEX(cma_mutex); +static unsigned long cma_bitmap_aligned_mask(struct cma *cma, int align_order) +{ + return (1UL << (align_order >> cma->order_per_bit)) - 1; +} + +static unsigned long cma_bitmap_maxno(struct cma *cma) +{ + return cma->count >> cma->order_per_bit; +} + +static unsigned long cma_bitmap_pages_to_bits(struct cma *cma, + unsigned long pages) +{ + return ALIGN(pages, 1UL << cma->order_per_bit) >> cma->order_per_bit; +} + +static void cma_clear_bitmap(struct cma *cma, unsigned long pfn, int count) +{ + unsigned long bitmap_no, bitmap_count; + + bitmap_no = (pfn - cma->base_pfn) >> cma->order_per_bit; + bitmap_count = cma_bitmap_pages_to_bits(cma, count); + + mutex_lock(&cma->lock); + bitmap_clear(cma->bitmap, bitmap_no, bitmap_count); + mutex_unlock(&cma->lock); +} + static int __init cma_activate_area(struct cma *cma) { - int bitmap_size = BITS_TO_LONGS(cma->count) * sizeof(long); + int bitmap_size = BITS_TO_LONGS(cma_bitmap_maxno(cma)) * sizeof(long); unsigned long base_pfn = cma->base_pfn, pfn = base_pfn; unsigned i = cma->count >> pageblock_order; struct zone *zone; @@ -215,9 +244,9 @@ static int __init cma_init_reserved_areas(void) core_initcall(cma_init_reserved_areas); static int __init __dma_contiguous_reserve_area(phys_addr_t size, - phys_addr_t base, phys_addr_t limit, - phys_addr_t alignment, - struct cma **res_cma, bool fixed) + phys_addr_t base, phys_addr_t limit, + phys_addr_t alignment, unsigned int order_per_bit, + struct cma **res_cma, bool fixed) { struct cma *cma = &cma_areas[cma_area_count]; int ret = 0; @@ -249,6 +278,10 @@ static int 
__init __dma_contiguous_reserve_area(phys_addr_t size, size = ALIGN(size, alignment); limit &= ~(alignment - 1); + /* size should be aligned with order_per_bit */ + if (!IS_ALIGNED(size >> PAGE_SHIFT, 1 << order_per_bit)) + return -EINVAL; + /* Reserve memory */ if (base && fixed) { if (memblock_is_region_reserved(base, size) || @@ -273,6 +306,7 @@ static int __init __dma_contiguous_reserve_area(phys_addr_t size, */ cma->base_pfn = PFN_DOWN(base); cma->count = size >> PAGE_SHIFT; + cma->order_per_bit = order_per_bit; *res_cma = cma; cma_area_count++; @@ -308,7 +342,7 @@ int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base, { int ret; - ret = __dma_contiguous_reserve_area(size, base, limit, 0, + ret = __dma_contiguous_reserve_area(size, base, limit, 0, 0, res_cma, fixed); if (ret) return ret; @@ -320,17 +354,11 @@ int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base, return 0; } -static void clear_cma_bitmap(struct cma *cma, unsigned long pfn, int count) -{ - mutex_lock(&cma->lock); - bitmap_clear(cma->bitmap, pfn - cma->base_pfn, count); - mutex_unlock(&cma->lock); -} - static struct page *__dma_alloc_from_contiguous(struct cma *cma, int count, unsigned int align) { - unsigned long mask, pfn, pageno, start = 0; + unsigned long mask, pfn, start = 0; + unsigned long bitmap_maxno, bitmap_no, bitmap_count; struct page *page = NULL; int ret; @@ -343,18 +371,19 @@ static struct page *__dma_alloc_from_contiguous(struct cma *cma, int count, if (!count) return NULL; - mask = (1 << align) - 1; - + mask = cma_bitmap_aligned_mask(cma, align); + bitmap_maxno = cma_bitmap_maxno(cma); + bitmap_count = cma_bitmap_pages_to_bits(cma, count); for (;;) { mutex_lock(&cma->lock); - pageno = bitmap_find_next_zero_area(cma->bitmap, cma->count, - start, count, mask); - if (pageno >= cma->count) { + bitmap_no = bitmap_find_next_zero_area(cma->bitmap, + bitmap_maxno, start, bitmap_count, mask); + if (bitmap_no >= bitmap_maxno) { mutex_unlock(&cma->lock); break; } - bitmap_set(cma->bitmap, pageno, count); + bitmap_set(cma->bitmap, bitmap_no, bitmap_count); /* * It's safe to drop the lock here. We've marked this region for * our exclusive use. 
If the migration fails we will take the @@ -362,7 +391,7 @@ static struct page *__dma_alloc_from_contiguous(struct cma *cma, int count, */ mutex_unlock(&cma->lock); - pfn = cma->base_pfn + pageno; + pfn = cma->base_pfn + (bitmap_no << cma->order_per_bit); mutex_lock(&cma_mutex); ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA); mutex_unlock(&cma_mutex); @@ -370,14 +399,14 @@ static struct page *__dma_alloc_from_contiguous(struct cma *cma, int count, page = pfn_to_page(pfn); break; } else if (ret != -EBUSY) { - clear_cma_bitmap(cma, pfn, count); + cma_clear_bitmap(cma, pfn, count); break; } - clear_cma_bitmap(cma, pfn, count); + cma_clear_bitmap(cma, pfn, count); pr_debug("%s(): memory range at %p is busy, retrying\n", __func__, pfn_to_page(pfn)); /* try again with a bit different memory target */ - start = pageno + mask + 1; + start = bitmap_no + mask + 1; } pr_debug("%s(): returned %p\n", __func__, page); @@ -424,7 +453,7 @@ static bool __dma_release_from_contiguous(struct cma *cma, struct page *pages, VM_BUG_ON(pfn + count > cma->base_pfn + cma->count); free_contig_range(pfn, count); - clear_cma_bitmap(cma, pfn, count); + cma_clear_bitmap(cma, pfn, count); return true; } -- cgit v1.2.3 From a254129e8686bff7a340b58f35241b04927e81c0 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 6 Aug 2014 16:05:25 -0700 Subject: CMA: generalize CMA reserved area management functionality Currently there are two users of the CMA functionality: the DMA subsystem and KVM on powerpc. They each have their own code to manage the CMA reserved area, even though the two copies look really similar. My guess is that this came from differing needs in bitmap management: the KVM side wants to maintain its bitmap at a granularity coarser than one page, and eventually uses a bitmap where one bit represents 64 pages. When implementing CMA-related patches, I had to change both places to apply my changes, which was painful. I want to change this situation and reduce the future code management overhead through this patch. This change can also help developers who want to use CMA in their new feature development, since they can use CMA easily without copying & pasting this reserved area management code. In the previous patches, we prepared the features needed to generalize CMA reserved area management, and now it's time to do it. This patch moves the core functions to mm/cma.c and changes the DMA APIs to use these functions. There is no functional change in the DMA APIs.
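For a sense of how a new user consumes the generalized interface, here is a hypothetical caller written against the prototypes added in include/linux/cma.h below (the 16 MiB size, the names and the call sites are invented; error handling is trimmed):

	#include <linux/cma.h>
	#include <linux/sizes.h>

	static struct cma *my_cma;

	/* early boot, once memblock is active: reserve 16 MiB anywhere,
	 * default alignment (0), one bitmap bit per page (order_per_bit 0),
	 * not at a fixed base */
	static int __init my_area_reserve(void)
	{
		return cma_declare_contiguous(SZ_16M, 0, 0, 0, 0, &my_cma, false);
	}

	/* later, at runtime: four pages aligned to 2^2 pages, then released */
	static void my_area_exercise(void)
	{
		struct page *p = cma_alloc(my_cma, 4, 2);

		if (p)
			cma_release(my_cma, p, 4);
	}

This is the point of the generalization: a new CMA user needs only these three calls instead of a private copy of the bitmap and reservation machinery.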
Signed-off-by: Joonsoo Kim Acked-by: Michal Nazarewicz Acked-by: Zhang Yanfei Acked-by: Minchan Kim Reviewed-by: Aneesh Kumar K.V Cc: Alexander Graf Cc: Aneesh Kumar K.V Cc: Gleb Natapov Acked-by: Marek Szyprowski Tested-by: Marek Szyprowski Cc: Paolo Bonzini Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mm/dma-mapping.c | 1 + drivers/base/Kconfig | 10 -- drivers/base/dma-contiguous.c | 280 +--------------------------------- include/linux/cma.h | 27 ++++ include/linux/dma-contiguous.h | 11 +- mm/Kconfig | 11 ++ mm/Makefile | 1 + mm/cma.c | 333 +++++++++++++++++++++++++++++++++++++++++ 8 files changed, 383 insertions(+), 291 deletions(-) create mode 100644 include/linux/cma.h create mode 100644 mm/cma.c diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index 1f88db06b133..7a996aaa061e 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig index 88500fed3c7a..4e7f0ff83ae7 100644 --- a/drivers/base/Kconfig +++ b/drivers/base/Kconfig @@ -289,16 +289,6 @@ config CMA_ALIGNMENT If unsure, leave the default value "8". -config CMA_AREAS - int "Maximum count of the CMA device-private areas" - default 7 - help - CMA allows to create CMA areas for particular devices. This parameter - sets the maximum number of such device private CMA areas in the - system. - - If unsure, leave the default value "7". - endif endmenu diff --git a/drivers/base/dma-contiguous.c b/drivers/base/dma-contiguous.c index ad8a85bf852f..0411c1c57005 100644 --- a/drivers/base/dma-contiguous.c +++ b/drivers/base/dma-contiguous.c @@ -24,25 +24,9 @@ #include #include -#include -#include -#include #include -#include -#include -#include #include -#include - -struct cma { - unsigned long base_pfn; - unsigned long count; - unsigned long *bitmap; - unsigned int order_per_bit; /* Order of pages represented by one bit */ - struct mutex lock; -}; - -struct cma *dma_contiguous_default_area; +#include #ifdef CONFIG_CMA_SIZE_MBYTES #define CMA_SIZE_MBYTES CONFIG_CMA_SIZE_MBYTES @@ -50,6 +34,8 @@ struct cma *dma_contiguous_default_area; #define CMA_SIZE_MBYTES 0 #endif +struct cma *dma_contiguous_default_area; + /* * Default global CMA area size can be defined in kernel's .config. 
* This is useful mainly for distro maintainers to create a kernel @@ -156,169 +142,6 @@ void __init dma_contiguous_reserve(phys_addr_t limit) } } -static DEFINE_MUTEX(cma_mutex); - -static unsigned long cma_bitmap_aligned_mask(struct cma *cma, int align_order) -{ - return (1UL << (align_order >> cma->order_per_bit)) - 1; -} - -static unsigned long cma_bitmap_maxno(struct cma *cma) -{ - return cma->count >> cma->order_per_bit; -} - -static unsigned long cma_bitmap_pages_to_bits(struct cma *cma, - unsigned long pages) -{ - return ALIGN(pages, 1UL << cma->order_per_bit) >> cma->order_per_bit; -} - -static void cma_clear_bitmap(struct cma *cma, unsigned long pfn, int count) -{ - unsigned long bitmap_no, bitmap_count; - - bitmap_no = (pfn - cma->base_pfn) >> cma->order_per_bit; - bitmap_count = cma_bitmap_pages_to_bits(cma, count); - - mutex_lock(&cma->lock); - bitmap_clear(cma->bitmap, bitmap_no, bitmap_count); - mutex_unlock(&cma->lock); -} - -static int __init cma_activate_area(struct cma *cma) -{ - int bitmap_size = BITS_TO_LONGS(cma_bitmap_maxno(cma)) * sizeof(long); - unsigned long base_pfn = cma->base_pfn, pfn = base_pfn; - unsigned i = cma->count >> pageblock_order; - struct zone *zone; - - cma->bitmap = kzalloc(bitmap_size, GFP_KERNEL); - - if (!cma->bitmap) - return -ENOMEM; - - WARN_ON_ONCE(!pfn_valid(pfn)); - zone = page_zone(pfn_to_page(pfn)); - - do { - unsigned j; - base_pfn = pfn; - for (j = pageblock_nr_pages; j; --j, pfn++) { - WARN_ON_ONCE(!pfn_valid(pfn)); - /* - * alloc_contig_range requires the pfn range - * specified to be in the same zone. Make this - * simple by forcing the entire CMA resv range - * to be in the same zone. - */ - if (page_zone(pfn_to_page(pfn)) != zone) - goto err; - } - init_cma_reserved_pageblock(pfn_to_page(base_pfn)); - } while (--i); - - mutex_init(&cma->lock); - return 0; - -err: - kfree(cma->bitmap); - return -EINVAL; -} - -static struct cma cma_areas[MAX_CMA_AREAS]; -static unsigned cma_area_count; - -static int __init cma_init_reserved_areas(void) -{ - int i; - - for (i = 0; i < cma_area_count; i++) { - int ret = cma_activate_area(&cma_areas[i]); - if (ret) - return ret; - } - - return 0; -} -core_initcall(cma_init_reserved_areas); - -static int __init __dma_contiguous_reserve_area(phys_addr_t size, - phys_addr_t base, phys_addr_t limit, - phys_addr_t alignment, unsigned int order_per_bit, - struct cma **res_cma, bool fixed) -{ - struct cma *cma = &cma_areas[cma_area_count]; - int ret = 0; - - pr_debug("%s(size %lx, base %08lx, limit %08lx alignment %08lx)\n", - __func__, (unsigned long)size, (unsigned long)base, - (unsigned long)limit, (unsigned long)alignment); - - if (cma_area_count == ARRAY_SIZE(cma_areas)) { - pr_err("Not enough slots for CMA reserved regions!\n"); - return -ENOSPC; - } - - if (!size) - return -EINVAL; - - if (alignment && !is_power_of_2(alignment)) - return -EINVAL; - - /* - * Sanitise input arguments. - * Pages both ends in CMA area could be merged into adjacent unmovable - * migratetype page by page allocator's buddy algorithm. In the case, - * you couldn't get a contiguous memory, which is not what we want. 
- */ - alignment = max(alignment, - (phys_addr_t)PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order)); - base = ALIGN(base, alignment); - size = ALIGN(size, alignment); - limit &= ~(alignment - 1); - - /* size should be aligned with order_per_bit */ - if (!IS_ALIGNED(size >> PAGE_SHIFT, 1 << order_per_bit)) - return -EINVAL; - - /* Reserve memory */ - if (base && fixed) { - if (memblock_is_region_reserved(base, size) || - memblock_reserve(base, size) < 0) { - ret = -EBUSY; - goto err; - } - } else { - phys_addr_t addr = memblock_alloc_range(size, alignment, base, - limit); - if (!addr) { - ret = -ENOMEM; - goto err; - } else { - base = addr; - } - } - - /* - * Each reserved area must be initialised later, when more kernel - * subsystems (like slab allocator) are available. - */ - cma->base_pfn = PFN_DOWN(base); - cma->count = size >> PAGE_SHIFT; - cma->order_per_bit = order_per_bit; - *res_cma = cma; - cma_area_count++; - - pr_info("CMA: reserved %ld MiB at %08lx\n", (unsigned long)size / SZ_1M, - (unsigned long)base); - return 0; - -err: - pr_err("CMA: failed to reserve %ld MiB\n", (unsigned long)size / SZ_1M); - return ret; -} - /** * dma_contiguous_reserve_area() - reserve custom contiguous area * @size: Size of the reserved area (in bytes), @@ -342,77 +165,17 @@ int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base, { int ret; - ret = __dma_contiguous_reserve_area(size, base, limit, 0, 0, - res_cma, fixed); + ret = cma_declare_contiguous(size, base, limit, 0, 0, res_cma, fixed); if (ret) return ret; /* Architecture specific contiguous memory fixup. */ - dma_contiguous_early_fixup(PFN_PHYS((*res_cma)->base_pfn), - (*res_cma)->count << PAGE_SHIFT); + dma_contiguous_early_fixup(cma_get_base(*res_cma), + cma_get_size(*res_cma)); return 0; } -static struct page *__dma_alloc_from_contiguous(struct cma *cma, int count, - unsigned int align) -{ - unsigned long mask, pfn, start = 0; - unsigned long bitmap_maxno, bitmap_no, bitmap_count; - struct page *page = NULL; - int ret; - - if (!cma || !cma->count) - return NULL; - - pr_debug("%s(cma %p, count %d, align %d)\n", __func__, (void *)cma, - count, align); - - if (!count) - return NULL; - - mask = cma_bitmap_aligned_mask(cma, align); - bitmap_maxno = cma_bitmap_maxno(cma); - bitmap_count = cma_bitmap_pages_to_bits(cma, count); - - for (;;) { - mutex_lock(&cma->lock); - bitmap_no = bitmap_find_next_zero_area(cma->bitmap, - bitmap_maxno, start, bitmap_count, mask); - if (bitmap_no >= bitmap_maxno) { - mutex_unlock(&cma->lock); - break; - } - bitmap_set(cma->bitmap, bitmap_no, bitmap_count); - /* - * It's safe to drop the lock here. We've marked this region for - * our exclusive use. If the migration fails we will take the - * lock again and unmark it. - */ - mutex_unlock(&cma->lock); - - pfn = cma->base_pfn + (bitmap_no << cma->order_per_bit); - mutex_lock(&cma_mutex); - ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA); - mutex_unlock(&cma_mutex); - if (ret == 0) { - page = pfn_to_page(pfn); - break; - } else if (ret != -EBUSY) { - cma_clear_bitmap(cma, pfn, count); - break; - } - cma_clear_bitmap(cma, pfn, count); - pr_debug("%s(): memory range at %p is busy, retrying\n", - __func__, pfn_to_page(pfn)); - /* try again with a bit different memory target */ - start = bitmap_no + mask + 1; - } - - pr_debug("%s(): returned %p\n", __func__, page); - return page; -} - /** * dma_alloc_from_contiguous() - allocate pages from contiguous area * @dev: Pointer to device for which the allocation is performed. 
@@ -427,35 +190,10 @@ static struct page *__dma_alloc_from_contiguous(struct cma *cma, int count, struct page *dma_alloc_from_contiguous(struct device *dev, int count, unsigned int align) { - struct cma *cma = dev_get_cma_area(dev); - if (align > CONFIG_CMA_ALIGNMENT) align = CONFIG_CMA_ALIGNMENT; - return __dma_alloc_from_contiguous(cma, count, align); -} - -static bool __dma_release_from_contiguous(struct cma *cma, struct page *pages, - int count) -{ - unsigned long pfn; - - if (!cma || !pages) - return false; - - pr_debug("%s(page %p)\n", __func__, (void *)pages); - - pfn = page_to_pfn(pages); - - if (pfn < cma->base_pfn || pfn >= cma->base_pfn + cma->count) - return false; - - VM_BUG_ON(pfn + count > cma->base_pfn + cma->count); - - free_contig_range(pfn, count); - cma_clear_bitmap(cma, pfn, count); - - return true; + return cma_alloc(dev_get_cma_area(dev), count, align); } /** @@ -471,7 +209,5 @@ static bool __dma_release_from_contiguous(struct cma *cma, struct page *pages, bool dma_release_from_contiguous(struct device *dev, struct page *pages, int count) { - struct cma *cma = dev_get_cma_area(dev); - - return __dma_release_from_contiguous(cma, pages, count); + return cma_release(dev_get_cma_area(dev), pages, count); } diff --git a/include/linux/cma.h b/include/linux/cma.h new file mode 100644 index 000000000000..f6f7809acb98 --- /dev/null +++ b/include/linux/cma.h @@ -0,0 +1,27 @@ +#ifndef __CMA_H__ +#define __CMA_H__ + +/* + * There is always at least global CMA area and a few optional + * areas configured in kernel .config. + */ +#ifdef CONFIG_CMA_AREAS +#define MAX_CMA_AREAS (1 + CONFIG_CMA_AREAS) + +#else +#define MAX_CMA_AREAS (0) + +#endif + +struct cma; + +extern phys_addr_t cma_get_base(struct cma *cma); +extern unsigned long cma_get_size(struct cma *cma); + +extern int __init cma_declare_contiguous(phys_addr_t size, + phys_addr_t base, phys_addr_t limit, + phys_addr_t alignment, unsigned int order_per_bit, + struct cma **res_cma, bool fixed); +extern struct page *cma_alloc(struct cma *cma, int count, unsigned int align); +extern bool cma_release(struct cma *cma, struct page *pages, int count); +#endif diff --git a/include/linux/dma-contiguous.h b/include/linux/dma-contiguous.h index 772eab5d524a..569bbd039896 100644 --- a/include/linux/dma-contiguous.h +++ b/include/linux/dma-contiguous.h @@ -53,18 +53,13 @@ #ifdef __KERNEL__ +#include + struct cma; struct page; -struct device; #ifdef CONFIG_DMA_CMA -/* - * There is always at least global CMA area and a few optional device - * private areas configured in kernel .config. - */ -#define MAX_CMA_AREAS (1 + CONFIG_CMA_AREAS) - extern struct cma *dma_contiguous_default_area; static inline struct cma *dev_get_cma_area(struct device *dev) @@ -123,8 +118,6 @@ bool dma_release_from_contiguous(struct device *dev, struct page *pages, #else -#define MAX_CMA_AREAS (0) - static inline struct cma *dev_get_cma_area(struct device *dev) { return NULL; diff --git a/mm/Kconfig b/mm/Kconfig index 3e9977a9d657..f4899ec39cf4 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -508,6 +508,17 @@ config CMA_DEBUG processing calls such as dma_alloc_from_contiguous(). This option does not affect warning and error messages. +config CMA_AREAS + int "Maximum count of the CMA areas" + depends on CMA + default 7 + help + CMA allows to create CMA areas for particular purpose, mainly, + used as device private area. This parameter sets the maximum + number of CMA area in the system. + + If unsure, leave the default value "7". 
+ config ZBUD tristate default n diff --git a/mm/Makefile b/mm/Makefile index 4064f3ec145e..8338473c329a 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -62,3 +62,4 @@ obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o obj-$(CONFIG_ZBUD) += zbud.o obj-$(CONFIG_ZSMALLOC) += zsmalloc.o obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o +obj-$(CONFIG_CMA) += cma.o diff --git a/mm/cma.c b/mm/cma.c new file mode 100644 index 000000000000..656004216953 --- /dev/null +++ b/mm/cma.c @@ -0,0 +1,333 @@ +/* + * Contiguous Memory Allocator + * + * Copyright (c) 2010-2011 by Samsung Electronics. + * Copyright IBM Corporation, 2013 + * Copyright LG Electronics Inc., 2014 + * Written by: + * Marek Szyprowski + * Michal Nazarewicz + * Aneesh Kumar K.V + * Joonsoo Kim + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License or (at your optional) any later version of the license. + */ + +#define pr_fmt(fmt) "cma: " fmt + +#ifdef CONFIG_CMA_DEBUG +#ifndef DEBUG +# define DEBUG +#endif +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +struct cma { + unsigned long base_pfn; + unsigned long count; + unsigned long *bitmap; + unsigned int order_per_bit; /* Order of pages represented by one bit */ + struct mutex lock; +}; + +static struct cma cma_areas[MAX_CMA_AREAS]; +static unsigned cma_area_count; +static DEFINE_MUTEX(cma_mutex); + +phys_addr_t cma_get_base(struct cma *cma) +{ + return PFN_PHYS(cma->base_pfn); +} + +unsigned long cma_get_size(struct cma *cma) +{ + return cma->count << PAGE_SHIFT; +} + +static unsigned long cma_bitmap_aligned_mask(struct cma *cma, int align_order) +{ + return (1UL << (align_order >> cma->order_per_bit)) - 1; +} + +static unsigned long cma_bitmap_maxno(struct cma *cma) +{ + return cma->count >> cma->order_per_bit; +} + +static unsigned long cma_bitmap_pages_to_bits(struct cma *cma, + unsigned long pages) +{ + return ALIGN(pages, 1UL << cma->order_per_bit) >> cma->order_per_bit; +} + +static void cma_clear_bitmap(struct cma *cma, unsigned long pfn, int count) +{ + unsigned long bitmap_no, bitmap_count; + + bitmap_no = (pfn - cma->base_pfn) >> cma->order_per_bit; + bitmap_count = cma_bitmap_pages_to_bits(cma, count); + + mutex_lock(&cma->lock); + bitmap_clear(cma->bitmap, bitmap_no, bitmap_count); + mutex_unlock(&cma->lock); +} + +static int __init cma_activate_area(struct cma *cma) +{ + int bitmap_size = BITS_TO_LONGS(cma_bitmap_maxno(cma)) * sizeof(long); + unsigned long base_pfn = cma->base_pfn, pfn = base_pfn; + unsigned i = cma->count >> pageblock_order; + struct zone *zone; + + cma->bitmap = kzalloc(bitmap_size, GFP_KERNEL); + + if (!cma->bitmap) + return -ENOMEM; + + WARN_ON_ONCE(!pfn_valid(pfn)); + zone = page_zone(pfn_to_page(pfn)); + + do { + unsigned j; + + base_pfn = pfn; + for (j = pageblock_nr_pages; j; --j, pfn++) { + WARN_ON_ONCE(!pfn_valid(pfn)); + /* + * alloc_contig_range requires the pfn range + * specified to be in the same zone. Make this + * simple by forcing the entire CMA resv range + * to be in the same zone. 
+ */ + if (page_zone(pfn_to_page(pfn)) != zone) + goto err; + } + init_cma_reserved_pageblock(pfn_to_page(base_pfn)); + } while (--i); + + mutex_init(&cma->lock); + return 0; + +err: + kfree(cma->bitmap); + return -EINVAL; +} + +static int __init cma_init_reserved_areas(void) +{ + int i; + + for (i = 0; i < cma_area_count; i++) { + int ret = cma_activate_area(&cma_areas[i]); + + if (ret) + return ret; + } + + return 0; +} +core_initcall(cma_init_reserved_areas); + +/** + * cma_declare_contiguous() - reserve custom contiguous area + * @size: Size of the reserved area (in bytes), + * @base: Base address of the reserved area optional, use 0 for any + * @limit: End address of the reserved memory (optional, 0 for any). + * @alignment: Alignment for the CMA area, should be power of 2 or zero + * @order_per_bit: Order of pages represented by one bit on bitmap. + * @res_cma: Pointer to store the created cma region. + * @fixed: hint about where to place the reserved area + * + * This function reserves memory from early allocator. It should be + * called by arch specific code once the early allocator (memblock or bootmem) + * has been activated and all other subsystems have already allocated/reserved + * memory. This function allows to create custom reserved areas. + * + * If @fixed is true, reserve contiguous area at exactly @base. If false, + * reserve in range from @base to @limit. + */ +int __init cma_declare_contiguous(phys_addr_t size, + phys_addr_t base, phys_addr_t limit, + phys_addr_t alignment, unsigned int order_per_bit, + struct cma **res_cma, bool fixed) +{ + struct cma *cma = &cma_areas[cma_area_count]; + int ret = 0; + + pr_debug("%s(size %lx, base %08lx, limit %08lx alignment %08lx)\n", + __func__, (unsigned long)size, (unsigned long)base, + (unsigned long)limit, (unsigned long)alignment); + + if (cma_area_count == ARRAY_SIZE(cma_areas)) { + pr_err("Not enough slots for CMA reserved regions!\n"); + return -ENOSPC; + } + + if (!size) + return -EINVAL; + + if (alignment && !is_power_of_2(alignment)) + return -EINVAL; + + /* + * Sanitise input arguments. + * Pages both ends in CMA area could be merged into adjacent unmovable + * migratetype page by page allocator's buddy algorithm. In the case, + * you couldn't get a contiguous memory, which is not what we want. + */ + alignment = max(alignment, + (phys_addr_t)PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order)); + base = ALIGN(base, alignment); + size = ALIGN(size, alignment); + limit &= ~(alignment - 1); + + /* size should be aligned with order_per_bit */ + if (!IS_ALIGNED(size >> PAGE_SHIFT, 1 << order_per_bit)) + return -EINVAL; + + /* Reserve memory */ + if (base && fixed) { + if (memblock_is_region_reserved(base, size) || + memblock_reserve(base, size) < 0) { + ret = -EBUSY; + goto err; + } + } else { + phys_addr_t addr = memblock_alloc_range(size, alignment, base, + limit); + if (!addr) { + ret = -ENOMEM; + goto err; + } else { + base = addr; + } + } + + /* + * Each reserved area must be initialised later, when more kernel + * subsystems (like slab allocator) are available. 
+ */ + cma->base_pfn = PFN_DOWN(base); + cma->count = size >> PAGE_SHIFT; + cma->order_per_bit = order_per_bit; + *res_cma = cma; + cma_area_count++; + + pr_info("CMA: reserved %ld MiB at %08lx\n", (unsigned long)size / SZ_1M, + (unsigned long)base); + return 0; + +err: + pr_err("CMA: failed to reserve %ld MiB\n", (unsigned long)size / SZ_1M); + return ret; +} + +/** + * cma_alloc() - allocate pages from contiguous area + * @cma: Contiguous memory region for which the allocation is performed. + * @count: Requested number of pages. + * @align: Requested alignment of pages (in PAGE_SIZE order). + * + * This function allocates part of contiguous memory on specific + * contiguous memory area. + */ +struct page *cma_alloc(struct cma *cma, int count, unsigned int align) +{ + unsigned long mask, pfn, start = 0; + unsigned long bitmap_maxno, bitmap_no, bitmap_count; + struct page *page = NULL; + int ret; + + if (!cma || !cma->count) + return NULL; + + pr_debug("%s(cma %p, count %d, align %d)\n", __func__, (void *)cma, + count, align); + + if (!count) + return NULL; + + mask = cma_bitmap_aligned_mask(cma, align); + bitmap_maxno = cma_bitmap_maxno(cma); + bitmap_count = cma_bitmap_pages_to_bits(cma, count); + + for (;;) { + mutex_lock(&cma->lock); + bitmap_no = bitmap_find_next_zero_area(cma->bitmap, + bitmap_maxno, start, bitmap_count, mask); + if (bitmap_no >= bitmap_maxno) { + mutex_unlock(&cma->lock); + break; + } + bitmap_set(cma->bitmap, bitmap_no, bitmap_count); + /* + * It's safe to drop the lock here. We've marked this region for + * our exclusive use. If the migration fails we will take the + * lock again and unmark it. + */ + mutex_unlock(&cma->lock); + + pfn = cma->base_pfn + (bitmap_no << cma->order_per_bit); + mutex_lock(&cma_mutex); + ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA); + mutex_unlock(&cma_mutex); + if (ret == 0) { + page = pfn_to_page(pfn); + break; + } else if (ret != -EBUSY) { + cma_clear_bitmap(cma, pfn, count); + break; + } + cma_clear_bitmap(cma, pfn, count); + pr_debug("%s(): memory range at %p is busy, retrying\n", + __func__, pfn_to_page(pfn)); + /* try again with a bit different memory target */ + start = bitmap_no + mask + 1; + } + + pr_debug("%s(): returned %p\n", __func__, page); + return page; +} + +/** + * cma_release() - release allocated pages + * @cma: Contiguous memory region for which the allocation is performed. + * @pages: Allocated pages. + * @count: Number of allocated pages. + * + * This function releases memory allocated by alloc_cma(). + * It returns false when provided pages do not belong to contiguous area and + * true otherwise. + */ +bool cma_release(struct cma *cma, struct page *pages, int count) +{ + unsigned long pfn; + + if (!cma || !pages) + return false; + + pr_debug("%s(page %p)\n", __func__, (void *)pages); + + pfn = page_to_pfn(pages); + + if (pfn < cma->base_pfn || pfn >= cma->base_pfn + cma->count) + return false; + + VM_BUG_ON(pfn + count > cma->base_pfn + cma->count); + + free_contig_range(pfn, count); + cma_clear_bitmap(cma, pfn, count); + + return true; +} -- cgit v1.2.3 From fc95ca7284bc54953165cba76c3228bd2cdb9591 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 6 Aug 2014 16:05:28 -0700 Subject: PPC, KVM, CMA: use general CMA reserved area management framework Now that we have a general CMA reserved area management framework, use it for future maintainability. There is no functional change.
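For illustration, a minimal consumer of the generalized interface might look like the sketch below. The names (demo_cma), sizes, and page counts are invented for this example, and the cma_declare_contiguous() argument order shown is the one at this point in the series (a later patch in this series reorders it):

static struct cma *demo_cma;

/* Early boot, while memblock is still live: reserve 16 MiB anywhere,
 * default alignment (0 lets the core pick the minimum safe one),
 * one bitmap bit per page (order_per_bit = 0), not a fixed address. */
static void __init demo_cma_reserve(void)
{
	if (cma_declare_contiguous(SZ_16M, 0, 0, 0, 0, &demo_cma, false))
		pr_warn("demo_cma: reservation failed\n");
}

/* Later, once the area has been activated at core_initcall time: */
static struct page *demo_cma_get(void)
{
	return cma_alloc(demo_cma, 4, 0);	/* 4 pages, no extra alignment */
}

static void demo_cma_put(struct page *pages)
{
	cma_release(demo_cma, pages, 4);
}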
Signed-off-by: Joonsoo Kim Acked-by: Michal Nazarewicz Acked-by: Paolo Bonzini Tested-by: Aneesh Kumar K.V Cc: Alexander Graf Cc: Aneesh Kumar K.V Cc: Gleb Natapov Acked-by: Marek Szyprowski Tested-by: Marek Szyprowski Cc: Minchan Kim Cc: Zhang Yanfei Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/kvm/Makefile | 1 - arch/powerpc/kvm/book3s_64_mmu_hv.c | 4 +- arch/powerpc/kvm/book3s_hv_builtin.c | 19 ++- arch/powerpc/kvm/book3s_hv_cma.c | 240 ----------------------------------- arch/powerpc/kvm/book3s_hv_cma.h | 27 ---- 5 files changed, 14 insertions(+), 277 deletions(-) delete mode 100644 arch/powerpc/kvm/book3s_hv_cma.c delete mode 100644 arch/powerpc/kvm/book3s_hv_cma.h diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile index ce569b6bf4d8..72905c30082e 100644 --- a/arch/powerpc/kvm/Makefile +++ b/arch/powerpc/kvm/Makefile @@ -90,7 +90,6 @@ kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) += \ book3s_hv_rm_mmu.o \ book3s_hv_ras.o \ book3s_hv_builtin.o \ - book3s_hv_cma.o \ $(kvm-book3s_64-builtin-xics-objs-y) endif diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 68468d695f12..a01744fc3483 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -37,8 +37,6 @@ #include #include -#include "book3s_hv_cma.h" - /* POWER7 has 10-bit LPIDs, PPC970 has 6-bit LPIDs */ #define MAX_LPID_970 63 @@ -64,10 +62,10 @@ long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp) } kvm->arch.hpt_cma_alloc = 0; - VM_BUG_ON(order < KVM_CMA_CHUNK_ORDER); page = kvm_alloc_hpt(1 << (order - PAGE_SHIFT)); if (page) { hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page)); + memset((void *)hpt, 0, (1 << order)); kvm->arch.hpt_cma_alloc = 1; } diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index 7cde8a665205..3960e0bceaf2 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -16,12 +16,14 @@ #include #include #include +#include #include #include #include -#include "book3s_hv_cma.h" +#define KVM_CMA_CHUNK_ORDER 18 + /* * Hash page table alignment on newer cpus(CPU_FTR_ARCH_206) * should be power of 2. @@ -43,6 +45,8 @@ static unsigned long kvm_cma_resv_ratio = 5; unsigned long kvm_rma_pages = (1 << 27) >> PAGE_SHIFT; /* 128MB */ EXPORT_SYMBOL_GPL(kvm_rma_pages); +static struct cma *kvm_cma; + /* Work out RMLS (real mode limit selector) field value for a given RMA size. Assumes POWER7 or PPC970. 
*/ static inline int lpcr_rmls(unsigned long rma_size) @@ -97,7 +101,7 @@ struct kvm_rma_info *kvm_alloc_rma() ri = kmalloc(sizeof(struct kvm_rma_info), GFP_KERNEL); if (!ri) return NULL; - page = kvm_alloc_cma(kvm_rma_pages, kvm_rma_pages); + page = cma_alloc(kvm_cma, kvm_rma_pages, get_order(kvm_rma_pages)); if (!page) goto err_out; atomic_set(&ri->use_count, 1); @@ -112,7 +116,7 @@ EXPORT_SYMBOL_GPL(kvm_alloc_rma); void kvm_release_rma(struct kvm_rma_info *ri) { if (atomic_dec_and_test(&ri->use_count)) { - kvm_release_cma(pfn_to_page(ri->base_pfn), kvm_rma_pages); + cma_release(kvm_cma, pfn_to_page(ri->base_pfn), kvm_rma_pages); kfree(ri); } } @@ -131,16 +135,18 @@ struct page *kvm_alloc_hpt(unsigned long nr_pages) { unsigned long align_pages = HPT_ALIGN_PAGES; + VM_BUG_ON(get_order(nr_pages) < KVM_CMA_CHUNK_ORDER - PAGE_SHIFT); + /* Old CPUs require HPT aligned on a multiple of its size */ if (!cpu_has_feature(CPU_FTR_ARCH_206)) align_pages = nr_pages; - return kvm_alloc_cma(nr_pages, align_pages); + return cma_alloc(kvm_cma, nr_pages, get_order(align_pages)); } EXPORT_SYMBOL_GPL(kvm_alloc_hpt); void kvm_release_hpt(struct page *page, unsigned long nr_pages) { - kvm_release_cma(page, nr_pages); + cma_release(kvm_cma, page, nr_pages); } EXPORT_SYMBOL_GPL(kvm_release_hpt); @@ -179,7 +185,8 @@ void __init kvm_cma_reserve(void) align_size = HPT_ALIGN_PAGES << PAGE_SHIFT; align_size = max(kvm_rma_pages << PAGE_SHIFT, align_size); - kvm_cma_declare_contiguous(selected_size, align_size); + cma_declare_contiguous(selected_size, 0, 0, align_size, + KVM_CMA_CHUNK_ORDER - PAGE_SHIFT, &kvm_cma, false); } } diff --git a/arch/powerpc/kvm/book3s_hv_cma.c b/arch/powerpc/kvm/book3s_hv_cma.c deleted file mode 100644 index d9d3d8553d51..000000000000 --- a/arch/powerpc/kvm/book3s_hv_cma.c +++ /dev/null @@ -1,240 +0,0 @@ -/* - * Contiguous Memory Allocator for ppc KVM hash pagetable based on CMA - * for DMA mapping framework - * - * Copyright IBM Corporation, 2013 - * Author Aneesh Kumar K.V - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation; either version 2 of the - * License or (at your optional) any later version of the license. - * - */ -#define pr_fmt(fmt) "kvm_cma: " fmt - -#ifdef CONFIG_CMA_DEBUG -#ifndef DEBUG -# define DEBUG -#endif -#endif - -#include -#include -#include -#include - -#include "book3s_hv_cma.h" - -struct kvm_cma { - unsigned long base_pfn; - unsigned long count; - unsigned long *bitmap; -}; - -static DEFINE_MUTEX(kvm_cma_mutex); -static struct kvm_cma kvm_cma_area; - -/** - * kvm_cma_declare_contiguous() - reserve area for contiguous memory handling - * for kvm hash pagetable - * @size: Size of the reserved memory. - * @alignment: Alignment for the contiguous memory area - * - * This function reserves memory for kvm cma area. It should be - * called by arch code when early allocator (memblock or bootmem) - * is still activate. - */ -long __init kvm_cma_declare_contiguous(phys_addr_t size, phys_addr_t alignment) -{ - long base_pfn; - phys_addr_t addr; - struct kvm_cma *cma = &kvm_cma_area; - - pr_debug("%s(size %lx)\n", __func__, (unsigned long)size); - - if (!size) - return -EINVAL; - /* - * Sanitise input arguments. - * We should be pageblock aligned for CMA. 
- */ - alignment = max(alignment, (phys_addr_t)(PAGE_SIZE << pageblock_order)); - size = ALIGN(size, alignment); - /* - * Reserve memory - * Use __memblock_alloc_base() since - * memblock_alloc_base() panic()s. - */ - addr = __memblock_alloc_base(size, alignment, 0); - if (!addr) { - base_pfn = -ENOMEM; - goto err; - } else - base_pfn = PFN_DOWN(addr); - - /* - * Each reserved area must be initialised later, when more kernel - * subsystems (like slab allocator) are available. - */ - cma->base_pfn = base_pfn; - cma->count = size >> PAGE_SHIFT; - pr_info("CMA: reserved %ld MiB\n", (unsigned long)size / SZ_1M); - return 0; -err: - pr_err("CMA: failed to reserve %ld MiB\n", (unsigned long)size / SZ_1M); - return base_pfn; -} - -/** - * kvm_alloc_cma() - allocate pages from contiguous area - * @nr_pages: Requested number of pages. - * @align_pages: Requested alignment in number of pages - * - * This function allocates memory buffer for hash pagetable. - */ -struct page *kvm_alloc_cma(unsigned long nr_pages, unsigned long align_pages) -{ - int ret; - struct page *page = NULL; - struct kvm_cma *cma = &kvm_cma_area; - unsigned long chunk_count, nr_chunk; - unsigned long mask, pfn, pageno, start = 0; - - - if (!cma || !cma->count) - return NULL; - - pr_debug("%s(cma %p, count %lu, align pages %lu)\n", __func__, - (void *)cma, nr_pages, align_pages); - - if (!nr_pages) - return NULL; - /* - * align mask with chunk size. The bit tracks pages in chunk size - */ - VM_BUG_ON(!is_power_of_2(align_pages)); - mask = (align_pages >> (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT)) - 1; - BUILD_BUG_ON(PAGE_SHIFT > KVM_CMA_CHUNK_ORDER); - - chunk_count = cma->count >> (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT); - nr_chunk = nr_pages >> (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT); - - mutex_lock(&kvm_cma_mutex); - for (;;) { - pageno = bitmap_find_next_zero_area(cma->bitmap, chunk_count, - start, nr_chunk, mask); - if (pageno >= chunk_count) - break; - - pfn = cma->base_pfn + (pageno << (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT)); - ret = alloc_contig_range(pfn, pfn + nr_pages, MIGRATE_CMA); - if (ret == 0) { - bitmap_set(cma->bitmap, pageno, nr_chunk); - page = pfn_to_page(pfn); - memset(pfn_to_kaddr(pfn), 0, nr_pages << PAGE_SHIFT); - break; - } else if (ret != -EBUSY) { - break; - } - pr_debug("%s(): memory range at %p is busy, retrying\n", - __func__, pfn_to_page(pfn)); - /* try again with a bit different memory target */ - start = pageno + mask + 1; - } - mutex_unlock(&kvm_cma_mutex); - pr_debug("%s(): returned %p\n", __func__, page); - return page; -} - -/** - * kvm_release_cma() - release allocated pages for hash pagetable - * @pages: Allocated pages. - * @nr_pages: Number of allocated pages. - * - * This function releases memory allocated by kvm_alloc_cma(). - * It returns false when provided pages do not belong to contiguous area and - * true otherwise. 
- */ -bool kvm_release_cma(struct page *pages, unsigned long nr_pages) -{ - unsigned long pfn; - unsigned long nr_chunk; - struct kvm_cma *cma = &kvm_cma_area; - - if (!cma || !pages) - return false; - - pr_debug("%s(page %p count %lu)\n", __func__, (void *)pages, nr_pages); - - pfn = page_to_pfn(pages); - - if (pfn < cma->base_pfn || pfn >= cma->base_pfn + cma->count) - return false; - - VM_BUG_ON(pfn + nr_pages > cma->base_pfn + cma->count); - nr_chunk = nr_pages >> (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT); - - mutex_lock(&kvm_cma_mutex); - bitmap_clear(cma->bitmap, - (pfn - cma->base_pfn) >> (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT), - nr_chunk); - free_contig_range(pfn, nr_pages); - mutex_unlock(&kvm_cma_mutex); - - return true; -} - -static int __init kvm_cma_activate_area(unsigned long base_pfn, - unsigned long count) -{ - unsigned long pfn = base_pfn; - unsigned i = count >> pageblock_order; - struct zone *zone; - - WARN_ON_ONCE(!pfn_valid(pfn)); - zone = page_zone(pfn_to_page(pfn)); - do { - unsigned j; - base_pfn = pfn; - for (j = pageblock_nr_pages; j; --j, pfn++) { - WARN_ON_ONCE(!pfn_valid(pfn)); - /* - * alloc_contig_range requires the pfn range - * specified to be in the same zone. Make this - * simple by forcing the entire CMA resv range - * to be in the same zone. - */ - if (page_zone(pfn_to_page(pfn)) != zone) - return -EINVAL; - } - init_cma_reserved_pageblock(pfn_to_page(base_pfn)); - } while (--i); - return 0; -} - -static int __init kvm_cma_init_reserved_areas(void) -{ - int bitmap_size, ret; - unsigned long chunk_count; - struct kvm_cma *cma = &kvm_cma_area; - - pr_debug("%s()\n", __func__); - if (!cma->count) - return 0; - chunk_count = cma->count >> (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT); - bitmap_size = BITS_TO_LONGS(chunk_count) * sizeof(long); - cma->bitmap = kzalloc(bitmap_size, GFP_KERNEL); - if (!cma->bitmap) - return -ENOMEM; - - ret = kvm_cma_activate_area(cma->base_pfn, cma->count); - if (ret) - goto error; - return 0; - -error: - kfree(cma->bitmap); - return ret; -} -core_initcall(kvm_cma_init_reserved_areas); diff --git a/arch/powerpc/kvm/book3s_hv_cma.h b/arch/powerpc/kvm/book3s_hv_cma.h deleted file mode 100644 index 655144f75fa5..000000000000 --- a/arch/powerpc/kvm/book3s_hv_cma.h +++ /dev/null @@ -1,27 +0,0 @@ -/* - * Contiguous Memory Allocator for ppc KVM hash pagetable based on CMA - * for DMA mapping framework - * - * Copyright IBM Corporation, 2013 - * Author Aneesh Kumar K.V - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation; either version 2 of the - * License or (at your optional) any later version of the license. - * - */ - -#ifndef __POWERPC_KVM_CMA_ALLOC_H__ -#define __POWERPC_KVM_CMA_ALLOC_H__ -/* - * Both RMA and Hash page allocation will be multiple of 256K. - */ -#define KVM_CMA_CHUNK_ORDER 18 - -extern struct page *kvm_alloc_cma(unsigned long nr_pages, - unsigned long align_pages); -extern bool kvm_release_cma(struct page *pages, unsigned long nr_pages); -extern long kvm_cma_declare_contiguous(phys_addr_t size, - phys_addr_t alignment) __init; -#endif -- cgit v1.2.3 From b7155e76a702d97553660828347b9f10858b4dd5 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 6 Aug 2014 16:05:30 -0700 Subject: mm, CMA: clean-up CMA allocation error path We can remove one call site for cma_clear_bitmap() if we first call it before checking the error number.
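The resulting tail of the allocation loop, reconstructed from the diff below, now has a single cleanup point (a sketch; the surrounding declarations are omitted):

	ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA);
	mutex_unlock(&cma_mutex);
	if (ret == 0) {
		page = pfn_to_page(pfn);
		break;
	}

	/* one cleanup call covers every failure mode */
	cma_clear_bitmap(cma, pfn, count);
	if (ret != -EBUSY)
		break;

	/* -EBUSY: the range was busy, retry from the next candidate */
	start = bitmap_no + mask + 1;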
Signed-off-by: Joonsoo Kim Acked-by: Minchan Kim Reviewed-by: Michal Nazarewicz Reviewed-by: Zhang Yanfei Reviewed-by: Aneesh Kumar K.V Cc: Alexander Graf Cc: Aneesh Kumar K.V Cc: Gleb Natapov Acked-by: Marek Szyprowski Tested-by: Marek Szyprowski Cc: Paolo Bonzini Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/cma.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mm/cma.c b/mm/cma.c index 656004216953..103a6663b7c7 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -285,11 +285,12 @@ struct page *cma_alloc(struct cma *cma, int count, unsigned int align) if (ret == 0) { page = pfn_to_page(pfn); break; - } else if (ret != -EBUSY) { - cma_clear_bitmap(cma, pfn, count); - break; } + cma_clear_bitmap(cma, pfn, count); + if (ret != -EBUSY) + break; + pr_debug("%s(): memory range at %p is busy, retrying\n", __func__, pfn_to_page(pfn)); /* try again with a bit different memory target */ -- cgit v1.2.3 From c1f733aaaf30a0068a3126d5aa9d5b4c25ba4c0c Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 6 Aug 2014 16:05:32 -0700 Subject: mm, CMA: change cma_declare_contiguous() to obey coding convention Conventionally, the output param goes at the end of the param list, and 'base' comes ahead of 'size', but cma_declare_contiguous() doesn't follow either convention, so change it. Additionally, move the cma_areas reference code down to the position where it is really needed. Signed-off-by: Joonsoo Kim Acked-by: Michal Nazarewicz Reviewed-by: Aneesh Kumar K.V Cc: Alexander Graf Cc: Aneesh Kumar K.V Cc: Gleb Natapov Acked-by: Marek Szyprowski Tested-by: Marek Szyprowski Cc: Minchan Kim Cc: Paolo Bonzini Cc: Zhang Yanfei Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/kvm/book3s_hv_builtin.c | 4 ++-- drivers/base/dma-contiguous.c | 2 +- include/linux/cma.h | 2 +- mm/cma.c | 13 +++++++------ 4 files changed, 11 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index 3960e0bceaf2..6cf498a9bc98 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -185,8 +185,8 @@ void __init kvm_cma_reserve(void) align_size = HPT_ALIGN_PAGES << PAGE_SHIFT; align_size = max(kvm_rma_pages << PAGE_SHIFT, align_size); - cma_declare_contiguous(selected_size, 0, 0, align_size, - KVM_CMA_CHUNK_ORDER - PAGE_SHIFT, &kvm_cma, false); + cma_declare_contiguous(0, selected_size, 0, align_size, + KVM_CMA_CHUNK_ORDER - PAGE_SHIFT, false, &kvm_cma); } } diff --git a/drivers/base/dma-contiguous.c b/drivers/base/dma-contiguous.c index 0411c1c57005..6606abdf880c 100644 --- a/drivers/base/dma-contiguous.c +++ b/drivers/base/dma-contiguous.c @@ -165,7 +165,7 @@ int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base, { int ret; - ret = cma_declare_contiguous(size, base, limit, 0, 0, res_cma, fixed); + ret = cma_declare_contiguous(base, size, limit, 0, 0, fixed, res_cma); if (ret) return ret; diff --git a/include/linux/cma.h b/include/linux/cma.h index f6f7809acb98..371b93042520 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -21,7 +21,7 @@ extern unsigned long cma_get_size(struct cma *cma); extern int __init cma_declare_contiguous(phys_addr_t size, phys_addr_t base, phys_addr_t limit, phys_addr_t alignment, unsigned int order_per_bit, - struct cma **res_cma, bool fixed); + bool fixed, struct cma **res_cma); extern struct page *cma_alloc(struct cma *cma, int count,
unsigned int align); extern bool cma_release(struct cma *cma, struct page *pages, int count); #endif diff --git a/mm/cma.c b/mm/cma.c index 103a6663b7c7..488e50810ed1 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -141,13 +141,13 @@ core_initcall(cma_init_reserved_areas); /** * cma_declare_contiguous() - reserve custom contiguous area - * @size: Size of the reserved area (in bytes), * @base: Base address of the reserved area optional, use 0 for any + * @size: Size of the reserved area (in bytes), * @limit: End address of the reserved memory (optional, 0 for any). * @alignment: Alignment for the CMA area, should be power of 2 or zero * @order_per_bit: Order of pages represented by one bit on bitmap. - * @res_cma: Pointer to store the created cma region. * @fixed: hint about where to place the reserved area + * @res_cma: Pointer to store the created cma region. * * This function reserves memory from early allocator. It should be * called by arch specific code once the early allocator (memblock or bootmem) @@ -157,12 +157,12 @@ core_initcall(cma_init_reserved_areas); * If @fixed is true, reserve contiguous area at exactly @base. If false, * reserve in range from @base to @limit. */ -int __init cma_declare_contiguous(phys_addr_t size, - phys_addr_t base, phys_addr_t limit, +int __init cma_declare_contiguous(phys_addr_t base, + phys_addr_t size, phys_addr_t limit, phys_addr_t alignment, unsigned int order_per_bit, - struct cma **res_cma, bool fixed) + bool fixed, struct cma **res_cma) { - struct cma *cma = &cma_areas[cma_area_count]; + struct cma *cma; int ret = 0; pr_debug("%s(size %lx, base %08lx, limit %08lx alignment %08lx)\n", @@ -218,6 +218,7 @@ int __init cma_declare_contiguous(phys_addr_t size, * Each reserved area must be initialised later, when more kernel * subsystems (like slab allocator) are available. */ + cma = &cma_areas[cma_area_count]; cma->base_pfn = PFN_DOWN(base); cma->count = size >> PAGE_SHIFT; cma->order_per_bit = order_per_bit; -- cgit v1.2.3 From 0de9d2ebe590f9203dac59d4b8e298c473764b92 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 6 Aug 2014 16:05:34 -0700 Subject: mm, CMA: clean-up log message We don't need explicit 'CMA:' prefix, since we already define prefix 'cma:' in pr_fmt. So remove it. Signed-off-by: Joonsoo Kim Acked-by: Michal Nazarewicz Reviewed-by: Zhang Yanfei Cc: "Aneesh Kumar K.V" Cc: Alexander Graf Cc: Aneesh Kumar K.V Cc: Gleb Natapov Acked-by: Marek Szyprowski Tested-by: Marek Szyprowski Cc: Minchan Kim Cc: Paolo Bonzini Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/cma.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/cma.c b/mm/cma.c index 488e50810ed1..c17751c0dcaf 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -225,12 +225,12 @@ int __init cma_declare_contiguous(phys_addr_t base, *res_cma = cma; cma_area_count++; - pr_info("CMA: reserved %ld MiB at %08lx\n", (unsigned long)size / SZ_1M, + pr_info("Reserved %ld MiB at %08lx\n", (unsigned long)size / SZ_1M, (unsigned long)base); return 0; err: - pr_err("CMA: failed to reserve %ld MiB\n", (unsigned long)size / SZ_1M); + pr_err("Failed to reserve %ld MiB\n", (unsigned long)size / SZ_1M); return ret; } -- cgit v1.2.3 From f8303c2582b889351e261ff18c4d8eb197a77db2 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 6 Aug 2014 16:05:36 -0700 Subject: mm, thp: move invariant bug check out of loop in __split_huge_page_map In __split_huge_page_map(), the check for page_mapcount(page) is invariant within the for loop. 
Because the macro is implemented using atomic_read(), the compiler cannot optimize away the redundant check, which leads to an unnecessary read of the page structure. This patch moves the invariant bug check out of the loop so that it is done only once. On a 3.16-rc1 based kernel, a microbenchmark that broke up 1000 transparent huge pages using munmap() ran in 38,245us with the patch and 38,548us without it, a performance gain of about 1%. Signed-off-by: Waiman Long Acked-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Mel Gorman Cc: Rik van Riel Cc: Scott J Norton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 33514d88fef9..2161490526f0 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1775,6 +1775,8 @@ static int __split_huge_page_map(struct page *page, if (pmd) { pgtable = pgtable_trans_huge_withdraw(mm, pmd); pmd_populate(mm, &_pmd, pgtable); + if (pmd_write(*pmd)) + BUG_ON(page_mapcount(page) != 1); haddr = address; for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { @@ -1784,8 +1786,6 @@ static int __split_huge_page_map(struct page *page, entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (!pmd_write(*pmd)) entry = pte_wrprotect(entry); - else - BUG_ON(page_mapcount(page) != 1); if (!pmd_young(*pmd)) entry = pte_mkold(entry); if (pmd_numa(*pmd)) -- cgit v1.2.3 From 3a79d52aa3c63c939f5a1f86e80e634f84e987c4 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 6 Aug 2014 16:05:38 -0700 Subject: mm, thp: replace smp_mb after atomic_add by smp_mb__after_atomic On some architectures, such as x86, atomic_add() is a full memory barrier. In that case, an additional smp_mb() is just a waste of time. This patch replaces that smp_mb() with smp_mb__after_atomic(), which avoids the redundant memory barrier on those architectures. With a 3.16-rc1 based kernel, this patch reduced the execution time of breaking 1000 transparent huge pages from 38,245us to 30,964us, a reduction of 19%, which is quite sizeable. It also reduces the %cpu time of the __split_huge_page_refcount function in the perf profile from 2.18% to 1.15%. Signed-off-by: Waiman Long Acked-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Mel Gorman Cc: Rik van Riel Cc: Scott J Norton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 2161490526f0..4b95ff4120f5 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1681,7 +1681,7 @@ static void __split_huge_page_refcount(struct page *page, &page_tail->_count); /* after clearing PageTail the gup refcount can be released */ - smp_mb(); + smp_mb__after_atomic(); /* * retain hwpoison flag of the poisoned tail page: -- cgit v1.2.3 From 2f3e442ccceb85c51c7dffd3799bfd84de213874 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 6 Aug 2014 16:05:40 -0700 Subject: mm: page-flags: clean up the page flag test, set, clear macros - PAGEFLAG_FALSE only defines TEST, make it define SET and CLEAR as well, analogous to PAGEFLAG. - Define TESTSETFLAG_FALSE, analogous to TESTSETFLAG.
- Define TESTSCFLAG_FALSE, analogous to TESTSCFLAG - Make PG_mlocked accessors the same on both MMU and !MMU setups Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 8304959ad336..e1f5fcd79792 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -171,13 +171,12 @@ static inline int __TestClearPage##uname(struct page *page) \ #define __PAGEFLAG(uname, lname) TESTPAGEFLAG(uname, lname) \ __SETPAGEFLAG(uname, lname) __CLEARPAGEFLAG(uname, lname) -#define PAGEFLAG_FALSE(uname) \ -static inline int Page##uname(const struct page *page) \ - { return 0; } - #define TESTSCFLAG(uname, lname) \ TESTSETFLAG(uname, lname) TESTCLEARFLAG(uname, lname) +#define TESTPAGEFLAG_FALSE(uname) \ +static inline int Page##uname(const struct page *page) { return 0; } + #define SETPAGEFLAG_NOOP(uname) \ static inline void SetPage##uname(struct page *page) { } @@ -187,12 +186,21 @@ static inline void ClearPage##uname(struct page *page) { } #define __CLEARPAGEFLAG_NOOP(uname) \ static inline void __ClearPage##uname(struct page *page) { } +#define TESTSETFLAG_FALSE(uname) \ +static inline int TestSetPage##uname(struct page *page) { return 0; } + #define TESTCLEARFLAG_FALSE(uname) \ static inline int TestClearPage##uname(struct page *page) { return 0; } #define __TESTCLEARFLAG_FALSE(uname) \ static inline int __TestClearPage##uname(struct page *page) { return 0; } +#define PAGEFLAG_FALSE(uname) TESTPAGEFLAG_FALSE(uname) \ + SETPAGEFLAG_NOOP(uname) CLEARPAGEFLAG_NOOP(uname) + +#define TESTSCFLAG_FALSE(uname) \ + TESTSETFLAG_FALSE(uname) TESTCLEARFLAG_FALSE(uname) + struct page; /* forward declaration */ TESTPAGEFLAG(Locked, locked) @@ -248,7 +256,6 @@ PAGEFLAG_FALSE(HighMem) PAGEFLAG(SwapCache, swapcache) #else PAGEFLAG_FALSE(SwapCache) - SETPAGEFLAG_NOOP(SwapCache) CLEARPAGEFLAG_NOOP(SwapCache) #endif PAGEFLAG(Unevictable, unevictable) __CLEARPAGEFLAG(Unevictable, unevictable) @@ -258,8 +265,8 @@ PAGEFLAG(Unevictable, unevictable) __CLEARPAGEFLAG(Unevictable, unevictable) PAGEFLAG(Mlocked, mlocked) __CLEARPAGEFLAG(Mlocked, mlocked) TESTSCFLAG(Mlocked, mlocked) __TESTCLEARFLAG(Mlocked, mlocked) #else -PAGEFLAG_FALSE(Mlocked) SETPAGEFLAG_NOOP(Mlocked) - TESTCLEARFLAG_FALSE(Mlocked) __TESTCLEARFLAG_FALSE(Mlocked) +PAGEFLAG_FALSE(Mlocked) __CLEARPAGEFLAG_NOOP(Mlocked) + TESTSCFLAG_FALSE(Mlocked) __TESTCLEARFLAG_FALSE(Mlocked) #endif #ifdef CONFIG_ARCH_USES_PG_UNCACHED -- cgit v1.2.3 From 6539cc053869bd32a2db731b215b7c73b11f68d3 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 6 Aug 2014 16:05:42 -0700 Subject: mm: memcontrol: fold mem_cgroup_do_charge() These patches rework memcg charge lifetime to integrate more naturally with the lifetime of user pages. This drastically simplifies the code and reduces charging and uncharging overhead. The most expensive part of charging and uncharging is the page_cgroup bit spinlock, which is removed entirely after this series. Here are the top-10 profile entries of a stress test that reads a 128G sparse file on a freshly booted box, without even a dedicated cgroup (i.e. executing in the root memcg). 
Before: 15.36% cat [kernel.kallsyms] [k] copy_user_generic_string 13.31% cat [kernel.kallsyms] [k] memset 11.48% cat [kernel.kallsyms] [k] do_mpage_readpage 4.23% cat [kernel.kallsyms] [k] get_page_from_freelist 2.38% cat [kernel.kallsyms] [k] put_page 2.32% cat [kernel.kallsyms] [k] __mem_cgroup_commit_charge 2.18% kswapd0 [kernel.kallsyms] [k] __mem_cgroup_uncharge_common 1.92% kswapd0 [kernel.kallsyms] [k] shrink_page_list 1.86% cat [kernel.kallsyms] [k] __radix_tree_lookup 1.62% cat [kernel.kallsyms] [k] __pagevec_lru_add_fn After: 15.67% cat [kernel.kallsyms] [k] copy_user_generic_string 13.48% cat [kernel.kallsyms] [k] memset 11.42% cat [kernel.kallsyms] [k] do_mpage_readpage 3.98% cat [kernel.kallsyms] [k] get_page_from_freelist 2.46% cat [kernel.kallsyms] [k] put_page 2.13% kswapd0 [kernel.kallsyms] [k] shrink_page_list 1.88% cat [kernel.kallsyms] [k] __radix_tree_lookup 1.67% cat [kernel.kallsyms] [k] __pagevec_lru_add_fn 1.39% kswapd0 [kernel.kallsyms] [k] free_pcppages_bulk 1.30% cat [kernel.kallsyms] [k] kfree As you can see, the memcg footprint has shrunk quite a bit. text data bss dec hex filename 37970 9892 400 48262 bc86 mm/memcontrol.o.old 35239 9892 400 45531 b1db mm/memcontrol.o This patch (of 13): This function was split out because mem_cgroup_try_charge() got too big. But having essentially one sequence of operations arbitrarily split in half is not good for reworking the code. Fold it back in. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Hugh Dickins Cc: Tejun Heo Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 166 ++++++++++++++++++++++---------------------------------- 1 file changed, 64 insertions(+), 102 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f009a14918d2..fe3ad310656d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2551,80 +2551,6 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb, return NOTIFY_OK; } - -/* See mem_cgroup_try_charge() for details */ -enum { - CHARGE_OK, /* success */ - CHARGE_RETRY, /* need to retry but retry is not bad */ - CHARGE_NOMEM, /* we can't do more. return -ENOMEM */ - CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and no enough res. */ -}; - -static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, - unsigned int nr_pages, unsigned int min_pages, - bool invoke_oom) -{ - unsigned long csize = nr_pages * PAGE_SIZE; - struct mem_cgroup *mem_over_limit; - struct res_counter *fail_res; - unsigned long flags = 0; - int ret; - - ret = res_counter_charge(&memcg->res, csize, &fail_res); - - if (likely(!ret)) { - if (!do_swap_account) - return CHARGE_OK; - ret = res_counter_charge(&memcg->memsw, csize, &fail_res); - if (likely(!ret)) - return CHARGE_OK; - - res_counter_uncharge(&memcg->res, csize); - mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); - flags |= MEM_CGROUP_RECLAIM_NOSWAP; - } else - mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); - /* - * Never reclaim on behalf of optional batching, retry with a - * single page instead. - */ - if (nr_pages > min_pages) - return CHARGE_RETRY; - - if (!(gfp_mask & __GFP_WAIT)) - return CHARGE_WOULDBLOCK; - - if (gfp_mask & __GFP_NORETRY) - return CHARGE_NOMEM; - - ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); - if (mem_cgroup_margin(mem_over_limit) >= nr_pages) - return CHARGE_RETRY; - /* - * Even though the limit is exceeded at this point, reclaim - * may have been able to free some pages. 
Retry the charge - * before killing the task. - * - * Only for regular pages, though: huge pages are rather - * unlikely to succeed so close to the limit, and we fall back - * to regular pages anyway in case of failure. - */ - if (nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER) && ret) - return CHARGE_RETRY; - - /* - * At task move, charge accounts can be doubly counted. So, it's - * better to wait until the end of task_move if something is going on. - */ - if (mem_cgroup_wait_acct_move(mem_over_limit)) - return CHARGE_RETRY; - - if (invoke_oom) - mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(csize)); - - return CHARGE_NOMEM; -} - /** * mem_cgroup_try_charge - try charging a memcg * @memcg: memcg to charge @@ -2641,7 +2567,11 @@ static int mem_cgroup_try_charge(struct mem_cgroup *memcg, { unsigned int batch = max(CHARGE_BATCH, nr_pages); int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; - int ret; + struct mem_cgroup *mem_over_limit; + struct res_counter *fail_res; + unsigned long nr_reclaimed; + unsigned long flags = 0; + unsigned long long size; if (mem_cgroup_is_root(memcg)) goto done; @@ -2661,44 +2591,76 @@ static int mem_cgroup_try_charge(struct mem_cgroup *memcg, if (gfp_mask & __GFP_NOFAIL) oom = false; -again: +retry: if (consume_stock(memcg, nr_pages)) goto done; - do { - bool invoke_oom = oom && !nr_oom_retries; + size = batch * PAGE_SIZE; + if (!res_counter_charge(&memcg->res, size, &fail_res)) { + if (!do_swap_account) + goto done_restock; + if (!res_counter_charge(&memcg->memsw, size, &fail_res)) + goto done_restock; + res_counter_uncharge(&memcg->res, size); + mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); + flags |= MEM_CGROUP_RECLAIM_NOSWAP; + } else + mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); - /* If killed, bypass charge */ - if (fatal_signal_pending(current)) - goto bypass; + if (batch > nr_pages) { + batch = nr_pages; + goto retry; + } - ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, - nr_pages, invoke_oom); - switch (ret) { - case CHARGE_OK: - break; - case CHARGE_RETRY: /* not in OOM situation but retry */ - batch = nr_pages; - goto again; - case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */ - goto nomem; - case CHARGE_NOMEM: /* OOM routine works */ - if (!oom || invoke_oom) - goto nomem; - nr_oom_retries--; - break; - } - } while (ret != CHARGE_OK); + if (!(gfp_mask & __GFP_WAIT)) + goto nomem; - if (batch > nr_pages) - refill_stock(memcg, batch - nr_pages); -done: - return 0; + if (gfp_mask & __GFP_NORETRY) + goto nomem; + + nr_reclaimed = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); + + if (mem_cgroup_margin(mem_over_limit) >= batch) + goto retry; + /* + * Even though the limit is exceeded at this point, reclaim + * may have been able to free some pages. Retry the charge + * before killing the task. + * + * Only for regular pages, though: huge pages are rather + * unlikely to succeed so close to the limit, and we fall back + * to regular pages anyway in case of failure. + */ + if (nr_reclaimed && batch <= (1 << PAGE_ALLOC_COSTLY_ORDER)) + goto retry; + /* + * At task move, charge accounts can be doubly counted. So, it's + * better to wait until the end of task_move if something is going on. 
+ */ + if (mem_cgroup_wait_acct_move(mem_over_limit)) + goto retry; + + if (fatal_signal_pending(current)) + goto bypass; + + if (!oom) + goto nomem; + + if (nr_oom_retries--) + goto retry; + + mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(batch)); nomem: if (!(gfp_mask & __GFP_NOFAIL)) return -ENOMEM; bypass: return -EINTR; + +done_restock: + if (batch > nr_pages) + refill_stock(memcg, batch - nr_pages); +done: + return 0; } /** -- cgit v1.2.3 From 06b078fc065fe1fe7097675c8ee416aa2ef94fb3 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 6 Aug 2014 16:05:44 -0700 Subject: mm: memcontrol: rearrange charging fast path The charging path currently starts out with OOM condition checks when OOM is the rarest possible case. Rearrange this code to run OOM/task dying checks only after trying the percpu charge and the res_counter charge and bail out before entering reclaim. Attempting a charge does not hurt an (oom-)killed task as much as every charge attempt having to check OOM conditions. Also, only check __GFP_NOFAIL when the charge would actually fail. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Hugh Dickins Cc: Tejun Heo Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index fe3ad310656d..f7b6bec9f538 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2575,22 +2575,6 @@ static int mem_cgroup_try_charge(struct mem_cgroup *memcg, if (mem_cgroup_is_root(memcg)) goto done; - /* - * Unlike in global OOM situations, memcg is not in a physical - * memory shortage. Allow dying and OOM-killed tasks to - * bypass the last charges so that they can exit quickly and - * free their memory. - */ - if (unlikely(test_thread_flag(TIF_MEMDIE) || - fatal_signal_pending(current) || - current->flags & PF_EXITING)) - goto bypass; - - if (unlikely(task_in_memcg_oom(current))) - goto nomem; - - if (gfp_mask & __GFP_NOFAIL) - oom = false; retry: if (consume_stock(memcg, nr_pages)) goto done; @@ -2612,6 +2596,20 @@ retry: goto retry; } + /* + * Unlike in global OOM situations, memcg is not in a physical + * memory shortage. Allow dying and OOM-killed tasks to + * bypass the last charges so that they can exit quickly and + * free their memory. + */ + if (unlikely(test_thread_flag(TIF_MEMDIE) || + fatal_signal_pending(current) || + current->flags & PF_EXITING)) + goto bypass; + + if (unlikely(task_in_memcg_oom(current))) + goto nomem; + if (!(gfp_mask & __GFP_WAIT)) goto nomem; @@ -2640,6 +2638,9 @@ retry: if (mem_cgroup_wait_acct_move(mem_over_limit)) goto retry; + if (gfp_mask & __GFP_NOFAIL) + goto bypass; + if (fatal_signal_pending(current)) goto bypass; -- cgit v1.2.3 From 28c34c291e746aab1c2bfd6d6609b2e47fa0978b Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 6 Aug 2014 16:05:47 -0700 Subject: mm: memcontrol: reclaim at least once for __GFP_NORETRY Currently, __GFP_NORETRY tries charging once and gives up before even trying to reclaim. Bring the behavior on par with the page allocator and reclaim at least once before giving up. 
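Pieced together from the diff below, the reordered slow path looks like this (a sketch of the relevant lines, not the full function):

	if (!(gfp_mask & __GFP_WAIT))
		goto nomem;		/* cannot reclaim in atomic context */

	nr_reclaimed = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);

	if (mem_cgroup_margin(mem_over_limit) >= batch)
		goto retry;		/* reclaim freed enough room */

	if (gfp_mask & __GFP_NORETRY)	/* now checked after one reclaim pass */
		goto nomem;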
Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Hugh Dickins Cc: Tejun Heo Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f7b6bec9f538..a73f3947f5d9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2613,13 +2613,13 @@ retry: if (!(gfp_mask & __GFP_WAIT)) goto nomem; - if (gfp_mask & __GFP_NORETRY) - goto nomem; - nr_reclaimed = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); if (mem_cgroup_margin(mem_over_limit) >= batch) goto retry; + + if (gfp_mask & __GFP_NORETRY) + goto nomem; /* * Even though the limit is exceeded at this point, reclaim * may have been able to free some pages. Retry the charge -- cgit v1.2.3 From d51d885bbb137cc8e1704e76be1846c5e0d5e8b4 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 6 Aug 2014 16:05:49 -0700 Subject: mm: huge_memory: use GFP_TRANSHUGE when charging huge pages Transparent huge page charges prefer falling back to regular pages rather than spending a lot of time in direct reclaim. Desired reclaim behavior is usually declared in the gfp mask, but THP charges use GFP_KERNEL and then rely on the fact that OOM is disabled for THP charges, and that OOM-disabled charges don't retry reclaim. Needless to say, this is anything but obvious and quite error prone. Convert THP charges to use GFP_TRANSHUGE instead, which implies __GFP_NORETRY, to indicate the low-latency requirement. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Hugh Dickins Cc: Tejun Heo Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 4b95ff4120f5..24e354c2b59e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -827,7 +827,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; } - if (unlikely(mem_cgroup_charge_anon(page, mm, GFP_KERNEL))) { + if (unlikely(mem_cgroup_charge_anon(page, mm, GFP_TRANSHUGE))) { put_page(page); count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; @@ -1132,7 +1132,7 @@ alloc: goto out; } - if (unlikely(mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL))) { + if (unlikely(mem_cgroup_charge_anon(new_page, mm, GFP_TRANSHUGE))) { put_page(new_page); if (page) { split_huge_page(page); @@ -2399,7 +2399,7 @@ static void collapse_huge_page(struct mm_struct *mm, if (!new_page) return; - if (unlikely(mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL))) + if (unlikely(mem_cgroup_charge_anon(new_page, mm, GFP_TRANSHUGE))) return; /* -- cgit v1.2.3 From 9b1306192d335759a6cf2f3b404c49e811e5f953 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 6 Aug 2014 16:05:51 -0700 Subject: mm: memcontrol: retry reclaim for oom-disabled and __GFP_NOFAIL charges There is no reason why oom-disabled and __GFP_NOFAIL charges should try to reclaim only once when every other charge tries several times before giving up. Make them all retry the same number of times. 
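The resulting fallback ladder in mem_cgroup_try_charge(), assembled from the diff below (a sketch):

	if (mem_cgroup_wait_acct_move(mem_over_limit))
		goto retry;

	if (nr_retries--)		/* every charge now retries equally */
		goto retry;

	if (gfp_mask & __GFP_NOFAIL)
		goto bypass;

	if (fatal_signal_pending(current))
		goto bypass;

	if (!oom)
		goto nomem;

	mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(batch));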
Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Hugh Dickins Cc: Tejun Heo Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a73f3947f5d9..3069d6420b0e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2566,7 +2566,7 @@ static int mem_cgroup_try_charge(struct mem_cgroup *memcg, bool oom) { unsigned int batch = max(CHARGE_BATCH, nr_pages); - int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; + int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; struct mem_cgroup *mem_over_limit; struct res_counter *fail_res; unsigned long nr_reclaimed; @@ -2638,6 +2638,9 @@ retry: if (mem_cgroup_wait_acct_move(mem_over_limit)) goto retry; + if (nr_retries--) + goto retry; + if (gfp_mask & __GFP_NOFAIL) goto bypass; @@ -2647,9 +2650,6 @@ retry: if (!oom) goto nomem; - if (nr_oom_retries--) - goto retry; - mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(batch)); nomem: if (!(gfp_mask & __GFP_NOFAIL)) -- cgit v1.2.3 From 0029e19ebf84dcd70b226820daa7747b28d5956d Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 6 Aug 2014 16:05:53 -0700 Subject: mm: memcontrol: remove explicit OOM parameter in charge path For the page allocator, __GFP_NORETRY implies that no OOM should be triggered, whereas memcg has an explicit parameter to disable OOM. The only callsites that want OOM disabled are THP charges and charge moving. THP already uses __GFP_NORETRY and charge moving can use it as well - one full reclaim cycle should be plenty. Switch it over, then remove the OOM parameter. Signed-off-by: Johannes Weiner Signed-off-by: Michal Hocko Cc: Hugh Dickins Cc: Tejun Heo Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 32 ++++++++++---------------------- 1 file changed, 10 insertions(+), 22 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3069d6420b0e..8aaca8267dfe 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2555,15 +2555,13 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb, * mem_cgroup_try_charge - try charging a memcg * @memcg: memcg to charge * @nr_pages: number of pages to charge - * @oom: trigger OOM if reclaim fails * * Returns 0 if @memcg was charged successfully, -EINTR if the charge * was bypassed to root_mem_cgroup, and -ENOMEM if the charge failed. 
*/ static int mem_cgroup_try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, - unsigned int nr_pages, - bool oom) + unsigned int nr_pages) { unsigned int batch = max(CHARGE_BATCH, nr_pages); int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; @@ -2647,9 +2645,6 @@ retry: if (fatal_signal_pending(current)) goto bypass; - if (!oom) - goto nomem; - mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(batch)); nomem: if (!(gfp_mask & __GFP_NOFAIL)) @@ -2675,15 +2670,14 @@ done: */ static struct mem_cgroup *mem_cgroup_try_charge_mm(struct mm_struct *mm, gfp_t gfp_mask, - unsigned int nr_pages, - bool oom) + unsigned int nr_pages) { struct mem_cgroup *memcg; int ret; memcg = get_mem_cgroup_from_mm(mm); - ret = mem_cgroup_try_charge(memcg, gfp_mask, nr_pages, oom); + ret = mem_cgroup_try_charge(memcg, gfp_mask, nr_pages); css_put(&memcg->css); if (ret == -EINTR) memcg = root_mem_cgroup; @@ -2900,8 +2894,7 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) if (ret) return ret; - ret = mem_cgroup_try_charge(memcg, gfp, size >> PAGE_SHIFT, - oom_gfp_allowed(gfp)); + ret = mem_cgroup_try_charge(memcg, gfp, size >> PAGE_SHIFT); if (ret == -EINTR) { /* * mem_cgroup_try_charge() chosed to bypass to root due to @@ -3650,7 +3643,6 @@ int mem_cgroup_charge_anon(struct page *page, { unsigned int nr_pages = 1; struct mem_cgroup *memcg; - bool oom = true; if (mem_cgroup_disabled()) return 0; @@ -3662,14 +3654,9 @@ int mem_cgroup_charge_anon(struct page *page, if (PageTransHuge(page)) { nr_pages <<= compound_order(page); VM_BUG_ON_PAGE(!PageTransHuge(page), page); - /* - * Never OOM-kill a process for a huge page. The - * fault handler will fall back to regular pages. - */ - oom = false; } - memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, nr_pages, oom); + memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, nr_pages); if (!memcg) return -ENOMEM; __mem_cgroup_commit_charge(memcg, page, nr_pages, @@ -3706,7 +3693,7 @@ static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm, memcg = try_get_mem_cgroup_from_page(page); if (!memcg) memcg = get_mem_cgroup_from_mm(mm); - ret = mem_cgroup_try_charge(memcg, mask, 1, true); + ret = mem_cgroup_try_charge(memcg, mask, 1); css_put(&memcg->css); if (ret == -EINTR) memcg = root_mem_cgroup; @@ -3733,7 +3720,7 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page, if (!PageSwapCache(page)) { struct mem_cgroup *memcg; - memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true); + memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1); if (!memcg) return -ENOMEM; *memcgp = memcg; @@ -3802,7 +3789,7 @@ int mem_cgroup_charge_file(struct page *page, struct mm_struct *mm, return 0; } - memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true); + memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1); if (!memcg) return -ENOMEM; __mem_cgroup_commit_charge(memcg, page, 1, type, false); @@ -6440,7 +6427,8 @@ one_by_one: batch_count = PRECHARGE_COUNT_AT_ONCE; cond_resched(); } - ret = mem_cgroup_try_charge(memcg, GFP_KERNEL, 1, false); + ret = mem_cgroup_try_charge(memcg, + GFP_KERNEL & ~__GFP_NORETRY, 1); if (ret) /* mem_cgroup_clear_mc() will do uncharge later */ return ret; -- cgit v1.2.3 From 9476db974d9e18885123fcebc09f4596bb922e5f Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 6 Aug 2014 16:05:55 -0700 Subject: mm: memcontrol: simplify move precharge function The move precharge function does some baroque things: it tries raw res_counter charging of the entire amount first, and then falls back to a loop of one-by-one charges, with checks for 
pending signals and cond_resched() batching. Just use mem_cgroup_try_charge() without __GFP_WAIT for the first bulk charge attempt. In the one-by-one loop, remove the signal check (this is already checked in try_charge), and simply call cond_resched() after every charge - it's not that expensive. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Hugh Dickins Cc: Tejun Heo Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 48 +++++++++++++++--------------------------------- 1 file changed, 15 insertions(+), 33 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8aaca8267dfe..8a4159efa3c0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6385,56 +6385,38 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) #ifdef CONFIG_MMU /* Handlers for move charge at task migration. */ -#define PRECHARGE_COUNT_AT_ONCE 256 static int mem_cgroup_do_precharge(unsigned long count) { int ret = 0; - int batch_count = PRECHARGE_COUNT_AT_ONCE; - struct mem_cgroup *memcg = mc.to; - if (mem_cgroup_is_root(memcg)) { + if (mem_cgroup_is_root(mc.to)) { mc.precharge += count; /* we don't need css_get for root */ return ret; } - /* try to charge at once */ - if (count > 1) { - struct res_counter *dummy; - /* - * "memcg" cannot be under rmdir() because we've already checked - * by cgroup_lock_live_cgroup() that it is not removed and we - * are still under the same cgroup_mutex. So we can postpone - * css_get(). - */ - if (res_counter_charge(&memcg->res, PAGE_SIZE * count, &dummy)) - goto one_by_one; - if (do_swap_account && res_counter_charge(&memcg->memsw, - PAGE_SIZE * count, &dummy)) { - res_counter_uncharge(&memcg->res, PAGE_SIZE * count); - goto one_by_one; - } + + /* Try a single bulk charge without reclaim first */ + ret = mem_cgroup_try_charge(mc.to, GFP_KERNEL & ~__GFP_WAIT, count); + if (!ret) { mc.precharge += count; return ret; } -one_by_one: - /* fall back to one by one charge */ + + /* Try charges one by one with reclaim */ while (count--) { - if (signal_pending(current)) { - ret = -EINTR; - break; - } - if (!batch_count--) { - batch_count = PRECHARGE_COUNT_AT_ONCE; - cond_resched(); - } - ret = mem_cgroup_try_charge(memcg, + ret = mem_cgroup_try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1); + /* + * In case of failure, any residual charges against + * mc.to will be dropped by mem_cgroup_clear_mc() + * later on. + */ if (ret) - /* mem_cgroup_clear_mc() will do uncharge later */ return ret; mc.precharge++; + cond_resched(); } - return ret; + return 0; } /** -- cgit v1.2.3 From 692e7c45d95ad1064b6911800e2cfec7fc0236db Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 6 Aug 2014 16:05:57 -0700 Subject: mm: memcontrol: catch root bypass in move precharge When mem_cgroup_try_charge() returns -EINTR, it bypassed the charge to the root memcg. But move precharging does not catch this and treats this case as if no charge had happened, thus leaking a charge against root. Because of an old optimization, the root memcg's res_counter is not actually charged right now, but it's still an imbalance and subsequent patches will charge the root memcg again. Catch those bypasses to the root memcg and properly cancel them before giving up the move. 
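In code, per the diff below, the bulk-charge path in mem_cgroup_do_precharge() becomes (a sketch):

	/* Try a single bulk charge without reclaim first */
	ret = mem_cgroup_try_charge(mc.to, GFP_KERNEL & ~__GFP_WAIT, count);
	if (!ret) {
		mc.precharge += count;
		return ret;
	}
	if (ret == -EINTR) {
		/* the charge was bypassed to root: cancel it so it isn't leaked */
		__mem_cgroup_cancel_charge(root_mem_cgroup, count);
		return ret;
	}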
Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Hugh Dickins Cc: Tejun Heo Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8a4159efa3c0..e0ac636315f8 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6401,6 +6401,10 @@ static int mem_cgroup_do_precharge(unsigned long count) mc.precharge += count; return ret; } + if (ret == -EINTR) { + __mem_cgroup_cancel_charge(root_mem_cgroup, count); + return ret; + } /* Try charges one by one with reclaim */ while (count--) { @@ -6409,8 +6413,11 @@ static int mem_cgroup_do_precharge(unsigned long count) /* * In case of failure, any residual charges against * mc.to will be dropped by mem_cgroup_clear_mc() - * later on. + * later on. However, cancel any charges that are + * bypassed to root right away or they'll be lost. */ + if (ret == -EINTR) + __mem_cgroup_cancel_charge(root_mem_cgroup, 1); if (ret) return ret; mc.precharge++; -- cgit v1.2.3 From 05b8430123359886ef6a4146fba384e30d771b3f Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 6 Aug 2014 16:05:59 -0700 Subject: mm: memcontrol: use root_mem_cgroup res_counter Due to an old optimization to keep expensive res_counter changes at a minimum, the root_mem_cgroup res_counter is never charged; there is no limit at that level anyway, and any statistics can be generated on demand by summing up the counters of all other cgroups. However, with per-cpu charge caches, res_counter operations do not even show up in profiles anymore, so this optimization is no longer necessary. Remove it to simplify the code. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Hugh Dickins Cc: Tejun Heo Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 152 ++++++++++++++++---------------------------------------- 1 file changed, 44 insertions(+), 108 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e0ac636315f8..07908ea954b6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2570,9 +2570,8 @@ static int mem_cgroup_try_charge(struct mem_cgroup *memcg, unsigned long nr_reclaimed; unsigned long flags = 0; unsigned long long size; + int ret = 0; - if (mem_cgroup_is_root(memcg)) - goto done; retry: if (consume_stock(memcg, nr_pages)) goto done; @@ -2650,13 +2649,15 @@ nomem: if (!(gfp_mask & __GFP_NOFAIL)) return -ENOMEM; bypass: - return -EINTR; + memcg = root_mem_cgroup; + ret = -EINTR; + goto retry; done_restock: if (batch > nr_pages) refill_stock(memcg, batch - nr_pages); done: - return 0; + return ret; } /** @@ -2695,13 +2696,11 @@ static struct mem_cgroup *mem_cgroup_try_charge_mm(struct mm_struct *mm, static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) { - if (!mem_cgroup_is_root(memcg)) { - unsigned long bytes = nr_pages * PAGE_SIZE; + unsigned long bytes = nr_pages * PAGE_SIZE; - res_counter_uncharge(&memcg->res, bytes); - if (do_swap_account) - res_counter_uncharge(&memcg->memsw, bytes); - } + res_counter_uncharge(&memcg->res, bytes); + if (do_swap_account) + res_counter_uncharge(&memcg->memsw, bytes); } /* @@ -2713,9 +2712,6 @@ static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg, { unsigned long bytes = nr_pages * PAGE_SIZE; - if (mem_cgroup_is_root(memcg)) - return; - res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes); if (do_swap_account) res_counter_uncharge_until(&memcg->memsw, @@ -3943,7 
+3939,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype, * replacement page, so leave it alone when phasing out the * page that is unused after the migration. */ - if (!end_migration && !mem_cgroup_is_root(memcg)) + if (!end_migration) mem_cgroup_do_uncharge(memcg, nr_pages, ctype); return memcg; @@ -4076,8 +4072,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent) * We uncharge this because swap is freed. This memcg can * be obsolete one. We avoid calling css_tryget_online(). */ - if (!mem_cgroup_is_root(memcg)) - res_counter_uncharge(&memcg->memsw, PAGE_SIZE); + res_counter_uncharge(&memcg->memsw, PAGE_SIZE); mem_cgroup_swap_statistics(memcg, false); css_put(&memcg->css); } @@ -4767,78 +4762,24 @@ out: return retval; } - -static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg, - enum mem_cgroup_stat_index idx) -{ - struct mem_cgroup *iter; - long val = 0; - - /* Per-cpu values can be negative, use a signed accumulator */ - for_each_mem_cgroup_tree(iter, memcg) - val += mem_cgroup_read_stat(iter, idx); - - if (val < 0) /* race ? */ - val = 0; - return val; -} - -static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) -{ - u64 val; - - if (!mem_cgroup_is_root(memcg)) { - if (!swap) - return res_counter_read_u64(&memcg->res, RES_USAGE); - else - return res_counter_read_u64(&memcg->memsw, RES_USAGE); - } - - /* - * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS - * as well as in MEM_CGROUP_STAT_RSS_HUGE. - */ - val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE); - val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS); - - if (swap) - val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP); - - return val << PAGE_SHIFT; -} - static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, - struct cftype *cft) + struct cftype *cft) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - u64 val; - int name; - enum res_type type; - - type = MEMFILE_TYPE(cft->private); - name = MEMFILE_ATTR(cft->private); + enum res_type type = MEMFILE_TYPE(cft->private); + int name = MEMFILE_ATTR(cft->private); switch (type) { case _MEM: - if (name == RES_USAGE) - val = mem_cgroup_usage(memcg, false); - else - val = res_counter_read_u64(&memcg->res, name); - break; + return res_counter_read_u64(&memcg->res, name); case _MEMSWAP: - if (name == RES_USAGE) - val = mem_cgroup_usage(memcg, true); - else - val = res_counter_read_u64(&memcg->memsw, name); - break; + return res_counter_read_u64(&memcg->memsw, name); case _KMEM: - val = res_counter_read_u64(&memcg->kmem, name); + return res_counter_read_u64(&memcg->kmem, name); break; default: BUG(); } - - return val; } #ifdef CONFIG_MEMCG_KMEM @@ -5300,7 +5241,10 @@ static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) if (!t) goto unlock; - usage = mem_cgroup_usage(memcg, swap); + if (!swap) + usage = res_counter_read_u64(&memcg->res, RES_USAGE); + else + usage = res_counter_read_u64(&memcg->memsw, RES_USAGE); /* * current_threshold points to threshold just below or equal to usage. 
@@ -5396,15 +5340,15 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, mutex_lock(&memcg->thresholds_lock); - if (type == _MEM) + if (type == _MEM) { thresholds = &memcg->thresholds; - else if (type == _MEMSWAP) + usage = res_counter_read_u64(&memcg->res, RES_USAGE); + } else if (type == _MEMSWAP) { thresholds = &memcg->memsw_thresholds; - else + usage = res_counter_read_u64(&memcg->memsw, RES_USAGE); + } else BUG(); - usage = mem_cgroup_usage(memcg, type == _MEMSWAP); - /* Check if a threshold crossed before adding a new one */ if (thresholds->primary) __mem_cgroup_threshold(memcg, type == _MEMSWAP); @@ -5484,18 +5428,19 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, int i, j, size; mutex_lock(&memcg->thresholds_lock); - if (type == _MEM) + + if (type == _MEM) { thresholds = &memcg->thresholds; - else if (type == _MEMSWAP) + usage = res_counter_read_u64(&memcg->res, RES_USAGE); + } else if (type == _MEMSWAP) { thresholds = &memcg->memsw_thresholds; - else + usage = res_counter_read_u64(&memcg->memsw, RES_USAGE); + } else BUG(); if (!thresholds->primary) goto unlock; - usage = mem_cgroup_usage(memcg, type == _MEMSWAP); - /* Check if a threshold crossed before removing */ __mem_cgroup_threshold(memcg, type == _MEMSWAP); @@ -6249,9 +6194,9 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) * core guarantees its existence. */ } else { - res_counter_init(&memcg->res, NULL); - res_counter_init(&memcg->memsw, NULL); - res_counter_init(&memcg->kmem, NULL); + res_counter_init(&memcg->res, &root_mem_cgroup->res); + res_counter_init(&memcg->memsw, &root_mem_cgroup->memsw); + res_counter_init(&memcg->kmem, &root_mem_cgroup->kmem); /* * Deeper hierachy with use_hierarchy == false doesn't make * much sense so let cgroup subsystem know about this @@ -6387,13 +6332,7 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) /* Handlers for move charge at task migration. */ static int mem_cgroup_do_precharge(unsigned long count) { - int ret = 0; - - if (mem_cgroup_is_root(mc.to)) { - mc.precharge += count; - /* we don't need css_get for root */ - return ret; - } + int ret; /* Try a single bulk charge without reclaim first */ ret = mem_cgroup_try_charge(mc.to, GFP_KERNEL & ~__GFP_WAIT, count); @@ -6700,21 +6639,18 @@ static void __mem_cgroup_clear_mc(void) /* we must fixup refcnts and charges */ if (mc.moved_swap) { /* uncharge swap account from the old cgroup */ - if (!mem_cgroup_is_root(mc.from)) - res_counter_uncharge(&mc.from->memsw, - PAGE_SIZE * mc.moved_swap); + res_counter_uncharge(&mc.from->memsw, + PAGE_SIZE * mc.moved_swap); for (i = 0; i < mc.moved_swap; i++) css_put(&mc.from->css); - if (!mem_cgroup_is_root(mc.to)) { - /* - * we charged both to->res and to->memsw, so we should - * uncharge to->res. - */ - res_counter_uncharge(&mc.to->res, - PAGE_SIZE * mc.moved_swap); - } + /* + * we charged both to->res and to->memsw, so we should + * uncharge to->res. + */ + res_counter_uncharge(&mc.to->res, + PAGE_SIZE * mc.moved_swap); /* we've already done css_get(mc.to) */ mc.moved_swap = 0; } -- cgit v1.2.3 From 9a2385eef9f28fb5260c48c45fc8fe01f1da70a6 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 6 Aug 2014 16:06:01 -0700 Subject: mm: memcontrol: remove ordering between pc->mem_cgroup and PageCgroupUsed There is a write barrier between setting pc->mem_cgroup and PageCgroupUsed, which was added to allow LRU operations to lookup the memcg LRU list of a page without acquiring the page_cgroup lock. 
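As a rough illustration of the pairing involved (a generic sketch of the publish/consume idiom, not the current code), the barrier ordered a writer publishing the memcg pointer against a lockless reader trusting the Used bit:

	/* writer (charge path) */
	pc->mem_cgroup = memcg;		/* publish the data... */
	smp_wmb();			/* ...strictly before the flag */
	SetPageCgroupUsed(pc);

	/* lockless reader (old LRU lookup path) */
	if (PageCgroupUsed(pc)) {	/* observe the flag... */
		smp_rmb();		/* ...before trusting the data */
		memcg = pc->mem_cgroup;
	}

A write barrier is only meaningful while such lockless readers exist on the other side.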
But ever since commit 38c5d72f3ebe ("memcg: simplify LRU handling by new rule"), pages are ensured to be off-LRU while charging, so nobody else is changing LRU state while pc->mem_cgroup is being written, and there are no read barriers anymore. Remove the unnecessary write barrier. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Hugh Dickins Cc: Tejun Heo Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 07908ea954b6..c31bc40a5827 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2795,14 +2795,6 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, } pc->mem_cgroup = memcg; - /* - * We access a page_cgroup asynchronously without lock_page_cgroup(). - * Especially when a page_cgroup is taken from a page, pc->mem_cgroup - * is accessed after testing USED bit. To make pc->mem_cgroup visible - * before USED bit, we need memory barrier here. - * See mem_cgroup_add_lru_list(), etc. - */ - smp_wmb(); SetPageCgroupUsed(pc); if (lrucare) { @@ -3483,7 +3475,6 @@ void mem_cgroup_split_huge_fixup(struct page *head) for (i = 1; i < HPAGE_PMD_NR; i++) { pc = head_pc + i; pc->mem_cgroup = memcg; - smp_wmb();/* see __commit_charge() */ pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; } __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], -- cgit v1.2.3 From a840cda63e543d41270698525542a82b7a8a18d7 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 6 Aug 2014 16:06:04 -0700 Subject: mm: memcontrol: do not acquire page_cgroup lock for kmem pages Kmem page charging and uncharging is serialized by means of exclusive access to the page. Do not take the page_cgroup lock and don't set pc->flags atomically. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Acked-by: Vladimir Davydov Cc: Hugh Dickins Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c31bc40a5827..a6a062e409eb 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3407,12 +3407,13 @@ void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, memcg_uncharge_kmem(memcg, PAGE_SIZE << order); return; } - + /* + * The page is freshly allocated and not visible to any + * outside callers yet. Set up pc non-atomically. + */ pc = lookup_page_cgroup(page); - lock_page_cgroup(pc); pc->mem_cgroup = memcg; - SetPageCgroupUsed(pc); - unlock_page_cgroup(pc); + pc->flags = PCG_USED; } void __memcg_kmem_uncharge_pages(struct page *page, int order) @@ -3422,19 +3423,11 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order) pc = lookup_page_cgroup(page); - /* - * Fast unlocked return. Theoretically might have changed, have to - * check again after locking. - */ if (!PageCgroupUsed(pc)) return; - lock_page_cgroup(pc); - if (PageCgroupUsed(pc)) { - memcg = pc->mem_cgroup; - ClearPageCgroupUsed(pc); - } - unlock_page_cgroup(pc); + memcg = pc->mem_cgroup; + pc->flags = 0; /* * We trust that only if there is a memcg associated with the page, it -- cgit v1.2.3 From b69deb2b7e13f04da5c0684c7ce19e788736ab0d Mon Sep 17 00:00:00 2001 From: Zhang Zhen Date: Wed, 6 Aug 2014 16:06:06 -0700 Subject: mm/mem-hotplug: replace simple_strtoull() with kstrtoull() Use the newer and more pleasant kstrtoull() to replace simple_strtoull(), because simple_strtoull() is marked for obsoletion. 
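What kstrtoull() buys over simple_strtoull(), as a hedged sketch of the caller's side: simple_strtoull() with a NULL end pointer silently accepts trailing garbage and cannot report overflow, while kstrtoull() validates the entire string and returns an error code:

	unsigned long long val;
	int err;

	err = kstrtoull(buf, 0, &val);	/* base 0 auto-detects 0x/0 prefixes */
	if (err)
		return err;	/* -EINVAL on malformed input, -ERANGE on overflow */

so a bogus write to the sysfs file now fails the write instead of acting on a half-parsed value.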
Signed-off-by: Zhang Zhen Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 7c60ed27e711..a2e13e250bba 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -406,7 +406,9 @@ memory_probe_store(struct device *dev, struct device_attribute *attr, int i, ret; unsigned long pages_per_block = PAGES_PER_SECTION * sections_per_block; - phys_addr = simple_strtoull(buf, NULL, 0); + ret = kstrtoull(buf, 0, &phys_addr); + if (ret) + return ret; if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1)) return -EINVAL; -- cgit v1.2.3 From 1a4dc5bc7cb5659a8004d105afeb0571126f8f56 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Wed, 6 Aug 2014 16:06:08 -0700 Subject: mem-hotplug: improve zone_movable_is_highmem logic In the original code, zone_movable_is_highmem() assumes ZONE_MOVABLE is not highmem if CONFIG_HAVE_MEMBLOCK_NODE_MAP is not set, while online_pages() extracts pages from the zone immediately before ZONE_MOVABLE. This is logically inconsistent: if HAVE_MEMBLOCK_NODE_MAP is turned off but HIGHMEM is on, zone_movable_is_highmem() treats the movable zone as not highmem, but online_pages() extracts pages from ZONE_HIGHMEM. This inconsistency doesn't cause a real problem currently, because all architectures that support online_pages also have HAVE_MEMBLOCK_NODE_MAP. However, fixing it makes the code clearer and also helps further development. Signed-off-by: Wang Nan Cc: Zhang Zhen Cc: Mel Gorman Cc: Jiang Liu Cc: Li Zefan Cc: Yinghai Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 6cbd1b6c3d20..559e659288fc 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -872,6 +872,8 @@ static inline int zone_movable_is_highmem(void) { #if defined(CONFIG_HIGHMEM) && defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) return movable_zone == ZONE_HIGHMEM; +#elif defined(CONFIG_HIGHMEM) + return (ZONE_MOVABLE - 1) == ZONE_HIGHMEM; #else return 0; #endif -- cgit v1.2.3 From 8d07429319b2836604061f48f7e3dfe78acc060c Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 6 Aug 2014 16:06:10 -0700 Subject: mm: vmscan: remove remains of kswapd-managed zone->all_unreclaimable shrink_zones() has a special branch to skip the all_unreclaimable() check during hibernation, because a frozen kswapd can't mark a zone unreclaimable. But ever since commit 6e543d5780e3 ("mm: vmscan: fix do_try_to_free_pages() livelock"), determining a zone to be unreclaimable is done by directly looking at its scan history and no longer relies on kswapd setting the per-zone flag. Remove this branch and let shrink_zones() check the reclaimability of the target zones regardless of hibernation state. Signed-off-by: Johannes Weiner Acked-by: Mel Gorman Cc: Rik van Riel Acked-by: Michal Hocko Cc: Vlastimil Babka Acked-by: Minchan Kim Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 0f16ffe8eb67..19b5b8016209 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2534,14 +2534,6 @@ out: if (sc->nr_reclaimed) return sc->nr_reclaimed; - /* - * As hibernation is going on, kswapd is freezed so that it can't mark - * the zone into all_unreclaimable. Thus bypassing all_unreclaimable - * check. 
- */ - if (oom_killer_disabled) - return 0; - /* Aborted reclaim to try compaction? don't OOM, then */ if (aborted_reclaim) return 1; -- cgit v1.2.3 From 0b06496a338e83627dc5f0d25323e7a1ae9cb87d Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 6 Aug 2014 16:06:12 -0700 Subject: mm: vmscan: rework compaction-ready signaling in direct reclaim Page reclaim for a higher-order page runs until compaction is ready, then aborts and signals this situation through the return value of shrink_zones(). This is an oddly specific signal to encode in the return value of shrink_zones(), though, and can be quite confusing. Introduce sc->compaction_ready and signal the compactability of the zones out-of-band to free up the return value of shrink_zones() for actual zone reclaimability. Signed-off-by: Johannes Weiner Acked-by: Vlastimil Babka Cc: Mel Gorman Cc: Rik van Riel Cc: Michal Hocko Acked-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 70 ++++++++++++++++++++++++++++--------------------------------- 1 file changed, 32 insertions(+), 38 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 19b5b8016209..6f43df4a5253 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -65,6 +65,9 @@ struct scan_control { /* Number of pages freed so far during a call to shrink_zones() */ unsigned long nr_reclaimed; + /* One of the zones is ready for compaction */ + int compaction_ready; + /* How many pages shrink_list() should reclaim */ unsigned long nr_to_reclaim; @@ -2292,15 +2295,11 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc) } /* Returns true if compaction should go ahead for a high-order request */ -static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) +static inline bool compaction_ready(struct zone *zone, int order) { unsigned long balance_gap, watermark; bool watermark_ok; - /* Do not consider compaction for orders reclaim is meant to satisfy */ - if (sc->order <= PAGE_ALLOC_COSTLY_ORDER) - return false; - /* * Compaction takes time to run and there are potentially other * callers using the pages just freed. Continue reclaiming until @@ -2309,18 +2308,18 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) */ balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP( zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO)); - watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order); + watermark = high_wmark_pages(zone) + balance_gap + (2UL << order); watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0); /* * If compaction is deferred, reclaim up to a point where * compaction will have a chance of success when re-enabled */ - if (compaction_deferred(zone, sc->order)) + if (compaction_deferred(zone, order)) return watermark_ok; /* If compaction is not ready to start, keep reclaiming */ - if (!compaction_suitable(zone, sc->order)) + if (!compaction_suitable(zone, order)) return false; return watermark_ok; @@ -2341,20 +2340,14 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) * * If a zone is deemed to be full of pinned pages then just give it a light * scan then give up on it. - * - * This function returns true if a zone is being reclaimed for a costly - * high-order allocation and compaction is ready to begin. This indicates to - * the caller that it should consider retrying the allocation instead of - * further reclaim. 
*/ -static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) +static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) { struct zoneref *z; struct zone *zone; unsigned long nr_soft_reclaimed; unsigned long nr_soft_scanned; unsigned long lru_pages = 0; - bool aborted_reclaim = false; struct reclaim_state *reclaim_state = current->reclaim_state; gfp_t orig_mask; struct shrink_control shrink = { @@ -2391,22 +2384,24 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) if (sc->priority != DEF_PRIORITY && !zone_reclaimable(zone)) continue; /* Let kswapd poll it */ - if (IS_ENABLED(CONFIG_COMPACTION)) { - /* - * If we already have plenty of memory free for - * compaction in this zone, don't free any more. - * Even though compaction is invoked for any - * non-zero order, only frequent costly order - * reclamation is disruptive enough to become a - * noticeable problem, like transparent huge - * page allocations. - */ - if ((zonelist_zone_idx(z) <= requested_highidx) - && compaction_ready(zone, sc)) { - aborted_reclaim = true; - continue; - } + + /* + * If we already have plenty of memory free for + * compaction in this zone, don't free any more. + * Even though compaction is invoked for any + * non-zero order, only frequent costly order + * reclamation is disruptive enough to become a + * noticeable problem, like transparent huge + * page allocations. + */ + if (IS_ENABLED(CONFIG_COMPACTION) && + sc->order > PAGE_ALLOC_COSTLY_ORDER && + zonelist_zone_idx(z) <= requested_highidx && + compaction_ready(zone, sc->order)) { + sc->compaction_ready = true; + continue; } + /* * This steals pages from memory cgroups over softlimit * and returns the number of reclaimed pages and @@ -2444,8 +2439,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) * promoted it to __GFP_HIGHMEM. */ sc->gfp_mask = orig_mask; - - return aborted_reclaim; } /* All zones in zonelist are unreclaimable? */ @@ -2489,7 +2482,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, { unsigned long total_scanned = 0; unsigned long writeback_threshold; - bool aborted_reclaim; delayacct_freepages_start(); @@ -2500,11 +2492,14 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup, sc->priority); sc->nr_scanned = 0; - aborted_reclaim = shrink_zones(zonelist, sc); + shrink_zones(zonelist, sc); total_scanned += sc->nr_scanned; if (sc->nr_reclaimed >= sc->nr_to_reclaim) - goto out; + break; + + if (sc->compaction_ready) + break; /* * If we're getting trouble reclaiming, start doing @@ -2526,16 +2521,15 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, WB_REASON_TRY_TO_FREE_PAGES); sc->may_writepage = 1; } - } while (--sc->priority >= 0 && !aborted_reclaim); + } while (--sc->priority >= 0); -out: delayacct_freepages_end(); if (sc->nr_reclaimed) return sc->nr_reclaimed; /* Aborted reclaim to try compaction? don't OOM, then */ - if (aborted_reclaim) + if (sc->compaction_ready) return 1; /* top priority shrink_zones still had more to do? don't OOM, then */ -- cgit v1.2.3 From 2344d7e44b870f9df67e505ee4e633217de752ba Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 6 Aug 2014 16:06:15 -0700 Subject: mm: vmscan: remove all_unreclaimable() Direct reclaim currently calls shrink_zones() to reclaim all members of a zonelist, and if that wasn't successful it does another pass through the same zonelist to check overall reclaimability. 
Just check reclaimability in shrink_zones() directly and propagate the result through the return value. Then remove all_unreclaimable(). Signed-off-by: Johannes Weiner Cc: Mel Gorman Cc: Rik van Riel Acked-by: Michal Hocko Cc: Vlastimil Babka Acked-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 49 ++++++++++++++++++++++++------------------------- 1 file changed, 24 insertions(+), 25 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 6f43df4a5253..74a9e0ae09b0 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2244,9 +2244,10 @@ static inline bool should_continue_reclaim(struct zone *zone, } } -static void shrink_zone(struct zone *zone, struct scan_control *sc) +static bool shrink_zone(struct zone *zone, struct scan_control *sc) { unsigned long nr_reclaimed, nr_scanned; + bool reclaimable = false; do { struct mem_cgroup *root = sc->target_mem_cgroup; @@ -2290,8 +2291,13 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc) sc->nr_scanned - nr_scanned, sc->nr_reclaimed - nr_reclaimed); + if (sc->nr_reclaimed - nr_reclaimed) + reclaimable = true; + } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed, sc->nr_scanned - nr_scanned, sc)); + + return reclaimable; } /* Returns true if compaction should go ahead for a high-order request */ @@ -2340,8 +2346,10 @@ static inline bool compaction_ready(struct zone *zone, int order) * * If a zone is deemed to be full of pinned pages then just give it a light * scan then give up on it. + * + * Returns true if a zone was reclaimable. */ -static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) +static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) { struct zoneref *z; struct zone *zone; @@ -2354,6 +2362,7 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) .gfp_mask = sc->gfp_mask, }; enum zone_type requested_highidx = gfp_zone(sc->gfp_mask); + bool reclaimable = false; /* * If the number of buffer_heads in the machine exceeds the maximum @@ -2414,10 +2423,17 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) &nr_soft_scanned); sc->nr_reclaimed += nr_soft_reclaimed; sc->nr_scanned += nr_soft_scanned; + if (nr_soft_reclaimed) + reclaimable = true; /* need some check for avoid more shrink_zone() */ } - shrink_zone(zone, sc); + if (shrink_zone(zone, sc)) + reclaimable = true; + + if (global_reclaim(sc) && + !reclaimable && zone_reclaimable(zone)) + reclaimable = true; } /* @@ -2439,26 +2455,8 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) * promoted it to __GFP_HIGHMEM. */ sc->gfp_mask = orig_mask; -} - -/* All zones in zonelist are unreclaimable? 
*/ -static bool all_unreclaimable(struct zonelist *zonelist, - struct scan_control *sc) -{ - struct zoneref *z; - struct zone *zone; - for_each_zone_zonelist_nodemask(zone, z, zonelist, - gfp_zone(sc->gfp_mask), sc->nodemask) { - if (!populated_zone(zone)) - continue; - if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) - continue; - if (zone_reclaimable(zone)) - return false; - } - - return true; + return reclaimable; } /* @@ -2482,6 +2480,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, { unsigned long total_scanned = 0; unsigned long writeback_threshold; + bool zones_reclaimable; delayacct_freepages_start(); @@ -2492,7 +2491,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup, sc->priority); sc->nr_scanned = 0; - shrink_zones(zonelist, sc); + zones_reclaimable = shrink_zones(zonelist, sc); total_scanned += sc->nr_scanned; if (sc->nr_reclaimed >= sc->nr_to_reclaim) @@ -2532,8 +2531,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, if (sc->compaction_ready) return 1; - /* top priority shrink_zones still had more to do? don't OOM, then */ - if (global_reclaim(sc) && !all_unreclaimable(zonelist, sc)) + /* Any of the zones still reclaimable? Don't OOM. */ + if (zones_reclaimable) return 1; return 0; -- cgit v1.2.3 From 02695175c79b9163c798cc1cb78c628d011c07a6 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 6 Aug 2014 16:06:17 -0700 Subject: mm: vmscan: move swappiness out of scan_control Swappiness is determined for each scanned memcg individually in shrink_zone() and is not a parameter that applies throughout the reclaim scan. Move it out of struct scan_control to prevent accidental use of a stale value. Signed-off-by: Johannes Weiner Cc: Mel Gorman Cc: Rik van Riel Acked-by: Michal Hocko Cc: Vlastimil Babka Acked-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 74a9e0ae09b0..c28b8981e56a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -89,9 +89,6 @@ struct scan_control { /* Scan (total_size >> priority) pages at once */ int priority; - /* anon vs. file LRUs scanning "ratio" */ - int swappiness; - /* * The memory cgroup that hit its limit and as a result is the * primary target of this reclaim invocation. @@ -1868,8 +1865,8 @@ enum scan_balance { * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan */ -static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, - unsigned long *nr) +static void get_scan_count(struct lruvec *lruvec, int swappiness, + struct scan_control *sc, unsigned long *nr) { struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; u64 fraction[2]; @@ -1912,7 +1909,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, * using the memory controller's swap limit feature would be * too expensive. */ - if (!global_reclaim(sc) && !sc->swappiness) { + if (!global_reclaim(sc) && !swappiness) { scan_balance = SCAN_FILE; goto out; } @@ -1922,7 +1919,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, * system is close to OOM, scan both anon and file equally * (unless the swappiness setting disagrees with swapping). 
*/ - if (!sc->priority && sc->swappiness) { + if (!sc->priority && swappiness) { scan_balance = SCAN_EQUAL; goto out; } @@ -1965,7 +1962,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, * With swappiness at 100, anonymous and file have the same priority. * This scanning priority is essentially the inverse of IO cost. */ - anon_prio = sc->swappiness; + anon_prio = swappiness; file_prio = 200 - anon_prio; /* @@ -2055,7 +2052,8 @@ out: /* * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. */ -static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +static void shrink_lruvec(struct lruvec *lruvec, int swappiness, + struct scan_control *sc) { unsigned long nr[NR_LRU_LISTS]; unsigned long targets[NR_LRU_LISTS]; @@ -2066,7 +2064,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) struct blk_plug plug; bool scan_adjusted; - get_scan_count(lruvec, sc, nr); + get_scan_count(lruvec, swappiness, sc, nr); /* Record the original scan target for proportional adjustments later */ memcpy(targets, nr, sizeof(nr)); @@ -2263,11 +2261,12 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc) memcg = mem_cgroup_iter(root, NULL, &reclaim); do { struct lruvec *lruvec; + int swappiness; lruvec = mem_cgroup_zone_lruvec(zone, memcg); + swappiness = mem_cgroup_swappiness(memcg); - sc->swappiness = mem_cgroup_swappiness(memcg); - shrink_lruvec(lruvec, sc); + shrink_lruvec(lruvec, swappiness, sc); /* * Direct reclaim and kswapd have to scan all memory @@ -2714,10 +2713,10 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, .may_swap = !noswap, .order = 0, .priority = 0, - .swappiness = mem_cgroup_swappiness(memcg), .target_mem_cgroup = memcg, }; struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); + int swappiness = mem_cgroup_swappiness(memcg); sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); @@ -2733,7 +2732,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, * will pick up pages from other mem cgroup's as well. We hack * the priority and make it zero. 
*/ - shrink_lruvec(lruvec, &sc); + shrink_lruvec(lruvec, swappiness, &sc); trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); -- cgit v1.2.3 From ee814fe23daf08abd3ea6c6b1f900f4f25b524d7 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 6 Aug 2014 16:06:19 -0700 Subject: mm: vmscan: clean up struct scan_control Reorder the members by input and output, then turn the individual integers for may_writepage, may_unmap, may_swap, compaction_ready, hibernation_mode into bit fields to save stack space:

  +72/-296 -224
  kswapd                          104     176     +72
  try_to_free_pages                80      56     -24
  try_to_free_mem_cgroup_pages     80      56     -24
  shrink_all_memory                88      64     -24
  reclaim_clean_pages_from_list   168     144     -24
  mem_cgroup_shrink_node_zone     104      80     -24
  __zone_reclaim                  176     152     -24
  balance_pgdat                   152       -    -152

Signed-off-by: Johannes Weiner Suggested-by: Mel Gorman Acked-by: Mel Gorman Acked-by: Michal Hocko Cc: Minchan Kim Cc: Rik van Riel Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 99 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 46 insertions(+), 53 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index c28b8981e56a..81dd858b9d17 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -59,35 +59,20 @@ #include struct scan_control { - /* Incremented by the number of inactive pages that were scanned */ - unsigned long nr_scanned; - - /* Number of pages freed so far during a call to shrink_zones() */ - unsigned long nr_reclaimed; - - /* One of the zones is ready for compaction */ - int compaction_ready; - /* How many pages shrink_list() should reclaim */ unsigned long nr_to_reclaim; - unsigned long hibernation_mode; - /* This context's GFP mask */ gfp_t gfp_mask; - int may_writepage; - - /* Can mapped pages be reclaimed? */ - int may_unmap; - - /* Can pages be swapped as part of reclaim? */ - int may_swap; - + /* Allocation order */ int order; - /* Scan (total_size >> priority) pages at once */ - int priority; + /* + * Nodemask of nodes allowed by the caller. If NULL, all nodes + * are scanned. + */ + nodemask_t *nodemask; /* * The memory cgroup that hit its limit and as a result is the @@ -95,11 +80,27 @@ struct scan_control { */ struct mem_cgroup *target_mem_cgroup; - /* - * Nodemask of nodes allowed by the caller. If NULL, all nodes - * are scanned. - */ - nodemask_t *nodemask; + /* Scan (total_size >> priority) pages at once */ + int priority; + + unsigned int may_writepage:1; + + /* Can mapped pages be reclaimed? */ + unsigned int may_unmap:1; + + /* Can pages be swapped as part of reclaim? 
*/ + unsigned int may_swap:1; + + unsigned int hibernation_mode:1; + + /* One of the zones is ready for compaction */ + unsigned int compaction_ready:1; + + /* Incremented by the number of inactive pages that were scanned */ + unsigned long nr_scanned; + + /* Number of pages freed so far during a call to shrink_zones() */ + unsigned long nr_reclaimed; }; #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) @@ -2668,15 +2669,14 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, { unsigned long nr_reclaimed; struct scan_control sc = { + .nr_to_reclaim = SWAP_CLUSTER_MAX, .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), + .order = order, + .nodemask = nodemask, + .priority = DEF_PRIORITY, .may_writepage = !laptop_mode, - .nr_to_reclaim = SWAP_CLUSTER_MAX, .may_unmap = 1, .may_swap = 1, - .order = order, - .priority = DEF_PRIORITY, - .target_mem_cgroup = NULL, - .nodemask = nodemask, }; /* @@ -2706,14 +2706,11 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, unsigned long *nr_scanned) { struct scan_control sc = { - .nr_scanned = 0, .nr_to_reclaim = SWAP_CLUSTER_MAX, + .target_mem_cgroup = memcg, .may_writepage = !laptop_mode, .may_unmap = 1, .may_swap = !noswap, - .order = 0, - .priority = 0, - .target_mem_cgroup = memcg, }; struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); int swappiness = mem_cgroup_swappiness(memcg); @@ -2748,16 +2745,14 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_reclaimed; int nid; struct scan_control sc = { - .may_writepage = !laptop_mode, - .may_unmap = 1, - .may_swap = !noswap, .nr_to_reclaim = SWAP_CLUSTER_MAX, - .order = 0, - .priority = DEF_PRIORITY, - .target_mem_cgroup = memcg, - .nodemask = NULL, /* we don't care the placement */ .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), + .target_mem_cgroup = memcg, + .priority = DEF_PRIORITY, + .may_writepage = !laptop_mode, + .may_unmap = 1, + .may_swap = !noswap, }; /* @@ -3015,12 +3010,11 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, unsigned long nr_soft_scanned; struct scan_control sc = { .gfp_mask = GFP_KERNEL, + .order = order, .priority = DEF_PRIORITY, + .may_writepage = !laptop_mode, .may_unmap = 1, .may_swap = 1, - .may_writepage = !laptop_mode, - .order = order, - .target_mem_cgroup = NULL, }; count_vm_event(PAGEOUTRUN); @@ -3401,14 +3395,13 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) { struct reclaim_state reclaim_state; struct scan_control sc = { + .nr_to_reclaim = nr_to_reclaim, .gfp_mask = GFP_HIGHUSER_MOVABLE, - .may_swap = 1, - .may_unmap = 1, + .priority = DEF_PRIORITY, .may_writepage = 1, - .nr_to_reclaim = nr_to_reclaim, + .may_unmap = 1, + .may_swap = 1, .hibernation_mode = 1, - .order = 0, - .priority = DEF_PRIORITY, }; struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); struct task_struct *p = current; @@ -3588,13 +3581,13 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) struct task_struct *p = current; struct reclaim_state reclaim_state; struct scan_control sc = { - .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), - .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), - .may_swap = 1, .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), .order = order, .priority = ZONE_RECLAIM_PRIORITY, + .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), + .may_unmap = !!(zone_reclaim_mode & 
RECLAIM_SWAP), + .may_swap = 1, }; struct shrink_control shrink = { .gfp_mask = sc.gfp_mask, -- cgit v1.2.3 From 660654f90e7f8f6d8163276d47fc1573a39c7007 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 6 Aug 2014 16:06:25 -0700 Subject: mm/vmalloc.c: add a schedule point to vmalloc() It is not uncommon on busy servers to get stuck for hundreds of ms in vmalloc() calls (like file descriptor expansions). Add a cond_resched() to __vmalloc_area_node() to be gentle to other tasks. 
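The shape of the change, as a simplified sketch of the page-population loop in __vmalloc_area_node() (surrounding details trimmed):

	for (i = 0; i < area->nr_pages; i++) {
		struct page *page = alloc_page(gfp_mask);

		if (unlikely(!page))
			goto fail;
		area->pages[i] = page;
		/* A large vmalloc() spends a long time in this loop; offer
		 * to reschedule between pages when sleeping is allowed. */
		if (gfp_mask & __GFP_WAIT)
			cond_resched();
	}
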
[akpm@linux-foundation.org: only do it for __GFP_WAIT, per David] Signed-off-by: Eric Dumazet Cc: Hugh Dickins Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index fdbb116ee669..a3cad905f560 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1602,6 +1602,8 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, goto fail; } area->pages[i] = page; + if (gfp_mask & __GFP_WAIT) + cond_resched(); } if (map_vm_area(area, prot, &pages)) -- cgit v1.2.3 From 930f036b4ff6501b91e09bba4bf94423203dabd9 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 6 Aug 2014 16:06:28 -0700 Subject: mm, vmalloc: constify allocation mask tmp_mask in the __vmalloc_area_node() iteration never changes so it can be moved into function scope and marked with const. This causes the movl and orl to only be done once per call rather than area->nr_pages times. nested_gfp can also be marked const. Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index a3cad905f560..9ec4173f48a8 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1566,7 +1566,8 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, const int order = 0; struct page **pages; unsigned int nr_pages, array_size, i; - gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; + const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; + const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN; nr_pages = get_vm_area_size(area) >> PAGE_SHIFT; array_size = (nr_pages * sizeof(struct page *)); @@ -1589,12 +1590,11 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, for (i = 0; i < area->nr_pages; i++) { struct page *page; - gfp_t tmp_mask = gfp_mask | __GFP_NOWARN; if (node == NUMA_NO_NODE) - page = alloc_page(tmp_mask); + page = alloc_page(alloc_mask); else - page = alloc_pages_node(node, tmp_mask, order); + page = alloc_pages_node(node, alloc_mask, order); if (unlikely(!page)) { /* Successfully allocated i pages, free them in __vunmap() */ -- cgit v1.2.3 From ef6b571fb8920d5006349a6e29ac47c4817e9691 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 6 Aug 2014 16:06:30 -0700 Subject: include/linux/mmdebug.h: add VM_WARN_ONCE() It was missing... Cc: Konstantin Khlebnikov Cc: Dave Hansen Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmdebug.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h index edd82a105220..2f348d02f640 100644 --- a/include/linux/mmdebug.h +++ b/include/linux/mmdebug.h @@ -20,11 +20,13 @@ extern void dump_page_badflags(struct page *page, const char *reason, } while (0) #define VM_WARN_ON(cond) WARN_ON(cond) #define VM_WARN_ON_ONCE(cond) WARN_ON_ONCE(cond) +#define VM_WARN_ONCE(cond, format...) WARN_ONCE(cond, format) #else #define VM_BUG_ON(cond) BUILD_BUG_ON_INVALID(cond) #define VM_BUG_ON_PAGE(cond, page) VM_BUG_ON(cond) #define VM_WARN_ON(cond) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ON_ONCE(cond) BUILD_BUG_ON_INVALID(cond) +#define VM_WARN_ONCE(cond, format...) 
BUILD_BUG_ON_INVALID(cond) #endif #ifdef CONFIG_DEBUG_VIRTUAL -- cgit v1.2.3 From 66ee4b8887ec5ce04bae3e840d206db7b7ad34d1 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 6 Aug 2014 16:06:32 -0700 Subject: shmem: fix double uncharge in __shmem_file_setup() If __shmem_file_setup() fails on struct file allocation it uncharges memory commitment twice: first by shmem_unacct_size() and a second time implicitly in shmem_evict_inode() when it kills the newly created inode. This patch removes shmem_unacct_size() from the error path if the inode was already there. Signed-off-by: Konstantin Khlebnikov Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index af68b15a8fc1..3609d31ad0dd 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2932,16 +2932,16 @@ static struct file *__shmem_file_setup(const char *name, loff_t size, this.len = strlen(name); this.hash = 0; /* will go */ sb = shm_mnt->mnt_sb; + path.mnt = mntget(shm_mnt); path.dentry = d_alloc_pseudo(sb, &this); if (!path.dentry) goto put_memory; d_set_d_op(path.dentry, &anon_ops); - path.mnt = mntget(shm_mnt); res = ERR_PTR(-ENOSPC); inode = shmem_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0, flags); if (!inode) - goto put_dentry; + goto put_memory; inode->i_flags |= i_flags; d_instantiate(path.dentry, inode); @@ -2949,19 +2949,19 @@ static struct file *__shmem_file_setup(const char *name, loff_t size, clear_nlink(inode); /* It is unlinked */ res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size)); if (IS_ERR(res)) - goto put_dentry; + goto put_path; res = alloc_file(&path, FMODE_WRITE | FMODE_READ, &shmem_file_operations); if (IS_ERR(res)) - goto put_dentry; + goto put_path; return res; -put_dentry: - path_put(&path); put_memory: shmem_unacct_size(flags, size); +put_path: + path_put(&path); return res; } -- cgit v1.2.3 From 77142517990fd3d982678c2945ea2c4188ec5f9a Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 6 Aug 2014 16:06:34 -0700 Subject: shmem: update memory reservation on truncate A shared anonymous mapping created without MAP_NORESERVE holds a memory reservation for the whole range of the shmem segment. Usually there is no way to change its size, but /proc/<pid>/map_files/... (available if CONFIG_CHECKPOINT_RESTORE=y) allows that. This patch adjusts the memory reservation in shmem_setattr(). Signed-off-by: Konstantin Khlebnikov Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/mm/shmem.c b/mm/shmem.c index 3609d31ad0dd..57fd82a5af7a 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -149,6 +149,19 @@ static inline void shmem_unacct_size(unsigned long flags, loff_t size) vm_unacct_memory(VM_ACCT(size)); } +static inline int shmem_reacct_size(unsigned long flags, + loff_t oldsize, loff_t newsize) +{ + if (!(flags & VM_NORESERVE)) { + if (VM_ACCT(newsize) > VM_ACCT(oldsize)) + return security_vm_enough_memory_mm(current->mm, + VM_ACCT(newsize) - VM_ACCT(oldsize)); + else if (VM_ACCT(newsize) < VM_ACCT(oldsize)) + vm_unacct_memory(VM_ACCT(oldsize) - VM_ACCT(newsize)); + } + return 0; +} + /* * ... whereas tmpfs objects are accounted incrementally as * pages are allocated, in order to allow huge sparse files. 
@@ -549,6 +562,10 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr) loff_t newsize = attr->ia_size; if (newsize != oldsize) { + error = shmem_reacct_size(SHMEM_I(inode)->flags, + oldsize, newsize); + if (error) + return error; i_size_write(inode, newsize); inode->i_ctime = inode->i_mtime = CURRENT_TIME; } -- cgit v1.2.3 From 82f71ae4a2b829a25971bdf54b4d0d3d69d3c8b7 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 6 Aug 2014 16:06:36 -0700 Subject: mm: catch memory commitment underflow Print a warning (if CONFIG_DEBUG_VM=y) when memory commitment becomes too negative. This shouldn't happen any more - the previous two patches fixed the committed_as underflow issues. [akpm@linux-foundation.org: use VM_WARN_ONCE, per Dave] Signed-off-by: Konstantin Khlebnikov Cc: Hugh Dickins Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mm/mmap.c b/mm/mmap.c index 129b847d30cc..64c9d736155c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -31,6 +31,7 @@ #include #include #include +#include <linux/mmdebug.h> #include #include #include @@ -134,6 +135,10 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) { unsigned long free, allowed, reserve; + VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) < + -(s64)vm_committed_as_batch * num_online_cpus(), + "memory commitment underflow"); + vm_acct_memory(pages); /* -- cgit v1.2.3 From cc7452b6dca384400960d40090a98d0eb920ab22 Mon Sep 17 00:00:00 2001 From: Rafael Aquini Date: Wed, 6 Aug 2014 16:06:38 -0700 Subject: mm: export NR_SHMEM via sysinfo(2) / si_meminfo() interfaces Historically, we exported shared pages to userspace via sysinfo(2) sharedram and /proc/meminfo's "MemShared" fields. With the advent of tmpfs, from kernel v2.4 onward, that old way for accounting shared mem was deemed inaccurate and we started to export a hard-coded 0 for sysinfo.sharedram. Later on, during the 2.6 timeframe, "MemShared" got re-introduced to /proc/meminfo re-branded as "Shmem", but we're still reporting sysinfo.sharedram as that old hard-coded zero, which makes the "shared memory" report inconsistent across interfaces. This patch leverages the addition of explicit accounting for pages used by shmem/tmpfs -- "4b02108 mm: oom analysis: add shmem vmstat" -- in order to make the users of sysinfo(2) and si_meminfo*() friends aware of that vmstat entry and make them report it consistently across the interfaces, as well as to make the data returned by sysinfo(2) consistent with what our current API documentation states. 
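The effect is easy to observe from userspace with a small test program (illustrative only, not part of the patch); sysinfo.sharedram should now track the Shmem line of /proc/meminfo:

	#include <stdio.h>
	#include <sys/sysinfo.h>

	int main(void)
	{
		struct sysinfo si;

		if (sysinfo(&si) != 0)
			return 1;
		/* sharedram is reported in units of mem_unit bytes */
		printf("Shmem: %llu kB\n",
		       (unsigned long long)si.sharedram * si.mem_unit / 1024);
		return 0;
	}
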
Signed-off-by: Rafael Aquini Acked-by: Rik van Riel Cc: Mel Gorman Cc: Johannes Weiner Cc: KOSAKI Motohiro Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/node.c | 2 +- fs/proc/meminfo.c | 2 +- mm/page_alloc.c | 3 ++- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/base/node.c b/drivers/base/node.c index 8f7ed9933a7c..c6d3ae05f1ca 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -126,7 +126,7 @@ static ssize_t node_read_meminfo(struct device *dev, nid, K(node_page_state(nid, NR_FILE_PAGES)), nid, K(node_page_state(nid, NR_FILE_MAPPED)), nid, K(node_page_state(nid, NR_ANON_PAGES)), - nid, K(node_page_state(nid, NR_SHMEM)), + nid, K(i.sharedram), nid, node_page_state(nid, NR_KERNEL_STACK) * THREAD_SIZE / 1024, nid, K(node_page_state(nid, NR_PAGETABLE)), diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 7445af0b1aa3..aa1eee06420f 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -168,7 +168,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) K(global_page_state(NR_WRITEBACK)), K(global_page_state(NR_ANON_PAGES)), K(global_page_state(NR_FILE_MAPPED)), - K(global_page_state(NR_SHMEM)), + K(i.sharedram), K(global_page_state(NR_SLAB_RECLAIMABLE) + global_page_state(NR_SLAB_UNRECLAIMABLE)), K(global_page_state(NR_SLAB_RECLAIMABLE)), diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c1c6cb78e5ca..0987ac9f0a4e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3047,7 +3047,7 @@ static inline void show_node(struct zone *zone) void si_meminfo(struct sysinfo *val) { val->totalram = totalram_pages; - val->sharedram = 0; + val->sharedram = global_page_state(NR_SHMEM); val->freeram = global_page_state(NR_FREE_PAGES); val->bufferram = nr_blockdev_pages(); val->totalhigh = totalhigh_pages; @@ -3067,6 +3067,7 @@ void si_meminfo_node(struct sysinfo *val, int nid) for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) managed_pages += pgdat->node_zones[zone_type].managed_pages; val->totalram = managed_pages; + val->sharedram = node_page_state(nid, NR_SHMEM); val->freeram = node_page_state(nid, NR_FREE_PAGES); #ifdef CONFIG_HIGHMEM val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages; -- cgit v1.2.3 From c2ea2181db43ced2e5945b9596bb3bb9935ce92e Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 6 Aug 2014 16:06:41 -0700 Subject: mm/hwpoison-inject.c: remove unnecessary null test before debugfs_remove_recursive Fix checkpatch warning: "WARNING: debugfs_remove_recursive(NULL) is safe this check is probably not required" Signed-off-by: Fabian Frederick Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hwpoison-inject.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index 95487c71cad5..329caf56df22 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -72,8 +72,7 @@ DEFINE_SIMPLE_ATTRIBUTE(unpoison_fops, NULL, hwpoison_unpoison, "%lli\n"); static void pfn_inject_exit(void) { - if (hwpoison_dir) - debugfs_remove_recursive(hwpoison_dir); + debugfs_remove_recursive(hwpoison_dir); } static int pfn_inject_init(void) -- cgit v1.2.3 From eb39d618f9e80f81cfc5788cf1b252d141c2f0c3 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 6 Aug 2014 16:06:43 -0700 Subject: mm: replace init_page_accessed by __SetPageReferenced Do we really need an exported alias for __SetPageReferenced()? 
Its callers better know what they're doing, in which case the page would not be already marked referenced. Kill init_page_accessed() and just use __SetPageReferenced() inline. Signed-off-by: Hugh Dickins Acked-by: Mel Gorman Cc: Johannes Weiner Cc: Vlastimil Babka Cc: Michal Hocko Cc: Dave Hansen Cc: Prabhakar Lad Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 1 - mm/filemap.c | 4 ++-- mm/shmem.c | 2 +- mm/swap.c | 14 +++----------- 4 files changed, 6 insertions(+), 15 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 4bdbee80eede..1eb64043c076 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -311,7 +311,6 @@ extern void lru_add_page_tail(struct page *page, struct page *page_tail, struct lruvec *lruvec, struct list_head *head); extern void activate_page(struct page *); extern void mark_page_accessed(struct page *); -extern void init_page_accessed(struct page *page); extern void lru_add_drain(void); extern void lru_add_drain_cpu(int cpu); extern void lru_add_drain_all(void); diff --git a/mm/filemap.c b/mm/filemap.c index 65d44fd88c78..7e85c8147e1b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1091,9 +1091,9 @@ no_page: if (WARN_ON_ONCE(!(fgp_flags & FGP_LOCK))) fgp_flags |= FGP_LOCK; - /* Init accessed so avoit atomic mark_page_accessed later */ + /* Init accessed so avoid atomic mark_page_accessed later */ if (fgp_flags & FGP_ACCESSED) - init_page_accessed(page); + __SetPageReferenced(page); err = add_to_page_cache_lru(page, mapping, offset, radix_gfp_mask); if (unlikely(err)) { diff --git a/mm/shmem.c b/mm/shmem.c index 57fd82a5af7a..fe15d96c3166 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1166,7 +1166,7 @@ repeat: __SetPageSwapBacked(page); __set_page_locked(page); if (sgp == SGP_WRITE) - init_page_accessed(page); + __SetPageReferenced(page); error = mem_cgroup_charge_file(page, current->mm, gfp & GFP_RECLAIM_MASK); diff --git a/mm/swap.c b/mm/swap.c index 9e8e3472248b..d8eb4d09ffa2 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -589,6 +589,9 @@ static void __lru_cache_activate_page(struct page *page) * inactive,unreferenced -> inactive,referenced * inactive,referenced -> active,unreferenced * active,unreferenced -> active,referenced + * + * When a newly allocated page is not yet visible, so safe for non-atomic ops, + * __SetPageReferenced(page) may be substituted for mark_page_accessed(page). */ void mark_page_accessed(struct page *page) { @@ -614,17 +617,6 @@ void mark_page_accessed(struct page *page) } EXPORT_SYMBOL(mark_page_accessed); -/* - * Used to mark_page_accessed(page) that is not visible yet and when it is - * still safe to use non-atomic ops - */ -void init_page_accessed(struct page *page) -{ - if (!PageReferenced(page)) - __SetPageReferenced(page); -} -EXPORT_SYMBOL(init_page_accessed); - static void __lru_cache_add(struct page *page) { struct pagevec *pvec = &get_cpu_var(lru_add_pvec); -- cgit v1.2.3 From 2f4612af43d4854c892f5ef8ed7a98b6492aee44 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 6 Aug 2014 16:06:45 -0700 Subject: mm,hugetlb: make unmap_ref_private() return void This function always returns 1, so there is no need to check its return value in hugetlb_cow(). By doing so, we can get rid of the unnecessary WARN_ON call. While this logic perhaps existed as a way of identifying future unmap_ref_private() mishandling, reality is it serves no apparent purpose. 
Signed-off-by: Davidlohr Bueso Cc: Aswin Chandramouleeswaran Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 32 ++++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7a0a73d2fcff..b94752ae791b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2754,8 +2754,8 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, * from other VMAs and let the children be SIGKILLed if they are faulting the * same region. */ -static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, - struct page *page, unsigned long address) +static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, + struct page *page, unsigned long address) { struct hstate *h = hstate_vma(vma); struct vm_area_struct *iter_vma; @@ -2794,8 +2794,6 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, address + huge_page_size(h), page); } mutex_unlock(&mapping->i_mmap_mutex); - - return 1; } /* @@ -2857,20 +2855,18 @@ retry_avoidcopy: */ if (outside_reserve) { BUG_ON(huge_pte_none(pte)); - if (unmap_ref_private(mm, vma, old_page, address)) { - BUG_ON(huge_pte_none(pte)); - spin_lock(ptl); - ptep = huge_pte_offset(mm, address & huge_page_mask(h)); - if (likely(ptep && - pte_same(huge_ptep_get(ptep), pte))) - goto retry_avoidcopy; - /* - * race occurs while re-acquiring page table - * lock, and our job is done. - */ - return 0; - } - WARN_ON_ONCE(1); + unmap_ref_private(mm, vma, old_page, address); + BUG_ON(huge_pte_none(pte)); + spin_lock(ptl); + ptep = huge_pte_offset(mm, address & huge_page_mask(h)); + if (likely(ptep && + pte_same(huge_ptep_get(ptep), pte))) + goto retry_avoidcopy; + /* + * race occurs while re-acquiring page table + * lock, and our job is done. + */ + return 0; } /* Caller expects lock to be held */ -- cgit v1.2.3 From ad4404a226ea92f2966f0e5378614e15ff4a7c76 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 6 Aug 2014 16:06:47 -0700 Subject: mm,hugetlb: simplify error handling in hugetlb_cow() When returning from hugetlb_cow(), we always (1) put back the refcount for each referenced page -- always 'old', and 'new' if allocation was successful. And (2) retake the page table lock right before returning, as the callers expect. This logic can be simplified and encapsulated, as proposed in this patch. In addition to cleaner code, we also shave a few bytes off the instruction text:

   text    data     bss     dec     hex filename
  28399     462   41328   70189   1122d mm/hugetlb.o-baseline
  28367     462   41328   70157   1120d mm/hugetlb.o-patched

Passes libhugetlbfs testcases. 
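The rework follows the usual centralized-unwind idiom; a minimal self-contained sketch of the same shape (plain C, names and failure conditions illustrative, not the kernel code):

	#include <stdlib.h>
	#include <string.h>

	static int do_work(const char *msg)
	{
		int ret = 0;
		char *old, *new;

		old = malloc(64);
		if (!old)
			return -1;		/* nothing to unwind yet */

		new = malloc(64);
		if (!new) {
			ret = -1;
			goto out_release_old;	/* undo only the first step */
		}

		if (strlen(msg) >= 64) {
			ret = -1;		/* mid-work failure... */
			goto out_release_all;	/* ...unwinds everything */
		}
		strcpy(new, msg);
		strcpy(old, msg);

	out_release_all:
		free(new);
	out_release_old:
		free(old);			/* one exit, always balanced */
		return ret;
	}

Every path releases exactly what it acquired, and (in the kernel patch) the page table lock is retaken once at the common exit.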
Signed-off-by: Davidlohr Bueso Cc: Aswin Chandramouleeswaran Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b94752ae791b..e84d22ce5de8 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2808,7 +2808,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, { struct hstate *h = hstate_vma(vma); struct page *old_page, *new_page; - int outside_reserve = 0; + int ret = 0, outside_reserve = 0; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ @@ -2838,14 +2838,14 @@ retry_avoidcopy: page_cache_get(old_page); - /* Drop page table lock as buddy allocator may be called */ + /* + * Drop page table lock as buddy allocator may be called. It will + * be acquired again before returning to the caller, as expected. + */ spin_unlock(ptl); new_page = alloc_huge_page(vma, address, outside_reserve); if (IS_ERR(new_page)) { - long err = PTR_ERR(new_page); - page_cache_release(old_page); - /* * If a process owning a MAP_PRIVATE mapping fails to COW, * it is due to references held by a child and an insufficient @@ -2854,6 +2854,7 @@ retry_avoidcopy: * may get SIGKILLed if it later faults. */ if (outside_reserve) { + page_cache_release(old_page); BUG_ON(huge_pte_none(pte)); unmap_ref_private(mm, vma, old_page, address); BUG_ON(huge_pte_none(pte)); @@ -2869,12 +2870,9 @@ retry_avoidcopy: return 0; } - /* Caller expects lock to be held */ - spin_lock(ptl); - if (err == -ENOMEM) - return VM_FAULT_OOM; - else - return VM_FAULT_SIGBUS; + ret = (PTR_ERR(new_page) == -ENOMEM) ? + VM_FAULT_OOM : VM_FAULT_SIGBUS; + goto out_release_old; } /* @@ -2882,11 +2880,8 @@ retry_avoidcopy: * anon_vma prepared. */ if (unlikely(anon_vma_prepare(vma))) { - page_cache_release(new_page); - page_cache_release(old_page); - /* Caller expects lock to be held */ - spin_lock(ptl); - return VM_FAULT_OOM; + ret = VM_FAULT_OOM; + goto out_release_all; } copy_user_huge_page(new_page, old_page, address, vma, @@ -2896,6 +2891,7 @@ retry_avoidcopy: mmun_start = address & huge_page_mask(h); mmun_end = mmun_start + huge_page_size(h); mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + /* * Retake the page table lock to check for racing updates * before the page tables are altered @@ -2916,12 +2912,13 @@ retry_avoidcopy: } spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); +out_release_all: page_cache_release(new_page); +out_release_old: page_cache_release(old_page); - /* Caller expects lock to be held */ - spin_lock(ptl); - return 0; + spin_lock(ptl); /* Caller expects lock to be held */ + return ret; } /* Return the pagecache page at a given address within a VMA */ -- cgit v1.2.3 From f37d4298aa7f8b74395aa13c728677e2ed86fdaf Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 6 Aug 2014 16:06:49 -0700 Subject: hwpoison: fix race with changing page during offlining When a hwpoison page is locked it could change state due to parallel modifications. The original compound page can be torn down and then this 4k page becomes part of a differently-sized compound page or is a standalone regular page. Check after the lock if the page is still the same compound page. We could go back, grab the new head page and try again but it should be quite rare, so I thought this was safest. A retry loop would be more difficult to test and may have more side effects. 
The hwpoison code by design only tries to handle cases that are reasonably common in workloads, as visible in page-flags. I'm not really that concerned about handling this (likely rare case), just not crashing on it. Signed-off-by: Andi Kleen Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index a013bc94ebbe..44c6bd201d3a 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1172,6 +1172,16 @@ int memory_failure(unsigned long pfn, int trapno, int flags) lock_page(hpage); + /* + * The page could have changed compound pages during the locking. + * If this happens just bail out. + */ + if (compound_head(p) != hpage) { + action_result(pfn, "different compound page after locking", IGNORED); + res = -EBUSY; + goto out; + } + /* * We use page flags to determine what action should be taken, but * the flags can be modified by the error containment action. One -- cgit v1.2.3 From 238d3c13f0cce38752072dc90f4e828abdfec143 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 6 Aug 2014 16:06:51 -0700 Subject: mm, hugetlb: generalize writes to nr_hugepages Three different interfaces alter the maximum number of hugepages for an hstate: - /proc/sys/vm/nr_hugepages for global number of hugepages of the default hstate, - /sys/kernel/mm/hugepages/hugepages-X/nr_hugepages for global number of hugepages for a specific hstate, and - /sys/kernel/mm/hugepages/hugepages-X/nr_hugepages/mempolicy for number of hugepages for a specific hstate over the set of allowed nodes. Generalize the code so that a single function handles all of these writes instead of duplicating the code in two different functions. This decreases the number of lines of code, but also reduces the size of .text by about half a percent since set_max_huge_pages() can be inlined. Signed-off-by: David Rientjes Cc: Joonsoo Kim Reviewed-by: Naoya Horiguchi Reviewed-by: Luiz Capitulino Cc: "Kirill A. 
Shutemov" Acked-by: Davidlohr Bueso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 58 ++++++++++++++++++++++++++-------------------------------- 1 file changed, 26 insertions(+), 32 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e84d22ce5de8..7a0fcb33973e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1734,21 +1734,13 @@ static ssize_t nr_hugepages_show_common(struct kobject *kobj, return sprintf(buf, "%lu\n", nr_huge_pages); } -static ssize_t nr_hugepages_store_common(bool obey_mempolicy, - struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t len) +static ssize_t __nr_hugepages_store_common(bool obey_mempolicy, + struct hstate *h, int nid, + unsigned long count, size_t len) { int err; - int nid; - unsigned long count; - struct hstate *h; NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY); - err = kstrtoul(buf, 10, &count); - if (err) - goto out; - - h = kobj_to_hstate(kobj, &nid); if (hstate_is_gigantic(h) && !gigantic_page_supported()) { err = -EINVAL; goto out; @@ -1784,6 +1776,23 @@ out: return err; } +static ssize_t nr_hugepages_store_common(bool obey_mempolicy, + struct kobject *kobj, const char *buf, + size_t len) +{ + struct hstate *h; + unsigned long count; + int nid; + int err; + + err = kstrtoul(buf, 10, &count); + if (err) + return err; + + h = kobj_to_hstate(kobj, &nid); + return __nr_hugepages_store_common(obey_mempolicy, h, nid, count, len); +} + static ssize_t nr_hugepages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -1793,7 +1802,7 @@ static ssize_t nr_hugepages_show(struct kobject *kobj, static ssize_t nr_hugepages_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t len) { - return nr_hugepages_store_common(false, kobj, attr, buf, len); + return nr_hugepages_store_common(false, kobj, buf, len); } HSTATE_ATTR(nr_hugepages); @@ -1812,7 +1821,7 @@ static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj, static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t len) { - return nr_hugepages_store_common(true, kobj, attr, buf, len); + return nr_hugepages_store_common(true, kobj, buf, len); } HSTATE_ATTR(nr_hugepages_mempolicy); #endif @@ -2248,36 +2257,21 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy, void __user *buffer, size_t *length, loff_t *ppos) { struct hstate *h = &default_hstate; - unsigned long tmp; + unsigned long tmp = h->max_huge_pages; int ret; if (!hugepages_supported()) return -ENOTSUPP; - tmp = h->max_huge_pages; - - if (write && hstate_is_gigantic(h) && !gigantic_page_supported()) - return -EINVAL; - table->data = &tmp; table->maxlen = sizeof(unsigned long); ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); if (ret) goto out; - if (write) { - NODEMASK_ALLOC(nodemask_t, nodes_allowed, - GFP_KERNEL | __GFP_NORETRY); - if (!(obey_mempolicy && - init_nodemask_of_mempolicy(nodes_allowed))) { - NODEMASK_FREE(nodes_allowed); - nodes_allowed = &node_states[N_MEMORY]; - } - h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed); - - if (nodes_allowed != &node_states[N_MEMORY]) - NODEMASK_FREE(nodes_allowed); - } + if (write) + ret = __nr_hugepages_store_common(obey_mempolicy, h, + NUMA_NO_NODE, tmp, *length); out: return ret; } -- cgit v1.2.3 From ed4d4902ebdd7ca8b5a51daaf6bebf4b172895cc Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 6 Aug 2014 16:06:54 -0700 Subject: mm, hugetlb: remove hugetlb_zero and 
hugetlb_infinity They are unnecessary: "zero" can be used in place of "hugetlb_zero" and passing extra2 == NULL is equivalent to infinity. Signed-off-by: David Rientjes Cc: Joonsoo Kim Reviewed-by: Naoya Horiguchi Reviewed-by: Luiz Capitulino Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 1 - kernel/sysctl.c | 9 +++------ mm/hugetlb.c | 1 - 3 files changed, 3 insertions(+), 8 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index a23c096b3080..6e6d338641fe 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -87,7 +87,6 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud); #endif extern unsigned long hugepages_treat_as_movable; -extern const unsigned long hugetlb_zero, hugetlb_infinity; extern int sysctl_hugetlb_shm_group; extern struct list_head huge_boot_pages; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 75b22e22a72c..75875a741b5e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1240,8 +1240,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = hugetlb_sysctl_handler, - .extra1 = (void *)&hugetlb_zero, - .extra2 = (void *)&hugetlb_infinity, + .extra1 = &zero, }, #ifdef CONFIG_NUMA { @@ -1250,8 +1249,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = &hugetlb_mempolicy_sysctl_handler, - .extra1 = (void *)&hugetlb_zero, - .extra2 = (void *)&hugetlb_infinity, + .extra1 = &zero, }, #endif { @@ -1274,8 +1272,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = hugetlb_overcommit_handler, - .extra1 = (void *)&hugetlb_zero, - .extra2 = (void *)&hugetlb_infinity, + .extra1 = &zero, }, #endif { diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7a0fcb33973e..d9ad93b55585 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -35,7 +35,6 @@ #include #include "internal.h" -const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; unsigned long hugepages_treat_as_movable; int hugetlb_max_hstate __read_mostly; -- cgit v1.2.3 From 21bda264f4243f61dfcc485174055f12ad0530b4 Mon Sep 17 00:00:00 2001 From: Jerome Marchand Date: Wed, 6 Aug 2014 16:06:56 -0700 Subject: mm: make copy_pte_range static again Commit 71e3aac0724f ("thp: transparent hugepage core") adds copy_pte_range prototype to huge_mm.h. I'm not sure why (or if) this function has been used outside of memory.c, but it currently isn't. This patch makes copy_pte_range() static again.
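As a side note, the linkage rule being applied can be shown with a trivial userspace sketch (hypothetical helper, not the kernel function): a function with no users outside its translation unit should be declared static, which removes it from the global symbol table, avoids stale prototypes lingering in shared headers, and lets the compiler inline or drop it freely.

#include <stdio.h>

/*
 * static: visible only in this translation unit; no header prototype
 * needed, and the compiler may inline it or discard it if unused.
 */
static void copy_range(const int *src, int *dst, int n)
{
	for (int i = 0; i < n; i++)
		dst[i] = src[i];
}

int main(void)
{
	int a[3] = { 1, 2, 3 }, b[3];

	copy_range(a, b, 3);
	printf("%d %d %d\n", b[0], b[1], b[2]);
	return 0;
}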
Signed-off-by: Jerome Marchand Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 4 ---- mm/memory.c | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index b826239bdce0..63579cb8d3dc 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -93,10 +93,6 @@ extern bool is_vma_temporary_stack(struct vm_area_struct *vma); #endif /* CONFIG_DEBUG_VM */ extern unsigned long transparent_hugepage_flags; -extern int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, - pmd_t *dst_pmd, pmd_t *src_pmd, - struct vm_area_struct *vma, - unsigned long addr, unsigned long end); extern int split_huge_page_to_list(struct page *page, struct list_head *list); static inline int split_huge_page(struct page *page) { diff --git a/mm/memory.c b/mm/memory.c index 06ff0720d75a..01d0289f30a7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -884,7 +884,7 @@ out_set_pte: return 0; } -int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, +static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, unsigned long addr, unsigned long end) { -- cgit v1.2.3 From f6f8ed47353597dcb895eb4a15a28af657392e72 Mon Sep 17 00:00:00 2001 From: WANG Chao Date: Wed, 6 Aug 2014 16:06:58 -0700 Subject: mm/vmalloc.c: clean up map_vm_area third argument Currently map_vm_area() takes (struct page *** pages) as third argument, and after mapping, it moves (*pages) to point to (*pages + nr_mappped_pages). It looks like this kind of increment is useless to its caller these days. The callers don't care about the increments and actually they're trying to avoid this by passing another copy to map_vm_area(). The caller can always guarantee all the pages can be mapped into vm_area as specified in first argument and the caller only cares about whether map_vm_area() fails or not. This patch cleans up the pointer movement in map_vm_area() and updates its callers accordingly. Signed-off-by: WANG Chao Cc: Zhang Yanfei Acked-by: Greg Kroah-Hartman Cc: Minchan Kim Cc: Nitin Gupta Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/tile/kernel/module.c | 2 +- drivers/lguest/core.c | 7 ++----- drivers/staging/android/binder.c | 4 +--- include/linux/vmalloc.h | 2 +- mm/vmalloc.c | 14 +++++--------- mm/zsmalloc.c | 2 +- 6 files changed, 11 insertions(+), 20 deletions(-) diff --git a/arch/tile/kernel/module.c b/arch/tile/kernel/module.c index 4918d91bc3a6..d19b13e3a59f 100644 --- a/arch/tile/kernel/module.c +++ b/arch/tile/kernel/module.c @@ -58,7 +58,7 @@ void *module_alloc(unsigned long size) area->nr_pages = npages; area->pages = pages; - if (map_vm_area(area, prot_rwx, &pages)) { + if (map_vm_area(area, prot_rwx, pages)) { vunmap(area->addr); goto error; } diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c index 0bf1e4edf04d..6590558d1d31 100644 --- a/drivers/lguest/core.c +++ b/drivers/lguest/core.c @@ -42,7 +42,6 @@ DEFINE_MUTEX(lguest_lock); static __init int map_switcher(void) { int i, err; - struct page **pagep; /* * Map the Switcher in to high memory. @@ -110,11 +109,9 @@ static __init int map_switcher(void) * This code actually sets up the pages we've allocated to appear at * switcher_addr. map_vm_area() takes the vma we allocated above, the * kind of pages we're mapping (kernel pages), and a pointer to our - * array of struct pages. 
It increments that pointer, but we don't - * care. + * array of struct pages. */ - pagep = lg_switcher_pages; - err = map_vm_area(switcher_vma, PAGE_KERNEL_EXEC, &pagep); + err = map_vm_area(switcher_vma, PAGE_KERNEL_EXEC, lg_switcher_pages); if (err) { printk("lguest: map_vm_area failed: %i\n", err); goto free_vma; diff --git a/drivers/staging/android/binder.c b/drivers/staging/android/binder.c index 02b0379ae550..4f34dc0095b5 100644 --- a/drivers/staging/android/binder.c +++ b/drivers/staging/android/binder.c @@ -585,7 +585,6 @@ static int binder_update_page_range(struct binder_proc *proc, int allocate, for (page_addr = start; page_addr < end; page_addr += PAGE_SIZE) { int ret; - struct page **page_array_ptr; page = &proc->pages[(page_addr - proc->buffer) / PAGE_SIZE]; @@ -598,8 +597,7 @@ static int binder_update_page_range(struct binder_proc *proc, int allocate, } tmp_area.addr = page_addr; tmp_area.size = PAGE_SIZE + PAGE_SIZE /* guard page? */; - page_array_ptr = page; - ret = map_vm_area(&tmp_area, PAGE_KERNEL, &page_array_ptr); + ret = map_vm_area(&tmp_area, PAGE_KERNEL, page); if (ret) { pr_err("%d: binder_alloc_buf failed to map page at %p in kernel\n", proc->pid, page_addr); diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 4b8a89189a29..b87696fdf06a 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -113,7 +113,7 @@ extern struct vm_struct *remove_vm_area(const void *addr); extern struct vm_struct *find_vm_area(const void *addr); extern int map_vm_area(struct vm_struct *area, pgprot_t prot, - struct page ***pages); + struct page **pages); #ifdef CONFIG_MMU extern int map_kernel_range_noflush(unsigned long start, unsigned long size, pgprot_t prot, struct page **pages); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 9ec4173f48a8..2b0aa5486092 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1270,19 +1270,15 @@ void unmap_kernel_range(unsigned long addr, unsigned long size) } EXPORT_SYMBOL_GPL(unmap_kernel_range); -int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) +int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page **pages) { unsigned long addr = (unsigned long)area->addr; unsigned long end = addr + get_vm_area_size(area); int err; - err = vmap_page_range(addr, end, prot, *pages); - if (err > 0) { - *pages += err; - err = 0; - } + err = vmap_page_range(addr, end, prot, pages); - return err; + return err > 0 ? 
0 : err; } EXPORT_SYMBOL_GPL(map_vm_area); @@ -1548,7 +1544,7 @@ void *vmap(struct page **pages, unsigned int count, if (!area) return NULL; - if (map_vm_area(area, prot, &pages)) { + if (map_vm_area(area, prot, pages)) { vunmap(area->addr); return NULL; } @@ -1606,7 +1602,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, cond_resched(); } - if (map_vm_area(area, prot, &pages)) + if (map_vm_area(area, prot, pages)) goto fail; return area->addr; diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index fe78189624cf..bb62a4adc328 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -690,7 +690,7 @@ static inline void __zs_cpu_down(struct mapping_area *area) static inline void *__zs_map_object(struct mapping_area *area, struct page *pages[2], int off, int size) { - BUG_ON(map_vm_area(area->vm, PAGE_KERNEL, &pages)); + BUG_ON(map_vm_area(area->vm, PAGE_KERNEL, pages)); area->vm_addr = area->vm->addr; return area->vm_addr + off; } -- cgit v1.2.3 From 49c8b24d00b6cb06a9c3fb959a957319cc770d71 Mon Sep 17 00:00:00 2001 From: Yasuaki Ishimatsu Date: Wed, 6 Aug 2014 16:07:00 -0700 Subject: drivers/firmware/memmap.c: pass the correct argument to firmware_map_find_entry_bootmem() firmware_map_add_hotplug() calls firmware_map_find_entry_bootmem() to get a free firmware_map_entry. But the end argument is not correct, so firmware_map_find_entry_bootmem() cannot find the firmware_map_entry. The patch passes the correct end argument to firmware_map_find_entry_bootmem(). Signed-off-by: Yasuaki Ishimatsu Cc: Santosh Shilimkar Cc: Toshi Kani Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/firmware/memmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/firmware/memmap.c b/drivers/firmware/memmap.c index 17cf96c45f2b..1815849f83cb 100644 --- a/drivers/firmware/memmap.c +++ b/drivers/firmware/memmap.c @@ -286,7 +286,7 @@ int __meminit firmware_map_add_hotplug(u64 start, u64 end, const char *type) { struct firmware_map_entry *entry; - entry = firmware_map_find_entry_bootmem(start, end, type); + entry = firmware_map_find_entry_bootmem(start, end - 1, type); if (!entry) { entry = kzalloc(sizeof(struct firmware_map_entry), GFP_ATOMIC); if (!entry) -- cgit v1.2.3 From f0093ede9b726ccb1876d43574f5b45c79940aca Mon Sep 17 00:00:00 2001 From: Yasuaki Ishimatsu Date: Wed, 6 Aug 2014 16:07:03 -0700 Subject: drivers/firmware/memmap.c: don't allocate firmware_map_entry of same memory range When memory is limited by mem= and the ACPI DSDT table has PNP0C80, firmware_map_entrys for the same memory range are allocated and memmap X sysfses which have the same memory range are created as follows: # cat /sys/firmware/memmap/0/* 0x407ffffffff 0x40000000000 System RAM # cat /sys/firmware/memmap/33/* 0x407ffffffff 0x40000000000 System RAM # cat /sys/firmware/memmap/35/* 0x407ffffffff 0x40000000000 System RAM In this case, when hot-removing memory, a kernel panic occurs, showing the following call trace: BUG: unable to handle kernel paging request at 00000001003e000b IP: sysfs_open_file+0x46/0x2b0 PGD 203a89fe067 PUD 0 Oops: 0000 [#1] SMP ... Call Trace: do_dentry_open+0x1ef/0x2a0 finish_open+0x31/0x40 do_last+0x57c/0x1220 path_openat+0xc2/0x4c0 do_filp_open+0x4b/0xb0 do_sys_open+0xf3/0x1f0 SyS_open+0x1e/0x20 system_call_fastpath+0x16/0x1b The problem occurs as follows: When calling e820_reserve_resources(), firmware_map_entrys for the whole e820 memory map are allocated.
And all firmware_map_entrys are added to the map_entries list as follows: map_entries -> +--- entry A --------+ -> ... | start 0x407ffffffff| | end 0x40000000000| | type System RAM | +--------------------+ After that, if the ACPI DSDT table has PNP0C80 and the memory range is limited by mem=, the PNP0C80 is hot-added. Then the firmware_map_entry of PNP0C80 is allocated and added to the map_entries list as follows: map_entries -> +--- entry A --------+ -> ... -> +--- entry B --------+ | start 0x407ffffffff| | start 0x407ffffffff| | end 0x40000000000| | end 0x40000000000| | type System RAM | | type System RAM | +--------------------+ +--------------------+ Then the memmap 0 sysfs for entry B is created. After that, firmware_memmap_init() creates memmap sysfses for all firmware_map_entrys in the map_entries list. As a result, the memmap 33 sysfs for entry A and the memmap 35 sysfs for entry B are created. But the kobject of entry B has already been used by the memmap 0 sysfs. So when creating the memmap 35 sysfs, the kobject is broken. When hot-removing memory, the memmap 0 sysfs is destroyed and the kobject of the memmap 0 sysfs is freed. But the kobject can still be accessed via the memmap 35 sysfs. So when opening the memmap 35 sysfs, a kernel panic occurs. This patch checks whether there is a firmware_map_entry for the same memory range in the map_entries list and doesn't allocate another firmware_map_entry for the same memory range. Signed-off-by: Yasuaki Ishimatsu Cc: Santosh Shilimkar Cc: Toshi Kani Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/firmware/memmap.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/firmware/memmap.c b/drivers/firmware/memmap.c index 1815849f83cb..79f18e6d9c4f 100644 --- a/drivers/firmware/memmap.c +++ b/drivers/firmware/memmap.c @@ -286,6 +286,10 @@ int __meminit firmware_map_add_hotplug(u64 start, u64 end, const char *type) { struct firmware_map_entry *entry; + entry = firmware_map_find_entry(start, end - 1, type); + if (entry) + return 0; + entry = firmware_map_find_entry_bootmem(start, end - 1, type); if (!entry) { entry = kzalloc(sizeof(struct firmware_map_entry), GFP_ATOMIC); -- cgit v1.2.3 From 9aed8614af5a05cdaa32a0b78b0f1a424754a958 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Wed, 6 Aug 2014 16:07:05 -0700 Subject: mm/memory.c: don't forget to set softdirty on file mapped fault Otherwise we may not notice that the pte was softdirty because the pte_mksoft_dirty helper _returns_ the new pte but doesn't modify the argument. If a page fault happened on a dirty file mapping, the newly created pte may lose the softdirty bit; thus, if a userspace program is tracking memory changes with the help of a memory tracker (CONFIG_MEM_SOFT_DIRTY), it might miss a modification of a memory page (which in the worst case may lead to data inconsistency).
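The bug class fixed here -- calling a pure helper and discarding its return value -- is easy to reproduce in a standalone sketch. pte_t and the flag bit below are hypothetical simplifications of the real page table types:

#include <stdint.h>
#include <stdio.h>

typedef uint32_t pte_t;

#define SOFT_DIRTY ((pte_t)1)	/* hypothetical flag bit */

/* Pure helper: returns the new value, never touches the argument. */
static pte_t pte_mksoft_dirty(pte_t pte)
{
	return pte | SOFT_DIRTY;
}

int main(void)
{
	pte_t entry = 0;

	pte_mksoft_dirty(entry);		/* BUG: result discarded, entry unchanged */
	printf("discarded: %#x\n", (unsigned)entry);

	entry = pte_mksoft_dirty(entry);	/* fix: assign the returned value */
	printf("assigned:  %#x\n", (unsigned)entry);
	return 0;
}

Annotating such helpers with a warn_unused_result-style attribute would let the compiler catch the discarded call at build time.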
Signed-off-by: Cyrill Gorcunov Acked-by: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index 01d0289f30a7..7e131325bdf8 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2744,7 +2744,7 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address, if (write) entry = maybe_mkwrite(pte_mkdirty(entry), vma); else if (pte_file(*pte) && pte_file_soft_dirty(*pte)) - pte_mksoft_dirty(entry); + entry = pte_mksoft_dirty(entry); if (anon) { inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, address); -- cgit v1.2.3 From d0480be44a90af81a425f426c9107fb8f0899f65 Mon Sep 17 00:00:00 2001 From: Wang Sheng-Hui Date: Wed, 6 Aug 2014 16:07:07 -0700 Subject: mm: update the description for vm_total_pages vm_total_pages is calculated by nr_free_pagecache_pages(), which counts the number of pages which are beyond the high watermark within all zones. So vm_total_pages is not equal to total number of pages which the VM controls. Signed-off-by: Wang Sheng-Hui Cc: Mel Gorman Cc: Rik van Riel Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 81dd858b9d17..5fec1ba9951f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -137,7 +137,11 @@ struct scan_control { * From 0 .. 100. Higher means more swappy. */ int vm_swappiness = 60; -unsigned long vm_total_pages; /* The total number of pages which the VM controls */ +/* + * The total number of pages which are beyond the high watermark within all + * zones. + */ +unsigned long vm_total_pages; static LIST_HEAD(shrinker_list); static DECLARE_RWSEM(shrinker_rwsem); -- cgit v1.2.3 From 2c51856c9bea9b20e4e5aab001971a5a6624db17 Mon Sep 17 00:00:00 2001 From: Chen Yucong Date: Wed, 6 Aug 2014 16:07:09 -0700 Subject: mm: trace-vmscan-postprocess.pl: report the number of file/anon pages respectively Until now, the reporting from trace-vmscan-postprocess.pl is not very useful because we cannot directly use this script for checking the file/anon ratio of scanning. This patch aims to report respectively the number of file/anon pages which were scanned/reclaimed by kswapd or direct-reclaim. Sample output is usually something like the following. 
Summary Direct reclaims: 8823 Direct reclaim pages scanned: 2438797 Direct reclaim file pages scanned: 1315200 Direct reclaim anon pages scanned: 1123597 Direct reclaim pages reclaimed: 446139 Direct reclaim file pages reclaimed: 378668 Direct reclaim anon pages reclaimed: 67471 Direct reclaim write file sync I/O: 0 Direct reclaim write anon sync I/O: 0 Direct reclaim write file async I/O: 0 Direct reclaim write anon async I/O: 4240 Wake kswapd requests: 122310 Time stalled direct reclaim: 13.78 seconds Kswapd wakeups: 25817 Kswapd pages scanned: 170779115 Kswapd file pages scanned: 162725123 Kswapd anon pages scanned: 8053992 Kswapd pages reclaimed: 129065738 Kswapd file pages reclaimed: 128500930 Kswapd anon pages reclaimed: 564808 Kswapd reclaim write file sync I/O: 0 Kswapd reclaim write anon sync I/O: 0 Kswapd reclaim write file async I/O: 36 Kswapd reclaim write anon async I/O: 730730 Time kswapd awake: 1015.50 seconds Signed-off-by: Chen Yucong Cc: Mel Gorman Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .../trace/postprocess/trace-vmscan-postprocess.pl | 53 ++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl index 78c9a7b2b58f..8f961ef2b457 100644 --- a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl +++ b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl @@ -47,6 +47,10 @@ use constant HIGH_KSWAPD_REWAKEUP => 21; use constant HIGH_NR_SCANNED => 22; use constant HIGH_NR_TAKEN => 23; use constant HIGH_NR_RECLAIMED => 24; +use constant HIGH_NR_FILE_SCANNED => 25; +use constant HIGH_NR_ANON_SCANNED => 26; +use constant HIGH_NR_FILE_RECLAIMED => 27; +use constant HIGH_NR_ANON_RECLAIMED => 28; my %perprocesspid; my %perprocess; @@ -56,14 +60,18 @@ my $opt_read_procstat; my $total_wakeup_kswapd; my ($total_direct_reclaim, $total_direct_nr_scanned); +my ($total_direct_nr_file_scanned, $total_direct_nr_anon_scanned); my ($total_direct_latency, $total_kswapd_latency); my ($total_direct_nr_reclaimed); +my ($total_direct_nr_file_reclaimed, $total_direct_nr_anon_reclaimed); my ($total_direct_writepage_file_sync, $total_direct_writepage_file_async); my ($total_direct_writepage_anon_sync, $total_direct_writepage_anon_async); my ($total_kswapd_nr_scanned, $total_kswapd_wake); +my ($total_kswapd_nr_file_scanned, $total_kswapd_nr_anon_scanned); my ($total_kswapd_writepage_file_sync, $total_kswapd_writepage_file_async); my ($total_kswapd_writepage_anon_sync, $total_kswapd_writepage_anon_async); my ($total_kswapd_nr_reclaimed); +my ($total_kswapd_nr_file_reclaimed, $total_kswapd_nr_anon_reclaimed); # Catch sigint and exit on request my $sigint_report = 0; @@ -374,6 +382,7 @@ EVENT_PROCESS: } my $isolate_mode = $1; my $nr_scanned = $4; + my $file = $6; # To closer match vmstat scanning statistics, only count isolate_both # and isolate_inactive as scanning. 
isolate_active is rotation @@ -382,6 +391,11 @@ EVENT_PROCESS: # isolate_both == 3 if ($isolate_mode != 2) { $perprocesspid{$process_pid}->{HIGH_NR_SCANNED} += $nr_scanned; + if ($file == 1) { + $perprocesspid{$process_pid}->{HIGH_NR_FILE_SCANNED} += $nr_scanned; + } else { + $perprocesspid{$process_pid}->{HIGH_NR_ANON_SCANNED} += $nr_scanned; + } } } elsif ($tracepoint eq "mm_vmscan_lru_shrink_inactive") { $details = $6; @@ -391,8 +405,19 @@ EVENT_PROCESS: print " $regex_lru_shrink_inactive/o\n"; next; } + my $nr_reclaimed = $4; + my $flags = $6; + my $file = 0; + if ($flags =~ /RECLAIM_WB_FILE/) { + $file = 1; + } $perprocesspid{$process_pid}->{HIGH_NR_RECLAIMED} += $nr_reclaimed; + if ($file) { + $perprocesspid{$process_pid}->{HIGH_NR_FILE_RECLAIMED} += $nr_reclaimed; + } else { + $perprocesspid{$process_pid}->{HIGH_NR_ANON_RECLAIMED} += $nr_reclaimed; + } } elsif ($tracepoint eq "mm_vmscan_writepage") { $details = $6; if ($details !~ /$regex_writepage/o) { @@ -493,7 +518,11 @@ sub dump_stats { $total_direct_reclaim += $stats{$process_pid}->{MM_VMSCAN_DIRECT_RECLAIM_BEGIN}; $total_wakeup_kswapd += $stats{$process_pid}->{MM_VMSCAN_WAKEUP_KSWAPD}; $total_direct_nr_scanned += $stats{$process_pid}->{HIGH_NR_SCANNED}; + $total_direct_nr_file_scanned += $stats{$process_pid}->{HIGH_NR_FILE_SCANNED}; + $total_direct_nr_anon_scanned += $stats{$process_pid}->{HIGH_NR_ANON_SCANNED}; $total_direct_nr_reclaimed += $stats{$process_pid}->{HIGH_NR_RECLAIMED}; + $total_direct_nr_file_reclaimed += $stats{$process_pid}->{HIGH_NR_FILE_RECLAIMED}; + $total_direct_nr_anon_reclaimed += $stats{$process_pid}->{HIGH_NR_ANON_RECLAIMED}; $total_direct_writepage_file_sync += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_SYNC}; $total_direct_writepage_anon_sync += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_SYNC}; $total_direct_writepage_file_async += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_ASYNC}; @@ -513,7 +542,11 @@ sub dump_stats { $stats{$process_pid}->{MM_VMSCAN_DIRECT_RECLAIM_BEGIN}, $stats{$process_pid}->{MM_VMSCAN_WAKEUP_KSWAPD}, $stats{$process_pid}->{HIGH_NR_SCANNED}, + $stats{$process_pid}->{HIGH_NR_FILE_SCANNED}, + $stats{$process_pid}->{HIGH_NR_ANON_SCANNED}, $stats{$process_pid}->{HIGH_NR_RECLAIMED}, + $stats{$process_pid}->{HIGH_NR_FILE_RECLAIMED}, + $stats{$process_pid}->{HIGH_NR_ANON_RECLAIMED}, $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_SYNC} + $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_SYNC}, $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_ASYNC} + $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_ASYNC}, $this_reclaim_delay / 1000); @@ -552,7 +585,11 @@ sub dump_stats { $total_kswapd_wake += $stats{$process_pid}->{MM_VMSCAN_KSWAPD_WAKE}; $total_kswapd_nr_scanned += $stats{$process_pid}->{HIGH_NR_SCANNED}; + $total_kswapd_nr_file_scanned += $stats{$process_pid}->{HIGH_NR_FILE_SCANNED}; + $total_kswapd_nr_anon_scanned += $stats{$process_pid}->{HIGH_NR_ANON_SCANNED}; $total_kswapd_nr_reclaimed += $stats{$process_pid}->{HIGH_NR_RECLAIMED}; + $total_kswapd_nr_file_reclaimed += $stats{$process_pid}->{HIGH_NR_FILE_RECLAIMED}; + $total_kswapd_nr_anon_reclaimed += $stats{$process_pid}->{HIGH_NR_ANON_RECLAIMED}; $total_kswapd_writepage_file_sync += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_SYNC}; $total_kswapd_writepage_anon_sync += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_SYNC}; $total_kswapd_writepage_file_async += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_ASYNC}; @@ -563,7 +600,11 @@ sub dump_stats { $stats{$process_pid}->{MM_VMSCAN_KSWAPD_WAKE}, 
$stats{$process_pid}->{HIGH_KSWAPD_REWAKEUP}, $stats{$process_pid}->{HIGH_NR_SCANNED}, + $stats{$process_pid}->{HIGH_NR_FILE_SCANNED}, + $stats{$process_pid}->{HIGH_NR_ANON_SCANNED}, $stats{$process_pid}->{HIGH_NR_RECLAIMED}, + $stats{$process_pid}->{HIGH_NR_FILE_RECLAIMED}, + $stats{$process_pid}->{HIGH_NR_ANON_RECLAIMED}, $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_SYNC} + $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_SYNC}, $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_ASYNC} + $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_ASYNC}); @@ -594,7 +635,11 @@ sub dump_stats { print "\nSummary\n"; print "Direct reclaims: $total_direct_reclaim\n"; print "Direct reclaim pages scanned: $total_direct_nr_scanned\n"; + print "Direct reclaim file pages scanned: $total_direct_nr_file_scanned\n"; + print "Direct reclaim anon pages scanned: $total_direct_nr_anon_scanned\n"; print "Direct reclaim pages reclaimed: $total_direct_nr_reclaimed\n"; + print "Direct reclaim file pages reclaimed: $total_direct_nr_file_reclaimed\n"; + print "Direct reclaim anon pages reclaimed: $total_direct_nr_anon_reclaimed\n"; print "Direct reclaim write file sync I/O: $total_direct_writepage_file_sync\n"; print "Direct reclaim write anon sync I/O: $total_direct_writepage_anon_sync\n"; print "Direct reclaim write file async I/O: $total_direct_writepage_file_async\n"; @@ -604,7 +649,11 @@ sub dump_stats { print "\n"; print "Kswapd wakeups: $total_kswapd_wake\n"; print "Kswapd pages scanned: $total_kswapd_nr_scanned\n"; + print "Kswapd file pages scanned: $total_kswapd_nr_file_scanned\n"; + print "Kswapd anon pages scanned: $total_kswapd_nr_anon_scanned\n"; print "Kswapd pages reclaimed: $total_kswapd_nr_reclaimed\n"; + print "Kswapd file pages reclaimed: $total_kswapd_nr_file_reclaimed\n"; + print "Kswapd anon pages reclaimed: $total_kswapd_nr_anon_reclaimed\n"; print "Kswapd reclaim write file sync I/O: $total_kswapd_writepage_file_sync\n"; print "Kswapd reclaim write anon sync I/O: $total_kswapd_writepage_anon_sync\n"; print "Kswapd reclaim write file async I/O: $total_kswapd_writepage_file_async\n"; @@ -629,7 +678,11 @@ sub aggregate_perprocesspid() { $perprocess{$process}->{MM_VMSCAN_WAKEUP_KSWAPD} += $perprocesspid{$process_pid}->{MM_VMSCAN_WAKEUP_KSWAPD}; $perprocess{$process}->{HIGH_KSWAPD_REWAKEUP} += $perprocesspid{$process_pid}->{HIGH_KSWAPD_REWAKEUP}; $perprocess{$process}->{HIGH_NR_SCANNED} += $perprocesspid{$process_pid}->{HIGH_NR_SCANNED}; + $perprocess{$process}->{HIGH_NR_FILE_SCANNED} += $perprocesspid{$process_pid}->{HIGH_NR_FILE_SCANNED}; + $perprocess{$process}->{HIGH_NR_ANON_SCANNED} += $perprocesspid{$process_pid}->{HIGH_NR_ANON_SCANNED}; $perprocess{$process}->{HIGH_NR_RECLAIMED} += $perprocesspid{$process_pid}->{HIGH_NR_RECLAIMED}; + $perprocess{$process}->{HIGH_NR_FILE_RECLAIMED} += $perprocesspid{$process_pid}->{HIGH_NR_FILE_RECLAIMED}; + $perprocess{$process}->{HIGH_NR_ANON_RECLAIMED} += $perprocesspid{$process_pid}->{HIGH_NR_ANON_RECLAIMED}; $perprocess{$process}->{MM_VMSCAN_WRITEPAGE_FILE_SYNC} += $perprocesspid{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_SYNC}; $perprocess{$process}->{MM_VMSCAN_WRITEPAGE_ANON_SYNC} += $perprocesspid{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_SYNC}; $perprocess{$process}->{MM_VMSCAN_WRITEPAGE_FILE_ASYNC} += $perprocesspid{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_ASYNC}; -- cgit v1.2.3 From 24b7e5819ad5cbef2b7c7376510862aa8319d240 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 6 Aug 2014 16:07:11 -0700 Subject: mm: pagemap: avoid unnecessary overhead when 
tracepoints are deactivated This was formerly the series "Improve sequential read throughput" which noted some major differences in performance of tiobench since 3.0. While there are a number of factors, two that dominated were the introduction of the fair zone allocation policy and changes to CFQ. The behaviour of fair zone allocation policy makes more sense than tiobench as a benchmark and CFQ defaults were not changed due to insufficient benchmarking. This series is what's left. It's one functional fix to the fair zone allocation policy when used on NUMA machines and a reduction of overhead in general. tiobench was used for the comparison despite its flaws as an IO benchmark as in this case we are primarily interested in the overhead of page allocator and page reclaim activity. On UMA, it makes little difference to overhead 3.16.0-rc3 3.16.0-rc3 vanilla lowercost-v5 User 383.61 386.77 System 403.83 401.74 Elapsed 5411.50 5413.11 On a 4-socket NUMA machine it's a bit more noticable 3.16.0-rc3 3.16.0-rc3 vanilla lowercost-v5 User 746.94 802.00 System 65336.22 40852.33 Elapsed 27553.52 27368.46 This patch (of 6): The LRU insertion and activate tracepoints take PFN as a parameter forcing the overhead to the caller. Move the overhead to the tracepoint fast-assign method to ensure the cost is only incurred when the tracepoint is active. Signed-off-by: Mel Gorman Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/pagemap.h | 16 +++++++--------- mm/swap.c | 4 ++-- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/include/trace/events/pagemap.h b/include/trace/events/pagemap.h index 1c9fabde69e4..ce0803b8d05f 100644 --- a/include/trace/events/pagemap.h +++ b/include/trace/events/pagemap.h @@ -28,12 +28,10 @@ TRACE_EVENT(mm_lru_insertion, TP_PROTO( struct page *page, - unsigned long pfn, - int lru, - unsigned long flags + int lru ), - TP_ARGS(page, pfn, lru, flags), + TP_ARGS(page, lru), TP_STRUCT__entry( __field(struct page *, page ) @@ -44,9 +42,9 @@ TRACE_EVENT(mm_lru_insertion, TP_fast_assign( __entry->page = page; - __entry->pfn = pfn; + __entry->pfn = page_to_pfn(page); __entry->lru = lru; - __entry->flags = flags; + __entry->flags = trace_pagemap_flags(page); ), /* Flag format is based on page-types.c formatting for pagemap */ @@ -64,9 +62,9 @@ TRACE_EVENT(mm_lru_insertion, TRACE_EVENT(mm_lru_activate, - TP_PROTO(struct page *page, unsigned long pfn), + TP_PROTO(struct page *page), - TP_ARGS(page, pfn), + TP_ARGS(page), TP_STRUCT__entry( __field(struct page *, page ) @@ -75,7 +73,7 @@ TRACE_EVENT(mm_lru_activate, TP_fast_assign( __entry->page = page; - __entry->pfn = pfn; + __entry->pfn = page_to_pfn(page); ), /* Flag format is based on page-types.c formatting for pagemap */ diff --git a/mm/swap.c b/mm/swap.c index d8eb4d09ffa2..c789d01c9ec3 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -501,7 +501,7 @@ static void __activate_page(struct page *page, struct lruvec *lruvec, SetPageActive(page); lru += LRU_ACTIVE; add_page_to_lru_list(page, lruvec, lru); - trace_mm_lru_activate(page, page_to_pfn(page)); + trace_mm_lru_activate(page); __count_vm_event(PGACTIVATE); update_page_reclaim_stat(lruvec, file, 1); @@ -988,7 +988,7 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec, SetPageLRU(page); add_page_to_lru_list(page, lruvec, lru); update_page_reclaim_stat(lruvec, file, active); - trace_mm_lru_insertion(page, page_to_pfn(page), lru, trace_pagemap_flags(page)); + trace_mm_lru_insertion(page, lru); } /* 
-- cgit v1.2.3 From 3484b2de9499df23c4604a513b36f96326ae81ad Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 6 Aug 2014 16:07:14 -0700 Subject: mm: rearrange zone fields into read-only, page alloc, statistics and page reclaim lines The arrangement of struct zone has changed over time and now it has reached the point where there is some inappropriate sharing going on. On x86-64 for example o The zone->node field is shared with the zone lock and zone->node is accessed frequently from the page allocator due to the fair zone allocation policy. o span_seqlock is almost never used by shares a line with free_area o Some zone statistics share a cache line with the LRU lock so reclaim-intensive and allocator-intensive workloads can bounce the cache line on a stat update This patch rearranges struct zone to put read-only and read-mostly fields together and then splits the page allocator intensive fields, the zone statistics and the page reclaim intensive fields into their own cache lines. Note that the type of lowmem_reserve changes due to the watermark calculations being signed and avoiding a signed/unsigned conversion there. On the test configuration I used the overall size of struct zone shrunk by one cache line. On smaller machines, this is not likely to be noticable. However, on a 4-node NUMA machine running tiobench the system CPU overhead is reduced by this patch. 3.16.0-rc3 3.16.0-rc3 vanillarearrange-v5r9 User 746.94 759.78 System 65336.22 58350.98 Elapsed 27553.52 27282.02 Signed-off-by: Mel Gorman Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 211 +++++++++++++++++++++++++------------------------ mm/page_alloc.c | 7 +- mm/vmstat.c | 4 +- 3 files changed, 113 insertions(+), 109 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 559e659288fc..ed0876bb902c 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -324,18 +324,11 @@ enum zone_type { #ifndef __GENERATING_BOUNDS_H struct zone { - /* Fields commonly accessed by the page allocator */ + /* Read-mostly fields */ /* zone watermarks, access with *_wmark_pages(zone) macros */ unsigned long watermark[NR_WMARK]; - /* - * When free pages are below this point, additional steps are taken - * when reading the number of free pages to avoid per-cpu counter - * drift allowing watermarks to be breached - */ - unsigned long percpu_drift_mark; - /* * We don't know if the memory that we're going to allocate will be freeable * or/and it will be released eventually, so to avoid totally wasting several @@ -344,41 +337,26 @@ struct zone { * on the higher zones). This array is recalculated at runtime if the * sysctl_lowmem_reserve_ratio sysctl changes. */ - unsigned long lowmem_reserve[MAX_NR_ZONES]; - - /* - * This is a per-zone reserve of pages that should not be - * considered dirtyable memory. - */ - unsigned long dirty_balance_reserve; + long lowmem_reserve[MAX_NR_ZONES]; #ifdef CONFIG_NUMA int node; +#endif + /* - * zone reclaim becomes active if more unmapped pages exist. + * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on + * this zone's LRU. Maintained by the pageout code. */ - unsigned long min_unmapped_pages; - unsigned long min_slab_pages; -#endif + unsigned int inactive_ratio; + + struct pglist_data *zone_pgdat; struct per_cpu_pageset __percpu *pageset; + /* - * free areas of different sizes + * This is a per-zone reserve of pages that should not be + * considered dirtyable memory. 
*/ - spinlock_t lock; -#if defined CONFIG_COMPACTION || defined CONFIG_CMA - /* Set to true when the PG_migrate_skip bits should be cleared */ - bool compact_blockskip_flush; - - /* pfn where compaction free scanner should start */ - unsigned long compact_cached_free_pfn; - /* pfn where async and sync compaction migration scanner should start */ - unsigned long compact_cached_migrate_pfn[2]; -#endif -#ifdef CONFIG_MEMORY_HOTPLUG - /* see spanned/present_pages for more description */ - seqlock_t span_seqlock; -#endif - struct free_area free_area[MAX_ORDER]; + unsigned long dirty_balance_reserve; #ifndef CONFIG_SPARSEMEM /* @@ -388,74 +366,14 @@ struct zone { unsigned long *pageblock_flags; #endif /* CONFIG_SPARSEMEM */ -#ifdef CONFIG_COMPACTION - /* - * On compaction failure, 1<> PAGE_SHIFT */ unsigned long zone_start_pfn; @@ -500,9 +418,11 @@ struct zone { * adjust_managed_page_count() should be used instead of directly * touching zone->managed_pages and totalram_pages. */ + unsigned long managed_pages; unsigned long spanned_pages; unsigned long present_pages; - unsigned long managed_pages; + + const char *name; /* * Number of MIGRATE_RESEVE page block. To maintain for just @@ -510,10 +430,95 @@ struct zone { */ int nr_migrate_reserve_block; +#ifdef CONFIG_MEMORY_HOTPLUG + /* see spanned/present_pages for more description */ + seqlock_t span_seqlock; +#endif + /* - * rarely used fields: + * wait_table -- the array holding the hash table + * wait_table_hash_nr_entries -- the size of the hash table array + * wait_table_bits -- wait_table_size == (1 << wait_table_bits) + * + * The purpose of all these is to keep track of the people + * waiting for a page to become available and make them + * runnable again when possible. The trouble is that this + * consumes a lot of space, especially when so few things + * wait on pages at a given time. So instead of using + * per-page waitqueues, we use a waitqueue hash table. + * + * The bucket discipline is to sleep on the same queue when + * colliding and wake all in that wait queue when removing. + * When something wakes, it must check to be sure its page is + * truly available, a la thundering herd. The cost of a + * collision is great, but given the expected load of the + * table, they should be so rare as to be outweighed by the + * benefits from the saved space. + * + * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the + * primary users of these fields, and in mm/page_alloc.c + * free_area_init_core() performs the initialization of them. 
*/ - const char *name; + wait_queue_head_t *wait_table; + unsigned long wait_table_hash_nr_entries; + unsigned long wait_table_bits; + + ZONE_PADDING(_pad1_) + + /* Write-intensive fields used from the page allocator */ + spinlock_t lock; + + /* free areas of different sizes */ + struct free_area free_area[MAX_ORDER]; + + /* zone flags, see below */ + unsigned long flags; + + ZONE_PADDING(_pad2_) + + /* Write-intensive fields used by page reclaim */ + + /* Fields commonly accessed by the page reclaim scanner */ + spinlock_t lru_lock; + unsigned long pages_scanned; /* since last reclaim */ + struct lruvec lruvec; + + /* Evictions & activations on the inactive file list */ + atomic_long_t inactive_age; + + /* + * When free pages are below this point, additional steps are taken + * when reading the number of free pages to avoid per-cpu counter + * drift allowing watermarks to be breached + */ + unsigned long percpu_drift_mark; + +#if defined CONFIG_COMPACTION || defined CONFIG_CMA + /* pfn where compaction free scanner should start */ + unsigned long compact_cached_free_pfn; + /* pfn where async and sync compaction migration scanner should start */ + unsigned long compact_cached_migrate_pfn[2]; +#endif + +#ifdef CONFIG_COMPACTION + /* + * On compaction failure, 1<lowmem_reserve[classzone_idx]; int o; long free_cma = 0; @@ -1723,7 +1722,7 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order, free_cma = zone_page_state(z, NR_FREE_CMA_PAGES); #endif - if (free_pages - free_cma <= min + lowmem_reserve) + if (free_pages - free_cma <= min + z->lowmem_reserve[classzone_idx]) return false; for (o = 0; o < order; o++) { /* At the next order, this order's pages become unavailable */ @@ -3254,7 +3253,7 @@ void show_free_areas(unsigned int filter) ); printk("lowmem_reserve[]:"); for (i = 0; i < MAX_NR_ZONES; i++) - printk(" %lu", zone->lowmem_reserve[i]); + printk(" %ld", zone->lowmem_reserve[i]); printk("\n"); } @@ -5575,7 +5574,7 @@ static void calculate_totalreserve_pages(void) for_each_online_pgdat(pgdat) { for (i = 0; i < MAX_NR_ZONES; i++) { struct zone *zone = pgdat->node_zones + i; - unsigned long max = 0; + long max = 0; /* Find valid and maximum lowmem_reserve in the zone */ for (j = i; j < MAX_NR_ZONES; j++) { diff --git a/mm/vmstat.c b/mm/vmstat.c index b37bd49bfd55..8267f77d1875 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1077,10 +1077,10 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, zone_page_state(zone, i)); seq_printf(m, - "\n protection: (%lu", + "\n protection: (%ld", zone->lowmem_reserve[0]); for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++) - seq_printf(m, ", %lu", zone->lowmem_reserve[i]); + seq_printf(m, ", %ld", zone->lowmem_reserve[i]); seq_printf(m, ")" "\n pagesets"); -- cgit v1.2.3 From 0d5d823ab4e608ec7b52ac4410de4cb74bbe0edd Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 6 Aug 2014 16:07:16 -0700 Subject: mm: move zone->pages_scanned into a vmstat counter zone->pages_scanned is a write-intensive cache line during page reclaim and it's also updated during page free. Move the counter into vmstat to take advantage of the per-cpu updates and do not update it in the free paths unless necessary. On a small UMA machine running tiobench the difference is marginal. On a 4-node machine the overhead is more noticable. Note that automatic NUMA balancing was disabled for this test as otherwise the system CPU overhead is unpredictable. 
3.16.0-rc3 3.16.0-rc3 3.16.0-rc3 vanillarearrange-v5 vmstat-v5 User 746.94 759.78 774.56 System 65336.22 58350.98 32847.27 Elapsed 27553.52 27282.02 27415.04 Note that the overhead reduction will vary depending on where exactly pages are allocated and freed. Signed-off-by: Mel Gorman Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 2 +- mm/page_alloc.c | 12 +++++++++--- mm/vmscan.c | 7 ++++--- mm/vmstat.c | 3 ++- 4 files changed, 16 insertions(+), 8 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index ed0876bb902c..0bd77f730b38 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -143,6 +143,7 @@ enum zone_stat_item { NR_SHMEM, /* shmem pages (included tmpfs/GEM pages) */ NR_DIRTIED, /* page dirtyings since bootup */ NR_WRITTEN, /* page writings since bootup */ + NR_PAGES_SCANNED, /* pages scanned since last reclaim */ #ifdef CONFIG_NUMA NUMA_HIT, /* allocated in intended node */ NUMA_MISS, /* allocated in non intended node */ @@ -480,7 +481,6 @@ struct zone { /* Fields commonly accessed by the page reclaim scanner */ spinlock_t lru_lock; - unsigned long pages_scanned; /* since last reclaim */ struct lruvec lruvec; /* Evictions & activations on the inactive file list */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b7381d11f021..daa016063793 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -680,9 +680,12 @@ static void free_pcppages_bulk(struct zone *zone, int count, int migratetype = 0; int batch_free = 0; int to_free = count; + unsigned long nr_scanned; spin_lock(&zone->lock); - zone->pages_scanned = 0; + nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED); + if (nr_scanned) + __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned); while (to_free) { struct page *page; @@ -731,8 +734,11 @@ static void free_one_page(struct zone *zone, unsigned int order, int migratetype) { + unsigned long nr_scanned; spin_lock(&zone->lock); - zone->pages_scanned = 0; + nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED); + if (nr_scanned) + __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned); __free_one_page(page, pfn, zone, order, migratetype); if (unlikely(!is_migrate_isolate(migratetype))) @@ -3248,7 +3254,7 @@ void show_free_areas(unsigned int filter) K(zone_page_state(zone, NR_BOUNCE)), K(zone_page_state(zone, NR_FREE_CMA_PAGES)), K(zone_page_state(zone, NR_WRITEBACK_TEMP)), - zone->pages_scanned, + K(zone_page_state(zone, NR_PAGES_SCANNED)), (!zone_reclaimable(zone) ? 
"yes" : "no") ); printk("lowmem_reserve[]:"); diff --git a/mm/vmscan.c b/mm/vmscan.c index 5fec1ba9951f..9c8222b499b4 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -174,7 +174,8 @@ static unsigned long zone_reclaimable_pages(struct zone *zone) bool zone_reclaimable(struct zone *zone) { - return zone->pages_scanned < zone_reclaimable_pages(zone) * 6; + return zone_page_state(zone, NR_PAGES_SCANNED) < + zone_reclaimable_pages(zone) * 6; } static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru) @@ -1508,7 +1509,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); if (global_reclaim(sc)) { - zone->pages_scanned += nr_scanned; + __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned); if (current_is_kswapd()) __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned); else @@ -1698,7 +1699,7 @@ static void shrink_active_list(unsigned long nr_to_scan, nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold, &nr_scanned, sc, isolate_mode, lru); if (global_reclaim(sc)) - zone->pages_scanned += nr_scanned; + __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned); reclaim_stat->recent_scanned[file] += nr_taken; diff --git a/mm/vmstat.c b/mm/vmstat.c index 8267f77d1875..e574e883fa70 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -763,6 +763,7 @@ const char * const vmstat_text[] = { "nr_shmem", "nr_dirtied", "nr_written", + "nr_pages_scanned", #ifdef CONFIG_NUMA "numa_hit", @@ -1067,7 +1068,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, min_wmark_pages(zone), low_wmark_pages(zone), high_wmark_pages(zone), - zone->pages_scanned, + zone_page_state(zone, NR_PAGES_SCANNED), zone->spanned_pages, zone->present_pages, zone->managed_pages); -- cgit v1.2.3 From bb0b6dffa2ccfbd9747ad0cc87c7459622896e60 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 6 Aug 2014 16:07:18 -0700 Subject: mm: vmscan: only update per-cpu thresholds for online CPU When kswapd is awake reclaiming, the per-cpu stat thresholds are lowered to get more accurate counts to avoid breaching watermarks. This threshold update iterates over all possible CPUs which is unnecessary. Only online CPUs need to be updated. If a new CPU is onlined, refresh_zone_stat_thresholds() will set the thresholds correctly. Signed-off-by: Mel Gorman Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmstat.c b/mm/vmstat.c index e574e883fa70..e9ab104b956f 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -200,7 +200,7 @@ void set_pgdat_percpu_threshold(pg_data_t *pgdat, continue; threshold = (*calculate_pressure)(zone); - for_each_possible_cpu(cpu) + for_each_online_cpu(cpu) per_cpu_ptr(zone->pageset, cpu)->stat_threshold = threshold; } -- cgit v1.2.3 From f7b5d647946aae1647bf5cd26c16b3a793c1ac49 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 6 Aug 2014 16:07:20 -0700 Subject: mm: page_alloc: abort fair zone allocation policy when remotes nodes are encountered The purpose of numa_zonelist_order=zone is to preserve lower zones for use with 32-bit devices. If locality is preferred then the numa_zonelist_order=node policy should be used. Unfortunately, the fair zone allocation policy overrides this by skipping zones on remote nodes until the lower one is found. While this makes sense from a page aging and performance perspective, it breaks the expected zonelist policy. 
This patch restores the expected behaviour for zone-list ordering. Signed-off-by: Mel Gorman Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index daa016063793..6e5e8f762532 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1965,7 +1965,7 @@ zonelist_scan: */ if (alloc_flags & ALLOC_FAIR) { if (!zone_local(preferred_zone, zone)) - continue; + break; if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0) continue; } -- cgit v1.2.3 From 4ffeaf3560a52b4a69cc7909873d08c0ef5909d4 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 6 Aug 2014 16:07:22 -0700 Subject: mm: page_alloc: reduce cost of the fair zone allocation policy The fair zone allocation policy round-robins allocations between zones within a node to avoid age inversion problems during reclaim. If the first allocation fails, the batch counts are reset and a second attempt is made before entering the slow path. One assumption made with this scheme is that batches expire at roughly the same time and the resets each time are justified. This assumption does not hold when zones reach their low watermark as the batches will be consumed at uneven rates. Allocation failure due to watermark depletion results in additional zonelist scans for the reset and another watermark check before hitting the slowpath. On UMA, the benefit is negligible -- around 0.25%. On a 4-socket NUMA machine it's variable due to the variability of measuring overhead with the vmstat changes. The system CPU overhead comparison looks like 3.16.0-rc3 3.16.0-rc3 3.16.0-rc3 vanilla vmstat-v5 lowercost-v5 User 746.94 774.56 802.00 System 65336.22 32847.27 40852.33 Elapsed 27553.52 27415.04 27368.46 However it is worth noting that the overall benchmark still completed faster and intuitively it makes sense to take as few passes as possible through the zonelists.
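The bookkeeping introduced above -- a cheap per-zone "depleted" flag plus a reset that only happens when fairness actually caused a skip -- can be sketched in miniature. The structures below are hypothetical reductions of the zonelist machinery, not kernel code:

#include <stdbool.h>
#include <stdio.h>

struct zone {
	int  alloc_batch;
	bool fair_depleted;	/* set once, tested cheaply on the fast path */
};

/* Try each zone in order while honoring the fairness batches. */
static int alloc_fair(struct zone *z, int nr, bool *skipped)
{
	for (int i = 0; i < nr; i++) {
		if (z[i].fair_depleted) {
			*skipped = true;	/* remembered for the reset decision */
			continue;
		}
		if (--z[i].alloc_batch == 0)
			z[i].fair_depleted = true;
		return i;			/* "allocated" from zone i */
	}
	return -1;
}

static void reset_batches(struct zone *z, int nr, int batch)
{
	for (int i = 0; i < nr; i++) {
		z[i].alloc_batch = batch;
		z[i].fair_depleted = false;
	}
}

int main(void)
{
	struct zone zones[2] = { { 2, false }, { 3, false } };

	for (int n = 0; n < 8; n++) {
		bool skipped = false;
		int i = alloc_fair(zones, 2, &skipped);

		if (i < 0 && skipped) {	/* rescan only when fairness got in the way */
			reset_batches(zones, 2, 2);
			i = alloc_fair(zones, 2, &skipped);
		}
		printf("alloc %d -> zone %d\n", n, i);
	}
	return 0;
}

The point mirrored from the patch: the failure path distinguishes "misses caused by fairness batches" (reset and rescan) from genuine depletion (fall through to the slow path), instead of unconditionally resetting and rescanning.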
Signed-off-by: Mel Gorman Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 6 +++ mm/page_alloc.c | 101 ++++++++++++++++++++++++++----------------------- 2 files changed, 59 insertions(+), 48 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 0bd77f730b38..318df7051850 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -534,6 +534,7 @@ typedef enum { ZONE_WRITEBACK, /* reclaim scanning has recently found * many pages under writeback */ + ZONE_FAIR_DEPLETED, /* fair zone policy batch depleted */ } zone_flags_t; static inline void zone_set_flag(struct zone *zone, zone_flags_t flag) @@ -571,6 +572,11 @@ static inline int zone_is_reclaim_locked(const struct zone *zone) return test_bit(ZONE_RECLAIM_LOCKED, &zone->flags); } +static inline int zone_is_fair_depleted(const struct zone *zone) +{ + return test_bit(ZONE_FAIR_DEPLETED, &zone->flags); +} + static inline int zone_is_oom_locked(const struct zone *zone) { return test_bit(ZONE_OOM_LOCKED, &zone->flags); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6e5e8f762532..fb9908148474 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1612,6 +1612,9 @@ again: } __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); + if (zone_page_state(zone, NR_ALLOC_BATCH) == 0 && + !zone_is_fair_depleted(zone)) + zone_set_flag(zone, ZONE_FAIR_DEPLETED); __count_zone_vm_events(PGALLOC, zone, 1 << order); zone_statistics(preferred_zone, zone, gfp_flags); @@ -1923,6 +1926,18 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) #endif /* CONFIG_NUMA */ +static void reset_alloc_batches(struct zone *preferred_zone) +{ + struct zone *zone = preferred_zone->zone_pgdat->node_zones; + + do { + mod_zone_page_state(zone, NR_ALLOC_BATCH, + high_wmark_pages(zone) - low_wmark_pages(zone) - + atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); + zone_clear_flag(zone, ZONE_FAIR_DEPLETED); + } while (zone++ != preferred_zone); +} + /* * get_page_from_freelist goes through the zonelist trying to allocate * a page. @@ -1940,8 +1955,12 @@ get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, int did_zlc_setup = 0; /* just call zlc_setup() one time */ bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) && (gfp_mask & __GFP_WRITE); + int nr_fair_skipped = 0; + bool zonelist_rescan; zonelist_scan: + zonelist_rescan = false; + /* * Scan zonelist, looking for a zone with enough free. * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c. @@ -1966,8 +1985,10 @@ zonelist_scan: if (alloc_flags & ALLOC_FAIR) { if (!zone_local(preferred_zone, zone)) break; - if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0) + if (zone_is_fair_depleted(zone)) { + nr_fair_skipped++; continue; + } } /* * When allocating a page cache page for writing, we @@ -2073,13 +2094,7 @@ this_zone_full: zlc_mark_zone_full(zonelist, z); } - if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) { - /* Disable zlc cache for second zonelist scan */ - zlc_active = 0; - goto zonelist_scan; - } - - if (page) + if (page) { /* * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was * necessary to allocate the page. The expectation is @@ -2088,8 +2103,37 @@ this_zone_full: * for !PFMEMALLOC purposes. */ page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); + return page; + } - return page; + /* + * The first pass makes sure allocations are spread fairly within the + * local node. 
However, the local node might have free pages left + * after the fairness batches are exhausted, and remote zones haven't + * even been considered yet. Try once more without fairness, and + * include remote zones now, before entering the slowpath and waking + * kswapd: prefer spilling to a remote zone over swapping locally. + */ + if (alloc_flags & ALLOC_FAIR) { + alloc_flags &= ~ALLOC_FAIR; + if (nr_fair_skipped) { + zonelist_rescan = true; + reset_alloc_batches(preferred_zone); + } + if (nr_online_nodes > 1) + zonelist_rescan = true; + } + + if (unlikely(IS_ENABLED(CONFIG_NUMA) && zlc_active)) { + /* Disable zlc cache for second zonelist scan */ + zlc_active = 0; + zonelist_rescan = true; + } + + if (zonelist_rescan) + goto zonelist_scan; + + return NULL; } /* @@ -2410,28 +2454,6 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, return page; } -static void reset_alloc_batches(struct zonelist *zonelist, - enum zone_type high_zoneidx, - struct zone *preferred_zone) -{ - struct zoneref *z; - struct zone *zone; - - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { - /* - * Only reset the batches of zones that were actually - * considered in the fairness pass, we don't want to - * trash fairness information for zones that are not - * actually part of this zonelist's round-robin cycle. - */ - if (!zone_local(preferred_zone, zone)) - continue; - mod_zone_page_state(zone, NR_ALLOC_BATCH, - high_wmark_pages(zone) - low_wmark_pages(zone) - - atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); - } -} - static void wake_all_kswapds(unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, @@ -2767,28 +2789,11 @@ retry_cpuset: if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) alloc_flags |= ALLOC_CMA; #endif -retry: /* First allocation attempt */ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, zonelist, high_zoneidx, alloc_flags, preferred_zone, classzone_idx, migratetype); if (unlikely(!page)) { - /* - * The first pass makes sure allocations are spread - * fairly within the local node. However, the local - * node might have free pages left after the fairness - * batches are exhausted, and remote zones haven't - * even been considered yet. Try once more without - * fairness, and include remote zones now, before - * entering the slowpath and waking kswapd: prefer - * spilling to a remote zone over swapping locally. - */ - if (alloc_flags & ALLOC_FAIR) { - reset_alloc_batches(zonelist, high_zoneidx, - preferred_zone); - alloc_flags &= ~ALLOC_FAIR; - goto retry; - } /* * Runtime PM, block IO and its error handling path * can deadlock because I/O on the device might not -- cgit v1.2.3 From 9a95f3cf7b33d66fa64727cff8cd2f2a9d09f335 Mon Sep 17 00:00:00 2001 From: Paul Cassella Date: Wed, 6 Aug 2014 16:07:24 -0700 Subject: mm: describe mmap_sem rules for __lock_page_or_retry() and callers Add a comment describing the circumstances in which __lock_page_or_retry() will or will not release the mmap_sem when returning 0. Add comments to lock_page_or_retry()'s callers (filemap_fault(), do_swap_page()) noting the impact on VM_FAULT_RETRY returns. Add comments on up the call tree, particularly replacing the false "We return with mmap_sem still held" comments. 
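For illustration, here is a runnable userspace analogue of the locking contract documented by this patch: a callee that may drop the caller's read lock must report that it did, and the caller must not unlock again on that path. All names are invented stand-ins for VM_FAULT_RETRY, FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT; this sketches the caller discipline, it is not kernel code.

/* cc -pthread -o retry retry.c && ./retry */
#include <pthread.h>
#include <stdio.h>

#define TOY_FAULT_DONE	0
#define TOY_FAULT_RETRY	1	/* lock was dropped, unless NOWAIT was set */

#define TOY_ALLOW_RETRY		0x1
#define TOY_RETRY_NOWAIT	0x2

static pthread_rwlock_t toy_mmap_sem = PTHREAD_RWLOCK_INITIALIZER;

/* Stands in for handle_mm_fault(): with ALLOW_RETRY and without
 * RETRY_NOWAIT it releases the caller's read lock before returning
 * RETRY, exactly the case __lock_page_or_retry() documents. */
static int toy_handle_fault(int flags)
{
	if ((flags & TOY_ALLOW_RETRY) && !(flags & TOY_RETRY_NOWAIT)) {
		pthread_rwlock_unlock(&toy_mmap_sem);	/* the up_read() */
		return TOY_FAULT_RETRY;
	}
	return TOY_FAULT_DONE;
}

int main(void)
{
	int flags = TOY_ALLOW_RETRY;

retry:
	pthread_rwlock_rdlock(&toy_mmap_sem);
	if (toy_handle_fault(flags) == TOY_FAULT_RETRY) {
		/* Lock already released for us: unlocking here would be
		 * exactly the bug the new comments warn against. */
		flags &= ~TOY_ALLOW_RETRY;	/* retry at most once */
		goto retry;
	}
	pthread_rwlock_unlock(&toy_mmap_sem);
	puts("fault handled with balanced locking");
	return 0;
}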
Signed-off-by: Paul Cassella Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/fault.c | 3 ++- include/linux/pagemap.h | 3 +++ mm/filemap.c | 23 +++++++++++++++++++++++ mm/gup.c | 18 +++++++++++++++--- mm/memory.c | 34 +++++++++++++++++++++++++++++++--- mm/mlock.c | 9 ++++++++- 6 files changed, 82 insertions(+), 8 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 1dbade870f90..a24194681513 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1218,7 +1218,8 @@ good_area: /* * If for any reason at all we couldn't handle the fault, * make sure we exit gracefully rather than endlessly redo - * the fault: + * the fault. Since we never set FAULT_FLAG_RETRY_NOWAIT, if + * we get VM_FAULT_RETRY back, the mmap_sem has been unlocked. */ fault = handle_mm_fault(mm, vma, address, flags); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index e1474ae18c88..3df8c7db7a4e 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -484,6 +484,9 @@ static inline int lock_page_killable(struct page *page) /* * lock_page_or_retry - Lock the page, unless this would block and the * caller indicated that it can handle a retry. + * + * Return value and mmap_sem implications depend on flags; see + * __lock_page_or_retry(). */ static inline int lock_page_or_retry(struct page *page, struct mm_struct *mm, unsigned int flags) diff --git a/mm/filemap.c b/mm/filemap.c index 7e85c8147e1b..af19a6b079f5 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -808,6 +808,17 @@ int __lock_page_killable(struct page *page) } EXPORT_SYMBOL_GPL(__lock_page_killable); +/* + * Return values: + * 1 - page is locked; mmap_sem is still held. + * 0 - page is not locked. + * mmap_sem has been released (up_read()), unless flags had both + * FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT set, in + * which case mmap_sem is still held. + * + * If neither ALLOW_RETRY nor KILLABLE are set, will always return 1 + * with the page locked and the mmap_sem unperturbed. + */ int __lock_page_or_retry(struct page *page, struct mm_struct *mm, unsigned int flags) { @@ -1827,6 +1838,18 @@ static void do_async_mmap_readahead(struct vm_area_struct *vma, * The goto's are kind of ugly, but this streamlines the normal case of having * it in the page cache, and handles the special cases reasonably without * having a lot of duplicated code. + * + * vma->vm_mm->mmap_sem must be held on entry. + * + * If our return value has VM_FAULT_RETRY set, it's because + * lock_page_or_retry() returned 0. + * The mmap_sem has usually been released in this case. + * See __lock_page_or_retry() for the exception. + * + * If our return value does not have VM_FAULT_RETRY set, the mmap_sem + * has not been released. + * + * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set. */ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { diff --git a/mm/gup.c b/mm/gup.c index cc5a9e7adea7..91d044b1600d 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -258,6 +258,11 @@ unmap: return ret; } +/* + * mmap_sem must be held on entry. If @nonblocking != NULL and + * *@flags does not include FOLL_NOWAIT, the mmap_sem may be released. + * If it is, *@nonblocking will be set to 0 and -EBUSY returned. + */ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, unsigned long address, unsigned int *flags, int *nonblocking) { @@ -373,7 +378,7 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) * with a put_page() call when it is finished with. 
vmas will only * remain valid while mmap_sem is held. * - * Must be called with mmap_sem held for read or write. + * Must be called with mmap_sem held. It may be released. See below. * * __get_user_pages walks a process's page tables and takes a reference to * each struct page that each user address corresponds to at a given @@ -396,7 +401,14 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) * * If @nonblocking != NULL, __get_user_pages will not wait for disk IO * or mmap_sem contention, and if waiting is needed to pin all pages, - * *@nonblocking will be set to 0. + * *@nonblocking will be set to 0. Further, if @gup_flags does not + * include FOLL_NOWAIT, the mmap_sem will be released via up_read() in + * this case. + * + * A caller using such a combination of @nonblocking and @gup_flags + * must therefore hold the mmap_sem for reading only, and recognize + * when it's been released. Otherwise, it must be held for either + * reading or writing and will not be released. * * In most cases, get_user_pages or get_user_pages_fast should be used * instead of __get_user_pages. __get_user_pages should be used only if @@ -528,7 +540,7 @@ EXPORT_SYMBOL(__get_user_pages); * such architectures, gup() will not be enough to make a subsequent access * succeed. * - * This should be called with the mm_sem held for read. + * This has the same semantics wrt the @mm->mmap_sem as does filemap_fault(). */ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, unsigned long address, unsigned int fault_flags) diff --git a/mm/memory.c b/mm/memory.c index 7e131325bdf8..4d0a543f3bb3 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2399,7 +2399,10 @@ EXPORT_SYMBOL(unmap_mapping_range); /* * We enter with non-exclusive mmap_sem (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. - * We return with mmap_sem still held, but pte unmapped and unlocked. + * We return with pte unmapped and unlocked. + * + * We return with the mmap_sem locked or unlocked in the same cases + * as does filemap_fault(). */ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, @@ -2688,6 +2691,11 @@ oom: return VM_FAULT_OOM; } +/* + * The mmap_sem must have been held on entry, and may have been + * released depending on flags and vma->vm_ops->fault() return value. + * See filemap_fault() and __lock_page_retry(). + */ static int __do_fault(struct vm_area_struct *vma, unsigned long address, pgoff_t pgoff, unsigned int flags, struct page **page) { @@ -3016,6 +3024,12 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, return ret; } +/* + * We enter with non-exclusive mmap_sem (to exclude vma changes, + * but allow concurrent faults). + * The mmap_sem may have been released depending on flags and our + * return value. See filemap_fault() and __lock_page_or_retry(). + */ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, unsigned int flags, pte_t orig_pte) @@ -3040,7 +3054,9 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, * * We enter with non-exclusive mmap_sem (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. - * We return with mmap_sem still held, but pte unmapped and unlocked. + * We return with pte unmapped and unlocked. + * The mmap_sem may have been released depending on flags and our + * return value. 
See filemap_fault() and __lock_page_or_retry(). */ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, @@ -3172,7 +3188,10 @@ out: * * We enter with non-exclusive mmap_sem (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. - * We return with mmap_sem still held, but pte unmapped and unlocked. + * We return with pte unmapped and unlocked. + * + * The mmap_sem may have been released depending on flags and our + * return value. See filemap_fault() and __lock_page_or_retry(). */ static int handle_pte_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, @@ -3232,6 +3251,9 @@ unlock: /* * By the time we get here, we already hold the mm semaphore + * + * The mmap_sem may have been released depending on flags and our + * return value. See filemap_fault() and __lock_page_or_retry(). */ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags) @@ -3313,6 +3335,12 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, return handle_pte_fault(mm, vma, address, pte, pmd, flags); } +/* + * By the time we get here, we already hold the mm semaphore + * + * The mmap_sem may have been released depending on flags and our + * return value. See filemap_fault() and __lock_page_or_retry(). + */ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags) { diff --git a/mm/mlock.c b/mm/mlock.c index b1eb53634005..ce84cb0b83ef 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -210,12 +210,19 @@ out: * @vma: target vma * @start: start address * @end: end address + * @nonblocking: * * This takes care of making the pages present too. * * return 0 on success, negative error code on error. * - * vma->vm_mm->mmap_sem must be held for at least read. + * vma->vm_mm->mmap_sem must be held. + * + * If @nonblocking is NULL, it may be held for read or write and will + * be unperturbed. + * + * If @nonblocking is non-NULL, it must held for read only and may be + * released. If it's released, *@nonblocking will be set to 0. */ long __mlock_vma_pages_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, int *nonblocking) -- cgit v1.2.3 From fed400a181447ba975d40e1df5e0d555eae51795 Mon Sep 17 00:00:00 2001 From: Wang Sheng-Hui Date: Wed, 6 Aug 2014 16:07:26 -0700 Subject: mm/shmem.c: remove the unused gfp arg to shmem_add_to_page_cache() The gfp arg is not used in shmem_add_to_page_cache. Remove this unused arg. 
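As an aside, dead parameters like this one are cheap to catch mechanically; a minimal standalone illustration (toy types and names, not the shmem code) of how -Wunused-parameter flags them:

/* cc -c -Wunused-parameter toy.c
 * toy.c: warning: unused parameter 'gfp' */
typedef unsigned int toy_gfp_t;

int toy_add_to_cache(void *page, unsigned long index,
		     toy_gfp_t gfp, void *expected)
{
	/* pretend the other parameters are used, as in the real function */
	(void)page;
	(void)index;
	(void)expected;
	return 0;	/* 'gfp' is never referenced, so the warning fires */
}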
Signed-off-by: Wang Sheng-Hui Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index fe15d96c3166..302d1cf7ad07 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -293,7 +293,7 @@ static bool shmem_confirm_swap(struct address_space *mapping, */ static int shmem_add_to_page_cache(struct page *page, struct address_space *mapping, - pgoff_t index, gfp_t gfp, void *expected) + pgoff_t index, void *expected) { int error; @@ -666,7 +666,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, */ if (!error) error = shmem_add_to_page_cache(*pagep, mapping, index, - GFP_NOWAIT, radswap); + radswap); if (error != -ENOMEM) { /* * Truncation and eviction use free_swap_and_cache(), which @@ -1112,7 +1112,7 @@ repeat: gfp & GFP_RECLAIM_MASK); if (!error) { error = shmem_add_to_page_cache(page, mapping, index, - gfp, swp_to_radix_entry(swap)); + swp_to_radix_entry(swap)); /* * We already confirmed swap under page lock, and make * no memory allocation here, so usually no possibility @@ -1175,7 +1175,7 @@ repeat: error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK); if (!error) { error = shmem_add_to_page_cache(page, mapping, index, - gfp, NULL); + NULL); radix_tree_preload_end(); } if (error) { -- cgit v1.2.3 From 14a4e2141e24304fff2c697be6382ffb83888185 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 6 Aug 2014 16:07:29 -0700 Subject: mm, thp: only collapse hugepages to nodes with affinity for zone_reclaim_mode Commit 9f1b868a13ac ("mm: thp: khugepaged: add policy for finding target node") improved the previous khugepaged logic which allocated a transparent hugepages from the node of the first page being collapsed. However, it is still possible to collapse pages to remote memory which may suffer from additional access latency. With the current policy, it is possible that 255 pages (with PAGE_SHIFT == 12) will be collapsed remotely if the majority are allocated from that node. When zone_reclaim_mode is enabled, it means the VM should make every attempt to allocate locally to prevent NUMA performance degradation. In this case, we do not want to collapse hugepages to remote nodes that would suffer from increased access latency. Thus, when zone_reclaim_mode is enabled, only allow collapsing to nodes with RECLAIM_DISTANCE or less. There is no functional change for systems that disable zone_reclaim_mode. Signed-off-by: David Rientjes Cc: Dave Hansen Cc: Andrea Arcangeli Acked-by: Vlastimil Babka Acked-by: Mel Gorman Cc: Rik van Riel Cc: "Kirill A. Shutemov" Cc: Bob Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 24e354c2b59e..3630d577e987 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2233,6 +2233,30 @@ static void khugepaged_alloc_sleep(void) static int khugepaged_node_load[MAX_NUMNODES]; +static bool khugepaged_scan_abort(int nid) +{ + int i; + + /* + * If zone_reclaim_mode is disabled, then no extra effort is made to + * allocate memory locally. 
+ */ + if (!zone_reclaim_mode) + return false; + + /* If there is a count for this node already, it must be acceptable */ + if (khugepaged_node_load[nid]) + return false; + + for (i = 0; i < MAX_NUMNODES; i++) { + if (!khugepaged_node_load[i]) + continue; + if (node_distance(nid, i) > RECLAIM_DISTANCE) + return true; + } + return false; +} + #ifdef CONFIG_NUMA static int khugepaged_find_target_node(void) { @@ -2545,6 +2569,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, * hit record. */ node = page_to_nid(page); + if (khugepaged_scan_abort(node)) + goto out_unmap; khugepaged_node_load[node]++; VM_BUG_ON_PAGE(PageCompound(page), page); if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) -- cgit v1.2.3 From 9ef0a0ffa28edbf5c7cfa6be73b4ecb9896a3875 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 6 Aug 2014 16:07:31 -0700 Subject: mm, writeback: prevent race when calculating dirty limits Setting vm_dirty_bytes and dirty_background_bytes is not protected by any serialization. Therefore, it's possible for either variable to change value after the test in global_dirty_limits() to determine whether available_memory needs to be initialized or not. Always ensure that available_memory is properly initialized. Signed-off-by: David Rientjes Acked-by: Johannes Weiner Acked-by: Michal Hocko Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index e0c943014eb7..91d73ef1744d 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -261,14 +261,11 @@ static unsigned long global_dirtyable_memory(void) */ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) { + const unsigned long available_memory = global_dirtyable_memory(); unsigned long background; unsigned long dirty; - unsigned long uninitialized_var(available_memory); struct task_struct *tsk; - if (!vm_dirty_bytes || !dirty_background_bytes) - available_memory = global_dirtyable_memory(); - if (vm_dirty_bytes) dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); else -- cgit v1.2.3 From aee52cae00ba0d426a827b761920a476a08eca9e Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 6 Aug 2014 16:07:33 -0700 Subject: slub: remove kmemcg id from create_unique_id This function is never called for memcg caches, because they are unmergeable, so remove the dead code. Signed-off-by: Vladimir Davydov Cc: Michal Hocko Cc: Johannes Weiner Cc: Christoph Lameter Reviewed-by: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 9b861b90cde1..3e8afcc07a76 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5128,12 +5128,6 @@ static char *create_unique_id(struct kmem_cache *s) *p++ = '-'; p += sprintf(p, "%07d", s->size); -#ifdef CONFIG_MEMCG_KMEM - if (!is_root_cache(s)) - p += sprintf(p, "-%08d", - memcg_cache_id(s->memcg_params->memcg)); -#endif - BUG_ON(p > name + ID_STR_LENGTH - 1); return name; } -- cgit v1.2.3 From 6326440077a48d2c3b2993f3b3f2d969f09b6917 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Wed, 6 Aug 2014 16:07:36 -0700 Subject: memory-hotplug: add zone_for_memory() for selecting zone for new memory This series of patches fixes a problem when adding memory in bad manner. 
For example: for an x86_64 machine booted with "mem=400M" and with 2GiB memory installed, the following commands cause a problem:

# echo 0x40000000 > /sys/devices/system/memory/probe
[ 28.613895] init_memory_mapping: [mem 0x40000000-0x47ffffff]
# echo 0x48000000 > /sys/devices/system/memory/probe
[ 28.693675] init_memory_mapping: [mem 0x48000000-0x4fffffff]
# echo online_movable > /sys/devices/system/memory/memory9/state
# echo 0x50000000 > /sys/devices/system/memory/probe
[ 29.084090] init_memory_mapping: [mem 0x50000000-0x57ffffff]
# echo 0x58000000 > /sys/devices/system/memory/probe
[ 29.151880] init_memory_mapping: [mem 0x58000000-0x5fffffff]
# echo online_movable > /sys/devices/system/memory/memory11/state
# echo online> /sys/devices/system/memory/memory8/state
# echo online> /sys/devices/system/memory/memory10/state
# echo offline> /sys/devices/system/memory/memory9/state
[ 30.558819] Offlined Pages 32768
# free
             total               used        free  shared  buffers  cached
Mem:        780588  18014398509432020      830552       0        0   51180
-/+ buffers/cache:  18014398509380840      881732
Swap:            0                  0           0

This is because the above commands probe higher memory after onlining a section with online_movable, which causes ZONE_HIGHMEM (or ZONE_NORMAL for systems without ZONE_HIGHMEM) to overlap ZONE_MOVABLE. After the second online_movable, the problem can be observed from zoneinfo:

# cat /proc/zoneinfo
...
Node 0, zone  Movable
  pages free     65491
        min      250
        low      312
        high     375
        scanned  0
        spanned  18446744073709518848
        present  65536
        managed  65536
...

This series of patches solves the problem by checking ZONE_MOVABLE when choosing a zone for new memory. If new memory is inside or higher than ZONE_MOVABLE, it is made to go there instead. After applying this series of patches, the following are the free and zoneinfo results (after offlining memory9):

bash-4.2# free
             total       used       free  shared  buffers  cached
Mem:        780956      80112     700844       0        0   51180
-/+ buffers/cache:      28932     752024
Swap:            0          0          0
bash-4.2# cat /proc/zoneinfo
Node 0, zone      DMA
  pages free     3389
        min      14
        low      17
        high     21
        scanned  0
        spanned  4095
        present  3998
        managed  3977
    nr_free_pages 3389
...
  start_pfn:         1
  inactive_ratio:    1
Node 0, zone    DMA32
  pages free     73724
        min      341
        low      426
        high     511
        scanned  0
        spanned  98304
        present  98304
        managed  92958
    nr_free_pages 73724
...
  start_pfn:         4096
  inactive_ratio:    1
Node 0, zone   Normal
  pages free     32630
        min      120
        low      150
        high     180
        scanned  0
        spanned  32768
        present  32768
        managed  32768
    nr_free_pages 32630
...
  start_pfn:         262144
  inactive_ratio:    1
Node 0, zone  Movable
  pages free     65476
        min      241
        low      301
        high     361
        scanned  0
        spanned  98304
        present  65536
        managed  65536
    nr_free_pages 65476
...
  start_pfn:         294912
  inactive_ratio:    1

This patch (of 7): Introduce zone_for_memory() in arch independent code for arch_add_memory() use. Many arch_add_memory() functions simply select ZONE_HIGHMEM or ZONE_NORMAL and add new memory into it. However, with the existence of ZONE_MOVABLE, the selection method should be carefully considered: if new, higher memory is added after ZONE_MOVABLE is set up, the default zone and ZONE_MOVABLE may overlap each other. should_add_memory_movable() checks the status of ZONE_MOVABLE. If it already contains memory, compare the address of the new memory and the movable memory. If the new memory is higher than movable, it should be added into ZONE_MOVABLE instead of the default zone. Signed-off-by: Wang Nan Cc: Zhang Yanfei Cc: Dave Hansen Cc: Ingo Molnar Cc: Yinghai Lu Cc: "Mel Gorman" Cc: Thomas Gleixner Cc: "H.
Peter Anvin" Cc: "Luck, Tony" Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Chris Metcalf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 1 + mm/memory_hotplug.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 79dd9eca054f..d9524c49d767 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -259,6 +259,7 @@ static inline void remove_memory(int nid, u64 start, u64 size) {} extern int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn, void *arg, int (*func)(struct memory_block *, void *)); extern int add_memory(int nid, u64 start, u64 size); +extern int zone_for_memory(int nid, u64 start, u64 size, int zone_default); extern int arch_add_memory(int nid, u64 start, u64 size); extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages); extern bool is_memblock_offlined(struct memory_block *mem); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index a3797d3fd8a4..2ff8c2325e96 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1159,6 +1159,34 @@ static int check_hotplug_memory_range(u64 start, u64 size) return 0; } +/* + * If movable zone has already been setup, newly added memory should be check. + * If its address is higher than movable zone, it should be added as movable. + * Without this check, movable zone may overlap with other zone. + */ +static int should_add_memory_movable(int nid, u64 start, u64 size) +{ + unsigned long start_pfn = start >> PAGE_SHIFT; + pg_data_t *pgdat = NODE_DATA(nid); + struct zone *movable_zone = pgdat->node_zones + ZONE_MOVABLE; + + if (zone_is_empty(movable_zone)) + return 0; + + if (movable_zone->zone_start_pfn <= start_pfn) + return 1; + + return 0; +} + +int zone_for_memory(int nid, u64 start, u64 size, int zone_default) +{ + if (should_add_memory_movable(nid, start, size)) + return ZONE_MOVABLE; + + return zone_default; +} + /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ int __ref add_memory(int nid, u64 start, u64 size) { -- cgit v1.2.3 From 9bfc41138525c51955cd35cbca56f8cd757a73f8 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Wed, 6 Aug 2014 16:07:38 -0700 Subject: memory-hotplug: x86_64: suitable memory should go to ZONE_MOVABLE This patch introduces zone_for_memory() to arch_add_memory() on x86_64 to ensure new, higher memory added into ZONE_MOVABLE if movable zone has already setup. Signed-off-by: Wang Nan Cc: Zhang Yanfei Cc: Dave Hansen Cc: Ingo Molnar Cc: Yinghai Lu Cc: "Mel Gorman" Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" Cc: "Luck, Tony" Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Chris Metcalf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/init_64.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index df1a9927ad29..5621c47d7a1a 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -691,7 +691,8 @@ static void update_end_of_memory_vars(u64 start, u64 size) int arch_add_memory(int nid, u64 start, u64 size) { struct pglist_data *pgdat = NODE_DATA(nid); - struct zone *zone = pgdat->node_zones + ZONE_NORMAL; + struct zone *zone = pgdat->node_zones + + zone_for_memory(nid, start, size, ZONE_NORMAL); unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; int ret; -- cgit v1.2.3 From 03d4be64603e2dc1bfa624bc436869d73340723e Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Wed, 6 Aug 2014 16:07:40 -0700 Subject: memory-hotplug: x86_32: suitable memory should go to ZONE_MOVABLE This patch introduces zone_for_memory() to arch_add_memory() on x86_32 to ensure new, higher memory added into ZONE_MOVABLE if movable zone has already setup. Signed-off-by: Wang Nan Cc: Zhang Yanfei Cc: Dave Hansen Cc: Ingo Molnar Cc: Yinghai Lu Cc: "Mel Gorman" Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: "Luck, Tony" Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Chris Metcalf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/init_32.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index e39504878aec..7d05565ba781 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -825,7 +825,8 @@ void __init mem_init(void) int arch_add_memory(int nid, u64 start, u64 size) { struct pglist_data *pgdata = NODE_DATA(nid); - struct zone *zone = pgdata->node_zones + ZONE_HIGHMEM; + struct zone *zone = pgdata->node_zones + + zone_for_memory(nid, start, size, ZONE_HIGHMEM); unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; -- cgit v1.2.3 From ed562ae6d76dbe409dddedefc94a54673623dcb5 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Wed, 6 Aug 2014 16:07:43 -0700 Subject: memory-hotplug: ia64: suitable memory should go to ZONE_MOVABLE This patch introduces zone_for_memory() to arch_add_memory() on ia64 to ensure new, higher memory added into ZONE_MOVABLE if movable zone has already setup. Signed-off-by: Wang Nan Cc: Zhang Yanfei Cc: Dave Hansen Cc: Ingo Molnar Cc: Yinghai Lu Cc: "Mel Gorman" Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" Cc: "Luck, Tony" Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Chris Metcalf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/mm/init.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 25c350264a41..892d43e32f3b 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -631,7 +631,8 @@ int arch_add_memory(int nid, u64 start, u64 size) pgdat = NODE_DATA(nid); - zone = pgdat->node_zones + ZONE_NORMAL; + zone = pgdat->node_zones + + zone_for_memory(nid, start, size, ZONE_NORMAL); ret = __add_pages(nid, zone, start_pfn, nr_pages); if (ret) -- cgit v1.2.3 From f51202de8ffd9dc8ca76ea81a916349138e234ca Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Wed, 6 Aug 2014 16:07:45 -0700 Subject: memory-hotplug: ppc: suitable memory should go to ZONE_MOVABLE This patch introduces zone_for_memory() to arch_add_memory() on powerpc to ensure new, higher memory added into ZONE_MOVABLE if movable zone has already setup. Signed-off-by: Wang Nan Cc: Zhang Yanfei Cc: Dave Hansen Cc: Ingo Molnar Cc: Yinghai Lu Cc: "Mel Gorman" Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: "Luck, Tony" Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Chris Metcalf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/mm/mem.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 2c8e90f5789e..e0f7a189c48e 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -128,7 +128,8 @@ int arch_add_memory(int nid, u64 start, u64 size) return -EINVAL; /* this should work for most non-highmem platforms */ - zone = pgdata->node_zones; + zone = pgdata->node_zones + + zone_for_memory(nid, start, size, 0); return __add_pages(nid, zone, start_pfn, nr_pages); } -- cgit v1.2.3 From 6e90b58be7545c2cb2306fa186ccf5888c76c559 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Wed, 6 Aug 2014 16:07:47 -0700 Subject: memory-hotplug: sh: suitable memory should go to ZONE_MOVABLE This patch introduces zone_for_memory() to arch_add_memory() on sh to ensure new, higher memory added into ZONE_MOVABLE if movable zone has already setup. Signed-off-by: Wang Nan Cc: Zhang Yanfei Cc: Dave Hansen Cc: Ingo Molnar Cc: Yinghai Lu Cc: "Mel Gorman" Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: "Luck, Tony" Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Chris Metcalf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sh/mm/init.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 2d089fe2cba9..2790b6a64157 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -495,8 +495,9 @@ int arch_add_memory(int nid, u64 start, u64 size) pgdat = NODE_DATA(nid); /* We only have ZONE_NORMAL, so this is easy.. */ - ret = __add_pages(nid, pgdat->node_zones + ZONE_NORMAL, - start_pfn, nr_pages); + ret = __add_pages(nid, pgdat->node_zones + + zone_for_memory(nid, start, size, ZONE_NORMAL), + start_pfn, nr_pages); if (unlikely(ret)) printk("%s: Failed, __add_pages() == %d\n", __func__, ret); -- cgit v1.2.3 From 8d060bf490930f305c4efc45724e861a268f4d2f Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 6 Aug 2014 16:07:50 -0700 Subject: mm, oom: ensure memoryless node zonelist always includes zones With memoryless node support being worked on, it's possible that for optimizations that a node may not have a non-NULL zonelist. 
When CONFIG_NUMA is enabled and node 0 is memoryless, this means the zonelist for first_online_node may become NULL. The oom killer requires a zonelist that includes all memory zones for the sysrq trigger and pagefault out of memory handler. Ensure that a non-NULL zonelist is always passed to the oom killer. [akpm@linux-foundation.org: fix non-numa build] Signed-off-by: David Rientjes Cc: "Kirill A. Shutemov" Cc: Johannes Weiner Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/tty/sysrq.c | 2 +- include/linux/nodemask.h | 11 ++++++++++- mm/oom_kill.c | 2 +- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index 454b65898e2c..42bad18c66c9 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -355,7 +355,7 @@ static struct sysrq_key_op sysrq_term_op = { static void moom_callback(struct work_struct *ignored) { - out_of_memory(node_zonelist(first_online_node, GFP_KERNEL), GFP_KERNEL, + out_of_memory(node_zonelist(first_memory_node, GFP_KERNEL), GFP_KERNEL, 0, NULL, true); } diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 58b9a02c38d2..83a6aeda899d 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -430,7 +430,15 @@ static inline int num_node_state(enum node_states state) for_each_node_mask((__node), node_states[__state]) #define first_online_node first_node(node_states[N_ONLINE]) -#define next_online_node(nid) next_node((nid), node_states[N_ONLINE]) +#define first_memory_node first_node(node_states[N_MEMORY]) +static inline int next_online_node(int nid) +{ + return next_node(nid, node_states[N_ONLINE]); +} +static inline int next_memory_node(int nid) +{ + return next_node(nid, node_states[N_MEMORY]); +} extern int nr_node_ids; extern int nr_online_nodes; @@ -471,6 +479,7 @@ static inline int num_node_state(enum node_states state) for ( (node) = 0; (node) == 0; (node) = 1) #define first_online_node 0 +#define first_memory_node 0 #define next_online_node(nid) (MAX_NUMNODES) #define nr_node_ids 1 #define nr_online_nodes 1 diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 3291e82d4352..b0a1e1ff0353 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -694,7 +694,7 @@ void pagefault_out_of_memory(void) if (mem_cgroup_oom_synchronize(true)) return; - zonelist = node_zonelist(first_online_node, GFP_KERNEL); + zonelist = node_zonelist(first_memory_node, GFP_KERNEL); if (try_set_zonelist_oom(zonelist, GFP_KERNEL)) { out_of_memory(NULL, 0, 0, NULL, false); clear_zonelist_oom(zonelist, GFP_KERNEL); -- cgit v1.2.3 From e972a070e2d3296cd2e2cc2fd0561ce89a1d5ebf Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 6 Aug 2014 16:07:52 -0700 Subject: mm, oom: rename zonelist locking functions try_set_zonelist_oom() and clear_zonelist_oom() are not named properly to imply that they require locking semantics to avoid out_of_memory() being reordered. zone_scan_lock is required for both functions to ensure that there is proper locking synchronization. Rename try_set_zonelist_oom() to oom_zonelist_trylock() and rename clear_zonelist_oom() to oom_zonelist_unlock() to imply there is proper locking semantics. At the same time, convert oom_zonelist_trylock() to return bool instead of int since only success and failure are tested. Signed-off-by: David Rientjes Cc: "Kirill A. 
Shutemov" Cc: Johannes Weiner Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/oom.h | 4 ++-- mm/oom_kill.c | 30 +++++++++++++----------------- mm/page_alloc.c | 6 +++--- 3 files changed, 18 insertions(+), 22 deletions(-) diff --git a/include/linux/oom.h b/include/linux/oom.h index 4cd62677feb9..647395a1a550 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -55,8 +55,8 @@ extern void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, struct mem_cgroup *memcg, nodemask_t *nodemask, const char *message); -extern int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); -extern void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); +extern bool oom_zonelist_trylock(struct zonelist *zonelist, gfp_t gfp_flags); +extern void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_flags); extern void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, int order, const nodemask_t *nodemask); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index b0a1e1ff0353..d33aca1552ad 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -559,28 +559,25 @@ EXPORT_SYMBOL_GPL(unregister_oom_notifier); * if a parallel OOM killing is already taking place that includes a zone in * the zonelist. Otherwise, locks all zones in the zonelist and returns 1. */ -int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask) +bool oom_zonelist_trylock(struct zonelist *zonelist, gfp_t gfp_mask) { struct zoneref *z; struct zone *zone; - int ret = 1; + bool ret = true; spin_lock(&zone_scan_lock); - for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { + for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) if (zone_is_oom_locked(zone)) { - ret = 0; + ret = false; goto out; } - } - for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { - /* - * Lock each zone in the zonelist under zone_scan_lock so a - * parallel invocation of try_set_zonelist_oom() doesn't succeed - * when it shouldn't. - */ + /* + * Lock each zone in the zonelist under zone_scan_lock so a parallel + * call to oom_zonelist_trylock() doesn't succeed when it shouldn't. + */ + for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) zone_set_flag(zone, ZONE_OOM_LOCKED); - } out: spin_unlock(&zone_scan_lock); @@ -592,15 +589,14 @@ out: * allocation attempts with zonelists containing them may now recall the OOM * killer, if necessary. 
*/ -void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask) +void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask) { struct zoneref *z; struct zone *zone; spin_lock(&zone_scan_lock); - for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { + for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) zone_clear_flag(zone, ZONE_OOM_LOCKED); - } spin_unlock(&zone_scan_lock); } @@ -695,8 +691,8 @@ void pagefault_out_of_memory(void) return; zonelist = node_zonelist(first_memory_node, GFP_KERNEL); - if (try_set_zonelist_oom(zonelist, GFP_KERNEL)) { + if (oom_zonelist_trylock(zonelist, GFP_KERNEL)) { out_of_memory(NULL, 0, 0, NULL, false); - clear_zonelist_oom(zonelist, GFP_KERNEL); + oom_zonelist_unlock(zonelist, GFP_KERNEL); } } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fb9908148474..578236089ec1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2246,8 +2246,8 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, { struct page *page; - /* Acquire the OOM killer lock for the zones in zonelist */ - if (!try_set_zonelist_oom(zonelist, gfp_mask)) { + /* Acquire the per-zone oom lock for each zone */ + if (!oom_zonelist_trylock(zonelist, gfp_mask)) { schedule_timeout_uninterruptible(1); return NULL; } @@ -2285,7 +2285,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, out_of_memory(zonelist, gfp_mask, order, nodemask, false); out: - clear_zonelist_oom(zonelist, gfp_mask); + oom_zonelist_unlock(zonelist, gfp_mask); return page; } -- cgit v1.2.3 From 8fe780484d2674eec27e12bb29c07d3e98a7ad21 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 6 Aug 2014 16:07:54 -0700 Subject: mm, thp: restructure thp avoidance of light synchronous migration __GFP_NO_KSWAPD, once the way to determine if an allocation was for thp or not, has gained more users. Their use is not necessarily wrong, they are trying to do a memory allocation that can easily fail without disturbing kswapd, so the bit has gained additional usecases. This restructures the check to determine whether MIGRATE_SYNC_LIGHT should be used for memory compaction in the page allocator. Rather than testing solely for __GFP_NO_KSWAPD, test for all bits that must be set for thp allocations. This also moves the check to be done only after the page allocator is aborted for deferred or contended memory compaction since setting migration_mode for this case is pointless. Signed-off-by: David Rientjes Cc: Mel Gorman Cc: Rik van Riel Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 578236089ec1..18cee0d4c8a2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2638,14 +2638,6 @@ rebalance: if (page) goto got_pg; - /* - * It can become very expensive to allocate transparent hugepages at - * fault, so use asynchronous memory compaction for THP unless it is - * khugepaged trying to collapse. - */ - if (!(gfp_mask & __GFP_NO_KSWAPD) || (current->flags & PF_KTHREAD)) - migration_mode = MIGRATE_SYNC_LIGHT; - /* * If compaction is deferred for high-order allocations, it is because * sync compaction recently failed. In this is the case and the caller @@ -2656,6 +2648,15 @@ rebalance: (gfp_mask & __GFP_NO_KSWAPD)) goto nopage; + /* + * It can become very expensive to allocate transparent hugepages at + * fault, so use asynchronous memory compaction for THP unless it is + * khugepaged trying to collapse. 
+ */ + if ((gfp_mask & GFP_TRANSHUGE) != GFP_TRANSHUGE || + (current->flags & PF_KTHREAD)) + migration_mode = MIGRATE_SYNC_LIGHT; + /* Try direct reclaim and then allocating */ page = __alloc_pages_direct_reclaim(gfp_mask, order, zonelist, high_zoneidx, -- cgit v1.2.3 From d0177639310d23c7739500df3c6ce6fdfe34acec Mon Sep 17 00:00:00 2001 From: Li Zhong Date: Wed, 6 Aug 2014 16:07:56 -0700 Subject: mm: fix potential infinite loop in dissolve_free_huge_pages() It is possible for some platforms, such as powerpc to set HPAGE_SHIFT to 0 to indicate huge pages not supported. When this is the case, hugetlbfs could be disabled during boot time: hugetlbfs: disabling because there are no supported hugepage sizes Then in dissolve_free_huge_pages(), order is kept maximum (64 for 64bits), and the for loop below won't end: for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order) As suggested by Naoya, below fix checks hugepages_supported() before calling dissolve_free_huge_pages(). [rientjes@google.com: no legitimate reason to call dissolve_free_huge_pages() when !hugepages_supported()] Signed-off-by: Li Zhong Acked-by: Naoya Horiguchi Acked-by: David Rientjes Signed-off-by: David Rientjes Cc: [3.12+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d9ad93b55585..eeceeeb09019 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1088,6 +1088,9 @@ void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) unsigned long pfn; struct hstate *h; + if (!hugepages_supported()) + return; + /* Set scan step to minimum hugepage size */ for_each_hstate(h) if (order > huge_page_order(h)) -- cgit v1.2.3 From fb794bcbb4e5552242f9a4c5e1ffe4c6da29a968 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 6 Aug 2014 16:07:58 -0700 Subject: mm, oom: remove unnecessary exit_state check The oom killer scans each process and determines whether it is eligible for oom kill or whether the oom killer should abort because of concurrent memory freeing. It will abort when an eligible process is found to have TIF_MEMDIE set, meaning it has already been oom killed and we're waiting for it to exit. Processes with task->mm == NULL should not be considered because they are either kthreads or have already detached their memory and killing them would not lead to memory freeing. That memory is only freed after exit_mm() has returned, however, and not when task->mm is first set to NULL. Clear TIF_MEMDIE after exit_mm()'s mmput() so that an oom killed process is no longer considered for oom kill, but only until exit_mm() has returned. This was fragile in the past because it relied on exit_notify() to be reached before no longer considering TIF_MEMDIE processes. 
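As a rough sketch of the scan policy this change relies on: tasks without an mm are skipped, and a task still carrying TIF_MEMDIE aborts the scan because its memory is about to be freed. A standalone toy model follows; all types and fields here are invented stand-ins for task_struct state, not kernel code.

/* cc -o oomscan oomscan.c && ./oomscan */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct toy_task {
	const char *comm;
	void *mm;	/* NULL for kthreads, or once exit_mm() has run */
	bool memdie;	/* stand-in for the TIF_MEMDIE thread flag */
};

enum toy_scan { TOY_SCAN_OK, TOY_SCAN_CONTINUE, TOY_SCAN_ABORT };

/* Skip tasks with no mm (killing them frees nothing), abort when a
 * kill is already in flight, otherwise the task is an eligible victim. */
static enum toy_scan toy_scan_task(const struct toy_task *t)
{
	if (!t->mm)
		return TOY_SCAN_CONTINUE;
	if (t->memdie)
		return TOY_SCAN_ABORT;
	return TOY_SCAN_OK;
}

int main(void)
{
	int fake_mm;
	const struct toy_task tasks[] = {
		{ "kthreadd", NULL,     false },	/* skipped */
		{ "dying",    &fake_mm, true  },	/* aborts the scan */
		{ "hog",      &fake_mm, false },	/* eligible */
	};

	for (size_t i = 0; i < sizeof(tasks) / sizeof(tasks[0]); i++)
		printf("%-8s -> %d\n", tasks[i].comm, toy_scan_task(&tasks[i]));
	return 0;
}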
Signed-off-by: David Rientjes Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 1 + mm/oom_kill.c | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/exit.c b/kernel/exit.c index e5c4668f1799..88c6b3e42583 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -455,6 +455,7 @@ static void exit_mm(struct task_struct * tsk) task_unlock(tsk); mm_update_next_owner(mm); mmput(mm); + clear_thread_flag(TIF_MEMDIE); } /* diff --git a/mm/oom_kill.c b/mm/oom_kill.c index d33aca1552ad..1e11df8fa7ec 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -258,8 +258,6 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task, unsigned long totalpages, const nodemask_t *nodemask, bool force_kill) { - if (task->exit_state) - return OOM_SCAN_CONTINUE; if (oom_unkillable_task(task, NULL, nodemask)) return OOM_SCAN_CONTINUE; -- cgit v1.2.3 From 7c0db9e917f77e6de2a524b33b5436491850dc79 Mon Sep 17 00:00:00 2001 From: Jerome Marchand Date: Wed, 6 Aug 2014 16:08:01 -0700 Subject: mm, vmscan: fix an outdated comment still mentioning get_scan_ratio Quite a while ago, get_scan_ratio() was renamed get_scan_count(); however, a comment in shrink_active_list() still mentions it. This patch fixes the outdated comment. Signed-off-by: Jerome Marchand Cc: Michal Hocko Cc: Johannes Weiner Cc: Rik van Riel Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 9c8222b499b4..88ab53c9949a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1756,7 +1756,7 @@ static void shrink_active_list(unsigned long nr_to_scan, * Count referenced pages from currently used mappings as rotated, * even though only some of them are actually re-activated. This * helps balance scan pressure between file and anonymous pages in - * get_scan_ratio. + * get_scan_count. */ reclaim_stat->recent_rotated[file] += nr_rotated; -- cgit v1.2.3 From 2ab051e11bfa3cbb7b24177f3d6aaed10a0d743e Mon Sep 17 00:00:00 2001 From: Jerome Marchand Date: Wed, 6 Aug 2014 16:08:03 -0700 Subject: memcg, vmscan: Fix forced scan of anonymous pages When memory cgroups are enabled, the code that decides to force scanning of anonymous pages in get_scan_count() compares global values (free, high_watermark) to a value that is restricted to a memory cgroup (file). It makes the code over-eager to force anon scans. For instance, it will force an anon scan when scanning a memcg that is mainly populated by anonymous pages, even when there are plenty of file pages to get rid of in other memcgs, and even when swappiness == 0. It breaks users' expectations about swappiness and hurts performance. This patch makes sure that forced anon scans only happen when there are not enough file pages for the whole zone, not just in one random memcg.
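The fix comes down to which counters feed a single comparison. A small standalone sketch of the zone-wide test follows, with made-up page counts and a toy function rather than the kernel's get_scan_count():

/* cc -o anonscan anonscan.c && ./anonscan */
#include <stdbool.h>
#include <stdio.h>

/* Force anon scanning only when the whole zone's free + file pages no
 * longer cover the high watermark, as in the patched get_scan_count(). */
static bool force_anon_scan(unsigned long zone_free,
			    unsigned long zone_file,
			    unsigned long high_wmark)
{
	return zone_file + zone_free <= high_wmark;
}

int main(void)
{
	/* A memcg-local file count may be tiny while the zone still has
	 * plenty of file pages: the old memcg-local numbers would have
	 * forced an anon scan here, the zone-wide ones do not. */
	printf("zone-wide: %d\n", force_anon_scan(1000, 50000, 20000)); /* 0 */
	printf("depleted:  %d\n", force_anon_scan(100, 300, 20000));    /* 1 */
	return 0;
}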
[hannes@cmpxchg.org: cleanups] Signed-off-by: Jerome Marchand Acked-by: Michal Hocko Acked-by: Johannes Weiner Reviewed-by: Rik van Riel Cc: Mel Gorman Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 88ab53c9949a..d2f65c856350 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1930,11 +1930,6 @@ static void get_scan_count(struct lruvec *lruvec, int swappiness, goto out; } - anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) + - get_lru_size(lruvec, LRU_INACTIVE_ANON); - file = get_lru_size(lruvec, LRU_ACTIVE_FILE) + - get_lru_size(lruvec, LRU_INACTIVE_FILE); - /* * Prevent the reclaimer from falling into the cache trap: as * cache pages start out inactive, every cache fault will tip @@ -1945,9 +1940,14 @@ static void get_scan_count(struct lruvec *lruvec, int swappiness, * anon pages. Try to detect this based on file LRU size. */ if (global_reclaim(sc)) { - unsigned long free = zone_page_state(zone, NR_FREE_PAGES); + unsigned long zonefile; + unsigned long zonefree; + + zonefree = zone_page_state(zone, NR_FREE_PAGES); + zonefile = zone_page_state(zone, NR_ACTIVE_FILE) + + zone_page_state(zone, NR_INACTIVE_FILE); - if (unlikely(file + free <= high_wmark_pages(zone))) { + if (unlikely(zonefile + zonefree <= high_wmark_pages(zone))) { scan_balance = SCAN_ANON; goto out; } @@ -1982,6 +1982,12 @@ static void get_scan_count(struct lruvec *lruvec, int swappiness, * * anon in [0], file in [1] */ + + anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) + + get_lru_size(lruvec, LRU_INACTIVE_ANON); + file = get_lru_size(lruvec, LRU_ACTIVE_FILE) + + get_lru_size(lruvec, LRU_INACTIVE_FILE); + spin_lock_irq(&zone->lru_lock); if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { reclaim_stat->recent_scanned[0] /= 2; -- cgit v1.2.3 From aecd6f44266c13b8709245b21ded2d19291ab070 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 6 Aug 2014 16:08:05 -0700 Subject: mm: close race between do_fault_around() and fault_around_bytes_set() Things can go wrong if fault_around_bytes will be changed under do_fault_around(): between fault_around_mask() and fault_around_pages(). Let's read fault_around_bytes only once during do_fault_around() and calculate mask based on the reading. Note: fault_around_bytes can only be updated via debug interface. Also I've tried but was not able to trigger a bad behaviour without the patch. So I would not consider this patch as urgent. Signed-off-by: Kirill A. 
Shutemov Cc: Dave Hansen Cc: Andrey Ryabinin Cc: Sasha Levin Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 4d0a543f3bb3..dc47261c4686 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2768,16 +2768,6 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address, static unsigned long fault_around_bytes = rounddown_pow_of_two(65536); -static inline unsigned long fault_around_pages(void) -{ - return fault_around_bytes >> PAGE_SHIFT; -} - -static inline unsigned long fault_around_mask(void) -{ - return ~(fault_around_bytes - 1) & PAGE_MASK; -} - #ifdef CONFIG_DEBUG_FS static int fault_around_bytes_get(void *data, u64 *val) { @@ -2842,12 +2832,15 @@ late_initcall(fault_around_debugfs); static void do_fault_around(struct vm_area_struct *vma, unsigned long address, pte_t *pte, pgoff_t pgoff, unsigned int flags) { - unsigned long start_addr; + unsigned long start_addr, nr_pages, mask; pgoff_t max_pgoff; struct vm_fault vmf; int off; - start_addr = max(address & fault_around_mask(), vma->vm_start); + nr_pages = ACCESS_ONCE(fault_around_bytes) >> PAGE_SHIFT; + mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK; + + start_addr = max(address & mask, vma->vm_start); off = ((address - start_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); pte -= off; pgoff -= off; @@ -2859,7 +2852,7 @@ static void do_fault_around(struct vm_area_struct *vma, unsigned long address, max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + PTRS_PER_PTE - 1; max_pgoff = min3(max_pgoff, vma_pages(vma) + vma->vm_pgoff - 1, - pgoff + fault_around_pages() - 1); + pgoff + nr_pages - 1); /* Check if it makes any sense to call ->map_pages */ while (!pte_none(*pte)) { @@ -2894,7 +2887,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, * something). */ if (vma->vm_ops->map_pages && !(flags & FAULT_FLAG_NONLINEAR) && - fault_around_pages() > 1) { + fault_around_bytes >> PAGE_SHIFT > 1) { pte = pte_offset_map_lock(mm, pmd, address, &ptl); do_fault_around(vma, address, pte, pgoff, flags); if (!pte_same(*pte, orig_pte)) -- cgit v1.2.3 From 3a91053aebb23205caf67927be00c54cef6424b3 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 6 Aug 2014 16:08:07 -0700 Subject: mm: mark fault_around_bytes __read_mostly fault_around_bytes can only be changed via debugfs. Let's mark it read-mostly. Signed-off-by: Kirill A. 
Shutemov Suggested-by: David Rientjes Acked-by: David Rientjes Cc: Dave Hansen Cc: Andrey Ryabinin Cc: Sasha Levin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index dc47261c4686..5596d77e8656 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2766,7 +2766,8 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address, update_mmu_cache(vma, address, pte); } -static unsigned long fault_around_bytes = rounddown_pow_of_two(65536); +static unsigned long fault_around_bytes __read_mostly = + rounddown_pow_of_two(65536); #ifdef CONFIG_DEBUG_FS static int fault_around_bytes_get(void *data, u64 *val) -- cgit v1.2.3 From 68b5a652485682f67eacdee3deae640fb7845b63 Mon Sep 17 00:00:00 2001 From: Peter Feiner Date: Wed, 6 Aug 2014 16:08:09 -0700 Subject: mm: softdirty: respect VM_SOFTDIRTY in PTE holes After a VMA is created with the VM_SOFTDIRTY flag set, /proc/pid/pagemap should report that the VMA's virtual pages are soft-dirty until VM_SOFTDIRTY is cleared (i.e., by the next write of "4" to /proc/pid/clear_refs). However, pagemap ignores the VM_SOFTDIRTY flag for virtual addresses that fall in PTE holes (i.e., virtual addresses that don't have a PMD, PUD, or PGD allocated yet). To observe this bug, use mmap to create a VMA large enough such that there's a good chance that the VMA will occupy an unused PMD, then test the soft-dirty bit on its pages. In practice, I found that a VMA that covered a PMD's worth of address space was big enough. This patch adds the necessary VMA lookup to the PTE hole callback in /proc/pid/pagemap's page walk and sets soft-dirty according to the VMAs' VM_SOFTDIRTY flag. Signed-off-by: Peter Feiner Acked-by: Cyrill Gorcunov Cc: Pavel Emelyanov Cc: Hugh Dickins Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index cfa63ee92c96..dfc791c42d64 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -925,15 +925,30 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end, struct mm_walk *walk) { struct pagemapread *pm = walk->private; - unsigned long addr; + unsigned long addr = start; int err = 0; - pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2)); - for (addr = start; addr < end; addr += PAGE_SIZE) { - err = add_to_pagemap(addr, &pme, pm); - if (err) - break; + while (addr < end) { + struct vm_area_struct *vma = find_vma(walk->mm, addr); + pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2)); + unsigned long vm_end; + + if (!vma) { + vm_end = end; + } else { + vm_end = min(end, vma->vm_end); + if (vma->vm_flags & VM_SOFTDIRTY) + pme.pme |= PM_STATUS2(pm->v2, __PM_SOFT_DIRTY); + } + + for (; addr < vm_end; addr += PAGE_SIZE) { + err = add_to_pagemap(addr, &pme, pm); + if (err) + goto out; + } } + +out: return err; } -- cgit v1.2.3 From dbffcd03d77a3fb4d80a7981c7e589fc35769e9b Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 6 Aug 2014 16:08:12 -0700 Subject: mm: change confusing #ifdef use in __access_remote_vm This patch changes confusing #ifdef use in __access_remote_vm into merely ugly #ifdef use. 
Addresses bug https://bugzilla.kernel.org/show_bug.cgi?id=81651 Signed-off-by: Rik van Riel Reported-by: David Binderman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 5596d77e8656..5c55270729f7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3613,11 +3613,13 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, ret = get_user_pages(tsk, mm, addr, 1, write, 1, &page, &vma); if (ret <= 0) { +#ifndef CONFIG_HAVE_IOREMAP_PROT + break; +#else /* * Check if this is a VM_IO | VM_PFNMAP VMA, which * we can access using slightly different code. */ -#ifdef CONFIG_HAVE_IOREMAP_PROT vma = find_vma(mm, addr); if (!vma || vma->vm_start > addr) break; @@ -3625,9 +3627,9 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, ret = vma->vm_ops->access(vma, addr, buf, len, write); if (ret <= 0) -#endif break; bytes = ret; +#endif } else { bytes = len; offset = addr & (PAGE_SIZE-1); -- cgit v1.2.3 From 618fde872163e782183ce574c77f1123e2be8887 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Wed, 6 Aug 2014 16:08:14 -0700 Subject: kernel/smp.c:on_each_cpu_cond(): fix warning in fallback path The rarely-executed memory-allocation-failed callback path generates a WARN_ON_ONCE() when smp_call_function_single() succeeds. Presumably it's supposed to warn on failures. Signed-off-by: Sasha Levin Cc: Christoph Lameter Cc: Gilad Ben-Yossef Cc: David Rientjes Cc: Joonsoo Kim Cc: Tejun Heo Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/smp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/smp.c b/kernel/smp.c index 487653b5844f..aff8aa14f547 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -670,7 +670,7 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), if (cond_func(cpu, info)) { ret = smp_call_function_single(cpu, func, info, wait); - WARN_ON_ONCE(!ret); + WARN_ON_ONCE(ret); } preempt_enable(); } -- cgit v1.2.3 From 61e02c745721a361ba238e70bfa1c84a4df1a4b7 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 6 Aug 2014 16:08:16 -0700 Subject: mm: memcontrol: clean up reclaim size variable use in try_charge() Charge reclaim and OOM currently use the charge batch variable, but batching is already disabled at that point. To simplify the charge logic, the batch variable is reset to the original request size when reclaim is entered, so it's functionally equal, but it's misleading. Switch reclaim/OOM to nr_pages, which is the original request size. Signed-off-by: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a6a062e409eb..90dc501eaf3f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2612,7 +2612,7 @@ retry: nr_reclaimed = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); - if (mem_cgroup_margin(mem_over_limit) >= batch) + if (mem_cgroup_margin(mem_over_limit) >= nr_pages) goto retry; if (gfp_mask & __GFP_NORETRY) @@ -2626,7 +2626,7 @@ retry: * unlikely to succeed so close to the limit, and we fall back * to regular pages anyway in case of failure. */ - if (nr_reclaimed && batch <= (1 << PAGE_ALLOC_COSTLY_ORDER)) + if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER)) goto retry; /* * At task move, charge accounts can be doubly counted.
So, it's @@ -2644,7 +2644,7 @@ retry: if (fatal_signal_pending(current)) goto bypass; - mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(batch)); + mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages)); nomem: if (!(gfp_mask & __GFP_NOFAIL)) return -ENOMEM; -- cgit v1.2.3 From 1d352bfd41e8219cdf9bebe79677700bdc38b540 Mon Sep 17 00:00:00 2001 From: Chintan Pandya Date: Wed, 6 Aug 2014 16:08:18 -0700 Subject: mm: BUG when __kmap_atomic_idx equals KM_TYPE_NR __kmap_atomic_idx is a per-CPU variable. Each CPU can use KM_TYPE_NR entries from FIXMAP, i.e. from 0 to KM_TYPE_NR - 1. Allowing __kmap_atomic_idx to overshoot to KM_TYPE_NR can mess up the next CPU's 0th entry, which is a bug. Hence BUG_ON if __kmap_atomic_idx >= KM_TYPE_NR. Fix the off-by-one in this test. Signed-off-by: Chintan Pandya Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/highmem.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 7fb31da45d03..9286a46b7d69 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -93,7 +93,7 @@ static inline int kmap_atomic_idx_push(void) #ifdef CONFIG_DEBUG_HIGHMEM WARN_ON_ONCE(in_irq() && !irqs_disabled()); - BUG_ON(idx > KM_TYPE_NR); + BUG_ON(idx >= KM_TYPE_NR); #endif return idx; } -- cgit v1.2.3 From b972216e27d1c853eced33f8638926636c606341 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 6 Aug 2014 16:08:20 -0700 Subject: mmu_notifier: add call_srcu and sync function for listener to delay call and sync MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When kernel device drivers or subsystems want to bind their lifespan to the lifespan of the mm_struct, they usually use one of the following methods: 1. Manually calling a function in the interested kernel module. The function call needs to be placed in mmput. This method was rejected by several kernel maintainers. 2. Registering with the mmu notifier release mechanism. The problem with the latter approach is that the mmu_notifier_release callback is called from __mmu_notifier_release (called from exit_mmap). That function iterates over the list of mmu notifiers and doesn't expect the release callback function to remove itself from the list. Therefore, the callback function in the kernel module can't release the mmu_notifier object, which is actually the kernel module's object itself. As a result, the destruction of the kernel module's object must be done in a delayed fashion. This patch adds support for this delayed callback, by adding a new mmu_notifier_call_srcu function that receives a function ptr and calls that function with call_srcu. In that function, the kernel module releases its object. To use mmu_notifier_call_srcu, the calling module first needs to call a new function, mmu_notifier_unregister_no_release, which, as its name implies, unregisters a notifier without calling its notifier release callback. This patch also adds a function that will call srcu_barrier so those kernel modules can sync with mmu_notifier. 
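The intended usage pattern looks roughly like this (a sketch only; the my_listener structure and function names are illustrative, not part of the patch):

	struct my_listener {
		struct mmu_notifier mn;
		struct rcu_head rcu;
		/* driver state tied to the mm's lifetime ... */
	};

	static void my_listener_free(struct rcu_head *rcu)
	{
		kfree(container_of(rcu, struct my_listener, rcu));
	}

	static void my_listener_release(struct mmu_notifier *mn,
					struct mm_struct *mm)
	{
		struct my_listener *l =
			container_of(mn, struct my_listener, mn);

		/*
		 * Freeing l directly here would be a use-after-free:
		 * __mmu_notifier_release is still iterating the notifier
		 * list. Defer the free past the SRCU grace period instead.
		 */
		mmu_notifier_call_srcu(&l->rcu, my_listener_free);
	}

For teardown initiated by the module itself rather than by exit_mmap, the module would call mmu_notifier_unregister_no_release() first and then hand its object to mmu_notifier_call_srcu() the same way; mmu_notifier_synchronize() lets module unload wait for callbacks still in flight.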
Signed-off-by: Peter Zijlstra Signed-off-by: Jérôme Glisse Signed-off-by: Oded Gabbay Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmu_notifier.h | 6 ++++++ mm/mmu_notifier.c | 40 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index deca87452528..27288692241e 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -170,6 +170,8 @@ extern int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm); extern void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm); +extern void mmu_notifier_unregister_no_release(struct mmu_notifier *mn, + struct mm_struct *mm); extern void __mmu_notifier_mm_destroy(struct mm_struct *mm); extern void __mmu_notifier_release(struct mm_struct *mm); extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm, @@ -288,6 +290,10 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) set_pte_at(___mm, ___address, __ptep, ___pte); \ }) +extern void mmu_notifier_call_srcu(struct rcu_head *rcu, + void (*func)(struct rcu_head *rcu)); +extern void mmu_notifier_synchronize(void); + #else /* CONFIG_MMU_NOTIFIER */ static inline void mmu_notifier_release(struct mm_struct *mm) diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 41cefdf0aadd..950813b1eb36 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -22,6 +22,25 @@ /* global SRCU for all MMs */ static struct srcu_struct srcu; +/* + * This function allows mmu_notifier::release callback to delay a call to + * a function that will free appropriate resources. The function must be + * quick and must not block. + */ +void mmu_notifier_call_srcu(struct rcu_head *rcu, + void (*func)(struct rcu_head *rcu)) +{ + call_srcu(&srcu, rcu, func); +} +EXPORT_SYMBOL_GPL(mmu_notifier_call_srcu); + +void mmu_notifier_synchronize(void) +{ + /* Wait for any running method to finish. */ + srcu_barrier(&srcu); +} +EXPORT_SYMBOL_GPL(mmu_notifier_synchronize); + /* * This function can't run concurrently against mmu_notifier_register * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap @@ -53,7 +72,6 @@ void __mmu_notifier_release(struct mm_struct *mm) */ if (mn->ops->release) mn->ops->release(mn, mm); - srcu_read_unlock(&srcu, id); spin_lock(&mm->mmu_notifier_mm->lock); while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { @@ -69,6 +87,7 @@ void __mmu_notifier_release(struct mm_struct *mm) hlist_del_init_rcu(&mn->hlist); } spin_unlock(&mm->mmu_notifier_mm->lock); + srcu_read_unlock(&srcu, id); /* * synchronize_srcu here prevents mmu_notifier_release from returning to @@ -325,6 +344,25 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) } EXPORT_SYMBOL_GPL(mmu_notifier_unregister); +/* + * Same as mmu_notifier_unregister but no callback and no srcu synchronization. + */ +void mmu_notifier_unregister_no_release(struct mmu_notifier *mn, + struct mm_struct *mm) +{ + spin_lock(&mm->mmu_notifier_mm->lock); + /* + * Can not use list_del_rcu() since __mmu_notifier_release + * can delete it before we hold the lock. 
+ */ + hlist_del_init_rcu(&mn->hlist); + spin_unlock(&mm->mmu_notifier_mm->lock); + + BUG_ON(atomic_read(&mm->mm_count) <= 0); + mmdrop(mm); +} +EXPORT_SYMBOL_GPL(mmu_notifier_unregister_no_release); + static int __init mmu_notifier_init(void) { return init_srcu_struct(&srcu); -- cgit v1.2.3 From 15de36a4c3cf33aa4e194bfbff002048aa4a21c3 Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Wed, 6 Aug 2014 16:08:23 -0700 Subject: mm/highmem: make kmap cache coloring aware User-visible effect: Architectures that choose this method of maintaining cache coherency (MIPS and xtensa currently) are able to use high memory on cores with an aliasing data cache. Without this fix such architectures cannot use high memory (in the case of xtensa it means that at most 128 MBytes of physical memory is available). The problem: A VIPT cache with a way size larger than the MMU page size may suffer from an aliasing problem: a single physical address accessed via different virtual addresses may end up in multiple locations in the cache. Virtual mappings of a physical address that always get cached in different cache locations are said to have different colors. L1 caching hardware usually doesn't handle this situation, leaving it up to software. Software must avoid this situation as it leads to data corruption. What can be done: One way to handle this is to flush and invalidate the data cache every time a page mapping changes color. The other way is to always map a physical page at a virtual address with the same color. Low memory pages already have this property. Giving the architecture a way to control the color of high memory page mappings allows reuse of the existing low memory cache alias handling code. How this is done with this patch: Provide hooks that allow architectures with an aliasing cache to align the mapping address of high pages according to their color. Such architectures may enforce similar coloring of low- and high-memory page mappings and reuse existing cache management functions to support highmem. This code is based on the implementation of a similar feature for MIPS by Leonid Yegoshin. Signed-off-by: Max Filippov Cc: Leonid Yegoshin Cc: Chris Zankel Cc: Marc Gauthier Cc: David Rientjes Cc: Steven Hill Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/highmem.c | 86 ++++++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 75 insertions(+), 11 deletions(-) diff --git a/mm/highmem.c b/mm/highmem.c index b32b70cdaed6..123bcd3ed4f2 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -44,6 +44,66 @@ DEFINE_PER_CPU(int, __kmap_atomic_idx); */ #ifdef CONFIG_HIGHMEM +/* + * Architecture with aliasing data cache may define the following family of + * helper functions in its asm/highmem.h to control cache color of virtual + * addresses where physical memory pages are mapped by kmap. + */ +#ifndef get_pkmap_color + +/* + * Determine color of virtual address where the page should be mapped. + */ +static inline unsigned int get_pkmap_color(struct page *page) +{ + return 0; +} +#define get_pkmap_color get_pkmap_color + +/* + * Get next index for mapping inside PKMAP region for page with given color. + */ +static inline unsigned int get_next_pkmap_nr(unsigned int color) +{ + static unsigned int last_pkmap_nr; + + last_pkmap_nr = (last_pkmap_nr + 1) & LAST_PKMAP_MASK; + return last_pkmap_nr; } + +/* + * Determine if page index inside PKMAP region (pkmap_nr) of given color + * has wrapped around PKMAP region end. When this happens an attempt to + * flush all unused PKMAP slots is made. 
+ */ +static inline int no_more_pkmaps(unsigned int pkmap_nr, unsigned int color) +{ + return pkmap_nr == 0; +} + +/* + * Get the number of PKMAP entries of the given color. If no free slot is + * found after checking that many entries, kmap will sleep waiting for + * someone to call kunmap and free PKMAP slot. + */ +static inline int get_pkmap_entries_count(unsigned int color) +{ + return LAST_PKMAP; +} + +/* + * Get head of a wait queue for PKMAP entries of the given color. + * Wait queues for different mapping colors should be independent to avoid + * unnecessary wakeups caused by freeing of slots of other colors. + */ +static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color) +{ + static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait); + + return &pkmap_map_wait; +} +#endif + unsigned long totalhigh_pages __read_mostly; EXPORT_SYMBOL(totalhigh_pages); @@ -68,13 +128,10 @@ unsigned int nr_free_highpages (void) } static int pkmap_count[LAST_PKMAP]; -static unsigned int last_pkmap_nr; static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock); pte_t * pkmap_page_table; -static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait); - /* * Most architectures have no use for kmap_high_get(), so let's abstract * the disabling of IRQ out of the locking in that case to save on a @@ -161,15 +218,17 @@ static inline unsigned long map_new_virtual(struct page *page) { unsigned long vaddr; int count; + unsigned int last_pkmap_nr; + unsigned int color = get_pkmap_color(page); start: - count = LAST_PKMAP; + count = get_pkmap_entries_count(color); /* Find an empty entry */ for (;;) { - last_pkmap_nr = (last_pkmap_nr + 1) & LAST_PKMAP_MASK; - if (!last_pkmap_nr) { + last_pkmap_nr = get_next_pkmap_nr(color); + if (no_more_pkmaps(last_pkmap_nr, color)) { flush_all_zero_pkmaps(); - count = LAST_PKMAP; + count = get_pkmap_entries_count(color); } if (!pkmap_count[last_pkmap_nr]) break; /* Found a usable entry */ @@ -181,12 +240,14 @@ start: */ { DECLARE_WAITQUEUE(wait, current); + wait_queue_head_t *pkmap_map_wait = + get_pkmap_wait_queue_head(color); __set_current_state(TASK_UNINTERRUPTIBLE); - add_wait_queue(&pkmap_map_wait, &wait); + add_wait_queue(pkmap_map_wait, &wait); unlock_kmap(); schedule(); - remove_wait_queue(&pkmap_map_wait, &wait); + remove_wait_queue(pkmap_map_wait, &wait); lock_kmap(); /* Somebody else might have mapped it while we slept */ @@ -274,6 +335,8 @@ void kunmap_high(struct page *page) unsigned long nr; unsigned long flags; int need_wakeup; + unsigned int color = get_pkmap_color(page); + wait_queue_head_t *pkmap_map_wait; lock_kmap_any(flags); vaddr = (unsigned long)page_address(page); @@ -299,13 +362,14 @@ void kunmap_high(struct page *page) * no need for the wait-queue-head's lock. Simply * test if the queue is empty. */ - need_wakeup = waitqueue_active(&pkmap_map_wait); + pkmap_map_wait = get_pkmap_wait_queue_head(color); + need_wakeup = waitqueue_active(pkmap_map_wait); } unlock_kmap_any(flags); /* do wake-up, if needed, race-free outside of the spin lock */ if (need_wakeup) - wake_up(&pkmap_map_wait); + wake_up(pkmap_map_wait); } EXPORT_SYMBOL(kunmap_high); -- cgit v1.2.3 From cb8f2eec3c5c87e31219c5e58625b8e890004e48 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 6 Aug 2014 16:08:25 -0700 Subject: zram: rename struct `table' to `zram_table_entry' Andrew Morton has recently noted that `struct table' actually represents a table entry and, thus, should be renamed. Rename to `zram_table_entry'. 
Signed-off-by: Sergey Senozhatsky Cc: Minchan Kim Cc: Nitin Gupta Cc: Weijie Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 7f21c145e317..8909f86caf0d 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -62,7 +62,7 @@ enum zram_pageflags { /*-- Data structures */ /* Allocated for each disk page */ -struct table { +struct zram_table_entry { unsigned long handle; u16 size; /* object size (excluding header) */ u8 flags; @@ -82,7 +82,7 @@ struct zram_stats { struct zram_meta { rwlock_t tb_lock; /* protect table */ - struct table *table; + struct zram_table_entry *table; struct zs_pool *mem_pool; }; -- cgit v1.2.3 From a830eff749eb2bf906783f6bf74a74dad3de3aea Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 6 Aug 2014 16:08:27 -0700 Subject: zram: remove unused SECTOR_SIZE define Drop the SECTOR_SIZE define, because it's not used. Signed-off-by: Sergey Senozhatsky Cc: Minchan Kim Cc: Nitin Gupta Cc: Weijie Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.h | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 8909f86caf0d..c8161bd8969c 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -43,7 +43,6 @@ static const size_t max_zpage_size = PAGE_SIZE / 4 * 3; /*-- End of configurable params */ #define SECTOR_SHIFT 9 -#define SECTOR_SIZE (1 << SECTOR_SHIFT) #define SECTORS_PER_PAGE_SHIFT (PAGE_SHIFT - SECTOR_SHIFT) #define SECTORS_PER_PAGE (1 << SECTORS_PER_PAGE_SHIFT) #define ZRAM_LOGICAL_BLOCK_SHIFT 12 -- cgit v1.2.3 From 023b409f9dac4cdea3322009f2e592068558690c Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 6 Aug 2014 16:08:29 -0700 Subject: zram: use size_t instead of u16 Some architectures (e.g., hexagon and PowerPC) could use a PAGE_SHIFT of 16 or more. In these cases u16 is not sufficiently large to represent a compressed page's size, so use size_t. Signed-off-by: Minchan Kim Reported-by: Weijie Yang Acked-by: Sergey Senozhatsky Cc: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 36e54be402df..40743972eaf7 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -337,7 +337,7 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index) unsigned char *cmem; struct zram_meta *meta = zram->meta; unsigned long handle; - u16 size; + size_t size; read_lock(&meta->tb_lock); handle = meta->table[index].handle; -- cgit v1.2.3 From d2d5e762c8990c4031890e03565983a05febd64a Mon Sep 17 00:00:00 2001 From: Weijie Yang Date: Wed, 6 Aug 2014 16:08:31 -0700 Subject: zram: replace global tb_lock with fine grain lock Currently, we use an rwlock, tb_lock, to protect concurrent access to the whole zram meta table. However, according to the actual access model, there is only a small chance that upper-layer users access the same table[index], so the current lock granularity is too big. The idea of the optimization is to change the lock granularity from the whole meta table to the per table entry (table -> table[index]), so that we can protect concurrent access to the same table[index], while allowing maximum concurrency. 
With this in mind, several kinds of locks which could be used as a per-entry lock were tested and compared: Test environment: x86-64 Intel Core2 Q8400, system memory 4GB, Ubuntu 12.04, kernel v3.15.0-rc3 as base, zram with 4 max_comp_streams LZO. iozone test: iozone -t 4 -R -r 16K -s 200M -I +Z (1GB zram with ext4 filesystem, take the average of 10 tests, KB/s)

Test            base     CAS       spinlock  rwlock    bit_spinlock
-------------------------------------------------------------------
Initial write   1381094  1425435   1422860   1423075   1421521
Rewrite         1529479  1641199   1668762   1672855   1654910
Read            8468009  11324979  11305569  11117273  10997202
Re-read         8467476  11260914  11248059  11145336  10906486
Reverse Read    6821393  8106334   8282174   8279195   8109186
Stride read     7191093  8994306   9153982   8961224   9004434
Random read     7156353  8957932   9167098   8980465   8940476
Mixed workload  4172747  5680814   5927825   5489578   5972253
Random write    1483044  1605588   1594329   1600453   1596010
Pwrite          1276644  1303108   1311612   1314228   1300960
Pread           4324337  4632869   4618386   4457870   4500166

To increase the chance of accessing the same table[index] concurrently, set zram to a small disksize (10MB) and let the threads run with a large loop count. fio test: fio --bs=32k --randrepeat=1 --randseed=100 --refill_buffers --scramble_buffers=1 --direct=1 --loops=3000 --numjobs=4 --filename=/dev/zram0 --name=seq-write --rw=write --stonewall --name=seq-read --rw=read --stonewall --name=seq-readwrite --rw=rw --stonewall --name=rand-readwrite --rw=randrw --stonewall (10MB zram raw block device, take the average of 10 tests, KB/s)

Test       base     CAS      spinlock  rwlock   bit_spinlock
-------------------------------------------------------------
seq-write  933789   999357   1003298   995961   1001958
seq-read   5634130  6577930  6380861   6243912  6230006
seq-rw     1405687  1638117  1640256   1633903  1634459
rand-rw    1386119  1614664  1617211   1609267  1612471

All the optimization methods show higher performance than the base; however, it is hard to say which method is the most appropriate. On the other hand, zram is mostly used on small embedded systems, so we don't want to increase the memory footprint. This patch picks the bit_spinlock method, packing the object size and page flags into an unsigned long table.value, so as not to add any memory overhead on either 32-bit or 64-bit systems. Finally, even though different kinds of locks have different performances, we can ignore this difference, because: if zram is used as a swap device, the swap subsystem can prevent concurrent access to the same swap slot; if zram is used as a block device with a filesystem set up on it, the upper filesystem and the page cache also mostly prevent concurrent access to the same block. So we can ignore the performance differences among the locks. 
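The packing that makes this free can be sketched in isolation (a simplified model of the scheme in the patch below, not the driver code itself): the low ZRAM_FLAG_SHIFT bits of table.value hold the object size, the bits above hold the page flags, and one flag bit doubles as the per-entry lock for bit_spin_lock():

	#include <linux/bit_spinlock.h>
	#include <linux/bitops.h>

	#define FLAG_SHIFT	24			/* low 24 bits: object size */
	#define ACCESS_BIT	(FLAG_SHIFT + 2)	/* flag bit reused as the lock */

	struct table_entry {
		unsigned long handle;
		unsigned long value;	/* flags << FLAG_SHIFT | size */
	};

	/* size must fit in the low FLAG_SHIFT bits, i.e. size < BIT(FLAG_SHIFT) */
	static void update_entry_size(struct table_entry *e, size_t size)
	{
		/* serializes access to this one entry; other entries stay uncontended */
		bit_spin_lock(ACCESS_BIT, &e->value);
		e->value = (e->value >> FLAG_SHIFT << FLAG_SHIFT) | size;
		bit_spin_unlock(ACCESS_BIT, &e->value);
	}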
Acked-by: Sergey Senozhatsky Reviewed-by: Davidlohr Bueso Signed-off-by: Weijie Yang Signed-off-by: Minchan Kim Cc: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 69 ++++++++++++++++++++++++++----------------- drivers/block/zram/zram_drv.h | 24 +++++++++++---- 2 files changed, 60 insertions(+), 33 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 40743972eaf7..dfa4024c448a 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -183,19 +183,32 @@ static ssize_t comp_algorithm_store(struct device *dev, static int zram_test_flag(struct zram_meta *meta, u32 index, enum zram_pageflags flag) { - return meta->table[index].flags & BIT(flag); + return meta->table[index].value & BIT(flag); } static void zram_set_flag(struct zram_meta *meta, u32 index, enum zram_pageflags flag) { - meta->table[index].flags |= BIT(flag); + meta->table[index].value |= BIT(flag); } static void zram_clear_flag(struct zram_meta *meta, u32 index, enum zram_pageflags flag) { - meta->table[index].flags &= ~BIT(flag); + meta->table[index].value &= ~BIT(flag); +} + +static size_t zram_get_obj_size(struct zram_meta *meta, u32 index) +{ + return meta->table[index].value & (BIT(ZRAM_FLAG_SHIFT) - 1); +} + +static void zram_set_obj_size(struct zram_meta *meta, + u32 index, size_t size) +{ + unsigned long flags = meta->table[index].value >> ZRAM_FLAG_SHIFT; + + meta->table[index].value = (flags << ZRAM_FLAG_SHIFT) | size; } static inline int is_partial_io(struct bio_vec *bvec) @@ -255,7 +268,6 @@ static struct zram_meta *zram_meta_alloc(u64 disksize) goto free_table; } - rwlock_init(&meta->tb_lock); return meta; free_table: @@ -304,7 +316,12 @@ static void handle_zero_page(struct bio_vec *bvec) flush_dcache_page(page); } -/* NOTE: caller should hold meta->tb_lock with write-side */ + +/* + * To protect concurrent access to the same index entry, + * caller should hold this table index entry's bit_spinlock to + * indicate this index entry is accessing. + */ static void zram_free_page(struct zram *zram, size_t index) { struct zram_meta *meta = zram->meta; @@ -324,11 +341,12 @@ static void zram_free_page(struct zram *zram, size_t index) zs_free(meta->mem_pool, handle); - atomic64_sub(meta->table[index].size, &zram->stats.compr_data_size); + atomic64_sub(zram_get_obj_size(meta, index), + &zram->stats.compr_data_size); atomic64_dec(&zram->stats.pages_stored); meta->table[index].handle = 0; - meta->table[index].size = 0; + zram_set_obj_size(meta, index, 0); } static int zram_decompress_page(struct zram *zram, char *mem, u32 index) @@ -339,12 +357,12 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index) unsigned long handle; size_t size; - read_lock(&meta->tb_lock); + bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); handle = meta->table[index].handle; - size = meta->table[index].size; + size = zram_get_obj_size(meta, index); if (!handle || zram_test_flag(meta, index, ZRAM_ZERO)) { - read_unlock(&meta->tb_lock); + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); clear_page(mem); return 0; } @@ -355,7 +373,7 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index) else ret = zcomp_decompress(zram->comp, cmem, size, mem); zs_unmap_object(meta->mem_pool, handle); - read_unlock(&meta->tb_lock); + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); /* Should NEVER happen. Return bio error if it does. 
*/ if (unlikely(ret)) { @@ -376,14 +394,14 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, struct zram_meta *meta = zram->meta; page = bvec->bv_page; - read_lock(&meta->tb_lock); + bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); if (unlikely(!meta->table[index].handle) || zram_test_flag(meta, index, ZRAM_ZERO)) { - read_unlock(&meta->tb_lock); + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); handle_zero_page(bvec); return 0; } - read_unlock(&meta->tb_lock); + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); if (is_partial_io(bvec)) /* Use a temporary buffer to decompress the page */ @@ -461,10 +479,10 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, if (page_zero_filled(uncmem)) { kunmap_atomic(user_mem); /* Free memory associated with this sector now. */ - write_lock(&zram->meta->tb_lock); + bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); zram_free_page(zram, index); zram_set_flag(meta, index, ZRAM_ZERO); - write_unlock(&zram->meta->tb_lock); + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); atomic64_inc(&zram->stats.zero_pages); ret = 0; @@ -514,12 +532,12 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, * Free memory associated with this sector * before overwriting unused sectors. */ - write_lock(&zram->meta->tb_lock); + bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); zram_free_page(zram, index); meta->table[index].handle = handle; - meta->table[index].size = clen; - write_unlock(&zram->meta->tb_lock); + zram_set_obj_size(meta, index, clen); + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); /* Update stats */ atomic64_add(clen, &zram->stats.compr_data_size); @@ -560,6 +578,7 @@ static void zram_bio_discard(struct zram *zram, u32 index, int offset, struct bio *bio) { size_t n = bio->bi_iter.bi_size; + struct zram_meta *meta = zram->meta; /* * zram manages data in physical block size units. Because logical block @@ -580,13 +599,9 @@ static void zram_bio_discard(struct zram *zram, u32 index, } while (n >= PAGE_SIZE) { - /* - * Discard request can be large so the lock hold times could be - * lengthy. So take the lock once per page. - */ - write_lock(&zram->meta->tb_lock); + bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); zram_free_page(zram, index); - write_unlock(&zram->meta->tb_lock); + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); index++; n -= PAGE_SIZE; } @@ -821,9 +836,9 @@ static void zram_slot_free_notify(struct block_device *bdev, zram = bdev->bd_disk->private_data; meta = zram->meta; - write_lock(&meta->tb_lock); + bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); zram_free_page(zram, index); - write_unlock(&meta->tb_lock); + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); atomic64_inc(&zram->stats.notify_free); } diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index c8161bd8969c..5b0afde729cd 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -50,10 +50,24 @@ static const size_t max_zpage_size = PAGE_SIZE / 4 * 3; #define ZRAM_SECTOR_PER_LOGICAL_BLOCK \ (1 << (ZRAM_LOGICAL_BLOCK_SHIFT - SECTOR_SHIFT)) -/* Flags for zram pages (table[page_no].flags) */ + +/* + * The lower ZRAM_FLAG_SHIFT bits of table.value is for + * object size (excluding header), the higher bits is for + * zram_pageflags. + * + * zram is mainly used for memory efficiency so we want to keep memory + * footprint small so we can squeeze size and flags into a field. 
+ * The lower ZRAM_FLAG_SHIFT bits is for object size (excluding header), + * the higher bits is for zram_pageflags. + */ +#define ZRAM_FLAG_SHIFT 24 + +/* Flags for zram pages (table[page_no].value) */ enum zram_pageflags { /* Page consists entirely of zeros */ - ZRAM_ZERO, + ZRAM_ZERO = ZRAM_FLAG_SHIFT + 1, + ZRAM_ACCESS, /* page in now accessed */ __NR_ZRAM_PAGEFLAGS, }; @@ -63,9 +77,8 @@ enum zram_pageflags { /* Allocated for each disk page */ struct zram_table_entry { unsigned long handle; - u16 size; /* object size (excluding header) */ - u8 flags; -} __aligned(4); + unsigned long value; +}; struct zram_stats { atomic64_t compr_data_size; /* compressed size of pages stored */ @@ -80,7 +93,6 @@ struct zram_stats { }; struct zram_meta { - rwlock_t tb_lock; /* protect table */ struct zram_table_entry *table; struct zs_pool *mem_pool; }; -- cgit v1.2.3 From 99eef8e9369abe009006b4fa7f6ca5086c09cf46 Mon Sep 17 00:00:00 2001 From: Dan Streetman Date: Wed, 6 Aug 2014 16:08:33 -0700 Subject: mm/zbud: change zbud_alloc size type to size_t Change the type of the zbud_alloc() size param from unsigned int to size_t. Technically, this should not make any difference, as the zbud implementation already restricts the size to well within either type's limits; but as zsmalloc (and kmalloc) use size_t, and zpool will use size_t, this brings the size parameter type in line with zsmalloc/zpool. Signed-off-by: Dan Streetman Acked-by: Seth Jennings Tested-by: Seth Jennings Cc: Weijie Yang Cc: Minchan Kim Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/zbud.h | 2 +- mm/zbud.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/zbud.h b/include/linux/zbud.h index 13af0d450bf6..f9d41a6e361f 100644 --- a/include/linux/zbud.h +++ b/include/linux/zbud.h @@ -11,7 +11,7 @@ struct zbud_ops { struct zbud_pool *zbud_create_pool(gfp_t gfp, struct zbud_ops *ops); void zbud_destroy_pool(struct zbud_pool *pool); -int zbud_alloc(struct zbud_pool *pool, unsigned int size, gfp_t gfp, +int zbud_alloc(struct zbud_pool *pool, size_t size, gfp_t gfp, unsigned long *handle); void zbud_free(struct zbud_pool *pool, unsigned long handle); int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries); diff --git a/mm/zbud.c b/mm/zbud.c index 01df13a7e2e1..d01226117b8d 100644 --- a/mm/zbud.c +++ b/mm/zbud.c @@ -122,7 +122,7 @@ enum buddy { }; /* Converts an allocation size in bytes to size in zbud chunks */ -static int size_to_chunks(int size) +static int size_to_chunks(size_t size) { return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT; } @@ -247,7 +247,7 @@ void zbud_destroy_pool(struct zbud_pool *pool) * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate * a new page. */ -int zbud_alloc(struct zbud_pool *pool, unsigned int size, gfp_t gfp, +int zbud_alloc(struct zbud_pool *pool, size_t size, gfp_t gfp, unsigned long *handle) { int chunks, i, freechunks; -- cgit v1.2.3 From af8d417a04564bca0348e7e3c749ab12a3e837ad Mon Sep 17 00:00:00 2001 From: Dan Streetman Date: Wed, 6 Aug 2014 16:08:36 -0700 Subject: mm/zpool: implement common zpool api to zbud/zsmalloc Add zpool api. zpool provides an interface for memory storage, typically of compressed memory. Users can select what backend to use; currently the only implementations are zbud, a low density implementation with up to two compressed pages per storage page, and zsmalloc, a higher density implementation with multiple compressed pages per storage page. 
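A rough sketch of a zpool user, based on the API added below (my_evict and example are illustrative names; error handling trimmed):

	static int my_evict(struct zpool *pool, unsigned long handle)
	{
		/* write back or drop the object behind handle ... */
		return 0;
	}

	/* the ops/evict wiring is only needed if the pool should be shrinkable */
	static struct zpool_ops my_ops = { .evict = my_evict };

	static int example(void)
	{
		struct zpool *pool;
		unsigned long handle;
		void *buf;

		pool = zpool_create_pool("zbud", GFP_KERNEL, &my_ops);
		if (!pool)
			return -ENOMEM;

		if (zpool_malloc(pool, 128, GFP_KERNEL, &handle) == 0) {
			/* write-only mapping: no copy-in, fine for initialization */
			buf = zpool_map_handle(pool, handle, ZPOOL_MM_WO);
			memset(buf, 0, 128);
			zpool_unmap_handle(pool, handle);
			zpool_free(pool, handle);
		}

		zpool_destroy_pool(pool);
		return 0;
	}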
Signed-off-by: Dan Streetman Tested-by: Seth Jennings Cc: Minchan Kim Cc: Nitin Gupta Cc: Weijie Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/zpool.h | 106 +++++++++++++++ mm/Kconfig | 41 +++--- mm/Makefile | 1 + mm/zpool.c | 364 ++++++++++++++++++++++++++++++++++++++++++++++++++ mm/zsmalloc.c | 1 - 5 files changed, 495 insertions(+), 18 deletions(-) create mode 100644 include/linux/zpool.h create mode 100644 mm/zpool.c diff --git a/include/linux/zpool.h b/include/linux/zpool.h new file mode 100644 index 000000000000..f14bd75f08b3 --- /dev/null +++ b/include/linux/zpool.h @@ -0,0 +1,106 @@ +/* + * zpool memory storage api + * + * Copyright (C) 2014 Dan Streetman + * + * This is a common frontend for the zbud and zsmalloc memory + * storage pool implementations. Typically, this is used to + * store compressed memory. + */ + +#ifndef _ZPOOL_H_ +#define _ZPOOL_H_ + +struct zpool; + +struct zpool_ops { + int (*evict)(struct zpool *pool, unsigned long handle); +}; + +/* + * Control how a handle is mapped. It will be ignored if the + * implementation does not support it. Its use is optional. + * Note that this does not refer to memory protection, it + * refers to how the memory will be copied in/out if copying + * is necessary during mapping; read-write is the safest as + * it copies the existing memory in on map, and copies the + * changed memory back out on unmap. Write-only does not copy + * in the memory and should only be used for initialization. + * If in doubt, use ZPOOL_MM_DEFAULT which is read-write. + */ +enum zpool_mapmode { + ZPOOL_MM_RW, /* normal read-write mapping */ + ZPOOL_MM_RO, /* read-only (no copy-out at unmap time) */ + ZPOOL_MM_WO, /* write-only (no copy-in at map time) */ + + ZPOOL_MM_DEFAULT = ZPOOL_MM_RW +}; + +struct zpool *zpool_create_pool(char *type, gfp_t gfp, struct zpool_ops *ops); + +char *zpool_get_type(struct zpool *pool); + +void zpool_destroy_pool(struct zpool *pool); + +int zpool_malloc(struct zpool *pool, size_t size, gfp_t gfp, + unsigned long *handle); + +void zpool_free(struct zpool *pool, unsigned long handle); + +int zpool_shrink(struct zpool *pool, unsigned int pages, + unsigned int *reclaimed); + +void *zpool_map_handle(struct zpool *pool, unsigned long handle, + enum zpool_mapmode mm); + +void zpool_unmap_handle(struct zpool *pool, unsigned long handle); + +u64 zpool_get_total_size(struct zpool *pool); + + +/** + * struct zpool_driver - driver implementation for zpool + * @type: name of the driver. + * @list: entry in the list of zpool drivers. + * @create: create a new pool. + * @destroy: destroy a pool. + * @malloc: allocate mem from a pool. + * @free: free mem from a pool. + * @shrink: shrink the pool. + * @map: map a handle. + * @unmap: unmap a handle. + * @total_size: get total size of a pool. + * + * This is created by a zpool implementation and registered + * with zpool. 
+ */ +struct zpool_driver { + char *type; + struct module *owner; + atomic_t refcount; + struct list_head list; + + void *(*create)(gfp_t gfp, struct zpool_ops *ops); + void (*destroy)(void *pool); + + int (*malloc)(void *pool, size_t size, gfp_t gfp, + unsigned long *handle); + void (*free)(void *pool, unsigned long handle); + + int (*shrink)(void *pool, unsigned int pages, + unsigned int *reclaimed); + + void *(*map)(void *pool, unsigned long handle, + enum zpool_mapmode mm); + void (*unmap)(void *pool, unsigned long handle); + + u64 (*total_size)(void *pool); +}; + +void zpool_register_driver(struct zpool_driver *driver); + +int zpool_unregister_driver(struct zpool_driver *driver); + +int zpool_evict(void *pool, unsigned long handle); + +#endif diff --git a/mm/Kconfig b/mm/Kconfig index f4899ec39cf4..12179b8c3b89 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -519,15 +519,17 @@ config CMA_AREAS If unsure, leave the default value "7". -config ZBUD - tristate - default n +config MEM_SOFT_DIRTY + bool "Track memory changes" + depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY && PROC_FS + select PROC_PAGE_MONITOR help - A special purpose allocator for storing compressed pages. - It is designed to store up to two compressed pages per physical - page. While this design limits storage density, it has simple and - deterministic reclaim properties that make it preferable to a higher - density approach when reclaim will be used. + This option enables memory changes tracking by introducing a + soft-dirty bit on pte-s. This bit it set when someone writes + into a page just as regular dirty bit, but unlike the latter + it can be cleared by hands. + + See Documentation/vm/soft-dirty.txt for more details. config ZSWAP bool "Compressed cache for swap pages (EXPERIMENTAL)" @@ -549,17 +551,22 @@ config ZSWAP they have not be fully explored on the large set of potential configurations and workloads that exist. -config MEM_SOFT_DIRTY - bool "Track memory changes" - depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY && PROC_FS - select PROC_PAGE_MONITOR +config ZPOOL + tristate "Common API for compressed memory storage" + default n help - This option enables memory changes tracking by introducing a - soft-dirty bit on pte-s. This bit it set when someone writes - into a page just as regular dirty bit, but unlike the latter - it can be cleared by hands. + Compressed memory storage API. This allows using either zbud or + zsmalloc. - See Documentation/vm/soft-dirty.txt for more details. +config ZBUD + tristate "Low density storage for compressed pages" + default n + help + A special purpose allocator for storing compressed pages. + It is designed to store up to two compressed pages per physical + page. While this design limits storage density, it has simple and + deterministic reclaim properties that make it preferable to a higher + density approach when reclaim will be used. 
config ZSMALLOC tristate "Memory allocator for compressed pages" diff --git a/mm/Makefile b/mm/Makefile index 8338473c329a..632ae77e6070 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -59,6 +59,7 @@ obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o obj-$(CONFIG_CLEANCACHE) += cleancache.o obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o +obj-$(CONFIG_ZPOOL) += zpool.o obj-$(CONFIG_ZBUD) += zbud.o obj-$(CONFIG_ZSMALLOC) += zsmalloc.o obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o diff --git a/mm/zpool.c b/mm/zpool.c new file mode 100644 index 000000000000..e40612a1df00 --- /dev/null +++ b/mm/zpool.c @@ -0,0 +1,364 @@ +/* + * zpool memory storage api + * + * Copyright (C) 2014 Dan Streetman + * + * This is a common frontend for memory storage pool implementations. + * Typically, this is used to store compressed memory. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include + +struct zpool { + char *type; + + struct zpool_driver *driver; + void *pool; + struct zpool_ops *ops; + + struct list_head list; +}; + +static LIST_HEAD(drivers_head); +static DEFINE_SPINLOCK(drivers_lock); + +static LIST_HEAD(pools_head); +static DEFINE_SPINLOCK(pools_lock); + +/** + * zpool_register_driver() - register a zpool implementation. + * @driver: driver to register + */ +void zpool_register_driver(struct zpool_driver *driver) +{ + spin_lock(&drivers_lock); + atomic_set(&driver->refcount, 0); + list_add(&driver->list, &drivers_head); + spin_unlock(&drivers_lock); +} +EXPORT_SYMBOL(zpool_register_driver); + +/** + * zpool_unregister_driver() - unregister a zpool implementation. + * @driver: driver to unregister. + * + * Module usage counting is used to prevent using a driver + * while/after unloading, so if this is called from module + * exit function, this should never fail; if called from + * other than the module exit function, and this returns + * failure, the driver is in use and must remain available. + */ +int zpool_unregister_driver(struct zpool_driver *driver) +{ + int ret = 0, refcount; + + spin_lock(&drivers_lock); + refcount = atomic_read(&driver->refcount); + WARN_ON(refcount < 0); + if (refcount > 0) + ret = -EBUSY; + else + list_del(&driver->list); + spin_unlock(&drivers_lock); + + return ret; +} +EXPORT_SYMBOL(zpool_unregister_driver); + +/** + * zpool_evict() - evict callback from a zpool implementation. + * @pool: pool to evict from. + * @handle: handle to evict. + * + * This can be used by zpool implementations to call the + * user's evict zpool_ops struct evict callback. + */ +int zpool_evict(void *pool, unsigned long handle) +{ + struct zpool *zpool; + + spin_lock(&pools_lock); + list_for_each_entry(zpool, &pools_head, list) { + if (zpool->pool == pool) { + spin_unlock(&pools_lock); + if (!zpool->ops || !zpool->ops->evict) + return -EINVAL; + return zpool->ops->evict(zpool, handle); + } + } + spin_unlock(&pools_lock); + + return -ENOENT; +} +EXPORT_SYMBOL(zpool_evict); + +static struct zpool_driver *zpool_get_driver(char *type) +{ + struct zpool_driver *driver; + + spin_lock(&drivers_lock); + list_for_each_entry(driver, &drivers_head, list) { + if (!strcmp(driver->type, type)) { + bool got = try_module_get(driver->owner); + + if (got) + atomic_inc(&driver->refcount); + spin_unlock(&drivers_lock); + return got ? 
driver : NULL; + } + } + + spin_unlock(&drivers_lock); + return NULL; +} + +static void zpool_put_driver(struct zpool_driver *driver) +{ + atomic_dec(&driver->refcount); + module_put(driver->owner); +} + +/** + * zpool_create_pool() - Create a new zpool + * @type The type of the zpool to create (e.g. zbud, zsmalloc) + * @gfp The GFP flags to use when allocating the pool. + * @ops The optional ops callback. + * + * This creates a new zpool of the specified type. The gfp flags will be + * used when allocating memory, if the implementation supports it. If the + * ops param is NULL, then the created zpool will not be shrinkable. + * + * Implementations must guarantee this to be thread-safe. + * + * Returns: New zpool on success, NULL on failure. + */ +struct zpool *zpool_create_pool(char *type, gfp_t gfp, struct zpool_ops *ops) +{ + struct zpool_driver *driver; + struct zpool *zpool; + + pr_info("creating pool type %s\n", type); + + driver = zpool_get_driver(type); + + if (!driver) { + request_module(type); + driver = zpool_get_driver(type); + } + + if (!driver) { + pr_err("no driver for type %s\n", type); + return NULL; + } + + zpool = kmalloc(sizeof(*zpool), gfp); + if (!zpool) { + pr_err("couldn't create zpool - out of memory\n"); + zpool_put_driver(driver); + return NULL; + } + + zpool->type = driver->type; + zpool->driver = driver; + zpool->pool = driver->create(gfp, ops); + zpool->ops = ops; + + if (!zpool->pool) { + pr_err("couldn't create %s pool\n", type); + zpool_put_driver(driver); + kfree(zpool); + return NULL; + } + + pr_info("created %s pool\n", type); + + spin_lock(&pools_lock); + list_add(&zpool->list, &pools_head); + spin_unlock(&pools_lock); + + return zpool; +} + +/** + * zpool_destroy_pool() - Destroy a zpool + * @pool The zpool to destroy. + * + * Implementations must guarantee this to be thread-safe, + * however only when destroying different pools. The same + * pool should only be destroyed once, and should not be used + * after it is destroyed. + * + * This destroys an existing zpool. The zpool should not be in use. + */ +void zpool_destroy_pool(struct zpool *zpool) +{ + pr_info("destroying pool type %s\n", zpool->type); + + spin_lock(&pools_lock); + list_del(&zpool->list); + spin_unlock(&pools_lock); + zpool->driver->destroy(zpool->pool); + zpool_put_driver(zpool->driver); + kfree(zpool); +} + +/** + * zpool_get_type() - Get the type of the zpool + * @pool The zpool to check + * + * This returns the type of the pool. + * + * Implementations must guarantee this to be thread-safe. + * + * Returns: The type of zpool. + */ +char *zpool_get_type(struct zpool *zpool) +{ + return zpool->type; +} + +/** + * zpool_malloc() - Allocate memory + * @pool The zpool to allocate from. + * @size The amount of memory to allocate. + * @gfp The GFP flags to use when allocating memory. + * @handle Pointer to the handle to set + * + * This allocates the requested amount of memory from the pool. + * The gfp flags will be used when allocating memory, if the + * implementation supports it. The provided @handle will be + * set to the allocated object handle. + * + * Implementations must guarantee this to be thread-safe. + * + * Returns: 0 on success, negative value on error. + */ +int zpool_malloc(struct zpool *zpool, size_t size, gfp_t gfp, + unsigned long *handle) +{ + return zpool->driver->malloc(zpool->pool, size, gfp, handle); +} + +/** + * zpool_free() - Free previously allocated memory + * @pool The zpool that allocated the memory. + * @handle The handle to the memory to free. 
+ * + * This frees previously allocated memory. This does not guarantee + * that the pool will actually free memory, only that the memory + * in the pool will become available for use by the pool. + * + * Implementations must guarantee this to be thread-safe, + * however only when freeing different handles. The same + * handle should only be freed once, and should not be used + * after freeing. + */ +void zpool_free(struct zpool *zpool, unsigned long handle) +{ + zpool->driver->free(zpool->pool, handle); +} + +/** + * zpool_shrink() - Shrink the pool size + * @pool The zpool to shrink. + * @pages The number of pages to shrink the pool. + * @reclaimed The number of pages successfully evicted. + * + * This attempts to shrink the actual memory size of the pool + * by evicting currently used handle(s). If the pool was + * created with no zpool_ops, or the evict call fails for any + * of the handles, this will fail. If non-NULL, the @reclaimed + * parameter will be set to the number of pages reclaimed, + * which may be more than the number of pages requested. + * + * Implementations must guarantee this to be thread-safe. + * + * Returns: 0 on success, negative value on error/failure. + */ +int zpool_shrink(struct zpool *zpool, unsigned int pages, + unsigned int *reclaimed) +{ + return zpool->driver->shrink(zpool->pool, pages, reclaimed); +} + +/** + * zpool_map_handle() - Map a previously allocated handle into memory + * @pool The zpool that the handle was allocated from + * @handle The handle to map + * @mm How the memory should be mapped + * + * This maps a previously allocated handle into memory. The @mm + * param indicates to the implementation how the memory will be + * used, i.e. read-only, write-only, read-write. If the + * implementation does not support it, the memory will be treated + * as read-write. + * + * This may hold locks, disable interrupts, and/or preemption, + * and the zpool_unmap_handle() must be called to undo those + * actions. The code that uses the mapped handle should complete + * its operatons on the mapped handle memory quickly and unmap + * as soon as possible. As the implementation may use per-cpu + * data, multiple handles should not be mapped concurrently on + * any cpu. + * + * Returns: A pointer to the handle's mapped memory area. + */ +void *zpool_map_handle(struct zpool *zpool, unsigned long handle, + enum zpool_mapmode mapmode) +{ + return zpool->driver->map(zpool->pool, handle, mapmode); +} + +/** + * zpool_unmap_handle() - Unmap a previously mapped handle + * @pool The zpool that the handle was allocated from + * @handle The handle to unmap + * + * This unmaps a previously mapped handle. Any locks or other + * actions that the implementation took in zpool_map_handle() + * will be undone here. The memory area returned from + * zpool_map_handle() should no longer be used after this. + */ +void zpool_unmap_handle(struct zpool *zpool, unsigned long handle) +{ + zpool->driver->unmap(zpool->pool, handle); +} + +/** + * zpool_get_total_size() - The total size of the pool + * @pool The zpool to check + * + * This returns the total size in bytes of the pool. + * + * Returns: Total size of the zpool in bytes. 
+ */ +u64 zpool_get_total_size(struct zpool *zpool) +{ + return zpool->driver->total_size(zpool->pool); +} + +static int __init init_zpool(void) +{ + pr_info("loaded\n"); + return 0; +} + +static void __exit exit_zpool(void) +{ + pr_info("unloaded\n"); +} + +module_init(init_zpool); +module_exit(exit_zpool); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Dan Streetman "); +MODULE_DESCRIPTION("Common API for compressed memory storage"); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index bb62a4adc328..6a1827d3d231 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -240,7 +240,6 @@ struct mapping_area { enum zs_mapmode vm_mm; /* mapping mode */ }; - /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ static DEFINE_PER_CPU(struct mapping_area, zs_map_area); -- cgit v1.2.3 From c795779df29e180738568d2a5eb3a42f3b5e47f0 Mon Sep 17 00:00:00 2001 From: Dan Streetman Date: Wed, 6 Aug 2014 16:08:38 -0700 Subject: mm/zpool: zbud/zsmalloc implement zpool Update zbud and zsmalloc to implement the zpool api. [fengguang.wu@intel.com: make functions static] Signed-off-by: Dan Streetman Tested-by: Seth Jennings Cc: Minchan Kim Cc: Nitin Gupta Cc: Weijie Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zbud.c | 94 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ mm/zsmalloc.c | 85 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 179 insertions(+) diff --git a/mm/zbud.c b/mm/zbud.c index d01226117b8d..a05790b1915e 100644 --- a/mm/zbud.c +++ b/mm/zbud.c @@ -51,6 +51,7 @@ #include #include #include +#include /***************** * Structures @@ -112,6 +113,90 @@ struct zbud_header { bool under_reclaim; }; +/***************** + * zpool + ****************/ + +#ifdef CONFIG_ZPOOL + +static int zbud_zpool_evict(struct zbud_pool *pool, unsigned long handle) +{ + return zpool_evict(pool, handle); +} + +static struct zbud_ops zbud_zpool_ops = { + .evict = zbud_zpool_evict +}; + +static void *zbud_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops) +{ + return zbud_create_pool(gfp, &zbud_zpool_ops); +} + +static void zbud_zpool_destroy(void *pool) +{ + zbud_destroy_pool(pool); +} + +static int zbud_zpool_malloc(void *pool, size_t size, gfp_t gfp, + unsigned long *handle) +{ + return zbud_alloc(pool, size, gfp, handle); +} +static void zbud_zpool_free(void *pool, unsigned long handle) +{ + zbud_free(pool, handle); +} + +static int zbud_zpool_shrink(void *pool, unsigned int pages, + unsigned int *reclaimed) +{ + unsigned int total = 0; + int ret = -EINVAL; + + while (total < pages) { + ret = zbud_reclaim_page(pool, 8); + if (ret < 0) + break; + total++; + } + + if (reclaimed) + *reclaimed = total; + + return ret; +} + +static void *zbud_zpool_map(void *pool, unsigned long handle, + enum zpool_mapmode mm) +{ + return zbud_map(pool, handle); +} +static void zbud_zpool_unmap(void *pool, unsigned long handle) +{ + zbud_unmap(pool, handle); +} + +static u64 zbud_zpool_total_size(void *pool) +{ + return zbud_get_pool_size(pool) * PAGE_SIZE; +} + +static struct zpool_driver zbud_zpool_driver = { + .type = "zbud", + .owner = THIS_MODULE, + .create = zbud_zpool_create, + .destroy = zbud_zpool_destroy, + .malloc = zbud_zpool_malloc, + .free = zbud_zpool_free, + .shrink = zbud_zpool_shrink, + .map = zbud_zpool_map, + .unmap = zbud_zpool_unmap, + .total_size = zbud_zpool_total_size, +}; + +#endif /* CONFIG_ZPOOL */ + /***************** * Helpers *****************/ @@ -511,11 +596,20 @@ static int __init init_zbud(void) /* Make sure the zbud header will fit in 
one chunk */ BUILD_BUG_ON(sizeof(struct zbud_header) > ZHDR_SIZE_ALIGNED); pr_info("loaded\n"); + +#ifdef CONFIG_ZPOOL + zpool_register_driver(&zbud_zpool_driver); +#endif + return 0; } static void __exit exit_zbud(void) { +#ifdef CONFIG_ZPOOL + zpool_unregister_driver(&zbud_zpool_driver); +#endif + pr_info("unloaded\n"); } diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 6a1827d3d231..4e2fc83cb394 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -92,6 +92,7 @@ #include #include #include +#include /* * This must be power of 2 and greater than of equal to sizeof(link_free). @@ -240,6 +241,82 @@ struct mapping_area { enum zs_mapmode vm_mm; /* mapping mode */ }; +/* zpool driver */ + +#ifdef CONFIG_ZPOOL + +static void *zs_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops) +{ + return zs_create_pool(gfp); +} + +static void zs_zpool_destroy(void *pool) +{ + zs_destroy_pool(pool); +} + +static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp, + unsigned long *handle) +{ + *handle = zs_malloc(pool, size); + return *handle ? 0 : -1; +} +static void zs_zpool_free(void *pool, unsigned long handle) +{ + zs_free(pool, handle); +} + +static int zs_zpool_shrink(void *pool, unsigned int pages, + unsigned int *reclaimed) +{ + return -EINVAL; +} + +static void *zs_zpool_map(void *pool, unsigned long handle, + enum zpool_mapmode mm) +{ + enum zs_mapmode zs_mm; + + switch (mm) { + case ZPOOL_MM_RO: + zs_mm = ZS_MM_RO; + break; + case ZPOOL_MM_WO: + zs_mm = ZS_MM_WO; + break; + case ZPOOL_MM_RW: /* fallthru */ + default: + zs_mm = ZS_MM_RW; + break; + } + + return zs_map_object(pool, handle, zs_mm); +} +static void zs_zpool_unmap(void *pool, unsigned long handle) +{ + zs_unmap_object(pool, handle); +} + +static u64 zs_zpool_total_size(void *pool) +{ + return zs_get_total_size_bytes(pool); +} + +static struct zpool_driver zs_zpool_driver = { + .type = "zsmalloc", + .owner = THIS_MODULE, + .create = zs_zpool_create, + .destroy = zs_zpool_destroy, + .malloc = zs_zpool_malloc, + .free = zs_zpool_free, + .shrink = zs_zpool_shrink, + .map = zs_zpool_map, + .unmap = zs_zpool_unmap, + .total_size = zs_zpool_total_size, +}; + +#endif /* CONFIG_ZPOOL */ + /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ static DEFINE_PER_CPU(struct mapping_area, zs_map_area); @@ -813,6 +890,10 @@ static void zs_exit(void) { int cpu; +#ifdef CONFIG_ZPOOL + zpool_unregister_driver(&zs_zpool_driver); +#endif + cpu_notifier_register_begin(); for_each_online_cpu(cpu) @@ -839,6 +920,10 @@ static int zs_init(void) cpu_notifier_register_done(); +#ifdef CONFIG_ZPOOL + zpool_register_driver(&zs_zpool_driver); +#endif + return 0; fail: zs_exit(); -- cgit v1.2.3 From 12d79d64bfd3913693304feb8636ccab504b9e63 Mon Sep 17 00:00:00 2001 From: Dan Streetman Date: Wed, 6 Aug 2014 16:08:40 -0700 Subject: mm/zpool: update zswap to use zpool Change zswap to use the zpool api instead of directly using zbud. Add a boot-time param to allow selecting which zpool implementation to use, with zbud as the default. 
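With this in place the backend becomes a boot-time choice; for example, on the kernel command line (assuming zsmalloc is built in or available as a module):

	zswap.enabled=1 zswap.zpool=zsmalloc

If the requested pool can't be created, the init code below falls back to the zbud default.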
Signed-off-by: Dan Streetman Tested-by: Seth Jennings Cc: Weijie Yang Cc: Minchan Kim Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 2 +- mm/zswap.c | 75 +++++++++++++++++++++++++++++++++++++------------------------- 2 files changed, 46 insertions(+), 31 deletions(-) diff --git a/mm/Kconfig b/mm/Kconfig index 12179b8c3b89..886db2158538 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -535,7 +535,7 @@ config ZSWAP bool "Compressed cache for swap pages (EXPERIMENTAL)" depends on FRONTSWAP && CRYPTO=y select CRYPTO_LZO - select ZBUD + select ZPOOL default n help A lightweight compressed cache for swap pages. It takes diff --git a/mm/zswap.c b/mm/zswap.c index 008388fe7b0f..032c21eeab2b 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -34,7 +34,7 @@ #include #include #include -#include +#include #include #include @@ -45,8 +45,8 @@ /********************************* * statistics **********************************/ -/* Number of memory pages used by the compressed pool */ -static u64 zswap_pool_pages; +/* Total bytes used by the compressed storage */ +static u64 zswap_pool_total_size; /* The number of compressed pages currently stored in zswap */ static atomic_t zswap_stored_pages = ATOMIC_INIT(0); @@ -89,8 +89,13 @@ static unsigned int zswap_max_pool_percent = 20; module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644); -/* zbud_pool is shared by all of zswap backend */ -static struct zbud_pool *zswap_pool; +/* Compressed storage to use */ +#define ZSWAP_ZPOOL_DEFAULT "zbud" +static char *zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT; +module_param_named(zpool, zswap_zpool_type, charp, 0444); + +/* zpool is shared by all of zswap backend */ +static struct zpool *zswap_pool; /********************************* * compression functions @@ -168,7 +173,7 @@ static void zswap_comp_exit(void) * be held while changing the refcount. Since the lock must * be held, there is no reason to also make refcount atomic. * offset - the swap offset for the entry. Index into the red-black tree. - * handle - zbud allocation handle that stores the compressed page data + * handle - zpool allocation handle that stores the compressed page data * length - the length in bytes of the compressed page data. Needed during * decompression */ @@ -284,15 +289,15 @@ static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry) } /* - * Carries out the common pattern of freeing and entry's zbud allocation, + * Carries out the common pattern of freeing and entry's zpool allocation, * freeing the entry itself, and decrementing the number of stored pages. */ static void zswap_free_entry(struct zswap_entry *entry) { - zbud_free(zswap_pool, entry->handle); + zpool_free(zswap_pool, entry->handle); zswap_entry_cache_free(entry); atomic_dec(&zswap_stored_pages); - zswap_pool_pages = zbud_get_pool_size(zswap_pool); + zswap_pool_total_size = zpool_get_total_size(zswap_pool); } /* caller must hold the tree lock */ @@ -409,7 +414,7 @@ cleanup: static bool zswap_is_full(void) { return totalram_pages * zswap_max_pool_percent / 100 < - zswap_pool_pages; + DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE); } /********************************* @@ -525,7 +530,7 @@ static int zswap_get_swap_cache_page(swp_entry_t entry, * the swap cache, the compressed version stored by zswap can be * freed. 
*/ -static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle) +static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) { struct zswap_header *zhdr; swp_entry_t swpentry; @@ -541,9 +546,9 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle) }; /* extract swpentry from data */ - zhdr = zbud_map(pool, handle); + zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO); swpentry = zhdr->swpentry; /* here */ - zbud_unmap(pool, handle); + zpool_unmap_handle(pool, handle); tree = zswap_trees[swp_type(swpentry)]; offset = swp_offset(swpentry); @@ -573,13 +578,13 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle) case ZSWAP_SWAPCACHE_NEW: /* page is locked */ /* decompress */ dlen = PAGE_SIZE; - src = (u8 *)zbud_map(zswap_pool, entry->handle) + - sizeof(struct zswap_header); + src = (u8 *)zpool_map_handle(zswap_pool, entry->handle, + ZPOOL_MM_RO) + sizeof(struct zswap_header); dst = kmap_atomic(page); ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length, dst, &dlen); kunmap_atomic(dst); - zbud_unmap(zswap_pool, entry->handle); + zpool_unmap_handle(zswap_pool, entry->handle); BUG_ON(ret); BUG_ON(dlen != PAGE_SIZE); @@ -652,7 +657,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, /* reclaim space if needed */ if (zswap_is_full()) { zswap_pool_limit_hit++; - if (zbud_reclaim_page(zswap_pool, 8)) { + if (zpool_shrink(zswap_pool, 1, NULL)) { zswap_reject_reclaim_fail++; ret = -ENOMEM; goto reject; @@ -679,7 +684,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, /* store */ len = dlen + sizeof(struct zswap_header); - ret = zbud_alloc(zswap_pool, len, __GFP_NORETRY | __GFP_NOWARN, + ret = zpool_malloc(zswap_pool, len, __GFP_NORETRY | __GFP_NOWARN, &handle); if (ret == -ENOSPC) { zswap_reject_compress_poor++; @@ -689,11 +694,11 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, zswap_reject_alloc_fail++; goto freepage; } - zhdr = zbud_map(zswap_pool, handle); + zhdr = zpool_map_handle(zswap_pool, handle, ZPOOL_MM_RW); zhdr->swpentry = swp_entry(type, offset); buf = (u8 *)(zhdr + 1); memcpy(buf, dst, dlen); - zbud_unmap(zswap_pool, handle); + zpool_unmap_handle(zswap_pool, handle); put_cpu_var(zswap_dstmem); /* populate entry */ @@ -716,7 +721,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, /* update stats */ atomic_inc(&zswap_stored_pages); - zswap_pool_pages = zbud_get_pool_size(zswap_pool); + zswap_pool_total_size = zpool_get_total_size(zswap_pool); return 0; @@ -752,13 +757,13 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, /* decompress */ dlen = PAGE_SIZE; - src = (u8 *)zbud_map(zswap_pool, entry->handle) + - sizeof(struct zswap_header); + src = (u8 *)zpool_map_handle(zswap_pool, entry->handle, + ZPOOL_MM_RO) + sizeof(struct zswap_header); dst = kmap_atomic(page); ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length, dst, &dlen); kunmap_atomic(dst); - zbud_unmap(zswap_pool, entry->handle); + zpool_unmap_handle(zswap_pool, entry->handle); BUG_ON(ret); spin_lock(&tree->lock); @@ -811,7 +816,7 @@ static void zswap_frontswap_invalidate_area(unsigned type) zswap_trees[type] = NULL; } -static struct zbud_ops zswap_zbud_ops = { +static struct zpool_ops zswap_zpool_ops = { .evict = zswap_writeback_entry }; @@ -869,8 +874,8 @@ static int __init zswap_debugfs_init(void) zswap_debugfs_root, &zswap_written_back_pages); debugfs_create_u64("duplicate_entry", S_IRUGO, zswap_debugfs_root, 
&zswap_duplicate_entry); - debugfs_create_u64("pool_pages", S_IRUGO, - zswap_debugfs_root, &zswap_pool_pages); + debugfs_create_u64("pool_total_size", S_IRUGO, + zswap_debugfs_root, &zswap_pool_total_size); debugfs_create_atomic_t("stored_pages", S_IRUGO, zswap_debugfs_root, &zswap_stored_pages); @@ -895,16 +900,26 @@ static void __exit zswap_debugfs_exit(void) { } **********************************/ static int __init init_zswap(void) { + gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN; + if (!zswap_enabled) return 0; pr_info("loading zswap\n"); - zswap_pool = zbud_create_pool(GFP_KERNEL, &zswap_zbud_ops); + zswap_pool = zpool_create_pool(zswap_zpool_type, gfp, &zswap_zpool_ops); + if (!zswap_pool && strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) { + pr_info("%s zpool not available\n", zswap_zpool_type); + zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT; + zswap_pool = zpool_create_pool(zswap_zpool_type, gfp, + &zswap_zpool_ops); + } if (!zswap_pool) { - pr_err("zbud pool creation failed\n"); + pr_err("%s zpool not available\n", zswap_zpool_type); + pr_err("zpool creation failed\n"); goto error; } + pr_info("using %s pool\n", zswap_zpool_type); if (zswap_entry_cache_create()) { pr_err("entry cache creation failed\n"); @@ -928,7 +943,7 @@ pcpufail: compfail: zswap_entry_cache_destory(); cachefail: - zbud_destroy_pool(zswap_pool); + zpool_destroy_pool(zswap_pool); error: return -ENOMEM; } -- cgit v1.2.3 From 69102311a57d1fd65cdc4002c55c5d551c799044 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Wed, 6 Aug 2014 16:08:43 -0700 Subject: ./Makefile: tell gcc optimizer to never introduce new data races We have been chasing a memory corruption bug, which turned out to be caused by a very old gcc (4.3.4), which happily turned a conditional load into a non-conditional one, and that broke correctness (the condition was met only if the lock was held) and corrupted memory. This particular problem with that particular code did not happen when newer gccs were used. I've brought this up with our gcc folks, as I wanted to make sure that this can't really happen again, and it turns out it actually can. Quoting Martin Jambor : "More current GCCs are more careful when it comes to replacing a conditional load with a non-conditional one, most notably they check that a store happens in each iteration of _a_ loop but they assume loops are executed. They also perform a simple check whether the store cannot trap which currently passes only for non-const variables. A simple testcase demonstrating it on an x86_64 is for example the following: $ cat cond_store.c int g_1 = 1; int g_2[1024] __attribute__((section ("safe_section"), aligned (4096))); int c = 4; int __attribute__ ((noinline)) foo (void) { int l; for (l = 0; (l != 4); l++) { if (g_1) return l; for (g_2[0] = 0; (g_2[0] >= 26); ++g_2[0]) ; } return 2; } int main (int argc, char* argv[]) { if (mprotect (g_2, sizeof(g_2), PROT_READ) == -1) { int e = errno; error (e, e, "mprotect error %i", e); } foo (); __builtin_printf("OK\n"); return 0; } /* EOF */ $ ~/gcc/trunk/inst/bin/gcc cond_store.c -O2 --param allow-store-data-races=0 $ ./a.out OK $ ~/gcc/trunk/inst/bin/gcc cond_store.c -O2 --param allow-store-data-races=1 $ ./a.out Segmentation fault The testcase fails the same at least with 4.9, 4.8 and 4.7. Therefore I would suggest building kernels with this parameter set to zero. I also agree with Jikos that the default should be changed for -O2.
I have run most of the SPEC 2k6 CPU benchmarks (gamess and dealII failed, at -O2, not sure why) compiled with and without this option and did not see any real difference between respective run-times" Hopefully the default will be changed in newer gccs, but let's force it for kernel builds so that we are on the safe side even when older gccs are used. The code in question was an out-of-tree printk-in-NMI (yeah, surprise, surprise, once again) patch written by Petr Mladek; let me quote his comment from our internal bugzilla: "I have spent a few days investigating an inconsistent state of the kernel ring buffer. It turned out that it was caused by a speculative store generated by gcc-4.3.4. The problem is in the assembly generated for make_free_space(). The function is called the following way: + vprintk_emit(); + log = MAIN_LOG; // with logbuf_lock or log = NMI_LOG; // with nmi_logbuf_lock cont_add(log, ...); + cont_flush(log, ...); + log_store(log, ...); + log_make_free_space(log, ...); If called with log = NMI_LOG then only nmi_log_* global variables are safe to modify but the generated code also stores into (main_)log_* global variables: : 55 push %rbp 89 f6 mov %esi,%esi 48 8b 05 03 99 51 01 mov 0x1519903(%rip),%rax # ffffffff82620868 44 8b 1d ec 98 51 01 mov 0x15198ec(%rip),%r11d # ffffffff82620858 8b 35 36 60 14 01 mov 0x1146036(%rip),%esi # ffffffff8224cfa8 44 8b 35 33 60 14 01 mov 0x1146033(%rip),%r14d # ffffffff8224cfac 4c 8b 2d d0 98 51 01 mov 0x15198d0(%rip),%r13 # ffffffff82620850 4c 8b 25 11 61 14 01 mov 0x1146111(%rip),%r12 # ffffffff8224d098 49 89 c2 mov %rax,%r10 48 21 c2 and %rax,%rdx 48 8b 1d 0c 99 55 01 mov 0x155990c(%rip),%rbx # ffffffff826608a0 49 c1 ea 20 shr $0x20,%r10 48 89 55 d0 mov %rdx,-0x30(%rbp) 44 29 de sub %r11d,%esi 45 29 d6 sub %r10d,%r14d 4c 8b 0d 97 98 51 01 mov 0x1519897(%rip),%r9 # ffffffff82620840 eb 7e jmp ffffffff81107029 [...] 85 ff test %edi,%edi # edi = 1 for NMI_LOG 4c 89 e8 mov %r13,%rax 4c 89 ca mov %r9,%rdx 74 0a je ffffffff8110703d 8b 15 27 98 51 01 mov 0x1519827(%rip),%edx # ffffffff82620860 48 8b 45 d0 mov -0x30(%rbp),%rax 48 39 c2 cmp %rax,%rdx # end of loop 0f 84 da 00 00 00 je ffffffff81107120 [...] 85 ff test %edi,%edi # edi = 1 for NMI_LOG 4c 89 0d 17 97 51 01 mov %r9,0x1519717(%rip) # ffffffff82620840 ^^^^^^^^^^^^^^^^^^^^^^^^^^ KABOOOM 74 35 je ffffffff81107160 It stores log_first_seq when edi == NMI_LOG. These instructions are also used when edi == MAIN_LOG but the store is done speculatively before the condition is decided. It is unsafe because we do not have "logbuf_lock" in NMI context and some other process might modify "log_first_seq" in parallel" I believe that the best course of action is both - building the kernel (and anything multi-threaded, I guess) with that optimization turned off - persuading gcc folks to change the default for future releases Signed-off-by: Jiri Kosina Cc: Martin Jambor Cc: Petr Mladek Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Marek Polacek Cc: Jakub Jelinek Cc: Steven Noonan Cc: Richard Biener Cc: Dan Carpenter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Makefile b/Makefile index af2f2fcfe01c..a897c50db515 100644 --- a/Makefile +++ b/Makefile @@ -621,6 +621,9 @@ else KBUILD_CFLAGS += -O2 endif +# Tell gcc to never replace conditional load with a non-conditional one +KBUILD_CFLAGS += $(call cc-option,--param=allow-store-data-races=0) + ifdef CONFIG_READABLE_ASM # Disable optimizations that make assembler listings hard to read.
# reorder blocks reorders the control in the function -- cgit v1.2.3 From 68be302963230fa76600cd598935a830ac95dca2 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 6 Aug 2014 16:08:45 -0700 Subject: fs.h, drivers/hwmon/asus_atk0110.c: fix DEFINE_SIMPLE_ATTRIBUTE semicolon definition and use The DEFINE_SIMPLE_ATTRIBUTE macro should not end in a ';'. Fix the one use in the kernel tree that did not have a semicolon. Signed-off-by: Joe Perches Acked-by: Guenter Roeck Acked-by: Luca Tettamanti Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/hwmon/asus_atk0110.c | 2 +- include/linux/fs.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/hwmon/asus_atk0110.c b/drivers/hwmon/asus_atk0110.c index ae208f612198..cccef87963e0 100644 --- a/drivers/hwmon/asus_atk0110.c +++ b/drivers/hwmon/asus_atk0110.c @@ -688,7 +688,7 @@ static int atk_debugfs_gitm_get(void *p, u64 *val) DEFINE_SIMPLE_ATTRIBUTE(atk_debugfs_gitm, atk_debugfs_gitm_get, NULL, - "0x%08llx\n") + "0x%08llx\n"); static int atk_acpi_print(char *buf, size_t sz, union acpi_object *obj) { diff --git a/include/linux/fs.h b/include/linux/fs.h index 2daccaf4b547..1ab6c6913040 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2688,7 +2688,7 @@ static const struct file_operations __fops = { \ .read = simple_attr_read, \ .write = simple_attr_write, \ .llseek = generic_file_llseek, \ -}; +} static inline __printf(1, 2) void __simple_attr_check_format(const char *fmt, ...) -- cgit v1.2.3 From 90a856436ddafbe0c6f8c18d7fc21aed3784e227 Mon Sep 17 00:00:00 2001 From: Geoff Levand Date: Wed, 6 Aug 2014 16:08:47 -0700 Subject: include/linux/byteorder/generic.h: minor comment fix Signed-off-by: Geoff Levand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/byteorder/generic.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/byteorder/generic.h b/include/linux/byteorder/generic.h index 0846e6b931ce..89f67c1c3160 100644 --- a/include/linux/byteorder/generic.h +++ b/include/linux/byteorder/generic.h @@ -2,7 +2,7 @@ #define _LINUX_BYTEORDER_GENERIC_H /* - * linux/byteorder_generic.h + * linux/byteorder/generic.h * Generic Byte-reordering support * * The "... p" macros, like le64_to_cpup, can be used with pointers -- cgit v1.2.3 From 7030017752437cebc3ec5590735bd89ead1e4cb8 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 6 Aug 2014 16:08:49 -0700 Subject: printk: make dynamic kernel ring buffer alignment explicit We have to consider alignment for the ring buffer both for the default static size and also for when a dynamic allocation is made, when the log_buf_len=n kernel parameter is passed to set the size specifically to something larger than the default size set by the architecture through CONFIG_LOG_BUF_SHIFT. The default static kernel ring buffer can be aligned properly if architectures set CONFIG_LOG_BUF_SHIFT properly; we provide ranges for the size, though, so even if CONFIG_LOG_BUF_SHIFT has a sensible aligned value it can be reduced to a non-aligned value. Commit 6ebb017de9 ("printk: Fix alignment of buf causing crash on ARM EABI") by Andrew Lunn ensures the static buffer is always aligned, with the alignment decision made by the compiler using __alignof__(struct log). When log_buf_len=n is used we allocate the ring buffer dynamically.
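In sketch form, the alignment change described here amounts to passing LOG_ALIGN on both dynamic allocation paths. The condensed helper below is illustrative only: the names follow kernel/printk/printk.c of this era, but no such wrapper function exists in the actual patch.

	/* illustrative sketch, not patch code: request LOG_ALIGN explicitly */
	#define LOG_ALIGN __alignof__(struct printk_log)

	static void * __init alloc_new_log_buf(unsigned long len, int early)
	{
		/*
		 * Do not rely on the allocator defaults (PAGE_SIZE on the
		 * early memblock path, SMP_CACHE_BYTES on the late path);
		 * ask for the ring buffer's record alignment instead.
		 */
		if (early)
			return memblock_virt_alloc(len, LOG_ALIGN);
		return memblock_virt_alloc_nopanic(len, LOG_ALIGN);
	}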
Dynamic allocation varies: for the early allocation called before setup_arch(), memblock_virt_alloc() requests page alignment, while for the default kernel allocation memblock_virt_alloc_nopanic() requests no special alignment, which in turn ends up aligning the allocation to SMP_CACHE_BYTES, i.e. L1 cache aligned. Since we already know the required alignment for the kernel ring buffer, we can do better and request explicit alignment for LOG_ALIGN. This patch does that, to be safe and to make the dynamic allocation alignment explicit. Signed-off-by: Luis R. Rodriguez Tested-by: Petr Mladek Acked-by: Petr Mladek Cc: Andrew Lunn Cc: Stephen Warren Cc: Greg Kroah-Hartman Cc: Michal Hocko Cc: Petr Mladek Cc: Joe Perches Cc: Arun KS Cc: Kees Cook Cc: Davidlohr Bueso Cc: Chris Metcalf Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 13e839dbca07..6f598f92f2a1 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -853,9 +853,10 @@ void __init setup_log_buf(int early) if (early) { new_log_buf = - memblock_virt_alloc(new_log_buf_len, PAGE_SIZE); + memblock_virt_alloc(new_log_buf_len, LOG_ALIGN); } else { - new_log_buf = memblock_virt_alloc_nopanic(new_log_buf_len, 0); + new_log_buf = memblock_virt_alloc_nopanic(new_log_buf_len, + LOG_ALIGN); } if (unlikely(!new_log_buf)) { -- cgit v1.2.3 From c0a318a361e7652b8c4f7b91d3a31c771cf34e4f Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 6 Aug 2014 16:08:52 -0700 Subject: printk: move power of 2 practice of ring buffer size to a helper In practice the power-of-2 sizing of the kernel ring buffer is purely historical and not a requirement, especially now that we have LOG_ALIGN and use it for both static and dynamic allocations. It could have helped with implicit alignment back in the day: even the dynamically sized ring buffer was guaranteed to be aligned so long as CONFIG_LOG_BUF_SHIFT was set to produce an architecture-aligned __LOG_BUF_LEN, since log_buf_len=n was allowed only if it was > __LOG_BUF_LEN and we always rounded log_buf_len=n up to the next power of 2 with roundup_pow_of_two(), so any such multiple of 2 should also be architecture aligned. These assumptions of course relied heavily on CONFIG_LOG_BUF_SHIFT producing an aligned value, but users can always change this. We now have precise alignment requirements set for the log buffer size for both static and dynamic allocations, but let's keep the old practice of using powers of 2 for its size, which keeps the expected values easy to scale and helps the allocators for dynamic allocations. We'll reuse this later, so move this into a helper. Signed-off-by: Luis R.
Rodriguez Cc: Andrew Lunn Cc: Stephen Warren Cc: Greg Kroah-Hartman Cc: Michal Hocko Cc: Petr Mladek Cc: Joe Perches Cc: Arun KS Cc: Kees Cook Cc: Davidlohr Bueso Cc: Chris Metcalf Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 6f598f92f2a1..32ad0c7a0cd3 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -828,15 +828,21 @@ void log_buf_kexec_setup(void) /* requested log_buf_len from kernel cmdline */ static unsigned long __initdata new_log_buf_len; -/* save requested log_buf_len since it's too early to process it */ -static int __init log_buf_len_setup(char *str) +/* we practice scaling the ring buffer by powers of 2 */ +static void __init log_buf_len_update(unsigned size) { - unsigned size = memparse(str, &str); - if (size) size = roundup_pow_of_two(size); if (size > log_buf_len) new_log_buf_len = size; +} + +/* save requested log_buf_len since it's too early to process it */ +static int __init log_buf_len_setup(char *str) +{ + unsigned size = memparse(str, &str); + + log_buf_len_update(size); return 0; } -- cgit v1.2.3 From f54051722e5715d24cd4469606ebdf488b6d5779 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 6 Aug 2014 16:08:54 -0700 Subject: printk: make dynamic units clear for the kernel ring buffer Signed-off-by: Luis R. Rodriguez Suggested-by: Davidlohr Bueso Cc: Andrew Lunn Cc: Stephen Warren Cc: Greg Kroah-Hartman Cc: Michal Hocko Cc: Petr Mladek Cc: Joe Perches Cc: Arun KS Cc: Kees Cook Cc: Davidlohr Bueso Cc: Chris Metcalf Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 32ad0c7a0cd3..db290be32984 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -879,7 +879,7 @@ void __init setup_log_buf(int early) memcpy(log_buf, __log_buf, __LOG_BUF_LEN); raw_spin_unlock_irqrestore(&logbuf_lock, flags); - pr_info("log_buf_len: %d\n", log_buf_len); + pr_info("log_buf_len: %d bytes\n", log_buf_len); pr_info("early log buf free: %d(%d%%)\n", free, (free * 100) / __LOG_BUF_LEN); } -- cgit v1.2.3 From 23b2899f7f194f06e09b52a1f46f027a21fae17c Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 6 Aug 2014 16:08:56 -0700 Subject: printk: allow increasing the ring buffer depending on the number of CPUs The default size of the ring buffer is too small for machines with a large number of CPUs under heavy load. What ends up happening when debugging is that the ring buffer wraps around and chews up old messages, making debugging impossible unless the size is passed as a kernel parameter. An idle system upon boot up will on average spew out only about one or two extra lines but where this really matters is on heavy load and that will vary widely depending on the system and environment. There are mechanisms to help increase the kernel ring buffer for tracing through debugfs, and those interfaces even allow growing the kernel ring buffer per CPU. We also have a static value which can be passed upon boot. Relying on debugfs however is not ideal for production, and relying on the value passed at boot only works *after* an issue has crept up.
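The sizing rule worked out in the next paragraphs condenses to a few lines. Below is a simplified sketch of the log_buf_add_cpu() logic from the diff further down; it is illustrative only, and the real function additionally handles the single-CPU case and prints the chosen sizes.

	/* simplified sketch of log_buf_add_cpu(); see the actual diff below */
	static void __init log_buf_add_cpu(void)
	{
		unsigned int cpu_extra;

		/* every possible CPU beyond the first contributes a fixed share */
		cpu_extra = (num_possible_cpus() - 1) * __LOG_CPU_MAX_BUF_LEN;

		/* grow only when the extra outweighs half of the static buffer */
		if (cpu_extra <= __LOG_BUF_LEN / 2)
			return;

		/* log_buf_len_update() rounds the sum up to a power of two */
		log_buf_len_update(cpu_extra + __LOG_BUF_LEN);
	}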
Instead of being reactive, this adds a proactive measure which lets you scale the contribution you'd expect each CPU to make to the kernel ring buffer under load, in the worst case scenario. We use num_possible_cpus() to avoid the complexities which could be introduced by dynamically changing the ring buffer size at run time; num_possible_cpus() lets us use the upper limit on the possible number of CPUs, therefore avoiding having to deal with hotplugging CPUs on and off. This introduces the kernel configuration option LOG_CPU_MAX_BUF_SHIFT which is used to specify the maximum amount of contributions to the kernel ring buffer in the worst case before the kernel ring buffer wraps; the size is specified as a power of 2. The total amount of contributions made by each CPU must be greater than half of the default kernel ring buffer size (1 << LOG_BUF_SHIFT bytes) in order to trigger an increase upon bootup. The kernel ring buffer is increased to the next power of two that would fit the required minimum kernel ring buffer size plus the additional CPU contribution. For example, if LOG_BUF_SHIFT is 18 (256 KB) you'd require at least 128 KB of contributions by other CPUs in order to trigger an increase of the kernel ring buffer. With a LOG_CPU_MAX_BUF_SHIFT of 12 (4 KB) you'd require more than 64 possible CPUs to trigger an increase. If you had 128 possible CPUs the minimum required kernel ring buffer bumps to: ((1 << 18) + ((128 - 1) * (1 << 12))) / 1024 = 764 KB Since we require the ring buffer to be a power of two, the new required size would be 1024 KB. These CPU contributions are ignored when the "log_buf_len" kernel parameter is used as it forces the exact size of the ring buffer to an expected power of two value. [pmladek@suse.cz: fix build] Signed-off-by: Luis R. Rodriguez Signed-off-by: Petr Mladek Tested-by: Davidlohr Bueso Tested-by: Petr Mladek Reviewed-by: Davidlohr Bueso Cc: Andrew Lunn Cc: Stephen Warren Cc: Michal Hocko Cc: Petr Mladek Cc: Joe Perches Cc: Arun KS Cc: Kees Cook Cc: Davidlohr Bueso Cc: Chris Metcalf Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/kernel-parameters.txt | 8 +++++-- init/Kconfig | 46 +++++++++++++++++++++++++++++++++---- kernel/printk/printk.c | 34 +++++++++++++++++++++++++++ 3 files changed, 82 insertions(+), 6 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 883901b9ac4f..9344d833b7ea 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1716,8 +1716,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted. 7 (KERN_DEBUG) debug-level messages log_buf_len=n[KMG] Sets the size of the printk ring buffer, - in bytes. n must be a power of two. The default - size is set in the kernel config file. + in bytes. n must be a power of two and greater + than the minimal size. The minimal size is defined + by LOG_BUF_SHIFT kernel config parameter. There is + also CONFIG_LOG_CPU_MAX_BUF_SHIFT config parameter + that allows to increase the default size depending on + the number of CPUs. See init/Kconfig for more details. logo.nologo [FB] Disables display of the built-in Linux logo. This may be used to provide more screen space for diff --git a/init/Kconfig b/init/Kconfig index 41066e49e880..a291b7ef4738 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -807,15 +807,53 @@ config LOG_BUF_SHIFT range 12 21 default 17 help - Select kernel log buffer size as a power of 2.
+ Select the minimal kernel log buffer size as a power of 2. + The final size is affected by LOG_CPU_MAX_BUF_SHIFT config + parameter, see below. Any higher size also might be forced + by "log_buf_len" boot parameter. + Examples: - 17 => 128 KB + 17 => 128 KB 16 => 64 KB - 15 => 32 KB - 14 => 16 KB + 15 => 32 KB + 14 => 16 KB 13 => 8 KB 12 => 4 KB +config LOG_CPU_MAX_BUF_SHIFT + int "CPU kernel log buffer size contribution (13 => 8 KB, 17 => 128KB)" + range 0 21 + default 12 if !BASE_SMALL + default 0 if BASE_SMALL + help + This option allows to increase the default ring buffer size + according to the number of CPUs. The value defines the contribution + of each CPU as a power of 2. The used space is typically only a few + lines however it might be much more when problems are reported, + e.g. backtraces. + + The increased size means that a new buffer has to be allocated and + the original static one is unused. It makes sense only on systems + with more CPUs. Therefore this value is used only when the sum of + contributions is greater than the half of the default kernel ring + buffer as defined by LOG_BUF_SHIFT. The default values are set + so that more than 64 CPUs are needed to trigger the allocation. + + Also this option is ignored when "log_buf_len" kernel parameter is + used as it forces an exact (power of two) size of the ring buffer. + + The number of possible CPUs is used for this computation ignoring + hotplugging making the computation optimal for the worst case + scenario while allowing a simple algorithm to be used from bootup. + + Example shift values and their meaning: + 17 => 128 KB for each CPU + 16 => 64 KB for each CPU + 15 => 32 KB for each CPU + 14 => 16 KB for each CPU + 13 => 8 KB for each CPU + 12 => 4 KB for each CPU + # # Architectures with an unreliable sched_clock() should select this: # diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index db290be32984..f855ec36dff9 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -266,6 +266,7 @@ static u32 clear_idx; #define LOG_ALIGN __alignof__(struct printk_log) #endif #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) +#define __LOG_CPU_MAX_BUF_LEN (1 << CONFIG_LOG_CPU_MAX_BUF_SHIFT) static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); static char *log_buf = __log_buf; static u32 log_buf_len = __LOG_BUF_LEN; @@ -848,12 +849,45 @@ static int __init log_buf_len_setup(char *str) } early_param("log_buf_len", log_buf_len_setup); +static void __init log_buf_add_cpu(void) +{ + unsigned int cpu_extra; + + /* + * archs should set up cpu_possible_bits properly with + * set_cpu_possible() after setup_arch() but just in + * case lets ensure this is valid.
+ */ + if (num_possible_cpus() == 1) + return; + + cpu_extra = (num_possible_cpus() - 1) * __LOG_CPU_MAX_BUF_LEN; + + /* by default this will only continue through for large > 64 CPUs */ + if (cpu_extra <= __LOG_BUF_LEN / 2) + return; + + pr_info("log_buf_len individual max cpu contribution: %d bytes\n", + __LOG_CPU_MAX_BUF_LEN); + pr_info("log_buf_len total cpu_extra contributions: %d bytes\n", + cpu_extra); + pr_info("log_buf_len min size: %d bytes\n", __LOG_BUF_LEN); + + log_buf_len_update(cpu_extra + __LOG_BUF_LEN); +} + void __init setup_log_buf(int early) { unsigned long flags; char *new_log_buf; int free; + if (log_buf != __log_buf) + return; + + if (!early && !new_log_buf_len) + log_buf_add_cpu(); + if (!new_log_buf_len) return; -- cgit v1.2.3 From e97e1267e9faa6480898a1fc34c8e40d74d702f2 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 6 Aug 2014 16:08:59 -0700 Subject: printk: tweak do_syslog() to match comments In do_syslog() there's a path used by kmsg_poll() and kmsg_read() that only needs to know whether there's any data available to read (and not its size). These callers only check for non-zero return. As a shortcut, do_syslog() returns the difference between what has been logged and what has been "seen." The comments say that the "count of records" should be returned but it's not. Instead it returns (log_next_idx - syslog_idx), which is a difference between buffer offsets--and the result could be negative. The behavior is the same (it'll be zero or not in the same cases), but the count of records is more meaningful and it matches what the comments say. So change the code to return that. Signed-off-by: Alex Elder Cc: Petr Mladek Cc: Jan Kara Cc: Joe Perches Cc: John Stultz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index f855ec36dff9..ec3bfb0b1f62 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1351,7 +1351,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) * for pending data, not the size; return the count of * records, not the length. */ - error = log_next_idx - syslog_idx; + error = log_next_seq - syslog_seq; } else { u64 seq = syslog_seq; u32 idx = syslog_idx; -- cgit v1.2.3 From 42a9dc0b3d0f749375c767c7d5cab56e89160576 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 6 Aug 2014 16:09:01 -0700 Subject: printk: rename DEFAULT_MESSAGE_LOGLEVEL Commit a8fe19ebfbfd ("kernel/printk: use symbolic defines for console loglevels") makes consistent use of symbolic values for printk() log levels. The naming scheme used is different from the one used for DEFAULT_MESSAGE_LOGLEVEL though. Change that symbol name to be MESSAGE_LOGLEVEL_DEFAULT for consistency. And because the value of that symbol comes from a similarly-named config option, rename CONFIG_DEFAULT_MESSAGE_LOGLEVEL as well. Signed-off-by: Alex Elder Cc: Andi Kleen Cc: Borislav Petkov Cc: Jan Kara Cc: John Stultz Cc: Petr Mladek Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/printk.h | 2 +- kernel/printk/printk.c | 2 +- lib/Kconfig.debug | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/printk.h b/include/linux/printk.h index 319ff7e53efb..0990997a5304 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -31,7 +31,7 @@ static inline const char *printk_skip_level(const char *buffer) } /* printk's without a loglevel use this.. 
*/ -#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL +#define MESSAGE_LOGLEVEL_DEFAULT CONFIG_MESSAGE_LOGLEVEL_DEFAULT /* We show everything that is MORE important than this.. */ #define CONSOLE_LOGLEVEL_SILENT 0 /* Mum's the word */ diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index ec3bfb0b1f62..770ed4821ba9 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -56,7 +56,7 @@ int console_printk[4] = { CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */ - DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */ + MESSAGE_LOGLEVEL_DEFAULT, /* default_message_loglevel */ CONSOLE_LOGLEVEL_MIN, /* minimum_console_loglevel */ CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */ }; diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index cfe7df8f62cc..cb45f59685e6 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -15,7 +15,7 @@ config PRINTK_TIME The behavior is also controlled by the kernel command line parameter printk.time=1. See Documentation/kernel-parameters.txt -config DEFAULT_MESSAGE_LOGLEVEL +config MESSAGE_LOGLEVEL_DEFAULT int "Default message log level (1-7)" range 1 7 default "4" -- cgit v1.2.3 From 0b90fec3b990b50d77944bc73c1ba4b031dfa52f Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 6 Aug 2014 16:09:03 -0700 Subject: printk: fix some comments Fix a few comments that don't accurately describe their corresponding code. It also fixes some minor typographical errors. Signed-off-by: Alex Elder Reviewed-by: Petr Mladek Cc: Andi Kleen Cc: Borislav Petkov Cc: Jan Kara Cc: John Stultz Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 770ed4821ba9..4bae344c1ec3 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -113,9 +113,9 @@ static int __down_trylock_console_sem(unsigned long ip) * This is used for debugging the mess that is the VT code by * keeping track if we have the console semaphore held. It's * definitely not the perfect debug tool (we don't know if _WE_ - * hold it are racing, but it helps tracking those weird code - * path in the console code where we end up in places I want - * locked without the console sempahore held + * hold it and are racing, but it helps tracking those weird code + * paths in the console code where we end up in places I want + * locked without the console sempahore held). */ static int console_locked, console_suspended; @@ -146,8 +146,8 @@ static int console_may_schedule; * the overall length of the record. * * The heads to the first and last entry in the buffer, as well as the - * sequence numbers of these both entries are maintained when messages - * are stored.. + * sequence numbers of these entries are maintained when messages are + * stored. * * If the heads indicate available messages, the length in the header * tells the start next message. 
A length == 0 for the next message @@ -345,7 +345,7 @@ static int log_make_free_space(u32 msg_size) while (log_first_seq < log_next_seq) { if (logbuf_has_space(msg_size, false)) return 0; - /* drop old messages until we have enough continuous space */ + /* drop old messages until we have enough contiguous space */ log_first_idx = log_next(log_first_idx); log_first_seq++; } @@ -1517,7 +1517,7 @@ static struct cont { struct task_struct *owner; /* task of first print*/ u64 ts_nsec; /* time of first print */ u8 level; /* log level of first message */ - u8 facility; /* log level of first message */ + u8 facility; /* log facility of first message */ enum log_flags flags; /* prefix, newline flags */ bool flushed:1; /* buffer sealed and committed */ } cont; @@ -1922,11 +1922,12 @@ static int __add_preferred_console(char *name, int idx, char *options, return 0; } /* - * Set up a list of consoles. Called from init/main.c + * Set up a console. Called via do_early_param() in init/main.c + * for each "console=" parameter in the boot command line. */ static int __init console_setup(char *str) { - char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for index */ + char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for "ttyS" */ char *s, *options, *brl_options = NULL; int idx; @@ -2086,8 +2087,8 @@ EXPORT_SYMBOL(console_lock); /** * console_trylock - try to lock the console system for exclusive use. * - * Tried to acquire a lock which guarantees that the caller has - * exclusive access to the console system and the console_drivers list. + * Try to acquire a lock which guarantees that the caller has exclusive + * access to the console system and the console_drivers list. * * returns 1 on success, and 0 on failure to acquire the lock. */ -- cgit v1.2.3 From e99aa461660a6413b11da887fb499e04a0f46803 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 6 Aug 2014 16:09:05 -0700 Subject: printk: use a clever macro Use the IS_ENABLED() macro rather than #ifdef blocks to set certain global values. Signed-off-by: Alex Elder Acked-by: Borislav Petkov Reviewed-by: Petr Mladek Cc: Andi Kleen Cc: Jan Kara Cc: John Stultz Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 4bae344c1ec3..ac86838227ed 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -454,11 +454,7 @@ static int log_store(int facility, int level, return msg->text_len; } -#ifdef CONFIG_SECURITY_DMESG_RESTRICT -int dmesg_restrict = 1; -#else -int dmesg_restrict; -#endif +int dmesg_restrict = IS_ENABLED(CONFIG_SECURITY_DMESG_RESTRICT); static int syslog_action_restricted(int type) { @@ -988,11 +984,7 @@ static inline void boot_delay_msec(int level) } #endif -#if defined(CONFIG_PRINTK_TIME) -static bool printk_time = 1; -#else -static bool printk_time; -#endif +static bool printk_time = IS_ENABLED(CONFIG_PRINTK_TIME); module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); static size_t print_time(u64 ts, char *buf) -- cgit v1.2.3 From 249771b8307e7a91659d8b273f8b70d48c3a7bfc Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 6 Aug 2014 16:09:08 -0700 Subject: printk: miscellaneous cleanups Some small cleanups to kernel/printk/printk.c. None of them should cause any change in behavior. - When CONFIG_PRINTK is defined, parenthesize the value of LOG_LINE_MAX. 
- When CONFIG_PRINTK is *not* defined, there is an extra LOG_LINE_MAX definition; delete it. - Pull an assignment out of a conditional expression in console_setup(). - Use isdigit() in console_setup() rather than open coding it. - In update_console_cmdline(), drop a NUL-termination assignment; the strlcpy() call that precedes it guarantees it's not needed. - Simplify some logic in printk_timed_ratelimit(). Signed-off-by: Alex Elder Reviewed-by: Petr Mladek Cc: Andi Kleen Cc: Borislav Petkov Cc: Jan Kara Cc: John Stultz Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index ac86838227ed..5eb0e6c800bb 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -45,6 +45,7 @@ #include #include #include +#include #include @@ -257,7 +258,7 @@ static u64 clear_seq; static u32 clear_idx; #define PREFIX_MAX 32 -#define LOG_LINE_MAX 1024 - PREFIX_MAX +#define LOG_LINE_MAX (1024 - PREFIX_MAX) /* record buffer */ #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) @@ -1835,7 +1836,7 @@ EXPORT_SYMBOL(printk); #define LOG_LINE_MAX 0 #define PREFIX_MAX 0 -#define LOG_LINE_MAX 0 + static u64 syslog_seq; static u32 syslog_idx; static u64 console_seq; @@ -1936,7 +1937,8 @@ static int __init console_setup(char *str) strncpy(buf, str, sizeof(buf) - 1); } buf[sizeof(buf) - 1] = 0; - if ((options = strchr(str, ',')) != NULL) + options = strchr(str, ','); + if (options) *(options++) = 0; #ifdef __sparc__ if (!strcmp(str, "ttya")) @@ -1945,7 +1947,7 @@ static int __init console_setup(char *str) strcpy(buf, "ttyS1"); #endif for (s = buf; *s; s++) - if ((*s >= '0' && *s <= '9') || *s == ',') + if (isdigit(*s) || *s == ',') break; idx = simple_strtoul(s, NULL, 10); *s = 0; @@ -1984,7 +1986,6 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha i++, c++) if (strcmp(c->name, name) == 0 && c->index == idx) { strlcpy(c->name, name_new, sizeof(c->name)); - c->name[sizeof(c->name) - 1] = 0; c->options = options; c->index = idx_new; return i; @@ -2652,14 +2653,13 @@ EXPORT_SYMBOL(__printk_ratelimit); bool printk_timed_ratelimit(unsigned long *caller_jiffies, unsigned int interval_msecs) { - if (*caller_jiffies == 0 - || !time_in_range(jiffies, *caller_jiffies, - *caller_jiffies - + msecs_to_jiffies(interval_msecs))) { - *caller_jiffies = jiffies; - return true; - } - return false; + unsigned long elapsed = jiffies - *caller_jiffies; + + if (*caller_jiffies && elapsed <= msecs_to_jiffies(interval_msecs)) + return false; + + *caller_jiffies = jiffies; + return true; } EXPORT_SYMBOL(printk_timed_ratelimit); -- cgit v1.2.3 From 5874af2003b1aaaa053128d655710140e3187226 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 6 Aug 2014 16:09:10 -0700 Subject: printk: enable interrupts before calling console_trylock_for_printk() We need interrupts disabled when calling console_trylock_for_printk() only so that the cpu id we pass to can_use_console() remains valid (for other things console_sem provides all the exclusion we need, and deadlocks on console_sem due to interrupts are impossible because we use down_trylock()). However, if we are rescheduled, we are guaranteed to run on an online cpu so we can easily just get the cpu id in can_use_console().
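Condensed from the diff that follows, the tail of vprintk_emit() after this change looks roughly like the fragment below; the lockdep_off()/lockdep_on() bracketing and the recursion handling are elided here, so treat this as a sketch rather than the verbatim code.

	raw_spin_unlock(&logbuf_lock);
	local_irq_restore(flags);	/* interrupts back on before taking console_sem */

	if (!in_sched) {
		/* stay on one CPU while we may call into the console drivers */
		preempt_disable();
		if (console_trylock_for_printk())
			console_unlock();
		preempt_enable();
	}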
We can lose a bit of performance when we enable interrupts in vprintk_emit() and then disable them again in console_unlock() but OTOH it can somewhat reduce interrupt latency caused by console_unlock(). We differ from (reverted) commit 939f04bec1a4 in that we avoid calling console_unlock() from vprintk_emit() with lockdep enabled, as that has unveiled quite a few bugs leading to system freezes during boot (e.g. https://lkml.org/lkml/2014/5/30/242, https://lkml.org/lkml/2014/6/28/521). Signed-off-by: Jan Kara Tested-by: Andreas Bombe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 5eb0e6c800bb..df202fe0974a 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1450,10 +1450,9 @@ static int have_callable_console(void) /* * Can we actually use the console at this time on this cpu? * - * Console drivers may assume that per-cpu resources have - * been allocated. So unless they're explicitly marked as - * being able to cope (CON_ANYTIME) don't call them until - * this CPU is officially up. + * Console drivers may assume that per-cpu resources have been allocated. So + * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't + * call them until this CPU is officially up. */ static inline int can_use_console(unsigned int cpu) { @@ -1466,8 +1465,10 @@ static inline int can_use_console(unsigned int cpu) * console_lock held, and 'console_locked' set) if it * is successful, false otherwise. */ -static int console_trylock_for_printk(unsigned int cpu) +static int console_trylock_for_printk(void) { + unsigned int cpu = smp_processor_id(); + if (!console_trylock()) return 0; /* @@ -1642,7 +1643,8 @@ asmlinkage int vprintk_emit(int facility, int level, */ if (!oops_in_progress && !lockdep_recursing(current)) { recursion_bug = 1; - goto out_restore_irqs; + local_irq_restore(flags); + return 0; } zap_locks(); } @@ -1750,21 +1752,30 @@ asmlinkage int vprintk_emit(int facility, int level, logbuf_cpu = UINT_MAX; raw_spin_unlock(&logbuf_lock); + lockdep_on(); + local_irq_restore(flags); /* If called from the scheduler, we can not call up(). */ if (!in_sched) { + lockdep_off(); + /* + * Disable preemption to avoid being preempted while holding + * console_sem which would prevent anyone from printing to + * console + */ + preempt_disable(); + /* * Try to acquire and then immediately release the console * semaphore. The release will print out buffers and wake up * /dev/kmsg and syslog() users. */ - if (console_trylock_for_printk(this_cpu)) + if (console_trylock_for_printk()) console_unlock(); + preempt_enable(); + lockdep_on(); } - lockdep_on(); -out_restore_irqs: - local_irq_restore(flags); return printed_len; } EXPORT_SYMBOL(vprintk_emit); -- cgit v1.2.3 From d25d9feced6c94398979a035868f03e8e8d49ce8 Mon Sep 17 00:00:00 2001 From: Neil Zhang Date: Wed, 6 Aug 2014 16:09:12 -0700 Subject: kernel/printk/printk.c: fix bool assignments Fix coccinelle warnings.
Signed-off-by: Neil Zhang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index df202fe0974a..de1a6bb6861d 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -919,7 +919,7 @@ static bool __read_mostly ignore_loglevel; static int __init ignore_loglevel_setup(char *str) { - ignore_loglevel = 1; + ignore_loglevel = true; pr_info("debug: ignoring loglevel setting.\n"); return 0; @@ -2005,12 +2005,12 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha return -1; } -bool console_suspend_enabled = 1; +bool console_suspend_enabled = true; EXPORT_SYMBOL(console_suspend_enabled); static int __init console_suspend_disable(char *str) { - console_suspend_enabled = 0; + console_suspend_enabled = false; return 1; } __setup("no_console_suspend", console_suspend_disable); -- cgit v1.2.3 From bc18dd335a161f9229ed3aaab88ce0706cbd9867 Mon Sep 17 00:00:00 2001 From: Ken Helias Date: Wed, 6 Aug 2014 16:09:14 -0700 Subject: list: make hlist_add_after() argument names match hlist_add_after_rcu() The argument names for hlist_add_after() are poorly chosen because they look the same as the ones for hlist_add_before() but have to be used differently. hlist_add_after_rcu() has made a better choice. Signed-off-by: Ken Helias Cc: "Paul E. McKenney" Cc: Christoph Hellwig Cc: Hugh Dickins Cc: Jeff Kirsher Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/list.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/include/linux/list.h b/include/linux/list.h index ef9594171062..624ec7f48293 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -654,15 +654,15 @@ static inline void hlist_add_before(struct hlist_node *n, *(n->pprev) = n; } -static inline void hlist_add_after(struct hlist_node *n, - struct hlist_node *next) +static inline void hlist_add_after(struct hlist_node *prev, + struct hlist_node *n) { - next->next = n->next; - n->next = next; - next->pprev = &n->next; + n->next = prev->next; + prev->next = n; + n->pprev = &prev->next; - if(next->next) - next->next->pprev = &next->next; + if (n->next) + n->next->pprev = &n->next; } /* after that we'll appear to be on some hlist and hlist_del will work */ -- cgit v1.2.3 From 1d023284c31a4e40a94d5bbcb7dbb7a35ee0bcbc Mon Sep 17 00:00:00 2001 From: Ken Helias Date: Wed, 6 Aug 2014 16:09:16 -0700 Subject: list: fix order of arguments for hlist_add_after(_rcu) All other add functions for lists have the new item as first argument and the position where it is added as second argument. This was changed for no good reason in this function and makes using it unnecessarily confusing. The name was changed to hlist_add_behind() to cause unconverted code to generate a compile error instead of using the wrong parameter order. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Ken Helias Cc: "Paul E.
McKenney" Acked-by: Jeff Kirsher [intel driver bits] Cc: Hugh Dickins Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/RCU/whatisRCU.txt | 2 +- drivers/gpu/drm/drm_hashtab.c | 2 +- drivers/net/ethernet/intel/i40e/i40e_ethtool.c | 2 +- drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c | 2 +- drivers/staging/lustre/lustre/libcfs/hash.c | 4 ++-- fs/namespace.c | 2 +- fs/notify/inode_mark.c | 2 +- fs/notify/vfsmount_mark.c | 2 +- include/linux/list.h | 4 ++-- include/linux/rculist.h | 8 ++++---- net/batman-adv/fragmentation.c | 2 +- net/bridge/br_multicast.c | 2 +- net/ipv4/fib_trie.c | 2 +- net/ipv6/addrlabel.c | 2 +- net/xfrm/xfrm_policy.c | 4 ++-- 15 files changed, 21 insertions(+), 21 deletions(-) diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt index 49b8551a3b68..e48c57f1943b 100644 --- a/Documentation/RCU/whatisRCU.txt +++ b/Documentation/RCU/whatisRCU.txt @@ -818,7 +818,7 @@ RCU pointer/list update: list_add_tail_rcu list_del_rcu list_replace_rcu - hlist_add_after_rcu + hlist_add_behind_rcu hlist_add_before_rcu hlist_add_head_rcu hlist_del_rcu diff --git a/drivers/gpu/drm/drm_hashtab.c b/drivers/gpu/drm/drm_hashtab.c index 7e4bae760e27..c3b80fd65d62 100644 --- a/drivers/gpu/drm/drm_hashtab.c +++ b/drivers/gpu/drm/drm_hashtab.c @@ -125,7 +125,7 @@ int drm_ht_insert_item(struct drm_open_hash *ht, struct drm_hash_item *item) parent = &entry->head; } if (parent) { - hlist_add_after_rcu(parent, &item->head); + hlist_add_behind_rcu(&item->head, parent); } else { hlist_add_head_rcu(&item->head, h_list); } diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c index 681a9e81ff51..e8ba7470700a 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c +++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c @@ -1948,7 +1948,7 @@ static int i40e_update_ethtool_fdir_entry(struct i40e_vsi *vsi, /* add filter to the list */ if (parent) - hlist_add_after(&parent->fdir_node, &input->fdir_node); + hlist_add_behind(&input->fdir_node, &parent->fdir_node); else hlist_add_head(&input->fdir_node, &pf->fdir_filter_list); diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c index 94a1c07efeb0..e4100b5737b6 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c @@ -2517,7 +2517,7 @@ static int ixgbe_update_ethtool_fdir_entry(struct ixgbe_adapter *adapter, /* add filter to the list */ if (parent) - hlist_add_after(&parent->fdir_node, &input->fdir_node); + hlist_add_behind(&input->fdir_node, &parent->fdir_node); else hlist_add_head(&input->fdir_node, &adapter->fdir_filter_list); diff --git a/drivers/staging/lustre/lustre/libcfs/hash.c b/drivers/staging/lustre/lustre/libcfs/hash.c index 5dde79418297..8ef1deb59d4a 100644 --- a/drivers/staging/lustre/lustre/libcfs/hash.c +++ b/drivers/staging/lustre/lustre/libcfs/hash.c @@ -351,7 +351,7 @@ cfs_hash_dh_hnode_add(struct cfs_hash *hs, struct cfs_hash_bd *bd, cfs_hash_dhead_t, dh_head); if (dh->dh_tail != NULL) /* not empty */ - hlist_add_after(dh->dh_tail, hnode); + hlist_add_behind(hnode, dh->dh_tail); else /* empty list */ hlist_add_head(hnode, &dh->dh_head); dh->dh_tail = hnode; @@ -406,7 +406,7 @@ cfs_hash_dd_hnode_add(struct cfs_hash *hs, struct cfs_hash_bd *bd, cfs_hash_dhead_dep_t, dd_head); if (dh->dd_tail != NULL) /* not empty */ - hlist_add_after(dh->dd_tail, hnode); + hlist_add_behind(hnode, dh->dd_tail); 
else /* empty list */ hlist_add_head(hnode, &dh->dd_head); dh->dd_tail = hnode; diff --git a/fs/namespace.c b/fs/namespace.c index 182bc41cd887..2a1447c946e7 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -798,7 +798,7 @@ static void commit_tree(struct mount *mnt, struct mount *shadows) list_splice(&head, n->list.prev); if (shadows) - hlist_add_after_rcu(&shadows->mnt_hash, &mnt->mnt_hash); + hlist_add_behind_rcu(&mnt->mnt_hash, &shadows->mnt_hash); else hlist_add_head_rcu(&mnt->mnt_hash, m_hash(&parent->mnt, mnt->mnt_mountpoint)); diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c index 74825be65b7b..9ce062218de9 100644 --- a/fs/notify/inode_mark.c +++ b/fs/notify/inode_mark.c @@ -232,7 +232,7 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark, BUG_ON(last == NULL); /* mark should be the last entry. last is the current last entry */ - hlist_add_after_rcu(&last->i.i_list, &mark->i.i_list); + hlist_add_behind_rcu(&mark->i.i_list, &last->i.i_list); out: fsnotify_recalc_inode_mask_locked(inode); spin_unlock(&inode->i_lock); diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c index 68ca5a8704b5..ac851e8376b1 100644 --- a/fs/notify/vfsmount_mark.c +++ b/fs/notify/vfsmount_mark.c @@ -191,7 +191,7 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark, BUG_ON(last == NULL); /* mark should be the last entry. last is the current last entry */ - hlist_add_after_rcu(&last->m.m_list, &mark->m.m_list); + hlist_add_behind_rcu(&mark->m.m_list, &last->m.m_list); out: fsnotify_recalc_vfsmount_mask_locked(mnt); spin_unlock(&mnt->mnt_root->d_lock); diff --git a/include/linux/list.h b/include/linux/list.h index 624ec7f48293..cbbb96fcead9 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -654,8 +654,8 @@ static inline void hlist_add_before(struct hlist_node *n, *(n->pprev) = n; } -static inline void hlist_add_after(struct hlist_node *prev, - struct hlist_node *n) +static inline void hlist_add_behind(struct hlist_node *n, + struct hlist_node *prev) { n->next = prev->next; prev->next = n; diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 8183b46fbaa2..372ad5e0dcb8 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -432,9 +432,9 @@ static inline void hlist_add_before_rcu(struct hlist_node *n, } /** - * hlist_add_after_rcu - * @prev: the existing element to add the new element after. + * hlist_add_behind_rcu * @n: the new element to add to the hash list. + * @prev: the existing element to add the new element after. * * Description: * Adds the specified element to the specified hlist @@ -449,8 +449,8 @@ static inline void hlist_add_before_rcu(struct hlist_node *n, * hlist_for_each_entry_rcu(), used to prevent memory-consistency * problems on Alpha CPUs. */ -static inline void hlist_add_after_rcu(struct hlist_node *prev, - struct hlist_node *n) +static inline void hlist_add_behind_rcu(struct hlist_node *n, + struct hlist_node *prev) { n->next = prev->next; n->pprev = &prev->next; diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index 022d18ab27a6..52c43f904220 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -188,7 +188,7 @@ static bool batadv_frag_insert_packet(struct batadv_orig_node *orig_node, /* Reached the end of the list, so insert after 'frag_entry_last'. 
*/ if (likely(frag_entry_last)) { - hlist_add_after(&frag_entry_last->list, &frag_entry_new->list); + hlist_add_behind(&frag_entry_new->list, &frag_entry_last->list); chain->size += skb->len - hdr_size; chain->timestamp = jiffies; ret = true; diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index b4845f4b2bb4..7751c92c8c57 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -1174,7 +1174,7 @@ static void br_multicast_add_router(struct net_bridge *br, } if (slot) - hlist_add_after_rcu(slot, &port->rlist); + hlist_add_behind_rcu(&port->rlist, slot); else hlist_add_head_rcu(&port->rlist, &br->router_list); } diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 5afeb5aa4c7c..e9cb2588e416 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -940,7 +940,7 @@ static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new) last = li; } if (last) - hlist_add_after_rcu(&last->hlist, &new->hlist); + hlist_add_behind_rcu(&new->hlist, &last->hlist); else hlist_add_before_rcu(&new->hlist, &li->hlist); } diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c index 731e1e1722d9..fd0dc47f471d 100644 --- a/net/ipv6/addrlabel.c +++ b/net/ipv6/addrlabel.c @@ -277,7 +277,7 @@ static int __ip6addrlbl_add(struct ip6addrlbl_entry *newp, int replace) last = p; } if (last) - hlist_add_after_rcu(&last->list, &newp->list); + hlist_add_behind_rcu(&newp->list, &last->list); else hlist_add_head_rcu(&newp->list, &ip6addrlbl_table.head); out: diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 0525d78ba328..beeed602aeb3 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -389,7 +389,7 @@ redo: if (h != h0) continue; hlist_del(&pol->bydst); - hlist_add_after(entry0, &pol->bydst); + hlist_add_behind(&pol->bydst, entry0); } entry0 = &pol->bydst; } @@ -654,7 +654,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) break; } if (newpos) - hlist_add_after(newpos, &policy->bydst); + hlist_add_behind(&policy->bydst, newpos); else hlist_add_head(&policy->bydst, chain); xfrm_pol_hold(policy); -- cgit v1.2.3 From 0f9859ca92c9182bcb8f18c55cae1a04627cbb59 Mon Sep 17 00:00:00 2001 From: Ken Helias Date: Wed, 6 Aug 2014 16:09:18 -0700 Subject: klist: use same naming scheme as hlist for klist_add_after() The name was modified from hlist_add_after() to hlist_add_behind() when adjusting the order of arguments to match that of klist_add_after(). This is necessary to break old code when it would use it the wrong way. Make klist follow this naming scheme for consistency. Signed-off-by: Ken Helias Cc: "Paul E.
McKenney" Cc: Christoph Hellwig Cc: Hugh Dickins Cc: Jeff Kirsher Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/klist.h | 2 +- lib/klist.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/klist.h b/include/linux/klist.h index a370ce57cf1d..61e5b723ae73 100644 --- a/include/linux/klist.h +++ b/include/linux/klist.h @@ -44,7 +44,7 @@ struct klist_node { extern void klist_add_tail(struct klist_node *n, struct klist *k); extern void klist_add_head(struct klist_node *n, struct klist *k); -extern void klist_add_after(struct klist_node *n, struct klist_node *pos); +extern void klist_add_behind(struct klist_node *n, struct klist_node *pos); extern void klist_add_before(struct klist_node *n, struct klist_node *pos); extern void klist_del(struct klist_node *n); diff --git a/lib/klist.c b/lib/klist.c index 358a368a2947..89b485a2a58d 100644 --- a/lib/klist.c +++ b/lib/klist.c @@ -140,11 +140,11 @@ void klist_add_tail(struct klist_node *n, struct klist *k) EXPORT_SYMBOL_GPL(klist_add_tail); /** - * klist_add_after - Init a klist_node and add it after an existing node + * klist_add_behind - Init a klist_node and add it after an existing node * @n: node we're adding. * @pos: node to put @n after */ -void klist_add_after(struct klist_node *n, struct klist_node *pos) +void klist_add_behind(struct klist_node *n, struct klist_node *pos) { struct klist *k = knode_klist(pos); @@ -153,7 +153,7 @@ void klist_add_after(struct klist_node *n, struct klist_node *pos) list_add(&n->n_node, &pos->n_node); spin_unlock(&k->k_lock); } -EXPORT_SYMBOL_GPL(klist_add_after); +EXPORT_SYMBOL_GPL(klist_add_behind); /** * klist_add_before - Init a klist_node and add it before an existing node -- cgit v1.2.3 From 62e7ca5280fd8cbf523970757e13f0324ce0daa0 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 6 Aug 2014 16:09:21 -0700 Subject: zlib: clean up some dead code Cleanup unused `if 0'-ed functions, which have been dead since 2006 (commits 87c2ce3b9305 ("lib/zlib*: cleanups") by Adrian Bunk and 4f3865fb57a0 ("zlib_inflate: Upgrade library code to a recent version") by Richard Purdie): - zlib_deflateSetDictionary - zlib_deflateParams - zlib_deflateCopy - zlib_inflateSync - zlib_syncsearch - zlib_inflateSetDictionary - zlib_inflatePrime Signed-off-by: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/zlib.h | 118 ------------------------------------- lib/zlib_deflate/deflate.c | 143 --------------------------------------------- lib/zlib_inflate/inflate.c | 132 ----------------------------------------- 3 files changed, 393 deletions(-) diff --git a/include/linux/zlib.h b/include/linux/zlib.h index 9c5a6b4de0a3..197abb2a54c5 100644 --- a/include/linux/zlib.h +++ b/include/linux/zlib.h @@ -493,64 +493,6 @@ extern int deflateInit2 (z_streamp strm, method). msg is set to null if there is no error message. deflateInit2 does not perform any compression: this will be done by deflate(). */ - -#if 0 -extern int zlib_deflateSetDictionary (z_streamp strm, - const Byte *dictionary, - uInt dictLength); -#endif -/* - Initializes the compression dictionary from the given byte sequence - without producing any compressed output. This function must be called - immediately after deflateInit, deflateInit2 or deflateReset, before any - call of deflate. The compressor and decompressor must use exactly the same - dictionary (see inflateSetDictionary). 
- - The dictionary should consist of strings (byte sequences) that are likely - to be encountered later in the data to be compressed, with the most commonly - used strings preferably put towards the end of the dictionary. Using a - dictionary is most useful when the data to be compressed is short and can be - predicted with good accuracy; the data can then be compressed better than - with the default empty dictionary. - - Depending on the size of the compression data structures selected by - deflateInit or deflateInit2, a part of the dictionary may in effect be - discarded, for example if the dictionary is larger than the window size in - deflate or deflate2. Thus the strings most likely to be useful should be - put at the end of the dictionary, not at the front. - - Upon return of this function, strm->adler is set to the Adler32 value - of the dictionary; the decompressor may later use this value to determine - which dictionary has been used by the compressor. (The Adler32 value - applies to the whole dictionary even if only a subset of the dictionary is - actually used by the compressor.) - - deflateSetDictionary returns Z_OK if success, or Z_STREAM_ERROR if a - parameter is invalid (such as NULL dictionary) or the stream state is - inconsistent (for example if deflate has already been called for this stream - or if the compression method is bsort). deflateSetDictionary does not - perform any compression: this will be done by deflate(). -*/ - -#if 0 -extern int zlib_deflateCopy (z_streamp dest, z_streamp source); -#endif - -/* - Sets the destination stream as a complete copy of the source stream. - - This function can be useful when several compression strategies will be - tried, for example when there are several ways of pre-processing the input - data with a filter. The streams that will be discarded should then be freed - by calling deflateEnd. Note that deflateCopy duplicates the internal - compression state which can be quite large, so this strategy is slow and - can consume lots of memory. - - deflateCopy returns Z_OK if success, Z_MEM_ERROR if there was not - enough memory, Z_STREAM_ERROR if the source stream state was inconsistent - (such as zalloc being NULL). msg is left unchanged in both source and - destination. -*/ extern int zlib_deflateReset (z_streamp strm); /* @@ -568,27 +510,6 @@ static inline unsigned long deflateBound(unsigned long s) return s + ((s + 7) >> 3) + ((s + 63) >> 6) + 11; } -#if 0 -extern int zlib_deflateParams (z_streamp strm, int level, int strategy); -#endif -/* - Dynamically update the compression level and compression strategy. The - interpretation of level and strategy is as in deflateInit2. This can be - used to switch between compression and straight copy of the input data, or - to switch to a different kind of input data requiring a different - strategy. If the compression level is changed, the input available so far - is compressed with the old level (and may be flushed); the new level will - take effect only at the next call of deflate(). - - Before the call of deflateParams, the stream state must be set as for - a call of deflate(), since the currently available input may have to - be compressed and flushed. In particular, strm->avail_out must be non-zero. - - deflateParams returns Z_OK if success, Z_STREAM_ERROR if the source - stream state was inconsistent or if a parameter was invalid, Z_BUF_ERROR - if strm->avail_out was zero. 
-*/ - /* extern int inflateInit2 (z_streamp strm, int windowBits); @@ -631,45 +552,6 @@ extern int inflateInit2 (z_streamp strm, int windowBits); and avail_out are unchanged.) */ -extern int zlib_inflateSetDictionary (z_streamp strm, - const Byte *dictionary, - uInt dictLength); -/* - Initializes the decompression dictionary from the given uncompressed byte - sequence. This function must be called immediately after a call of inflate, - if that call returned Z_NEED_DICT. The dictionary chosen by the compressor - can be determined from the adler32 value returned by that call of inflate. - The compressor and decompressor must use exactly the same dictionary (see - deflateSetDictionary). For raw inflate, this function can be called - immediately after inflateInit2() or inflateReset() and before any call of - inflate() to set the dictionary. The application must insure that the - dictionary that was used for compression is provided. - - inflateSetDictionary returns Z_OK if success, Z_STREAM_ERROR if a - parameter is invalid (such as NULL dictionary) or the stream state is - inconsistent, Z_DATA_ERROR if the given dictionary doesn't match the - expected one (incorrect adler32 value). inflateSetDictionary does not - perform any decompression: this will be done by subsequent calls of - inflate(). -*/ - -#if 0 -extern int zlib_inflateSync (z_streamp strm); -#endif -/* - Skips invalid compressed data until a full flush point (see above the - description of deflate with Z_FULL_FLUSH) can be found, or until all - available input is skipped. No output is provided. - - inflateSync returns Z_OK if a full flush point has been found, Z_BUF_ERROR - if no more input was provided, Z_DATA_ERROR if no flush point has been found, - or Z_STREAM_ERROR if the stream structure was inconsistent. In the success - case, the application may save the current current value of total_in which - indicates where valid compressed data was found. In the error case, the - application may repeatedly call inflateSync, providing more input each time, - until success or end of the input data. -*/ - extern int zlib_inflateReset (z_streamp strm); /* This function is equivalent to inflateEnd followed by inflateInit, diff --git a/lib/zlib_deflate/deflate.c b/lib/zlib_deflate/deflate.c index d63381e8e333..d20ef458f137 100644 --- a/lib/zlib_deflate/deflate.c +++ b/lib/zlib_deflate/deflate.c @@ -249,52 +249,6 @@ int zlib_deflateInit2( return zlib_deflateReset(strm); } -/* ========================================================================= */ -#if 0 -int zlib_deflateSetDictionary( - z_streamp strm, - const Byte *dictionary, - uInt dictLength -) -{ - deflate_state *s; - uInt length = dictLength; - uInt n; - IPos hash_head = 0; - - if (strm == NULL || strm->state == NULL || dictionary == NULL) - return Z_STREAM_ERROR; - - s = (deflate_state *) strm->state; - if (s->status != INIT_STATE) return Z_STREAM_ERROR; - - strm->adler = zlib_adler32(strm->adler, dictionary, dictLength); - - if (length < MIN_MATCH) return Z_OK; - if (length > MAX_DIST(s)) { - length = MAX_DIST(s); -#ifndef USE_DICT_HEAD - dictionary += dictLength - length; /* use the tail of the dictionary */ -#endif - } - memcpy((char *)s->window, dictionary, length); - s->strstart = length; - s->block_start = (long)length; - - /* Insert all strings in the hash table (except for the last two bytes). - * s->lookahead stays null, so s->ins_h will be recomputed at the next - * call of fill_window. 
- */ - s->ins_h = s->window[0]; - UPDATE_HASH(s, s->ins_h, s->window[1]); - for (n = 0; n <= length - MIN_MATCH; n++) { - INSERT_STRING(s, n, hash_head); - } - if (hash_head) hash_head = 0; /* to make compiler happy */ - return Z_OK; -} -#endif /* 0 */ - /* ========================================================================= */ int zlib_deflateReset( z_streamp strm @@ -326,45 +280,6 @@ int zlib_deflateReset( return Z_OK; } -/* ========================================================================= */ -#if 0 -int zlib_deflateParams( - z_streamp strm, - int level, - int strategy -) -{ - deflate_state *s; - compress_func func; - int err = Z_OK; - - if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR; - s = (deflate_state *) strm->state; - - if (level == Z_DEFAULT_COMPRESSION) { - level = 6; - } - if (level < 0 || level > 9 || strategy < 0 || strategy > Z_HUFFMAN_ONLY) { - return Z_STREAM_ERROR; - } - func = configuration_table[s->level].func; - - if (func != configuration_table[level].func && strm->total_in != 0) { - /* Flush the last buffer: */ - err = zlib_deflate(strm, Z_PARTIAL_FLUSH); - } - if (s->level != level) { - s->level = level; - s->max_lazy_match = configuration_table[level].max_lazy; - s->good_match = configuration_table[level].good_length; - s->nice_match = configuration_table[level].nice_length; - s->max_chain_length = configuration_table[level].max_chain; - } - s->strategy = strategy; - return err; -} -#endif /* 0 */ - /* ========================================================================= * Put a short in the pending buffer. The 16-bit value is put in MSB order. * IN assertion: the stream state is correct and there is enough room in @@ -568,64 +483,6 @@ int zlib_deflateEnd( return status == BUSY_STATE ? Z_DATA_ERROR : Z_OK; } -/* ========================================================================= - * Copy the source state to the destination state. - */ -#if 0 -int zlib_deflateCopy ( - z_streamp dest, - z_streamp source -) -{ -#ifdef MAXSEG_64K - return Z_STREAM_ERROR; -#else - deflate_state *ds; - deflate_state *ss; - ush *overlay; - deflate_workspace *mem; - - - if (source == NULL || dest == NULL || source->state == NULL) { - return Z_STREAM_ERROR; - } - - ss = (deflate_state *) source->state; - - *dest = *source; - - mem = (deflate_workspace *) dest->workspace; - - ds = &(mem->deflate_memory); - - dest->state = (struct internal_state *) ds; - *ds = *ss; - ds->strm = dest; - - ds->window = (Byte *) mem->window_memory; - ds->prev = (Pos *) mem->prev_memory; - ds->head = (Pos *) mem->head_memory; - overlay = (ush *) mem->overlay_memory; - ds->pending_buf = (uch *) overlay; - - memcpy(ds->window, ss->window, ds->w_size * 2 * sizeof(Byte)); - memcpy(ds->prev, ss->prev, ds->w_size * sizeof(Pos)); - memcpy(ds->head, ss->head, ds->hash_size * sizeof(Pos)); - memcpy(ds->pending_buf, ss->pending_buf, (uInt)ds->pending_buf_size); - - ds->pending_out = ds->pending_buf + (ss->pending_out - ss->pending_buf); - ds->d_buf = overlay + ds->lit_bufsize/sizeof(ush); - ds->l_buf = ds->pending_buf + (1+sizeof(ush))*ds->lit_bufsize; - - ds->l_desc.dyn_tree = ds->dyn_ltree; - ds->d_desc.dyn_tree = ds->dyn_dtree; - ds->bl_desc.dyn_tree = ds->bl_tree; - - return Z_OK; -#endif -} -#endif /* 0 */ - /* =========================================================================== * Read a new buffer from the current input stream, update the adler32 * and total number of bytes read. 
All deflate() input goes through diff --git a/lib/zlib_inflate/inflate.c b/lib/zlib_inflate/inflate.c index f5ce87b0800e..58a733b10387 100644 --- a/lib/zlib_inflate/inflate.c +++ b/lib/zlib_inflate/inflate.c @@ -45,21 +45,6 @@ int zlib_inflateReset(z_streamp strm) return Z_OK; } -#if 0 -int zlib_inflatePrime(z_streamp strm, int bits, int value) -{ - struct inflate_state *state; - - if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR; - state = (struct inflate_state *)strm->state; - if (bits > 16 || state->bits + bits > 32) return Z_STREAM_ERROR; - value &= (1L << bits) - 1; - state->hold += value << state->bits; - state->bits += bits; - return Z_OK; -} -#endif - int zlib_inflateInit2(z_streamp strm, int windowBits) { struct inflate_state *state; @@ -761,123 +746,6 @@ int zlib_inflateEnd(z_streamp strm) return Z_OK; } -#if 0 -int zlib_inflateSetDictionary(z_streamp strm, const Byte *dictionary, - uInt dictLength) -{ - struct inflate_state *state; - unsigned long id; - - /* check state */ - if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR; - state = (struct inflate_state *)strm->state; - if (state->wrap != 0 && state->mode != DICT) - return Z_STREAM_ERROR; - - /* check for correct dictionary id */ - if (state->mode == DICT) { - id = zlib_adler32(0L, NULL, 0); - id = zlib_adler32(id, dictionary, dictLength); - if (id != state->check) - return Z_DATA_ERROR; - } - - /* copy dictionary to window */ - zlib_updatewindow(strm, strm->avail_out); - - if (dictLength > state->wsize) { - memcpy(state->window, dictionary + dictLength - state->wsize, - state->wsize); - state->whave = state->wsize; - } - else { - memcpy(state->window + state->wsize - dictLength, dictionary, - dictLength); - state->whave = dictLength; - } - state->havedict = 1; - return Z_OK; -} -#endif - -#if 0 -/* - Search buf[0..len-1] for the pattern: 0, 0, 0xff, 0xff. Return when found - or when out of input. When called, *have is the number of pattern bytes - found in order so far, in 0..3. On return *have is updated to the new - state. If on return *have equals four, then the pattern was found and the - return value is how many bytes were read including the last byte of the - pattern. If *have is less than four, then the pattern has not been found - yet and the return value is len. In the latter case, zlib_syncsearch() can be - called again with more data and the *have state. *have is initialized to - zero for the first call. - */ -static unsigned zlib_syncsearch(unsigned *have, unsigned char *buf, - unsigned len) -{ - unsigned got; - unsigned next; - - got = *have; - next = 0; - while (next < len && got < 4) { - if ((int)(buf[next]) == (got < 2 ? 
0 : 0xff)) - got++; - else if (buf[next]) - got = 0; - else - got = 4 - got; - next++; - } - *have = got; - return next; -} -#endif - -#if 0 -int zlib_inflateSync(z_streamp strm) -{ - unsigned len; /* number of bytes to look at or looked at */ - unsigned long in, out; /* temporary to save total_in and total_out */ - unsigned char buf[4]; /* to restore bit buffer to byte string */ - struct inflate_state *state; - - /* check parameters */ - if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR; - state = (struct inflate_state *)strm->state; - if (strm->avail_in == 0 && state->bits < 8) return Z_BUF_ERROR; - - /* if first time, start search in bit buffer */ - if (state->mode != SYNC) { - state->mode = SYNC; - state->hold <<= state->bits & 7; - state->bits -= state->bits & 7; - len = 0; - while (state->bits >= 8) { - buf[len++] = (unsigned char)(state->hold); - state->hold >>= 8; - state->bits -= 8; - } - state->have = 0; - zlib_syncsearch(&(state->have), buf, len); - } - - /* search available input */ - len = zlib_syncsearch(&(state->have), strm->next_in, strm->avail_in); - strm->avail_in -= len; - strm->next_in += len; - strm->total_in += len; - - /* return no joy or set up to restart inflate() on a new block */ - if (state->have != 4) return Z_DATA_ERROR; - in = strm->total_in; out = strm->total_out; - zlib_inflateReset(strm); - strm->total_in = in; strm->total_out = out; - state->mode = TYPE; - return Z_OK; -} -#endif - /* * This subroutine adds the data at next_in/avail_in to the output history * without performing any output. The output buffer must be "caught up"; -- cgit v1.2.3 From b01250856b25f4417c51aa33afc451fbf7da1484 Mon Sep 17 00:00:00 2001 From: George Spelvin Date: Wed, 6 Aug 2014 16:09:23 -0700 Subject: lib: add lib/glob.c This is a helper function from drivers/ata/libata_core.c, where it is used to blacklist particular device models. It's being moved to lib/ so other drivers may use it for the same purpose. This implementation in non-recursive, so is safe for the kernel stack. [akpm@linux-foundation.org: fix sparse warning] Signed-off-by: George Spelvin Cc: Randy Dunlap Cc: Tejun Heo Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/glob.h | 9 ++++ lib/Kconfig | 19 ++++++++ lib/Makefile | 2 + lib/glob.c | 123 +++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 153 insertions(+) create mode 100644 include/linux/glob.h create mode 100644 lib/glob.c diff --git a/include/linux/glob.h b/include/linux/glob.h new file mode 100644 index 000000000000..861d8347d08e --- /dev/null +++ b/include/linux/glob.h @@ -0,0 +1,9 @@ +#ifndef _LINUX_GLOB_H +#define _LINUX_GLOB_H + +#include /* For bool */ +#include /* For __pure */ + +bool __pure glob_match(char const *pat, char const *str); + +#endif /* _LINUX_GLOB_H */ diff --git a/lib/Kconfig b/lib/Kconfig index a8a775730c09..41bfeec72e40 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -396,6 +396,25 @@ config CPU_RMAP config DQL bool +config GLOB + bool +# This actually supports modular compilation, but the module overhead +# is ridiculous for the amount of code involved. Until an out-of-tree +# driver asks for it, we'll just link it directly it into the kernel +# when required. Since we're ignoring out-of-tree users, there's also +# no need bother prompting for a manual decision: +# prompt "glob_match() function" + help + This option provides a glob_match function for performing + simple text pattern matching. 
It originated in the ATA code + to blacklist particular drive models, but other device drivers + may need similar functionality. + + All drivers in the Linux kernel tree that require this function + should automatically select this option. Say N unless you + are compiling an out-of tree driver which tells you that it + depends on this. + # # Netlink attribute parsing support is select'ed if needed # diff --git a/lib/Makefile b/lib/Makefile index 8427df95dade..d6b4bc496408 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -137,6 +137,8 @@ obj-$(CONFIG_CORDIC) += cordic.o obj-$(CONFIG_DQL) += dynamic_queue_limits.o +obj-$(CONFIG_GLOB) += glob.o + obj-$(CONFIG_MPILIB) += mpi/ obj-$(CONFIG_SIGNATURE) += digsig.o diff --git a/lib/glob.c b/lib/glob.c new file mode 100644 index 000000000000..0ba3ea86b546 --- /dev/null +++ b/lib/glob.c @@ -0,0 +1,123 @@ +#include +#include + +/* + * The only reason this code can be compiled as a module is because the + * ATA code that depends on it can be as well. In practice, they're + * both usually compiled in and the module overhead goes away. + */ +MODULE_DESCRIPTION("glob(7) matching"); +MODULE_LICENSE("Dual MIT/GPL"); + +/** + * glob_match - Shell-style pattern matching, like !fnmatch(pat, str, 0) + * @pat: Shell-style pattern to match, e.g. "*.[ch]". + * @str: String to match. The pattern must match the entire string. + * + * Perform shell-style glob matching, returning true (1) if the match + * succeeds, or false (0) if it fails. Equivalent to !fnmatch(@pat, @str, 0). + * + * Pattern metacharacters are ?, *, [ and \. + * (And, inside character classes, !, - and ].) + * + * This is small and simple implementation intended for device blacklists + * where a string is matched against a number of patterns. Thus, it + * does not preprocess the patterns. It is non-recursive, and run-time + * is at most quadratic: strlen(@str)*strlen(@pat). + * + * An example of the worst case is glob_match("*aaaaa", "aaaaaaaaaa"); + * it takes 6 passes over the pattern before matching the string. + * + * Like !fnmatch(@pat, @str, 0) and unlike the shell, this does NOT + * treat / or leading . specially; it isn't actually used for pathnames. + * + * Note that according to glob(7) (and unlike bash), character classes + * are complemented by a leading !; this does not support the regex-style + * [^a-z] syntax. + * + * An opening bracket without a matching close is matched literally. + */ +bool __pure glob_match(char const *pat, char const *str) +{ + /* + * Backtrack to previous * on mismatch and retry starting one + * character later in the string. Because * matches all characters + * (no exception for /), it can be easily proved that there's + * never a need to backtrack multiple levels. + */ + char const *back_pat = NULL, *back_str = back_str; + + /* + * Loop over each token (character or class) in pat, matching + * it against the remaining unmatched tail of str. Return false + * on mismatch, or true after matching the trailing nul bytes. 
+ */ + for (;;) { + unsigned char c = *str++; + unsigned char d = *pat++; + + switch (d) { + case '?': /* Wildcard: anything but nul */ + if (c == '\0') + return false; + break; + case '*': /* Any-length wildcard */ + if (*pat == '\0') /* Optimize trailing * case */ + return true; + back_pat = pat; + back_str = --str; /* Allow zero-length match */ + break; + case '[': { /* Character class */ + bool match = false, inverted = (*pat == '!'); + char const *class = pat + inverted; + unsigned char a = *class++; + + /* + * Iterate over each span in the character class. + * A span is either a single character a, or a + * range a-b. The first span may begin with ']'. + */ + do { + unsigned char b = a; + + if (a == '\0') /* Malformed */ + goto literal; + + if (class[0] == '-' && class[1] != ']') { + b = class[1]; + + if (b == '\0') + goto literal; + + class += 2; + /* Any special action if a > b? */ + } + match |= (a <= c && c <= b); + } while ((a = *class++) != ']'); + + if (match == inverted) + goto backtrack; + pat = class; + } + break; + case '\\': + d = *pat++; + /*FALLTHROUGH*/ + default: /* Literal character */ +literal: + if (c == d) { + if (d == '\0') + return true; + break; + } +backtrack: + if (c == '\0' || !back_pat) + return false; /* No point continuing */ + /* Try again from last *, one character later in str. */ + pat = back_pat; + str = ++back_str; + break; + } + } +} +EXPORT_SYMBOL(glob_match); -- cgit v1.2.3 From 5f9be8248d9802730185f98f4db41285dc00283c Mon Sep 17 00:00:00 2001 From: George Spelvin Date: Wed, 6 Aug 2014 16:09:25 -0700 Subject: lib/glob.c: add CONFIG_GLOB_SELFTEST This was useful during development, and is retained for future regression testing. GCC appears to have no way to place string literals in a particular section; adding __initconst to a char pointer leaves the string itself in the default string section, where it will not be thrown away after module load. Thus all string constants are kept in explicitly declared and named arrays. Sorry this makes printk a bit harder to read. At least the tests are more compact. Signed-off-by: George Spelvin Cc: Randy Dunlap Cc: Tejun Heo Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig | 14 ++++++ lib/glob.c | 164 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 178 insertions(+) diff --git a/lib/Kconfig b/lib/Kconfig index 41bfeec72e40..df872659ddd3 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -415,6 +415,20 @@ config GLOB are compiling an out-of tree driver which tells you that it depends on this. +config GLOB_SELFTEST + bool "glob self-test on init" + default n + depends on GLOB + help + This option enables a simple self-test of the glob_match + function on startup. It is primarily useful for people + working on the code to ensure they haven't introduced any + regressions. + + It only adds a little bit of code and slows kernel boot (or + module load) by a small amount, so you're welcome to play with + it, but you probably don't need it. 
+ # # Netlink attribute parsing support is select'ed if needed # diff --git a/lib/glob.c b/lib/glob.c index 0ba3ea86b546..500fc80d23e1 100644 --- a/lib/glob.c +++ b/lib/glob.c @@ -121,3 +121,167 @@ backtrack: } } EXPORT_SYMBOL(glob_match); + + +#ifdef CONFIG_GLOB_SELFTEST + +#include +#include + +/* Boot with "glob.verbose=1" to show successful tests, too */ +static bool verbose = false; +module_param(verbose, bool, 0); + +struct glob_test { + char const *pat, *str; + bool expected; +}; + +static bool __pure __init test(char const *pat, char const *str, bool expected) +{ + bool match = glob_match(pat, str); + bool success = match == expected; + + /* Can't get string literals into a particular section, so... */ + static char const msg_error[] __initconst = + KERN_ERR "glob: \"%s\" vs. \"%s\": %s *** ERROR ***\n"; + static char const msg_ok[] __initconst = + KERN_DEBUG "glob: \"%s\" vs. \"%s\": %s OK\n"; + static char const mismatch[] __initconst = "mismatch"; + char const *message; + + if (!success) + message = msg_error; + else if (verbose) + message = msg_ok; + else + return success; + + printk(message, pat, str, mismatch + 3*match); + return success; +} + +/* + * The tests are all jammed together in one array to make it simpler + * to place that array in the .init.rodata section. The obvious + * "array of structures containing char *" has no way to force the + * pointed-to strings to be in a particular section. + * + * Anyway, a test consists of: + * 1. Expected glob_match result: '1' or '0'. + * 2. Pattern to match: null-terminated string + * 3. String to match against: null-terminated string + * + * The list of tests is terminated with a final '\0' instead of + * a glob_match result character. + */ +static char const glob_tests[] __initconst = + /* Some basic tests */ + "1" "a\0" "a\0" + "0" "a\0" "b\0" + "0" "a\0" "aa\0" + "0" "a\0" "\0" + "1" "\0" "\0" + "0" "\0" "a\0" + /* Simple character class tests */ + "1" "[a]\0" "a\0" + "0" "[a]\0" "b\0" + "0" "[!a]\0" "a\0" + "1" "[!a]\0" "b\0" + "1" "[ab]\0" "a\0" + "1" "[ab]\0" "b\0" + "0" "[ab]\0" "c\0" + "1" "[!ab]\0" "c\0" + "1" "[a-c]\0" "b\0" + "0" "[a-c]\0" "d\0" + /* Corner cases in character class parsing */ + "1" "[a-c-e-g]\0" "-\0" + "0" "[a-c-e-g]\0" "d\0" + "1" "[a-c-e-g]\0" "f\0" + "1" "[]a-ceg-ik[]\0" "a\0" + "1" "[]a-ceg-ik[]\0" "]\0" + "1" "[]a-ceg-ik[]\0" "[\0" + "1" "[]a-ceg-ik[]\0" "h\0" + "0" "[]a-ceg-ik[]\0" "f\0" + "0" "[!]a-ceg-ik[]\0" "h\0" + "0" "[!]a-ceg-ik[]\0" "]\0" + "1" "[!]a-ceg-ik[]\0" "f\0" + /* Simple wild cards */ + "1" "?\0" "a\0" + "0" "?\0" "aa\0" + "0" "??\0" "a\0" + "1" "?x?\0" "axb\0" + "0" "?x?\0" "abx\0" + "0" "?x?\0" "xab\0" + /* Asterisk wild cards (backtracking) */ + "0" "*??\0" "a\0" + "1" "*??\0" "ab\0" + "1" "*??\0" "abc\0" + "1" "*??\0" "abcd\0" + "0" "??*\0" "a\0" + "1" "??*\0" "ab\0" + "1" "??*\0" "abc\0" + "1" "??*\0" "abcd\0" + "0" "?*?\0" "a\0" + "1" "?*?\0" "ab\0" + "1" "?*?\0" "abc\0" + "1" "?*?\0" "abcd\0" + "1" "*b\0" "b\0" + "1" "*b\0" "ab\0" + "0" "*b\0" "ba\0" + "1" "*b\0" "bb\0" + "1" "*b\0" "abb\0" + "1" "*b\0" "bab\0" + "1" "*bc\0" "abbc\0" + "1" "*bc\0" "bc\0" + "1" "*bc\0" "bbc\0" + "1" "*bc\0" "bcbc\0" + /* Multiple asterisks (complex backtracking) */ + "1" "*ac*\0" "abacadaeafag\0" + "1" "*ac*ae*ag*\0" "abacadaeafag\0" + "1" "*a*b*[bc]*[ef]*g*\0" "abacadaeafag\0" + "0" "*a*b*[ef]*[cd]*g*\0" "abacadaeafag\0" + "1" "*abcd*\0" "abcabcabcabcdefg\0" + "1" "*ab*cd*\0" "abcabcabcabcdefg\0" + "1" "*abcd*abcdef*\0" "abcabcdabcdeabcdefg\0" + "0" "*abcd*\0" "abcabcabcabcefg\0" + "0" 
"*ab*cd*\0" "abcabcabcabcefg\0"; + +static int __init glob_init(void) +{ + unsigned successes = 0; + unsigned n = 0; + char const *p = glob_tests; + static char const message[] __initconst = + KERN_INFO "glob: %u self-tests passed, %u failed\n"; + + /* + * Tests are jammed together in a string. The first byte is '1' + * or '0' to indicate the expected outcome, or '\0' to indicate the + * end of the tests. Then come two null-terminated strings: the + * pattern and the string to match it against. + */ + while (*p) { + bool expected = *p++ & 1; + char const *pat = p; + + p += strlen(p) + 1; + successes += test(pat, p, expected); + p += strlen(p) + 1; + n++; + } + + n -= successes; + printk(message, successes, n); + + /* What's the errno for "kernel bug detected"? Guess... */ + return n ? -ECANCELED : 0; +} + +/* We need a dummy exit function to allow unload */ +static void __exit glob_fini(void) { } + +module_init(glob_init); +module_exit(glob_fini); + +#endif /* CONFIG_GLOB_SELFTEST */ -- cgit v1.2.3 From 428ac5fc056e06dc0b4ed82d5979add9a8c62b35 Mon Sep 17 00:00:00 2001 From: George Spelvin Date: Wed, 6 Aug 2014 16:09:27 -0700 Subject: libata: Use glob_match from lib/glob.c The function may be useful for other drivers, so export it. (Suggested by Tejun Heo.) Note that I inverted the return value of glob_match; returning true on match seemed to make more sense. Signed-off-by: George Spelvin Cc: Randy Dunlap Cc: Tejun Heo Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/ata/Kconfig | 1 + drivers/ata/libata-core.c | 72 ++--------------------------------------------- 2 files changed, 4 insertions(+), 69 deletions(-) diff --git a/drivers/ata/Kconfig b/drivers/ata/Kconfig index e65d400efd44..e1b92788c225 100644 --- a/drivers/ata/Kconfig +++ b/drivers/ata/Kconfig @@ -16,6 +16,7 @@ menuconfig ATA depends on BLOCK depends on !(M32R || M68K || S390) || BROKEN select SCSI + select GLOB ---help--- If you want to use an ATA hard disk, ATA tape drive, ATA CD-ROM or any other ATA device under Linux, say Y and make sure that you know diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 677c0c1b03bd..dbdc5d32343f 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -59,6 +59,7 @@ #include #include #include +#include #include #include #include @@ -4250,73 +4251,6 @@ static const struct ata_blacklist_entry ata_device_blacklist [] = { { } }; -/** - * glob_match - match a text string against a glob-style pattern - * @text: the string to be examined - * @pattern: the glob-style pattern to be matched against - * - * Either/both of text and pattern can be empty strings. - * - * Match text against a glob-style pattern, with wildcards and simple sets: - * - * ? matches any single character. - * * matches any run of characters. - * [xyz] matches a single character from the set: x, y, or z. - * [a-d] matches a single character from the range: a, b, c, or d. - * [a-d0-9] matches a single character from either range. - * - * The special characters ?, [, -, or *, can be matched using a set, eg. [*] - * Behaviour with malformed patterns is undefined, though generally reasonable. - * - * Sample patterns: "SD1?", "SD1[0-5]", "*R0", "SD*1?[012]*xx" - * - * This function uses one level of recursion per '*' in pattern. - * Since it calls _nothing_ else, and has _no_ explicit local variables, - * this will not cause stack problems for any reasonable use here. - * - * RETURNS: - * 0 on match, 1 otherwise. 
- */ -static int glob_match (const char *text, const char *pattern) -{ - do { - /* Match single character or a '?' wildcard */ - if (*text == *pattern || *pattern == '?') { - if (!*pattern++) - return 0; /* End of both strings: match */ - } else { - /* Match single char against a '[' bracketed ']' pattern set */ - if (!*text || *pattern != '[') - break; /* Not a pattern set */ - while (*++pattern && *pattern != ']' && *text != *pattern) { - if (*pattern == '-' && *(pattern - 1) != '[') - if (*text > *(pattern - 1) && *text < *(pattern + 1)) { - ++pattern; - break; - } - } - if (!*pattern || *pattern == ']') - return 1; /* No match */ - while (*pattern && *pattern++ != ']'); - } - } while (*++text && *pattern); - - /* Match any run of chars against a '*' wildcard */ - if (*pattern == '*') { - if (!*++pattern) - return 0; /* Match: avoid recursion at end of pattern */ - /* Loop to handle additional pattern chars after the wildcard */ - while (*text) { - if (glob_match(text, pattern) == 0) - return 0; /* Remainder matched */ - ++text; /* Absorb (match) this char and try again */ - } - } - if (!*text && !*pattern) - return 0; /* End of both strings: match */ - return 1; /* No match */ -} - static unsigned long ata_dev_blacklisted(const struct ata_device *dev) { unsigned char model_num[ATA_ID_PROD_LEN + 1]; @@ -4327,10 +4261,10 @@ static unsigned long ata_dev_blacklisted(const struct ata_device *dev) ata_id_c_string(dev->id, model_rev, ATA_ID_FW_REV, sizeof(model_rev)); while (ad->model_num) { - if (!glob_match(model_num, ad->model_num)) { + if (glob_match(model_num, ad->model_num)) { if (ad->model_rev == NULL) return ad->horkage; - if (!glob_match(model_rev, ad->model_rev)) + if (glob_match(model_rev, ad->model_rev)) return ad->horkage; } ad++; -- cgit v1.2.3 From e004f3c7780de32fa822f292ebadd985bcadb1e0 Mon Sep 17 00:00:00 2001 From: Gui Hecheng Date: Wed, 6 Aug 2014 16:09:29 -0700 Subject: lib/cmdline.c: add size unit t/p/e to memparse For modern filesystems such as btrfs, t/p/e size level operations are common. add size unit t/p/e parsing to memparse Signed-off-by: Gui Hecheng Acked-by: David Rientjes Reviewed-by: Satoru Takeuchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/cmdline.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/lib/cmdline.c b/lib/cmdline.c index d4932f745e92..76a712e6e20e 100644 --- a/lib/cmdline.c +++ b/lib/cmdline.c @@ -121,11 +121,7 @@ EXPORT_SYMBOL(get_options); * @retptr: (output) Optional pointer to next char after parse completes * * Parses a string into a number. The number stored at @ptr is - * potentially suffixed with %K (for kilobytes, or 1024 bytes), - * %M (for megabytes, or 1048576 bytes), or %G (for gigabytes, or - * 1073741824). If the number is suffixed with K, M, or G, then - * the return value is the number multiplied by one kilobyte, one - * megabyte, or one gigabyte, respectively. + * potentially suffixed with K, M, G, T, P, E. 
*/ unsigned long long memparse(const char *ptr, char **retptr) @@ -135,6 +131,15 @@ unsigned long long memparse(const char *ptr, char **retptr) unsigned long long ret = simple_strtoull(ptr, &endptr, 0); switch (*endptr) { + case 'E': + case 'e': + ret <<= 10; + case 'P': + case 'p': + ret <<= 10; + case 'T': + case 't': + ret <<= 10; case 'G': case 'g': ret <<= 10; -- cgit v1.2.3 From 142cda5dbcb0dc3738c079f591290d6384261c73 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Wed, 6 Aug 2014 16:09:31 -0700 Subject: lib/string_helpers.c: constify static arrays Complement commit 68aecfb97978 ("lib/string_helpers.c: make arrays static") by making the arrays const -- not only pointing to const strings. This moves them out of the data section to the r/o data section: text data bss dec hex filename 1150 176 0 1326 52e lib/string_helpers.old.o 1326 0 0 1326 52e lib/string_helpers.new.o Signed-off-by: Mathias Krause Cc: James Bottomley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/string_helpers.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/lib/string_helpers.c b/lib/string_helpers.c index ed5c1454dd62..29033f319aea 100644 --- a/lib/string_helpers.c +++ b/lib/string_helpers.c @@ -25,12 +25,15 @@ int string_get_size(u64 size, const enum string_size_units units, char *buf, int len) { - static const char *units_10[] = { "B", "kB", "MB", "GB", "TB", "PB", - "EB", "ZB", "YB", NULL}; - static const char *units_2[] = {"B", "KiB", "MiB", "GiB", "TiB", "PiB", - "EiB", "ZiB", "YiB", NULL }; - static const char **units_str[] = { - [STRING_UNITS_10] = units_10, + static const char *const units_10[] = { + "B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB", NULL + }; + static const char *const units_2[] = { + "B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB", + NULL + }; + static const char *const *const units_str[] = { + [STRING_UNITS_10] = units_10, [STRING_UNITS_2] = units_2, }; static const unsigned int divisor[] = { -- cgit v1.2.3 From 129965a916a8b659d7bb3b59877a3e8259bed5e3 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 6 Aug 2014 16:09:33 -0700 Subject: lib/test-kstrtox.c: use ARRAY_SIZE instead of sizeof/sizeof[0] Use kernel.h definition. Signed-off-by: Fabian Frederick Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test-kstrtox.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/test-kstrtox.c b/lib/test-kstrtox.c index bea3f3fa3f02..4137bca5f8e8 100644 --- a/lib/test-kstrtox.c +++ b/lib/test-kstrtox.c @@ -3,7 +3,7 @@ #include #define for_each_test(i, test) \ - for (i = 0; i < sizeof(test) / sizeof(test[0]); i++) + for (i = 0; i < ARRAY_SIZE(test); i++) struct test_fail { const char *str; -- cgit v1.2.3 From 087face5265026d4fe664bdb580f4904bd10cfbf Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 6 Aug 2014 16:09:36 -0700 Subject: kernel.h: remove deprecated pack_hex_byte It's been nearly 3 years now since commit 55036ba76b2d ("lib: rename pack_hex_byte() to hex_byte_pack()") so it's time to remove this deprecated and unused static inline. 
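For out-of-tree code that still calls pack_hex_byte(), the conversion is purely mechanical: hex_byte_pack() has the same signature and behaviour. A minimal user-space sketch of the surviving helper (the MAC-address formatting is an invented example; in the kernel the inline and the hex_asc table come from <linux/kernel.h>):

#include <stdio.h>

typedef unsigned char u8;

static const char hex_asc[] = "0123456789abcdef";

/* Write two lowercase hex digits for byte, return the advanced cursor. */
static char *hex_byte_pack(char *buf, u8 byte)
{
	*buf++ = hex_asc[(byte >> 4) & 0x0f];	/* high nibble */
	*buf++ = hex_asc[byte & 0x0f];		/* low nibble */
	return buf;
}

int main(void)
{
	u8 mac[6] = { 0x00, 0x1a, 0x2b, 0x3c, 0x4d, 0x5e };
	char out[sizeof(mac) * 3];	/* two digits plus separator per byte */
	char *p = out;
	int i;

	for (i = 0; i < 6; i++) {
		p = hex_byte_pack(p, mac[i]);
		*p++ = (i < 5) ? ':' : '\0';
	}
	printf("%s\n", out);		/* prints 00:1a:2b:3c:4d:5e */
	return 0;
}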
Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index a9e2268ecccb..3dc22abbc68a 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -493,11 +493,6 @@ static inline char *hex_byte_pack_upper(char *buf, u8 byte) return buf; } -static inline char * __deprecated pack_hex_byte(char *buf, u8 byte) -{ - return hex_byte_pack(buf, byte); -} - extern int hex_to_bin(char ch); extern int __must_check hex2bin(u8 *dst, const char *src, size_t count); -- cgit v1.2.3 From 27d555d101c820ac4b1962680bd0192993c6e4e0 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 6 Aug 2014 16:09:38 -0700 Subject: lib: list_sort_test(): return -ENOMEM when allocation fails Signed-off-by: Rasmus Villemoes Cc: Artem Bityutskiy Cc: Don Mullis Cc: Dave Chinner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/list_sort.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/list_sort.c b/lib/list_sort.c index 1183fa70a44d..291412ade89a 100644 --- a/lib/list_sort.c +++ b/lib/list_sort.c @@ -207,7 +207,7 @@ static int __init cmp(void *priv, struct list_head *a, struct list_head *b) static int __init list_sort_test(void) { - int i, count = 1, err = -EINVAL; + int i, count = 1, err = -ENOMEM; struct debug_el *el; struct list_head *cur, *tmp; LIST_HEAD(head); @@ -239,6 +239,7 @@ static int __init list_sort_test(void) list_sort(NULL, &head, cmp); + err = -EINVAL; for (cur = head.next; cur->next != &head; cur = cur->next) { struct debug_el *el1; int cmp_result; -- cgit v1.2.3 From 9d418dcc6d15539a9567b2ad7fe7473648989f44 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 6 Aug 2014 16:09:40 -0700 Subject: lib: list_sort_test(): add extra corruption check Add a check to make sure that the prev pointer of the list head points to the last element on the list. Signed-off-by: Rasmus Villemoes Cc: Artem Bityutskiy Cc: Don Mullis Cc: Dave Chinner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/list_sort.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lib/list_sort.c b/lib/list_sort.c index 291412ade89a..fbdbc867b252 100644 --- a/lib/list_sort.c +++ b/lib/list_sort.c @@ -272,6 +272,11 @@ static int __init list_sort_test(void) } count++; } + if (head.prev != cur) { + printk(KERN_ERR "list_sort_test: error: list is corrupted\n"); + goto exit; + } + if (count != TEST_LIST_LEN) { printk(KERN_ERR "list_sort_test: error: bad list length %d", -- cgit v1.2.3 From 694123031d12458a343492528fa40113e5ec843e Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 6 Aug 2014 16:09:42 -0700 Subject: lib: list_sort_test(): simplify and harden cleanup There is no reason to maintain the list structure while freeing the debug elements. Aside from the redundant pointer manipulations, it is also inefficient from a locality-of-reference viewpoint, since they are visited in a random order (wrt. the order they were allocated). Furthermore, if we jumped to exit: after detecting list corruption, it is actually dangerous. So just free the elements in the order they were allocated, using the backing array elts. Allocate that using kcalloc(), so that if allocation of one of the debug element fails, we just end up calling kfree(NULL) for the trailing elements. 
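In portable C the same allocation/teardown shape looks as follows (calloc stands in for kcalloc, and the element count and size are invented for the sketch; free(NULL), like kfree(NULL), is defined to be a no-op):

#include <stdlib.h>

#define N 100

static int build_and_teardown(void)
{
	void **elts = calloc(N, sizeof(*elts));	/* every slot starts NULL */
	int i, err = -1;

	if (!elts)
		return err;

	for (i = 0; i < N; i++) {
		elts[i] = malloc(64);
		if (!elts[i])
			goto exit;		/* slots i..N-1 stay NULL */
	}
	err = 0;
exit:
	for (i = 0; i < N; i++)
		free(elts[i]);			/* harmless for the NULL tail */
	free(elts);
	return err;
}

int main(void)
{
	return build_and_teardown();
}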
Minor details: Use sizeof(*elts) instead of sizeof(void *), and return err immediately when allocation of elts fails, to avoid introducing another label just before the final return statement. Signed-off-by: Rasmus Villemoes Cc: Artem Bityutskiy Cc: Don Mullis Cc: Dave Chinner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/list_sort.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/lib/list_sort.c b/lib/list_sort.c index fbdbc867b252..a34c78c30d56 100644 --- a/lib/list_sort.c +++ b/lib/list_sort.c @@ -209,16 +209,16 @@ static int __init list_sort_test(void) { int i, count = 1, err = -ENOMEM; struct debug_el *el; - struct list_head *cur, *tmp; + struct list_head *cur; LIST_HEAD(head); printk(KERN_DEBUG "list_sort_test: start testing list_sort()\n"); - elts = kmalloc(sizeof(void *) * TEST_LIST_LEN, GFP_KERNEL); + elts = kcalloc(TEST_LIST_LEN, sizeof(*elts), GFP_KERNEL); if (!elts) { printk(KERN_ERR "list_sort_test: error: cannot allocate " "memory\n"); - goto exit; + return err; } for (i = 0; i < TEST_LIST_LEN; i++) { @@ -286,11 +286,9 @@ static int __init list_sort_test(void) err = 0; exit: + for (i = 0; i < TEST_LIST_LEN; i++) + kfree(elts[i]); kfree(elts); - list_for_each_safe(cur, tmp, &head) { - list_del(cur); - kfree(container_of(cur, struct debug_el, list)); - } return err; } module_init(list_sort_test); -- cgit v1.2.3 From 61b3d6c48f059bb054b0019088736dab6c2ac0ec Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 6 Aug 2014 16:09:44 -0700 Subject: lib: list_sort.c: Limit number of unused cmp callbacks The helper merge_and_restore_back_links() makes sure to call the caller's cmp function during the final ->prev pointer fixup, so that the cmp function may call cond_resched(). However, if the cmp function does not call cond_resched() at all, this is entirely redundant. If it does, doing at least two function calls for every two pointer assignments is a bit excessive. This patch limits the calls to once for every 256 iterations. Signed-off-by: Rasmus Villemoes Cc: Artem Bityutskiy Cc: Don Mullis Cc: Dave Chinner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/list_sort.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/list_sort.c b/lib/list_sort.c index a34c78c30d56..6b9fdaf1d32e 100644 --- a/lib/list_sort.c +++ b/lib/list_sort.c @@ -47,6 +47,7 @@ static void merge_and_restore_back_links(void *priv, struct list_head *a, struct list_head *b) { struct list_head *tail = head; + u8 count = 0; while (a && b) { /* if equal, take 'a' -- important for sort stability */ @@ -70,7 +71,8 @@ static void merge_and_restore_back_links(void *priv, * element comparison is needed, so the client's cmp() * routine can invoke cond_resched() periodically. 
*/ - (*cmp)(priv, tail->next, tail->next); + if (unlikely(!(++count))) + (*cmp)(priv, tail->next, tail->next); tail->next->prev = tail; tail = tail->next; -- cgit v1.2.3 From d0da23b0debcef135c866cc8117d197fb40a6079 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 6 Aug 2014 16:09:46 -0700 Subject: lib/list_sort.c: convert to pr_foo Cc: Rasmus Villemoes Cc: Artem Bityutskiy Cc: Don Mullis Cc: Dave Chinner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/list_sort.c | 49 +++++++++++++++++++++---------------------------- 1 file changed, 21 insertions(+), 28 deletions(-) diff --git a/lib/list_sort.c b/lib/list_sort.c index 6b9fdaf1d32e..12bcba1c8612 100644 --- a/lib/list_sort.c +++ b/lib/list_sort.c @@ -1,3 +1,6 @@ + +#define pr_fmt(fmt) "list_sort_test: " fmt + #include #include #include @@ -125,9 +128,7 @@ void list_sort(void *priv, struct list_head *head, } if (lev > max_lev) { if (unlikely(lev >= ARRAY_SIZE(part)-1)) { - printk_once(KERN_DEBUG "list passed to" - " list_sort() too long for" - " efficiency\n"); + printk_once(KERN_DEBUG "list too long for efficiency\n"); lev--; } max_lev = lev; @@ -170,27 +171,25 @@ static struct debug_el **elts __initdata; static int __init check(struct debug_el *ela, struct debug_el *elb) { if (ela->serial >= TEST_LIST_LEN) { - printk(KERN_ERR "list_sort_test: error: incorrect serial %d\n", - ela->serial); + pr_err("error: incorrect serial %d\n", ela->serial); return -EINVAL; } if (elb->serial >= TEST_LIST_LEN) { - printk(KERN_ERR "list_sort_test: error: incorrect serial %d\n", - elb->serial); + pr_err("error: incorrect serial %d\n", elb->serial); return -EINVAL; } if (elts[ela->serial] != ela || elts[elb->serial] != elb) { - printk(KERN_ERR "list_sort_test: error: phantom element\n"); + pr_err("error: phantom element\n"); return -EINVAL; } if (ela->poison1 != TEST_POISON1 || ela->poison2 != TEST_POISON2) { - printk(KERN_ERR "list_sort_test: error: bad poison: %#x/%#x\n", - ela->poison1, ela->poison2); + pr_err("error: bad poison: %#x/%#x\n", + ela->poison1, ela->poison2); return -EINVAL; } if (elb->poison1 != TEST_POISON1 || elb->poison2 != TEST_POISON2) { - printk(KERN_ERR "list_sort_test: error: bad poison: %#x/%#x\n", - elb->poison1, elb->poison2); + pr_err("error: bad poison: %#x/%#x\n", + elb->poison1, elb->poison2); return -EINVAL; } return 0; @@ -214,20 +213,18 @@ static int __init list_sort_test(void) struct list_head *cur; LIST_HEAD(head); - printk(KERN_DEBUG "list_sort_test: start testing list_sort()\n"); + pr_debug("start testing list_sort()\n"); elts = kcalloc(TEST_LIST_LEN, sizeof(*elts), GFP_KERNEL); if (!elts) { - printk(KERN_ERR "list_sort_test: error: cannot allocate " - "memory\n"); + pr_err("error: cannot allocate memory\n"); return err; } for (i = 0; i < TEST_LIST_LEN; i++) { el = kmalloc(sizeof(*el), GFP_KERNEL); if (!el) { - printk(KERN_ERR "list_sort_test: error: cannot " - "allocate memory\n"); + pr_err("error: cannot allocate memory\n"); goto exit; } /* force some equivalencies */ @@ -247,42 +244,38 @@ static int __init list_sort_test(void) int cmp_result; if (cur->next->prev != cur) { - printk(KERN_ERR "list_sort_test: error: list is " - "corrupted\n"); + pr_err("error: list is corrupted\n"); goto exit; } cmp_result = cmp(NULL, cur, cur->next); if (cmp_result > 0) { - printk(KERN_ERR "list_sort_test: error: list is not " - "sorted\n"); + pr_err("error: list is not sorted\n"); goto exit; } el = container_of(cur, struct debug_el, list); el1 = container_of(cur->next, struct debug_el, list); if (cmp_result == 
0 && el->serial >= el1->serial) { - printk(KERN_ERR "list_sort_test: error: order of " - "equivalent elements not preserved\n"); + pr_err("error: order of equivalent elements not " + "preserved\n"); goto exit; } if (check(el, el1)) { - printk(KERN_ERR "list_sort_test: error: element check " - "failed\n"); + pr_err("error: element check failed\n"); goto exit; } count++; } if (head.prev != cur) { - printk(KERN_ERR "list_sort_test: error: list is corrupted\n"); + pr_err("error: list is corrupted\n"); goto exit; } if (count != TEST_LIST_LEN) { - printk(KERN_ERR "list_sort_test: error: bad list length %d", - count); + pr_err("error: bad list length %d", count); goto exit; } -- cgit v1.2.3 From 0679cc483669d08153d158273455398a389ee9ca Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 6 Aug 2014 16:09:49 -0700 Subject: lib: bitmap: make nbits parameter of bitmap_empty unsigned Many functions in lib/bitmap.c start with an expression such as lim = bits/BITS_PER_LONG. Since bits has type (signed) int, and since gcc cannot know that it is in fact non-negative, it generates worse code than it could. These patches, mostly consisting of changing various parameters to unsigned, gives a slight overall code reduction: add/remove: 1/1 grow/shrink: 8/16 up/down: 251/-414 (-163) function old new delta tick_device_uses_broadcast 335 425 +90 __irq_alloc_descs 498 554 +56 __bitmap_andnot 73 115 +42 __bitmap_and 70 101 +31 bitmap_weight - 11 +11 copy_hugetlb_page_range 752 762 +10 follow_hugetlb_page 846 854 +8 hugetlb_init 1415 1417 +2 hugetlb_nrpages_setup 130 131 +1 hugetlb_add_hstate 377 376 -1 bitmap_allocate_region 82 80 -2 select_task_rq_fair 2202 2191 -11 hweight_long 66 55 -11 __reg_op 230 219 -11 dm_stats_message 2849 2833 -16 bitmap_parselist 92 74 -18 __bitmap_weight 115 97 -18 __bitmap_subset 153 129 -24 __bitmap_full 128 104 -24 __bitmap_empty 120 96 -24 bitmap_set 179 149 -30 bitmap_clear 185 155 -30 __bitmap_equal 136 105 -31 __bitmap_intersects 148 108 -40 __bitmap_complement 109 67 -42 tick_device_setup_broadcast_func.isra 81 - -81 [The increases in __bitmap_and{,not} are due to bug fixes 17/18,18/18. No idea why bitmap_weight suddenly appears.] While 163 bytes treewide is insignificant, I believe the bitmap functions are often called with locks held, so saving even a few cycles might be worth it. While making these changes, I found a few other things that might be worth including. 16,17,18 are actual bug fixes. The rest shouldn't change the behaviour of any of the functions, provided no-one passed negative nbits values. If something should come up, it should be fairly bisectable. A few issues I thought about, but didn't know what to do with: * Many of the functions misbehave if nbits is compile-time 0; the out-of-line functions generally handle 0 correctly. bitmap_fill() is particularly bad, whether the 0 is known at compile time or not. It would probably be nice to add detection of at least compile-time 0 and handle that appropriately. * I didn't change __bitmap_shift_{left,right} to use unsigned because I want to fully understand why the algorithm works before making that change. However, AFAICT, they behave correctly for all (positive) shift amounts. This is not the case for the small_const_nbits versions. If for example nbits = n = BITS_PER_LONG, the shift operators turn into no-ops (at least on x86), so one get *dst = *src, whereas one would expect to get *dst=0. That difference in behaviour is somewhat annoying. 
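The shift anomaly is easy to reproduce outside the kernel. The sketch below deliberately exercises the undefined case; on x86, where the hardware masks the shift count, it typically prints the original value rather than 0 (assumption: 64-bit longs; other architectures may well print 0):

#include <stdio.h>
#include <limits.h>

#define BITS_PER_LONG ((int)(CHAR_BIT * sizeof(long)))

int main(void)
{
	unsigned long src = 0xdeadbeefUL;
	/* volatile keeps the compiler from folding the UB shift away */
	volatile unsigned int n = BITS_PER_LONG;
	unsigned long dst = src >> n;	/* undefined: often src on x86, not 0 */

	printf("src=%#lx dst=%#lx (a reader might expect 0)\n", src, dst);
	return 0;
}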
This patch (of 18): The compiler can generate slightly smaller and simpler code when it knows that "nbits" is non-negative. Since no-one passes a negative bit-count, this shouldn't affect the semantics. Signed-off-by: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitmap.h | 4 ++-- lib/bitmap.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 7ad634501e48..3d3fd6b2f157 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -88,7 +88,7 @@ * lib/bitmap.c provides these functions: */ -extern int __bitmap_empty(const unsigned long *bitmap, int bits); +extern int __bitmap_empty(const unsigned long *bitmap, unsigned int nbits); extern int __bitmap_full(const unsigned long *bitmap, int bits); extern int __bitmap_equal(const unsigned long *bitmap1, const unsigned long *bitmap2, int bits); @@ -257,7 +257,7 @@ static inline int bitmap_subset(const unsigned long *src1, return __bitmap_subset(src1, src2, nbits); } -static inline int bitmap_empty(const unsigned long *src, int nbits) +static inline int bitmap_empty(const unsigned long *src, unsigned nbits) { if (small_const_nbits(nbits)) return ! (*src & BITMAP_LAST_WORD_MASK(nbits)); diff --git a/lib/bitmap.c b/lib/bitmap.c index 06f7e4fe8d2d..378911001442 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -40,9 +40,9 @@ * for the best explanations of this ordering. */ -int __bitmap_empty(const unsigned long *bitmap, int bits) +int __bitmap_empty(const unsigned long *bitmap, unsigned int bits) { - int k, lim = bits/BITS_PER_LONG; + unsigned int k, lim = bits/BITS_PER_LONG; for (k = 0; k < lim; ++k) if (bitmap[k]) return 0; -- cgit v1.2.3 From 8397927c8045c58afc68ef839855eb5505259df3 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 6 Aug 2014 16:09:51 -0700 Subject: lib: bitmap: make nbits parameter of bitmap_full unsigned The compiler can generate slightly smaller and simpler code when it knows that "nbits" is non-negative. Since no-one passes a negative bit-count, this shouldn't affect the semantics. Signed-off-by: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitmap.h | 4 ++-- lib/bitmap.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 3d3fd6b2f157..bc7e520d3f78 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -89,7 +89,7 @@ */ extern int __bitmap_empty(const unsigned long *bitmap, unsigned int nbits); -extern int __bitmap_full(const unsigned long *bitmap, int bits); +extern int __bitmap_full(const unsigned long *bitmap, unsigned int nbits); extern int __bitmap_equal(const unsigned long *bitmap1, const unsigned long *bitmap2, int bits); extern void __bitmap_complement(unsigned long *dst, const unsigned long *src, @@ -265,7 +265,7 @@ static inline int bitmap_empty(const unsigned long *src, unsigned nbits) return __bitmap_empty(src, nbits); } -static inline int bitmap_full(const unsigned long *src, int nbits) +static inline int bitmap_full(const unsigned long *src, unsigned int nbits) { if (small_const_nbits(nbits)) return ! 
(~(*src) & BITMAP_LAST_WORD_MASK(nbits)); diff --git a/lib/bitmap.c b/lib/bitmap.c index 378911001442..9859f38660f9 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -55,9 +55,9 @@ int __bitmap_empty(const unsigned long *bitmap, unsigned int bits) } EXPORT_SYMBOL(__bitmap_empty); -int __bitmap_full(const unsigned long *bitmap, int bits) +int __bitmap_full(const unsigned long *bitmap, unsigned int bits) { - int k, lim = bits/BITS_PER_LONG; + unsigned int k, lim = bits/BITS_PER_LONG; for (k = 0; k < lim; ++k) if (~bitmap[k]) return 0; -- cgit v1.2.3 From 5e068069319a9fb02fb14337c2cedeae5f16d812 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 6 Aug 2014 16:09:53 -0700 Subject: lib: bitmap: make nbits parameter of bitmap_equal unsigned The compiler can generate slightly smaller and simpler code when it knows that "nbits" is non-negative. Since no-one passes a negative bit-count, this shouldn't affect the semantics. Signed-off-by: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitmap.h | 2 +- lib/bitmap.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index bc7e520d3f78..1e0f46c91125 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -91,7 +91,7 @@ extern int __bitmap_empty(const unsigned long *bitmap, unsigned int nbits); extern int __bitmap_full(const unsigned long *bitmap, unsigned int nbits); extern int __bitmap_equal(const unsigned long *bitmap1, - const unsigned long *bitmap2, int bits); + const unsigned long *bitmap2, unsigned int nbits); extern void __bitmap_complement(unsigned long *dst, const unsigned long *src, int bits); extern void __bitmap_shift_right(unsigned long *dst, diff --git a/lib/bitmap.c b/lib/bitmap.c index 9859f38660f9..d6bb955e71cb 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -71,9 +71,9 @@ int __bitmap_full(const unsigned long *bitmap, unsigned int bits) EXPORT_SYMBOL(__bitmap_full); int __bitmap_equal(const unsigned long *bitmap1, - const unsigned long *bitmap2, int bits) + const unsigned long *bitmap2, unsigned int bits) { - int k, lim = bits/BITS_PER_LONG; + unsigned int k, lim = bits/BITS_PER_LONG; for (k = 0; k < lim; ++k) if (bitmap1[k] != bitmap2[k]) return 0; -- cgit v1.2.3 From 3d6684f4e6a46f3a8263f5681e093bccbb767a1c Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 6 Aug 2014 16:09:55 -0700 Subject: lib: bitmap: make nbits parameter of bitmap_complement unsigned The compiler can generate slightly smaller and simpler code when it knows that "nbits" is non-negative. Since no-one passes a negative bit-count, this shouldn't affect the semantics. 
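The codegen difference is visible in a two-line experiment: compiling something like the sketch below at -O2 and comparing the assembly (e.g. with gcc -S) shows the unsigned version reduced to a single shift. The function names and the hard-coded 64 are only for illustration:

/* Signed division must round toward zero, so the compiler has to emit a
 * fix-up for possibly-negative values before it can use a shift. */
static int words_signed(int bits)
{
	return bits / 64;
}

/* Unsigned division by a power of two is a plain logical shift. */
static unsigned int words_unsigned(unsigned int bits)
{
	return bits / 64;
}

int main(void)
{
	return words_signed(640) - (int)words_unsigned(640);	/* 10 - 10 = 0 */
}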
Signed-off-by: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitmap.h | 6 +++--- lib/bitmap.c | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 1e0f46c91125..21fb52ffe444 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -93,7 +93,7 @@ extern int __bitmap_full(const unsigned long *bitmap, unsigned int nbits); extern int __bitmap_equal(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); extern void __bitmap_complement(unsigned long *dst, const unsigned long *src, - int bits); + unsigned int nbits); extern void __bitmap_shift_right(unsigned long *dst, const unsigned long *src, int shift, int bits); extern void __bitmap_shift_left(unsigned long *dst, @@ -222,7 +222,7 @@ static inline int bitmap_andnot(unsigned long *dst, const unsigned long *src1, } static inline void bitmap_complement(unsigned long *dst, const unsigned long *src, - int nbits) + unsigned int nbits) { if (small_const_nbits(nbits)) *dst = ~(*src) & BITMAP_LAST_WORD_MASK(nbits); @@ -231,7 +231,7 @@ static inline void bitmap_complement(unsigned long *dst, const unsigned long *sr } static inline int bitmap_equal(const unsigned long *src1, - const unsigned long *src2, int nbits) + const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) return ! ((*src1 ^ *src2) & BITMAP_LAST_WORD_MASK(nbits)); diff --git a/lib/bitmap.c b/lib/bitmap.c index d6bb955e71cb..0f2f845702eb 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -86,9 +86,9 @@ int __bitmap_equal(const unsigned long *bitmap1, } EXPORT_SYMBOL(__bitmap_equal); -void __bitmap_complement(unsigned long *dst, const unsigned long *src, int bits) +void __bitmap_complement(unsigned long *dst, const unsigned long *src, unsigned int bits) { - int k, lim = bits/BITS_PER_LONG; + unsigned int k, lim = bits/BITS_PER_LONG; for (k = 0; k < lim; ++k) dst[k] = ~src[k]; -- cgit v1.2.3 From 65b4ee62c9cd10640f0054f47fd84c7920e8c118 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 6 Aug 2014 16:09:57 -0700 Subject: lib: bitmap: remove unnecessary mask from bitmap_complement Since the extra bits are "don't care", there is no reason to mask the last word to the used bits when complementing. This shaves off yet a few bytes. 
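A small user-space sketch of the "don't care" convention behind this change: the unmasked complement leaves garbage above bit nbits-1, but consumers mask the last word before using it, so the garbage is never observed (the mask macro mirrors the kernel's BITMAP_LAST_WORD_MASK; the values are invented):

#include <stdio.h>

#define BITS_PER_LONG 64
#define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1)))

int main(void)
{
	unsigned int nbits = 10;
	unsigned long src = 0x2b5;	/* only bits 0..9 are meaningful */
	unsigned long dst = ~src;	/* unmasked complement, as after this patch */
	unsigned long used = dst & BITMAP_LAST_WORD_MASK(nbits);

	printf("dst=%#lx meaningful=%#lx\n", dst, used);	/* meaningful=0x14a */
	return 0;
}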
Signed-off-by: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitmap.h | 2 +- lib/bitmap.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 21fb52ffe444..f42d72d5fe82 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -225,7 +225,7 @@ static inline void bitmap_complement(unsigned long *dst, const unsigned long *sr unsigned int nbits) { if (small_const_nbits(nbits)) - *dst = ~(*src) & BITMAP_LAST_WORD_MASK(nbits); + *dst = ~(*src); else __bitmap_complement(dst, src, nbits); } diff --git a/lib/bitmap.c b/lib/bitmap.c index 0f2f845702eb..4387e3c092fd 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -93,7 +93,7 @@ void __bitmap_complement(unsigned long *dst, const unsigned long *src, unsigned dst[k] = ~src[k]; if (bits % BITS_PER_LONG) - dst[k] = ~src[k] & BITMAP_LAST_WORD_MASK(bits); + dst[k] = ~src[k]; } EXPORT_SYMBOL(__bitmap_complement); -- cgit v1.2.3 From 2f9305eb31097fdd3dc86daca65d8097d1fcf2ff Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 6 Aug 2014 16:09:59 -0700 Subject: lib: bitmap: make nbits parameter of bitmap_{and,or,xor,andnot} unsigned This change is only for consistency with the changes to the other bitmap_* functions; it doesn't change the size of the generated code: inside BITS_TO_LONGS there is a sizeof(long), which causes bits to be interpreted as unsigned anyway. Signed-off-by: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitmap.h | 16 ++++++++-------- lib/bitmap.c | 24 ++++++++++++------------ 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index f42d72d5fe82..7048782fe5b9 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -99,13 +99,13 @@ extern void __bitmap_shift_right(unsigned long *dst, extern void __bitmap_shift_left(unsigned long *dst, const unsigned long *src, int shift, int bits); extern int __bitmap_and(unsigned long *dst, const unsigned long *bitmap1, - const unsigned long *bitmap2, int bits); + const unsigned long *bitmap2, unsigned int nbits); extern void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1, - const unsigned long *bitmap2, int bits); + const unsigned long *bitmap2, unsigned int nbits); extern void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1, - const unsigned long *bitmap2, int bits); + const unsigned long *bitmap2, unsigned int nbits); extern int __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1, - const unsigned long *bitmap2, int bits); + const unsigned long *bitmap2, unsigned int nbits); extern int __bitmap_intersects(const unsigned long *bitmap1, const unsigned long *bitmap2, int bits); extern int __bitmap_subset(const unsigned long *bitmap1, @@ -188,7 +188,7 @@ static inline void bitmap_copy(unsigned long *dst, const unsigned long *src, } static inline int bitmap_and(unsigned long *dst, const unsigned long *src1, - const unsigned long *src2, int nbits) + const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) return (*dst = *src1 & *src2) != 0; @@ -196,7 +196,7 @@ static inline int bitmap_and(unsigned long *dst, const unsigned long *src1, } static inline void bitmap_or(unsigned long *dst, const unsigned long *src1, - const unsigned long *src2, int nbits) + const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) *dst = *src1 | *src2; @@ -205,7 +205,7 @@ static inline void 
bitmap_or(unsigned long *dst, const unsigned long *src1, } static inline void bitmap_xor(unsigned long *dst, const unsigned long *src1, - const unsigned long *src2, int nbits) + const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) *dst = *src1 ^ *src2; @@ -214,7 +214,7 @@ static inline void bitmap_xor(unsigned long *dst, const unsigned long *src1, } static inline int bitmap_andnot(unsigned long *dst, const unsigned long *src1, - const unsigned long *src2, int nbits) + const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) return (*dst = *src1 & ~(*src2)) != 0; diff --git a/lib/bitmap.c b/lib/bitmap.c index 4387e3c092fd..03207373cc5a 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -182,10 +182,10 @@ void __bitmap_shift_left(unsigned long *dst, EXPORT_SYMBOL(__bitmap_shift_left); int __bitmap_and(unsigned long *dst, const unsigned long *bitmap1, - const unsigned long *bitmap2, int bits) + const unsigned long *bitmap2, unsigned int bits) { - int k; - int nr = BITS_TO_LONGS(bits); + unsigned int k; + unsigned int nr = BITS_TO_LONGS(bits); unsigned long result = 0; for (k = 0; k < nr; k++) @@ -195,10 +195,10 @@ int __bitmap_and(unsigned long *dst, const unsigned long *bitmap1, EXPORT_SYMBOL(__bitmap_and); void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1, - const unsigned long *bitmap2, int bits) + const unsigned long *bitmap2, unsigned int bits) { - int k; - int nr = BITS_TO_LONGS(bits); + unsigned int k; + unsigned int nr = BITS_TO_LONGS(bits); for (k = 0; k < nr; k++) dst[k] = bitmap1[k] | bitmap2[k]; @@ -206,10 +206,10 @@ void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1, EXPORT_SYMBOL(__bitmap_or); void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1, - const unsigned long *bitmap2, int bits) + const unsigned long *bitmap2, unsigned int bits) { - int k; - int nr = BITS_TO_LONGS(bits); + unsigned int k; + unsigned int nr = BITS_TO_LONGS(bits); for (k = 0; k < nr; k++) dst[k] = bitmap1[k] ^ bitmap2[k]; @@ -217,10 +217,10 @@ void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1, EXPORT_SYMBOL(__bitmap_xor); int __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1, - const unsigned long *bitmap2, int bits) + const unsigned long *bitmap2, unsigned int bits) { - int k; - int nr = BITS_TO_LONGS(bits); + unsigned int k; + unsigned int nr = BITS_TO_LONGS(bits); unsigned long result = 0; for (k = 0; k < nr; k++) -- cgit v1.2.3 From 6dfe9799c2a03d225316a3e959b0447f3f50303e Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 6 Aug 2014 16:10:01 -0700 Subject: lib: bitmap: make nbits parameter of bitmap_intersects unsigned The compiler can generate slightly smaller and simpler code when it knows that "nbits" is non-negative. Since no-one passes a negative bit-count, this shouldn't affect the semantics. 
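A user-space illustration of the codegen point (an assumed example, not taken from the patch): dividing a signed int by a power of two must round toward zero, so gcc emits an extra fixup for possibly-negative values, whereas the unsigned division is a single logical shift; compare the two with gcc -O2 -S:

	int words_signed(int bits)
	{
		return bits / 64;		/* shift plus a sign-based fixup */
	}

	unsigned int words_unsigned(unsigned int bits)
	{
		return bits / 64;		/* plain logical shift right by 6 */
	}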
Signed-off-by: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitmap.h | 4 ++-- lib/bitmap.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 7048782fe5b9..2f3f3a4d5996 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -107,7 +107,7 @@ extern void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1, extern int __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); extern int __bitmap_intersects(const unsigned long *bitmap1, - const unsigned long *bitmap2, int bits); + const unsigned long *bitmap2, unsigned int nbits); extern int __bitmap_subset(const unsigned long *bitmap1, const unsigned long *bitmap2, int bits); extern int __bitmap_weight(const unsigned long *bitmap, int bits); @@ -240,7 +240,7 @@ static inline int bitmap_equal(const unsigned long *src1, } static inline int bitmap_intersects(const unsigned long *src1, - const unsigned long *src2, int nbits) + const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) return ((*src1 & *src2) & BITMAP_LAST_WORD_MASK(nbits)) != 0; diff --git a/lib/bitmap.c b/lib/bitmap.c index 03207373cc5a..e85daa90b237 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -230,9 +230,9 @@ int __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1, EXPORT_SYMBOL(__bitmap_andnot); int __bitmap_intersects(const unsigned long *bitmap1, - const unsigned long *bitmap2, int bits) + const unsigned long *bitmap2, unsigned int bits) { - int k, lim = bits/BITS_PER_LONG; + unsigned int k, lim = bits/BITS_PER_LONG; for (k = 0; k < lim; ++k) if (bitmap1[k] & bitmap2[k]) return 1; -- cgit v1.2.3 From 5be20213e855550de2b32fde6fc116f74bab86a6 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 6 Aug 2014 16:10:03 -0700 Subject: lib: bitmap: make nbits parameter of bitmap_subset unsigned The compiler can generate slightly smaller and simpler code when it knows that "nbits" is non-negative. Since no-one passes a negative bit-count, this shouldn't affect the semantics. Signed-off-by: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitmap.h | 4 ++-- lib/bitmap.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 2f3f3a4d5996..87e88f79def1 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -109,7 +109,7 @@ extern int __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1, extern int __bitmap_intersects(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); extern int __bitmap_subset(const unsigned long *bitmap1, - const unsigned long *bitmap2, int bits); + const unsigned long *bitmap2, unsigned int nbits); extern int __bitmap_weight(const unsigned long *bitmap, int bits); extern void bitmap_set(unsigned long *map, int i, int len); @@ -249,7 +249,7 @@ static inline int bitmap_intersects(const unsigned long *src1, } static inline int bitmap_subset(const unsigned long *src1, - const unsigned long *src2, int nbits) + const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) return ! 
((*src1 & ~(*src2)) & BITMAP_LAST_WORD_MASK(nbits)); diff --git a/lib/bitmap.c b/lib/bitmap.c index e85daa90b237..c9bff5379795 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -245,9 +245,9 @@ int __bitmap_intersects(const unsigned long *bitmap1, EXPORT_SYMBOL(__bitmap_intersects); int __bitmap_subset(const unsigned long *bitmap1, - const unsigned long *bitmap2, int bits) + const unsigned long *bitmap2, unsigned int bits) { - int k, lim = bits/BITS_PER_LONG; + unsigned int k, lim = bits/BITS_PER_LONG; for (k = 0; k < lim; ++k) if (bitmap1[k] & ~bitmap2[k]) return 0; -- cgit v1.2.3 From 877d9f3b63ac2e5dbc51cbcdff156433f03b3a32 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 6 Aug 2014 16:10:05 -0700 Subject: lib: bitmap: make nbits parameter of bitmap_weight unsigned The compiler can generate slightly smaller and simpler code when it knows that "nbits" is non-negative. Since no-one passes a negative bit-count, this shouldn't affect the semantics. I didn't change the return type, since that might change the semantics of some expression containing a call to bitmap_weight(). Certainly an int is capable of holding the result. Signed-off-by: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitmap.h | 4 ++-- lib/bitmap.c | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 87e88f79def1..64b0ebe9f9a8 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -110,7 +110,7 @@ extern int __bitmap_intersects(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); extern int __bitmap_subset(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); -extern int __bitmap_weight(const unsigned long *bitmap, int bits); +extern int __bitmap_weight(const unsigned long *bitmap, unsigned int nbits); extern void bitmap_set(unsigned long *map, int i, int len); extern void bitmap_clear(unsigned long *map, int start, int nr); @@ -273,7 +273,7 @@ static inline int bitmap_full(const unsigned long *src, unsigned int nbits) return __bitmap_full(src, nbits); } -static inline int bitmap_weight(const unsigned long *src, int nbits) +static inline int bitmap_weight(const unsigned long *src, unsigned int nbits) { if (small_const_nbits(nbits)) return hweight_long(*src & BITMAP_LAST_WORD_MASK(nbits)); diff --git a/lib/bitmap.c b/lib/bitmap.c index c9bff5379795..f69435c23f9c 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -259,9 +259,10 @@ int __bitmap_subset(const unsigned long *bitmap1, } EXPORT_SYMBOL(__bitmap_subset); -int __bitmap_weight(const unsigned long *bitmap, int bits) +int __bitmap_weight(const unsigned long *bitmap, unsigned int bits) { - int k, w = 0, lim = bits/BITS_PER_LONG; + unsigned int k, lim = bits/BITS_PER_LONG; + int w = 0; for (k = 0; k < lim; k++) w += hweight_long(bitmap[k]); -- cgit v1.2.3 From fb5ac54263ef3fcb5c469a61e0ab6b06e45e2307 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 6 Aug 2014 16:10:07 -0700 Subject: lib: bitmap: make the start index of bitmap_set unsigned The compiler can generate slightly smaller and simpler code when it knows that "start" is non-negative. Also, use the names "start" and "len" for the two parameters in both header file and implementation, instead of the previous mix. 
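For reference, a naive user-space model of the start/len semantics (hypothetical helper, assuming 64-bit longs; the kernel implementation below does the same work word-at-a-time with masks):

	#include <stdio.h>

	static void set_range(unsigned long *map, unsigned int start, int len)
	{
		while (len-- > 0) {
			map[start / 64] |= 1UL << (start % 64);
			start++;
		}
	}

	int main(void)
	{
		unsigned long map[2] = { 0, 0 };

		set_range(map, 60, 8);	/* spans the word boundary at bit 64 */
		printf("%lx %lx\n", map[0], map[1]);	/* f000000000000000 f */
		return 0;
	}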
Signed-off-by: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitmap.h | 2 +- lib/bitmap.c | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 64b0ebe9f9a8..ad2c67d3583e 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -112,7 +112,7 @@ extern int __bitmap_subset(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); extern int __bitmap_weight(const unsigned long *bitmap, unsigned int nbits); -extern void bitmap_set(unsigned long *map, int i, int len); +extern void bitmap_set(unsigned long *map, unsigned int start, int len); extern void bitmap_clear(unsigned long *map, int start, int nr); extern unsigned long bitmap_find_next_zero_area(unsigned long *map, unsigned long size, diff --git a/lib/bitmap.c b/lib/bitmap.c index f69435c23f9c..2a3a92fc3355 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -274,21 +274,21 @@ int __bitmap_weight(const unsigned long *bitmap, unsigned int bits) } EXPORT_SYMBOL(__bitmap_weight); -void bitmap_set(unsigned long *map, int start, int nr) +void bitmap_set(unsigned long *map, unsigned int start, int len) { unsigned long *p = map + BIT_WORD(start); - const int size = start + nr; + const unsigned int size = start + len; int bits_to_set = BITS_PER_LONG - (start % BITS_PER_LONG); unsigned long mask_to_set = BITMAP_FIRST_WORD_MASK(start); - while (nr - bits_to_set >= 0) { + while (len - bits_to_set >= 0) { *p |= mask_to_set; - nr -= bits_to_set; + len -= bits_to_set; bits_to_set = BITS_PER_LONG; mask_to_set = ~0UL; p++; } - if (nr) { + if (len) { mask_to_set &= BITMAP_LAST_WORD_MASK(size); *p |= mask_to_set; } -- cgit v1.2.3 From 154f5e38f30f262025c8c2e825376f6eb51e8bcb Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 6 Aug 2014 16:10:10 -0700 Subject: lib: bitmap: make the start index of bitmap_clear unsigned The compiler can generate slightly smaller and simpler code when it knows that "start" is non-negative. Also, use the names "start" and "len" for the two parameters for consistency with bitmap_set. 
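One detail worth noting (the reasoning is inferred from the code, not stated in the patch): "len" stays a signed int in both bitmap_set and bitmap_clear because the loop condition "len - bits_to_clear >= 0" relies on the subtraction going negative; with an unsigned len it would wrap around instead, as this user-space sketch shows:

	#include <stdio.h>

	int main(void)
	{
		int s_len = 3;
		unsigned int u_len = 3;

		printf("%d\n", s_len - 64 >= 0);	/* 0: the loop stops */
		printf("%d\n", u_len - 64 >= 0u);	/* 1: wraps, the loop would spin */
		return 0;
	}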
Signed-off-by: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitmap.h | 2 +- lib/bitmap.c | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index ad2c67d3583e..83c1c7d25073 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -113,7 +113,7 @@ extern int __bitmap_subset(const unsigned long *bitmap1, extern int __bitmap_weight(const unsigned long *bitmap, unsigned int nbits); extern void bitmap_set(unsigned long *map, unsigned int start, int len); -extern void bitmap_clear(unsigned long *map, int start, int nr); +extern void bitmap_clear(unsigned long *map, unsigned int start, int len); extern unsigned long bitmap_find_next_zero_area(unsigned long *map, unsigned long size, unsigned long start, diff --git a/lib/bitmap.c b/lib/bitmap.c index 2a3a92fc3355..5d2540396300 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -295,21 +295,21 @@ void bitmap_set(unsigned long *map, unsigned int start, int len) } EXPORT_SYMBOL(bitmap_set); -void bitmap_clear(unsigned long *map, int start, int nr) +void bitmap_clear(unsigned long *map, unsigned int start, int len) { unsigned long *p = map + BIT_WORD(start); - const int size = start + nr; + const unsigned int size = start + len; int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG); unsigned long mask_to_clear = BITMAP_FIRST_WORD_MASK(start); - while (nr - bits_to_clear >= 0) { + while (len - bits_to_clear >= 0) { *p &= ~mask_to_clear; - nr -= bits_to_clear; + len -= bits_to_clear; bits_to_clear = BITS_PER_LONG; mask_to_clear = ~0UL; p++; } - if (nr) { + if (len) { mask_to_clear &= BITMAP_LAST_WORD_MASK(size); *p &= ~mask_to_clear; } -- cgit v1.2.3 From bc5be1828065681300f3727ab55fac0d7ef37af3 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 6 Aug 2014 16:10:12 -0700 Subject: lib: bitmap: simplify bitmap_parselist We want len to be the index of the first '\n', or the length of the string if there is no newline. This is a good example of the usefulness of strchrnul(). Use that instead, thus eliminating a branch and a call to strlen(). Signed-off-by: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/bitmap.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/lib/bitmap.c b/lib/bitmap.c index 5d2540396300..d4b3a6d4b2d7 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -665,13 +665,8 @@ static int __bitmap_parselist(const char *buf, unsigned int buflen, int bitmap_parselist(const char *bp, unsigned long *maskp, int nmaskbits) { - char *nl = strchr(bp, '\n'); - int len; - - if (nl) - len = nl - bp; - else - len = strlen(bp); + char *nl = strchrnul(bp, '\n'); + int len = nl - bp; return __bitmap_parselist(bp, len, 0, maskp, nmaskbits); } -- cgit v1.2.3 From a855174878f06d197ddfde5cbe1e69a1d4a2df27 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 6 Aug 2014 16:10:14 -0700 Subject: lib: bitmap: fix typo in kerneldoc for bitmap_pos_to_ord A few lines above, it was stated that positions for non-set bits are mapped to -1, which is obviously also what the code does. 
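A compact user-space sketch of the mapping being documented (the helper name is made up, and a single-word bitmap is used for brevity): the ordinal of a set bit is the population count of the set bits below it, and -1 when the bit at pos is clear.

	#include <stdio.h>

	static int pos_to_ord(unsigned long word, unsigned int pos)
	{
		if (!(word & (1UL << pos)))
			return -1;
		return __builtin_popcountl(word & ((1UL << pos) - 1));
	}

	int main(void)
	{
		unsigned long buf = 0xf0;	/* bits 4..7 set, as in the kerneldoc */

		printf("%d %d\n", pos_to_ord(buf, 7), pos_to_ord(buf, 2));	/* 3 -1 */
		return 0;
	}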
Signed-off-by: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/bitmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/bitmap.c b/lib/bitmap.c index d4b3a6d4b2d7..2714df9f5cdb 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -712,7 +712,7 @@ EXPORT_SYMBOL(bitmap_parselist_user); * * If for example, just bits 4 through 7 are set in @buf, then @pos * values 4 through 7 will get mapped to 0 through 3, respectively, - * and other @pos values will get mapped to 0. When @pos value 7 + * and other @pos values will get mapped to -1. When @pos value 7 * gets mapped to (returns) @ord value 3 in this example, that means * that bit 7 is the 3rd (starting with 0th) set bit in @buf. * -- cgit v1.2.3 From 9279d3286e10736766edcaf815ae10e00856e448 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 6 Aug 2014 16:10:16 -0700 Subject: lib: bitmap: change parameter of bitmap_*_region to unsigned Changing the pos parameter of __reg_op to unsigned allows the compiler to generate slightly smaller and simpler code. Also update its callers bitmap_*_region to receive and pass unsigned int. The return types of bitmap_find_free_region and bitmap_allocate_region are still int to allow a negative error code to be returned. An int is certainly capable of representing any realistic return value. Signed-off-by: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitmap.h | 6 +++--- lib/bitmap.c | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 83c1c7d25073..210037833356 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -140,9 +140,9 @@ extern void bitmap_onto(unsigned long *dst, const unsigned long *orig, const unsigned long *relmap, int bits); extern void bitmap_fold(unsigned long *dst, const unsigned long *orig, int sz, int bits); -extern int bitmap_find_free_region(unsigned long *bitmap, int bits, int order); -extern void bitmap_release_region(unsigned long *bitmap, int pos, int order); -extern int bitmap_allocate_region(unsigned long *bitmap, int pos, int order); +extern int bitmap_find_free_region(unsigned long *bitmap, unsigned int bits, int order); +extern void bitmap_release_region(unsigned long *bitmap, unsigned int pos, int order); +extern int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order); extern void bitmap_copy_le(void *dst, const unsigned long *src, int nbits); extern int bitmap_ord_to_pos(const unsigned long *bitmap, int n, int bits); diff --git a/lib/bitmap.c b/lib/bitmap.c index 2714df9f5cdb..c2f3807b3601 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -1042,7 +1042,7 @@ enum { REG_OP_RELEASE, /* clear all bits in region */ }; -static int __reg_op(unsigned long *bitmap, int pos, int order, int reg_op) +static int __reg_op(unsigned long *bitmap, unsigned int pos, int order, int reg_op) { int nbits_reg; /* number of bits in region */ int index; /* index first long of region in bitmap */ @@ -1108,11 +1108,11 @@ done: * Return the bit offset in bitmap of the allocated region, * or -errno on failure. 
*/ -int bitmap_find_free_region(unsigned long *bitmap, int bits, int order) +int bitmap_find_free_region(unsigned long *bitmap, unsigned int bits, int order) { - int pos, end; /* scans bitmap by regions of size order */ + unsigned int pos, end; /* scans bitmap by regions of size order */ - for (pos = 0 ; (end = pos + (1 << order)) <= bits; pos = end) { + for (pos = 0 ; (end = pos + (1U << order)) <= bits; pos = end) { if (!__reg_op(bitmap, pos, order, REG_OP_ISFREE)) continue; __reg_op(bitmap, pos, order, REG_OP_ALLOC); @@ -1133,7 +1133,7 @@ EXPORT_SYMBOL(bitmap_find_free_region); * * No return value. */ -void bitmap_release_region(unsigned long *bitmap, int pos, int order) +void bitmap_release_region(unsigned long *bitmap, unsigned int pos, int order) { __reg_op(bitmap, pos, order, REG_OP_RELEASE); } @@ -1150,7 +1150,7 @@ EXPORT_SYMBOL(bitmap_release_region); * Return 0 on success, or %-EBUSY if specified region wasn't * free (not all bits were zero). */ -int bitmap_allocate_region(unsigned long *bitmap, int pos, int order) +int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order) { if (!__reg_op(bitmap, pos, order, REG_OP_ISFREE)) return -EBUSY; -- cgit v1.2.3 From 2ac521d3325a24f01c1f1e86c74537b831d282c5 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 6 Aug 2014 16:10:18 -0700 Subject: lib: bitmap: micro-optimize bitmap_allocate_region __reg_op(..., REG_OP_ALLOC) always returns 0, so we might as well use that and save an instruction. Signed-off-by: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/bitmap.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lib/bitmap.c b/lib/bitmap.c index c2f3807b3601..faaf7206d4cf 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -1154,8 +1154,7 @@ int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order) { if (!__reg_op(bitmap, pos, order, REG_OP_ISFREE)) return -EBUSY; - __reg_op(bitmap, pos, order, REG_OP_ALLOC); - return 0; + return __reg_op(bitmap, pos, order, REG_OP_ALLOC); } EXPORT_SYMBOL(bitmap_allocate_region); -- cgit v1.2.3 From c5341ec8904ebff50f365a2626da6ab525d63b9e Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 6 Aug 2014 16:10:20 -0700 Subject: lib: bitmap: add missing mask in bitmap_shift_right There is no guarantee that *src does not contain garbage bits outside the lower nbits, so we need to mask it before the shift-and-assign. Signed-off-by: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitmap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 210037833356..75df61d9ecfb 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -284,7 +284,7 @@ static inline void bitmap_shift_right(unsigned long *dst, const unsigned long *src, int n, int nbits) { if (small_const_nbits(nbits)) - *dst = *src >> n; + *dst = (*src & BITMAP_LAST_WORD_MASK(nbits)) >> n; else __bitmap_shift_right(dst, src, n, nbits); } -- cgit v1.2.3 From 7e5f97d1927f41affa21aa5b321865ceab1994ce Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 6 Aug 2014 16:10:22 -0700 Subject: lib: bitmap: add missing mask in bitmap_and Apparently, bitmap_and is supposed to return whether the new bitmap is empty. But it didn't take potential garbage bits in the last word into account. 
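A minimal user-space demonstration of the failure mode (illustrative, assuming 64-bit longs; 0xf stands in for BITMAP_LAST_WORD_MASK(4)): with nbits = 4 the AND of the two bitmaps is empty, but garbage above bit 3 makes the unmasked test claim otherwise.

	#include <stdio.h>

	int main(void)
	{
		unsigned long a = 0x10;	/* bits 0..3 clear, garbage at bit 4 */
		unsigned long b = 0x10;
		unsigned long dst = a & b;

		printf("unmasked %d, masked %d\n",
		       dst != 0, (dst & 0xfUL) != 0);	/* 1 vs the correct 0 */
		return 0;
	}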
Signed-off-by: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitmap.h | 2 +- lib/bitmap.c | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 75df61d9ecfb..3399a9ecd991 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -191,7 +191,7 @@ static inline int bitmap_and(unsigned long *dst, const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) - return (*dst = *src1 & *src2) != 0; + return (*dst = *src1 & *src2 & BITMAP_LAST_WORD_MASK(nbits)) != 0; return __bitmap_and(dst, src1, src2, nbits); } diff --git a/lib/bitmap.c b/lib/bitmap.c index faaf7206d4cf..ce2ec80bf431 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -185,11 +185,14 @@ int __bitmap_and(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int bits) { unsigned int k; - unsigned int nr = BITS_TO_LONGS(bits); + unsigned int lim = bits/BITS_PER_LONG; unsigned long result = 0; - for (k = 0; k < nr; k++) + for (k = 0; k < lim; k++) result |= (dst[k] = bitmap1[k] & bitmap2[k]); + if (bits % BITS_PER_LONG) + result |= (dst[k] = bitmap1[k] & bitmap2[k] & + BITMAP_LAST_WORD_MASK(bits)); return result != 0; } EXPORT_SYMBOL(__bitmap_and); -- cgit v1.2.3 From 74e765319084bd2940a9612ada961f0f7385936c Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 6 Aug 2014 16:10:24 -0700 Subject: lib: bitmap: add missing mask in bitmap_andnot Apparently, bitmap_andnot is supposed to return whether the new bitmap is empty. But it didn't take potential garbage bits in the last word into account. Signed-off-by: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitmap.h | 2 +- lib/bitmap.c | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 3399a9ecd991..e1c8d080c427 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -217,7 +217,7 @@ static inline int bitmap_andnot(unsigned long *dst, const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) - return (*dst = *src1 & ~(*src2)) != 0; + return (*dst = *src1 & ~(*src2) & BITMAP_LAST_WORD_MASK(nbits)) != 0; return __bitmap_andnot(dst, src1, src2, nbits); } diff --git a/lib/bitmap.c b/lib/bitmap.c index ce2ec80bf431..1e031f2c9aba 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -223,11 +223,14 @@ int __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int bits) { unsigned int k; - unsigned int nr = BITS_TO_LONGS(bits); + unsigned int lim = bits/BITS_PER_LONG; unsigned long result = 0; - for (k = 0; k < nr; k++) + for (k = 0; k < lim; k++) result |= (dst[k] = bitmap1[k] & ~bitmap2[k]); + if (bits % BITS_PER_LONG) + result |= (dst[k] = bitmap1[k] & ~bitmap2[k] & + BITMAP_LAST_WORD_MASK(bits)); return result != 0; } EXPORT_SYMBOL(__bitmap_andnot); -- cgit v1.2.3 From ebfdc40969f24fc0cdd1349835d36e8ebae05374 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 6 Aug 2014 16:10:27 -0700 Subject: checkpatch: attempt to find unnecessary 'out of memory' messages Logging messages that show some type of "out of memory" error are generally unnecessary as there is a generic message and a stack dump done by the memory subsystem. These messages generally increase kernel size without much added value. Emit a warning on these types of messages. 
This test looks for any inserted message function, then looks at the previous line for an "if (!foo)" or "if (foo == NULL)" test and then looks at the preceding statement for an allocation function like "foo = kmalloc()" ie: this code matches: foo = kmalloc(); if (foo == NULL) { printk("Out of memory\n"); return -ENOMEM; } This test is very crude and incomplete. It can miss quite a lot of OOM messages that do not have this specific form. ie: this code does not match: foo = kmalloc(); if (!foo) { rtn = -ENOMEM; printk("Out of memory!\n"); goto out; } This test could also produce a false positive when the logging message itself does not specify anything about memory, but I did not find any false positives in my limited testing. spatch could be a better solution but correctness seems non-trivial for that tool too. Signed-off-by: Joe Perches Acked-by: Dan Carpenter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 17 +++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 182be0f12407..ac1d6b0a6f09 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -4014,6 +4014,23 @@ sub process { } } +# check for unnecessary "Out of Memory" messages + if ($line =~ /^\+.*\b$logFunctions\s*\(/ && + $prevline =~ /^[ \+]\s*if\s*\(\s*(\!\s*|NULL\s*==\s*)?($Lval)(\s*==\s*NULL\s*)?\s*\)/ && + (defined $1 || defined $3) && + $linenr > 3) { + my $testval = $2; + my $testline = $lines[$linenr - 3]; + + my ($s, $c) = ctx_statement_block($linenr - 3, $realcnt, 0); +# print("line: <$line>\nprevline: <$prevline>\ns: <$s>\nc: <$c>\n\n\n"); + + if ($c =~ /(?:^|\n)[ \+]\s*(?:$Type\s*)?\Q$testval\E\s*=\s*(?:\([^\)]*\)\s*)?\s*(?:devm_)?(?:[kv][czm]alloc(?:_node|_array)?\b|kstrdup|(?:dev_)?alloc_skb)/) { + WARN("OOM_MESSAGE", + "Possible unnecessary 'out of memory' message\n" . $hereprev); + } + } + # check for bad placement of section $InitAttribute (e.g.: __initdata) if ($line =~ /(\b$InitAttribute\b)/) { my $attr = $1; -- cgit v1.2.3 From 032a4c0f9a77ce565355c6e191553e853ba66f09 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 6 Aug 2014 16:10:29 -0700 Subject: checkpatch: warn on unnecessary else after return or break Using an else following a break or return can unnecessarily indent code blocks. ie: for (i = 0; i < 100; i++) { int foo = bar(); if (foo < 1) break; else usleep(1); } is generally better written as: for (i = 0; i < 100; i++) { int foo = bar(); if (foo < 1) break; usleep(1); } Warn when a bare else statement is preceded by a break or return indented 1 tab more than the else.
if ($line =~ /^\+\s*\#\s*if.*\bCONFIG_EXPERIMENTAL\b/) { WARN("CONFIG_EXPERIMENTAL", -- cgit v1.2.3 From 356fd398135480363402cd334ac3218be56b7201 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 6 Aug 2014 16:10:31 -0700 Subject: checkpatch: fix complex macro false positive for escaped constant char A single escaped constant char is not a complex macro. Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index d833b737a295..cadf70f0201a 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -3772,7 +3772,7 @@ sub process { $dstat !~ /^(?:$Ident|-?$Constant),$/ && # 10, // foo(), $dstat !~ /^(?:$Ident|-?$Constant);$/ && # foo(); $dstat !~ /^[!~-]?(?:$Lval|$Constant)$/ && # 10 // foo() // !foo // ~foo // -foo // foo->bar // foo.bar->baz - $dstat !~ /^'X'$/ && # character constants + $dstat !~ /^'X'$/ && $dstat !~ /^'XX'$/ && # character constants $dstat !~ /$exceptions/ && $dstat !~ /^\.$Ident\s*=/ && # .foo = $dstat !~ /^(?:\#\s*$Ident|\#\s*$Constant)\s*$/ && # stringification #foo -- cgit v1.2.3 From 5a4e1fd37d0677755391e2e89e9386e072265157 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 6 Aug 2014 16:10:33 -0700 Subject: checkpatch: fix function pointers in blank line needed after declarations test Add a function pointer declaration check to the test for blank line needed after declarations. Signed-off-by: Joe Perches Reported-by: Bruce W Allan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index cadf70f0201a..538105ae88b3 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2295,6 +2295,8 @@ sub process { if ($sline =~ /^\+\s+\S/ && #Not at char 1 # actual declarations ($prevline =~ /^\+\s+$Declare\s*$Ident\s*[=,;:\[]/ || # function pointer declarations + $prevline =~ /^\+\s+$Declare\s*\(\s*\*\s*$Ident\s*\)\s*[=,;:\[\(]/ || # foo bar; where foo is some local typedef or #define $prevline =~ /^\+\s+$Ident(?:\s+|\s*\*\s*)$Ident\s*[=,;\[]/ || # known declaration macros @@ -2307,6 +2309,8 @@ sub process { $prevline =~ /(?:\{\s*|\\)$/) && # looks like a declaration !($sline =~ /^\+\s+$Declare\s*$Ident\s*[=,;:\[]/ || # function pointer declarations + $sline =~ /^\+\s+$Declare\s*\(\s*\*\s*$Ident\s*\)\s*[=,;:\[\(]/ || # foo bar; where foo is some local typedef or #define $sline =~ /^\+\s+$Ident(?:\s+|\s*\*\s*)$Ident\s*[=,;\[]/ || # known declaration macros -- cgit v1.2.3 From 29ee1b0c67e0dd7dea8dd718e8326076bce5b6fe Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 6 Aug 2014 16:10:35 -0700 Subject: checkpatch: ignore email headers better There are some patches created by git format-patch that, when scanned by checkpatch, report errors on lines like "To: address.tld". This is a checkpatch false positive. Improve the logic a bit to ignore folded email headers to avoid emitting these messages. Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 538105ae88b3..1ec68083a929 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -1637,7 +1637,7 @@ sub process { my $signoff = 0; my $is_patch = 0; - my $in_header_lines = 1; + my $in_header_lines = $file ?
0 : 1; my $in_commit_log = 0; #Scanning lines before patch my $non_utf8_charset = 0; @@ -1993,7 +1993,8 @@ sub process { # Check if it's the start of a commit log # (not a header line and we haven't seen the patch filename) if ($in_header_lines && $realfile =~ /^$/ && - $rawline !~ /^(commit\b|from\b|[\w-]+:).+$/i) { + !($rawline =~ /^\s+\S/ || + $rawline =~ /^(commit\b|from\b|[\w-]+:).*$/i)) { $in_header_lines = 0; $in_commit_log = 1; } -- cgit v1.2.3 From 048b123fad06f33169caa4afd6d56d58c31517e5 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 6 Aug 2014 16:10:37 -0700 Subject: checkpatch.pl: also suggest 'else if' when if follows brace This might help a kernel hacker think twice before blindly adding a newline. Signed-off-by: Rasmus Villemoes Acked-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 1ec68083a929..c40ba40cef43 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -3621,7 +3621,7 @@ sub process { # if should not continue a brace if ($line =~ /}\s*if\b/) { ERROR("TRAILING_STATEMENTS", - "trailing statements should be on next line\n" . + "trailing statements should be on next line (or did you mean 'else if'?)\n" . $herecurr); } # case and default should not have general statements after them -- cgit v1.2.3 From 7f61919144ca69ea29f29c3e60c5b7dbd2070aa6 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 6 Aug 2014 16:10:39 -0700 Subject: checkpatch: add test for blank lines after function/struct/union/enum Add a --strict test asking for a blank line after function/struct/union/enum declarations. Allow exceptions for several attributes and macro uses. Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index c40ba40cef43..9e4ba9fa9bc8 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2292,6 +2292,22 @@ sub process { "networking block comments put the trailing */ on a separate line\n" . $herecurr); } +# check for missing blank lines after struct/union declarations +# with exceptions for various attributes and macros + if ($prevline =~ /^[\+ ]};?\s*$/ && + $line =~ /^\+/ && + !($line =~ /^\+\s*$/ || + $line =~ /^\+\s*EXPORT_SYMBOL/ || + $line =~ /^\+\s*MODULE_/i || + $line =~ /^\+\s*\#\s*(?:end|elif|else)/ || + $line =~ /^\+[a-z_]*init/ || + $line =~ /^\+\s*(?:static\s+)?[A-Z_]*ATTR/ || + $line =~ /^\+\s*DECLARE/ || + $line =~ /^\+\s*__setup/)) { + CHK("LINE_SPACING", + "Please use a blank line after function/struct/union/enum declarations\n" . $hereprev); + } + # check for missing blank lines after declarations if ($sline =~ /^\+\s+\S/ && #Not at char 1 # actual declarations -- cgit v1.2.3 From 365dd4eaafa22d2c79913d5f057d636e8842c470 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 6 Aug 2014 16:10:42 -0700 Subject: checkpatch: add a multiple blank lines test Multiple consecutive blank lines waste screen space. Emit a --strict only message with these blank lines. 
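For instance (a hypothetical snippet; bar() is a made-up function), the new check would emit a --strict message on the doubled blank line here:

	static int foo(void)
	{
		int x = bar();


		return x;
	}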
Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 9e4ba9fa9bc8..fad2116f51a4 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -1642,6 +1642,8 @@ sub process { my $non_utf8_charset = 0; + my $last_blank_line = 0; + our @report = (); our $cnt_lines = 0; our $cnt_error = 0; @@ -2308,6 +2310,15 @@ sub process { "Please use a blank line after function/struct/union/enum declarations\n" . $hereprev); } +# check for multiple consecutive blank lines + if ($prevline =~ /^[\+ ]\s*$/ && + $line =~ /^\+\s*$/ && + $last_blank_line != ($linenr - 1)) { + CHK("LINE_SPACING", + "Please don't use multiple blank lines\n" . $hereprev); + $last_blank_line = $linenr; + } + # check for missing blank lines after declarations if ($sline =~ /^\+\s+\S/ && #Not at char 1 # actual declarations -- cgit v1.2.3 From fee0aa83d43ee65648778d48d47975c323fa0490 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 6 Aug 2014 16:10:44 -0700 Subject: checkpatch: change blank line after declaration type to "LINE_SPACING" Make it consistent with the other missing or multiple blank line tests. Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index fad2116f51a4..146e9f907280 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2353,7 +2353,7 @@ sub process { $sline =~ /^\+\s+\(?\s*(?:$Compare|$Assignment|$Operators)/) && # indentation of previous and current line are the same (($prevline =~ /\+(\s+)\S/) && $sline =~ /^\+$1\S/)) { - WARN("SPACING", + WARN("LINE_SPACING", "Missing a blank line after declarations\n" . $hereprev); } -- cgit v1.2.3 From 8d73e0e7dc18a39b120cb85ec675d59516e0af1a Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 6 Aug 2014 16:10:46 -0700 Subject: checkpatch: quiet Kconfig help message checking Editing Kconfig dependencies can emit unnecessary messages about missing or too short help entries. Only emit the message when adding help sections to Kconfig files. Signed-off-by: Joe Perches Reported-by: Jean Delvare Tested-by: Jean Delvare Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 146e9f907280..df4250a8ad51 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2052,7 +2052,7 @@ sub process { # Only applies when adding the entry originally, after that we do not have # sufficient context to determine whether it is indeed long enough. 
if ($realfile =~ /Kconfig/ && - $line =~ /.\s*config\s+/) { + $line =~ /^\+\s*config\s+/) { my $length = 0; my $cnt = $realcnt; my $ln = $linenr + 1; @@ -2065,10 +2065,11 @@ sub process { $is_end = $lines[$ln - 1] =~ /^\+/; next if ($f =~ /^-/); + last if (!$file && $f =~ /^\@\@/); - if ($lines[$ln - 1] =~ /.\s*(?:bool|tristate)\s*\"/) { + if ($lines[$ln - 1] =~ /^\+\s*(?:bool|tristate)\s*\"/) { $is_start = 1; - } elsif ($lines[$ln - 1] =~ /.\s*(?:---)?help(?:---)?$/) { + } elsif ($lines[$ln - 1] =~ /^\+\s*(?:---)?help(?:---)?$/) { $length = -1; } -- cgit v1.2.3 From e2826fd07029e14285c178b41b18f6edc3b15a84 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 6 Aug 2014 16:10:48 -0700 Subject: checkpatch: warn on unnecessary parentheses around references of foo->bar Parentheses around &(foo->bar) and *(foo->bar) are unnecessary. Emit a --strict only message on these uses. Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index df4250a8ad51..9a89a0609bac 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -3491,6 +3491,14 @@ sub process { } } +# check unnecessary parentheses around addressof/dereference single $Lvals +# ie: &(foo->bar) should be &foo->bar and *(foo->bar) should be *foo->bar + + while ($line =~ /(?:[^&]&\s*|\*)\(\s*($Ident\s*(?:$Member\s*)+)\s*\)/g) { + CHK("UNNECESSARY_PARENTHESES", + "Unnecessary parentheses around $1\n" . $herecurr); + } + #goto labels aren't indented, allow a single space however if ($line=~/^.\s+[A-Za-z\d_]+:(?![0-9]+)/ and !($line=~/^. [A-Za-z\d_]+:/) and !($line=~/^.\s+default:/)) { -- cgit v1.2.3 From 1574a29f8e769998fe3e55eb6030dc61e3d09ccd Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 6 Aug 2014 16:10:50 -0700 Subject: checkpatch: allow multiple const * types checkpatch's $Type variable does not match declarations of multiple const * types. This can produce false positives for things like: $ ./scripts/checkpatch.pl -f drivers/staging/comedi/comedidev.h WARNING: Missing a blank line after declarations #60: FILE: drivers/staging/comedi/comedidev.h:60: + const struct comedi_lrange *range_table; + const struct comedi_lrange *const *range_table_list; Fix the $Type variable to support matching multiple "* const" uses. Signed-off-by: Joe Perches Reported-by: Hartley Sweeten Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 9a89a0609bac..57a11d7ee841 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -435,7 +435,7 @@ sub build_types { }x; $Type = qr{ $NonptrType - (?:(?:\s|\*|\[\])+\s*const|(?:\s|\*|\[\])+|(?:\s*\[\s*\])+)? + (?:(?:\s|\*|\[\])+\s*const|(?:\s|\*\s*(?:const\s*)?|\[\])+|(?:\s*\[\s*\])+)? (?:\s+$Inline|\s+$Modifier)* }x; $Declare = qr{(?:$Storage\s+(?:$Inline\s+)?)?$Type}; -- cgit v1.2.3 From f27c95db1176b41c6c87d9d3480d15efbca8f3ff Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 6 Aug 2014 16:10:52 -0700 Subject: checkpatch: improve "no space after cast" test This --strict test previously worked only for what appeared to be cast to pointer types. Make it work for all casts. Also, there's no reason to show the previous line for this type of message, so don't. 
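As a hypothetical before/after pair, the broadened test now also catches value casts, not just pointer casts:

	a = (s16) b;		/* newly flagged: space after a value cast */
	p = (char *) q;		/* already flagged by the old pointer-only test */

	a = (s16)b;		/* preferred */
	p = (char *)q;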
Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 57a11d7ee841..b351805149a5 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2260,12 +2260,12 @@ sub process { } } - if ($line =~ /^\+.*\*[ \t]*\)[ \t]+(?!$Assignment|$Arithmetic)/) { + if ($line =~ /^\+.*\(\s*$Type\s*\)[ \t]+(?!$Assignment|$Arithmetic)/) { if (CHK("SPACING", - "No space is necessary after a cast\n" . $hereprev) && + "No space is necessary after a cast\n" . $herecurr) && $fix) { $fixed[$linenr - 1] =~ - s/^(\+.*\*[ \t]*\))[ \t]+/$1/; + s/(\(\s*$Type\s*\))[ \t]+/$1/; } } -- cgit v1.2.3 From e367455a9f25b11e02b7ea7678a7b146bdd6667e Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 6 Aug 2014 16:10:55 -0700 Subject: checkpatch: emit fewer kmalloc_array/kcalloc conversion warnings Avoid matching allocs that appear to be known small multiplications of a sizeof with a constant because gcc as of 4.8 cannot optimize the code in a calloc() exactly the same way as an alloc(). Look for numeric constants or what appear to be upper case only macro #defines. Signed-off-by: Joe Perches Reported-by: Theodore Ts'o Original-patch-by: Fabian Frederick Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 17 +++++++++--------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index b351805149a5..3661ffc7af23 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -4487,22 +4487,23 @@ sub process { # check for k[mz]alloc with multiplies that could be kmalloc_array/kcalloc if ($^V && $^V ge 5.10.0 && - $line =~ /\b($Lval)\s*\=\s*(?:$balanced_parens)?\s*(k[mz]alloc)\s*\(\s*($FuncArg)\s*\*\s*($FuncArg)/) { + $line =~ /\b($Lval)\s*\=\s*(?:$balanced_parens)?\s*(k[mz]alloc)\s*\(\s*($FuncArg)\s*\*\s*($FuncArg)\s*,/) { my $oldfunc = $3; my $a1 = $4; my $a2 = $10; my $newfunc = "kmalloc_array"; $newfunc = "kcalloc" if ($oldfunc eq "kzalloc"); - if ($a1 =~ /^sizeof\s*\S/ || $a2 =~ /^sizeof\s*\S/) { + my $r1 = $a1; + my $r2 = $a2; + if ($a1 =~ /^sizeof\s*\S/) { + $r1 = $a2; + $r2 = $a1; + } + if ($r1 !~ /^sizeof\b/ && $r2 =~ /^sizeof\s*\S/ && + !($r1 =~ /^$Constant$/ || $r1 =~ /^[A-Z_][A-Z0-9_]*$/)) { if (WARN("ALLOC_WITH_MULTIPLY", "Prefer $newfunc over $oldfunc with multiply\n" . $herecurr) && $fix) { - my $r1 = $a1; - my $r2 = $a2; - if ($a1 =~ /^sizeof\s*\S/) { - $r1 = $a2; - $r2 = $a1; - } $fixed[$linenr - 1] =~ s/\b($Lval)\s*\=\s*(?:$balanced_parens)?\s*(k[mz]alloc)\s*\(\s*($FuncArg)\s*\*\s*($FuncArg)/$1 . ' = ' . "$newfunc(" . trim($r1) . ', ' . trim($r2)/e; } } -- cgit v1.2.3 From d311cd44545f2f69749c68d6723360b4cc21cd02 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 6 Aug 2014 16:10:57 -0700 Subject: checkpatch: add test for commit id formatting style in commit log Commit logs have various forms of commit id references. Try to standardize on a 12 character long lower case commit id, followed by the quoted subject line in parentheses. ie: commit 0123456789ab ("commit description") If git and a git tree exist, look up the commit id and emit the appropriate line as part of the message.
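For example (illustrative references, not real commits), a log line such as

	commit 1a2b3c4d5e6f ("subsystem: fix the thing")

passes, while a truncated or upper-case reference like "commit 1A2B3C" now draws a GIT_COMMIT_ID error, with a correctly formatted line suggested whenever a git tree is available to resolve the id.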
Signed-off-by: Joe Perches Requested-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 54 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 3661ffc7af23..496f9abdb930 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -550,6 +550,34 @@ sub seed_camelcase_includes { } } +sub git_commit_info { + my ($commit, $id, $desc) = @_; + + return ($id, $desc) if ((which("git") eq "") || !(-e ".git")); + + my $output = `git log --no-color --format='%H %s' -1 $commit 2>&1`; + $output =~ s/^\s*//gm; + my @lines = split("\n", $output); + + if ($lines[0] =~ /^error: short SHA1 $commit is ambiguous\./) { +# Maybe one day convert this block of bash into something that returns +# all matching commit ids, but it's very slow... +# +# echo "checking commits $1..." +# git rev-list --remotes | grep -i "^$1" | +# while read line ; do +# git log --format='%H %s' -1 $line | +# echo "commit $(cut -c 1-12,41-)" +# done + } elsif ($lines[0] =~ /^fatal: ambiguous argument '$commit': unknown revision or path not in the working tree\./) { + } else { + $id = substr($lines[0], 0, 12); + $desc = substr($lines[0], 41); + } + + return ($id, $desc); +} + $chk_signoff = 0 if ($file); my @rawlines = (); @@ -674,6 +702,18 @@ sub format_email { return $formatted_email; } +sub which { + my ($bin) = @_; + + foreach my $path (split(/:/, $ENV{PATH})) { + if (-e "$path/$bin") { + return "$path/$bin"; + } + } + + return ""; +} + sub which_conf { my ($conf) = @_; @@ -1958,6 +1998,20 @@ sub process { "Remove Gerrit Change-Id's before submitting upstream.\n" . $herecurr); } +# Check for improperly formed commit descriptions + if ($in_commit_log && + $line =~ /\bcommit\s+[0-9a-f]{5,}/i && + $line !~ /\b[Cc]ommit [0-9a-f]{12,16} \("/) { + $line =~ /\b(c)ommit\s+([0-9a-f]{5,})/i; + my $init_char = $1; + my $orig_commit = lc($2); + my $id = '01234567890ab'; + my $desc = 'commit description'; + ($id, $desc) = git_commit_info($orig_commit, $id, $desc); + ERROR("GIT_COMMIT_ID", + "Please use 12 to 16 chars for the git commit ID like: '${init_char}ommit $id (\"$desc\")'\n" . $herecurr); + } + # Check for wrappage within a valid hunk of the file if ($realcnt != 0 && $line !~ m{^(?:\+|-| |\\ No newline|$)}) { ERROR("CORRUPTED_PATCH", -- cgit v1.2.3 From 13f1937ef33950b1112049972249e6191b82e6c9 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 6 Aug 2014 16:10:59 -0700 Subject: checkpatch: emit a warning on file add/move/delete Whenever files are added, moved, or deleted, the MAINTAINERS file patterns can be out of sync or outdated. To try to keep MAINTAINERS more up-to-date, add a one-time warning whenever a patch does any of those. Signed-off-by: Joe Perches Acked-by: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 496f9abdb930..0cf8b98d8dc9 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -1679,7 +1679,7 @@ sub process { my $in_header_lines = $file ? 0 : 1; my $in_commit_log = 0; #Scanning lines before patch - + my $reported_maintainer_file = 0; my $non_utf8_charset = 0; my $last_blank_line = 0; @@ -2012,6 +2012,17 @@ sub process { "Please use 12 to 16 chars for the git commit ID like: '${init_char}ommit $id (\"$desc\")'\n" . 
$herecurr); } +# Check for added, moved or deleted files + if (!$reported_maintainer_file && !$in_commit_log && + ($line =~ /^(?:new|deleted) file mode\s*\d+\s*$/ || + $line =~ /^rename (?:from|to) [\w\/\.\-]+\s*$/ || + ($line =~ /\{\s*([\w\/\.\-]*)\s*\=\>\s*([\w\/\.\-]*)\s*\}/ && + (defined($1) || defined($2))))) { + $reported_maintainer_file = 1; + WARN("FILE_PATH_CHANGES", + "added, moved or deleted file(s), does MAINTAINERS need updating?\n" . $herecurr); + } + # Check for wrappage within a valid hunk of the file if ($realcnt != 0 && $line !~ m{^(?:\+|-| |\\ No newline|$)}) { ERROR("CORRUPTED_PATCH", -- cgit v1.2.3 From c00df19a5026f2cb08f1770dcc29226512f4d099 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 6 Aug 2014 16:11:01 -0700 Subject: checkpatch: warn on break after goto or return with same tab indentation Using break; after a goto or return is unnecessary so emit a warning when the break is at the same indent level. So this emits a warning on: switch (foo) { case 1: goto err; break; } but not on: switch (foo) { case 1: if (bar()) goto err; break; } Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 0cf8b98d8dc9..fc72d64cd2d4 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2450,6 +2450,16 @@ sub process { } } +# check indentation of a line with a break; +# if the previous line is a goto or return and is indented the same # of tabs + if ($sline =~ /^\+([\t]+)break\s*;\s*$/) { + my $tabs = $1; + if ($prevline =~ /^\+$tabs(?:goto|return)\b/) { + WARN("UNNECESSARY_BREAK", + "break is not useful after a goto or return\n" . $hereprev); + } + } + # discourage the addition of CONFIG_EXPERIMENTAL in #if(def). if ($line =~ /^\+\s*\#\s*if.*\bCONFIG_EXPERIMENTAL\b/) { WARN("CONFIG_EXPERIMENTAL", -- cgit v1.2.3 From 194f66fc9567367b3fa2dcbc1b87b091330ef186 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 6 Aug 2014 16:11:03 -0700 Subject: checkpatch: add an index variable for fixed lines Make the fix code a bit easier to read. This should also start to allow an easier mechanism to insert/delete lines eventually too. Signed-off-by: Joe Perches Cc: Andy Whitcroft Cc: Dan Carpenter Cc: Josh Triplett Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 114 ++++++++++++++++++++++++++------------------------ 1 file changed, 60 insertions(+), 54 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index fc72d64cd2d4..f905d7b50530 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -583,6 +583,8 @@ $chk_signoff = 0 if ($file); my @rawlines = (); my @lines = (); my @fixed = (); +my $fixlinenr = -1; + my $vname; for my $filename (@ARGV) { my $FILE; @@ -611,6 +613,7 @@ for my $filename (@ARGV) { @rawlines = (); @lines = (); @fixed = (); + $fixlinenr = -1; } exit($exit); @@ -1801,8 +1804,10 @@ sub process { $realcnt = 0; $linenr = 0; + $fixlinenr = -1; foreach my $line (@lines) { $linenr++; + $fixlinenr++; my $sline = $line; #copy of $line $sline =~ s/$;/ /g; #with comments as spaces @@ -1933,7 +1938,7 @@ sub process { if (WARN("BAD_SIGN_OFF", "Do not use whitespace before $ucfirst_sign_off\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] = + $fixed[$fixlinenr] = "$ucfirst_sign_off $email"; } } @@ -1941,7 +1946,7 @@ sub process { if (WARN("BAD_SIGN_OFF", "'$ucfirst_sign_off' is the preferred signature form\n" . 
$herecurr) && $fix) { - $fixed[$linenr - 1] = + $fixed[$fixlinenr] = "$ucfirst_sign_off $email"; } @@ -1950,7 +1955,7 @@ sub process { if (WARN("BAD_SIGN_OFF", "Use a single space after $ucfirst_sign_off\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] = + $fixed[$fixlinenr] = "$ucfirst_sign_off $email"; } } @@ -2089,14 +2094,14 @@ sub process { if (ERROR("DOS_LINE_ENDINGS", "DOS line endings\n" . $herevet) && $fix) { - $fixed[$linenr - 1] =~ s/[\s\015]+$//; + $fixed[$fixlinenr] =~ s/[\s\015]+$//; } } elsif ($rawline =~ /^\+.*\S\s+$/ || $rawline =~ /^\+\s+$/) { my $herevet = "$here\n" . cat_vet($rawline) . "\n"; if (ERROR("TRAILING_WHITESPACE", "trailing whitespace\n" . $herevet) && $fix) { - $fixed[$linenr - 1] =~ s/\s+$//; + $fixed[$fixlinenr] =~ s/\s+$//; } $rpt_cleaners = 1; @@ -2235,7 +2240,7 @@ sub process { if (WARN("QUOTED_WHITESPACE_BEFORE_NEWLINE", "unnecessary whitespace before a quoted newline\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ s/^(\+.*\".*)\s+\\n/$1\\n/; + $fixed[$fixlinenr] =~ s/^(\+.*\".*)\s+\\n/$1\\n/; } } @@ -2272,7 +2277,7 @@ sub process { if (ERROR("CODE_INDENT", "code indent should use tabs where possible\n" . $herevet) && $fix) { - $fixed[$linenr - 1] =~ s/^\+([ \t]+)/"\+" . tabify($1)/e; + $fixed[$fixlinenr] =~ s/^\+([ \t]+)/"\+" . tabify($1)/e; } } @@ -2282,9 +2287,9 @@ sub process { if (WARN("SPACE_BEFORE_TAB", "please, no space before tabs\n" . $herevet) && $fix) { - while ($fixed[$linenr - 1] =~ + while ($fixed[$fixlinenr] =~ s/(^\+.*) {8,8}+\t/$1\t\t/) {} - while ($fixed[$linenr - 1] =~ + while ($fixed[$fixlinenr] =~ s/(^\+.*) +\t/$1\t/) {} } } @@ -2318,7 +2323,7 @@ sub process { if (CHK("PARENTHESIS_ALIGNMENT", "Alignment should match open parenthesis\n" . $hereprev) && $fix && $line =~ /^\+/) { - $fixed[$linenr - 1] =~ + $fixed[$fixlinenr] =~ s/^\+[ \t]*/\+$goodtabindent/; } } @@ -2329,7 +2334,7 @@ sub process { if (CHK("SPACING", "No space is necessary after a cast\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ + $fixed[$fixlinenr] =~ s/(\(\s*$Type\s*\))[ \t]+/$1/; } } @@ -2433,7 +2438,7 @@ sub process { if (WARN("LEADING_SPACE", "please, no spaces at the start of a line\n" . $herevet) && $fix) { - $fixed[$linenr - 1] =~ s/^\+([ \t]+)/"\+" . tabify($1)/e; + $fixed[$fixlinenr] =~ s/^\+([ \t]+)/"\+" . tabify($1)/e; } } @@ -2798,10 +2803,10 @@ sub process { if (ERROR("C99_COMMENTS", "do not use C99 // comments\n" . $herecurr) && $fix) { - my $line = $fixed[$linenr - 1]; + my $line = $fixed[$fixlinenr]; if ($line =~ /\/\/(.*)$/) { my $comment = trim($1); - $fixed[$linenr - 1] =~ s@\/\/(.*)$@/\* $comment \*/@; + $fixed[$fixlinenr] =~ s@\/\/(.*)$@/\* $comment \*/@; } } } @@ -2860,7 +2865,7 @@ sub process { "do not initialise globals to 0 or NULL\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ s/($Type\s*$Ident\s*(?:\s+$Modifier))*\s*=\s*(0|NULL|false)\s*;/$1;/; + $fixed[$fixlinenr] =~ s/($Type\s*$Ident\s*(?:\s+$Modifier))*\s*=\s*(0|NULL|false)\s*;/$1;/; } } # check for static initialisers. @@ -2869,7 +2874,7 @@ sub process { "do not initialise statics to 0 or NULL\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ s/(\bstatic\s.*?)\s*=\s*(0|NULL|false)\s*;/$1;/; + $fixed[$fixlinenr] =~ s/(\bstatic\s.*?)\s*=\s*(0|NULL|false)\s*;/$1;/; } } @@ -2899,7 +2904,7 @@ sub process { if (ERROR("FUNCTION_WITHOUT_ARGS", "Bad function definition - $1() should probably be $1(void)\n" . 
$herecurr) && $fix) { - $fixed[$linenr - 1] =~ s/(\b($Type)\s+($Ident))\s*\(\s*\)/$2 $3(void)/; + $fixed[$fixlinenr] =~ s/(\b($Type)\s+($Ident))\s*\(\s*\)/$2 $3(void)/; } } @@ -2908,7 +2913,7 @@ sub process { if (WARN("DEFINE_PCI_DEVICE_TABLE", "Prefer struct pci_device_id over deprecated DEFINE_PCI_DEVICE_TABLE\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ s/\b(?:static\s+|)DEFINE_PCI_DEVICE_TABLE\s*\(\s*(\w+)\s*\)\s*=\s*/static const struct pci_device_id $1\[\] = /; + $fixed[$fixlinenr] =~ s/\b(?:static\s+|)DEFINE_PCI_DEVICE_TABLE\s*\(\s*(\w+)\s*\)\s*=\s*/static const struct pci_device_id $1\[\] = /; } } @@ -2945,7 +2950,7 @@ sub process { my $sub_from = $ident; my $sub_to = $ident; $sub_to =~ s/\Q$from\E/$to/; - $fixed[$linenr - 1] =~ + $fixed[$fixlinenr] =~ s@\Q$sub_from\E@$sub_to@; } } @@ -2973,7 +2978,7 @@ sub process { my $sub_from = $match; my $sub_to = $match; $sub_to =~ s/\Q$from\E/$to/; - $fixed[$linenr - 1] =~ + $fixed[$fixlinenr] =~ s@\Q$sub_from\E@$sub_to@; } } @@ -3035,7 +3040,7 @@ sub process { if (WARN("PREFER_PR_LEVEL", "Prefer pr_warn(... to pr_warning(...\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ + $fixed[$fixlinenr] =~ s/\bpr_warning\b/pr_warn/; } } @@ -3069,7 +3074,7 @@ sub process { if (WARN("SPACING", "missing space after $1 definition\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ + $fixed[$fixlinenr] =~ s/^(.\s*(?:typedef\s+)?(?:enum|union|struct)(?:\s+$Ident){1,2})([=\{])/$1 $2/; } } @@ -3139,7 +3144,7 @@ sub process { } if (show_type("SPACING") && $fix) { - $fixed[$linenr - 1] =~ + $fixed[$fixlinenr] =~ s/^(.\s*)$Declare\s*\(\s*\*\s*$Ident\s*\)\s*\(/$1 . $declare . $post_declare_space . '(*' . $funcname . ')('/ex; } } @@ -3156,7 +3161,7 @@ sub process { if (ERROR("BRACKET_SPACE", "space prohibited before open square bracket '['\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ + $fixed[$fixlinenr] =~ s/^(\+.*?)\s+\[/$1\[/; } } @@ -3191,7 +3196,7 @@ sub process { if (WARN("SPACING", "space prohibited between function name and open parenthesis '('\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ + $fixed[$fixlinenr] =~ s/\b$name\s+\(/$name\(/; } } @@ -3459,8 +3464,8 @@ sub process { $fixed_line = $fixed_line . $fix_elements[$#elements]; } - if ($fix && $line_fixed && $fixed_line ne $fixed[$linenr - 1]) { - $fixed[$linenr - 1] = $fixed_line; + if ($fix && $line_fixed && $fixed_line ne $fixed[$fixlinenr]) { + $fixed[$fixlinenr] = $fixed_line; } @@ -3471,7 +3476,7 @@ sub process { if (WARN("SPACING", "space prohibited before semicolon\n" . $herecurr) && $fix) { - 1 while $fixed[$linenr - 1] =~ + 1 while $fixed[$fixlinenr] =~ s/^(\+.*\S)\s+;/$1;/; } } @@ -3504,7 +3509,7 @@ sub process { if (ERROR("SPACING", "space required before the open brace '{'\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ s/^(\+.*(?:do|\))){/$1 {/; + $fixed[$fixlinenr] =~ s/^(\+.*(?:do|\))){/$1 {/; } } @@ -3522,7 +3527,7 @@ sub process { if (ERROR("SPACING", "space required after that close brace '}'\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ + $fixed[$fixlinenr] =~ s/}((?!(?:,|;|\)))\S)/} $1/; } } @@ -3532,7 +3537,7 @@ sub process { if (ERROR("SPACING", "space prohibited after that open square bracket '['\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ + $fixed[$fixlinenr] =~ s/\[\s+/\[/; } } @@ -3540,7 +3545,7 @@ sub process { if (ERROR("SPACING", "space prohibited before that close square bracket ']'\n" . 
$herecurr) && $fix) { - $fixed[$linenr - 1] =~ + $fixed[$fixlinenr] =~ s/\s+\]/\]/; } } @@ -3551,7 +3556,7 @@ sub process { if (ERROR("SPACING", "space prohibited after that open parenthesis '('\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ + $fixed[$fixlinenr] =~ s/\(\s+/\(/; } } @@ -3561,7 +3566,8 @@ sub process { if (ERROR("SPACING", "space prohibited before that close parenthesis ')'\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ + print("fixlinenr: <$fixlinenr> fixed[fixlinenr]: <$fixed[$fixlinenr]>\n"); + $fixed[$fixlinenr] =~ s/\s+\)/\)/; } } @@ -3580,7 +3586,7 @@ sub process { if (WARN("INDENTED_LABEL", "labels should not be indented\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ + $fixed[$fixlinenr] =~ s/^(.)\s+/$1/; } } @@ -3642,7 +3648,7 @@ sub process { if (ERROR("SPACING", "space required before the open parenthesis '('\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ + $fixed[$fixlinenr] =~ s/\b(if|while|for|switch)\(/$1 \(/; } } @@ -3779,7 +3785,7 @@ sub process { "Avoid gcc v4.3+ binary constant extension: <$var>\n" . $herecurr) && $fix) { my $hexval = sprintf("0x%x", oct($var)); - $fixed[$linenr - 1] =~ + $fixed[$fixlinenr] =~ s/\b$var\b/$hexval/; } } @@ -3815,7 +3821,7 @@ sub process { if (WARN("WHITESPACE_AFTER_LINE_CONTINUATION", "Whitespace after \\ makes next lines useless\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ s/\s+$//; + $fixed[$fixlinenr] =~ s/\s+$//; } } @@ -4170,7 +4176,7 @@ sub process { WARN("MISPLACED_INIT", "$attr should be placed after $var\n" . $herecurr))) && $fix) { - $fixed[$linenr - 1] =~ s/(\bstatic\s+(?:const\s+)?)(?:$attr\s+)?($NonptrTypeWithAttr)\s+(?:$attr\s+)?($Ident(?:\[[^]]*\])?)\s*([=;])\s*/"$1" . trim(string_find_replace($2, "\\s*$attr\\s*", " ")) . " " . trim(string_find_replace($3, "\\s*$attr\\s*", "")) . " $attr" . ("$4" eq ";" ? ";" : " = ")/e; + $fixed[$fixlinenr] =~ s/(\bstatic\s+(?:const\s+)?)(?:$attr\s+)?($NonptrTypeWithAttr)\s+(?:$attr\s+)?($Ident(?:\[[^]]*\])?)\s*([=;])\s*/"$1" . trim(string_find_replace($2, "\\s*$attr\\s*", " ")) . " " . trim(string_find_replace($3, "\\s*$attr\\s*", "")) . " $attr" . ("$4" eq ";" ? ";" : " = ")/e; } } } @@ -4184,7 +4190,7 @@ sub process { if (ERROR("INIT_ATTRIBUTE", "Use of const init definition must use ${attr_prefix}initconst\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ + $fixed[$fixlinenr] =~ s/$InitAttributeData/${attr_prefix}initconst/; } } @@ -4195,12 +4201,12 @@ sub process { if (ERROR("INIT_ATTRIBUTE", "Use of $attr requires a separate use of const\n" . $herecurr) && $fix) { - my $lead = $fixed[$linenr - 1] =~ + my $lead = $fixed[$fixlinenr] =~ /(^\+\s*(?:static\s+))/; $lead = rtrim($1); $lead = "$lead " if ($lead !~ /^\+$/); $lead = "${lead}const "; - $fixed[$linenr - 1] =~ s/(^\+\s*(?:static\s+))/$lead/; + $fixed[$fixlinenr] =~ s/(^\+\s*(?:static\s+))/$lead/; } } @@ -4213,7 +4219,7 @@ sub process { if (WARN("CONSTANT_CONVERSION", "$constant_func should be $func\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ s/\b$constant_func\b/$func/g; + $fixed[$fixlinenr] =~ s/\b$constant_func\b/$func/g; } } @@ -4263,7 +4269,7 @@ sub process { if (ERROR("SPACING", "exactly one space required after that #$1\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ + $fixed[$fixlinenr] =~ s/^(.\s*\#\s*(ifdef|ifndef|elif))\s{2,}/$1 /; } @@ -4311,7 +4317,7 @@ sub process { if (WARN("INLINE", "plain inline is preferred over $1\n" . 
$herecurr) && $fix) { - $fixed[$linenr - 1] =~ s/\b(__inline__|__inline)\b/inline/; + $fixed[$fixlinenr] =~ s/\b(__inline__|__inline)\b/inline/; } } @@ -4336,7 +4342,7 @@ sub process { if (WARN("PREFER_PRINTF", "__printf(string-index, first-to-check) is preferred over __attribute__((format(printf, string-index, first-to-check)))\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ s/\b__attribute__\s*\(\s*\(\s*format\s*\(\s*printf\s*,\s*(.*)\)\s*\)\s*\)/"__printf(" . trim($1) . ")"/ex; + $fixed[$fixlinenr] =~ s/\b__attribute__\s*\(\s*\(\s*format\s*\(\s*printf\s*,\s*(.*)\)\s*\)\s*\)/"__printf(" . trim($1) . ")"/ex; } } @@ -4347,7 +4353,7 @@ sub process { if (WARN("PREFER_SCANF", "__scanf(string-index, first-to-check) is preferred over __attribute__((format(scanf, string-index, first-to-check)))\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ s/\b__attribute__\s*\(\s*\(\s*format\s*\(\s*scanf\s*,\s*(.*)\)\s*\)\s*\)/"__scanf(" . trim($1) . ")"/ex; + $fixed[$fixlinenr] =~ s/\b__attribute__\s*\(\s*\(\s*format\s*\(\s*scanf\s*,\s*(.*)\)\s*\)\s*\)/"__scanf(" . trim($1) . ")"/ex; } } @@ -4362,7 +4368,7 @@ sub process { if (WARN("SIZEOF_PARENTHESIS", "sizeof $1 should be sizeof($1)\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ s/\bsizeof\s+((?:\*\s*|)$Lval|$Type(?:\s+$Lval|))/"sizeof(" . trim($1) . ")"/ex; + $fixed[$fixlinenr] =~ s/\bsizeof\s+((?:\*\s*|)$Lval|$Type(?:\s+$Lval|))/"sizeof(" . trim($1) . ")"/ex; } } @@ -4385,7 +4391,7 @@ sub process { if (WARN("PREFER_SEQ_PUTS", "Prefer seq_puts to seq_printf\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ s/\bseq_printf\b/seq_puts/; + $fixed[$fixlinenr] =~ s/\bseq_printf\b/seq_puts/; } } } @@ -4414,7 +4420,7 @@ sub process { if (WARN("PREFER_ETHER_ADDR_COPY", "Prefer ether_addr_copy() over memcpy() if the Ethernet addresses are __aligned(2)\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ s/\bmemcpy\s*\(\s*$FuncArg\s*,\s*$FuncArg\s*\,\s*ETH_ALEN\s*\)/ether_addr_copy($2, $7)/; + $fixed[$fixlinenr] =~ s/\bmemcpy\s*\(\s*$FuncArg\s*,\s*$FuncArg\s*\,\s*ETH_ALEN\s*\)/ether_addr_copy($2, $7)/; } } @@ -4502,7 +4508,7 @@ sub process { if (CHK("AVOID_EXTERNS", "extern prototypes should be avoided in .h files\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ s/(.*)\bextern\b\s*(.*)/$1$2/; + $fixed[$fixlinenr] =~ s/(.*)\bextern\b\s*(.*)/$1$2/; } } @@ -4579,7 +4585,7 @@ sub process { if (WARN("ALLOC_WITH_MULTIPLY", "Prefer $newfunc over $oldfunc with multiply\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ s/\b($Lval)\s*\=\s*(?:$balanced_parens)?\s*(k[mz]alloc)\s*\(\s*($FuncArg)\s*\*\s*($FuncArg)/$1 . ' = ' . "$newfunc(" . trim($r1) . ', ' . trim($r2)/e; + $fixed[$fixlinenr] =~ s/\b($Lval)\s*\=\s*(?:$balanced_parens)?\s*(k[mz]alloc)\s*\(\s*($FuncArg)\s*\*\s*($FuncArg)/$1 . ' = ' . "$newfunc(" . trim($r1) . ', ' . trim($r2)/e; } } @@ -4603,7 +4609,7 @@ sub process { if (WARN("ONE_SEMICOLON", "Statements terminations use 1 semicolon\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ s/(\s*;\s*){2,}$/;/g; + $fixed[$fixlinenr] =~ s/(\s*;\s*){2,}$/;/g; } } @@ -4651,7 +4657,7 @@ sub process { if (WARN("USE_FUNC", "__func__ should be used instead of gcc specific __FUNCTION__\n" . 
$herecurr) &&
 			    $fix) {
-				$fixed[$linenr - 1] =~ s/\b__FUNCTION__\b/__func__/g;
+				$fixed[$fixlinenr] =~ s/\b__FUNCTION__\b/__func__/g;
 			}
 		}
 
-- cgit v1.2.3


From d752fcc88b7dddc0bbe6d43b658b306a9d7d1dae Mon Sep 17 00:00:00 2001
From: Joe Perches
Date: Wed, 6 Aug 2014 16:11:05 -0700
Subject: checkpatch: add ability to insert and delete lines to patch/file

This can be valuable to insert or delete blank lines as well as fix
misplaced brace or else uses.

Store indexes of lines to be added/deleted and the new lines.

When creating the --fix file, insert or delete the appropriate lines and
update the patch range information.

Signed-off-by: Joe Perches
Cc: Andy Whitcroft
Cc: Dan Carpenter
Cc: Josh Triplett
Cc: Greg Kroah-Hartman
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 scripts/checkpatch.pl | 141 ++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 130 insertions(+), 11 deletions(-)

diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index f905d7b50530..f3d9a883f8be 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -583,6 +583,8 @@ $chk_signoff = 0 if ($file);
 my @rawlines = ();
 my @lines = ();
 my @fixed = ();
+my @fixed_inserted = ();
+my @fixed_deleted = ();
 my $fixlinenr = -1;
 
 my $vname;
@@ -613,6 +615,8 @@ for my $filename (@ARGV) {
 	@rawlines = ();
 	@lines = ();
 	@fixed = ();
+	@fixed_inserted = ();
+	@fixed_deleted = ();
 	$fixlinenr = -1;
 }
 
@@ -1526,6 +1530,69 @@ sub report_dump {
 	our @report;
 }
 
+sub fixup_current_range {
+	my ($lineRef, $offset, $length) = @_;
+
+	if ($$lineRef =~ /^\@\@ -\d+,\d+ \+(\d+),(\d+) \@\@/) {
+		my $o = $1;
+		my $l = $2;
+		my $no = $o + $offset;
+		my $nl = $l + $length;
+		$$lineRef =~ s/\+$o,$l \@\@/\+$no,$nl \@\@/;
+	}
+}
+
+sub fix_inserted_deleted_lines {
+	my ($linesRef, $insertedRef, $deletedRef) = @_;
+
+	my $range_last_linenr = 0;
+	my $delta_offset = 0;
+
+	my $old_linenr = 0;
+	my $new_linenr = 0;
+
+	my $next_insert = 0;
+	my $next_delete = 0;
+
+	my @lines = ();
+
+	my $inserted = @{$insertedRef}[$next_insert++];
+	my $deleted = @{$deletedRef}[$next_delete++];
+
+	foreach my $old_line (@{$linesRef}) {
+		my $save_line = 1;
+		my $line = $old_line;	#don't modify the array
+		if ($line =~ /^(?:\+\+\+|\-\-\-)\s+\S+/) {	#new filename
+			$delta_offset = 0;
+		} elsif ($line =~ /^\@\@ -\d+,\d+ \+\d+,\d+ \@\@/) {	#new hunk
+			$range_last_linenr = $new_linenr;
+			fixup_current_range(\$line, $delta_offset, 0);
+		}
+
+		while (defined($deleted) && ${$deleted}{'LINENR'} == $old_linenr) {
+			$deleted = @{$deletedRef}[$next_delete++];
+			$save_line = 0;
+			fixup_current_range(\$lines[$range_last_linenr], $delta_offset--, -1);
+		}
+
+		while (defined($inserted) && ${$inserted}{'LINENR'} == $old_linenr) {
+			push(@lines, ${$inserted}{'LINE'});
+			$inserted = @{$insertedRef}[$next_insert++];
+			$new_linenr++;
+			fixup_current_range(\$lines[$range_last_linenr], $delta_offset++, 1);
+		}
+
+		if ($save_line) {
+			push(@lines, $line);
+			$new_linenr++;
+		}
+
+		$old_linenr++;
+	}
+
+	return @lines;
+}
+
 sub ERROR {
 	my ($type, $msg) = @_;
 
@@ -2377,16 +2444,31 @@ sub process {
 		      $line =~ /^\+\s*(?:static\s+)?[A-Z_]*ATTR/ ||
 		      $line =~ /^\+\s*DECLARE/ ||
 		      $line =~ /^\+\s*__setup/)) {
-			CHK("LINE_SPACING",
-			    "Please use a blank line after function/struct/union/enum declarations\n" . $hereprev);
+			if (CHK("LINE_SPACING",
+				"Please use a blank line after function/struct/union/enum declarations\n" .
$hereprev) && + $fix) { + my $inserted = { + LINENR => $fixlinenr, + LINE => "\+", + }; + push(@fixed_inserted, $inserted); + } } # check for multiple consecutive blank lines if ($prevline =~ /^[\+ ]\s*$/ && $line =~ /^\+\s*$/ && $last_blank_line != ($linenr - 1)) { - CHK("LINE_SPACING", - "Please don't use multiple blank lines\n" . $hereprev); + if (CHK("LINE_SPACING", + "Please don't use multiple blank lines\n" . $hereprev) && + $fix) { + my $deleted = { + LINENR => $fixlinenr, + LINE => $rawline, + }; + push(@fixed_deleted, $deleted); + } + $last_blank_line = $linenr; } @@ -2424,8 +2506,15 @@ sub process { $sline =~ /^\+\s+\(?\s*(?:$Compare|$Assignment|$Operators)/) && # indentation of previous and current line are the same (($prevline =~ /\+(\s+)\S/) && $sline =~ /^\+$1\S/)) { - WARN("LINE_SPACING", - "Missing a blank line after declarations\n" . $hereprev); + if (WARN("LINE_SPACING", + "Missing a blank line after declarations\n" . $hereprev) && + $fix) { + my $inserted = { + LINENR => $fixlinenr, + LINE => "\+", + }; + push(@fixed_inserted, $inserted); + } } # check for spaces at the beginning of a line. @@ -2627,7 +2716,7 @@ sub process { #print "realcnt<$realcnt> ctx_cnt<$ctx_cnt>\n"; #print "pre<$pre_ctx>\nline<$line>\nctx<$ctx>\nnext<$lines[$ctx_ln - 1]>\n"; - if ($ctx !~ /{\s*/ && defined($lines[$ctx_ln -1]) && $lines[$ctx_ln - 1] =~ /^\+\s*{/) { + if ($ctx !~ /{\s*/ && defined($lines[$ctx_ln - 1]) && $lines[$ctx_ln - 1] =~ /^\+\s*{/) { ERROR("OPEN_BRACE", "that open brace { should be on the previous line\n" . "$here\n$ctx\n$rawlines[$ctx_ln - 1]\n"); @@ -2777,8 +2866,34 @@ sub process { # check for initialisation to aggregates open brace on the next line if ($line =~ /^.\s*{/ && $prevline =~ /(?:^|[^=])=\s*$/) { - ERROR("OPEN_BRACE", - "that open brace { should be on the previous line\n" . $hereprev); + if (ERROR("OPEN_BRACE", + "that open brace { should be on the previous line\n" . $hereprev) && + $fix && $prevline =~ /^\+/) { + my $deleted = { + LINENR => $fixlinenr - 1, + LINE => $prevrawline, + }; + push(@fixed_deleted, $deleted); + $deleted = { + LINENR => $fixlinenr, + LINE => $rawline, + }; + push(@fixed_deleted, $deleted); + my $fixedline = $prevrawline; + $fixedline =~ s/\s*=\s*$/ = {/; + my $inserted = { + LINENR => $fixlinenr, + LINE => $fixedline, + }; + push(@fixed_inserted, $inserted); + $fixedline = $line; + $fixedline =~ s/^(.\s*){\s*/$1/; + $inserted = { + LINENR => $fixlinenr, + LINE => $fixedline, + }; + push(@fixed_inserted, $inserted); + } } # @@ -4900,12 +5015,16 @@ sub process { hash_show_words(\%use_type, "Used"); hash_show_words(\%ignore_type, "Ignored"); - if ($clean == 0 && $fix && "@rawlines" ne "@fixed") { + if ($clean == 0 && $fix && + ("@rawlines" ne "@fixed" || + $#fixed_inserted >= 0 || $#fixed_deleted >= 0)) { my $newfile = $filename; $newfile .= ".EXPERIMENTAL-checkpatch-fixes" if (!$fix_inplace); my $linecount = 0; my $f; + @fixed = fix_inserted_deleted_lines(\@fixed, \@fixed_inserted, \@fixed_deleted); + open($f, '>', $newfile) or die "$P: Can't open $newfile for write\n"; foreach my $fixed_line (@fixed) { @@ -4913,7 +5032,7 @@ sub process { if ($file) { if ($linecount > 3) { $fixed_line =~ s/^\+//; - print $f $fixed_line. "\n"; + print $f $fixed_line . "\n"; } } else { print $f $fixed_line . 
"\n"; -- cgit v1.2.3 From f2d7e4d4398092d14fb039cb4d38e502d3f019ee Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 6 Aug 2014 16:11:07 -0700 Subject: checkpatch: add fix_insert_line and fix_delete_line helpers Neaten the uses of patch/file line insertions or deletions. Hide the mechanism used. Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 65 +++++++++++++++++++++++---------------------------- 1 file changed, 29 insertions(+), 36 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index f3d9a883f8be..7c290693b8af 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -1593,6 +1593,27 @@ sub fix_inserted_deleted_lines { return @lines; } +sub fix_insert_line { + my ($linenr, $line) = @_; + + my $inserted = { + LINENR => $linenr, + LINE => $line, + }; + push(@fixed_inserted, $inserted); +} + +sub fix_delete_line { + my ($linenr, $line) = @_; + + my $deleted = { + LINENR => $linenr, + LINE => $line, + }; + + push(@fixed_deleted, $deleted); +} + sub ERROR { my ($type, $msg) = @_; @@ -2447,11 +2468,7 @@ sub process { if (CHK("LINE_SPACING", "Please use a blank line after function/struct/union/enum declarations\n" . $hereprev) && $fix) { - my $inserted = { - LINENR => $fixlinenr, - LINE => "\+", - }; - push(@fixed_inserted, $inserted); + fix_insert_line($fixlinenr, "\+"); } } @@ -2462,11 +2479,7 @@ sub process { if (CHK("LINE_SPACING", "Please don't use multiple blank lines\n" . $hereprev) && $fix) { - my $deleted = { - LINENR => $fixlinenr, - LINE => $rawline, - }; - push(@fixed_deleted, $deleted); + fix_delete_line($fixlinenr, $rawline); } $last_blank_line = $linenr; @@ -2509,11 +2522,7 @@ sub process { if (WARN("LINE_SPACING", "Missing a blank line after declarations\n" . $hereprev) && $fix) { - my $inserted = { - LINENR => $fixlinenr, - LINE => "\+", - }; - push(@fixed_inserted, $inserted); + fix_insert_line($fixlinenr, "\+"); } } @@ -2868,31 +2877,15 @@ sub process { $prevline =~ /(?:^|[^=])=\s*$/) { if (ERROR("OPEN_BRACE", "that open brace { should be on the previous line\n" . $hereprev) && - $fix && $prevline =~ /^\+/) { - my $deleted = { - LINENR => $fixlinenr - 1, - LINE => $prevrawline, - }; - push(@fixed_deleted, $deleted); - $deleted = { - LINENR => $fixlinenr, - LINE => $rawline, - }; - push(@fixed_deleted, $deleted); + $fix && $prevline =~ /^\+/ && $line =~ /^\+/) { + fix_delete_line($fixlinenr - 1, $prevrawline); + fix_delete_line($fixlinenr, $rawline); my $fixedline = $prevrawline; $fixedline =~ s/\s*=\s*$/ = {/; - my $inserted = { - LINENR => $fixlinenr, - LINE => $fixedline, - }; - push(@fixed_inserted, $inserted); + fix_insert_line($fixlinenr, $fixedline); $fixedline = $line; $fixedline =~ s/^(.\s*){\s*/$1/; - $inserted = { - LINENR => $fixlinenr, - LINE => $fixedline, - }; - push(@fixed_inserted, $inserted); + fix_insert_line($fixlinenr, $fixedline); } } -- cgit v1.2.3 From bd474ca076af8ac6fa8c5f4167f6024b446a08c8 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 6 Aug 2014 16:11:10 -0700 Subject: checkpatch: use the correct indentation for which() I copied the which subroutine from get_maintainer.pl. Unfortunately, get_maintainer uses a 4 space indentation so use the proper tab indentation instead. 
Signed-off-by: Joe Perches
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 scripts/checkpatch.pl | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 7c290693b8af..569ba315823e 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -710,15 +710,15 @@ sub format_email {
 }
 
 sub which {
-    my ($bin) = @_;
+	my ($bin) = @_;
 
-    foreach my $path (split(/:/, $ENV{PATH})) {
-	if (-e "$path/$bin") {
-	    return "$path/$bin";
+	foreach my $path (split(/:/, $ENV{PATH})) {
+		if (-e "$path/$bin") {
+			return "$path/$bin";
+		}
 	}
-    }
 
-    return "";
+	return "";
 }
 
 sub which_conf {
-- cgit v1.2.3


From 8d1824780f2f1786db5e0e7a54bbae75340c655c Mon Sep 17 00:00:00 2001
From: Joe Perches
Date: Wed, 6 Aug 2014 16:11:12 -0700
Subject: checkpatch: add --fix option for a couple OPEN_BRACE misuses

Style misuses of these types are corrected:

typedef struct foo
{
	int bar;
};

int foo(int bar) {
	return bar+1;
}

int foo(int bar)
{
	return bar+1;
}

Signed-off-by: Joe Perches
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 scripts/checkpatch.pl | 33 ++++++++++++++++++++++++++++-----
 1 file changed, 28 insertions(+), 5 deletions(-)

diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 569ba315823e..1e95953b488f 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -3164,17 +3164,40 @@ sub process {
 
 # function brace can't be on same line, except for #defines of do while,
 # or if closed on same line
-		if (($line=~/$Type\s*$Ident\(.*\).*\s{/) and
+		if (($line=~/$Type\s*$Ident\(.*\).*\s*{/) and
 		    !($line=~/\#\s*define.*do\s{/) and !($line=~/}/)) {
-			ERROR("OPEN_BRACE",
-			      "open brace '{' following function declarations go on the next line\n" . $herecurr);
+			if (ERROR("OPEN_BRACE",
+				  "open brace '{' following function declarations go on the next line\n" . $herecurr) &&
+			    $fix) {
+				fix_delete_line($fixlinenr, $rawline);
+				my $fixed_line = $rawline;
+				$fixed_line =~ /(^..*$Type\s*$Ident\(.*\)\s*){(.*)$/;
+				my $line1 = $1;
+				my $line2 = $2;
+				fix_insert_line($fixlinenr, ltrim($line1));
+				fix_insert_line($fixlinenr, "\+{");
+				if ($line2 !~ /^\s*$/) {
+					fix_insert_line($fixlinenr, "\+\t" . trim($line2));
+				}
+			}
 		}
 
# open braces for enum, union and struct go on the same line.
 		if ($line =~ /^.\s*{/ &&
 		    $prevline =~ /^.\s*(?:typedef\s+)?(enum|union|struct)(?:\s+$Ident)?\s*$/) {
-			ERROR("OPEN_BRACE",
-			      "open brace '{' following $1 go on the same line\n" . $hereprev);
+			if (ERROR("OPEN_BRACE",
+				  "open brace '{' following $1 go on the same line\n" . $hereprev) &&
+			    $fix && $prevline =~ /^\+/ && $line =~ /^\+/) {
+				fix_delete_line($fixlinenr - 1, $prevrawline);
+				fix_delete_line($fixlinenr, $rawline);
+				my $fixedline = rtrim($prevrawline) . " {";
+				fix_insert_line($fixlinenr, $fixedline);
+				$fixedline = $rawline;
+				$fixedline =~ s/^(.\s*){\s*/$1\t/;
+				if ($fixedline !~ /^\+\s*$/) {
+					fix_insert_line($fixlinenr, $fixedline);
+				}
+			}
 		}
 
# missing space after union, struct or enum definition
-- cgit v1.2.3


From 8b8856f4b102ce148611322465f2ff8932664411 Mon Sep 17 00:00:00 2001
From: Joe Perches
Date: Wed, 6 Aug 2014 16:11:14 -0700
Subject: checkpatch: fix brace style misuses of else and while

Add --fix corrections for ELSE_AFTER_BRACE and WHILE_AFTER_BRACE
misuses.

if (x) {
	...
}
else {
	...
}

is corrected to

if (x) {
	...
} else {
	...
}

and

do {
	...
}
while (x);

is corrected to

do {
	...
} while (x); Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 38 ++++++++++++++++++++++++++++++-------- 1 file changed, 30 insertions(+), 8 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 1e95953b488f..404e3d6906aa 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -3885,14 +3885,26 @@ sub process { # Check for }else {, these must be at the same # indent level to be relevant to each other. - if ($prevline=~/}\s*$/ and $line=~/^.\s*else\s*/ and - $previndent == $indent) { - ERROR("ELSE_AFTER_BRACE", - "else should follow close brace '}'\n" . $hereprev); + if ($prevline=~/}\s*$/ and $line=~/^.\s*else\s*/ && + $previndent == $indent) { + if (ERROR("ELSE_AFTER_BRACE", + "else should follow close brace '}'\n" . $hereprev) && + $fix && $prevline =~ /^\+/ && $line =~ /^\+/) { + fix_delete_line($fixlinenr - 1, $prevrawline); + fix_delete_line($fixlinenr, $rawline); + my $fixedline = $prevrawline; + $fixedline =~ s/}\s*$//; + if ($fixedline !~ /^\+\s*$/) { + fix_insert_line($fixlinenr, $fixedline); + } + $fixedline = $rawline; + $fixedline =~ s/^(.\s*)else/$1} else/; + fix_insert_line($fixlinenr, $fixedline); + } } - if ($prevline=~/}\s*$/ and $line=~/^.\s*while\s*/ and - $previndent == $indent) { + if ($prevline=~/}\s*$/ and $line=~/^.\s*while\s*/ && + $previndent == $indent) { my ($s, $c) = ctx_statement_block($linenr, $realcnt, 0); # Find out what is on the end of the line after the @@ -3901,8 +3913,18 @@ sub process { $s =~ s/\n.*//g; if ($s =~ /^\s*;/) { - ERROR("WHILE_AFTER_BRACE", - "while should follow close brace '}'\n" . $hereprev); + if (ERROR("WHILE_AFTER_BRACE", + "while should follow close brace '}'\n" . $hereprev) && + $fix && $prevline =~ /^\+/ && $line =~ /^\+/) { + fix_delete_line($fixlinenr - 1, $prevrawline); + fix_delete_line($fixlinenr, $rawline); + my $fixedline = $prevrawline; + my $trailing = $rawline; + $trailing =~ s/^\+//; + $trailing = trim($trailing); + $fixedline =~ s/}\s*$/} $trailing/; + fix_insert_line($fixlinenr, $fixedline); + } } } -- cgit v1.2.3 From 0fe3dc2bc554b849c27c0ebc36daedf032514200 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 6 Aug 2014 16:11:16 -0700 Subject: checkpatch: add for_each tests to indentation and brace tests All the various for_each loop macros were not tested for trailing brace on the following lines and for bad indentation. Add them. Signed-off-by: Joe Perches Reported-by: Greg KH Cc: Andy Whitcroft Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 404e3d6906aa..dc72a9b3172d 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2698,7 +2698,7 @@ sub process { # if/while/etc brace do not go on next line, unless defining a do while loop, # or if that brace on the next line is for something else - if ($line =~ /(.*)\b((?:if|while|for|switch)\s*\(|do\b|else\b)/ && $line !~ /^.\s*\#/) { + if ($line =~ /(.*)\b((?:if|while|for|switch|(?:[a-z_]+|)for_each[a-z_]+)\s*\(|do\b|else\b)/ && $line !~ /^.\s*\#/) { my $pre_ctx = "$1$2"; my ($level, @ctx) = ctx_statement_level($linenr, $realcnt, 0); @@ -2744,7 +2744,7 @@ sub process { } # Check relative indent for conditionals and blocks. 
-		if ($line =~ /\b(?:(?:if|while|for)\s*\(|do\b)/ && $line !~ /^.\s*#/ && $line !~ /\}\s*while\s*/) {
+		if ($line =~ /\b(?:(?:if|while|for|(?:[a-z_]+|)for_each[a-z_]+)\s*\(|do\b)/ && $line !~ /^.\s*#/ && $line !~ /\}\s*while\s*/) {
 			($stat, $cond, $line_nr_next, $remain_next, $off_next) =
 				ctx_statement_block($linenr, $realcnt, 0)
 				if (!defined $stat);
-- cgit v1.2.3


From 3f7bc4e1fcfab05f3e8b49134cfb3e16b8876aae Mon Sep 17 00:00:00 2001
From: Joe Perches
Date: Wed, 6 Aug 2014 16:11:18 -0700
Subject: checkpatch: add short int to c variable types

short int is one of the 6.7.2 c90 types.  Find it appropriately.

This fixes a defect in checkpatch where it suggests that a line break
after declaration is required using an input like:

	int foo;
	short int bar;

Without this change, it warns on the short int line.

Signed-off-by: Joe Perches
Reported-by: Hartley Sweeten
Acked-by: Andy Whitcroft
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 scripts/checkpatch.pl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index dc72a9b3172d..c647caab913c 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -356,6 +356,7 @@ our $signature_tags = qr{(?xi:
 our @typeList = (
 	qr{void},
 	qr{(?:unsigned\s+)?char},
+	qr{(?:unsigned\s+)?short\s+int},
 	qr{(?:unsigned\s+)?short},
 	qr{(?:unsigned\s+)?int},
 	qr{(?:unsigned\s+)?long},
-- cgit v1.2.3


From 0c773d9d66684aefd919c4b4550fe16003c54c0e Mon Sep 17 00:00:00 2001
From: Joe Perches
Date: Wed, 6 Aug 2014 16:11:20 -0700
Subject: checkpatch: add signed generic types

Current generic types are unsigned or unspecified.  Add signed to the
types.  Reorder the types to find the longest match first.

Signed-off-by: Joe Perches
Acked-by: Andy Whitcroft
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 scripts/checkpatch.pl | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index c647caab913c..c10befa118a5 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -355,15 +355,15 @@ our $signature_tags = qr{(?xi:
 
 our @typeList = (
 	qr{void},
-	qr{(?:unsigned\s+)?char},
-	qr{(?:unsigned\s+)?short\s+int},
-	qr{(?:unsigned\s+)?short},
-	qr{(?:unsigned\s+)?int},
-	qr{(?:unsigned\s+)?long},
-	qr{(?:unsigned\s+)?long\s+int},
-	qr{(?:unsigned\s+)?long\s+long},
-	qr{(?:unsigned\s+)?long\s+long\s+int},
-	qr{unsigned},
+	qr{(?:(?:un)?signed\s+)?char},
+	qr{(?:(?:un)?signed\s+)?short\s+int},
+	qr{(?:(?:un)?signed\s+)?short},
+	qr{(?:(?:un)?signed\s+)?int},
+	qr{(?:(?:un)?signed\s+)?long\s+int},
+	qr{(?:(?:un)?signed\s+)?long\s+long\s+int},
+	qr{(?:(?:un)?signed\s+)?long\s+long},
+	qr{(?:(?:un)?signed\s+)?long},
+	qr{(?:un)?signed},
 	qr{float},
 	qr{double},
 	qr{bool},
-- cgit v1.2.3


From 1813087dbc54ff1485bcc84b66d34052eaf23387 Mon Sep 17 00:00:00 2001
From: Joe Perches
Date: Wed, 6 Aug 2014 16:11:22 -0700
Subject: checkpatch: add test for native c90 types in unusual order

c90 section "6.7.2 Type Specifiers" says:

	"type specifiers may occur in any order"

That means that:

	short int is the same as int short
	unsigned short int is the same as int unsigned short

etc...

checkpatch currently parses only a subset of these allowed types.

For instance: "unsigned short" and "signed short" are found by
checkpatch as a specific type, but none of the "int short" or
"int signed short" variants are found.

Add another table for the "kernel style misordered" variants.
Add this misordered table to the findable types.
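For example (an illustrative pair of declarations, not taken from the
original changelog), the new test warns on specifier orders like:

	int unsigned x;		/* "int unsigned" is misordered */
	short int signed y;	/* "short int signed" is misordered */

while the canonically ordered spellings stay silent:

	unsigned int x;
	signed short int y;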
Warn when the misordered style is used. This improves the "Missing a blank line after declarations" test as it depends on the correct parsing of the $Declare variable which looks for "$Type $Ident;" (ie: declarations like "int foo;"). Signed-off-by: Joe Perches Acked-by: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index c10befa118a5..10384cf40691 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -309,9 +309,12 @@ our $Operators = qr{ our $c90_Keywords = qr{do|for|while|if|else|return|goto|continue|switch|default|case|break}x; our $NonptrType; +our $NonptrTypeMisordered; our $NonptrTypeWithAttr; our $Type; +our $TypeMisordered; our $Declare; +our $DeclareMisordered; our $NON_ASCII_UTF8 = qr{ [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte @@ -353,6 +356,25 @@ our $signature_tags = qr{(?xi: Cc: )}; +our @typeListMisordered = ( + qr{char\s+(?:un)?signed}, + qr{int\s+(?:(?:un)?signed\s+)?short\s}, + qr{int\s+short(?:\s+(?:un)?signed)}, + qr{short\s+int(?:\s+(?:un)?signed)}, + qr{(?:un)?signed\s+int\s+short}, + qr{short\s+(?:un)?signed}, + qr{long\s+int\s+(?:un)?signed}, + qr{int\s+long\s+(?:un)?signed}, + qr{long\s+(?:un)?signed\s+int}, + qr{int\s+(?:un)?signed\s+long}, + qr{int\s+(?:un)?signed}, + qr{int\s+long\s+long\s+(?:un)?signed}, + qr{long\s+long\s+int\s+(?:un)?signed}, + qr{long\s+long\s+(?:un)?signed\s+int}, + qr{long\s+long\s+(?:un)?signed}, + qr{long\s+(?:un)?signed}, +); + our @typeList = ( qr{void}, qr{(?:(?:un)?signed\s+)?char}, @@ -373,6 +395,7 @@ our @typeList = ( qr{${Ident}_t}, qr{${Ident}_handler}, qr{${Ident}_handler_fn}, + @typeListMisordered, ); our @typeListWithAttr = ( @typeList, @@ -414,6 +437,7 @@ our $allowed_asm_includes = qr{(?x: sub build_types { my $mods = "(?x: \n" . join("|\n ", @modifierList) . "\n)"; my $all = "(?x: \n" . join("|\n ", @typeList) . "\n)"; + my $Misordered = "(?x: \n" . join("|\n ", @typeListMisordered) . "\n)"; my $allWithAttr = "(?x: \n" . join("|\n ", @typeListWithAttr) . "\n)"; $Modifier = qr{(?:$Attribute|$Sparse|$mods)}; $NonptrType = qr{ @@ -425,6 +449,13 @@ sub build_types { ) (?:\s+$Modifier|\s+const)* }x; + $NonptrTypeMisordered = qr{ + (?:$Modifier\s+|const\s+)* + (?: + (?:${Misordered}\b) + ) + (?:\s+$Modifier|\s+const)* + }x; $NonptrTypeWithAttr = qr{ (?:$Modifier\s+|const\s+)* (?: @@ -439,7 +470,13 @@ sub build_types { (?:(?:\s|\*|\[\])+\s*const|(?:\s|\*\s*(?:const\s*)?|\[\])+|(?:\s*\[\s*\])+)? (?:\s+$Inline|\s+$Modifier)* }x; + $TypeMisordered = qr{ + $NonptrTypeMisordered + (?:(?:\s|\*|\[\])+\s*const|(?:\s|\*\s*(?:const\s*)?|\[\])+|(?:\s*\[\s*\])+)? + (?:\s+$Inline|\s+$Modifier)* + }x; $Declare = qr{(?:$Storage\s+(?:$Inline\s+)?)?$Type}; + $DeclareMisordered = qr{(?:$Storage\s+(?:$Inline\s+)?)?$TypeMisordered}; } build_types(); @@ -2987,6 +3024,13 @@ sub process { } } +# check for misordered declarations of char/short/int/long with signed/unsigned + while ($sline =~ m{(\b$TypeMisordered\b)}g) { + my $tmp = trim($1); + WARN("MISORDERED_TYPE", + "type '$tmp' should be specified in [[un]signed] [short|int|long|long long] order\n" . $herecurr); + } + # check for static const char * arrays. 
if ($line =~ /\bstatic\s+const\s+char\s*\*\s*(\w+)\s*\[\s*\]\s*=\s*/) { WARN("STATIC_CONST_CHAR_ARRAY", -- cgit v1.2.3 From e81f239b4db2ad6c4b029ed92f0222601ce42abe Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 6 Aug 2014 16:11:25 -0700 Subject: checkpatch: fix false positive MISSING_BREAK warnings with --file Using --file mode can give false positives with MISSING_BREAK fall-through warnings on simple but long multiple consecutive case statements. Look for all lines before a case statement for a switch or a statement when using --file mode. Fix a misspelling of preceded while there. Signed-off-by: Joe Perches Reported-by: Lee Jones Acked-by: Lee Jones Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 10384cf40691..a0880ede3db9 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -4811,13 +4811,13 @@ sub process { } } -# check for case / default statements not preceeded by break/fallthrough/switch +# check for case / default statements not preceded by break/fallthrough/switch if ($line =~ /^.\s*(?:case\s+(?:$Ident|$Constant)\s*|default):/) { my $has_break = 0; my $has_statement = 0; my $count = 0; my $prevline = $linenr; - while ($prevline > 1 && $count < 3 && !$has_break) { + while ($prevline > 1 && ($file || $count < 3) && !$has_break) { $prevline--; my $rline = $rawlines[$prevline - 1]; my $fline = $lines[$prevline - 1]; -- cgit v1.2.3 From 308cc8d8f0deab2c5a5162316277ced556acc71f Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 6 Aug 2014 16:11:27 -0700 Subject: checkpatch: fix false positives for --strict "space after cast" test Commit 89da401f6cff ("checkpatch: improve "no space after cast" test") in -next improved the cast test for non pointer types, but also introduced false positives for some types of static inlines. Add a test for an open brace to the exclusions to avoid these false positives. Signed-off-by: Joe Perches Reported-by: Hartley Sweeten Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index a0880ede3db9..9f14bf928cc7 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2456,7 +2456,7 @@ sub process { } } - if ($line =~ /^\+.*\(\s*$Type\s*\)[ \t]+(?!$Assignment|$Arithmetic)/) { + if ($line =~ /^\+.*\(\s*$Type\s*\)[ \t]+(?!$Assignment|$Arithmetic|{)/) { if (CHK("SPACING", "No space is necessary after a cast\n" . $herecurr) && $fix) { -- cgit v1.2.3 From ece9659f16e369d344fe4325d87fab3bb50d1fe2 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 6 Aug 2014 16:11:29 -0700 Subject: checkpatch: warn on missing spaces in broken up quoted Checkpatch already complains when people break up quoted strings but it's still pretty common. One mistake that people often make is they leave out the space character between the two strings. This check adds around 450 new warnings and has a low rate of false positives. Signed-off-by: Dan Carpenter Cc: Andy Whitcroft Acked-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 9f14bf928cc7..da74e65064d1 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2361,6 +2361,12 @@ sub process { "quoted string split across lines\n" . 
$hereprev);
 		}
 
+# check for missing a space in a string concatenation
+		if ($prevrawline =~ /[^\\]\w"$/ && $rawline =~ /^\+[\t ]+"\w/) {
+			WARN('MISSING_SPACE',
+			     "break quoted strings at a space character\n" . $hereprev);
+		}
+
# check for spaces before a quoted newline
 		if ($rawline =~ /^.*\".*\s\\n/) {
 			if (WARN("QUOTED_WHITESPACE_BEFORE_NEWLINE",
-- cgit v1.2.3


From f84223087402c45179be5e7060c5736c17a7b271 Mon Sep 17 00:00:00 2001
From: Joe Perches
Date: Wed, 6 Aug 2014 16:11:31 -0700
Subject: checkpatch: update $declaration_macros, add uninitialized_var

Using uninitialized_var reports a false positive for "Missing blank
line after declarations".

Fix it by adding uninitialized_var to the $declaration_macros
exceptions list.

Move the macro list after $Type is declared.

Add optional prefixes to DECLARE_ and DEFINE_ macro declarations to
allow forms like:

	MLX4_DECLARE_DOORBELL_LOCK

Signed-off-by: Joe Perches
Reported-by: Dotan Barak
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 scripts/checkpatch.pl | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index da74e65064d1..31a731e06f50 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -423,11 +423,6 @@ foreach my $entry (@mode_permission_funcs) {
 	$mode_perms_search .= $entry->[0];
 }
 
-our $declaration_macros = qr{(?x:
-	(?:$Storage\s+)?(?:DECLARE|DEFINE)_[A-Z]+\s*\(|
-	(?:$Storage\s+)?LIST_HEAD\s*\(
-)};
-
 our $allowed_asm_includes = qr{(?x:
 	irq|
 	memory
@@ -490,6 +485,12 @@ our $balanced_parens = qr/(\((?:[^\(\)]++|(?-1))*\))/;
 our $LvalOrFunc = qr{((?:[\&\*]\s*)?$Lval)\s*($balanced_parens{0,1})\s*};
 our $FuncArg = qr{$Typecast{0,1}($LvalOrFunc|$Constant)};
 
+our $declaration_macros = qr{(?x:
+	(?:$Storage\s+)?(?:[A-Z_][A-Z0-9]*_){0,2}(?:DEFINE|DECLARE)(?:_[A-Z0-9]+){1,2}\s*\(|
+	(?:$Storage\s+)?LIST_HEAD\s*\(|
+	(?:$Storage\s+)?${Type}\s+uninitialized_var\s*\(
+)};
+
 sub deparenthesize {
 	my ($string) = @_;
 	return "" if (!defined($string));
-- cgit v1.2.3
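As a closing illustration (hypothetical declarations, assumed rather
than taken from the patch itself), the updated $declaration_macros
pattern is meant to recognize lines such as these as declarations for
the blank-line test:

	static DEFINE_MUTEX(big_lock);		/* optional $Storage plus DEFINE_ */
	MLX4_DECLARE_DOORBELL_LOCK(uar_lock);	/* prefixed DECLARE_ form */
	int uninitialized_var(nbytes);		/* $Type plus uninitialized_var() */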