From 14be27460e4722d7135de3c46d043b4fc4382247 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 18 Sep 2009 13:05:42 -0700 Subject: libfs: make simple_read_from_buffer conventional Impact: have simple_read_from_buffer conform to standards It was brought to my attention by Andrew Morton, Theodore Tso, and H. Peter Anvin that a read from userspace should only return -EFAULT if nothing was actually read. Looking at the simple_read_from_buffer I noticed that this function does not conform to that rule. This patch fixes that function. [akpm@linux-foundation.org: simplification suggested by hpa] [hpa@zytor.com: fix count==0 handling] Signed-off-by: Steven Rostedt Cc: Al Viro Cc: Theodore Ts'o Cc: Ingo Molnar Signed-off-by: H. Peter Anvin Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- fs/libfs.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/libfs.c b/fs/libfs.c index dcec3d3ea64..662a28e4f66 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -527,14 +527,18 @@ ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos, const void *from, size_t available) { loff_t pos = *ppos; + size_t ret; + if (pos < 0) return -EINVAL; - if (pos >= available) + if (pos >= available || !count) return 0; if (count > available - pos) count = available - pos; - if (copy_to_user(to, from + pos, count)) + ret = copy_to_user(to, from + pos, count); + if (ret == count) return -EFAULT; + count -= ret; *ppos = pos + count; return count; } -- cgit v1.2.3 From af0d9ae811d11de8a01d6bc922c5e062be01bd7f Mon Sep 17 00:00:00 2001 From: Manish Katiyar Date: Fri, 18 Sep 2009 13:05:43 -0700 Subject: fs/inode.c: add dev-id and inode number for debugging in init_special_inode() Add device-id and inode number for better debugging. This was suggested by Andreas in one of the threads http://article.gmane.org/gmane.comp.file-systems.ext4/12062 . "If anyone has a chance, fixing this error message to be not-useless would be good... Including the device name and the inode number would help track down the source of the problem." Signed-off-by: Manish Katiyar Cc: Andreas Dilger Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- fs/inode.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/inode.c b/fs/inode.c index 76582b06ab9..07d775ea616 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1599,7 +1599,8 @@ void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev) else if (S_ISSOCK(mode)) inode->i_fop = &bad_sock_fops; else - printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o)\n", - mode); + printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o) for" + " inode %s:%lu\n", mode, inode->i_sb->s_id, + inode->i_ino); } EXPORT_SYMBOL(init_special_inode); -- cgit v1.2.3 From 22fe404218156328a27e66349b1175cd0baa4990 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 18 Sep 2009 13:05:44 -0700 Subject: vfs: split generic_forget_inode() so that hugetlbfs does not have to copy it Hugetlbfs needs to do special things instead of truncate_inode_pages(). Currently, it copied generic_forget_inode() except for truncate_inode_pages() call which is asking for trouble (the code there isn't trivial). So create a separate function generic_detach_inode() which does all the list magic done in generic_forget_inode() and call it from hugetlbfs_forget_inode(). Signed-off-by: Jan Kara Cc: Al Viro Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- fs/hugetlbfs/inode.c | 33 ++++----------------------------- fs/inode.c | 21 +++++++++++++++++++-- 2 files changed, 23 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index eba6d552d9c..478a169a262 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -380,36 +380,11 @@ static void hugetlbfs_delete_inode(struct inode *inode) static void hugetlbfs_forget_inode(struct inode *inode) __releases(inode_lock) { - struct super_block *sb = inode->i_sb; - - if (!hlist_unhashed(&inode->i_hash)) { - if (!(inode->i_state & (I_DIRTY|I_SYNC))) - list_move(&inode->i_list, &inode_unused); - inodes_stat.nr_unused++; - if (!sb || (sb->s_flags & MS_ACTIVE)) { - spin_unlock(&inode_lock); - return; - } - inode->i_state |= I_WILL_FREE; - spin_unlock(&inode_lock); - /* - * write_inode_now is a noop as we set BDI_CAP_NO_WRITEBACK - * in our backing_dev_info. - */ - write_inode_now(inode, 1); - spin_lock(&inode_lock); - inode->i_state &= ~I_WILL_FREE; - inodes_stat.nr_unused--; - hlist_del_init(&inode->i_hash); + if (generic_detach_inode(inode)) { + truncate_hugepages(inode, 0); + clear_inode(inode); + destroy_inode(inode); } - list_del_init(&inode->i_list); - list_del_init(&inode->i_sb_list); - inode->i_state |= I_FREEING; - inodes_stat.nr_inodes--; - spin_unlock(&inode_lock); - truncate_hugepages(inode, 0); - clear_inode(inode); - destroy_inode(inode); } static void hugetlbfs_drop_inode(struct inode *inode) diff --git a/fs/inode.c b/fs/inode.c index 07d775ea616..fa506d53965 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1241,7 +1241,16 @@ void generic_delete_inode(struct inode *inode) } EXPORT_SYMBOL(generic_delete_inode); -static void generic_forget_inode(struct inode *inode) +/** + * generic_detach_inode - remove inode from inode lists + * @inode: inode to remove + * + * Remove inode from inode lists, write it if it's dirty. This is just an + * internal VFS helper exported for hugetlbfs. Do not use! + * + * Returns 1 if inode should be completely destroyed. + */ +int generic_detach_inode(struct inode *inode) { struct super_block *sb = inode->i_sb; @@ -1251,7 +1260,7 @@ static void generic_forget_inode(struct inode *inode) inodes_stat.nr_unused++; if (sb->s_flags & MS_ACTIVE) { spin_unlock(&inode_lock); - return; + return 0; } WARN_ON(inode->i_state & I_NEW); inode->i_state |= I_WILL_FREE; @@ -1269,6 +1278,14 @@ static void generic_forget_inode(struct inode *inode) inode->i_state |= I_FREEING; inodes_stat.nr_inodes--; spin_unlock(&inode_lock); + return 1; +} +EXPORT_SYMBOL_GPL(generic_detach_inode); + +static void generic_forget_inode(struct inode *inode) +{ + if (!generic_detach_inode(inode)) + return; if (inode->i_data.nrpages) truncate_inode_pages(&inode->i_data, 0); clear_inode(inode); -- cgit v1.2.3 From b12536c27043f1c21195e587eb59950428326e22 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 18 Sep 2009 13:05:47 -0700 Subject: vfs: optimization for touch_atime() Some benchmark testing shows touch_atime to be high up in profile logs for IO intensive workloads. Most likely that's due to the lock in mnt_want_write(). Unfortunately touch_atime first takes the lock, and then does all the other tests that could avoid atime updates (like noatime or relatime). Do it the other way round -- first try to avoid the update and only then if that didn't succeed take the lock. That works because none of the atime avoidance tests rely on locking. This also eliminates a goto. Signed-off-by: Andi Kleen Cc: Christoph Hellwig Reviewed-by: Valerie Aurora Cc: Al Viro Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- fs/inode.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/inode.c b/fs/inode.c index fa506d53965..31168fd45bd 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1416,31 +1416,31 @@ void touch_atime(struct vfsmount *mnt, struct dentry *dentry) struct inode *inode = dentry->d_inode; struct timespec now; - if (mnt_want_write(mnt)) - return; if (inode->i_flags & S_NOATIME) - goto out; + return; if (IS_NOATIME(inode)) - goto out; + return; if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode)) - goto out; + return; if (mnt->mnt_flags & MNT_NOATIME) - goto out; + return; if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)) - goto out; + return; now = current_fs_time(inode->i_sb); if (!relatime_need_update(mnt, inode, now)) - goto out; + return; if (timespec_equal(&inode->i_atime, &now)) - goto out; + return; + + if (mnt_want_write(mnt)) + return; inode->i_atime = now; mark_inode_dirty_sync(inode); -out: mnt_drop_write(mnt); } EXPORT_SYMBOL(touch_atime); -- cgit v1.2.3 From ce06e0b21d6732a2bab10a585a3ec6909499be28 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 18 Sep 2009 13:05:48 -0700 Subject: vfs: optimize touch_time() too Do a similar optimization as earlier for touch_atime. Getting the lock in mnt_get_write is relatively costly, so try all avenues to avoid it first. This patch is careful to still only update inode fields inside the lock region. This didn't show up in benchmarks, but it's easy enough to do. [akpm@linux-foundation.org: fix typo in comment] [hugh.dickins@tiscali.co.uk: fix inverted test of mnt_want_write_file()] Signed-off-by: Andi Kleen Cc: Christoph Hellwig Cc: Valerie Aurora Cc: Al Viro Cc: Dave Hansen Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- fs/inode.c | 43 +++++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/inode.c b/fs/inode.c index 31168fd45bd..4d8e3be5597 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1461,34 +1461,37 @@ void file_update_time(struct file *file) { struct inode *inode = file->f_path.dentry->d_inode; struct timespec now; - int sync_it = 0; - int err; + enum { S_MTIME = 1, S_CTIME = 2, S_VERSION = 4 } sync_it = 0; + /* First try to exhaust all avenues to not sync */ if (IS_NOCMTIME(inode)) return; - err = mnt_want_write_file(file); - if (err) - return; - now = current_fs_time(inode->i_sb); - if (!timespec_equal(&inode->i_mtime, &now)) { - inode->i_mtime = now; - sync_it = 1; - } + if (!timespec_equal(&inode->i_mtime, &now)) + sync_it = S_MTIME; - if (!timespec_equal(&inode->i_ctime, &now)) { - inode->i_ctime = now; - sync_it = 1; - } + if (!timespec_equal(&inode->i_ctime, &now)) + sync_it |= S_CTIME; - if (IS_I_VERSION(inode)) { - inode_inc_iversion(inode); - sync_it = 1; - } + if (IS_I_VERSION(inode)) + sync_it |= S_VERSION; - if (sync_it) - mark_inode_dirty_sync(inode); + if (!sync_it) + return; + + /* Finally allowed to write? Takes lock. */ + if (mnt_want_write_file(file)) + return; + + /* Only change inode inside the lock region */ + if (sync_it & S_VERSION) + inode_inc_iversion(inode); + if (sync_it & S_CTIME) + inode->i_ctime = now; + if (sync_it & S_MTIME) + inode->i_mtime = now; + mark_inode_dirty_sync(inode); mnt_drop_write(file->f_path.mnt); } EXPORT_SYMBOL(file_update_time); -- cgit v1.2.3 From 7a62cc10215838286c747f86766063d5f01fcbd6 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Fri, 18 Sep 2009 13:05:59 -0700 Subject: seq_file: return a negative error code when seq_path_root() fails. seq_path_root() is returning a return value of successful __d_path() instead of returning a negative value when mangle_path() failed. This is not a bug so far because nobody is using return value of seq_path_root(). Signed-off-by: Tetsuo Handa Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- fs/seq_file.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/seq_file.c b/fs/seq_file.c index 6c959275f2d..66efd0aa8fb 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -470,6 +470,7 @@ int seq_path_root(struct seq_file *m, struct path *path, struct path *root, m->count = s - m->buf; return 0; } + err = -ENAMETOOLONG; } } m->count = m->size; -- cgit v1.2.3 From 05cc0cee6948fc11985d11557fb130645a7f69a6 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Fri, 18 Sep 2009 13:06:03 -0700 Subject: libfs: return error code on failed attr set Currently all simple_attr.set handlers return 0 on success and negative codes on error. Fix simple_attr_write() to return these error codes. Signed-off-by: Wu Fengguang Cc: Theodore Ts'o Cc: Al Viro Cc: Christoph Hellwig Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- fs/libfs.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/libfs.c b/fs/libfs.c index 662a28e4f66..219576c52d8 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -739,10 +739,11 @@ ssize_t simple_attr_write(struct file *file, const char __user *buf, if (copy_from_user(attr->set_buf, buf, size)) goto out; - ret = len; /* claim we got the whole input */ attr->set_buf[size] = '\0'; val = simple_strtol(attr->set_buf, NULL, 0); - attr->set(attr->data, val); + ret = attr->set(attr->data, val); + if (ret == 0) + ret = len; /* on success, claim we got the whole input */ out: mutex_unlock(&attr->mutex); return ret; -- cgit v1.2.3 From 5aa98b706e83da4cde4172c890d6e815915536a0 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 18 Sep 2009 13:05:50 -0700 Subject: vfs: explicitly cast s_maxbytes in fiemap_check_ranges If fiemap_check_ranges is passed a large enough value, then it's possible that the value would be cast to a signed value for comparison against s_maxbytes when we change it to loff_t. Make sure that doesn't happen by explicitly casting s_maxbytes to an unsigned value for the purposes of comparison. Signed-off-by: Jeff Layton Cc: Christoph Hellwig Cc: Robert Love Cc: Al Viro Cc: Johannes Weiner Cc: Mandeep Singh Baines Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- fs/ioctl.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ioctl.c b/fs/ioctl.c index 5612880fcbe..7b17a14396f 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -162,20 +162,21 @@ EXPORT_SYMBOL(fiemap_check_flags); static int fiemap_check_ranges(struct super_block *sb, u64 start, u64 len, u64 *new_len) { + u64 maxbytes = (u64) sb->s_maxbytes; + *new_len = len; if (len == 0) return -EINVAL; - if (start > sb->s_maxbytes) + if (start > maxbytes) return -EFBIG; /* * Shrink request scope to what the fs can actually handle. */ - if ((len > sb->s_maxbytes) || - (sb->s_maxbytes - len) < start) - *new_len = sb->s_maxbytes - start; + if (len > maxbytes || (maxbytes - len) < start) + *new_len = maxbytes - start; return 0; } -- cgit v1.2.3 From 42cb56ae2ab67390da34906b27bedc3f2ff1393b Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 18 Sep 2009 13:05:53 -0700 Subject: vfs: change sb->s_maxbytes to a loff_t sb->s_maxbytes is supposed to indicate the maximum size of a file that can exist on the filesystem. It's declared as an unsigned long long. Even if a filesystem has no inherent limit that prevents it from using every bit in that unsigned long long, it's still problematic to set it to anything larger than MAX_LFS_FILESIZE. There are places in the kernel that cast s_maxbytes to a signed value. If it's set too large then this cast makes it a negative number and generally breaks the comparison. Change s_maxbytes to be loff_t instead. That should help eliminate the temptation to set it too large by making it a signed value. Also, add a warning for couple of releases to help catch filesystems that set s_maxbytes too large. Eventually we can either convert this to a BUG() or just remove it and in the hope that no one will get it wrong now that it's a signed value. Signed-off-by: Jeff Layton Cc: Johannes Weiner Cc: Christoph Hellwig Cc: Al Viro Cc: Robert Love Cc: Mandeep Singh Baines Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- fs/super.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'fs') diff --git a/fs/super.c b/fs/super.c index 0e7207b9815..4906e2d8f40 100644 --- a/fs/super.c +++ b/fs/super.c @@ -892,6 +892,16 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void if (error) goto out_sb; + /* + * filesystems should never set s_maxbytes larger than MAX_LFS_FILESIZE + * but s_maxbytes was an unsigned long long for many releases. Throw + * this warning for a little while to try and catch filesystems that + * violate this rule. This warning should be either removed or + * converted to a BUG() in 2.6.34. + */ + WARN((mnt->mnt_sb->s_maxbytes < 0), "%s set sb->s_maxbytes to " + "negative value (%lld)\n", type->name, mnt->mnt_sb->s_maxbytes); + mnt->mnt_mountpoint = mnt->mnt_root; mnt->mnt_parent = mnt; up_write(&mnt->mnt_sb->s_umount); -- cgit v1.2.3 From f9098980ffea9c749622ff8ddf3b6d5831902a46 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 18 Sep 2009 13:05:53 -0700 Subject: vfs: remove redundant position check in do_sendfile As Johannes Weiner pointed out, one of the range checks in do_sendfile is redundant and is already checked in rw_verify_area. Signed-off-by: Jeff Layton Reviewed-by: Johannes Weiner Cc: Christoph Hellwig Cc: Al Viro Cc: Robert Love Cc: Mandeep Singh Baines Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- fs/read_write.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/read_write.c b/fs/read_write.c index 6c8c55dec2b..3ac28987f22 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -839,9 +839,6 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes); pos = *ppos; - retval = -EINVAL; - if (unlikely(pos < 0)) - goto fput_out; if (unlikely(pos + count > max)) { retval = -EOVERFLOW; if (pos >= max) -- cgit v1.2.3 From f84398068d9c2babe41500504ef247ae07081857 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 21 Sep 2009 14:48:36 +0200 Subject: vfs: seq_file: add helpers for data filling Add two helpers that allow access to the seq_file's own buffer, but hide the internal details of seq_files. This allows easier implementation of special purpose filling functions. It also cleans up some existing functions which duplicated the seq_file logic. Make these inline functions in seq_file.h, as suggested by Al. Signed-off-by: Miklos Szeredi Acked-by: Hugh Dickins Signed-off-by: Al Viro --- fs/seq_file.c | 75 +++++++++++++++++++++++++++++++---------------------------- 1 file changed, 39 insertions(+), 36 deletions(-) (limited to 'fs') diff --git a/fs/seq_file.c b/fs/seq_file.c index 66efd0aa8fb..eae7d9dbf3f 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -429,20 +429,21 @@ EXPORT_SYMBOL(mangle_path); */ int seq_path(struct seq_file *m, struct path *path, char *esc) { - if (m->count < m->size) { - char *s = m->buf + m->count; - char *p = d_path(path, s, m->size - m->count); + char *buf; + size_t size = seq_get_buf(m, &buf); + int res = -1; + + if (size) { + char *p = d_path(path, buf, size); if (!IS_ERR(p)) { - s = mangle_path(s, p, esc); - if (s) { - p = m->buf + m->count; - m->count = s - m->buf; - return s - p; - } + char *end = mangle_path(buf, p, esc); + if (end) + res = end - buf; } } - m->count = m->size; - return -1; + seq_commit(m, res); + + return res; } EXPORT_SYMBOL(seq_path); @@ -454,27 +455,28 @@ EXPORT_SYMBOL(seq_path); int seq_path_root(struct seq_file *m, struct path *path, struct path *root, char *esc) { - int err = -ENAMETOOLONG; - if (m->count < m->size) { - char *s = m->buf + m->count; + char *buf; + size_t size = seq_get_buf(m, &buf); + int res = -ENAMETOOLONG; + + if (size) { char *p; spin_lock(&dcache_lock); - p = __d_path(path, root, s, m->size - m->count); + p = __d_path(path, root, buf, size); spin_unlock(&dcache_lock); - err = PTR_ERR(p); + res = PTR_ERR(p); if (!IS_ERR(p)) { - s = mangle_path(s, p, esc); - if (s) { - p = m->buf + m->count; - m->count = s - m->buf; - return 0; - } - err = -ENAMETOOLONG; + char *end = mangle_path(buf, p, esc); + if (end) + res = end - buf; + else + res = -ENAMETOOLONG; } } - m->count = m->size; - return err; + seq_commit(m, res); + + return res < 0 ? res : 0; } /* @@ -482,20 +484,21 @@ int seq_path_root(struct seq_file *m, struct path *path, struct path *root, */ int seq_dentry(struct seq_file *m, struct dentry *dentry, char *esc) { - if (m->count < m->size) { - char *s = m->buf + m->count; - char *p = dentry_path(dentry, s, m->size - m->count); + char *buf; + size_t size = seq_get_buf(m, &buf); + int res = -1; + + if (size) { + char *p = dentry_path(dentry, buf, size); if (!IS_ERR(p)) { - s = mangle_path(s, p, esc); - if (s) { - p = m->buf + m->count; - m->count = s - m->buf; - return s - p; - } + char *end = mangle_path(buf, p, esc); + if (end) + res = end - buf; } } - m->count = m->size; - return -1; + seq_commit(m, res); + + return res; } int seq_bitmap(struct seq_file *m, const unsigned long *bits, -- cgit v1.2.3 From 88a0a53d702b1fa39ed9e631939d2dbd92dfe486 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Tue, 28 Jul 2009 17:54:58 +0200 Subject: fs/romfs: correct error-handling code romfs_fill_super() assumes that romfs_iget() returns NULL when it fails. romfs_iget() actually returns ERR_PTR(-ve) in that case... Signed-off-by: Julia Lawall Signed-off-by: Al Viro --- fs/romfs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/romfs/super.c b/fs/romfs/super.c index 47f132df0c3..c117fa80d1e 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -528,7 +528,7 @@ static int romfs_fill_super(struct super_block *sb, void *data, int silent) pos = (ROMFH_SIZE + len + 1 + ROMFH_PAD) & ROMFH_MASK; root = romfs_iget(sb, pos); - if (!root) + if (IS_ERR(root)) goto error; sb->s_root = d_alloc_root(root); -- cgit v1.2.3 From 1ba50bbe93ebb98e83b174a85eff76af430c4e5b Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 19 Aug 2009 17:56:46 +0300 Subject: exofs: remove BKL from super operations the two places inside exofs that where taking the BKL were: exofs_put_super() - .put_super and exofs_sync_fs() - which is .sync_fs and is also called from .write_super. Now exofs_sync_fs() is protected from itself by also taking the sb_lock. exofs_put_super() directly calls exofs_sync_fs() so there is no danger between these two either. In anyway there is absolutely nothing dangerous been done inside exofs_sync_fs(). Unless there is some subtle race with the actual lifetime of the super_block in regard to .put_super and some other parts of the VFS. Which is highly unlikely. Signed-off-by: Boaz Harrosh Signed-off-by: Al Viro --- fs/exofs/super.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'fs') diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 5ab10c3bbeb..9f500dec3b5 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -214,7 +214,6 @@ int exofs_sync_fs(struct super_block *sb, int wait) } lock_super(sb); - lock_kernel(); sbi = sb->s_fs_info; fscb->s_nextid = cpu_to_le64(sbi->s_nextid); fscb->s_numfiles = cpu_to_le32(sbi->s_numfiles); @@ -245,7 +244,6 @@ int exofs_sync_fs(struct super_block *sb, int wait) out: if (or) osd_end_request(or); - unlock_kernel(); unlock_super(sb); kfree(fscb); return ret; @@ -268,8 +266,6 @@ static void exofs_put_super(struct super_block *sb) int num_pend; struct exofs_sb_info *sbi = sb->s_fs_info; - lock_kernel(); - if (sb->s_dirt) exofs_write_super(sb); @@ -286,8 +282,6 @@ static void exofs_put_super(struct super_block *sb) osduld_put_device(sbi->s_dev); kfree(sb->s_fs_info); sb->s_fs_info = NULL; - - unlock_kernel(); } /* -- cgit v1.2.3 From 4fadd7bb20a1e7c774ed88dc703d8fbcd00ff917 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 3 Aug 2009 23:28:06 +0200 Subject: freeze_bdev: kill bd_mount_sem Now that we have the freeze count there is not much reason for bd_mount_sem anymore. The actual freeze/thaw operations are serialized using the bd_fsfreeze_mutex, and the only other place we take bd_mount_sem is get_sb_bdev which tries to prevent mounting a filesystem while the block device is frozen. Instead of add a check for bd_fsfreeze_count and return -EBUSY if a filesystem is frozen. While that is a change in user visible behaviour a failing mount is much better for this case rather than having the mount process stuck uninterruptible for a long time. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/block_dev.c | 8 +------- fs/super.c | 9 +++++++-- 2 files changed, 8 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index 5d1ed50bd46..22506eb4a58 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -216,8 +216,6 @@ EXPORT_SYMBOL(fsync_bdev); * freeze_bdev -- lock a filesystem and force it into a consistent state * @bdev: blockdevice to lock * - * This takes the block device bd_mount_sem to make sure no new mounts - * happen on bdev until thaw_bdev() is called. * If a superblock is found on this device, we take the s_umount semaphore * on it to make sure nobody unmounts until the snapshot creation is done. * The reference counter (bd_fsfreeze_count) guarantees that only the last @@ -240,7 +238,6 @@ struct super_block *freeze_bdev(struct block_device *bdev) } bdev->bd_fsfreeze_count++; - down(&bdev->bd_mount_sem); sb = get_super(bdev); if (sb && !(sb->s_flags & MS_RDONLY)) { sb->s_frozen = SB_FREEZE_WRITE; @@ -260,7 +257,6 @@ struct super_block *freeze_bdev(struct block_device *bdev) "VFS:Filesystem freeze failed\n"); sb->s_frozen = SB_UNFROZEN; drop_super(sb); - up(&bdev->bd_mount_sem); bdev->bd_fsfreeze_count--; mutex_unlock(&bdev->bd_fsfreeze_mutex); return ERR_PTR(error); @@ -271,7 +267,7 @@ struct super_block *freeze_bdev(struct block_device *bdev) sync_blockdev(bdev); mutex_unlock(&bdev->bd_fsfreeze_mutex); - return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */ + return sb; /* thaw_bdev releases s->s_umount */ } EXPORT_SYMBOL(freeze_bdev); @@ -321,7 +317,6 @@ int thaw_bdev(struct block_device *bdev, struct super_block *sb) drop_super(sb); } - up(&bdev->bd_mount_sem); mutex_unlock(&bdev->bd_fsfreeze_mutex); return 0; } @@ -430,7 +425,6 @@ static void init_once(void *foo) memset(bdev, 0, sizeof(*bdev)); mutex_init(&bdev->bd_mutex); - sema_init(&bdev->bd_mount_sem, 1); INIT_LIST_HEAD(&bdev->bd_inodes); INIT_LIST_HEAD(&bdev->bd_list); #ifdef CONFIG_SYSFS diff --git a/fs/super.c b/fs/super.c index 4906e2d8f40..1cb26a3e3df 100644 --- a/fs/super.c +++ b/fs/super.c @@ -743,9 +743,14 @@ int get_sb_bdev(struct file_system_type *fs_type, * will protect the lockfs code from trying to start a snapshot * while we are mounting */ - down(&bdev->bd_mount_sem); + mutex_lock(&bdev->bd_fsfreeze_mutex); + if (bdev->bd_fsfreeze_count > 0) { + mutex_unlock(&bdev->bd_fsfreeze_mutex); + error = -EBUSY; + goto error_bdev; + } s = sget(fs_type, test_bdev_super, set_bdev_super, bdev); - up(&bdev->bd_mount_sem); + mutex_unlock(&bdev->bd_fsfreeze_mutex); if (IS_ERR(s)) goto error_s; -- cgit v1.2.3 From 4504230a71566785a05d3e6b53fa1ee071b864eb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 3 Aug 2009 23:28:35 +0200 Subject: freeze_bdev: grab active reference to frozen superblocks Currently we held s_umount while a filesystem is frozen, despite that we might return to userspace and unlock it from a different process. Instead grab an active reference to keep the file system busy and add an explicit check for frozen filesystems in remount and reject the remount instead of blocking on s_umount. Add a new get_active_super helper to super.c for use by freeze_bdev that grabs an active reference to a superblock from a given block device. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/block_dev.c | 132 +++++++++++++++++++++++++++++++-------------------------- fs/super.c | 48 ++++++++++++++++++++- 2 files changed, 119 insertions(+), 61 deletions(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index 22506eb4a58..9cf4b926f8e 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -230,43 +230,54 @@ struct super_block *freeze_bdev(struct block_device *bdev) int error = 0; mutex_lock(&bdev->bd_fsfreeze_mutex); - if (bdev->bd_fsfreeze_count > 0) { - bdev->bd_fsfreeze_count++; + if (++bdev->bd_fsfreeze_count > 1) { + /* + * We don't even need to grab a reference - the first call + * to freeze_bdev grab an active reference and only the last + * thaw_bdev drops it. + */ sb = get_super(bdev); + drop_super(sb); mutex_unlock(&bdev->bd_fsfreeze_mutex); return sb; } - bdev->bd_fsfreeze_count++; - - sb = get_super(bdev); - if (sb && !(sb->s_flags & MS_RDONLY)) { - sb->s_frozen = SB_FREEZE_WRITE; - smp_wmb(); - - sync_filesystem(sb); - - sb->s_frozen = SB_FREEZE_TRANS; - smp_wmb(); - - sync_blockdev(sb->s_bdev); - - if (sb->s_op->freeze_fs) { - error = sb->s_op->freeze_fs(sb); - if (error) { - printk(KERN_ERR - "VFS:Filesystem freeze failed\n"); - sb->s_frozen = SB_UNFROZEN; - drop_super(sb); - bdev->bd_fsfreeze_count--; - mutex_unlock(&bdev->bd_fsfreeze_mutex); - return ERR_PTR(error); - } + + sb = get_active_super(bdev); + if (!sb) + goto out; + if (sb->s_flags & MS_RDONLY) { + deactivate_locked_super(sb); + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return sb; + } + + sb->s_frozen = SB_FREEZE_WRITE; + smp_wmb(); + + sync_filesystem(sb); + + sb->s_frozen = SB_FREEZE_TRANS; + smp_wmb(); + + sync_blockdev(sb->s_bdev); + + if (sb->s_op->freeze_fs) { + error = sb->s_op->freeze_fs(sb); + if (error) { + printk(KERN_ERR + "VFS:Filesystem freeze failed\n"); + sb->s_frozen = SB_UNFROZEN; + deactivate_locked_super(sb); + bdev->bd_fsfreeze_count--; + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return ERR_PTR(error); } } + up_write(&sb->s_umount); + out: sync_blockdev(bdev); mutex_unlock(&bdev->bd_fsfreeze_mutex); - return sb; /* thaw_bdev releases s->s_umount */ } EXPORT_SYMBOL(freeze_bdev); @@ -280,43 +291,44 @@ EXPORT_SYMBOL(freeze_bdev); */ int thaw_bdev(struct block_device *bdev, struct super_block *sb) { - int error = 0; + int error = -EINVAL; mutex_lock(&bdev->bd_fsfreeze_mutex); - if (!bdev->bd_fsfreeze_count) { - mutex_unlock(&bdev->bd_fsfreeze_mutex); - return -EINVAL; - } - - bdev->bd_fsfreeze_count--; - if (bdev->bd_fsfreeze_count > 0) { - if (sb) - drop_super(sb); - mutex_unlock(&bdev->bd_fsfreeze_mutex); - return 0; - } - - if (sb) { - BUG_ON(sb->s_bdev != bdev); - if (!(sb->s_flags & MS_RDONLY)) { - if (sb->s_op->unfreeze_fs) { - error = sb->s_op->unfreeze_fs(sb); - if (error) { - printk(KERN_ERR - "VFS:Filesystem thaw failed\n"); - sb->s_frozen = SB_FREEZE_TRANS; - bdev->bd_fsfreeze_count++; - mutex_unlock(&bdev->bd_fsfreeze_mutex); - return error; - } - } - sb->s_frozen = SB_UNFROZEN; - smp_wmb(); - wake_up(&sb->s_wait_unfrozen); + if (!bdev->bd_fsfreeze_count) + goto out_unlock; + + error = 0; + if (--bdev->bd_fsfreeze_count > 0) + goto out_unlock; + + if (!sb) + goto out_unlock; + + BUG_ON(sb->s_bdev != bdev); + down_write(&sb->s_umount); + if (sb->s_flags & MS_RDONLY) + goto out_deactivate; + + if (sb->s_op->unfreeze_fs) { + error = sb->s_op->unfreeze_fs(sb); + if (error) { + printk(KERN_ERR + "VFS:Filesystem thaw failed\n"); + sb->s_frozen = SB_FREEZE_TRANS; + bdev->bd_fsfreeze_count++; + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return error; } - drop_super(sb); } + sb->s_frozen = SB_UNFROZEN; + smp_wmb(); + wake_up(&sb->s_wait_unfrozen); + +out_deactivate: + if (sb) + deactivate_locked_super(sb); +out_unlock: mutex_unlock(&bdev->bd_fsfreeze_mutex); return 0; } diff --git a/fs/super.c b/fs/super.c index 1cb26a3e3df..19eb70b374b 100644 --- a/fs/super.c +++ b/fs/super.c @@ -465,6 +465,48 @@ rescan: } EXPORT_SYMBOL(get_super); + +/** + * get_active_super - get an active reference to the superblock of a device + * @bdev: device to get the superblock for + * + * Scans the superblock list and finds the superblock of the file system + * mounted on the device given. Returns the superblock with an active + * reference and s_umount held exclusively or %NULL if none was found. + */ +struct super_block *get_active_super(struct block_device *bdev) +{ + struct super_block *sb; + + if (!bdev) + return NULL; + + spin_lock(&sb_lock); + list_for_each_entry(sb, &super_blocks, s_list) { + if (sb->s_bdev != bdev) + continue; + + sb->s_count++; + spin_unlock(&sb_lock); + down_write(&sb->s_umount); + if (sb->s_root) { + spin_lock(&sb_lock); + if (sb->s_count > S_BIAS) { + atomic_inc(&sb->s_active); + sb->s_count--; + spin_unlock(&sb_lock); + return sb; + } + spin_unlock(&sb_lock); + } + up_write(&sb->s_umount); + put_super(sb); + yield(); + spin_lock(&sb_lock); + } + spin_unlock(&sb_lock); + return NULL; +} struct super_block * user_get_super(dev_t dev) { @@ -527,11 +569,15 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) { int retval; int remount_rw; - + + if (sb->s_frozen != SB_UNFROZEN) + return -EBUSY; + #ifdef CONFIG_BLOCK if (!(flags & MS_RDONLY) && bdev_read_only(sb->s_bdev)) return -EACCES; #endif + if (flags & MS_RDONLY) acct_auto_close(sb); shrink_dcache_sb(sb); -- cgit v1.2.3 From 6d729e44a55547c009d7a87ea66bff21a8e0afea Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 16 Aug 2009 21:05:08 +0000 Subject: fs: Make unload_nls() NULL pointer safe Most call sites of unload_nls() do: if (nls) unload_nls(nls); Check the pointer inside unload_nls() like we do in kfree() and simplify the call sites. Signed-off-by: Thomas Gleixner Cc: Steve French Cc: OGAWA Hirofumi Cc: Roman Zippel Cc: Dave Kleikamp Cc: Petr Vandrovec Cc: Anton Altaparmakov Signed-off-by: Al Viro --- fs/befs/linuxvfs.c | 7 +------ fs/cifs/cifsfs.c | 3 +-- fs/fat/inode.c | 16 ++++------------ fs/hfs/mdb.c | 6 ++---- fs/hfsplus/super.c | 6 ++---- fs/isofs/inode.c | 8 ++------ fs/jfs/super.c | 9 +++------ fs/ncpfs/inode.c | 12 ++---------- fs/ncpfs/ioctl.c | 6 ++---- fs/nls/nls_base.c | 3 ++- fs/ntfs/super.c | 10 ++++------ fs/smbfs/inode.c | 10 ++-------- 12 files changed, 27 insertions(+), 69 deletions(-) (limited to 'fs') diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index dd376c124e7..33baf27fac7 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -737,12 +737,7 @@ befs_put_super(struct super_block *sb) { kfree(BEFS_SB(sb)->mount_opts.iocharset); BEFS_SB(sb)->mount_opts.iocharset = NULL; - - if (BEFS_SB(sb)->nls) { - unload_nls(BEFS_SB(sb)->nls); - BEFS_SB(sb)->nls = NULL; - } - + unload_nls(BEFS_SB(sb)->nls); kfree(sb->s_fs_info); sb->s_fs_info = NULL; } diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index d79ce2e95c2..90c5b39f031 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -185,8 +185,7 @@ out_mount_failed: cifs_sb->mountdata = NULL; } #endif - if (cifs_sb->local_nls) - unload_nls(cifs_sb->local_nls); + unload_nls(cifs_sb->local_nls); kfree(cifs_sb); } return rc; diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 8970d8c49bb..04629d1302f 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -470,19 +470,11 @@ static void fat_put_super(struct super_block *sb) iput(sbi->fat_inode); - if (sbi->nls_disk) { - unload_nls(sbi->nls_disk); - sbi->nls_disk = NULL; - sbi->options.codepage = fat_default_codepage; - } - if (sbi->nls_io) { - unload_nls(sbi->nls_io); - sbi->nls_io = NULL; - } - if (sbi->options.iocharset != fat_default_iocharset) { + unload_nls(sbi->nls_disk); + unload_nls(sbi->nls_io); + + if (sbi->options.iocharset != fat_default_iocharset) kfree(sbi->options.iocharset); - sbi->options.iocharset = fat_default_iocharset; - } sb->s_fs_info = NULL; kfree(sbi); diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c index 7b6165f25fb..8bbe03c3f6d 100644 --- a/fs/hfs/mdb.c +++ b/fs/hfs/mdb.c @@ -344,10 +344,8 @@ void hfs_mdb_put(struct super_block *sb) brelse(HFS_SB(sb)->mdb_bh); brelse(HFS_SB(sb)->alt_mdb_bh); - if (HFS_SB(sb)->nls_io) - unload_nls(HFS_SB(sb)->nls_io); - if (HFS_SB(sb)->nls_disk) - unload_nls(HFS_SB(sb)->nls_disk); + unload_nls(HFS_SB(sb)->nls_io); + unload_nls(HFS_SB(sb)->nls_disk); free_pages((unsigned long)HFS_SB(sb)->bitmap, PAGE_SIZE < 8192 ? 1 : 0); kfree(HFS_SB(sb)); diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index c0759fe0855..43022f3d514 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -229,8 +229,7 @@ static void hfsplus_put_super(struct super_block *sb) iput(HFSPLUS_SB(sb).alloc_file); iput(HFSPLUS_SB(sb).hidden_dir); brelse(HFSPLUS_SB(sb).s_vhbh); - if (HFSPLUS_SB(sb).nls) - unload_nls(HFSPLUS_SB(sb).nls); + unload_nls(HFSPLUS_SB(sb).nls); kfree(sb->s_fs_info); sb->s_fs_info = NULL; @@ -464,8 +463,7 @@ out: cleanup: hfsplus_put_super(sb); - if (nls) - unload_nls(nls); + unload_nls(nls); return err; } diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 85f96bc651c..6b4dcd4f294 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -46,10 +46,7 @@ static void isofs_put_super(struct super_block *sb) #ifdef CONFIG_JOLIET lock_kernel(); - if (sbi->s_nls_iocharset) { - unload_nls(sbi->s_nls_iocharset); - sbi->s_nls_iocharset = NULL; - } + unload_nls(sbi->s_nls_iocharset); unlock_kernel(); #endif @@ -912,8 +909,7 @@ out_no_root: printk(KERN_WARNING "%s: get root inode failed\n", __func__); out_no_inode: #ifdef CONFIG_JOLIET - if (sbi->s_nls_iocharset) - unload_nls(sbi->s_nls_iocharset); + unload_nls(sbi->s_nls_iocharset); #endif goto out_freesbi; out_no_read: diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 37e6dcda8fc..2234c73fc57 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -178,13 +178,11 @@ static void jfs_put_super(struct super_block *sb) rc = jfs_umount(sb); if (rc) jfs_err("jfs_umount failed with return code %d", rc); - if (sbi->nls_tab) - unload_nls(sbi->nls_tab); - sbi->nls_tab = NULL; + + unload_nls(sbi->nls_tab); truncate_inode_pages(sbi->direct_inode->i_mapping, 0); iput(sbi->direct_inode); - sbi->direct_inode = NULL; kfree(sbi); @@ -347,8 +345,7 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize, if (nls_map != (void *) -1) { /* Discard old (if remount) */ - if (sbi->nls_tab) - unload_nls(sbi->nls_tab); + unload_nls(sbi->nls_tab); sbi->nls_tab = nls_map; } return 1; diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index b99ce205b1b..cf98da1be23 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -746,16 +746,8 @@ static void ncp_put_super(struct super_block *sb) #ifdef CONFIG_NCPFS_NLS /* unload the NLS charsets */ - if (server->nls_vol) - { - unload_nls(server->nls_vol); - server->nls_vol = NULL; - } - if (server->nls_io) - { - unload_nls(server->nls_io); - server->nls_io = NULL; - } + unload_nls(server->nls_vol); + unload_nls(server->nls_io); #endif /* CONFIG_NCPFS_NLS */ if (server->info_filp) diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c index 53a7ed7eb9c..0d58caf4a6e 100644 --- a/fs/ncpfs/ioctl.c +++ b/fs/ncpfs/ioctl.c @@ -223,10 +223,8 @@ ncp_set_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg) oldset_io = server->nls_io; server->nls_io = iocharset; - if (oldset_cp) - unload_nls(oldset_cp); - if (oldset_io) - unload_nls(oldset_io); + unload_nls(oldset_cp); + unload_nls(oldset_io); return 0; } diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c index 477d37d83b3..2224b4d07bf 100644 --- a/fs/nls/nls_base.c +++ b/fs/nls/nls_base.c @@ -270,7 +270,8 @@ struct nls_table *load_nls(char *charset) void unload_nls(struct nls_table *nls) { - module_put(nls->owner); + if (nls) + module_put(nls->owner); } static const wchar_t charset2uni[256] = { diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index abaaa1cbf8d..80b04770e8e 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -201,8 +201,7 @@ use_utf8: v, old_nls->charset); nls_map = old_nls; } else /* nls_map */ { - if (old_nls) - unload_nls(old_nls); + unload_nls(old_nls); } } else if (!strcmp(p, "utf8")) { bool val = false; @@ -2427,10 +2426,9 @@ static void ntfs_put_super(struct super_block *sb) ntfs_free(vol->upcase); vol->upcase = NULL; } - if (vol->nls_map) { - unload_nls(vol->nls_map); - vol->nls_map = NULL; - } + + unload_nls(vol->nls_map); + sb->s_fs_info = NULL; kfree(vol); diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c index 1402d2d54f5..1c4c8f08997 100644 --- a/fs/smbfs/inode.c +++ b/fs/smbfs/inode.c @@ -459,14 +459,8 @@ smb_show_options(struct seq_file *s, struct vfsmount *m) static void smb_unload_nls(struct smb_sb_info *server) { - if (server->remote_nls) { - unload_nls(server->remote_nls); - server->remote_nls = NULL; - } - if (server->local_nls) { - unload_nls(server->local_nls); - server->local_nls = NULL; - } + unload_nls(server->remote_nls); + unload_nls(server->local_nls); } static void -- cgit v1.2.3 From eca6f534e61919b28fb21aafbd1c2983deae75be Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Fri, 18 Sep 2009 13:05:45 -0700 Subject: fs: fix overflow in sys_mount() for in-kernel calls sys_mount() reads/copies a whole page for its "type" parameter. When do_mount_root() passes a kernel address that points to an object which is smaller than a whole page, copy_mount_options() will happily go past this memory object, possibly dereferencing "wild" pointers that could be in any state (hence the kmemcheck warning, which shows that parts of the next page are not even allocated). (The likelihood of something going wrong here is pretty low -- first of all this only applies to kernel calls to sys_mount(), which are mostly found in the boot code. Secondly, I guess if the page was not mapped, exact_copy_from_user() _would_ in fact handle it correctly because of its access_ok(), etc. checks.) But it is much nicer to avoid the dubious reads altogether, by stopping as soon as we find a NUL byte. Is there a good reason why we can't do something like this, using the already existing strndup_from_user()? [akpm@linux-foundation.org: make copy_mount_string() static] [AV: fix compat mount breakage, which involves undoing akpm's change above] Reported-by: Ingo Molnar Signed-off-by: Vegard Nossum Cc: Al Viro Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: al --- fs/compat.c | 24 +++++++++--------- fs/internal.h | 1 + fs/namespace.c | 77 +++++++++++++++++++++++++++++++++++----------------------- 3 files changed, 60 insertions(+), 42 deletions(-) (limited to 'fs') diff --git a/fs/compat.c b/fs/compat.c index 3aa48834a22..d576b552e8e 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -768,13 +768,13 @@ asmlinkage long compat_sys_mount(char __user * dev_name, char __user * dir_name, char __user * type, unsigned long flags, void __user * data) { - unsigned long type_page; + char *kernel_type; unsigned long data_page; - unsigned long dev_page; + char *kernel_dev; char *dir_page; int retval; - retval = copy_mount_options (type, &type_page); + retval = copy_mount_string(type, &kernel_type); if (retval < 0) goto out; @@ -783,38 +783,38 @@ asmlinkage long compat_sys_mount(char __user * dev_name, char __user * dir_name, if (IS_ERR(dir_page)) goto out1; - retval = copy_mount_options (dev_name, &dev_page); + retval = copy_mount_string(dev_name, &kernel_dev); if (retval < 0) goto out2; - retval = copy_mount_options (data, &data_page); + retval = copy_mount_options(data, &data_page); if (retval < 0) goto out3; retval = -EINVAL; - if (type_page && data_page) { - if (!strcmp((char *)type_page, SMBFS_NAME)) { + if (kernel_type && data_page) { + if (!strcmp(kernel_type, SMBFS_NAME)) { do_smb_super_data_conv((void *)data_page); - } else if (!strcmp((char *)type_page, NCPFS_NAME)) { + } else if (!strcmp(kernel_type, NCPFS_NAME)) { do_ncp_super_data_conv((void *)data_page); - } else if (!strcmp((char *)type_page, NFS4_NAME)) { + } else if (!strcmp(kernel_type, NFS4_NAME)) { if (do_nfs4_super_data_conv((void *) data_page)) goto out4; } } - retval = do_mount((char*)dev_page, dir_page, (char*)type_page, + retval = do_mount(kernel_dev, dir_page, kernel_type, flags, (void*)data_page); out4: free_page(data_page); out3: - free_page(dev_page); + kfree(kernel_dev); out2: putname(dir_page); out1: - free_page(type_page); + kfree(kernel_type); out: return retval; } diff --git a/fs/internal.h b/fs/internal.h index d55ef562f0b..515175b8b72 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -57,6 +57,7 @@ extern int check_unsafe_exec(struct linux_binprm *); * namespace.c */ extern int copy_mount_options(const void __user *, unsigned long *); +extern int copy_mount_string(const void __user *, char **); extern void free_vfsmnt(struct vfsmount *); extern struct vfsmount *alloc_vfsmnt(const char *); diff --git a/fs/namespace.c b/fs/namespace.c index 7230787d18b..bdc3cb4fd22 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1640,7 +1640,7 @@ static int do_new_mount(struct path *path, char *type, int flags, { struct vfsmount *mnt; - if (!type || !memchr(type, 0, PAGE_SIZE)) + if (!type) return -EINVAL; /* we need capabilities... */ @@ -1871,6 +1871,23 @@ int copy_mount_options(const void __user * data, unsigned long *where) return 0; } +int copy_mount_string(const void __user *data, char **where) +{ + char *tmp; + + if (!data) { + *where = NULL; + return 0; + } + + tmp = strndup_user(data, PAGE_SIZE); + if (IS_ERR(tmp)) + return PTR_ERR(tmp); + + *where = tmp; + return 0; +} + /* * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to * be given to the mount() call (ie: read-only, no-dev, no-suid etc). @@ -1900,8 +1917,6 @@ long do_mount(char *dev_name, char *dir_name, char *type_page, if (!dir_name || !*dir_name || !memchr(dir_name, 0, PAGE_SIZE)) return -EINVAL; - if (dev_name && !memchr(dev_name, 0, PAGE_SIZE)) - return -EINVAL; if (data_page) ((char *)data_page)[PAGE_SIZE - 1] = 0; @@ -2070,40 +2085,42 @@ EXPORT_SYMBOL(create_mnt_ns); SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, char __user *, type, unsigned long, flags, void __user *, data) { - int retval; + int ret; + char *kernel_type; + char *kernel_dir; + char *kernel_dev; unsigned long data_page; - unsigned long type_page; - unsigned long dev_page; - char *dir_page; - retval = copy_mount_options(type, &type_page); - if (retval < 0) - return retval; + ret = copy_mount_string(type, &kernel_type); + if (ret < 0) + goto out_type; - dir_page = getname(dir_name); - retval = PTR_ERR(dir_page); - if (IS_ERR(dir_page)) - goto out1; + kernel_dir = getname(dir_name); + if (IS_ERR(kernel_dir)) { + ret = PTR_ERR(kernel_dir); + goto out_dir; + } - retval = copy_mount_options(dev_name, &dev_page); - if (retval < 0) - goto out2; + ret = copy_mount_string(dev_name, &kernel_dev); + if (ret < 0) + goto out_dev; - retval = copy_mount_options(data, &data_page); - if (retval < 0) - goto out3; + ret = copy_mount_options(data, &data_page); + if (ret < 0) + goto out_data; - retval = do_mount((char *)dev_page, dir_page, (char *)type_page, - flags, (void *)data_page); - free_page(data_page); + ret = do_mount(kernel_dev, kernel_dir, kernel_type, flags, + (void *) data_page); -out3: - free_page(dev_page); -out2: - putname(dir_page); -out1: - free_page(type_page); - return retval; + free_page(data_page); +out_data: + kfree(kernel_dev); +out_dev: + putname(kernel_dir); +out_dir: + kfree(kernel_type); +out_type: + return ret; } /* -- cgit v1.2.3 From 25d9e2d15286281ec834b829a4aaf8969011f1cd Mon Sep 17 00:00:00 2001 From: "npiggin@suse.de" Date: Fri, 21 Aug 2009 02:35:05 +1000 Subject: truncate: new helpers Introduce new truncate helpers truncate_pagecache and inode_newsize_ok. vmtruncate is also consolidated from mm/memory.c and mm/nommu.c and into mm/truncate.c. Reviewed-by: Christoph Hellwig Signed-off-by: Nick Piggin Signed-off-by: Al Viro --- fs/attr.c | 46 ++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 44 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/attr.c b/fs/attr.c index 9fe1b1bd30a..96d394bdadd 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -18,7 +18,7 @@ /* Taken over from the old code... */ /* POSIX UID/GID verification for setting inode attributes. */ -int inode_change_ok(struct inode *inode, struct iattr *attr) +int inode_change_ok(const struct inode *inode, struct iattr *attr) { int retval = -EPERM; unsigned int ia_valid = attr->ia_valid; @@ -60,9 +60,51 @@ fine: error: return retval; } - EXPORT_SYMBOL(inode_change_ok); +/** + * inode_newsize_ok - may this inode be truncated to a given size + * @inode: the inode to be truncated + * @offset: the new size to assign to the inode + * @Returns: 0 on success, -ve errno on failure + * + * inode_newsize_ok will check filesystem limits and ulimits to check that the + * new inode size is within limits. inode_newsize_ok will also send SIGXFSZ + * when necessary. Caller must not proceed with inode size change if failure is + * returned. @inode must be a file (not directory), with appropriate + * permissions to allow truncate (inode_newsize_ok does NOT check these + * conditions). + * + * inode_newsize_ok must be called with i_mutex held. + */ +int inode_newsize_ok(const struct inode *inode, loff_t offset) +{ + if (inode->i_size < offset) { + unsigned long limit; + + limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; + if (limit != RLIM_INFINITY && offset > limit) + goto out_sig; + if (offset > inode->i_sb->s_maxbytes) + goto out_big; + } else { + /* + * truncation of in-use swapfiles is disallowed - it would + * cause subsequent swapout to scribble on the now-freed + * blocks. + */ + if (IS_SWAPFILE(inode)) + return -ETXTBSY; + } + + return 0; +out_sig: + send_sig(SIGXFSZ, current, 0); +out_big: + return -EFBIG; +} +EXPORT_SYMBOL(inode_newsize_ok); + int inode_setattr(struct inode * inode, struct iattr * attr) { unsigned int ia_valid = attr->ia_valid; -- cgit v1.2.3 From c08d3b0e33edce28e9cfa7b64f7fe5bdeeb29248 Mon Sep 17 00:00:00 2001 From: "npiggin@suse.de" Date: Fri, 21 Aug 2009 02:35:06 +1000 Subject: truncate: use new helpers Update some fs code to make use of new helper functions introduced in the previous patch. Should be no significant change in behaviour (except CIFS now calls send_sig under i_lock, via inode_newsize_ok). Reviewed-by: Christoph Hellwig Acked-by: Miklos Szeredi Cc: linux-nfs@vger.kernel.org Cc: Trond.Myklebust@netapp.com Cc: linux-cifs-client@lists.samba.org Cc: sfrench@samba.org Signed-off-by: Nick Piggin Signed-off-by: Al Viro --- fs/buffer.c | 10 ++-------- fs/cifs/inode.c | 53 ++++++++++---------------------------------------- fs/fuse/dir.c | 14 ++++--------- fs/fuse/fuse_i.h | 2 -- fs/fuse/inode.c | 11 +---------- fs/nfs/inode.c | 54 +++++++++++++-------------------------------------- fs/ramfs/file-nommu.c | 18 +++++------------ 7 files changed, 35 insertions(+), 127 deletions(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index 209f7f15f5f..24afd7422ae 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2239,16 +2239,10 @@ int generic_cont_expand_simple(struct inode *inode, loff_t size) struct address_space *mapping = inode->i_mapping; struct page *page; void *fsdata; - unsigned long limit; int err; - err = -EFBIG; - limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; - if (limit != RLIM_INFINITY && size > (loff_t)limit) { - send_sig(SIGXFSZ, current, 0); - goto out; - } - if (size > inode->i_sb->s_maxbytes) + err = inode_newsize_ok(inode, size); + if (err) goto out; err = pagecache_write_begin(NULL, mapping, size, 0, diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 1f09c761931..5e2492535da 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1557,57 +1557,24 @@ static int cifs_truncate_page(struct address_space *mapping, loff_t from) static int cifs_vmtruncate(struct inode *inode, loff_t offset) { - struct address_space *mapping = inode->i_mapping; - unsigned long limit; + loff_t oldsize; + int err; spin_lock(&inode->i_lock); - if (inode->i_size < offset) - goto do_expand; - /* - * truncation of in-use swapfiles is disallowed - it would cause - * subsequent swapout to scribble on the now-freed blocks. - */ - if (IS_SWAPFILE(inode)) { - spin_unlock(&inode->i_lock); - goto out_busy; - } - i_size_write(inode, offset); - spin_unlock(&inode->i_lock); - /* - * unmap_mapping_range is called twice, first simply for efficiency - * so that truncate_inode_pages does fewer single-page unmaps. However - * after this first call, and before truncate_inode_pages finishes, - * it is possible for private pages to be COWed, which remain after - * truncate_inode_pages finishes, hence the second unmap_mapping_range - * call must be made for correctness. - */ - unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); - truncate_inode_pages(mapping, offset); - unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); - goto out_truncate; - -do_expand: - limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; - if (limit != RLIM_INFINITY && offset > limit) { + err = inode_newsize_ok(inode, offset); + if (err) { spin_unlock(&inode->i_lock); - goto out_sig; - } - if (offset > inode->i_sb->s_maxbytes) { - spin_unlock(&inode->i_lock); - goto out_big; + goto out; } + + oldsize = inode->i_size; i_size_write(inode, offset); spin_unlock(&inode->i_lock); -out_truncate: + truncate_pagecache(inode, oldsize, offset); if (inode->i_op->truncate) inode->i_op->truncate(inode); - return 0; -out_sig: - send_sig(SIGXFSZ, current, 0); -out_big: - return -EFBIG; -out_busy: - return -ETXTBSY; +out: + return err; } static int diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index e703654e7f4..992f6c9410b 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1276,14 +1276,9 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr, return 0; if (attr->ia_valid & ATTR_SIZE) { - unsigned long limit; - if (IS_SWAPFILE(inode)) - return -ETXTBSY; - limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; - if (limit != RLIM_INFINITY && attr->ia_size > (loff_t) limit) { - send_sig(SIGXFSZ, current, 0); - return -EFBIG; - } + err = inode_newsize_ok(inode, attr->ia_size); + if (err) + return err; is_truncate = true; } @@ -1350,8 +1345,7 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr, * FUSE_NOWRITE, otherwise fuse_launder_page() would deadlock. */ if (S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) { - if (outarg.attr.size < oldsize) - fuse_truncate(inode->i_mapping, outarg.attr.size); + truncate_pagecache(inode, oldsize, outarg.attr.size); invalidate_inode_pages2(inode->i_mapping); } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index fc9c79feb5f..01cc462ff45 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -606,8 +606,6 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, u64 attr_valid); -void fuse_truncate(struct address_space *mapping, loff_t offset); - /** * Initialize the client device */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 6da947daabd..1a822ce2b24 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -140,14 +140,6 @@ static int fuse_remount_fs(struct super_block *sb, int *flags, char *data) return 0; } -void fuse_truncate(struct address_space *mapping, loff_t offset) -{ - /* See vmtruncate() */ - unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); - truncate_inode_pages(mapping, offset); - unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); -} - void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, u64 attr_valid) { @@ -205,8 +197,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, spin_unlock(&fc->lock); if (S_ISREG(inode->i_mode) && oldsize != attr->size) { - if (attr->size < oldsize) - fuse_truncate(inode->i_mapping, attr->size); + truncate_pagecache(inode, oldsize, attr->size); invalidate_inode_pages2(inode->i_mapping); } } diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 060022b4651..faa091865ad 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -458,49 +458,21 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr) */ static int nfs_vmtruncate(struct inode * inode, loff_t offset) { - if (i_size_read(inode) < offset) { - unsigned long limit; - - limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; - if (limit != RLIM_INFINITY && offset > limit) - goto out_sig; - if (offset > inode->i_sb->s_maxbytes) - goto out_big; - spin_lock(&inode->i_lock); - i_size_write(inode, offset); - spin_unlock(&inode->i_lock); - } else { - struct address_space *mapping = inode->i_mapping; + loff_t oldsize; + int err; - /* - * truncation of in-use swapfiles is disallowed - it would - * cause subsequent swapout to scribble on the now-freed - * blocks. - */ - if (IS_SWAPFILE(inode)) - return -ETXTBSY; - spin_lock(&inode->i_lock); - i_size_write(inode, offset); - spin_unlock(&inode->i_lock); + err = inode_newsize_ok(inode, offset); + if (err) + goto out; - /* - * unmap_mapping_range is called twice, first simply for - * efficiency so that truncate_inode_pages does fewer - * single-page unmaps. However after this first call, and - * before truncate_inode_pages finishes, it is possible for - * private pages to be COWed, which remain after - * truncate_inode_pages finishes, hence the second - * unmap_mapping_range call must be made for correctness. - */ - unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); - truncate_inode_pages(mapping, offset); - unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); - } - return 0; -out_sig: - send_sig(SIGXFSZ, current, 0); -out_big: - return -EFBIG; + spin_lock(&inode->i_lock); + oldsize = inode->i_size; + i_size_write(inode, offset); + spin_unlock(&inode->i_lock); + + truncate_pagecache(inode, oldsize, offset); +out: + return err; } /** diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 11f0c06316d..32fae4040eb 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -69,14 +69,11 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) /* make various checks */ order = get_order(newsize); if (unlikely(order >= MAX_ORDER)) - goto too_big; + return -EFBIG; - limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; - if (limit != RLIM_INFINITY && newsize > limit) - goto fsize_exceeded; - - if (newsize > inode->i_sb->s_maxbytes) - goto too_big; + ret = inode_newsize_ok(inode, newsize); + if (ret) + return ret; i_size_write(inode, newsize); @@ -118,12 +115,7 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) return 0; - fsize_exceeded: - send_sig(SIGXFSZ, current, 0); - too_big: - return -EFBIG; - - add_error: +add_error: while (loop < npages) __free_page(pages + loop++); return ret; -- cgit v1.2.3