aboutsummaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
authorArtem Bityutskiy <Artem.Bityutskiy@nokia.com>2009-09-21 12:09:22 +0300
committerArtem Bityutskiy <Artem.Bityutskiy@nokia.com>2009-09-21 12:09:22 +0300
commit7cce2f4cb7f5f641f78c8e3eea4e7b1b96cb71c0 (patch)
treeb064d077928cf224660ab1e1841cdab2c9fd8b08 /fs
parente055f7e873d900925c222cf2d1ec955af4a9ca90 (diff)
parentebc79c4f8da0f92efa968e0328f32334a2ce80cf (diff)
downloadlinux-linaro-7cce2f4cb7f5f641f78c8e3eea4e7b1b96cb71c0.tar.gz
Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6 into linux-next
Conflicts: fs/ubifs/super.c Merge the upstream tree in order to resolve a conflict with the per-bdi writeback changes from the linux-2.6-block tree.
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/v9fs.c21
-rw-r--r--fs/9p/v9fs.h1
-rw-r--r--fs/9p/vfs_inode.c126
-rw-r--r--fs/9p/vfs_super.c39
-rw-r--r--fs/Kconfig2
-rw-r--r--fs/afs/file.c18
-rw-r--r--fs/afs/write.c1
-rw-r--r--fs/autofs4/expire.c2
-rw-r--r--fs/binfmt_elf.c28
-rw-r--r--fs/binfmt_flat.c17
-rw-r--r--fs/block_dev.c40
-rw-r--r--fs/btrfs/async-thread.c4
-rw-r--r--fs/btrfs/ctree.c121
-rw-r--r--fs/btrfs/ctree.h27
-rw-r--r--fs/btrfs/disk-io.c17
-rw-r--r--fs/btrfs/extent-tree.c533
-rw-r--r--fs/btrfs/free-space-cache.c1058
-rw-r--r--fs/btrfs/free-space-cache.h8
-rw-r--r--fs/btrfs/inode.c26
-rw-r--r--fs/btrfs/ordered-data.c1
-rw-r--r--fs/btrfs/print-tree.c6
-rw-r--r--fs/btrfs/relocation.c12
-rw-r--r--fs/btrfs/transaction.c56
-rw-r--r--fs/btrfs/transaction.h1
-rw-r--r--fs/btrfs/tree-log.c2
-rw-r--r--fs/btrfs/volumes.c50
-rw-r--r--fs/btrfs/zlib.c6
-rw-r--r--fs/buffer.c9
-rw-r--r--fs/char_dev.c40
-rw-r--r--fs/cifs/CHANGES10
-rw-r--r--fs/cifs/README25
-rw-r--r--fs/cifs/cifs_dfs_ref.c12
-rw-r--r--fs/cifs/cifs_spnego.c2
-rw-r--r--fs/cifs/cifs_unicode.c2
-rw-r--r--fs/cifs/cifsacl.c4
-rw-r--r--fs/cifs/cifsencrypt.c1
-rw-r--r--fs/cifs/cifsfs.c26
-rw-r--r--fs/cifs/cifsfs.h2
-rw-r--r--fs/cifs/cifsglob.h21
-rw-r--r--fs/cifs/cifssmb.c316
-rw-r--r--fs/cifs/connect.c112
-rw-r--r--fs/cifs/dir.c2
-rw-r--r--fs/cifs/file.c43
-rw-r--r--fs/cifs/inode.c15
-rw-r--r--fs/cifs/transport.c17
-rw-r--r--fs/compat.c17
-rw-r--r--fs/compat_ioctl.c1
-rw-r--r--fs/configfs/inode.c1
-rw-r--r--fs/dcache.c1
-rw-r--r--fs/dlm/lowcomms.c26
-rw-r--r--fs/dlm/netlink.c2
-rw-r--r--fs/ecryptfs/keystore.c13
-rw-r--r--fs/exec.c67
-rw-r--r--fs/ext2/acl.c8
-rw-r--r--fs/ext2/acl.h4
-rw-r--r--fs/ext2/file.c2
-rw-r--r--fs/ext2/inode.c2
-rw-r--r--fs/ext2/namei.c8
-rw-r--r--fs/ext3/Kconfig32
-rw-r--r--fs/ext3/acl.c8
-rw-r--r--fs/ext3/acl.h4
-rw-r--r--fs/ext3/dir.c3
-rw-r--r--fs/ext3/file.c63
-rw-r--r--fs/ext3/fsync.c12
-rw-r--r--fs/ext3/inode.c60
-rw-r--r--fs/ext3/namei.c4
-rw-r--r--fs/ext3/super.c40
-rw-r--r--fs/ext4/Kconfig11
-rw-r--r--fs/ext4/acl.c8
-rw-r--r--fs/ext4/acl.h4
-rw-r--r--fs/ext4/balloc.c2
-rw-r--r--fs/ext4/ext4.h91
-rw-r--r--fs/ext4/ext4_extents.h4
-rw-r--r--fs/ext4/ext4_jbd2.c9
-rw-r--r--fs/ext4/extents.c112
-rw-r--r--fs/ext4/file.c55
-rw-r--r--fs/ext4/fsync.c13
-rw-r--r--fs/ext4/ialloc.c2
-rw-r--r--fs/ext4/inode.c150
-rw-r--r--fs/ext4/ioctl.c7
-rw-r--r--fs/ext4/mballoc.c429
-rw-r--r--fs/ext4/mballoc.h22
-rw-r--r--fs/ext4/migrate.c22
-rw-r--r--fs/ext4/move_extent.c334
-rw-r--r--fs/ext4/namei.c26
-rw-r--r--fs/ext4/resize.c7
-rw-r--r--fs/ext4/super.c155
-rw-r--r--fs/ext4/xattr.c15
-rw-r--r--fs/fat/file.c22
-rw-r--r--fs/fat/misc.c4
-rw-r--r--fs/fs-writeback.c1104
-rw-r--r--fs/fuse/control.c138
-rw-r--r--fs/fuse/dev.c10
-rw-r--r--fs/fuse/fuse_i.h18
-rw-r--r--fs/fuse/inode.c83
-rw-r--r--fs/gfs2/Makefile2
-rw-r--r--fs/gfs2/acl.c106
-rw-r--r--fs/gfs2/aops.c39
-rw-r--r--fs/gfs2/dentry.c18
-rw-r--r--fs/gfs2/eaops.c157
-rw-r--r--fs/gfs2/eaops.h30
-rw-r--r--fs/gfs2/export.c36
-rw-r--r--fs/gfs2/file.c1
-rw-r--r--fs/gfs2/glock.c138
-rw-r--r--fs/gfs2/glock.h3
-rw-r--r--fs/gfs2/glops.c21
-rw-r--r--fs/gfs2/incore.h17
-rw-r--r--fs/gfs2/inode.c159
-rw-r--r--fs/gfs2/ops_fstype.c66
-rw-r--r--fs/gfs2/ops_inode.c82
-rw-r--r--fs/gfs2/rgrp.c111
-rw-r--r--fs/gfs2/rgrp.h6
-rw-r--r--fs/gfs2/super.c86
-rw-r--r--fs/gfs2/super.h9
-rw-r--r--fs/gfs2/sys.c49
-rw-r--r--fs/gfs2/util.c41
-rw-r--r--fs/gfs2/xattr.c (renamed from fs/gfs2/eattr.c)425
-rw-r--r--fs/gfs2/xattr.h (renamed from fs/gfs2/eattr.h)54
-rw-r--r--fs/hugetlbfs/inode.c21
-rw-r--r--fs/inode.c44
-rw-r--r--fs/jbd/checkpoint.c6
-rw-r--r--fs/jbd/commit.c2
-rw-r--r--fs/jbd/journal.c54
-rw-r--r--fs/jbd/recovery.c18
-rw-r--r--fs/jbd/revoke.c16
-rw-r--r--fs/jbd/transaction.c77
-rw-r--r--fs/jbd2/commit.c12
-rw-r--r--fs/jbd2/journal.c6
-rw-r--r--fs/jbd2/transaction.c7
-rw-r--r--fs/jffs2/acl.c7
-rw-r--r--fs/jffs2/acl.h4
-rw-r--r--fs/jffs2/dir.c2
-rw-r--r--fs/jffs2/file.c4
-rw-r--r--fs/jffs2/symlink.c2
-rw-r--r--fs/jffs2/wbuf.c10
-rw-r--r--fs/jfs/acl.c11
-rw-r--r--fs/jfs/file.c2
-rw-r--r--fs/jfs/jfs_acl.h2
-rw-r--r--fs/jfs/namei.c2
-rw-r--r--fs/libfs.c2
-rw-r--r--fs/lockd/host.c14
-rw-r--r--fs/lockd/mon.c44
-rw-r--r--fs/locks.c4
-rw-r--r--fs/namei.c110
-rw-r--r--fs/namespace.c3
-rw-r--r--fs/nfs/Makefile3
-rw-r--r--fs/nfs/cache_lib.c140
-rw-r--r--fs/nfs/cache_lib.h27
-rw-r--r--fs/nfs/callback.c26
-rw-r--r--fs/nfs/client.c16
-rw-r--r--fs/nfs/direct.c23
-rw-r--r--fs/nfs/dns_resolve.c335
-rw-r--r--fs/nfs/dns_resolve.h14
-rw-r--r--fs/nfs/file.c49
-rw-r--r--fs/nfs/idmap.c6
-rw-r--r--fs/nfs/inode.c100
-rw-r--r--fs/nfs/internal.h39
-rw-r--r--fs/nfs/mount_clnt.c83
-rw-r--r--fs/nfs/nfs3proc.c1
-rw-r--r--fs/nfs/nfs4namespace.c24
-rw-r--r--fs/nfs/nfs4proc.c40
-rw-r--r--fs/nfs/nfs4state.c4
-rw-r--r--fs/nfs/nfs4xdr.c1460
-rw-r--r--fs/nfs/read.c6
-rw-r--r--fs/nfs/super.c453
-rw-r--r--fs/nfs/write.c98
-rw-r--r--fs/nfsd/auth.c4
-rw-r--r--fs/nfsd/export.c14
-rw-r--r--fs/nfsd/nfs4idmap.c20
-rw-r--r--fs/nfsd/nfsctl.c21
-rw-r--r--fs/nfsd/nfssvc.c2
-rw-r--r--fs/nfsd/vfs.c3
-rw-r--r--fs/nilfs2/Kconfig2
-rw-r--r--fs/nilfs2/bmap.c151
-rw-r--r--fs/nilfs2/bmap.h76
-rw-r--r--fs/nilfs2/btnode.c2
-rw-r--r--fs/nilfs2/btree.c625
-rw-r--r--fs/nilfs2/cpfile.c11
-rw-r--r--fs/nilfs2/cpfile.h2
-rw-r--r--fs/nilfs2/dat.c42
-rw-r--r--fs/nilfs2/dat.h8
-rw-r--r--fs/nilfs2/direct.c161
-rw-r--r--fs/nilfs2/ifile.h1
-rw-r--r--fs/nilfs2/inode.c3
-rw-r--r--fs/nilfs2/ioctl.c26
-rw-r--r--fs/nilfs2/mdt.c42
-rw-r--r--fs/nilfs2/mdt.h3
-rw-r--r--fs/nilfs2/recovery.c3
-rw-r--r--fs/nilfs2/segbuf.c4
-rw-r--r--fs/nilfs2/segment.c23
-rw-r--r--fs/nilfs2/sufile.h1
-rw-r--r--fs/nilfs2/super.c102
-rw-r--r--fs/nilfs2/the_nilfs.c19
-rw-r--r--fs/nilfs2/the_nilfs.h45
-rw-r--r--fs/notify/Kconfig12
-rw-r--r--fs/notify/dnotify/Kconfig2
-rw-r--r--fs/notify/fsnotify.c4
-rw-r--r--fs/notify/inotify/Kconfig2
-rw-r--r--fs/notify/inotify/inotify_fsnotify.c46
-rw-r--r--fs/notify/inotify/inotify_user.c271
-rw-r--r--fs/notify/notification.c30
-rw-r--r--fs/ntfs/file.c16
-rw-r--r--fs/ntfs/mft.c13
-rw-r--r--fs/ocfs2/alloc.c49
-rw-r--r--fs/ocfs2/aops.c69
-rw-r--r--fs/ocfs2/dcache.c46
-rw-r--r--fs/ocfs2/dcache.h3
-rw-r--r--fs/ocfs2/dlm/dlmast.c1
-rw-r--r--fs/ocfs2/dlm/dlmfs.c1
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c2
-rw-r--r--fs/ocfs2/dlm/dlmunlock.c4
-rw-r--r--fs/ocfs2/file.c54
-rw-r--r--fs/ocfs2/journal.c8
-rw-r--r--fs/ocfs2/journal.h19
-rw-r--r--fs/ocfs2/ocfs2.h22
-rw-r--r--fs/ocfs2/ocfs2_lockid.h1
-rw-r--r--fs/ocfs2/quota.h1
-rw-r--r--fs/ocfs2/quota_global.c144
-rw-r--r--fs/ocfs2/quota_local.c110
-rw-r--r--fs/ocfs2/stack_o2cb.c3
-rw-r--r--fs/ocfs2/super.c34
-rw-r--r--fs/ocfs2/xattr.c3
-rw-r--r--fs/open.c12
-rw-r--r--fs/partitions/check.c14
-rw-r--r--fs/proc/base.c46
-rw-r--r--fs/proc/task_mmu.c1
-rw-r--r--fs/proc/task_nommu.c1
-rw-r--r--fs/quota/dquot.c7
-rw-r--r--fs/ramfs/file-nommu.c1
-rw-r--r--fs/ramfs/inode.c1
-rw-r--r--fs/select.c1
-rw-r--r--fs/splice.c30
-rw-r--r--fs/super.c11
-rw-r--r--fs/sync.c85
-rw-r--r--fs/sysfs/dir.c3
-rw-r--r--fs/sysfs/inode.c135
-rw-r--r--fs/sysfs/symlink.c2
-rw-r--r--fs/sysfs/sysfs.h12
-rw-r--r--fs/ubifs/budget.c32
-rw-r--r--fs/ubifs/super.c2
-rw-r--r--fs/udf/directory.c86
-rw-r--r--fs/udf/file.c2
-rw-r--r--fs/udf/inode.c19
-rw-r--r--fs/udf/lowlevel.c4
-rw-r--r--fs/udf/namei.c1
-rw-r--r--fs/udf/super.c12
-rw-r--r--fs/xattr.c55
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.c9
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.c2
-rw-r--r--fs/xfs/linux-2.6/xfs_file.c19
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl32.c2
-rw-r--r--fs/xfs/linux-2.6/xfs_iops.c21
-rw-r--r--fs/xfs/linux-2.6/xfs_lrw.c7
-rw-r--r--fs/xfs/linux-2.6/xfs_stats.c51
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c24
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.c28
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.h2
-rw-r--r--fs/xfs/quota/xfs_qm_stats.c78
-rw-r--r--fs/xfs/xfs_ag.h9
-rw-r--r--fs/xfs/xfs_attr.c8
-rw-r--r--fs/xfs/xfs_bmap.c4
-rw-r--r--fs/xfs/xfs_bmap.h11
-rw-r--r--fs/xfs/xfs_bmap_btree.c20
-rw-r--r--fs/xfs/xfs_bmap_btree.h1
-rw-r--r--fs/xfs/xfs_btree.c46
-rw-r--r--fs/xfs/xfs_btree.h15
-rw-r--r--fs/xfs/xfs_da_btree.c6
-rw-r--r--fs/xfs/xfs_dir2.c2
-rw-r--r--fs/xfs/xfs_fsops.c20
-rw-r--r--fs/xfs/xfs_ialloc.c805
-rw-r--r--fs/xfs/xfs_ialloc.h18
-rw-r--r--fs/xfs/xfs_iget.c280
-rw-r--r--fs/xfs/xfs_inode.c18
-rw-r--r--fs/xfs/xfs_inode.h25
-rw-r--r--fs/xfs/xfs_inode_item.c10
-rw-r--r--fs/xfs/xfs_inode_item.h2
-rw-r--r--fs/xfs/xfs_inum.h1
-rw-r--r--fs/xfs/xfs_itable.c98
-rw-r--r--fs/xfs/xfs_itable.h5
-rw-r--r--fs/xfs/xfs_log.c2
-rw-r--r--fs/xfs/xfs_log_priv.h2
-rw-r--r--fs/xfs/xfs_log_recover.c2
-rw-r--r--fs/xfs/xfs_mount.c2
-rw-r--r--fs/xfs/xfs_mount.h3
-rw-r--r--fs/xfs/xfs_mru_cache.c29
-rw-r--r--fs/xfs/xfs_mru_cache.h1
-rw-r--r--fs/xfs/xfs_rw.c84
-rw-r--r--fs/xfs/xfs_rw.h7
-rw-r--r--fs/xfs/xfs_trans.h2
-rw-r--r--fs/xfs/xfs_trans_buf.c4
-rw-r--r--fs/xfs/xfs_trans_inode.c86
-rw-r--r--fs/xfs/xfs_vnodeops.c21
292 files changed, 9459 insertions, 6600 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 332b5ff02fe..f7003cfac63 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -76,7 +76,7 @@ static const match_table_t tokens = {
* Return 0 upon success, -ERRNO upon failure.
*/
-static int v9fs_parse_options(struct v9fs_session_info *v9ses)
+static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
{
char *options;
substring_t args[MAX_OPT_ARGS];
@@ -90,10 +90,10 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses)
v9ses->debug = 0;
v9ses->cache = 0;
- if (!v9ses->options)
+ if (!opts)
return 0;
- options = kstrdup(v9ses->options, GFP_KERNEL);
+ options = kstrdup(opts, GFP_KERNEL);
if (!options) {
P9_DPRINTK(P9_DEBUG_ERROR,
"failed to allocate copy of option string\n");
@@ -206,24 +206,14 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
v9ses->uid = ~0;
v9ses->dfltuid = V9FS_DEFUID;
v9ses->dfltgid = V9FS_DEFGID;
- if (data) {
- v9ses->options = kstrdup(data, GFP_KERNEL);
- if (!v9ses->options) {
- P9_DPRINTK(P9_DEBUG_ERROR,
- "failed to allocate copy of option string\n");
- retval = -ENOMEM;
- goto error;
- }
- }
- rc = v9fs_parse_options(v9ses);
+ rc = v9fs_parse_options(v9ses, data);
if (rc < 0) {
retval = rc;
goto error;
}
- v9ses->clnt = p9_client_create(dev_name, v9ses->options);
-
+ v9ses->clnt = p9_client_create(dev_name, data);
if (IS_ERR(v9ses->clnt)) {
retval = PTR_ERR(v9ses->clnt);
v9ses->clnt = NULL;
@@ -280,7 +270,6 @@ void v9fs_session_close(struct v9fs_session_info *v9ses)
__putname(v9ses->uname);
__putname(v9ses->aname);
- kfree(v9ses->options);
}
/**
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index a7d56719299..38762bf102a 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -85,7 +85,6 @@ struct v9fs_session_info {
unsigned int afid;
unsigned int cache;
- char *options; /* copy of mount options */
char *uname; /* user name to mount as */
char *aname; /* name of remote hierarchy being mounted */
unsigned int maxdata; /* max data for client interface */
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 81f8bbf12f9..06a223d50a8 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -171,7 +171,6 @@ int v9fs_uflags2omode(int uflags, int extended)
/**
* v9fs_blank_wstat - helper function to setup a 9P stat structure
- * @v9ses: 9P session info (for determining extended mode)
* @wstat: structure to initialize
*
*/
@@ -207,65 +206,72 @@ v9fs_blank_wstat(struct p9_wstat *wstat)
struct inode *v9fs_get_inode(struct super_block *sb, int mode)
{
+ int err;
struct inode *inode;
struct v9fs_session_info *v9ses = sb->s_fs_info;
P9_DPRINTK(P9_DEBUG_VFS, "super block: %p mode: %o\n", sb, mode);
inode = new_inode(sb);
- if (inode) {
- inode->i_mode = mode;
- inode->i_uid = current_fsuid();
- inode->i_gid = current_fsgid();
- inode->i_blocks = 0;
- inode->i_rdev = 0;
- inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
- inode->i_mapping->a_ops = &v9fs_addr_operations;
-
- switch (mode & S_IFMT) {
- case S_IFIFO:
- case S_IFBLK:
- case S_IFCHR:
- case S_IFSOCK:
- if (!v9fs_extended(v9ses)) {
- P9_DPRINTK(P9_DEBUG_ERROR,
- "special files without extended mode\n");
- return ERR_PTR(-EINVAL);
- }
- init_special_inode(inode, inode->i_mode,
- inode->i_rdev);
- break;
- case S_IFREG:
- inode->i_op = &v9fs_file_inode_operations;
- inode->i_fop = &v9fs_file_operations;
- break;
- case S_IFLNK:
- if (!v9fs_extended(v9ses)) {
- P9_DPRINTK(P9_DEBUG_ERROR,
- "extended modes used w/o 9P2000.u\n");
- return ERR_PTR(-EINVAL);
- }
- inode->i_op = &v9fs_symlink_inode_operations;
- break;
- case S_IFDIR:
- inc_nlink(inode);
- if (v9fs_extended(v9ses))
- inode->i_op = &v9fs_dir_inode_operations_ext;
- else
- inode->i_op = &v9fs_dir_inode_operations;
- inode->i_fop = &v9fs_dir_operations;
- break;
- default:
- P9_DPRINTK(P9_DEBUG_ERROR,
- "BAD mode 0x%x S_IFMT 0x%x\n",
- mode, mode & S_IFMT);
- return ERR_PTR(-EINVAL);
- }
- } else {
+ if (!inode) {
P9_EPRINTK(KERN_WARNING, "Problem allocating inode\n");
return ERR_PTR(-ENOMEM);
}
+
+ inode->i_mode = mode;
+ inode->i_uid = current_fsuid();
+ inode->i_gid = current_fsgid();
+ inode->i_blocks = 0;
+ inode->i_rdev = 0;
+ inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ inode->i_mapping->a_ops = &v9fs_addr_operations;
+
+ switch (mode & S_IFMT) {
+ case S_IFIFO:
+ case S_IFBLK:
+ case S_IFCHR:
+ case S_IFSOCK:
+ if (!v9fs_extended(v9ses)) {
+ P9_DPRINTK(P9_DEBUG_ERROR,
+ "special files without extended mode\n");
+ err = -EINVAL;
+ goto error;
+ }
+ init_special_inode(inode, inode->i_mode, inode->i_rdev);
+ break;
+ case S_IFREG:
+ inode->i_op = &v9fs_file_inode_operations;
+ inode->i_fop = &v9fs_file_operations;
+ break;
+ case S_IFLNK:
+ if (!v9fs_extended(v9ses)) {
+ P9_DPRINTK(P9_DEBUG_ERROR,
+ "extended modes used w/o 9P2000.u\n");
+ err = -EINVAL;
+ goto error;
+ }
+ inode->i_op = &v9fs_symlink_inode_operations;
+ break;
+ case S_IFDIR:
+ inc_nlink(inode);
+ if (v9fs_extended(v9ses))
+ inode->i_op = &v9fs_dir_inode_operations_ext;
+ else
+ inode->i_op = &v9fs_dir_inode_operations;
+ inode->i_fop = &v9fs_dir_operations;
+ break;
+ default:
+ P9_DPRINTK(P9_DEBUG_ERROR, "BAD mode 0x%x S_IFMT 0x%x\n",
+ mode, mode & S_IFMT);
+ err = -EINVAL;
+ goto error;
+ }
+
return inode;
+
+error:
+ iput(inode);
+ return ERR_PTR(err);
}
/*
@@ -338,30 +344,25 @@ v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
ret = NULL;
st = p9_client_stat(fid);
- if (IS_ERR(st)) {
- err = PTR_ERR(st);
- st = NULL;
- goto error;
- }
+ if (IS_ERR(st))
+ return ERR_CAST(st);
umode = p9mode2unixmode(v9ses, st->mode);
ret = v9fs_get_inode(sb, umode);
if (IS_ERR(ret)) {
err = PTR_ERR(ret);
- ret = NULL;
goto error;
}
v9fs_stat2inode(st, ret, sb);
ret->i_ino = v9fs_qid2ino(&st->qid);
+ p9stat_free(st);
kfree(st);
return ret;
error:
+ p9stat_free(st);
kfree(st);
- if (ret)
- iput(ret);
-
return ERR_PTR(err);
}
@@ -403,9 +404,9 @@ v9fs_open_created(struct inode *inode, struct file *file)
* @v9ses: session information
* @dir: directory that dentry is being created in
* @dentry: dentry that is being created
+ * @extension: 9p2000.u extension string to support devices, etc.
* @perm: create permissions
* @mode: open mode
- * @extension: 9p2000.u extension string to support devices, etc.
*
*/
static struct p9_fid *
@@ -470,7 +471,10 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
dentry->d_op = &v9fs_dentry_operations;
d_instantiate(dentry, inode);
- v9fs_fid_add(dentry, fid);
+ err = v9fs_fid_add(dentry, fid);
+ if (err < 0)
+ goto error;
+
return ofid;
error:
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 38d695d66a0..8961f1a8f66 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -81,7 +81,7 @@ static int v9fs_set_super(struct super_block *s, void *data)
static void
v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
- int flags)
+ int flags, void *data)
{
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_blocksize_bits = fls(v9ses->maxdata - 1);
@@ -91,6 +91,8 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC |
MS_NOATIME;
+
+ save_mount_options(sb, data);
}
/**
@@ -113,14 +115,11 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
struct v9fs_session_info *v9ses = NULL;
struct p9_wstat *st = NULL;
int mode = S_IRWXUGO | S_ISVTX;
- uid_t uid = current_fsuid();
- gid_t gid = current_fsgid();
struct p9_fid *fid;
int retval = 0;
P9_DPRINTK(P9_DEBUG_VFS, " \n");
- st = NULL;
v9ses = kzalloc(sizeof(struct v9fs_session_info), GFP_KERNEL);
if (!v9ses)
return -ENOMEM;
@@ -142,7 +141,7 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
retval = PTR_ERR(sb);
goto free_stat;
}
- v9fs_fill_super(sb, v9ses, flags);
+ v9fs_fill_super(sb, v9ses, flags, data);
inode = v9fs_get_inode(sb, S_IFDIR | mode);
if (IS_ERR(inode)) {
@@ -150,9 +149,6 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
goto release_sb;
}
- inode->i_uid = uid;
- inode->i_gid = gid;
-
root = d_alloc_root(inode);
if (!root) {
iput(inode);
@@ -173,10 +169,8 @@ P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n");
simple_set_mnt(mnt, sb);
return 0;
-release_sb:
- deactivate_locked_super(sb);
-
free_stat:
+ p9stat_free(st);
kfree(st);
clunk_fid:
@@ -185,7 +179,12 @@ clunk_fid:
close_session:
v9fs_session_close(v9ses);
kfree(v9ses);
+ return retval;
+release_sb:
+ p9stat_free(st);
+ kfree(st);
+ deactivate_locked_super(sb);
return retval;
}
@@ -207,24 +206,10 @@ static void v9fs_kill_super(struct super_block *s)
v9fs_session_close(v9ses);
kfree(v9ses);
+ s->s_fs_info = NULL;
P9_DPRINTK(P9_DEBUG_VFS, "exiting kill_super\n");
}
-/**
- * v9fs_show_options - Show mount options in /proc/mounts
- * @m: seq_file to write to
- * @mnt: mount descriptor
- *
- */
-
-static int v9fs_show_options(struct seq_file *m, struct vfsmount *mnt)
-{
- struct v9fs_session_info *v9ses = mnt->mnt_sb->s_fs_info;
-
- seq_printf(m, "%s", v9ses->options);
- return 0;
-}
-
static void
v9fs_umount_begin(struct super_block *sb)
{
@@ -237,7 +222,7 @@ v9fs_umount_begin(struct super_block *sb)
static const struct super_operations v9fs_super_ops = {
.statfs = simple_statfs,
.clear_inode = v9fs_clear_inode,
- .show_options = v9fs_show_options,
+ .show_options = generic_show_options,
.umount_begin = v9fs_umount_begin,
};
diff --git a/fs/Kconfig b/fs/Kconfig
index 0e7da7bb5d9..455aa207e67 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -43,6 +43,7 @@ source "fs/xfs/Kconfig"
source "fs/gfs2/Kconfig"
source "fs/ocfs2/Kconfig"
source "fs/btrfs/Kconfig"
+source "fs/nilfs2/Kconfig"
endif # BLOCK
@@ -186,7 +187,6 @@ source "fs/romfs/Kconfig"
source "fs/sysv/Kconfig"
source "fs/ufs/Kconfig"
source "fs/exofs/Kconfig"
-source "fs/nilfs2/Kconfig"
endif # MISC_FILESYSTEMS
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 0149dab365e..681c2a7b013 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -134,9 +134,16 @@ static int afs_readpage(struct file *file, struct page *page)
inode = page->mapping->host;
- ASSERT(file != NULL);
- key = file->private_data;
- ASSERT(key != NULL);
+ if (file) {
+ key = file->private_data;
+ ASSERT(key != NULL);
+ } else {
+ key = afs_request_key(AFS_FS_S(inode->i_sb)->volume->cell);
+ if (IS_ERR(key)) {
+ ret = PTR_ERR(key);
+ goto error_nokey;
+ }
+ }
_enter("{%x},{%lu},{%lu}", key_serial(key), inode->i_ino, page->index);
@@ -207,12 +214,17 @@ static int afs_readpage(struct file *file, struct page *page)
unlock_page(page);
}
+ if (!file)
+ key_put(key);
_leave(" = 0");
return 0;
error:
SetPageError(page);
unlock_page(page);
+ if (!file)
+ key_put(key);
+error_nokey:
_leave(" = %d", ret);
return ret;
}
diff --git a/fs/afs/write.c b/fs/afs/write.c
index c2e7a7ff008..c63a3c8beb7 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -712,7 +712,6 @@ int afs_writeback_all(struct afs_vnode *vnode)
.bdi = mapping->backing_dev_info,
.sync_mode = WB_SYNC_ALL,
.nr_to_write = LONG_MAX,
- .for_writepages = 1,
.range_cyclic = 1,
};
int ret;
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index aa39ae83f01..3da18d45348 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -77,7 +77,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
}
/* Update the expiry counter if fs is busy */
- if (!may_umount_tree(mnt)) {
+ if (!may_umount_tree(path.mnt)) {
struct autofs_info *ino = autofs4_dentry_ino(top);
ino->last_used = jiffies;
goto done;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index b7c1603cd4b..7c1e65d5487 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -501,22 +501,22 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
}
}
- /*
- * Now fill out the bss section. First pad the last page up
- * to the page boundary, and then perform a mmap to make sure
- * that there are zero-mapped pages up to and including the
- * last bss page.
- */
- if (padzero(elf_bss)) {
- error = -EFAULT;
- goto out_close;
- }
+ if (last_bss > elf_bss) {
+ /*
+ * Now fill out the bss section. First pad the last page up
+ * to the page boundary, and then perform a mmap to make sure
+ * that there are zero-mapped pages up to and including the
+ * last bss page.
+ */
+ if (padzero(elf_bss)) {
+ error = -EFAULT;
+ goto out_close;
+ }
- /* What we have mapped so far */
- elf_bss = ELF_PAGESTART(elf_bss + ELF_MIN_ALIGN - 1);
+ /* What we have mapped so far */
+ elf_bss = ELF_PAGESTART(elf_bss + ELF_MIN_ALIGN - 1);
- /* Map the last of the bss segment */
- if (last_bss > elf_bss) {
+ /* Map the last of the bss segment */
down_write(&current->mm->mmap_sem);
error = do_brk(elf_bss, last_bss - elf_bss);
up_write(&current->mm->mmap_sem);
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 697f6b5f131..e92f229e3c6 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -828,15 +828,22 @@ static int load_flat_shared_library(int id, struct lib_info *libs)
if (IS_ERR(bprm.file))
return res;
+ bprm.cred = prepare_exec_creds();
+ res = -ENOMEM;
+ if (!bprm.cred)
+ goto out;
+
res = prepare_binprm(&bprm);
if (res <= (unsigned long)-4096)
res = load_flat_file(&bprm, libs, id, NULL);
- if (bprm.file) {
- allow_write_access(bprm.file);
- fput(bprm.file);
- bprm.file = NULL;
- }
+
+ abort_creds(bprm.cred);
+
+out:
+ allow_write_access(bprm.file);
+ fput(bprm.file);
+
return(res);
}
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 3a6d4fb2a32..71e7e03ac34 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -420,7 +420,6 @@ static void bdev_destroy_inode(struct inode *inode)
{
struct bdev_inode *bdi = BDEV_I(inode);
- bdi->bdev.bd_inode_backing_dev_info = NULL;
kmem_cache_free(bdev_cachep, bdi);
}
@@ -564,6 +563,16 @@ struct block_device *bdget(dev_t dev)
EXPORT_SYMBOL(bdget);
+/**
+ * bdgrab -- Grab a reference to an already referenced block device
+ * @bdev: Block device to grab a reference to.
+ */
+struct block_device *bdgrab(struct block_device *bdev)
+{
+ atomic_inc(&bdev->bd_inode->i_count);
+ return bdev;
+}
+
long nr_blockdev_pages(void)
{
struct block_device *bdev;
@@ -1395,6 +1404,33 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
}
/*
+ * Write data to the block device. Only intended for the block device itself
+ * and the raw driver which basically is a fake block device.
+ *
+ * Does not take i_mutex for the write and thus is not for general purpose
+ * use.
+ */
+ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
+{
+ struct file *file = iocb->ki_filp;
+ ssize_t ret;
+
+ BUG_ON(iocb->ki_pos != pos);
+
+ ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
+ if (ret > 0 || ret == -EIOCBQUEUED) {
+ ssize_t err;
+
+ err = generic_write_sync(file, pos, ret);
+ if (err < 0 && ret > 0)
+ ret = err;
+ }
+ return ret;
+}
+EXPORT_SYMBOL_GPL(blkdev_aio_write);
+
+/*
* Try to release a page associated with block device when the system
* is under memory pressure.
*/
@@ -1426,7 +1462,7 @@ const struct file_operations def_blk_fops = {
.read = do_sync_read,
.write = do_sync_write,
.aio_read = generic_file_aio_read,
- .aio_write = generic_file_aio_write_nolock,
+ .aio_write = blkdev_aio_write,
.mmap = generic_file_mmap,
.fsync = block_fsync,
.unlocked_ioctl = block_ioctl,
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 6e4f6c50a12..019e8af449a 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -424,11 +424,11 @@ int btrfs_requeue_work(struct btrfs_work *work)
* list
*/
if (worker->idle) {
- spin_lock_irqsave(&worker->workers->lock, flags);
+ spin_lock(&worker->workers->lock);
worker->idle = 0;
list_move_tail(&worker->worker_list,
&worker->workers->worker_list);
- spin_unlock_irqrestore(&worker->workers->lock, flags);
+ spin_unlock(&worker->workers->lock);
}
if (!worker->working) {
wake = 1;
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 60a45f3a4e9..3fdcc0512d3 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -557,19 +557,7 @@ static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
btrfs_disk_key_to_cpu(&k1, disk);
- if (k1.objectid > k2->objectid)
- return 1;
- if (k1.objectid < k2->objectid)
- return -1;
- if (k1.type > k2->type)
- return 1;
- if (k1.type < k2->type)
- return -1;
- if (k1.offset > k2->offset)
- return 1;
- if (k1.offset < k2->offset)
- return -1;
- return 0;
+ return btrfs_comp_cpu_keys(&k1, k2);
}
/*
@@ -1052,9 +1040,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
return 0;
- if (btrfs_header_nritems(mid) > 2)
- return 0;
-
if (btrfs_header_nritems(mid) < 2)
err_on_enospc = 1;
@@ -1701,6 +1686,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
struct extent_buffer *b;
int slot;
int ret;
+ int err;
int level;
int lowest_unlock = 1;
u8 lowest_level = 0;
@@ -1737,8 +1723,6 @@ again:
p->locks[level] = 1;
if (cow) {
- int wret;
-
/*
* if we don't really need to cow this block
* then we don't want to set the path blocking,
@@ -1749,12 +1733,12 @@ again:
btrfs_set_path_blocking(p);
- wret = btrfs_cow_block(trans, root, b,
- p->nodes[level + 1],
- p->slots[level + 1], &b);
- if (wret) {
+ err = btrfs_cow_block(trans, root, b,
+ p->nodes[level + 1],
+ p->slots[level + 1], &b);
+ if (err) {
free_extent_buffer(b);
- ret = wret;
+ ret = err;
goto done;
}
}
@@ -1793,41 +1777,45 @@ cow_done:
ret = bin_search(b, key, level, &slot);
if (level != 0) {
- if (ret && slot > 0)
+ int dec = 0;
+ if (ret && slot > 0) {
+ dec = 1;
slot -= 1;
+ }
p->slots[level] = slot;
- ret = setup_nodes_for_search(trans, root, p, b, level,
+ err = setup_nodes_for_search(trans, root, p, b, level,
ins_len);
- if (ret == -EAGAIN)
+ if (err == -EAGAIN)
goto again;
- else if (ret)
+ if (err) {
+ ret = err;
goto done;
+ }
b = p->nodes[level];
slot = p->slots[level];
unlock_up(p, level, lowest_unlock);
- /* this is only true while dropping a snapshot */
if (level == lowest_level) {
- ret = 0;
+ if (dec)
+ p->slots[level]++;
goto done;
}
- ret = read_block_for_search(trans, root, p,
+ err = read_block_for_search(trans, root, p,
&b, level, slot, key);
- if (ret == -EAGAIN)
+ if (err == -EAGAIN)
goto again;
-
- if (ret == -EIO)
+ if (err) {
+ ret = err;
goto done;
+ }
if (!p->skip_locking) {
- int lret;
-
btrfs_clear_path_blocking(p, NULL);
- lret = btrfs_try_spin_lock(b);
+ err = btrfs_try_spin_lock(b);
- if (!lret) {
+ if (!err) {
btrfs_set_path_blocking(p);
btrfs_tree_lock(b);
btrfs_clear_path_blocking(p, b);
@@ -1837,16 +1825,14 @@ cow_done:
p->slots[level] = slot;
if (ins_len > 0 &&
btrfs_leaf_free_space(root, b) < ins_len) {
- int sret;
-
btrfs_set_path_blocking(p);
- sret = split_leaf(trans, root, key,
- p, ins_len, ret == 0);
+ err = split_leaf(trans, root, key,
+ p, ins_len, ret == 0);
btrfs_clear_path_blocking(p, NULL);
- BUG_ON(sret > 0);
- if (sret) {
- ret = sret;
+ BUG_ON(err > 0);
+ if (err) {
+ ret = err;
goto done;
}
}
@@ -3807,7 +3793,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
}
/* delete the leaf if it is mostly empty */
- if (used < BTRFS_LEAF_DATA_SIZE(root) / 2) {
+ if (used < BTRFS_LEAF_DATA_SIZE(root) / 3) {
/* push_leaf_left fixes the path.
* make sure the path still points to our leaf
* for possible call to del_ptr below
@@ -4042,10 +4028,9 @@ out:
* calling this function.
*/
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
- struct btrfs_key *key, int lowest_level,
+ struct btrfs_key *key, int level,
int cache_only, u64 min_trans)
{
- int level = lowest_level;
int slot;
struct extent_buffer *c;
@@ -4058,11 +4043,40 @@ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
c = path->nodes[level];
next:
if (slot >= btrfs_header_nritems(c)) {
- level++;
- if (level == BTRFS_MAX_LEVEL)
+ int ret;
+ int orig_lowest;
+ struct btrfs_key cur_key;
+ if (level + 1 >= BTRFS_MAX_LEVEL ||
+ !path->nodes[level + 1])
return 1;
- continue;
+
+ if (path->locks[level + 1]) {
+ level++;
+ continue;
+ }
+
+ slot = btrfs_header_nritems(c) - 1;
+ if (level == 0)
+ btrfs_item_key_to_cpu(c, &cur_key, slot);
+ else
+ btrfs_node_key_to_cpu(c, &cur_key, slot);
+
+ orig_lowest = path->lowest_level;
+ btrfs_release_path(root, path);
+ path->lowest_level = level;
+ ret = btrfs_search_slot(NULL, root, &cur_key, path,
+ 0, 0);
+ path->lowest_level = orig_lowest;
+ if (ret < 0)
+ return ret;
+
+ c = path->nodes[level];
+ slot = path->slots[level];
+ if (ret == 0)
+ slot++;
+ goto next;
}
+
if (level == 0)
btrfs_item_key_to_cpu(c, key, slot);
else {
@@ -4146,7 +4160,8 @@ again:
* advance the path if there are now more items available.
*/
if (nritems > 0 && path->slots[0] < nritems - 1) {
- path->slots[0]++;
+ if (ret == 0)
+ path->slots[0]++;
ret = 0;
goto done;
}
@@ -4278,10 +4293,10 @@ int btrfs_previous_item(struct btrfs_root *root,
path->slots[0]--;
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
- if (found_key.type == type)
- return 0;
if (found_key.objectid < min_objectid)
break;
+ if (found_key.type == type)
+ return 0;
if (found_key.objectid == min_objectid &&
found_key.type < type)
break;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 98a87383871..837435ce84c 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -481,7 +481,7 @@ struct btrfs_shared_data_ref {
struct btrfs_extent_inline_ref {
u8 type;
- u64 offset;
+ __le64 offset;
} __attribute__ ((__packed__));
/* old style backrefs item */
@@ -689,6 +689,7 @@ struct btrfs_space_info {
struct list_head block_groups;
spinlock_t lock;
struct rw_semaphore groups_sem;
+ atomic_t caching_threads;
};
/*
@@ -707,6 +708,9 @@ struct btrfs_free_cluster {
/* first extent starting offset */
u64 window_start;
+ /* if this cluster simply points at a bitmap in the block group */
+ bool points_to_bitmap;
+
struct btrfs_block_group_cache *block_group;
/*
* when a cluster is allocated from a block group, we put the
@@ -716,24 +720,37 @@ struct btrfs_free_cluster {
struct list_head block_group_list;
};
+enum btrfs_caching_type {
+ BTRFS_CACHE_NO = 0,
+ BTRFS_CACHE_STARTED = 1,
+ BTRFS_CACHE_FINISHED = 2,
+};
+
struct btrfs_block_group_cache {
struct btrfs_key key;
struct btrfs_block_group_item item;
+ struct btrfs_fs_info *fs_info;
spinlock_t lock;
- struct mutex cache_mutex;
u64 pinned;
u64 reserved;
u64 flags;
- int cached;
+ u64 sectorsize;
+ int extents_thresh;
+ int free_extents;
+ int total_bitmaps;
int ro;
int dirty;
+ /* cache tracking stuff */
+ wait_queue_head_t caching_q;
+ int cached;
+
struct btrfs_space_info *space_info;
/* free space cache stuff */
spinlock_t tree_lock;
- struct rb_root free_space_bytes;
struct rb_root free_space_offset;
+ u64 free_space;
/* block group cache stuff */
struct rb_node cache_node;
@@ -808,6 +825,7 @@ struct btrfs_fs_info {
struct mutex drop_mutex;
struct mutex volume_mutex;
struct mutex tree_reloc_mutex;
+ struct rw_semaphore extent_commit_sem;
/*
* this protects the ordered operations list only while we are
@@ -1988,6 +2006,7 @@ void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
u64 bytes);
void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
u64 bytes);
+void btrfs_free_pinned_extents(struct btrfs_fs_info *info);
/* ctree.c */
int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
int level, int *slot);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index d28d29c95f7..8b819279001 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1352,6 +1352,7 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
{
int err;
+ bdi->name = "btrfs";
bdi->capabilities = BDI_CAP_MAP_COPY;
err = bdi_init(bdi);
if (err)
@@ -1599,6 +1600,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
sb->s_blocksize = 4096;
sb->s_blocksize_bits = blksize_bits(4096);
+ sb->s_bdi = &fs_info->bdi;
/*
* we set the i_size on the btree inode to the max possible int.
@@ -1639,6 +1641,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
mutex_init(&fs_info->cleaner_mutex);
mutex_init(&fs_info->volume_mutex);
mutex_init(&fs_info->tree_reloc_mutex);
+ init_rwsem(&fs_info->extent_commit_sem);
btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
@@ -1799,6 +1802,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
btrfs_super_chunk_root(disk_super),
blocksize, generation);
BUG_ON(!chunk_root->node);
+ if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
+ printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n",
+ sb->s_id);
+ goto fail_chunk_root;
+ }
btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
chunk_root->commit_root = btrfs_root_node(chunk_root);
@@ -1826,6 +1834,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
blocksize, generation);
if (!tree_root->node)
goto fail_chunk_root;
+ if (!test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
+ printk(KERN_WARNING "btrfs: failed to read tree root on %s\n",
+ sb->s_id);
+ goto fail_tree_root;
+ }
btrfs_set_root_node(&tree_root->root_item, tree_root->node);
tree_root->commit_root = btrfs_root_node(tree_root);
@@ -2322,6 +2335,9 @@ int close_ctree(struct btrfs_root *root)
printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
}
+ fs_info->closing = 2;
+ smp_mb();
+
if (fs_info->delalloc_bytes) {
printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n",
(unsigned long long)fs_info->delalloc_bytes);
@@ -2343,6 +2359,7 @@ int close_ctree(struct btrfs_root *root)
free_extent_buffer(root->fs_info->csum_root->commit_root);
btrfs_free_block_groups(root->fs_info);
+ btrfs_free_pinned_extents(root->fs_info);
del_fs_roots(fs_info);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a5aca3997d4..535f85ba104 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -21,6 +21,7 @@
#include <linux/blkdev.h>
#include <linux/sort.h>
#include <linux/rcupdate.h>
+#include <linux/kthread.h>
#include "compat.h"
#include "hash.h"
#include "ctree.h"
@@ -61,6 +62,13 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
struct btrfs_root *extent_root, u64 alloc_bytes,
u64 flags, int force);
+static noinline int
+block_group_cache_done(struct btrfs_block_group_cache *cache)
+{
+ smp_mb();
+ return cache->cached == BTRFS_CACHE_FINISHED;
+}
+
static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
{
return (cache->flags & bits) == bits;
@@ -146,20 +154,70 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
}
/*
+ * We always set EXTENT_LOCKED for the super mirror extents so we don't
+ * overwrite them, so those bits need to be unset. Also, if we are unmounting
+ * with pinned extents still sitting there because we had a block group caching,
+ * we need to clear those now, since we are done.
+ */
+void btrfs_free_pinned_extents(struct btrfs_fs_info *info)
+{
+ u64 start, end, last = 0;
+ int ret;
+
+ while (1) {
+ ret = find_first_extent_bit(&info->pinned_extents, last,
+ &start, &end,
+ EXTENT_LOCKED|EXTENT_DIRTY);
+ if (ret)
+ break;
+
+ clear_extent_bits(&info->pinned_extents, start, end,
+ EXTENT_LOCKED|EXTENT_DIRTY, GFP_NOFS);
+ last = end+1;
+ }
+}
+
+static int remove_sb_from_cache(struct btrfs_root *root,
+ struct btrfs_block_group_cache *cache)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ u64 bytenr;
+ u64 *logical;
+ int stripe_len;
+ int i, nr, ret;
+
+ for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
+ bytenr = btrfs_sb_offset(i);
+ ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
+ cache->key.objectid, bytenr,
+ 0, &logical, &nr, &stripe_len);
+ BUG_ON(ret);
+ while (nr--) {
+ try_lock_extent(&fs_info->pinned_extents,
+ logical[nr],
+ logical[nr] + stripe_len - 1, GFP_NOFS);
+ }
+ kfree(logical);
+ }
+
+ return 0;
+}
+
+/*
* this is only called by cache_block_group, since we could have freed extents
* we need to check the pinned_extents for any extents that can't be used yet
* since their free space will be released as soon as the transaction commits.
*/
-static int add_new_free_space(struct btrfs_block_group_cache *block_group,
+static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
struct btrfs_fs_info *info, u64 start, u64 end)
{
- u64 extent_start, extent_end, size;
+ u64 extent_start, extent_end, size, total_added = 0;
int ret;
while (start < end) {
ret = find_first_extent_bit(&info->pinned_extents, start,
&extent_start, &extent_end,
- EXTENT_DIRTY);
+ EXTENT_DIRTY|EXTENT_LOCKED);
if (ret)
break;
@@ -167,6 +225,7 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group,
start = extent_end + 1;
} else if (extent_start > start && extent_start < end) {
size = extent_start - start;
+ total_added += size;
ret = btrfs_add_free_space(block_group, start,
size);
BUG_ON(ret);
@@ -178,84 +237,93 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group,
if (start < end) {
size = end - start;
+ total_added += size;
ret = btrfs_add_free_space(block_group, start, size);
BUG_ON(ret);
}
- return 0;
-}
-
-static int remove_sb_from_cache(struct btrfs_root *root,
- struct btrfs_block_group_cache *cache)
-{
- u64 bytenr;
- u64 *logical;
- int stripe_len;
- int i, nr, ret;
-
- for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
- bytenr = btrfs_sb_offset(i);
- ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
- cache->key.objectid, bytenr, 0,
- &logical, &nr, &stripe_len);
- BUG_ON(ret);
- while (nr--) {
- btrfs_remove_free_space(cache, logical[nr],
- stripe_len);
- }
- kfree(logical);
- }
- return 0;
+ return total_added;
}
-static int cache_block_group(struct btrfs_root *root,
- struct btrfs_block_group_cache *block_group)
+static int caching_kthread(void *data)
{
+ struct btrfs_block_group_cache *block_group = data;
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
+ u64 last = 0;
struct btrfs_path *path;
int ret = 0;
struct btrfs_key key;
struct extent_buffer *leaf;
int slot;
- u64 last;
-
- if (!block_group)
- return 0;
-
- root = root->fs_info->extent_root;
+ u64 total_found = 0;
- if (block_group->cached)
- return 0;
+ BUG_ON(!fs_info);
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->reada = 2;
+ atomic_inc(&block_group->space_info->caching_threads);
+ last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
/*
- * we get into deadlocks with paths held by callers of this function.
- * since the alloc_mutex is protecting things right now, just
- * skip the locking here
+ * We don't want to deadlock with somebody trying to allocate a new
+ * extent for the extent root while also trying to search the extent
+ * root to add free space. So we skip locking and search the commit
+ * root, since its read-only
*/
path->skip_locking = 1;
- last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
+ path->search_commit_root = 1;
+ path->reada = 2;
+
key.objectid = last;
key.offset = 0;
btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+again:
+ /* need to make sure the commit_root doesn't disappear */
+ down_read(&fs_info->extent_commit_sem);
+
+ ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
if (ret < 0)
goto err;
while (1) {
+ smp_mb();
+ if (block_group->fs_info->closing > 1) {
+ last = (u64)-1;
+ break;
+ }
+
leaf = path->nodes[0];
slot = path->slots[0];
if (slot >= btrfs_header_nritems(leaf)) {
- ret = btrfs_next_leaf(root, path);
+ ret = btrfs_next_leaf(fs_info->extent_root, path);
if (ret < 0)
goto err;
- if (ret == 0)
- continue;
- else
+ else if (ret)
break;
+
+ if (need_resched() ||
+ btrfs_transaction_in_commit(fs_info)) {
+ leaf = path->nodes[0];
+
+ /* this shouldn't happen, but if the
+ * leaf is empty just move on.
+ */
+ if (btrfs_header_nritems(leaf) == 0)
+ break;
+ /*
+ * we need to copy the key out so that
+ * we are sure the next search advances
+ * us forward in the btree.
+ */
+ btrfs_item_key_to_cpu(leaf, &key, 0);
+ btrfs_release_path(fs_info->extent_root, path);
+ up_read(&fs_info->extent_commit_sem);
+ schedule_timeout(1);
+ goto again;
+ }
+
+ continue;
}
btrfs_item_key_to_cpu(leaf, &key, slot);
if (key.objectid < block_group->key.objectid)
@@ -266,24 +334,59 @@ static int cache_block_group(struct btrfs_root *root,
break;
if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) {
- add_new_free_space(block_group, root->fs_info, last,
- key.objectid);
-
+ total_found += add_new_free_space(block_group,
+ fs_info, last,
+ key.objectid);
last = key.objectid + key.offset;
}
+
+ if (total_found > (1024 * 1024 * 2)) {
+ total_found = 0;
+ wake_up(&block_group->caching_q);
+ }
next:
path->slots[0]++;
}
+ ret = 0;
- add_new_free_space(block_group, root->fs_info, last,
- block_group->key.objectid +
- block_group->key.offset);
+ total_found += add_new_free_space(block_group, fs_info, last,
+ block_group->key.objectid +
+ block_group->key.offset);
+
+ spin_lock(&block_group->lock);
+ block_group->cached = BTRFS_CACHE_FINISHED;
+ spin_unlock(&block_group->lock);
- block_group->cached = 1;
- remove_sb_from_cache(root, block_group);
- ret = 0;
err:
btrfs_free_path(path);
+ up_read(&fs_info->extent_commit_sem);
+ atomic_dec(&block_group->space_info->caching_threads);
+ wake_up(&block_group->caching_q);
+
+ return 0;
+}
+
+static int cache_block_group(struct btrfs_block_group_cache *cache)
+{
+ struct task_struct *tsk;
+ int ret = 0;
+
+ spin_lock(&cache->lock);
+ if (cache->cached != BTRFS_CACHE_NO) {
+ spin_unlock(&cache->lock);
+ return ret;
+ }
+ cache->cached = BTRFS_CACHE_STARTED;
+ spin_unlock(&cache->lock);
+
+ tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n",
+ cache->key.objectid);
+ if (IS_ERR(tsk)) {
+ ret = PTR_ERR(tsk);
+ printk(KERN_ERR "error running thread %d\n", ret);
+ BUG();
+ }
+
return ret;
}
@@ -1408,7 +1511,8 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
static void btrfs_issue_discard(struct block_device *bdev,
u64 start, u64 len)
{
- blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL);
+ blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL,
+ DISCARD_FL_BARRIER);
}
#endif
@@ -2387,13 +2491,29 @@ fail:
}
+static struct btrfs_block_group_cache *
+next_block_group(struct btrfs_root *root,
+ struct btrfs_block_group_cache *cache)
+{
+ struct rb_node *node;
+ spin_lock(&root->fs_info->block_group_cache_lock);
+ node = rb_next(&cache->cache_node);
+ btrfs_put_block_group(cache);
+ if (node) {
+ cache = rb_entry(node, struct btrfs_block_group_cache,
+ cache_node);
+ atomic_inc(&cache->count);
+ } else
+ cache = NULL;
+ spin_unlock(&root->fs_info->block_group_cache_lock);
+ return cache;
+}
+
int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
- struct btrfs_block_group_cache *cache, *entry;
- struct rb_node *n;
+ struct btrfs_block_group_cache *cache;
int err = 0;
- int werr = 0;
struct btrfs_path *path;
u64 last = 0;
@@ -2402,39 +2522,35 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
return -ENOMEM;
while (1) {
- cache = NULL;
- spin_lock(&root->fs_info->block_group_cache_lock);
- for (n = rb_first(&root->fs_info->block_group_cache_tree);
- n; n = rb_next(n)) {
- entry = rb_entry(n, struct btrfs_block_group_cache,
- cache_node);
- if (entry->dirty) {
- cache = entry;
- break;
- }
+ if (last == 0) {
+ err = btrfs_run_delayed_refs(trans, root,
+ (unsigned long)-1);
+ BUG_ON(err);
}
- spin_unlock(&root->fs_info->block_group_cache_lock);
- if (!cache)
- break;
+ cache = btrfs_lookup_first_block_group(root->fs_info, last);
+ while (cache) {
+ if (cache->dirty)
+ break;
+ cache = next_block_group(root, cache);
+ }
+ if (!cache) {
+ if (last == 0)
+ break;
+ last = 0;
+ continue;
+ }
cache->dirty = 0;
- last += cache->key.offset;
+ last = cache->key.objectid + cache->key.offset;
- err = write_one_cache_group(trans, root,
- path, cache);
- /*
- * if we fail to write the cache group, we want
- * to keep it marked dirty in hopes that a later
- * write will work
- */
- if (err) {
- werr = err;
- continue;
- }
+ err = write_one_cache_group(trans, root, path, cache);
+ BUG_ON(err);
+ btrfs_put_block_group(cache);
}
+
btrfs_free_path(path);
- return werr;
+ return 0;
}
int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
@@ -2484,6 +2600,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
found->force_alloc = 0;
*space_info = found;
list_add_rcu(&found->list, &info->space_info);
+ atomic_set(&found->caching_threads, 0);
return 0;
}
@@ -2947,13 +3064,9 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
struct btrfs_block_group_cache *cache;
struct btrfs_fs_info *fs_info = root->fs_info;
- if (pin) {
+ if (pin)
set_extent_dirty(&fs_info->pinned_extents,
bytenr, bytenr + num - 1, GFP_NOFS);
- } else {
- clear_extent_dirty(&fs_info->pinned_extents,
- bytenr, bytenr + num - 1, GFP_NOFS);
- }
while (num > 0) {
cache = btrfs_lookup_block_group(fs_info, bytenr);
@@ -2969,14 +3082,34 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
spin_unlock(&cache->space_info->lock);
fs_info->total_pinned += len;
} else {
+ int unpin = 0;
+
+ /*
+ * in order to not race with the block group caching, we
+ * only want to unpin the extent if we are cached. If
+ * we aren't cached, we want to start async caching this
+ * block group so we can free the extent the next time
+ * around.
+ */
spin_lock(&cache->space_info->lock);
spin_lock(&cache->lock);
- cache->pinned -= len;
- cache->space_info->bytes_pinned -= len;
+ unpin = (cache->cached == BTRFS_CACHE_FINISHED);
+ if (likely(unpin)) {
+ cache->pinned -= len;
+ cache->space_info->bytes_pinned -= len;
+ fs_info->total_pinned -= len;
+ }
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
- fs_info->total_pinned -= len;
- if (cache->cached)
+
+ if (likely(unpin))
+ clear_extent_dirty(&fs_info->pinned_extents,
+ bytenr, bytenr + len -1,
+ GFP_NOFS);
+ else
+ cache_block_group(cache);
+
+ if (unpin)
btrfs_add_free_space(cache, bytenr, len);
}
btrfs_put_block_group(cache);
@@ -3030,6 +3163,7 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
&start, &end, EXTENT_DIRTY);
if (ret)
break;
+
set_extent_dirty(copy, start, end, GFP_NOFS);
last = end + 1;
}
@@ -3058,6 +3192,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
cond_resched();
}
+
return ret;
}
@@ -3436,6 +3571,45 @@ static u64 stripe_align(struct btrfs_root *root, u64 val)
}
/*
+ * when we wait for progress in the block group caching, its because
+ * our allocation attempt failed at least once. So, we must sleep
+ * and let some progress happen before we try again.
+ *
+ * This function will sleep at least once waiting for new free space to
+ * show up, and then it will check the block group free space numbers
+ * for our min num_bytes. Another option is to have it go ahead
+ * and look in the rbtree for a free extent of a given size, but this
+ * is a good start.
+ */
+static noinline int
+wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
+ u64 num_bytes)
+{
+ DEFINE_WAIT(wait);
+
+ prepare_to_wait(&cache->caching_q, &wait, TASK_UNINTERRUPTIBLE);
+
+ if (block_group_cache_done(cache)) {
+ finish_wait(&cache->caching_q, &wait);
+ return 0;
+ }
+ schedule();
+ finish_wait(&cache->caching_q, &wait);
+
+ wait_event(cache->caching_q, block_group_cache_done(cache) ||
+ (cache->free_space >= num_bytes));
+ return 0;
+}
+
+enum btrfs_loop_type {
+ LOOP_CACHED_ONLY = 0,
+ LOOP_CACHING_NOWAIT = 1,
+ LOOP_CACHING_WAIT = 2,
+ LOOP_ALLOC_CHUNK = 3,
+ LOOP_NO_EMPTY_SIZE = 4,
+};
+
+/*
* walks the btree of allocated extents and find a hole of a given size.
* The key ins is changed to record the hole:
* ins->objectid == block start
@@ -3460,6 +3634,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_space_info *space_info;
int last_ptr_loop = 0;
int loop = 0;
+ bool found_uncached_bg = false;
WARN_ON(num_bytes < root->sectorsize);
btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
@@ -3491,15 +3666,18 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
search_start = max(search_start, first_logical_byte(root, 0));
search_start = max(search_start, hint_byte);
- if (!last_ptr) {
+ if (!last_ptr)
empty_cluster = 0;
- loop = 1;
- }
if (search_start == hint_byte) {
block_group = btrfs_lookup_block_group(root->fs_info,
search_start);
- if (block_group && block_group_bits(block_group, data)) {
+ /*
+ * we don't want to use the block group if it doesn't match our
+ * allocation bits, or if its not cached.
+ */
+ if (block_group && block_group_bits(block_group, data) &&
+ block_group_cache_done(block_group)) {
down_read(&space_info->groups_sem);
if (list_empty(&block_group->list) ||
block_group->ro) {
@@ -3522,21 +3700,35 @@ search:
down_read(&space_info->groups_sem);
list_for_each_entry(block_group, &space_info->block_groups, list) {
u64 offset;
+ int cached;
atomic_inc(&block_group->count);
search_start = block_group->key.objectid;
have_block_group:
- if (unlikely(!block_group->cached)) {
- mutex_lock(&block_group->cache_mutex);
- ret = cache_block_group(root, block_group);
- mutex_unlock(&block_group->cache_mutex);
- if (ret) {
- btrfs_put_block_group(block_group);
- break;
+ if (unlikely(block_group->cached == BTRFS_CACHE_NO)) {
+ /*
+ * we want to start caching kthreads, but not too many
+ * right off the bat so we don't overwhelm the system,
+ * so only start them if there are less than 2 and we're
+ * in the initial allocation phase.
+ */
+ if (loop > LOOP_CACHING_NOWAIT ||
+ atomic_read(&space_info->caching_threads) < 2) {
+ ret = cache_block_group(block_group);
+ BUG_ON(ret);
}
}
+ cached = block_group_cache_done(block_group);
+ if (unlikely(!cached)) {
+ found_uncached_bg = true;
+
+ /* if we only want cached bgs, loop */
+ if (loop == LOOP_CACHED_ONLY)
+ goto loop;
+ }
+
if (unlikely(block_group->ro))
goto loop;
@@ -3615,14 +3807,21 @@ refill_cluster:
spin_unlock(&last_ptr->refill_lock);
goto checks;
}
+ } else if (!cached && loop > LOOP_CACHING_NOWAIT) {
+ spin_unlock(&last_ptr->refill_lock);
+
+ wait_block_group_cache_progress(block_group,
+ num_bytes + empty_cluster + empty_size);
+ goto have_block_group;
}
+
/*
* at this point we either didn't find a cluster
* or we weren't able to allocate a block from our
* cluster. Free the cluster we've been trying
* to use, and go to the next block group
*/
- if (loop < 2) {
+ if (loop < LOOP_NO_EMPTY_SIZE) {
btrfs_return_cluster_to_free_space(NULL,
last_ptr);
spin_unlock(&last_ptr->refill_lock);
@@ -3633,11 +3832,17 @@ refill_cluster:
offset = btrfs_find_space_for_alloc(block_group, search_start,
num_bytes, empty_size);
- if (!offset)
+ if (!offset && (cached || (!cached &&
+ loop == LOOP_CACHING_NOWAIT))) {
goto loop;
+ } else if (!offset && (!cached &&
+ loop > LOOP_CACHING_NOWAIT)) {
+ wait_block_group_cache_progress(block_group,
+ num_bytes + empty_size);
+ goto have_block_group;
+ }
checks:
search_start = stripe_align(root, offset);
-
/* move on to the next group */
if (search_start + num_bytes >= search_end) {
btrfs_add_free_space(block_group, offset, num_bytes);
@@ -3683,13 +3888,26 @@ loop:
}
up_read(&space_info->groups_sem);
- /* loop == 0, try to find a clustered alloc in every block group
- * loop == 1, try again after forcing a chunk allocation
- * loop == 2, set empty_size and empty_cluster to 0 and try again
+ /* LOOP_CACHED_ONLY, only search fully cached block groups
+ * LOOP_CACHING_NOWAIT, search partially cached block groups, but
+ * dont wait foR them to finish caching
+ * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
+ * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
+ * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
+ * again
*/
- if (!ins->objectid && loop < 3 &&
- (empty_size || empty_cluster || allowed_chunk_alloc)) {
- if (loop >= 2) {
+ if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE &&
+ (found_uncached_bg || empty_size || empty_cluster ||
+ allowed_chunk_alloc)) {
+ if (found_uncached_bg) {
+ found_uncached_bg = false;
+ if (loop < LOOP_CACHING_WAIT) {
+ loop++;
+ goto search;
+ }
+ }
+
+ if (loop == LOOP_ALLOC_CHUNK) {
empty_size = 0;
empty_cluster = 0;
}
@@ -3702,7 +3920,7 @@ loop:
space_info->force_alloc = 1;
}
- if (loop < 3) {
+ if (loop < LOOP_NO_EMPTY_SIZE) {
loop++;
goto search;
}
@@ -3798,7 +4016,7 @@ again:
num_bytes, data, 1);
goto again;
}
- if (ret) {
+ if (ret == -ENOSPC) {
struct btrfs_space_info *sinfo;
sinfo = __find_space_info(root->fs_info, data);
@@ -3806,7 +4024,6 @@ again:
"wanted %llu\n", (unsigned long long)data,
(unsigned long long)num_bytes);
dump_space_info(sinfo, num_bytes);
- BUG();
}
return ret;
@@ -3844,7 +4061,9 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size,
empty_size, hint_byte, search_end, ins,
data);
- update_reserved_extents(root, ins->objectid, ins->offset, 1);
+ if (!ret)
+ update_reserved_extents(root, ins->objectid, ins->offset, 1);
+
return ret;
}
@@ -4006,9 +4225,9 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_block_group_cache *block_group;
block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
- mutex_lock(&block_group->cache_mutex);
- cache_block_group(root, block_group);
- mutex_unlock(&block_group->cache_mutex);
+ cache_block_group(block_group);
+ wait_event(block_group->caching_q,
+ block_group_cache_done(block_group));
ret = btrfs_remove_free_space(block_group, ins->objectid,
ins->offset);
@@ -4039,7 +4258,8 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans,
ret = __btrfs_reserve_extent(trans, root, num_bytes, num_bytes,
empty_size, hint_byte, search_end,
ins, 0);
- BUG_ON(ret);
+ if (ret)
+ return ret;
if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
if (parent == 0)
@@ -6955,11 +7175,16 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
&info->block_group_cache_tree);
spin_unlock(&info->block_group_cache_lock);
- btrfs_remove_free_space_cache(block_group);
down_write(&block_group->space_info->groups_sem);
list_del(&block_group->list);
up_write(&block_group->space_info->groups_sem);
+ if (block_group->cached == BTRFS_CACHE_STARTED)
+ wait_event(block_group->caching_q,
+ block_group_cache_done(block_group));
+
+ btrfs_remove_free_space_cache(block_group);
+
WARN_ON(atomic_read(&block_group->count) != 1);
kfree(block_group);
@@ -7025,9 +7250,19 @@ int btrfs_read_block_groups(struct btrfs_root *root)
atomic_set(&cache->count, 1);
spin_lock_init(&cache->lock);
spin_lock_init(&cache->tree_lock);
- mutex_init(&cache->cache_mutex);
+ cache->fs_info = info;
+ init_waitqueue_head(&cache->caching_q);
INIT_LIST_HEAD(&cache->list);
INIT_LIST_HEAD(&cache->cluster_list);
+
+ /*
+ * we only want to have 32k of ram per block group for keeping
+ * track of free space, and if we pass 1/2 of that we want to
+ * start converting things over to using bitmaps
+ */
+ cache->extents_thresh = ((1024 * 32) / 2) /
+ sizeof(struct btrfs_free_space);
+
read_extent_buffer(leaf, &cache->item,
btrfs_item_ptr_offset(leaf, path->slots[0]),
sizeof(cache->item));
@@ -7036,6 +7271,26 @@ int btrfs_read_block_groups(struct btrfs_root *root)
key.objectid = found_key.objectid + found_key.offset;
btrfs_release_path(root, path);
cache->flags = btrfs_block_group_flags(&cache->item);
+ cache->sectorsize = root->sectorsize;
+
+ remove_sb_from_cache(root, cache);
+
+ /*
+ * check for two cases, either we are full, and therefore
+ * don't need to bother with the caching work since we won't
+ * find any space, or we are empty, and we can just add all
+ * the space in and be done with it. This saves us _alot_ of
+ * time, particularly in the full case.
+ */
+ if (found_key.offset == btrfs_block_group_used(&cache->item)) {
+ cache->cached = BTRFS_CACHE_FINISHED;
+ } else if (btrfs_block_group_used(&cache->item) == 0) {
+ cache->cached = BTRFS_CACHE_FINISHED;
+ add_new_free_space(cache, root->fs_info,
+ found_key.objectid,
+ found_key.objectid +
+ found_key.offset);
+ }
ret = update_space_info(info, cache->flags, found_key.offset,
btrfs_block_group_used(&cache->item),
@@ -7079,10 +7334,19 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
cache->key.objectid = chunk_offset;
cache->key.offset = size;
cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
+ cache->sectorsize = root->sectorsize;
+
+ /*
+ * we only want to have 32k of ram per block group for keeping track
+ * of free space, and if we pass 1/2 of that we want to start
+ * converting things over to using bitmaps
+ */
+ cache->extents_thresh = ((1024 * 32) / 2) /
+ sizeof(struct btrfs_free_space);
atomic_set(&cache->count, 1);
spin_lock_init(&cache->lock);
spin_lock_init(&cache->tree_lock);
- mutex_init(&cache->cache_mutex);
+ init_waitqueue_head(&cache->caching_q);
INIT_LIST_HEAD(&cache->list);
INIT_LIST_HEAD(&cache->cluster_list);
@@ -7091,6 +7355,12 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
cache->flags = type;
btrfs_set_block_group_flags(&cache->item, type);
+ cache->cached = BTRFS_CACHE_FINISHED;
+ remove_sb_from_cache(root, cache);
+
+ add_new_free_space(cache, root->fs_info, chunk_offset,
+ chunk_offset + size);
+
ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
&cache->space_info);
BUG_ON(ret);
@@ -7149,7 +7419,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
rb_erase(&block_group->cache_node,
&root->fs_info->block_group_cache_tree);
spin_unlock(&root->fs_info->block_group_cache_lock);
- btrfs_remove_free_space_cache(block_group);
+
down_write(&block_group->space_info->groups_sem);
/*
* we must use list_del_init so people can check to see if they
@@ -7158,11 +7428,18 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
list_del_init(&block_group->list);
up_write(&block_group->space_info->groups_sem);
+ if (block_group->cached == BTRFS_CACHE_STARTED)
+ wait_event(block_group->caching_q,
+ block_group_cache_done(block_group));
+
+ btrfs_remove_free_space_cache(block_group);
+
spin_lock(&block_group->space_info->lock);
block_group->space_info->total_bytes -= block_group->key.offset;
block_group->space_info->bytes_readonly -= block_group->key.offset;
spin_unlock(&block_group->space_info->lock);
- block_group->space_info->full = 0;
+
+ btrfs_clear_space_info_full(root->fs_info);
btrfs_put_block_group(block_group);
btrfs_put_block_group(block_group);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 4538e48581a..5edcee3a617 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -16,45 +16,46 @@
* Boston, MA 021110-1307, USA.
*/
+#include <linux/pagemap.h>
#include <linux/sched.h>
+#include <linux/math64.h>
#include "ctree.h"
#include "free-space-cache.h"
#include "transaction.h"
-struct btrfs_free_space {
- struct rb_node bytes_index;
- struct rb_node offset_index;
- u64 offset;
- u64 bytes;
-};
+#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8)
+#define MAX_CACHE_BYTES_PER_GIG (32 * 1024)
-static int tree_insert_offset(struct rb_root *root, u64 offset,
- struct rb_node *node)
+static inline unsigned long offset_to_bit(u64 bitmap_start, u64 sectorsize,
+ u64 offset)
{
- struct rb_node **p = &root->rb_node;
- struct rb_node *parent = NULL;
- struct btrfs_free_space *info;
+ BUG_ON(offset < bitmap_start);
+ offset -= bitmap_start;
+ return (unsigned long)(div64_u64(offset, sectorsize));
+}
- while (*p) {
- parent = *p;
- info = rb_entry(parent, struct btrfs_free_space, offset_index);
+static inline unsigned long bytes_to_bits(u64 bytes, u64 sectorsize)
+{
+ return (unsigned long)(div64_u64(bytes, sectorsize));
+}
- if (offset < info->offset)
- p = &(*p)->rb_left;
- else if (offset > info->offset)
- p = &(*p)->rb_right;
- else
- return -EEXIST;
- }
+static inline u64 offset_to_bitmap(struct btrfs_block_group_cache *block_group,
+ u64 offset)
+{
+ u64 bitmap_start;
+ u64 bytes_per_bitmap;
- rb_link_node(node, parent, p);
- rb_insert_color(node, root);
+ bytes_per_bitmap = BITS_PER_BITMAP * block_group->sectorsize;
+ bitmap_start = offset - block_group->key.objectid;
+ bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
+ bitmap_start *= bytes_per_bitmap;
+ bitmap_start += block_group->key.objectid;
- return 0;
+ return bitmap_start;
}
-static int tree_insert_bytes(struct rb_root *root, u64 bytes,
- struct rb_node *node)
+static int tree_insert_offset(struct rb_root *root, u64 offset,
+ struct rb_node *node, int bitmap)
{
struct rb_node **p = &root->rb_node;
struct rb_node *parent = NULL;
@@ -62,12 +63,34 @@ static int tree_insert_bytes(struct rb_root *root, u64 bytes,
while (*p) {
parent = *p;
- info = rb_entry(parent, struct btrfs_free_space, bytes_index);
+ info = rb_entry(parent, struct btrfs_free_space, offset_index);
- if (bytes < info->bytes)
+ if (offset < info->offset) {
p = &(*p)->rb_left;
- else
+ } else if (offset > info->offset) {
p = &(*p)->rb_right;
+ } else {
+ /*
+ * we could have a bitmap entry and an extent entry
+ * share the same offset. If this is the case, we want
+ * the extent entry to always be found first if we do a
+ * linear search through the tree, since we want to have
+ * the quickest allocation time, and allocating from an
+ * extent is faster than allocating from a bitmap. So
+ * if we're inserting a bitmap and we find an entry at
+ * this offset, we want to go right, or after this entry
+ * logically. If we are inserting an extent and we've
+ * found a bitmap, we want to go left, or before
+ * logically.
+ */
+ if (bitmap) {
+ WARN_ON(info->bitmap);
+ p = &(*p)->rb_right;
+ } else {
+ WARN_ON(!info->bitmap);
+ p = &(*p)->rb_left;
+ }
+ }
}
rb_link_node(node, parent, p);
@@ -79,110 +102,143 @@ static int tree_insert_bytes(struct rb_root *root, u64 bytes,
/*
* searches the tree for the given offset.
*
- * fuzzy == 1: this is used for allocations where we are given a hint of where
- * to look for free space. Because the hint may not be completely on an offset
- * mark, or the hint may no longer point to free space we need to fudge our
- * results a bit. So we look for free space starting at or after offset with at
- * least bytes size. We prefer to find as close to the given offset as we can.
- * Also if the offset is within a free space range, then we will return the free
- * space that contains the given offset, which means we can return a free space
- * chunk with an offset before the provided offset.
- *
- * fuzzy == 0: this is just a normal tree search. Give us the free space that
- * starts at the given offset which is at least bytes size, and if its not there
- * return NULL.
+ * fuzzy - If this is set, then we are trying to make an allocation, and we just
+ * want a section that has at least bytes size and comes at or after the given
+ * offset.
*/
-static struct btrfs_free_space *tree_search_offset(struct rb_root *root,
- u64 offset, u64 bytes,
- int fuzzy)
+static struct btrfs_free_space *
+tree_search_offset(struct btrfs_block_group_cache *block_group,
+ u64 offset, int bitmap_only, int fuzzy)
{
- struct rb_node *n = root->rb_node;
- struct btrfs_free_space *entry, *ret = NULL;
+ struct rb_node *n = block_group->free_space_offset.rb_node;
+ struct btrfs_free_space *entry, *prev = NULL;
+
+ /* find entry that is closest to the 'offset' */
+ while (1) {
+ if (!n) {
+ entry = NULL;
+ break;
+ }
- while (n) {
entry = rb_entry(n, struct btrfs_free_space, offset_index);
+ prev = entry;
- if (offset < entry->offset) {
- if (fuzzy &&
- (!ret || entry->offset < ret->offset) &&
- (bytes <= entry->bytes))
- ret = entry;
+ if (offset < entry->offset)
n = n->rb_left;
- } else if (offset > entry->offset) {
- if (fuzzy &&
- (entry->offset + entry->bytes - 1) >= offset &&
- bytes <= entry->bytes) {
- ret = entry;
- break;
- }
+ else if (offset > entry->offset)
n = n->rb_right;
- } else {
- if (bytes > entry->bytes) {
- n = n->rb_right;
- continue;
- }
- ret = entry;
+ else
break;
- }
}
- return ret;
-}
+ if (bitmap_only) {
+ if (!entry)
+ return NULL;
+ if (entry->bitmap)
+ return entry;
-/*
- * return a chunk at least bytes size, as close to offset that we can get.
- */
-static struct btrfs_free_space *tree_search_bytes(struct rb_root *root,
- u64 offset, u64 bytes)
-{
- struct rb_node *n = root->rb_node;
- struct btrfs_free_space *entry, *ret = NULL;
-
- while (n) {
- entry = rb_entry(n, struct btrfs_free_space, bytes_index);
+ /*
+ * bitmap entry and extent entry may share same offset,
+ * in that case, bitmap entry comes after extent entry.
+ */
+ n = rb_next(n);
+ if (!n)
+ return NULL;
+ entry = rb_entry(n, struct btrfs_free_space, offset_index);
+ if (entry->offset != offset)
+ return NULL;
- if (bytes < entry->bytes) {
+ WARN_ON(!entry->bitmap);
+ return entry;
+ } else if (entry) {
+ if (entry->bitmap) {
/*
- * We prefer to get a hole size as close to the size we
- * are asking for so we don't take small slivers out of
- * huge holes, but we also want to get as close to the
- * offset as possible so we don't have a whole lot of
- * fragmentation.
+ * if previous extent entry covers the offset,
+ * we should return it instead of the bitmap entry
*/
- if (offset <= entry->offset) {
- if (!ret)
- ret = entry;
- else if (entry->bytes < ret->bytes)
- ret = entry;
- else if (entry->offset < ret->offset)
- ret = entry;
+ n = &entry->offset_index;
+ while (1) {
+ n = rb_prev(n);
+ if (!n)
+ break;
+ prev = rb_entry(n, struct btrfs_free_space,
+ offset_index);
+ if (!prev->bitmap) {
+ if (prev->offset + prev->bytes > offset)
+ entry = prev;
+ break;
+ }
}
- n = n->rb_left;
- } else if (bytes > entry->bytes) {
- n = n->rb_right;
+ }
+ return entry;
+ }
+
+ if (!prev)
+ return NULL;
+
+ /* find last entry before the 'offset' */
+ entry = prev;
+ if (entry->offset > offset) {
+ n = rb_prev(&entry->offset_index);
+ if (n) {
+ entry = rb_entry(n, struct btrfs_free_space,
+ offset_index);
+ BUG_ON(entry->offset > offset);
} else {
- /*
- * Ok we may have multiple chunks of the wanted size,
- * so we don't want to take the first one we find, we
- * want to take the one closest to our given offset, so
- * keep searching just in case theres a better match.
- */
- n = n->rb_right;
- if (offset > entry->offset)
- continue;
- else if (!ret || entry->offset < ret->offset)
- ret = entry;
+ if (fuzzy)
+ return entry;
+ else
+ return NULL;
}
}
- return ret;
+ if (entry->bitmap) {
+ n = &entry->offset_index;
+ while (1) {
+ n = rb_prev(n);
+ if (!n)
+ break;
+ prev = rb_entry(n, struct btrfs_free_space,
+ offset_index);
+ if (!prev->bitmap) {
+ if (prev->offset + prev->bytes > offset)
+ return prev;
+ break;
+ }
+ }
+ if (entry->offset + BITS_PER_BITMAP *
+ block_group->sectorsize > offset)
+ return entry;
+ } else if (entry->offset + entry->bytes > offset)
+ return entry;
+
+ if (!fuzzy)
+ return NULL;
+
+ while (1) {
+ if (entry->bitmap) {
+ if (entry->offset + BITS_PER_BITMAP *
+ block_group->sectorsize > offset)
+ break;
+ } else {
+ if (entry->offset + entry->bytes > offset)
+ break;
+ }
+
+ n = rb_next(&entry->offset_index);
+ if (!n)
+ return NULL;
+ entry = rb_entry(n, struct btrfs_free_space, offset_index);
+ }
+ return entry;
}
static void unlink_free_space(struct btrfs_block_group_cache *block_group,
struct btrfs_free_space *info)
{
rb_erase(&info->offset_index, &block_group->free_space_offset);
- rb_erase(&info->bytes_index, &block_group->free_space_bytes);
+ block_group->free_extents--;
+ block_group->free_space -= info->bytes;
}
static int link_free_space(struct btrfs_block_group_cache *block_group,
@@ -190,17 +246,353 @@ static int link_free_space(struct btrfs_block_group_cache *block_group,
{
int ret = 0;
-
- BUG_ON(!info->bytes);
+ BUG_ON(!info->bitmap && !info->bytes);
ret = tree_insert_offset(&block_group->free_space_offset, info->offset,
- &info->offset_index);
+ &info->offset_index, (info->bitmap != NULL));
if (ret)
return ret;
- ret = tree_insert_bytes(&block_group->free_space_bytes, info->bytes,
- &info->bytes_index);
- if (ret)
- return ret;
+ block_group->free_space += info->bytes;
+ block_group->free_extents++;
+ return ret;
+}
+
+static void recalculate_thresholds(struct btrfs_block_group_cache *block_group)
+{
+ u64 max_bytes, possible_bytes;
+
+ /*
+ * The goal is to keep the total amount of memory used per 1gb of space
+ * at or below 32k, so we need to adjust how much memory we allow to be
+ * used by extent based free space tracking
+ */
+ max_bytes = MAX_CACHE_BYTES_PER_GIG *
+ (div64_u64(block_group->key.offset, 1024 * 1024 * 1024));
+
+ possible_bytes = (block_group->total_bitmaps * PAGE_CACHE_SIZE) +
+ (sizeof(struct btrfs_free_space) *
+ block_group->extents_thresh);
+
+ if (possible_bytes > max_bytes) {
+ int extent_bytes = max_bytes -
+ (block_group->total_bitmaps * PAGE_CACHE_SIZE);
+
+ if (extent_bytes <= 0) {
+ block_group->extents_thresh = 0;
+ return;
+ }
+
+ block_group->extents_thresh = extent_bytes /
+ (sizeof(struct btrfs_free_space));
+ }
+}
+
+static void bitmap_clear_bits(struct btrfs_block_group_cache *block_group,
+ struct btrfs_free_space *info, u64 offset,
+ u64 bytes)
+{
+ unsigned long start, end;
+ unsigned long i;
+
+ start = offset_to_bit(info->offset, block_group->sectorsize, offset);
+ end = start + bytes_to_bits(bytes, block_group->sectorsize);
+ BUG_ON(end > BITS_PER_BITMAP);
+
+ for (i = start; i < end; i++)
+ clear_bit(i, info->bitmap);
+
+ info->bytes -= bytes;
+ block_group->free_space -= bytes;
+}
+
+static void bitmap_set_bits(struct btrfs_block_group_cache *block_group,
+ struct btrfs_free_space *info, u64 offset,
+ u64 bytes)
+{
+ unsigned long start, end;
+ unsigned long i;
+
+ start = offset_to_bit(info->offset, block_group->sectorsize, offset);
+ end = start + bytes_to_bits(bytes, block_group->sectorsize);
+ BUG_ON(end > BITS_PER_BITMAP);
+
+ for (i = start; i < end; i++)
+ set_bit(i, info->bitmap);
+
+ info->bytes += bytes;
+ block_group->free_space += bytes;
+}
+
+static int search_bitmap(struct btrfs_block_group_cache *block_group,
+ struct btrfs_free_space *bitmap_info, u64 *offset,
+ u64 *bytes)
+{
+ unsigned long found_bits = 0;
+ unsigned long bits, i;
+ unsigned long next_zero;
+
+ i = offset_to_bit(bitmap_info->offset, block_group->sectorsize,
+ max_t(u64, *offset, bitmap_info->offset));
+ bits = bytes_to_bits(*bytes, block_group->sectorsize);
+
+ for (i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i);
+ i < BITS_PER_BITMAP;
+ i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i + 1)) {
+ next_zero = find_next_zero_bit(bitmap_info->bitmap,
+ BITS_PER_BITMAP, i);
+ if ((next_zero - i) >= bits) {
+ found_bits = next_zero - i;
+ break;
+ }
+ i = next_zero;
+ }
+
+ if (found_bits) {
+ *offset = (u64)(i * block_group->sectorsize) +
+ bitmap_info->offset;
+ *bytes = (u64)(found_bits) * block_group->sectorsize;
+ return 0;
+ }
+
+ return -1;
+}
+
+static struct btrfs_free_space *find_free_space(struct btrfs_block_group_cache
+ *block_group, u64 *offset,
+ u64 *bytes, int debug)
+{
+ struct btrfs_free_space *entry;
+ struct rb_node *node;
+ int ret;
+
+ if (!block_group->free_space_offset.rb_node)
+ return NULL;
+
+ entry = tree_search_offset(block_group,
+ offset_to_bitmap(block_group, *offset),
+ 0, 1);
+ if (!entry)
+ return NULL;
+
+ for (node = &entry->offset_index; node; node = rb_next(node)) {
+ entry = rb_entry(node, struct btrfs_free_space, offset_index);
+ if (entry->bytes < *bytes)
+ continue;
+
+ if (entry->bitmap) {
+ ret = search_bitmap(block_group, entry, offset, bytes);
+ if (!ret)
+ return entry;
+ continue;
+ }
+
+ *offset = entry->offset;
+ *bytes = entry->bytes;
+ return entry;
+ }
+
+ return NULL;
+}
+
+static void add_new_bitmap(struct btrfs_block_group_cache *block_group,
+ struct btrfs_free_space *info, u64 offset)
+{
+ u64 bytes_per_bg = BITS_PER_BITMAP * block_group->sectorsize;
+ int max_bitmaps = (int)div64_u64(block_group->key.offset +
+ bytes_per_bg - 1, bytes_per_bg);
+ BUG_ON(block_group->total_bitmaps >= max_bitmaps);
+
+ info->offset = offset_to_bitmap(block_group, offset);
+ link_free_space(block_group, info);
+ block_group->total_bitmaps++;
+
+ recalculate_thresholds(block_group);
+}
+
+static noinline int remove_from_bitmap(struct btrfs_block_group_cache *block_group,
+ struct btrfs_free_space *bitmap_info,
+ u64 *offset, u64 *bytes)
+{
+ u64 end;
+ u64 search_start, search_bytes;
+ int ret;
+
+again:
+ end = bitmap_info->offset +
+ (u64)(BITS_PER_BITMAP * block_group->sectorsize) - 1;
+
+ /*
+ * XXX - this can go away after a few releases.
+ *
+ * since the only user of btrfs_remove_free_space is the tree logging
+ * stuff, and the only way to test that is under crash conditions, we
+ * want to have this debug stuff here just in case somethings not
+ * working. Search the bitmap for the space we are trying to use to
+ * make sure its actually there. If its not there then we need to stop
+ * because something has gone wrong.
+ */
+ search_start = *offset;
+ search_bytes = *bytes;
+ ret = search_bitmap(block_group, bitmap_info, &search_start,
+ &search_bytes);
+ BUG_ON(ret < 0 || search_start != *offset);
+
+ if (*offset > bitmap_info->offset && *offset + *bytes > end) {
+ bitmap_clear_bits(block_group, bitmap_info, *offset,
+ end - *offset + 1);
+ *bytes -= end - *offset + 1;
+ *offset = end + 1;
+ } else if (*offset >= bitmap_info->offset && *offset + *bytes <= end) {
+ bitmap_clear_bits(block_group, bitmap_info, *offset, *bytes);
+ *bytes = 0;
+ }
+
+ if (*bytes) {
+ struct rb_node *next = rb_next(&bitmap_info->offset_index);
+ if (!bitmap_info->bytes) {
+ unlink_free_space(block_group, bitmap_info);
+ kfree(bitmap_info->bitmap);
+ kfree(bitmap_info);
+ block_group->total_bitmaps--;
+ recalculate_thresholds(block_group);
+ }
+
+ /*
+ * no entry after this bitmap, but we still have bytes to
+ * remove, so something has gone wrong.
+ */
+ if (!next)
+ return -EINVAL;
+
+ bitmap_info = rb_entry(next, struct btrfs_free_space,
+ offset_index);
+
+ /*
+ * if the next entry isn't a bitmap we need to return to let the
+ * extent stuff do its work.
+ */
+ if (!bitmap_info->bitmap)
+ return -EAGAIN;
+
+ /*
+ * Ok the next item is a bitmap, but it may not actually hold
+ * the information for the rest of this free space stuff, so
+ * look for it, and if we don't find it return so we can try
+ * everything over again.
+ */
+ search_start = *offset;
+ search_bytes = *bytes;
+ ret = search_bitmap(block_group, bitmap_info, &search_start,
+ &search_bytes);
+ if (ret < 0 || search_start != *offset)
+ return -EAGAIN;
+
+ goto again;
+ } else if (!bitmap_info->bytes) {
+ unlink_free_space(block_group, bitmap_info);
+ kfree(bitmap_info->bitmap);
+ kfree(bitmap_info);
+ block_group->total_bitmaps--;
+ recalculate_thresholds(block_group);
+ }
+
+ return 0;
+}
+
+static int insert_into_bitmap(struct btrfs_block_group_cache *block_group,
+ struct btrfs_free_space *info)
+{
+ struct btrfs_free_space *bitmap_info;
+ int added = 0;
+ u64 bytes, offset, end;
+ int ret;
+
+ /*
+ * If we are below the extents threshold then we can add this as an
+ * extent, and don't have to deal with the bitmap
+ */
+ if (block_group->free_extents < block_group->extents_thresh &&
+ info->bytes > block_group->sectorsize * 4)
+ return 0;
+
+ /*
+ * some block groups are so tiny they can't be enveloped by a bitmap, so
+ * don't even bother to create a bitmap for this
+ */
+ if (BITS_PER_BITMAP * block_group->sectorsize >
+ block_group->key.offset)
+ return 0;
+
+ bytes = info->bytes;
+ offset = info->offset;
+
+again:
+ bitmap_info = tree_search_offset(block_group,
+ offset_to_bitmap(block_group, offset),
+ 1, 0);
+ if (!bitmap_info) {
+ BUG_ON(added);
+ goto new_bitmap;
+ }
+
+ end = bitmap_info->offset +
+ (u64)(BITS_PER_BITMAP * block_group->sectorsize);
+
+ if (offset >= bitmap_info->offset && offset + bytes > end) {
+ bitmap_set_bits(block_group, bitmap_info, offset,
+ end - offset);
+ bytes -= end - offset;
+ offset = end;
+ added = 0;
+ } else if (offset >= bitmap_info->offset && offset + bytes <= end) {
+ bitmap_set_bits(block_group, bitmap_info, offset, bytes);
+ bytes = 0;
+ } else {
+ BUG();
+ }
+
+ if (!bytes) {
+ ret = 1;
+ goto out;
+ } else
+ goto again;
+
+new_bitmap:
+ if (info && info->bitmap) {
+ add_new_bitmap(block_group, info, offset);
+ added = 1;
+ info = NULL;
+ goto again;
+ } else {
+ spin_unlock(&block_group->tree_lock);
+
+ /* no pre-allocated info, allocate a new one */
+ if (!info) {
+ info = kzalloc(sizeof(struct btrfs_free_space),
+ GFP_NOFS);
+ if (!info) {
+ spin_lock(&block_group->tree_lock);
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
+ /* allocate the bitmap */
+ info->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
+ spin_lock(&block_group->tree_lock);
+ if (!info->bitmap) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ goto again;
+ }
+
+out:
+ if (info) {
+ if (info->bitmap)
+ kfree(info->bitmap);
+ kfree(info);
+ }
return ret;
}
@@ -208,8 +600,8 @@ static int link_free_space(struct btrfs_block_group_cache *block_group,
int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
u64 offset, u64 bytes)
{
- struct btrfs_free_space *right_info;
- struct btrfs_free_space *left_info;
+ struct btrfs_free_space *right_info = NULL;
+ struct btrfs_free_space *left_info = NULL;
struct btrfs_free_space *info = NULL;
int ret = 0;
@@ -227,18 +619,38 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
* are adding, if there is remove that struct and add a new one to
* cover the entire range
*/
- right_info = tree_search_offset(&block_group->free_space_offset,
- offset+bytes, 0, 0);
- left_info = tree_search_offset(&block_group->free_space_offset,
- offset-1, 0, 1);
+ right_info = tree_search_offset(block_group, offset + bytes, 0, 0);
+ if (right_info && rb_prev(&right_info->offset_index))
+ left_info = rb_entry(rb_prev(&right_info->offset_index),
+ struct btrfs_free_space, offset_index);
+ else
+ left_info = tree_search_offset(block_group, offset - 1, 0, 0);
+
+ /*
+ * If there was no extent directly to the left or right of this new
+ * extent then we know we're going to have to allocate a new extent, so
+ * before we do that see if we need to drop this into a bitmap
+ */
+ if ((!left_info || left_info->bitmap) &&
+ (!right_info || right_info->bitmap)) {
+ ret = insert_into_bitmap(block_group, info);
+
+ if (ret < 0) {
+ goto out;
+ } else if (ret) {
+ ret = 0;
+ goto out;
+ }
+ }
- if (right_info) {
+ if (right_info && !right_info->bitmap) {
unlink_free_space(block_group, right_info);
info->bytes += right_info->bytes;
kfree(right_info);
}
- if (left_info && left_info->offset + left_info->bytes == offset) {
+ if (left_info && !left_info->bitmap &&
+ left_info->offset + left_info->bytes == offset) {
unlink_free_space(block_group, left_info);
info->offset = left_info->offset;
info->bytes += left_info->bytes;
@@ -248,11 +660,11 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
ret = link_free_space(block_group, info);
if (ret)
kfree(info);
-
+out:
spin_unlock(&block_group->tree_lock);
if (ret) {
- printk(KERN_ERR "btrfs: unable to add free space :%d\n", ret);
+ printk(KERN_CRIT "btrfs: unable to add free space :%d\n", ret);
BUG_ON(ret == -EEXIST);
}
@@ -263,40 +675,74 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
u64 offset, u64 bytes)
{
struct btrfs_free_space *info;
+ struct btrfs_free_space *next_info = NULL;
int ret = 0;
spin_lock(&block_group->tree_lock);
- info = tree_search_offset(&block_group->free_space_offset, offset, 0,
- 1);
- if (info && info->offset == offset) {
- if (info->bytes < bytes) {
- printk(KERN_ERR "Found free space at %llu, size %llu,"
- "trying to use %llu\n",
- (unsigned long long)info->offset,
- (unsigned long long)info->bytes,
- (unsigned long long)bytes);
+again:
+ info = tree_search_offset(block_group, offset, 0, 0);
+ if (!info) {
+ /*
+ * oops didn't find an extent that matched the space we wanted
+ * to remove, look for a bitmap instead
+ */
+ info = tree_search_offset(block_group,
+ offset_to_bitmap(block_group, offset),
+ 1, 0);
+ if (!info) {
+ WARN_ON(1);
+ goto out_lock;
+ }
+ }
+
+ if (info->bytes < bytes && rb_next(&info->offset_index)) {
+ u64 end;
+ next_info = rb_entry(rb_next(&info->offset_index),
+ struct btrfs_free_space,
+ offset_index);
+
+ if (next_info->bitmap)
+ end = next_info->offset + BITS_PER_BITMAP *
+ block_group->sectorsize - 1;
+ else
+ end = next_info->offset + next_info->bytes;
+
+ if (next_info->bytes < bytes ||
+ next_info->offset > offset || offset > end) {
+ printk(KERN_CRIT "Found free space at %llu, size %llu,"
+ " trying to use %llu\n",
+ (unsigned long long)info->offset,
+ (unsigned long long)info->bytes,
+ (unsigned long long)bytes);
WARN_ON(1);
ret = -EINVAL;
- spin_unlock(&block_group->tree_lock);
- goto out;
+ goto out_lock;
}
- unlink_free_space(block_group, info);
- if (info->bytes == bytes) {
- kfree(info);
- spin_unlock(&block_group->tree_lock);
- goto out;
+ info = next_info;
+ }
+
+ if (info->bytes == bytes) {
+ unlink_free_space(block_group, info);
+ if (info->bitmap) {
+ kfree(info->bitmap);
+ block_group->total_bitmaps--;
}
+ kfree(info);
+ goto out_lock;
+ }
+ if (!info->bitmap && info->offset == offset) {
+ unlink_free_space(block_group, info);
info->offset += bytes;
info->bytes -= bytes;
+ link_free_space(block_group, info);
+ goto out_lock;
+ }
- ret = link_free_space(block_group, info);
- spin_unlock(&block_group->tree_lock);
- BUG_ON(ret);
- } else if (info && info->offset < offset &&
- info->offset + info->bytes >= offset + bytes) {
+ if (!info->bitmap && info->offset <= offset &&
+ info->offset + info->bytes >= offset + bytes) {
u64 old_start = info->offset;
/*
* we're freeing space in the middle of the info,
@@ -312,7 +758,9 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
info->offset = offset + bytes;
info->bytes = old_end - info->offset;
ret = link_free_space(block_group, info);
- BUG_ON(ret);
+ WARN_ON(ret);
+ if (ret)
+ goto out_lock;
} else {
/* the hole we're creating ends at the end
* of the info struct, just free the info
@@ -320,32 +768,22 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
kfree(info);
}
spin_unlock(&block_group->tree_lock);
- /* step two, insert a new info struct to cover anything
- * before the hole
+
+ /* step two, insert a new info struct to cover
+ * anything before the hole
*/
ret = btrfs_add_free_space(block_group, old_start,
offset - old_start);
- BUG_ON(ret);
- } else {
- spin_unlock(&block_group->tree_lock);
- if (!info) {
- printk(KERN_ERR "couldn't find space %llu to free\n",
- (unsigned long long)offset);
- printk(KERN_ERR "cached is %d, offset %llu bytes %llu\n",
- block_group->cached,
- (unsigned long long)block_group->key.objectid,
- (unsigned long long)block_group->key.offset);
- btrfs_dump_free_space(block_group, bytes);
- } else if (info) {
- printk(KERN_ERR "hmm, found offset=%llu bytes=%llu, "
- "but wanted offset=%llu bytes=%llu\n",
- (unsigned long long)info->offset,
- (unsigned long long)info->bytes,
- (unsigned long long)offset,
- (unsigned long long)bytes);
- }
- WARN_ON(1);
+ WARN_ON(ret);
+ goto out;
}
+
+ ret = remove_from_bitmap(block_group, info, &offset, &bytes);
+ if (ret == -EAGAIN)
+ goto again;
+ BUG_ON(ret);
+out_lock:
+ spin_unlock(&block_group->tree_lock);
out:
return ret;
}
@@ -361,10 +799,13 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
info = rb_entry(n, struct btrfs_free_space, offset_index);
if (info->bytes >= bytes)
count++;
- printk(KERN_ERR "entry offset %llu, bytes %llu\n",
+ printk(KERN_CRIT "entry offset %llu, bytes %llu, bitmap %s\n",
(unsigned long long)info->offset,
- (unsigned long long)info->bytes);
+ (unsigned long long)info->bytes,
+ (info->bitmap) ? "yes" : "no");
}
+ printk(KERN_INFO "block group has cluster?: %s\n",
+ list_empty(&block_group->cluster_list) ? "no" : "yes");
printk(KERN_INFO "%d blocks of free space at or bigger than bytes is"
"\n", count);
}
@@ -397,26 +838,35 @@ __btrfs_return_cluster_to_free_space(
{
struct btrfs_free_space *entry;
struct rb_node *node;
+ bool bitmap;
spin_lock(&cluster->lock);
if (cluster->block_group != block_group)
goto out;
+ bitmap = cluster->points_to_bitmap;
+ cluster->block_group = NULL;
cluster->window_start = 0;
+ list_del_init(&cluster->block_group_list);
+ cluster->points_to_bitmap = false;
+
+ if (bitmap)
+ goto out;
+
node = rb_first(&cluster->root);
- while(node) {
+ while (node) {
entry = rb_entry(node, struct btrfs_free_space, offset_index);
node = rb_next(&entry->offset_index);
rb_erase(&entry->offset_index, &cluster->root);
- link_free_space(block_group, entry);
+ BUG_ON(entry->bitmap);
+ tree_insert_offset(&block_group->free_space_offset,
+ entry->offset, &entry->offset_index, 0);
}
- list_del_init(&cluster->block_group_list);
-
- btrfs_put_block_group(cluster->block_group);
- cluster->block_group = NULL;
cluster->root.rb_node = NULL;
+
out:
spin_unlock(&cluster->lock);
+ btrfs_put_block_group(block_group);
return 0;
}
@@ -425,20 +875,28 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
struct btrfs_free_space *info;
struct rb_node *node;
struct btrfs_free_cluster *cluster;
- struct btrfs_free_cluster *safe;
+ struct list_head *head;
spin_lock(&block_group->tree_lock);
-
- list_for_each_entry_safe(cluster, safe, &block_group->cluster_list,
- block_group_list) {
+ while ((head = block_group->cluster_list.next) !=
+ &block_group->cluster_list) {
+ cluster = list_entry(head, struct btrfs_free_cluster,
+ block_group_list);
WARN_ON(cluster->block_group != block_group);
__btrfs_return_cluster_to_free_space(block_group, cluster);
+ if (need_resched()) {
+ spin_unlock(&block_group->tree_lock);
+ cond_resched();
+ spin_lock(&block_group->tree_lock);
+ }
}
- while ((node = rb_last(&block_group->free_space_bytes)) != NULL) {
- info = rb_entry(node, struct btrfs_free_space, bytes_index);
+ while ((node = rb_last(&block_group->free_space_offset)) != NULL) {
+ info = rb_entry(node, struct btrfs_free_space, offset_index);
unlink_free_space(block_group, info);
+ if (info->bitmap)
+ kfree(info->bitmap);
kfree(info);
if (need_resched()) {
spin_unlock(&block_group->tree_lock);
@@ -446,6 +904,7 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
spin_lock(&block_group->tree_lock);
}
}
+
spin_unlock(&block_group->tree_lock);
}
@@ -453,25 +912,35 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
u64 offset, u64 bytes, u64 empty_size)
{
struct btrfs_free_space *entry = NULL;
+ u64 bytes_search = bytes + empty_size;
u64 ret = 0;
spin_lock(&block_group->tree_lock);
- entry = tree_search_offset(&block_group->free_space_offset, offset,
- bytes + empty_size, 1);
+ entry = find_free_space(block_group, &offset, &bytes_search, 0);
if (!entry)
- entry = tree_search_bytes(&block_group->free_space_bytes,
- offset, bytes + empty_size);
- if (entry) {
+ goto out;
+
+ ret = offset;
+ if (entry->bitmap) {
+ bitmap_clear_bits(block_group, entry, offset, bytes);
+ if (!entry->bytes) {
+ unlink_free_space(block_group, entry);
+ kfree(entry->bitmap);
+ kfree(entry);
+ block_group->total_bitmaps--;
+ recalculate_thresholds(block_group);
+ }
+ } else {
unlink_free_space(block_group, entry);
- ret = entry->offset;
entry->offset += bytes;
entry->bytes -= bytes;
-
if (!entry->bytes)
kfree(entry);
else
link_free_space(block_group, entry);
}
+
+out:
spin_unlock(&block_group->tree_lock);
return ret;
@@ -517,6 +986,54 @@ int btrfs_return_cluster_to_free_space(
return ret;
}
+static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
+ struct btrfs_free_cluster *cluster,
+ u64 bytes, u64 min_start)
+{
+ struct btrfs_free_space *entry;
+ int err;
+ u64 search_start = cluster->window_start;
+ u64 search_bytes = bytes;
+ u64 ret = 0;
+
+ spin_lock(&block_group->tree_lock);
+ spin_lock(&cluster->lock);
+
+ if (!cluster->points_to_bitmap)
+ goto out;
+
+ if (cluster->block_group != block_group)
+ goto out;
+
+ /*
+ * search_start is the beginning of the bitmap, but at some point it may
+ * be a good idea to point to the actual start of the free area in the
+ * bitmap, so do the offset_to_bitmap trick anyway, and set bitmap_only
+ * to 1 to make sure we get the bitmap entry
+ */
+ entry = tree_search_offset(block_group,
+ offset_to_bitmap(block_group, search_start),
+ 1, 0);
+ if (!entry || !entry->bitmap)
+ goto out;
+
+ search_start = min_start;
+ search_bytes = bytes;
+
+ err = search_bitmap(block_group, entry, &search_start,
+ &search_bytes);
+ if (err)
+ goto out;
+
+ ret = search_start;
+ bitmap_clear_bits(block_group, entry, ret, bytes);
+out:
+ spin_unlock(&cluster->lock);
+ spin_unlock(&block_group->tree_lock);
+
+ return ret;
+}
+
/*
* given a cluster, try to allocate 'bytes' from it, returns 0
* if it couldn't find anything suitably large, or a logical disk offset
@@ -530,6 +1047,10 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
struct rb_node *node;
u64 ret = 0;
+ if (cluster->points_to_bitmap)
+ return btrfs_alloc_from_bitmap(block_group, cluster, bytes,
+ min_start);
+
spin_lock(&cluster->lock);
if (bytes > cluster->max_size)
goto out;
@@ -567,9 +1088,73 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
}
out:
spin_unlock(&cluster->lock);
+
return ret;
}
+static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
+ struct btrfs_free_space *entry,
+ struct btrfs_free_cluster *cluster,
+ u64 offset, u64 bytes, u64 min_bytes)
+{
+ unsigned long next_zero;
+ unsigned long i;
+ unsigned long search_bits;
+ unsigned long total_bits;
+ unsigned long found_bits;
+ unsigned long start = 0;
+ unsigned long total_found = 0;
+ bool found = false;
+
+ i = offset_to_bit(entry->offset, block_group->sectorsize,
+ max_t(u64, offset, entry->offset));
+ search_bits = bytes_to_bits(min_bytes, block_group->sectorsize);
+ total_bits = bytes_to_bits(bytes, block_group->sectorsize);
+
+again:
+ found_bits = 0;
+ for (i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i);
+ i < BITS_PER_BITMAP;
+ i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) {
+ next_zero = find_next_zero_bit(entry->bitmap,
+ BITS_PER_BITMAP, i);
+ if (next_zero - i >= search_bits) {
+ found_bits = next_zero - i;
+ break;
+ }
+ i = next_zero;
+ }
+
+ if (!found_bits)
+ return -1;
+
+ if (!found) {
+ start = i;
+ found = true;
+ }
+
+ total_found += found_bits;
+
+ if (cluster->max_size < found_bits * block_group->sectorsize)
+ cluster->max_size = found_bits * block_group->sectorsize;
+
+ if (total_found < total_bits) {
+ i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, next_zero);
+ if (i - start > total_bits * 2) {
+ total_found = 0;
+ cluster->max_size = 0;
+ found = false;
+ }
+ goto again;
+ }
+
+ cluster->window_start = start * block_group->sectorsize +
+ entry->offset;
+ cluster->points_to_bitmap = true;
+
+ return 0;
+}
+
/*
* here we try to find a cluster of blocks in a block group. The goal
* is to find at least bytes free and up to empty_size + bytes free.
@@ -587,12 +1172,12 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
struct btrfs_free_space *entry = NULL;
struct rb_node *node;
struct btrfs_free_space *next;
- struct btrfs_free_space *last;
+ struct btrfs_free_space *last = NULL;
u64 min_bytes;
u64 window_start;
u64 window_free;
u64 max_extent = 0;
- int total_retries = 0;
+ bool found_bitmap = false;
int ret;
/* for metadata, allow allocates with more holes */
@@ -620,31 +1205,80 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
goto out;
}
again:
- min_bytes = min(min_bytes, bytes + empty_size);
- entry = tree_search_bytes(&block_group->free_space_bytes,
- offset, min_bytes);
+ entry = tree_search_offset(block_group, offset, found_bitmap, 1);
if (!entry) {
ret = -ENOSPC;
goto out;
}
+
+ /*
+ * If found_bitmap is true, we exhausted our search for extent entries,
+ * and we just want to search all of the bitmaps that we can find, and
+ * ignore any extent entries we find.
+ */
+ while (entry->bitmap || found_bitmap ||
+ (!entry->bitmap && entry->bytes < min_bytes)) {
+ struct rb_node *node = rb_next(&entry->offset_index);
+
+ if (entry->bitmap && entry->bytes > bytes + empty_size) {
+ ret = btrfs_bitmap_cluster(block_group, entry, cluster,
+ offset, bytes + empty_size,
+ min_bytes);
+ if (!ret)
+ goto got_it;
+ }
+
+ if (!node) {
+ ret = -ENOSPC;
+ goto out;
+ }
+ entry = rb_entry(node, struct btrfs_free_space, offset_index);
+ }
+
+ /*
+ * We already searched all the extent entries from the passed in offset
+ * to the end and didn't find enough space for the cluster, and we also
+ * didn't find any bitmaps that met our criteria, just go ahead and exit
+ */
+ if (found_bitmap) {
+ ret = -ENOSPC;
+ goto out;
+ }
+
+ cluster->points_to_bitmap = false;
window_start = entry->offset;
window_free = entry->bytes;
last = entry;
max_extent = entry->bytes;
- while(1) {
+ while (1) {
/* out window is just right, lets fill it */
if (window_free >= bytes + empty_size)
break;
node = rb_next(&last->offset_index);
if (!node) {
+ if (found_bitmap)
+ goto again;
ret = -ENOSPC;
goto out;
}
next = rb_entry(node, struct btrfs_free_space, offset_index);
/*
+ * we found a bitmap, so if this search doesn't result in a
+ * cluster, we know to go and search again for the bitmaps and
+ * start looking for space there
+ */
+ if (next->bitmap) {
+ if (!found_bitmap)
+ offset = next->offset;
+ found_bitmap = true;
+ last = next;
+ continue;
+ }
+
+ /*
* we haven't filled the empty size and the window is
* very large. reset and try again
*/
@@ -655,19 +1289,6 @@ again:
window_free = entry->bytes;
last = entry;
max_extent = 0;
- total_retries++;
- if (total_retries % 64 == 0) {
- if (min_bytes >= (bytes + empty_size)) {
- ret = -ENOSPC;
- goto out;
- }
- /*
- * grow our allocation a bit, we're not having
- * much luck
- */
- min_bytes *= 2;
- goto again;
- }
} else {
last = next;
window_free += next->bytes;
@@ -685,11 +1306,19 @@ again:
* The cluster includes an rbtree, but only uses the offset index
* of each free space cache entry.
*/
- while(1) {
+ while (1) {
node = rb_next(&entry->offset_index);
- unlink_free_space(block_group, entry);
+ if (entry->bitmap && node) {
+ entry = rb_entry(node, struct btrfs_free_space,
+ offset_index);
+ continue;
+ } else if (entry->bitmap && !node) {
+ break;
+ }
+
+ rb_erase(&entry->offset_index, &block_group->free_space_offset);
ret = tree_insert_offset(&cluster->root, entry->offset,
- &entry->offset_index);
+ &entry->offset_index, 0);
BUG_ON(ret);
if (!node || entry == last)
@@ -697,8 +1326,10 @@ again:
entry = rb_entry(node, struct btrfs_free_space, offset_index);
}
- ret = 0;
+
cluster->max_size = max_extent;
+got_it:
+ ret = 0;
atomic_inc(&block_group->count);
list_add_tail(&cluster->block_group_list, &block_group->cluster_list);
cluster->block_group = block_group;
@@ -718,6 +1349,7 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
spin_lock_init(&cluster->refill_lock);
cluster->root.rb_node = NULL;
cluster->max_size = 0;
+ cluster->points_to_bitmap = false;
INIT_LIST_HEAD(&cluster->block_group_list);
cluster->block_group = NULL;
}
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 266fb876405..890a8e79011 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -19,6 +19,14 @@
#ifndef __BTRFS_FREE_SPACE_CACHE
#define __BTRFS_FREE_SPACE_CACHE
+struct btrfs_free_space {
+ struct rb_node offset_index;
+ u64 offset;
+ u64 bytes;
+ unsigned long *bitmap;
+ struct list_head list;
+};
+
int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
u64 bytenr, u64 size);
int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 791eab19e33..59cba180fe8 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2603,8 +2603,8 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
if (root->ref_cows)
btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
path = btrfs_alloc_path();
- path->reada = -1;
BUG_ON(!path);
+ path->reada = -1;
/* FIXME, add redo link to tree so we don't leak on crash */
key.objectid = inode->i_ino;
@@ -3099,8 +3099,12 @@ static void inode_tree_add(struct inode *inode)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_inode *entry;
- struct rb_node **p = &root->inode_tree.rb_node;
- struct rb_node *parent = NULL;
+ struct rb_node **p;
+ struct rb_node *parent;
+
+again:
+ p = &root->inode_tree.rb_node;
+ parent = NULL;
spin_lock(&root->inode_lock);
while (*p) {
@@ -3108,13 +3112,16 @@ static void inode_tree_add(struct inode *inode)
entry = rb_entry(parent, struct btrfs_inode, rb_node);
if (inode->i_ino < entry->vfs_inode.i_ino)
- p = &(*p)->rb_left;
+ p = &parent->rb_left;
else if (inode->i_ino > entry->vfs_inode.i_ino)
- p = &(*p)->rb_right;
+ p = &parent->rb_right;
else {
WARN_ON(!(entry->vfs_inode.i_state &
(I_WILL_FREE | I_FREEING | I_CLEAR)));
- break;
+ rb_erase(parent, &root->inode_tree);
+ RB_CLEAR_NODE(parent);
+ spin_unlock(&root->inode_lock);
+ goto again;
}
}
rb_link_node(&BTRFS_I(inode)->rb_node, parent, p);
@@ -3126,12 +3133,12 @@ static void inode_tree_del(struct inode *inode)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
+ spin_lock(&root->inode_lock);
if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) {
- spin_lock(&root->inode_lock);
rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree);
- spin_unlock(&root->inode_lock);
RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
}
+ spin_unlock(&root->inode_lock);
}
static noinline void init_btrfs_i(struct inode *inode)
@@ -4785,8 +4792,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
* and the replacement file is large. Start IO on it now so
* we don't add too much work to the end of the transaction
*/
- if (new_inode && old_inode && S_ISREG(old_inode->i_mode) &&
- new_inode->i_size &&
+ if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size &&
old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
filemap_flush(old_inode->i_mapping);
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index d6f0806c682..7b2f401e604 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -740,7 +740,6 @@ int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
.nr_to_write = mapping->nrpages * 2,
.range_start = start,
.range_end = end,
- .for_writepages = 1,
};
return btrfs_writepages(mapping, &wbc);
}
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 6d6523da0a3..0d126be22b6 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -309,7 +309,7 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c)
}
printk(KERN_INFO "node %llu level %d total ptrs %d free spc %u\n",
(unsigned long long)btrfs_header_bytenr(c),
- btrfs_header_level(c), nr,
+ level, nr,
(u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr);
for (i = 0; i < nr; i++) {
btrfs_node_key_to_cpu(c, &key, i);
@@ -326,10 +326,10 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c)
btrfs_level_size(root, level - 1),
btrfs_node_ptr_generation(c, i));
if (btrfs_is_leaf(next) &&
- btrfs_header_level(c) != 1)
+ level != 1)
BUG();
if (btrfs_header_level(next) !=
- btrfs_header_level(c) - 1)
+ level - 1)
BUG();
btrfs_print_tree(root, next);
free_extent_buffer(next);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 00839793477..c04f7f21260 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -670,6 +670,8 @@ again:
err = ret;
goto out;
}
+ if (ret > 0 && path2->slots[level] > 0)
+ path2->slots[level]--;
eb = path2->nodes[level];
WARN_ON(btrfs_node_blockptr(eb, path2->slots[level]) !=
@@ -1609,6 +1611,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
BUG_ON(level == 0);
path->lowest_level = level;
ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
+ path->lowest_level = 0;
if (ret < 0) {
btrfs_free_path(path);
return ret;
@@ -2550,8 +2553,13 @@ int relocate_inode_pages(struct inode *inode, u64 start, u64 len)
last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
/* make sure the dirty trick played by the caller work */
- ret = invalidate_inode_pages2_range(inode->i_mapping,
- first_index, last_index);
+ while (1) {
+ ret = invalidate_inode_pages2_range(inode->i_mapping,
+ first_index, last_index);
+ if (ret != -EBUSY)
+ break;
+ schedule_timeout(HZ/10);
+ }
if (ret)
goto out_unlock;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 2dbf1c1f56e..cdbb5022da5 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -40,6 +40,12 @@ static noinline void put_transaction(struct btrfs_transaction *transaction)
}
}
+static noinline void switch_commit_root(struct btrfs_root *root)
+{
+ free_extent_buffer(root->commit_root);
+ root->commit_root = btrfs_root_node(root);
+}
+
/*
* either allocate a new transaction or hop into the existing one
*/
@@ -444,9 +450,6 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
btrfs_write_dirty_block_groups(trans, root);
- ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
- BUG_ON(ret);
-
while (1) {
old_root_bytenr = btrfs_root_bytenr(&root->root_item);
if (old_root_bytenr == root->node->start)
@@ -457,13 +460,14 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
&root->root_key,
&root->root_item);
BUG_ON(ret);
- btrfs_write_dirty_block_groups(trans, root);
- ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+ ret = btrfs_write_dirty_block_groups(trans, root);
BUG_ON(ret);
}
- free_extent_buffer(root->commit_root);
- root->commit_root = btrfs_root_node(root);
+
+ if (root != root->fs_info->extent_root)
+ switch_commit_root(root);
+
return 0;
}
@@ -495,10 +499,12 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
root = list_entry(next, struct btrfs_root, dirty_list);
update_cowonly_root(trans, root);
-
- ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
- BUG_ON(ret);
}
+
+ down_write(&fs_info->extent_commit_sem);
+ switch_commit_root(fs_info->extent_root);
+ up_write(&fs_info->extent_commit_sem);
+
return 0;
}
@@ -544,8 +550,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
btrfs_update_reloc_root(trans, root);
if (root->commit_root != root->node) {
- free_extent_buffer(root->commit_root);
- root->commit_root = btrfs_root_node(root);
+ switch_commit_root(root);
btrfs_set_root_node(&root->root_item,
root->node);
}
@@ -852,6 +857,16 @@ static void update_super_roots(struct btrfs_root *root)
super->root_level = root_item->level;
}
+int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
+{
+ int ret = 0;
+ spin_lock(&info->new_trans_lock);
+ if (info->running_transaction)
+ ret = info->running_transaction->in_commit;
+ spin_unlock(&info->new_trans_lock);
+ return ret;
+}
+
int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
@@ -943,9 +958,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
mutex_unlock(&root->fs_info->trans_mutex);
- if (flush_on_commit || snap_pending) {
- if (flush_on_commit)
- btrfs_start_delalloc_inodes(root);
+ if (flush_on_commit) {
+ btrfs_start_delalloc_inodes(root);
+ ret = btrfs_wait_ordered_extents(root, 0);
+ BUG_ON(ret);
+ } else if (snap_pending) {
ret = btrfs_wait_ordered_extents(root, 1);
BUG_ON(ret);
}
@@ -1009,15 +1026,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
btrfs_set_root_node(&root->fs_info->tree_root->root_item,
root->fs_info->tree_root->node);
- free_extent_buffer(root->fs_info->tree_root->commit_root);
- root->fs_info->tree_root->commit_root =
- btrfs_root_node(root->fs_info->tree_root);
+ switch_commit_root(root->fs_info->tree_root);
btrfs_set_root_node(&root->fs_info->chunk_root->root_item,
root->fs_info->chunk_root->node);
- free_extent_buffer(root->fs_info->chunk_root->commit_root);
- root->fs_info->chunk_root->commit_root =
- btrfs_root_node(root->fs_info->chunk_root);
+ switch_commit_root(root->fs_info->chunk_root);
update_super_roots(root);
@@ -1057,6 +1070,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
cur_trans->commit_done = 1;
root->fs_info->last_trans_committed = cur_trans->transid;
+
wake_up(&cur_trans->commit_wait);
put_transaction(cur_trans);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 961c3ee5a2e..663c6740491 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -107,4 +107,5 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
struct extent_io_tree *dirty_pages);
+int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
#endif
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index c13922206d1..d91b0de7c50 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -797,7 +797,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
return -ENOENT;
inode = read_one_inode(root, key->objectid);
- BUG_ON(!dir);
+ BUG_ON(!inode);
ref_ptr = btrfs_item_ptr_offset(eb, slot);
ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 3ab80e9cd76..5cf405b0828 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -260,7 +260,7 @@ loop_lock:
num_run++;
batch_run++;
- if (bio_sync(cur))
+ if (bio_rw_flagged(cur, BIO_RW_SYNCIO))
num_sync_run++;
if (need_resched()) {
@@ -721,7 +721,8 @@ error:
*/
static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans,
struct btrfs_device *device,
- u64 num_bytes, u64 *start)
+ u64 num_bytes, u64 *start,
+ u64 *max_avail)
{
struct btrfs_key key;
struct btrfs_root *root = device->dev_root;
@@ -758,9 +759,13 @@ static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans,
ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
if (ret < 0)
goto error;
- ret = btrfs_previous_item(root, path, 0, key.type);
- if (ret < 0)
- goto error;
+ if (ret > 0) {
+ ret = btrfs_previous_item(root, path, key.objectid, key.type);
+ if (ret < 0)
+ goto error;
+ if (ret > 0)
+ start_found = 1;
+ }
l = path->nodes[0];
btrfs_item_key_to_cpu(l, &key, path->slots[0]);
while (1) {
@@ -803,6 +808,10 @@ no_more_items:
if (last_byte < search_start)
last_byte = search_start;
hole_size = key.offset - last_byte;
+
+ if (hole_size > *max_avail)
+ *max_avail = hole_size;
+
if (key.offset > last_byte &&
hole_size >= num_bytes) {
*start = last_byte;
@@ -1621,6 +1630,7 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
device->fs_devices->total_rw_bytes += diff;
device->total_bytes = new_size;
+ device->disk_total_bytes = new_size;
btrfs_clear_space_info_full(device->dev_root->fs_info);
return btrfs_update_device(trans, device);
@@ -2007,7 +2017,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
goto done;
if (ret) {
ret = 0;
- goto done;
+ break;
}
l = path->nodes[0];
@@ -2015,7 +2025,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
btrfs_item_key_to_cpu(l, &key, path->slots[0]);
if (key.objectid != device->devid)
- goto done;
+ break;
dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
length = btrfs_dev_extent_length(l, dev_extent);
@@ -2171,6 +2181,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
max_chunk_size);
again:
+ max_avail = 0;
if (!map || map->num_stripes != num_stripes) {
kfree(map);
map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
@@ -2219,7 +2230,8 @@ again:
if (device->in_fs_metadata && avail >= min_free) {
ret = find_free_dev_extent(trans, device,
- min_free, &dev_offset);
+ min_free, &dev_offset,
+ &max_avail);
if (ret == 0) {
list_move_tail(&device->dev_alloc_list,
&private_devs);
@@ -2795,26 +2807,6 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
}
}
- for (i = 0; i > nr; i++) {
- struct btrfs_multi_bio *multi;
- struct btrfs_bio_stripe *stripe;
- int ret;
-
- length = 1;
- ret = btrfs_map_block(map_tree, WRITE, buf[i],
- &length, &multi, 0);
- BUG_ON(ret);
-
- stripe = multi->stripes;
- for (j = 0; j < multi->num_stripes; j++) {
- if (stripe->physical >= physical &&
- physical < stripe->physical + length)
- break;
- }
- BUG_ON(j >= multi->num_stripes);
- kfree(multi);
- }
-
*logical = buf;
*naddrs = nr;
*stripe_len = map->stripe_len;
@@ -2911,7 +2903,7 @@ static noinline int schedule_bio(struct btrfs_root *root,
bio->bi_rw |= rw;
spin_lock(&device->io_lock);
- if (bio_sync(bio))
+ if (bio_rw_flagged(bio, BIO_RW_SYNCIO))
pending_bios = &device->pending_sync_bios;
else
pending_bios = &device->pending_bios;
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index ecfbce836d3..3e2b90eaa23 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -208,7 +208,7 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
*total_in = 0;
workspace = find_zlib_workspace();
- if (!workspace)
+ if (IS_ERR(workspace))
return -1;
if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
@@ -366,7 +366,7 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in,
char *kaddr;
workspace = find_zlib_workspace();
- if (!workspace)
+ if (IS_ERR(workspace))
return -ENOMEM;
data_in = kmap(pages_in[page_in_index]);
@@ -547,7 +547,7 @@ int btrfs_zlib_decompress(unsigned char *data_in,
return -ENOMEM;
workspace = find_zlib_workspace();
- if (!workspace)
+ if (IS_ERR(workspace))
return -ENOMEM;
workspace->inf_strm.next_in = data_in;
diff --git a/fs/buffer.c b/fs/buffer.c
index a3ef091a45b..90a98865b0c 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -281,7 +281,7 @@ static void free_more_memory(void)
struct zone *zone;
int nid;
- wakeup_pdflush(1024);
+ wakeup_flusher_threads(1024);
yield();
for_each_online_node(nid) {
@@ -1165,8 +1165,11 @@ void mark_buffer_dirty(struct buffer_head *bh)
if (!test_set_buffer_dirty(bh)) {
struct page *page = bh->b_page;
- if (!TestSetPageDirty(page))
- __set_page_dirty(page, page_mapping(page), 0);
+ if (!TestSetPageDirty(page)) {
+ struct address_space *mapping = page_mapping(page);
+ if (mapping)
+ __set_page_dirty(page, mapping, 0);
+ }
}
}
diff --git a/fs/char_dev.c b/fs/char_dev.c
index a173551e19d..3cbc57f932d 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -31,6 +31,7 @@
* - no readahead or I/O queue unplugging required
*/
struct backing_dev_info directly_mappable_cdev_bdi = {
+ .name = "char",
.capabilities = (
#ifdef CONFIG_MMU
/* permit private copies of the data to be taken */
@@ -237,8 +238,10 @@ int alloc_chrdev_region(dev_t *dev, unsigned baseminor, unsigned count,
}
/**
- * register_chrdev() - Register a major number for character devices.
+ * __register_chrdev() - create and register a cdev occupying a range of minors
* @major: major device number or 0 for dynamic allocation
+ * @baseminor: first of the requested range of minor numbers
+ * @count: the number of minor numbers required
* @name: name of this range of devices
* @fops: file operations associated with this devices
*
@@ -254,19 +257,17 @@ int alloc_chrdev_region(dev_t *dev, unsigned baseminor, unsigned count,
* /dev. It only helps to keep track of the different owners of devices. If
* your module name has only one type of devices it's ok to use e.g. the name
* of the module here.
- *
- * This function registers a range of 256 minor numbers. The first minor number
- * is 0.
*/
-int register_chrdev(unsigned int major, const char *name,
- const struct file_operations *fops)
+int __register_chrdev(unsigned int major, unsigned int baseminor,
+ unsigned int count, const char *name,
+ const struct file_operations *fops)
{
struct char_device_struct *cd;
struct cdev *cdev;
char *s;
int err = -ENOMEM;
- cd = __register_chrdev_region(major, 0, 256, name);
+ cd = __register_chrdev_region(major, baseminor, count, name);
if (IS_ERR(cd))
return PTR_ERR(cd);
@@ -280,7 +281,7 @@ int register_chrdev(unsigned int major, const char *name,
for (s = strchr(kobject_name(&cdev->kobj),'/'); s; s = strchr(s, '/'))
*s = '!';
- err = cdev_add(cdev, MKDEV(cd->major, 0), 256);
+ err = cdev_add(cdev, MKDEV(cd->major, baseminor), count);
if (err)
goto out;
@@ -290,7 +291,7 @@ int register_chrdev(unsigned int major, const char *name,
out:
kobject_put(&cdev->kobj);
out2:
- kfree(__unregister_chrdev_region(cd->major, 0, 256));
+ kfree(__unregister_chrdev_region(cd->major, baseminor, count));
return err;
}
@@ -316,10 +317,23 @@ void unregister_chrdev_region(dev_t from, unsigned count)
}
}
-void unregister_chrdev(unsigned int major, const char *name)
+/**
+ * __unregister_chrdev - unregister and destroy a cdev
+ * @major: major device number
+ * @baseminor: first of the range of minor numbers
+ * @count: the number of minor numbers this cdev is occupying
+ * @name: name of this range of devices
+ *
+ * Unregister and destroy the cdev occupying the region described by
+ * @major, @baseminor and @count. This function undoes what
+ * __register_chrdev() did.
+ */
+void __unregister_chrdev(unsigned int major, unsigned int baseminor,
+ unsigned int count, const char *name)
{
struct char_device_struct *cd;
- cd = __unregister_chrdev_region(major, 0, 256);
+
+ cd = __unregister_chrdev_region(major, baseminor, count);
if (cd && cd->cdev)
cdev_del(cd->cdev);
kfree(cd);
@@ -568,6 +582,6 @@ EXPORT_SYMBOL(cdev_alloc);
EXPORT_SYMBOL(cdev_del);
EXPORT_SYMBOL(cdev_add);
EXPORT_SYMBOL(cdev_index);
-EXPORT_SYMBOL(register_chrdev);
-EXPORT_SYMBOL(unregister_chrdev);
+EXPORT_SYMBOL(__register_chrdev);
+EXPORT_SYMBOL(__unregister_chrdev);
EXPORT_SYMBOL(directly_mappable_cdev_bdi);
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 92888aa9074..145540a316a 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -1,3 +1,13 @@
+Version 1.60
+-------------
+Fix memory leak in reconnect. Fix oops in DFS mount error path.
+Set s_maxbytes to smaller (the max that vfs can handle) so that
+sendfile will now work over cifs mounts again. Add noforcegid
+and noforceuid mount parameters. Fix small mem leak when using
+ntlmv2. Fix 2nd mount to same server but with different port to
+be allowed (rather than reusing the 1st port) - only when the
+user explicitly overrides the port on the 2nd mount.
+
Version 1.59
------------
Client uses server inode numbers (which are persistent) rather than
diff --git a/fs/cifs/README b/fs/cifs/README
index ad92921dbde..79c1a93400b 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -262,11 +262,11 @@ A partial list of the supported mount options follows:
mount.
domain Set the SMB/CIFS workgroup name prepended to the
username during CIFS session establishment
- forceuid Set the default uid for inodes based on the uid
- passed in. For mounts to servers
+ forceuid Set the default uid for inodes to the uid
+ passed in on mount. For mounts to servers
which do support the CIFS Unix extensions, such as a
properly configured Samba server, the server provides
- the uid, gid and mode so this parameter should not be
+ the uid, gid and mode so this parameter should not be
specified unless the server and clients uid and gid
numbering differ. If the server and client are in the
same domain (e.g. running winbind or nss_ldap) and
@@ -278,11 +278,7 @@ A partial list of the supported mount options follows:
of existing files will be the uid (gid) of the person
who executed the mount (root, except when mount.cifs
is configured setuid for user mounts) unless the "uid="
- (gid) mount option is specified. For the uid (gid) of newly
- created files and directories, ie files created since
- the last mount of the server share, the expected uid
- (gid) is cached as long as the inode remains in
- memory on the client. Also note that permission
+ (gid) mount option is specified. Also note that permission
checks (authorization checks) on accesses to a file occur
at the server, but there are cases in which an administrator
may want to restrict at the client as well. For those
@@ -290,12 +286,15 @@ A partial list of the supported mount options follows:
(such as Windows), permissions can also be checked at the
client, and a crude form of client side permission checking
can be enabled by specifying file_mode and dir_mode on
- the client. Note that the mount.cifs helper must be
- at version 1.10 or higher to support specifying the uid
- (or gid) in non-numeric form.
- forcegid (similar to above but for the groupid instead of uid)
+ the client. (default)
+ forcegid (similar to above but for the groupid instead of uid) (default)
+ noforceuid Fill in file owner information (uid) by requesting it from
+ the server if possible. With this option, the value given in
+ the uid= option (on mount) will only be used if the server
+ can not support returning uids on inodes.
+ noforcegid (similar to above but for the group owner, gid, instead of uid)
uid Set the default uid for inodes, and indicate to the
- cifs kernel driver which local user mounted . If the server
+ cifs kernel driver which local user mounted. If the server
supports the unix extensions the default uid is
not used to fill in the owner fields of inodes (files)
unless the "forceuid" parameter is specified.
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index 3bb11be8b6a..606912d8f2a 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -55,7 +55,7 @@ void cifs_dfs_release_automount_timer(void)
* i.e. strips from UNC trailing path that is not part of share
* name and fixup missing '\' in the begining of DFS node refferal
* if neccessary.
- * Returns pointer to share name on success or NULL on error.
+ * Returns pointer to share name on success or ERR_PTR on error.
* Caller is responsible for freeing returned string.
*/
static char *cifs_get_share_name(const char *node_name)
@@ -68,7 +68,7 @@ static char *cifs_get_share_name(const char *node_name)
UNC = kmalloc(len+2 /*for term null and additional \ if it's missed */,
GFP_KERNEL);
if (!UNC)
- return NULL;
+ return ERR_PTR(-ENOMEM);
/* get share name and server name */
if (node_name[1] != '\\') {
@@ -87,7 +87,7 @@ static char *cifs_get_share_name(const char *node_name)
cERROR(1, ("%s: no server name end in node name: %s",
__func__, node_name));
kfree(UNC);
- return NULL;
+ return ERR_PTR(-EINVAL);
}
/* find sharename end */
@@ -133,6 +133,12 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
return ERR_PTR(-EINVAL);
*devname = cifs_get_share_name(ref->node_name);
+ if (IS_ERR(*devname)) {
+ rc = PTR_ERR(*devname);
+ *devname = NULL;
+ goto compose_mount_options_err;
+ }
+
rc = dns_resolve_server_name_to_ip(*devname, &srvIP);
if (rc != 0) {
cERROR(1, ("%s: Failed to resolve server part of %s to IP: %d",
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 051caecf7d6..8ec7736ce95 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -125,7 +125,7 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
if (server->addr.sockAddr.sin_family == AF_INET)
sprintf(dp, "ip4=%pI4", &server->addr.sockAddr.sin_addr);
else if (server->addr.sockAddr.sin_family == AF_INET6)
- sprintf(dp, "ip6=%pi6", &server->addr.sockAddr6.sin6_addr);
+ sprintf(dp, "ip6=%pI6", &server->addr.sockAddr6.sin6_addr);
else
goto out;
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index 60e3c4253de..714a542cbaf 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -44,7 +44,7 @@ cifs_ucs2_bytes(const __le16 *from, int maxbytes,
int maxwords = maxbytes / 2;
char tmp[NLS_MAX_CHARSET_SIZE];
- for (i = 0; from[i] && i < maxwords; i++) {
+ for (i = 0; i < maxwords && from[i]; i++) {
charlen = codepage->uni2char(le16_to_cpu(from[i]), tmp,
NLS_MAX_CHARSET_SIZE);
if (charlen > 0)
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 6941c22398a..7dfe0842a6f 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -607,7 +607,7 @@ static struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb,
return get_cifs_acl_by_path(cifs_sb, path, pacllen);
pntsd = get_cifs_acl_by_fid(cifs_sb, open_file->netfid, pacllen);
- atomic_dec(&open_file->wrtPending);
+ cifsFileInfo_put(open_file);
return pntsd;
}
@@ -665,7 +665,7 @@ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
return set_cifs_acl_by_path(cifs_sb, path, pnntsd, acllen);
rc = set_cifs_acl_by_fid(cifs_sb, open_file->netfid, pnntsd, acllen);
- atomic_dec(&open_file->wrtPending);
+ cifsFileInfo_put(open_file);
return rc;
}
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 7c9809523f4..7efe1745494 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -373,6 +373,7 @@ calc_exit_2:
compare with the NTLM example */
hmac_md5_final(ses->server->ntlmv2_hash, pctxt);
+ kfree(pctxt);
return rc;
}
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 44f30504b82..3610e9958b4 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -361,13 +361,10 @@ cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server)
static int
cifs_show_options(struct seq_file *s, struct vfsmount *m)
{
- struct cifs_sb_info *cifs_sb;
- struct cifsTconInfo *tcon;
-
- cifs_sb = CIFS_SB(m->mnt_sb);
- tcon = cifs_sb->tcon;
+ struct cifs_sb_info *cifs_sb = CIFS_SB(m->mnt_sb);
+ struct cifsTconInfo *tcon = cifs_sb->tcon;
- seq_printf(s, ",unc=%s", cifs_sb->tcon->treeName);
+ seq_printf(s, ",unc=%s", tcon->treeName);
if (tcon->ses->userName)
seq_printf(s, ",username=%s", tcon->ses->userName);
if (tcon->ses->domainName)
@@ -376,10 +373,14 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m)
seq_printf(s, ",uid=%d", cifs_sb->mnt_uid);
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)
seq_printf(s, ",forceuid");
+ else
+ seq_printf(s, ",noforceuid");
seq_printf(s, ",gid=%d", cifs_sb->mnt_gid);
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID)
seq_printf(s, ",forcegid");
+ else
+ seq_printf(s, ",noforcegid");
cifs_show_address(s, tcon->ses->server);
@@ -985,19 +986,19 @@ static int cifs_oplock_thread(void *dummyarg)
if (try_to_freeze())
continue;
- spin_lock(&GlobalMid_Lock);
- if (list_empty(&GlobalOplock_Q)) {
- spin_unlock(&GlobalMid_Lock);
+ spin_lock(&cifs_oplock_lock);
+ if (list_empty(&cifs_oplock_list)) {
+ spin_unlock(&cifs_oplock_lock);
set_current_state(TASK_INTERRUPTIBLE);
schedule_timeout(39*HZ);
} else {
- oplock_item = list_entry(GlobalOplock_Q.next,
+ oplock_item = list_entry(cifs_oplock_list.next,
struct oplock_q_entry, qhead);
cFYI(1, ("found oplock item to write out"));
pTcon = oplock_item->tcon;
inode = oplock_item->pinode;
netfid = oplock_item->netfid;
- spin_unlock(&GlobalMid_Lock);
+ spin_unlock(&cifs_oplock_lock);
DeleteOplockQEntry(oplock_item);
/* can not grab inode sem here since it would
deadlock when oplock received on delete
@@ -1054,7 +1055,7 @@ init_cifs(void)
int rc = 0;
cifs_proc_init();
INIT_LIST_HEAD(&cifs_tcp_ses_list);
- INIT_LIST_HEAD(&GlobalOplock_Q);
+ INIT_LIST_HEAD(&cifs_oplock_list);
#ifdef CONFIG_CIFS_EXPERIMENTAL
INIT_LIST_HEAD(&GlobalDnotifyReqList);
INIT_LIST_HEAD(&GlobalDnotifyRsp_Q);
@@ -1083,6 +1084,7 @@ init_cifs(void)
rwlock_init(&GlobalSMBSeslock);
rwlock_init(&cifs_tcp_ses_lock);
spin_lock_init(&GlobalMid_Lock);
+ spin_lock_init(&cifs_oplock_lock);
if (cifs_max_pending < 2) {
cifs_max_pending = 2;
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 6c170948300..094325e3f71 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -113,5 +113,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
extern const struct export_operations cifs_export_ops;
#endif /* EXPERIMENTAL */
-#define CIFS_VERSION "1.60"
+#define CIFS_VERSION "1.61"
#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 6084d6379c0..6cfc81a3270 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -351,11 +351,24 @@ struct cifsFileInfo {
bool closePend:1; /* file is marked to close */
bool invalidHandle:1; /* file closed via session abend */
bool messageMode:1; /* for pipes: message vs byte mode */
- atomic_t wrtPending; /* handle in use - defer close */
+ atomic_t count; /* reference count */
struct mutex fh_mutex; /* prevents reopen race after dead ses*/
struct cifs_search_info srch_inf;
};
+/* Take a reference on the file private data */
+static inline void cifsFileInfo_get(struct cifsFileInfo *cifs_file)
+{
+ atomic_inc(&cifs_file->count);
+}
+
+/* Release a reference on the file private data */
+static inline void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
+{
+ if (atomic_dec_and_test(&cifs_file->count))
+ kfree(cifs_file);
+}
+
/*
* One of these for each file inode
*/
@@ -656,7 +669,11 @@ GLOBAL_EXTERN rwlock_t cifs_tcp_ses_lock;
*/
GLOBAL_EXTERN rwlock_t GlobalSMBSeslock;
-GLOBAL_EXTERN struct list_head GlobalOplock_Q;
+/* Global list of oplocks */
+GLOBAL_EXTERN struct list_head cifs_oplock_list;
+
+/* Protects the cifs_oplock_list */
+GLOBAL_EXTERN spinlock_t cifs_oplock_lock;
/* Outstanding dir notify requests */
GLOBAL_EXTERN struct list_head GlobalDnotifyReqList;
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 1866bc2927d..301e307e127 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -100,110 +100,138 @@ static void mark_open_files_invalid(struct cifsTconInfo *pTcon)
to this tcon */
}
-/* Allocate and return pointer to an SMB request buffer, and set basic
- SMB information in the SMB header. If the return code is zero, this
- function must have filled in request_buf pointer */
+/* reconnect the socket, tcon, and smb session if needed */
static int
-small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
- void **request_buf)
+cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
{
int rc = 0;
+ struct cifsSesInfo *ses;
+ struct TCP_Server_Info *server;
+ struct nls_table *nls_codepage;
- /* SMBs NegProt, SessSetup, uLogoff do not have tcon yet so
- check for tcp and smb session status done differently
- for those three - in the calling routine */
- if (tcon) {
- if (tcon->tidStatus == CifsExiting) {
- /* only tree disconnect, open, and write,
- (and ulogoff which does not have tcon)
- are allowed as we start force umount */
- if ((smb_command != SMB_COM_WRITE_ANDX) &&
- (smb_command != SMB_COM_OPEN_ANDX) &&
- (smb_command != SMB_COM_TREE_DISCONNECT)) {
- cFYI(1, ("can not send cmd %d while umounting",
- smb_command));
- return -ENODEV;
- }
+ /*
+ * SMBs NegProt, SessSetup, uLogoff do not have tcon yet so check for
+ * tcp and smb session status done differently for those three - in the
+ * calling routine
+ */
+ if (!tcon)
+ return 0;
+
+ ses = tcon->ses;
+ server = ses->server;
+
+ /*
+ * only tree disconnect, open, and write, (and ulogoff which does not
+ * have tcon) are allowed as we start force umount
+ */
+ if (tcon->tidStatus == CifsExiting) {
+ if (smb_command != SMB_COM_WRITE_ANDX &&
+ smb_command != SMB_COM_OPEN_ANDX &&
+ smb_command != SMB_COM_TREE_DISCONNECT) {
+ cFYI(1, ("can not send cmd %d while umounting",
+ smb_command));
+ return -ENODEV;
}
- if ((tcon->ses) && (tcon->ses->status != CifsExiting) &&
- (tcon->ses->server)) {
- struct nls_table *nls_codepage;
- /* Give Demultiplex thread up to 10 seconds to
- reconnect, should be greater than cifs socket
- timeout which is 7 seconds */
- while (tcon->ses->server->tcpStatus ==
- CifsNeedReconnect) {
- wait_event_interruptible_timeout(tcon->ses->server->response_q,
- (tcon->ses->server->tcpStatus ==
- CifsGood), 10 * HZ);
- if (tcon->ses->server->tcpStatus ==
- CifsNeedReconnect) {
- /* on "soft" mounts we wait once */
- if (!tcon->retry ||
- (tcon->ses->status == CifsExiting)) {
- cFYI(1, ("gave up waiting on "
- "reconnect in smb_init"));
- return -EHOSTDOWN;
- } /* else "hard" mount - keep retrying
- until process is killed or server
- comes back on-line */
- } else /* TCP session is reestablished now */
- break;
- }
+ }
- nls_codepage = load_nls_default();
- /* need to prevent multiple threads trying to
- simultaneously reconnect the same SMB session */
- down(&tcon->ses->sesSem);
- if (tcon->ses->need_reconnect)
- rc = cifs_setup_session(0, tcon->ses,
- nls_codepage);
- if (!rc && (tcon->need_reconnect)) {
- mark_open_files_invalid(tcon);
- rc = CIFSTCon(0, tcon->ses, tcon->treeName,
- tcon, nls_codepage);
- up(&tcon->ses->sesSem);
- /* BB FIXME add code to check if wsize needs
- update due to negotiated smb buffer size
- shrinking */
- if (rc == 0) {
- atomic_inc(&tconInfoReconnectCount);
- /* tell server Unix caps we support */
- if (tcon->ses->capabilities & CAP_UNIX)
- reset_cifs_unix_caps(
- 0 /* no xid */,
- tcon,
- NULL /* we do not know sb */,
- NULL /* no vol info */);
- }
+ if (ses->status == CifsExiting)
+ return -EIO;
- cFYI(1, ("reconnect tcon rc = %d", rc));
- /* Removed call to reopen open files here.
- It is safer (and faster) to reopen files
- one at a time as needed in read and write */
-
- /* Check if handle based operation so we
- know whether we can continue or not without
- returning to caller to reset file handle */
- switch (smb_command) {
- case SMB_COM_READ_ANDX:
- case SMB_COM_WRITE_ANDX:
- case SMB_COM_CLOSE:
- case SMB_COM_FIND_CLOSE2:
- case SMB_COM_LOCKING_ANDX: {
- unload_nls(nls_codepage);
- return -EAGAIN;
- }
- }
- } else {
- up(&tcon->ses->sesSem);
- }
- unload_nls(nls_codepage);
+ /*
+ * Give demultiplex thread up to 10 seconds to reconnect, should be
+ * greater than cifs socket timeout which is 7 seconds
+ */
+ while (server->tcpStatus == CifsNeedReconnect) {
+ wait_event_interruptible_timeout(server->response_q,
+ (server->tcpStatus == CifsGood), 10 * HZ);
- } else {
- return -EIO;
+ /* is TCP session is reestablished now ?*/
+ if (server->tcpStatus != CifsNeedReconnect)
+ break;
+
+ /*
+ * on "soft" mounts we wait once. Hard mounts keep
+ * retrying until process is killed or server comes
+ * back on-line
+ */
+ if (!tcon->retry || ses->status == CifsExiting) {
+ cFYI(1, ("gave up waiting on reconnect in smb_init"));
+ return -EHOSTDOWN;
}
}
+
+ if (!ses->need_reconnect && !tcon->need_reconnect)
+ return 0;
+
+ nls_codepage = load_nls_default();
+
+ /*
+ * need to prevent multiple threads trying to simultaneously
+ * reconnect the same SMB session
+ */
+ down(&ses->sesSem);
+ if (ses->need_reconnect)
+ rc = cifs_setup_session(0, ses, nls_codepage);
+
+ /* do we need to reconnect tcon? */
+ if (rc || !tcon->need_reconnect) {
+ up(&ses->sesSem);
+ goto out;
+ }
+
+ mark_open_files_invalid(tcon);
+ rc = CIFSTCon(0, ses, tcon->treeName, tcon, nls_codepage);
+ up(&ses->sesSem);
+ cFYI(1, ("reconnect tcon rc = %d", rc));
+
+ if (rc)
+ goto out;
+
+ /*
+ * FIXME: check if wsize needs updated due to negotiated smb buffer
+ * size shrinking
+ */
+ atomic_inc(&tconInfoReconnectCount);
+
+ /* tell server Unix caps we support */
+ if (ses->capabilities & CAP_UNIX)
+ reset_cifs_unix_caps(0, tcon, NULL, NULL);
+
+ /*
+ * Removed call to reopen open files here. It is safer (and faster) to
+ * reopen files one at a time as needed in read and write.
+ *
+ * FIXME: what about file locks? don't we need to reclaim them ASAP?
+ */
+
+out:
+ /*
+ * Check if handle based operation so we know whether we can continue
+ * or not without returning to caller to reset file handle
+ */
+ switch (smb_command) {
+ case SMB_COM_READ_ANDX:
+ case SMB_COM_WRITE_ANDX:
+ case SMB_COM_CLOSE:
+ case SMB_COM_FIND_CLOSE2:
+ case SMB_COM_LOCKING_ANDX:
+ rc = -EAGAIN;
+ }
+
+ unload_nls(nls_codepage);
+ return rc;
+}
+
+/* Allocate and return pointer to an SMB request buffer, and set basic
+ SMB information in the SMB header. If the return code is zero, this
+ function must have filled in request_buf pointer */
+static int
+small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
+ void **request_buf)
+{
+ int rc = 0;
+
+ rc = cifs_reconnect_tcon(tcon, smb_command);
if (rc)
return rc;
@@ -256,101 +284,7 @@ smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
{
int rc = 0;
- /* SMBs NegProt, SessSetup, uLogoff do not have tcon yet so
- check for tcp and smb session status done differently
- for those three - in the calling routine */
- if (tcon) {
- if (tcon->tidStatus == CifsExiting) {
- /* only tree disconnect, open, and write,
- (and ulogoff which does not have tcon)
- are allowed as we start force umount */
- if ((smb_command != SMB_COM_WRITE_ANDX) &&
- (smb_command != SMB_COM_OPEN_ANDX) &&
- (smb_command != SMB_COM_TREE_DISCONNECT)) {
- cFYI(1, ("can not send cmd %d while umounting",
- smb_command));
- return -ENODEV;
- }
- }
-
- if ((tcon->ses) && (tcon->ses->status != CifsExiting) &&
- (tcon->ses->server)) {
- struct nls_table *nls_codepage;
- /* Give Demultiplex thread up to 10 seconds to
- reconnect, should be greater than cifs socket
- timeout which is 7 seconds */
- while (tcon->ses->server->tcpStatus ==
- CifsNeedReconnect) {
- wait_event_interruptible_timeout(tcon->ses->server->response_q,
- (tcon->ses->server->tcpStatus ==
- CifsGood), 10 * HZ);
- if (tcon->ses->server->tcpStatus ==
- CifsNeedReconnect) {
- /* on "soft" mounts we wait once */
- if (!tcon->retry ||
- (tcon->ses->status == CifsExiting)) {
- cFYI(1, ("gave up waiting on "
- "reconnect in smb_init"));
- return -EHOSTDOWN;
- } /* else "hard" mount - keep retrying
- until process is killed or server
- comes on-line */
- } else /* TCP session is reestablished now */
- break;
- }
- nls_codepage = load_nls_default();
- /* need to prevent multiple threads trying to
- simultaneously reconnect the same SMB session */
- down(&tcon->ses->sesSem);
- if (tcon->ses->need_reconnect)
- rc = cifs_setup_session(0, tcon->ses,
- nls_codepage);
- if (!rc && (tcon->need_reconnect)) {
- mark_open_files_invalid(tcon);
- rc = CIFSTCon(0, tcon->ses, tcon->treeName,
- tcon, nls_codepage);
- up(&tcon->ses->sesSem);
- /* BB FIXME add code to check if wsize needs
- update due to negotiated smb buffer size
- shrinking */
- if (rc == 0) {
- atomic_inc(&tconInfoReconnectCount);
- /* tell server Unix caps we support */
- if (tcon->ses->capabilities & CAP_UNIX)
- reset_cifs_unix_caps(
- 0 /* no xid */,
- tcon,
- NULL /* do not know sb */,
- NULL /* no vol info */);
- }
-
- cFYI(1, ("reconnect tcon rc = %d", rc));
- /* Removed call to reopen open files here.
- It is safer (and faster) to reopen files
- one at a time as needed in read and write */
-
- /* Check if handle based operation so we
- know whether we can continue or not without
- returning to caller to reset file handle */
- switch (smb_command) {
- case SMB_COM_READ_ANDX:
- case SMB_COM_WRITE_ANDX:
- case SMB_COM_CLOSE:
- case SMB_COM_FIND_CLOSE2:
- case SMB_COM_LOCKING_ANDX: {
- unload_nls(nls_codepage);
- return -EAGAIN;
- }
- }
- } else {
- up(&tcon->ses->sesSem);
- }
- unload_nls(nls_codepage);
-
- } else {
- return -EIO;
- }
- }
+ rc = cifs_reconnect_tcon(tcon, smb_command);
if (rc)
return rc;
@@ -3961,6 +3895,10 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
if (is_unicode) {
__le16 *tmp = kmalloc(strlen(searchName)*2 + 2,
GFP_KERNEL);
+ if (tmp == NULL) {
+ rc = -ENOMEM;
+ goto parse_DFS_referrals_exit;
+ }
cifsConvertToUCS((__le16 *) tmp, searchName,
PATH_MAX, nls_codepage, remap);
node->path_consumed = cifs_ucs2_bytes(tmp,
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 9bb5c875073..d49682433c2 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -803,6 +803,10 @@ cifs_parse_mount_options(char *options, const char *devname,
char *data;
unsigned int temp_len, i, j;
char separator[2];
+ short int override_uid = -1;
+ short int override_gid = -1;
+ bool uid_specified = false;
+ bool gid_specified = false;
separator[0] = ',';
separator[1] = 0;
@@ -1093,18 +1097,20 @@ cifs_parse_mount_options(char *options, const char *devname,
"too long.\n");
return 1;
}
- } else if (strnicmp(data, "uid", 3) == 0) {
- if (value && *value)
- vol->linux_uid =
- simple_strtoul(value, &value, 0);
- } else if (strnicmp(data, "forceuid", 8) == 0) {
- vol->override_uid = 1;
- } else if (strnicmp(data, "gid", 3) == 0) {
- if (value && *value)
- vol->linux_gid =
- simple_strtoul(value, &value, 0);
- } else if (strnicmp(data, "forcegid", 8) == 0) {
- vol->override_gid = 1;
+ } else if (!strnicmp(data, "uid", 3) && value && *value) {
+ vol->linux_uid = simple_strtoul(value, &value, 0);
+ uid_specified = true;
+ } else if (!strnicmp(data, "forceuid", 8)) {
+ override_uid = 1;
+ } else if (!strnicmp(data, "noforceuid", 10)) {
+ override_uid = 0;
+ } else if (!strnicmp(data, "gid", 3) && value && *value) {
+ vol->linux_gid = simple_strtoul(value, &value, 0);
+ gid_specified = true;
+ } else if (!strnicmp(data, "forcegid", 8)) {
+ override_gid = 1;
+ } else if (!strnicmp(data, "noforcegid", 10)) {
+ override_gid = 0;
} else if (strnicmp(data, "file_mode", 4) == 0) {
if (value && *value) {
vol->file_mode =
@@ -1355,11 +1361,23 @@ cifs_parse_mount_options(char *options, const char *devname,
if (vol->UNCip == NULL)
vol->UNCip = &vol->UNC[2];
+ if (uid_specified)
+ vol->override_uid = override_uid;
+ else if (override_uid == 1)
+ printk(KERN_NOTICE "CIFS: ignoring forceuid mount option "
+ "specified with no uid= option.\n");
+
+ if (gid_specified)
+ vol->override_gid = override_gid;
+ else if (override_gid == 1)
+ printk(KERN_NOTICE "CIFS: ignoring forcegid mount option "
+ "specified with no gid= option.\n");
+
return 0;
}
static struct TCP_Server_Info *
-cifs_find_tcp_session(struct sockaddr_storage *addr)
+cifs_find_tcp_session(struct sockaddr_storage *addr, unsigned short int port)
{
struct list_head *tmp;
struct TCP_Server_Info *server;
@@ -1379,16 +1397,37 @@ cifs_find_tcp_session(struct sockaddr_storage *addr)
if (server->tcpStatus == CifsNew)
continue;
- if (addr->ss_family == AF_INET &&
- (addr4->sin_addr.s_addr !=
- server->addr.sockAddr.sin_addr.s_addr))
- continue;
- else if (addr->ss_family == AF_INET6 &&
- (!ipv6_addr_equal(&server->addr.sockAddr6.sin6_addr,
- &addr6->sin6_addr) ||
- server->addr.sockAddr6.sin6_scope_id !=
- addr6->sin6_scope_id))
- continue;
+ switch (addr->ss_family) {
+ case AF_INET:
+ if (addr4->sin_addr.s_addr ==
+ server->addr.sockAddr.sin_addr.s_addr) {
+ addr4->sin_port = htons(port);
+ /* user overrode default port? */
+ if (addr4->sin_port) {
+ if (addr4->sin_port !=
+ server->addr.sockAddr.sin_port)
+ continue;
+ }
+ break;
+ } else
+ continue;
+
+ case AF_INET6:
+ if (ipv6_addr_equal(&addr6->sin6_addr,
+ &server->addr.sockAddr6.sin6_addr) &&
+ (addr6->sin6_scope_id ==
+ server->addr.sockAddr6.sin6_scope_id)) {
+ addr6->sin6_port = htons(port);
+ /* user overrode default port? */
+ if (addr6->sin6_port) {
+ if (addr6->sin6_port !=
+ server->addr.sockAddr6.sin6_port)
+ continue;
+ }
+ break;
+ } else
+ continue;
+ }
++server->srv_count;
write_unlock(&cifs_tcp_ses_lock);
@@ -1457,7 +1496,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
}
/* see if we already have a matching tcp_ses */
- tcp_ses = cifs_find_tcp_session(&addr);
+ tcp_ses = cifs_find_tcp_session(&addr, volume_info->port);
if (tcp_ses)
return tcp_ses;
@@ -2452,10 +2491,10 @@ try_mount_again:
tcon->local_lease = volume_info->local_lease;
}
if (pSesInfo) {
- if (pSesInfo->capabilities & CAP_LARGE_FILES) {
- sb->s_maxbytes = (u64) 1 << 63;
- } else
- sb->s_maxbytes = (u64) 1 << 31; /* 2 GB */
+ if (pSesInfo->capabilities & CAP_LARGE_FILES)
+ sb->s_maxbytes = MAX_LFS_FILESIZE;
+ else
+ sb->s_maxbytes = MAX_NON_LFS;
}
/* BB FIXME fix time_gran to be larger for LANMAN sessions */
@@ -2544,11 +2583,20 @@ remote_path_check:
if (mount_data != mount_data_global)
kfree(mount_data);
+
mount_data = cifs_compose_mount_options(
cifs_sb->mountdata, full_path + 1,
referrals, &fake_devname);
- kfree(fake_devname);
+
free_dfs_info_array(referrals, num_referrals);
+ kfree(fake_devname);
+ kfree(full_path);
+
+ if (IS_ERR(mount_data)) {
+ rc = PTR_ERR(mount_data);
+ mount_data = NULL;
+ goto mount_fail_check;
+ }
if (tcon)
cifs_put_tcon(tcon);
@@ -2556,8 +2604,6 @@ remote_path_check:
cifs_put_smb_ses(pSesInfo);
cleanup_volume_info(&volume_info);
- FreeXid(xid);
- kfree(full_path);
referral_walks_count++;
goto try_mount_again;
}
@@ -2611,9 +2657,9 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
return -EIO;
smb_buffer = cifs_buf_get();
- if (smb_buffer == NULL) {
+ if (smb_buffer == NULL)
return -ENOMEM;
- }
+
smb_buffer_response = smb_buffer;
header_assemble(smb_buffer, SMB_COM_TREE_CONNECT_ANDX,
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 4326ffd90fa..a6424cfc012 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -153,7 +153,7 @@ cifs_fill_fileinfo(struct inode *newinode, __u16 fileHandle,
mutex_init(&pCifsFile->fh_mutex);
mutex_init(&pCifsFile->lock_mutex);
INIT_LIST_HEAD(&pCifsFile->llist);
- atomic_set(&pCifsFile->wrtPending, 0);
+ atomic_set(&pCifsFile->count, 1);
/* set the following in open now
pCifsFile->pfile = file; */
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index c34b7f8a217..fa7beac8b80 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -53,11 +53,9 @@ static inline struct cifsFileInfo *cifs_init_private(
private_data->pInode = inode;
private_data->invalidHandle = false;
private_data->closePend = false;
- /* we have to track num writers to the inode, since writepages
- does not tell us which handle the write is for so there can
- be a close (overlapping with write) of the filehandle that
- cifs_writepages chose to use */
- atomic_set(&private_data->wrtPending, 0);
+ /* Initialize reference count to one. The private data is
+ freed on the release of the last reference */
+ atomic_set(&private_data->count, 1);
return private_data;
}
@@ -643,7 +641,7 @@ int cifs_close(struct inode *inode, struct file *file)
if (!pTcon->need_reconnect) {
write_unlock(&GlobalSMBSeslock);
timeout = 2;
- while ((atomic_read(&pSMBFile->wrtPending) != 0)
+ while ((atomic_read(&pSMBFile->count) != 1)
&& (timeout <= 2048)) {
/* Give write a better chance to get to
server ahead of the close. We do not
@@ -657,8 +655,6 @@ int cifs_close(struct inode *inode, struct file *file)
msleep(timeout);
timeout *= 4;
}
- if (atomic_read(&pSMBFile->wrtPending))
- cERROR(1, ("close with pending write"));
if (!pTcon->need_reconnect &&
!pSMBFile->invalidHandle)
rc = CIFSSMBClose(xid, pTcon,
@@ -681,24 +677,7 @@ int cifs_close(struct inode *inode, struct file *file)
list_del(&pSMBFile->flist);
list_del(&pSMBFile->tlist);
write_unlock(&GlobalSMBSeslock);
- timeout = 10;
- /* We waited above to give the SMBWrite a chance to issue
- on the wire (so we do not get SMBWrite returning EBADF
- if writepages is racing with close. Note that writepages
- does not specify a file handle, so it is possible for a file
- to be opened twice, and the application close the "wrong"
- file handle - in these cases we delay long enough to allow
- the SMBWrite to get on the wire before the SMB Close.
- We allow total wait here over 45 seconds, more than
- oplock break time, and more than enough to allow any write
- to complete on the server, or to time out on the client */
- while ((atomic_read(&pSMBFile->wrtPending) != 0)
- && (timeout <= 50000)) {
- cERROR(1, ("writes pending, delay free of handle"));
- msleep(timeout);
- timeout *= 8;
- }
- kfree(file->private_data);
+ cifsFileInfo_put(file->private_data);
file->private_data = NULL;
} else
rc = -EBADF;
@@ -1236,7 +1215,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode)
if (!open_file->invalidHandle) {
/* found a good file */
/* lock it so it will not be closed on us */
- atomic_inc(&open_file->wrtPending);
+ cifsFileInfo_get(open_file);
read_unlock(&GlobalSMBSeslock);
return open_file;
} /* else might as well continue, and look for
@@ -1276,7 +1255,7 @@ refind_writable:
if (open_file->pfile &&
((open_file->pfile->f_flags & O_RDWR) ||
(open_file->pfile->f_flags & O_WRONLY))) {
- atomic_inc(&open_file->wrtPending);
+ cifsFileInfo_get(open_file);
if (!open_file->invalidHandle) {
/* found a good writable file */
@@ -1293,7 +1272,7 @@ refind_writable:
else { /* start over in case this was deleted */
/* since the list could be modified */
read_lock(&GlobalSMBSeslock);
- atomic_dec(&open_file->wrtPending);
+ cifsFileInfo_put(open_file);
goto refind_writable;
}
}
@@ -1309,7 +1288,7 @@ refind_writable:
read_lock(&GlobalSMBSeslock);
/* can not use this handle, no write
pending on this one after all */
- atomic_dec(&open_file->wrtPending);
+ cifsFileInfo_put(open_file);
if (open_file->closePend) /* list could have changed */
goto refind_writable;
@@ -1373,7 +1352,7 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
if (open_file) {
bytes_written = cifs_write(open_file->pfile, write_data,
to-from, &offset);
- atomic_dec(&open_file->wrtPending);
+ cifsFileInfo_put(open_file);
/* Does mm or vfs already set times? */
inode->i_atime = inode->i_mtime = current_fs_time(inode->i_sb);
if ((bytes_written > 0) && (offset))
@@ -1562,7 +1541,7 @@ retry:
bytes_to_write, offset,
&bytes_written, iov, n_iov,
long_op);
- atomic_dec(&open_file->wrtPending);
+ cifsFileInfo_put(open_file);
cifs_update_eof(cifsi, offset, bytes_written);
if (rc || bytes_written < bytes_to_write) {
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 18afe57b246..1f09c761931 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -212,7 +212,7 @@ cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, FILE_UNIX_BASIC_INFO *info,
* junction to the new submount (ie to setup the fake directory
* which represents a DFS referral).
*/
-void
+static void
cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb)
{
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
@@ -388,7 +388,7 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path,
}
/* Fill a cifs_fattr struct with info from FILE_ALL_INFO */
-void
+static void
cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
struct cifs_sb_info *cifs_sb, bool adjust_tz)
{
@@ -513,9 +513,12 @@ int cifs_get_inode_info(struct inode **pinode,
cifs_sb->mnt_cifs_flags &
CIFS_MOUNT_MAP_SPECIAL_CHR);
if (rc1) {
- /* BB EOPNOSUPP disable SERVER_INUM? */
cFYI(1, ("GetSrvInodeNum rc %d", rc1));
fattr.cf_uniqueid = iunique(sb, ROOT_I);
+ /* disable serverino if call not supported */
+ if (rc1 == -EINVAL)
+ cifs_sb->mnt_cifs_flags &=
+ ~CIFS_MOUNT_SERVER_INUM;
}
} else {
fattr.cf_uniqueid = iunique(sb, ROOT_I);
@@ -797,7 +800,7 @@ set_via_filehandle:
if (open_file == NULL)
CIFSSMBClose(xid, pTcon, netfid);
else
- atomic_dec(&open_file->wrtPending);
+ cifsFileInfo_put(open_file);
out:
return rc;
}
@@ -1632,7 +1635,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
__u32 npid = open_file->pid;
rc = CIFSSMBSetFileSize(xid, pTcon, attrs->ia_size, nfid,
npid, false);
- atomic_dec(&open_file->wrtPending);
+ cifsFileInfo_put(open_file);
cFYI(1, ("SetFSize for attrs rc = %d", rc));
if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
unsigned int bytes_written;
@@ -1787,7 +1790,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
u16 nfid = open_file->netfid;
u32 npid = open_file->pid;
rc = CIFSSMBUnixSetFileInfo(xid, pTcon, args, nfid, npid);
- atomic_dec(&open_file->wrtPending);
+ cifsFileInfo_put(open_file);
} else {
rc = CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, args,
cifs_sb->local_nls,
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 0ad3e2d116a..1da4ab250ea 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -119,20 +119,19 @@ AllocOplockQEntry(struct inode *pinode, __u16 fid, struct cifsTconInfo *tcon)
temp->pinode = pinode;
temp->tcon = tcon;
temp->netfid = fid;
- spin_lock(&GlobalMid_Lock);
- list_add_tail(&temp->qhead, &GlobalOplock_Q);
- spin_unlock(&GlobalMid_Lock);
+ spin_lock(&cifs_oplock_lock);
+ list_add_tail(&temp->qhead, &cifs_oplock_list);
+ spin_unlock(&cifs_oplock_lock);
}
return temp;
-
}
void DeleteOplockQEntry(struct oplock_q_entry *oplockEntry)
{
- spin_lock(&GlobalMid_Lock);
+ spin_lock(&cifs_oplock_lock);
/* should we check if list empty first? */
list_del(&oplockEntry->qhead);
- spin_unlock(&GlobalMid_Lock);
+ spin_unlock(&cifs_oplock_lock);
kmem_cache_free(cifs_oplock_cachep, oplockEntry);
}
@@ -144,14 +143,14 @@ void DeleteTconOplockQEntries(struct cifsTconInfo *tcon)
if (tcon == NULL)
return;
- spin_lock(&GlobalMid_Lock);
- list_for_each_entry(temp, &GlobalOplock_Q, qhead) {
+ spin_lock(&cifs_oplock_lock);
+ list_for_each_entry(temp, &cifs_oplock_list, qhead) {
if ((temp->tcon) && (temp->tcon == tcon)) {
list_del(&temp->qhead);
kmem_cache_free(cifs_oplock_cachep, temp);
}
}
- spin_unlock(&GlobalMid_Lock);
+ spin_unlock(&cifs_oplock_lock);
}
static int
diff --git a/fs/compat.c b/fs/compat.c
index 94502dab972..6d6f98fe64a 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1485,20 +1485,15 @@ int compat_do_execve(char * filename,
if (!bprm)
goto out_files;
- retval = -ERESTARTNOINTR;
- if (mutex_lock_interruptible(&current->cred_guard_mutex))
+ retval = prepare_bprm_creds(bprm);
+ if (retval)
goto out_free;
- current->in_execve = 1;
-
- retval = -ENOMEM;
- bprm->cred = prepare_exec_creds();
- if (!bprm->cred)
- goto out_unlock;
retval = check_unsafe_exec(bprm);
if (retval < 0)
- goto out_unlock;
+ goto out_free;
clear_in_exec = retval;
+ current->in_execve = 1;
file = open_exec(filename);
retval = PTR_ERR(file);
@@ -1547,7 +1542,6 @@ int compat_do_execve(char * filename,
/* execve succeeded */
current->fs->in_exec = 0;
current->in_execve = 0;
- mutex_unlock(&current->cred_guard_mutex);
acct_update_integrals(current);
free_bprm(bprm);
if (displaced)
@@ -1567,10 +1561,7 @@ out_file:
out_unmark:
if (clear_in_exec)
current->fs->in_exec = 0;
-
-out_unlock:
current->in_execve = 0;
- mutex_unlock(&current->cred_guard_mutex);
out_free:
free_bprm(bprm);
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index f28f070a60f..f91fd51b32e 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -1905,6 +1905,7 @@ COMPATIBLE_IOCTL(FIONCLEX)
COMPATIBLE_IOCTL(FIOASYNC)
COMPATIBLE_IOCTL(FIONBIO)
COMPATIBLE_IOCTL(FIONREAD) /* This is also TIOCINQ */
+COMPATIBLE_IOCTL(FS_IOC_FIEMAP)
/* 0x00 */
COMPATIBLE_IOCTL(FIBMAP)
COMPATIBLE_IOCTL(FIGETBSZ)
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 4921e7426d9..a2f746066c5 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -51,6 +51,7 @@ static const struct address_space_operations configfs_aops = {
};
static struct backing_dev_info configfs_backing_dev_info = {
+ .name = "configfs",
.ra_pages = 0, /* No readahead */
.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
};
diff --git a/fs/dcache.c b/fs/dcache.c
index 9e5cd3c3a6b..a100fa35a48 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -32,6 +32,7 @@
#include <linux/swap.h>
#include <linux/bootmem.h>
#include <linux/fs_struct.h>
+#include <linux/hardirq.h>
#include "internal.h"
int sysctl_vfs_cache_pressure __read_mostly = 100;
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 618a60f0388..240cef14fe5 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -106,6 +106,7 @@ struct connection {
#define CF_CONNECT_PENDING 3
#define CF_INIT_PENDING 4
#define CF_IS_OTHERCON 5
+#define CF_CLOSE 6
struct list_head writequeue; /* List of outgoing writequeue_entries */
spinlock_t writequeue_lock;
int (*rx_action) (struct connection *); /* What to do when active */
@@ -299,6 +300,8 @@ static void lowcomms_write_space(struct sock *sk)
static inline void lowcomms_connect_sock(struct connection *con)
{
+ if (test_bit(CF_CLOSE, &con->flags))
+ return;
if (!test_and_set_bit(CF_CONNECT_PENDING, &con->flags))
queue_work(send_workqueue, &con->swork);
}
@@ -926,10 +929,8 @@ static void tcp_connect_to_sock(struct connection *con)
goto out_err;
memset(&saddr, 0, sizeof(saddr));
- if (dlm_nodeid_to_addr(con->nodeid, &saddr)) {
- sock_release(sock);
+ if (dlm_nodeid_to_addr(con->nodeid, &saddr))
goto out_err;
- }
sock->sk->sk_user_data = con;
con->rx_action = receive_from_sock;
@@ -1284,7 +1285,6 @@ out:
static void send_to_sock(struct connection *con)
{
int ret = 0;
- ssize_t(*sendpage) (struct socket *, struct page *, int, size_t, int);
const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
struct writequeue_entry *e;
int len, offset;
@@ -1293,8 +1293,6 @@ static void send_to_sock(struct connection *con)
if (con->sock == NULL)
goto out_connect;
- sendpage = con->sock->ops->sendpage;
-
spin_lock(&con->writequeue_lock);
for (;;) {
e = list_entry(con->writequeue.next, struct writequeue_entry,
@@ -1309,8 +1307,8 @@ static void send_to_sock(struct connection *con)
ret = 0;
if (len) {
- ret = sendpage(con->sock, e->page, offset, len,
- msg_flags);
+ ret = kernel_sendpage(con->sock, e->page, offset, len,
+ msg_flags);
if (ret == -EAGAIN || ret == 0) {
cond_resched();
goto out;
@@ -1370,6 +1368,13 @@ int dlm_lowcomms_close(int nodeid)
log_print("closing connection to node %d", nodeid);
con = nodeid2con(nodeid, 0);
if (con) {
+ clear_bit(CF_CONNECT_PENDING, &con->flags);
+ clear_bit(CF_WRITE_PENDING, &con->flags);
+ set_bit(CF_CLOSE, &con->flags);
+ if (cancel_work_sync(&con->swork))
+ log_print("canceled swork for node %d", nodeid);
+ if (cancel_work_sync(&con->rwork))
+ log_print("canceled rwork for node %d", nodeid);
clean_one_writequeue(con);
close_connection(con, true);
}
@@ -1395,9 +1400,10 @@ static void process_send_sockets(struct work_struct *work)
if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) {
con->connect_action(con);
+ set_bit(CF_WRITE_PENDING, &con->flags);
}
- clear_bit(CF_WRITE_PENDING, &con->flags);
- send_to_sock(con);
+ if (test_and_clear_bit(CF_WRITE_PENDING, &con->flags))
+ send_to_sock(con);
}
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
index ccc9d62c462..55ea369f43a 100644
--- a/fs/dlm/netlink.c
+++ b/fs/dlm/netlink.c
@@ -63,7 +63,7 @@ static int send_data(struct sk_buff *skb)
return rv;
}
- return genlmsg_unicast(skb, listener_nlpid);
+ return genlmsg_unicast(&init_net, skb, listener_nlpid);
}
static int user_cmd(struct sk_buff *skb, struct genl_info *info)
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index af737bb56cb..259525c9abb 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -1303,6 +1303,13 @@ parse_tag_3_packet(struct ecryptfs_crypt_stat *crypt_stat,
}
(*new_auth_tok)->session_key.encrypted_key_size =
(body_size - (ECRYPTFS_SALT_SIZE + 5));
+ if ((*new_auth_tok)->session_key.encrypted_key_size
+ > ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES) {
+ printk(KERN_WARNING "Tag 3 packet contains key larger "
+ "than ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES\n");
+ rc = -EINVAL;
+ goto out_free;
+ }
if (unlikely(data[(*packet_size)++] != 0x04)) {
printk(KERN_WARNING "Unknown version number [%d]\n",
data[(*packet_size) - 1]);
@@ -1449,6 +1456,12 @@ parse_tag_11_packet(unsigned char *data, unsigned char *contents,
rc = -EINVAL;
goto out;
}
+ if (unlikely((*tag_11_contents_size) > max_contents_bytes)) {
+ printk(KERN_ERR "Literal data section in tag 11 packet exceeds "
+ "expected size\n");
+ rc = -EINVAL;
+ goto out;
+ }
if (data[(*packet_size)++] != 0x62) {
printk(KERN_WARNING "Unrecognizable packet\n");
rc = -EINVAL;
diff --git a/fs/exec.c b/fs/exec.c
index 4a8849e45b2..172ceb6edde 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -678,8 +678,8 @@ exit:
}
EXPORT_SYMBOL(open_exec);
-int kernel_read(struct file *file, unsigned long offset,
- char *addr, unsigned long count)
+int kernel_read(struct file *file, loff_t offset,
+ char *addr, unsigned long count)
{
mm_segment_t old_fs;
loff_t pos = offset;
@@ -1016,6 +1016,35 @@ out:
EXPORT_SYMBOL(flush_old_exec);
/*
+ * Prepare credentials and lock ->cred_guard_mutex.
+ * install_exec_creds() commits the new creds and drops the lock.
+ * Or, if exec fails before, free_bprm() should release ->cred and
+ * and unlock.
+ */
+int prepare_bprm_creds(struct linux_binprm *bprm)
+{
+ if (mutex_lock_interruptible(&current->cred_guard_mutex))
+ return -ERESTARTNOINTR;
+
+ bprm->cred = prepare_exec_creds();
+ if (likely(bprm->cred))
+ return 0;
+
+ mutex_unlock(&current->cred_guard_mutex);
+ return -ENOMEM;
+}
+
+void free_bprm(struct linux_binprm *bprm)
+{
+ free_arg_pages(bprm);
+ if (bprm->cred) {
+ mutex_unlock(&current->cred_guard_mutex);
+ abort_creds(bprm->cred);
+ }
+ kfree(bprm);
+}
+
+/*
* install the new credentials for this executable
*/
void install_exec_creds(struct linux_binprm *bprm)
@@ -1024,12 +1053,13 @@ void install_exec_creds(struct linux_binprm *bprm)
commit_creds(bprm->cred);
bprm->cred = NULL;
-
- /* cred_guard_mutex must be held at least to this point to prevent
+ /*
+ * cred_guard_mutex must be held at least to this point to prevent
* ptrace_attach() from altering our determination of the task's
- * credentials; any time after this it may be unlocked */
-
+ * credentials; any time after this it may be unlocked.
+ */
security_bprm_committed_creds(bprm);
+ mutex_unlock(&current->cred_guard_mutex);
}
EXPORT_SYMBOL(install_exec_creds);
@@ -1246,14 +1276,6 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
EXPORT_SYMBOL(search_binary_handler);
-void free_bprm(struct linux_binprm *bprm)
-{
- free_arg_pages(bprm);
- if (bprm->cred)
- abort_creds(bprm->cred);
- kfree(bprm);
-}
-
/*
* sys_execve() executes a new program.
*/
@@ -1277,20 +1299,15 @@ int do_execve(char * filename,
if (!bprm)
goto out_files;
- retval = -ERESTARTNOINTR;
- if (mutex_lock_interruptible(&current->cred_guard_mutex))
+ retval = prepare_bprm_creds(bprm);
+ if (retval)
goto out_free;
- current->in_execve = 1;
-
- retval = -ENOMEM;
- bprm->cred = prepare_exec_creds();
- if (!bprm->cred)
- goto out_unlock;
retval = check_unsafe_exec(bprm);
if (retval < 0)
- goto out_unlock;
+ goto out_free;
clear_in_exec = retval;
+ current->in_execve = 1;
file = open_exec(filename);
retval = PTR_ERR(file);
@@ -1340,7 +1357,6 @@ int do_execve(char * filename,
/* execve succeeded */
current->fs->in_exec = 0;
current->in_execve = 0;
- mutex_unlock(&current->cred_guard_mutex);
acct_update_integrals(current);
free_bprm(bprm);
if (displaced)
@@ -1360,10 +1376,7 @@ out_file:
out_unmark:
if (clear_in_exec)
current->fs->in_exec = 0;
-
-out_unlock:
current->in_execve = 0;
- mutex_unlock(&current->cred_guard_mutex);
out_free:
free_bprm(bprm);
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index d636e1297ca..a63d44256a7 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -230,7 +230,7 @@ ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
return error;
}
-static int
+int
ext2_check_acl(struct inode *inode, int mask)
{
struct posix_acl *acl = ext2_get_acl(inode, ACL_TYPE_ACCESS);
@@ -246,12 +246,6 @@ ext2_check_acl(struct inode *inode, int mask)
return -EAGAIN;
}
-int
-ext2_permission(struct inode *inode, int mask)
-{
- return generic_permission(inode, mask, ext2_check_acl);
-}
-
/*
* Initialize the ACLs of a new inode. Called from ext2_new_inode.
*
diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h
index ecefe478898..3ff6cbb9ac4 100644
--- a/fs/ext2/acl.h
+++ b/fs/ext2/acl.h
@@ -54,13 +54,13 @@ static inline int ext2_acl_count(size_t size)
#ifdef CONFIG_EXT2_FS_POSIX_ACL
/* acl.c */
-extern int ext2_permission (struct inode *, int);
+extern int ext2_check_acl (struct inode *, int);
extern int ext2_acl_chmod (struct inode *);
extern int ext2_init_acl (struct inode *, struct inode *);
#else
#include <linux/sched.h>
-#define ext2_permission NULL
+#define ext2_check_acl NULL
#define ext2_get_acl NULL
#define ext2_set_acl NULL
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 2b9e47dc922..a2f3afd1a1c 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -85,6 +85,6 @@ const struct inode_operations ext2_file_inode_operations = {
.removexattr = generic_removexattr,
#endif
.setattr = ext2_setattr,
- .permission = ext2_permission,
+ .check_acl = ext2_check_acl,
.fiemap = ext2_fiemap,
};
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index e27130341d4..1c1638f873a 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -482,7 +482,7 @@ static int ext2_alloc_branch(struct inode *inode,
unlock_buffer(bh);
mark_buffer_dirty_inode(bh, inode);
/* We used to sync bh here if IS_SYNC(inode).
- * But we now rely upon generic_osync_inode()
+ * But we now rely upon generic_write_sync()
* and b_inode_buffers. But not for directories.
*/
if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode))
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index e1dedb0f787..23701f289e9 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -362,6 +362,10 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
if (dir_de) {
if (old_dir != new_dir)
ext2_set_link(old_inode, dir_de, dir_page, new_dir, 0);
+ else {
+ kunmap(dir_page);
+ page_cache_release(dir_page);
+ }
inode_dec_link_count(old_dir);
}
return 0;
@@ -396,7 +400,7 @@ const struct inode_operations ext2_dir_inode_operations = {
.removexattr = generic_removexattr,
#endif
.setattr = ext2_setattr,
- .permission = ext2_permission,
+ .check_acl = ext2_check_acl,
};
const struct inode_operations ext2_special_inode_operations = {
@@ -407,5 +411,5 @@ const struct inode_operations ext2_special_inode_operations = {
.removexattr = generic_removexattr,
#endif
.setattr = ext2_setattr,
- .permission = ext2_permission,
+ .check_acl = ext2_check_acl,
};
diff --git a/fs/ext3/Kconfig b/fs/ext3/Kconfig
index fb3c1a21b13..522b15498f4 100644
--- a/fs/ext3/Kconfig
+++ b/fs/ext3/Kconfig
@@ -29,23 +29,25 @@ config EXT3_FS
module will be called ext3.
config EXT3_DEFAULTS_TO_ORDERED
- bool "Default to 'data=ordered' in ext3 (legacy option)"
+ bool "Default to 'data=ordered' in ext3"
depends on EXT3_FS
help
- If a filesystem does not explicitly specify a data ordering
- mode, and the journal capability allowed it, ext3 used to
- historically default to 'data=ordered'.
-
- That was a rather unfortunate choice, because it leads to all
- kinds of latency problems, and the 'data=writeback' mode is more
- appropriate these days.
-
- You should probably always answer 'n' here, and if you really
- want to use 'data=ordered' mode, set it in the filesystem itself
- with 'tune2fs -o journal_data_ordered'.
-
- But if you really want to enable the legacy default, you can do
- so by answering 'y' to this question.
+ The journal mode options for ext3 have different tradeoffs
+ between when data is guaranteed to be on disk and
+ performance. The use of "data=writeback" can cause
+ unwritten data to appear in files after an system crash or
+ power failure, which can be a security issue. However,
+ "data=ordered" mode can also result in major performance
+ problems, including seconds-long delays before an fsync()
+ call returns. For details, see:
+
+ http://ext4.wiki.kernel.org/index.php/Ext3_data_mode_tradeoffs
+
+ If you have been historically happy with ext3's performance,
+ data=ordered mode will be a safe choice and you should
+ answer 'y' here. If you understand the reliability and data
+ privacy issues of data=writeback and are willing to make
+ that trade off, answer 'n'.
config EXT3_FS_XATTR
bool "Ext3 extended attributes"
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index e167bae37ef..c9b0df376b5 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -238,7 +238,7 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type,
return error;
}
-static int
+int
ext3_check_acl(struct inode *inode, int mask)
{
struct posix_acl *acl = ext3_get_acl(inode, ACL_TYPE_ACCESS);
@@ -254,12 +254,6 @@ ext3_check_acl(struct inode *inode, int mask)
return -EAGAIN;
}
-int
-ext3_permission(struct inode *inode, int mask)
-{
- return generic_permission(inode, mask, ext3_check_acl);
-}
-
/*
* Initialize the ACLs of a new inode. Called from ext3_new_inode.
*
diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h
index 07d15a3a596..597334626de 100644
--- a/fs/ext3/acl.h
+++ b/fs/ext3/acl.h
@@ -54,13 +54,13 @@ static inline int ext3_acl_count(size_t size)
#ifdef CONFIG_EXT3_FS_POSIX_ACL
/* acl.c */
-extern int ext3_permission (struct inode *, int);
+extern int ext3_check_acl (struct inode *, int);
extern int ext3_acl_chmod (struct inode *);
extern int ext3_init_acl (handle_t *, struct inode *, struct inode *);
#else /* CONFIG_EXT3_FS_POSIX_ACL */
#include <linux/sched.h>
-#define ext3_permission NULL
+#define ext3_check_acl NULL
static inline int
ext3_acl_chmod(struct inode *inode)
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index 3d724a95882..373fa90c796 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -130,8 +130,7 @@ static int ext3_readdir(struct file * filp,
struct buffer_head *bh = NULL;
map_bh.b_state = 0;
- err = ext3_get_blocks_handle(NULL, inode, blk, 1,
- &map_bh, 0, 0);
+ err = ext3_get_blocks_handle(NULL, inode, blk, 1, &map_bh, 0);
if (err > 0) {
pgoff_t index = map_bh.b_blocknr >>
(PAGE_CACHE_SHIFT - inode->i_blkbits);
diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index 5b49704b231..388bbdfa0b4 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -51,71 +51,12 @@ static int ext3_release_file (struct inode * inode, struct file * filp)
return 0;
}
-static ssize_t
-ext3_file_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
-{
- struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_path.dentry->d_inode;
- ssize_t ret;
- int err;
-
- ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
-
- /*
- * Skip flushing if there was an error, or if nothing was written.
- */
- if (ret <= 0)
- return ret;
-
- /*
- * If the inode is IS_SYNC, or is O_SYNC and we are doing data
- * journalling then we need to make sure that we force the transaction
- * to disk to keep all metadata uptodate synchronously.
- */
- if (file->f_flags & O_SYNC) {
- /*
- * If we are non-data-journaled, then the dirty data has
- * already been flushed to backing store by generic_osync_inode,
- * and the inode has been flushed too if there have been any
- * modifications other than mere timestamp updates.
- *
- * Open question --- do we care about flushing timestamps too
- * if the inode is IS_SYNC?
- */
- if (!ext3_should_journal_data(inode))
- return ret;
-
- goto force_commit;
- }
-
- /*
- * So we know that there has been no forced data flush. If the inode
- * is marked IS_SYNC, we need to force one ourselves.
- */
- if (!IS_SYNC(inode))
- return ret;
-
- /*
- * Open question #2 --- should we force data to disk here too? If we
- * don't, the only impact is that data=writeback filesystems won't
- * flush data to disk automatically on IS_SYNC, only metadata (but
- * historically, that is what ext2 has done.)
- */
-
-force_commit:
- err = ext3_force_commit(inode->i_sb);
- if (err)
- return err;
- return ret;
-}
-
const struct file_operations ext3_file_operations = {
.llseek = generic_file_llseek,
.read = do_sync_read,
.write = do_sync_write,
.aio_read = generic_file_aio_read,
- .aio_write = ext3_file_write,
+ .aio_write = generic_file_aio_write,
.unlocked_ioctl = ext3_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ext3_compat_ioctl,
@@ -137,7 +78,7 @@ const struct inode_operations ext3_file_inode_operations = {
.listxattr = ext3_listxattr,
.removexattr = generic_removexattr,
#endif
- .permission = ext3_permission,
+ .check_acl = ext3_check_acl,
.fiemap = ext3_fiemap,
};
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
index d33634119e1..451d166bbe9 100644
--- a/fs/ext3/fsync.c
+++ b/fs/ext3/fsync.c
@@ -23,6 +23,7 @@
*/
#include <linux/time.h>
+#include <linux/blkdev.h>
#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/writeback.h>
@@ -73,7 +74,7 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
}
if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
- goto out;
+ goto flush;
/*
* The VFS has written the file data. If the inode is unaltered
@@ -85,7 +86,16 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
.nr_to_write = 0, /* sys_fsync did this */
};
ret = sync_inode(inode, &wbc);
+ goto out;
}
+flush:
+ /*
+ * In case we didn't commit a transaction, we have to flush
+ * disk caches manually so that data really is on persistent
+ * storage
+ */
+ if (test_opt(inode->i_sb, BARRIER))
+ blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
out:
return ret;
}
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 5f51fed5c75..cd098a7b77f 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -172,10 +172,21 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
* so before we call here everything must be consistently dirtied against
* this transaction.
*/
-static int ext3_journal_test_restart(handle_t *handle, struct inode *inode)
+static int truncate_restart_transaction(handle_t *handle, struct inode *inode)
{
+ int ret;
+
jbd_debug(2, "restarting handle %p\n", handle);
- return ext3_journal_restart(handle, blocks_for_truncate(inode));
+ /*
+ * Drop truncate_mutex to avoid deadlock with ext3_get_blocks_handle
+ * At this moment, get_block can be called only for blocks inside
+ * i_size since page cache has been already dropped and writes are
+ * blocked by i_mutex. So we can safely drop the truncate_mutex.
+ */
+ mutex_unlock(&EXT3_I(inode)->truncate_mutex);
+ ret = ext3_journal_restart(handle, blocks_for_truncate(inode));
+ mutex_lock(&EXT3_I(inode)->truncate_mutex);
+ return ret;
}
/*
@@ -788,7 +799,7 @@ err_out:
int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
sector_t iblock, unsigned long maxblocks,
struct buffer_head *bh_result,
- int create, int extend_disksize)
+ int create)
{
int err = -EIO;
int offsets[4];
@@ -911,13 +922,6 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
if (!err)
err = ext3_splice_branch(handle, inode, iblock,
partial, indirect_blks, count);
- /*
- * i_disksize growing is protected by truncate_mutex. Don't forget to
- * protect it if you're about to implement concurrent
- * ext3_get_block() -bzzz
- */
- if (!err && extend_disksize && inode->i_size > ei->i_disksize)
- ei->i_disksize = inode->i_size;
mutex_unlock(&ei->truncate_mutex);
if (err)
goto cleanup;
@@ -972,7 +976,7 @@ static int ext3_get_block(struct inode *inode, sector_t iblock,
}
ret = ext3_get_blocks_handle(handle, inode, iblock,
- max_blocks, bh_result, create, 0);
+ max_blocks, bh_result, create);
if (ret > 0) {
bh_result->b_size = (ret << inode->i_blkbits);
ret = 0;
@@ -1005,7 +1009,7 @@ struct buffer_head *ext3_getblk(handle_t *handle, struct inode *inode,
dummy.b_blocknr = -1000;
buffer_trace_init(&dummy.b_history);
err = ext3_get_blocks_handle(handle, inode, block, 1,
- &dummy, create, 1);
+ &dummy, create);
/*
* ext3_get_blocks_handle() returns number of blocks
* mapped. 0 in case of a HOLE.
@@ -1193,15 +1197,16 @@ write_begin_failed:
* i_size_read because we hold i_mutex.
*
* Add inode to orphan list in case we crash before truncate
- * finishes.
+ * finishes. Do this only if ext3_can_truncate() agrees so
+ * that orphan processing code is happy.
*/
- if (pos + len > inode->i_size)
+ if (pos + len > inode->i_size && ext3_can_truncate(inode))
ext3_orphan_add(handle, inode);
ext3_journal_stop(handle);
unlock_page(page);
page_cache_release(page);
if (pos + len > inode->i_size)
- vmtruncate(inode, inode->i_size);
+ ext3_truncate(inode);
}
if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
goto retry;
@@ -1287,7 +1292,7 @@ static int ext3_ordered_write_end(struct file *file,
* There may be allocated blocks outside of i_size because
* we failed to copy some data. Prepare for truncate.
*/
- if (pos + len > inode->i_size)
+ if (pos + len > inode->i_size && ext3_can_truncate(inode))
ext3_orphan_add(handle, inode);
ret2 = ext3_journal_stop(handle);
if (!ret)
@@ -1296,7 +1301,7 @@ static int ext3_ordered_write_end(struct file *file,
page_cache_release(page);
if (pos + len > inode->i_size)
- vmtruncate(inode, inode->i_size);
+ ext3_truncate(inode);
return ret ? ret : copied;
}
@@ -1315,14 +1320,14 @@ static int ext3_writeback_write_end(struct file *file,
* There may be allocated blocks outside of i_size because
* we failed to copy some data. Prepare for truncate.
*/
- if (pos + len > inode->i_size)
+ if (pos + len > inode->i_size && ext3_can_truncate(inode))
ext3_orphan_add(handle, inode);
ret = ext3_journal_stop(handle);
unlock_page(page);
page_cache_release(page);
if (pos + len > inode->i_size)
- vmtruncate(inode, inode->i_size);
+ ext3_truncate(inode);
return ret ? ret : copied;
}
@@ -1358,7 +1363,7 @@ static int ext3_journalled_write_end(struct file *file,
* There may be allocated blocks outside of i_size because
* we failed to copy some data. Prepare for truncate.
*/
- if (pos + len > inode->i_size)
+ if (pos + len > inode->i_size && ext3_can_truncate(inode))
ext3_orphan_add(handle, inode);
EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
if (inode->i_size > EXT3_I(inode)->i_disksize) {
@@ -1375,7 +1380,7 @@ static int ext3_journalled_write_end(struct file *file,
page_cache_release(page);
if (pos + len > inode->i_size)
- vmtruncate(inode, inode->i_size);
+ ext3_truncate(inode);
return ret ? ret : copied;
}
@@ -2078,7 +2083,7 @@ static void ext3_clear_blocks(handle_t *handle, struct inode *inode,
ext3_journal_dirty_metadata(handle, bh);
}
ext3_mark_inode_dirty(handle, inode);
- ext3_journal_test_restart(handle, inode);
+ truncate_restart_transaction(handle, inode);
if (bh) {
BUFFER_TRACE(bh, "retaking write access");
ext3_journal_get_write_access(handle, bh);
@@ -2288,7 +2293,7 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode,
return;
if (try_to_extend_transaction(handle, inode)) {
ext3_mark_inode_dirty(handle, inode);
- ext3_journal_test_restart(handle, inode);
+ truncate_restart_transaction(handle, inode);
}
ext3_free_blocks(handle, inode, nr, 1);
@@ -2898,6 +2903,10 @@ static int ext3_do_update_inode(handle_t *handle,
struct buffer_head *bh = iloc->bh;
int err = 0, rc, block;
+again:
+ /* we can't allow multiple procs in here at once, its a bit racey */
+ lock_buffer(bh);
+
/* For fields not not tracking in the in-memory inode,
* initialise them to zero for new inodes. */
if (ei->i_state & EXT3_STATE_NEW)
@@ -2957,16 +2966,20 @@ static int ext3_do_update_inode(handle_t *handle,
/* If this is the first large file
* created, add a flag to the superblock.
*/
+ unlock_buffer(bh);
err = ext3_journal_get_write_access(handle,
EXT3_SB(sb)->s_sbh);
if (err)
goto out_brelse;
+
ext3_update_dynamic_rev(sb);
EXT3_SET_RO_COMPAT_FEATURE(sb,
EXT3_FEATURE_RO_COMPAT_LARGE_FILE);
handle->h_sync = 1;
err = ext3_journal_dirty_metadata(handle,
EXT3_SB(sb)->s_sbh);
+ /* get our lock and start over */
+ goto again;
}
}
}
@@ -2989,6 +3002,7 @@ static int ext3_do_update_inode(handle_t *handle,
raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
+ unlock_buffer(bh);
rc = ext3_journal_dirty_metadata(handle, bh);
if (!err)
err = rc;
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 6ff7b973023..aad6400c9b7 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -2445,7 +2445,7 @@ const struct inode_operations ext3_dir_inode_operations = {
.listxattr = ext3_listxattr,
.removexattr = generic_removexattr,
#endif
- .permission = ext3_permission,
+ .check_acl = ext3_check_acl,
};
const struct inode_operations ext3_special_inode_operations = {
@@ -2456,5 +2456,5 @@ const struct inode_operations ext3_special_inode_operations = {
.listxattr = ext3_listxattr,
.removexattr = generic_removexattr,
#endif
- .permission = ext3_permission,
+ .check_acl = ext3_check_acl,
};
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 524b349c629..a8d80a7f110 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -543,6 +543,19 @@ static inline void ext3_show_quota_options(struct seq_file *seq, struct super_bl
#endif
}
+static char *data_mode_string(unsigned long mode)
+{
+ switch (mode) {
+ case EXT3_MOUNT_JOURNAL_DATA:
+ return "journal";
+ case EXT3_MOUNT_ORDERED_DATA:
+ return "ordered";
+ case EXT3_MOUNT_WRITEBACK_DATA:
+ return "writeback";
+ }
+ return "unknown";
+}
+
/*
* Show an option if
* - it's set to a non-default value OR
@@ -616,13 +629,8 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs)
if (test_opt(sb, NOBH))
seq_puts(seq, ",nobh");
- if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA)
- seq_puts(seq, ",data=journal");
- else if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA)
- seq_puts(seq, ",data=ordered");
- else if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA)
- seq_puts(seq, ",data=writeback");
-
+ seq_printf(seq, ",data=%s", data_mode_string(sbi->s_mount_opt &
+ EXT3_MOUNT_DATA_FLAGS));
if (test_opt(sb, DATA_ERR_ABORT))
seq_puts(seq, ",data_err=abort");
@@ -1024,12 +1032,18 @@ static int parse_options (char *options, struct super_block *sb,
datacheck:
if (is_remount) {
if ((sbi->s_mount_opt & EXT3_MOUNT_DATA_FLAGS)
- != data_opt) {
- printk(KERN_ERR
- "EXT3-fs: cannot change data "
- "mode on remount\n");
- return 0;
- }
+ == data_opt)
+ break;
+ printk(KERN_ERR
+ "EXT3-fs (device %s): Cannot change "
+ "data mode on remount. The filesystem "
+ "is mounted in data=%s mode and you "
+ "try to remount it in data=%s mode.\n",
+ sb->s_id,
+ data_mode_string(sbi->s_mount_opt &
+ EXT3_MOUNT_DATA_FLAGS),
+ data_mode_string(data_opt));
+ return 0;
} else {
sbi->s_mount_opt &= ~EXT3_MOUNT_DATA_FLAGS;
sbi->s_mount_opt |= data_opt;
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index 418b6f3b0ae..d5c0ea2e8f2 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -37,7 +37,7 @@ config EXT4DEV_COMPAT
To enable backwards compatibility so that systems that are
still expecting to mount ext4 filesystems using ext4dev,
- chose Y here. This feature will go away by 2.6.31, so
+ choose Y here. This feature will go away by 2.6.31, so
please arrange to get your userspace programs fixed!
config EXT4_FS_XATTR
@@ -77,3 +77,12 @@ config EXT4_FS_SECURITY
If you are not using a security module that requires using
extended attributes for file security labels, say N.
+
+config EXT4_DEBUG
+ bool "EXT4 debugging support"
+ depends on EXT4_FS
+ help
+ Enables run-time debugging support for the ext4 filesystem.
+
+ If you select Y here, then you will be able to turn on debugging
+ with a command such as "echo 1 > /sys/kernel/debug/ext4/mballoc-debug"
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index f6d8967149c..0df88b2a69b 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -236,7 +236,7 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,
return error;
}
-static int
+int
ext4_check_acl(struct inode *inode, int mask)
{
struct posix_acl *acl = ext4_get_acl(inode, ACL_TYPE_ACCESS);
@@ -252,12 +252,6 @@ ext4_check_acl(struct inode *inode, int mask)
return -EAGAIN;
}
-int
-ext4_permission(struct inode *inode, int mask)
-{
- return generic_permission(inode, mask, ext4_check_acl);
-}
-
/*
* Initialize the ACLs of a new inode. Called from ext4_new_inode.
*
diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
index 949789d2bba..9d843d5deac 100644
--- a/fs/ext4/acl.h
+++ b/fs/ext4/acl.h
@@ -54,13 +54,13 @@ static inline int ext4_acl_count(size_t size)
#ifdef CONFIG_EXT4_FS_POSIX_ACL
/* acl.c */
-extern int ext4_permission(struct inode *, int);
+extern int ext4_check_acl(struct inode *, int);
extern int ext4_acl_chmod(struct inode *);
extern int ext4_init_acl(handle_t *, struct inode *, struct inode *);
#else /* CONFIG_EXT4_FS_POSIX_ACL */
#include <linux/sched.h>
-#define ext4_permission NULL
+#define ext4_check_acl NULL
static inline int
ext4_acl_chmod(struct inode *inode)
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index e2126d70dff..1d0418980f8 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -478,7 +478,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
* new bitmap information
*/
set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
- ext4_mb_update_group_info(grp, blocks_freed);
+ grp->bb_free += blocks_freed;
up_write(&grp->alloc_sem);
/* We dirtied the bitmap block */
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 9714db393ef..e227eea23f0 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -67,27 +67,29 @@ typedef unsigned int ext4_group_t;
/* prefer goal again. length */
-#define EXT4_MB_HINT_MERGE 1
+#define EXT4_MB_HINT_MERGE 0x0001
/* blocks already reserved */
-#define EXT4_MB_HINT_RESERVED 2
+#define EXT4_MB_HINT_RESERVED 0x0002
/* metadata is being allocated */
-#define EXT4_MB_HINT_METADATA 4
+#define EXT4_MB_HINT_METADATA 0x0004
/* first blocks in the file */
-#define EXT4_MB_HINT_FIRST 8
+#define EXT4_MB_HINT_FIRST 0x0008
/* search for the best chunk */
-#define EXT4_MB_HINT_BEST 16
+#define EXT4_MB_HINT_BEST 0x0010
/* data is being allocated */
-#define EXT4_MB_HINT_DATA 32
+#define EXT4_MB_HINT_DATA 0x0020
/* don't preallocate (for tails) */
-#define EXT4_MB_HINT_NOPREALLOC 64
+#define EXT4_MB_HINT_NOPREALLOC 0x0040
/* allocate for locality group */
-#define EXT4_MB_HINT_GROUP_ALLOC 128
+#define EXT4_MB_HINT_GROUP_ALLOC 0x0080
/* allocate goal blocks or none */
-#define EXT4_MB_HINT_GOAL_ONLY 256
+#define EXT4_MB_HINT_GOAL_ONLY 0x0100
/* goal is meaningful */
-#define EXT4_MB_HINT_TRY_GOAL 512
+#define EXT4_MB_HINT_TRY_GOAL 0x0200
/* blocks already pre-reserved by delayed allocation */
-#define EXT4_MB_DELALLOC_RESERVED 1024
+#define EXT4_MB_DELALLOC_RESERVED 0x0400
+/* We are doing stream allocation */
+#define EXT4_MB_STREAM_ALLOC 0x0800
struct ext4_allocation_request {
@@ -112,6 +114,21 @@ struct ext4_allocation_request {
};
/*
+ * For delayed allocation tracking
+ */
+struct mpage_da_data {
+ struct inode *inode;
+ sector_t b_blocknr; /* start block number of extent */
+ size_t b_size; /* size of extent */
+ unsigned long b_state; /* state of the extent */
+ unsigned long first_page, next_page; /* extent of pages */
+ struct writeback_control *wbc;
+ int io_done;
+ int pages_written;
+ int retval;
+};
+
+/*
* Special inodes numbers
*/
#define EXT4_BAD_INO 1 /* Bad blocks inode */
@@ -251,7 +268,6 @@ struct flex_groups {
#define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/
#define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */
#define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */
-#define EXT4_EXT_MIGRATE 0x00100000 /* Inode is migrating */
#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */
#define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */
@@ -289,6 +305,7 @@ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
#define EXT4_STATE_XATTR 0x00000004 /* has in-inode xattrs */
#define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */
#define EXT4_STATE_DA_ALLOC_CLOSE 0x00000010 /* Alloc DA blks on close */
+#define EXT4_STATE_EXT_MIGRATE 0x00000020 /* Inode is migrating */
/* Used to pass group descriptor data when online resize is done */
struct ext4_new_group_input {
@@ -386,6 +403,9 @@ struct ext4_mount_options {
#endif
};
+/* Max physical block we can addres w/o extents */
+#define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF
+
/*
* Structure of an inode on the disk
*/
@@ -456,7 +476,6 @@ struct move_extent {
__u64 len; /* block length to be moved */
__u64 moved_len; /* moved block length */
};
-#define MAX_DEFRAG_SIZE ((1UL<<31) - 1)
#define EXT4_EPOCH_BITS 2
#define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1)
@@ -694,7 +713,6 @@ struct ext4_inode_info {
#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */
#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */
#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */
-#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */
#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */
#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
@@ -841,6 +859,7 @@ struct ext4_sb_info {
unsigned long s_gdb_count; /* Number of group descriptor blocks */
unsigned long s_desc_per_block; /* Number of group descriptors per block */
ext4_group_t s_groups_count; /* Number of groups in the fs */
+ ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */
unsigned long s_overhead_last; /* Last calculated overhead */
unsigned long s_blocks_last; /* Last seen block count */
loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */
@@ -950,6 +969,7 @@ struct ext4_sb_info {
atomic_t s_mb_lost_chunks;
atomic_t s_mb_preallocated;
atomic_t s_mb_discarded;
+ atomic_t s_lock_busy;
/* locality groups */
struct ext4_locality_group *s_locality_groups;
@@ -1340,8 +1360,6 @@ extern void ext4_mb_free_blocks(handle_t *, struct inode *,
ext4_fsblk_t, unsigned long, int, unsigned long *);
extern int ext4_mb_add_groupinfo(struct super_block *sb,
ext4_group_t i, struct ext4_group_desc *desc);
-extern void ext4_mb_update_group_info(struct ext4_group_info *grp,
- ext4_grpblk_t add);
extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t);
extern void ext4_mb_put_buddy_cache_lock(struct super_block *,
ext4_group_t, int);
@@ -1367,6 +1385,7 @@ extern int ext4_change_inode_journal_flag(struct inode *, int);
extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
extern int ext4_can_truncate(struct inode *inode);
extern void ext4_truncate(struct inode *);
+extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
extern void ext4_set_inode_flags(struct inode *);
extern void ext4_get_inode_flags(struct ext4_inode_info *);
extern int ext4_alloc_da_blocks(struct inode *inode);
@@ -1575,15 +1594,18 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
struct ext4_group_info {
unsigned long bb_state;
struct rb_root bb_free_root;
- unsigned short bb_first_free;
- unsigned short bb_free;
- unsigned short bb_fragments;
+ ext4_grpblk_t bb_first_free; /* first free block */
+ ext4_grpblk_t bb_free; /* total free blocks */
+ ext4_grpblk_t bb_fragments; /* nr of freespace fragments */
struct list_head bb_prealloc_list;
#ifdef DOUBLE_CHECK
void *bb_bitmap;
#endif
struct rw_semaphore alloc_sem;
- unsigned short bb_counters[];
+ ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block
+ * regions, index is order.
+ * bb_counters[3] = 5 means
+ * 5 free 8-block regions. */
};
#define EXT4_GROUP_INFO_NEED_INIT_BIT 0
@@ -1591,15 +1613,42 @@ struct ext4_group_info {
#define EXT4_MB_GRP_NEED_INIT(grp) \
(test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
+#define EXT4_MAX_CONTENTION 8
+#define EXT4_CONTENTION_THRESHOLD 2
+
static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb,
ext4_group_t group)
{
return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group);
}
+/*
+ * Returns true if the filesystem is busy enough that attempts to
+ * access the block group locks has run into contention.
+ */
+static inline int ext4_fs_is_busy(struct ext4_sb_info *sbi)
+{
+ return (atomic_read(&sbi->s_lock_busy) > EXT4_CONTENTION_THRESHOLD);
+}
+
static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
{
- spin_lock(ext4_group_lock_ptr(sb, group));
+ spinlock_t *lock = ext4_group_lock_ptr(sb, group);
+ if (spin_trylock(lock))
+ /*
+ * We're able to grab the lock right away, so drop the
+ * lock contention counter.
+ */
+ atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0);
+ else {
+ /*
+ * The lock is busy, so bump the contention counter,
+ * and then wait on the spin lock.
+ */
+ atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, 1,
+ EXT4_MAX_CONTENTION);
+ spin_lock(lock);
+ }
}
static inline void ext4_unlock_group(struct super_block *sb,
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 20a84105a10..61652f1d15e 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -43,8 +43,7 @@
#define CHECK_BINSEARCH__
/*
- * If EXT_DEBUG is defined you can use the 'extdebug' mount option
- * to get lots of info about what's going on.
+ * Turn on EXT_DEBUG to get lots of info about extents operations.
*/
#define EXT_DEBUG__
#ifdef EXT_DEBUG
@@ -138,6 +137,7 @@ typedef int (*ext_prepare_callback)(struct inode *, struct ext4_ext_path *,
#define EXT_BREAK 1
#define EXT_REPEAT 2
+/* Maximum logical block in a file; ext4_extent's ee_block is __le32 */
#define EXT_MAX_BLOCK 0xffffffff
/*
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index eb27fd0f2ee..6a9409920de 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -44,7 +44,7 @@ int __ext4_journal_forget(const char *where, handle_t *handle,
handle, err);
}
else
- brelse(bh);
+ bforget(bh);
return err;
}
@@ -60,7 +60,7 @@ int __ext4_journal_revoke(const char *where, handle_t *handle,
handle, err);
}
else
- brelse(bh);
+ bforget(bh);
return err;
}
@@ -89,7 +89,10 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
ext4_journal_abort_handle(where, __func__, bh,
handle, err);
} else {
- mark_buffer_dirty(bh);
+ if (inode && bh)
+ mark_buffer_dirty_inode(bh, inode);
+ else
+ mark_buffer_dirty(bh);
if (inode && inode_needs_sync(inode)) {
sync_dirty_buffer(bh);
if (buffer_req(bh) && !buffer_uptodate(bh)) {
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 73ebfb44ad7..7a383257792 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -93,7 +93,9 @@ static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
}
-static int ext4_ext_journal_restart(handle_t *handle, int needed)
+static int ext4_ext_truncate_extend_restart(handle_t *handle,
+ struct inode *inode,
+ int needed)
{
int err;
@@ -104,7 +106,14 @@ static int ext4_ext_journal_restart(handle_t *handle, int needed)
err = ext4_journal_extend(handle, needed);
if (err <= 0)
return err;
- return ext4_journal_restart(handle, needed);
+ err = ext4_truncate_restart_trans(handle, inode, needed);
+ /*
+ * We have dropped i_data_sem so someone might have cached again
+ * an extent we are going to truncate.
+ */
+ ext4_ext_invalidate_cache(inode);
+
+ return err;
}
/*
@@ -220,57 +229,65 @@ ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
return newblock;
}
-static int ext4_ext_space_block(struct inode *inode)
+static inline int ext4_ext_space_block(struct inode *inode, int check)
{
int size;
size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
/ sizeof(struct ext4_extent);
+ if (!check) {
#ifdef AGGRESSIVE_TEST
- if (size > 6)
- size = 6;
+ if (size > 6)
+ size = 6;
#endif
+ }
return size;
}
-static int ext4_ext_space_block_idx(struct inode *inode)
+static inline int ext4_ext_space_block_idx(struct inode *inode, int check)
{
int size;
size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
/ sizeof(struct ext4_extent_idx);
+ if (!check) {
#ifdef AGGRESSIVE_TEST
- if (size > 5)
- size = 5;
+ if (size > 5)
+ size = 5;
#endif
+ }
return size;
}
-static int ext4_ext_space_root(struct inode *inode)
+static inline int ext4_ext_space_root(struct inode *inode, int check)
{
int size;
size = sizeof(EXT4_I(inode)->i_data);
size -= sizeof(struct ext4_extent_header);
size /= sizeof(struct ext4_extent);
+ if (!check) {
#ifdef AGGRESSIVE_TEST
- if (size > 3)
- size = 3;
+ if (size > 3)
+ size = 3;
#endif
+ }
return size;
}
-static int ext4_ext_space_root_idx(struct inode *inode)
+static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
{
int size;
size = sizeof(EXT4_I(inode)->i_data);
size -= sizeof(struct ext4_extent_header);
size /= sizeof(struct ext4_extent_idx);
+ if (!check) {
#ifdef AGGRESSIVE_TEST
- if (size > 4)
- size = 4;
+ if (size > 4)
+ size = 4;
#endif
+ }
return size;
}
@@ -284,9 +301,9 @@ int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks)
int lcap, icap, rcap, leafs, idxs, num;
int newextents = blocks;
- rcap = ext4_ext_space_root_idx(inode);
- lcap = ext4_ext_space_block(inode);
- icap = ext4_ext_space_block_idx(inode);
+ rcap = ext4_ext_space_root_idx(inode, 0);
+ lcap = ext4_ext_space_block(inode, 0);
+ icap = ext4_ext_space_block_idx(inode, 0);
/* number of new leaf blocks needed */
num = leafs = (newextents + lcap - 1) / lcap;
@@ -311,14 +328,14 @@ ext4_ext_max_entries(struct inode *inode, int depth)
if (depth == ext_depth(inode)) {
if (depth == 0)
- max = ext4_ext_space_root(inode);
+ max = ext4_ext_space_root(inode, 1);
else
- max = ext4_ext_space_root_idx(inode);
+ max = ext4_ext_space_root_idx(inode, 1);
} else {
if (depth == 0)
- max = ext4_ext_space_block(inode);
+ max = ext4_ext_space_block(inode, 1);
else
- max = ext4_ext_space_block_idx(inode);
+ max = ext4_ext_space_block_idx(inode, 1);
}
return max;
@@ -437,8 +454,9 @@ static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
ext_debug(" %d->%llu", le32_to_cpu(path->p_idx->ei_block),
idx_pblock(path->p_idx));
} else if (path->p_ext) {
- ext_debug(" %d:%d:%llu ",
+ ext_debug(" %d:[%d]%d:%llu ",
le32_to_cpu(path->p_ext->ee_block),
+ ext4_ext_is_uninitialized(path->p_ext),
ext4_ext_get_actual_len(path->p_ext),
ext_pblock(path->p_ext));
} else
@@ -460,8 +478,11 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
eh = path[depth].p_hdr;
ex = EXT_FIRST_EXTENT(eh);
+ ext_debug("Displaying leaf extents for inode %lu\n", inode->i_ino);
+
for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
- ext_debug("%d:%d:%llu ", le32_to_cpu(ex->ee_block),
+ ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
+ ext4_ext_is_uninitialized(ex),
ext4_ext_get_actual_len(ex), ext_pblock(ex));
}
ext_debug("\n");
@@ -580,9 +601,10 @@ ext4_ext_binsearch(struct inode *inode,
}
path->p_ext = l - 1;
- ext_debug(" -> %d:%llu:%d ",
+ ext_debug(" -> %d:%llu:[%d]%d ",
le32_to_cpu(path->p_ext->ee_block),
ext_pblock(path->p_ext),
+ ext4_ext_is_uninitialized(path->p_ext),
ext4_ext_get_actual_len(path->p_ext));
#ifdef CHECK_BINSEARCH
@@ -612,7 +634,7 @@ int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
eh->eh_depth = 0;
eh->eh_entries = 0;
eh->eh_magic = EXT4_EXT_MAGIC;
- eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode));
+ eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
ext4_mark_inode_dirty(handle, inode);
ext4_ext_invalidate_cache(inode);
return 0;
@@ -837,7 +859,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
neh = ext_block_hdr(bh);
neh->eh_entries = 0;
- neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode));
+ neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
neh->eh_magic = EXT4_EXT_MAGIC;
neh->eh_depth = 0;
ex = EXT_FIRST_EXTENT(neh);
@@ -850,9 +872,10 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
path[depth].p_ext++;
while (path[depth].p_ext <=
EXT_MAX_EXTENT(path[depth].p_hdr)) {
- ext_debug("move %d:%llu:%d in new leaf %llu\n",
+ ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
le32_to_cpu(path[depth].p_ext->ee_block),
ext_pblock(path[depth].p_ext),
+ ext4_ext_is_uninitialized(path[depth].p_ext),
ext4_ext_get_actual_len(path[depth].p_ext),
newblock);
/*memmove(ex++, path[depth].p_ext++,
@@ -912,7 +935,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
neh = ext_block_hdr(bh);
neh->eh_entries = cpu_to_le16(1);
neh->eh_magic = EXT4_EXT_MAGIC;
- neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode));
+ neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
neh->eh_depth = cpu_to_le16(depth - i);
fidx = EXT_FIRST_INDEX(neh);
fidx->ei_block = border;
@@ -1037,9 +1060,9 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
/* old root could have indexes or leaves
* so calculate e_max right way */
if (ext_depth(inode))
- neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode));
+ neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
else
- neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode));
+ neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
neh->eh_magic = EXT4_EXT_MAGIC;
set_buffer_uptodate(bh);
unlock_buffer(bh);
@@ -1054,7 +1077,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
goto out;
curp->p_hdr->eh_magic = EXT4_EXT_MAGIC;
- curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode));
+ curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
curp->p_hdr->eh_entries = cpu_to_le16(1);
curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr);
@@ -1580,9 +1603,11 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
/* try to insert block into found extent and return */
if (ex && ext4_can_extents_be_merged(inode, ex, newext)) {
- ext_debug("append %d block to %d:%d (from %llu)\n",
+ ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
+ ext4_ext_is_uninitialized(newext),
ext4_ext_get_actual_len(newext),
le32_to_cpu(ex->ee_block),
+ ext4_ext_is_uninitialized(ex),
ext4_ext_get_actual_len(ex), ext_pblock(ex));
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
@@ -1651,9 +1676,10 @@ has_space:
if (!nearex) {
/* there is no extent in this leaf, create first one */
- ext_debug("first extent in the leaf: %d:%llu:%d\n",
+ ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n",
le32_to_cpu(newext->ee_block),
ext_pblock(newext),
+ ext4_ext_is_uninitialized(newext),
ext4_ext_get_actual_len(newext));
path[depth].p_ext = EXT_FIRST_EXTENT(eh);
} else if (le32_to_cpu(newext->ee_block)
@@ -1663,10 +1689,11 @@ has_space:
len = EXT_MAX_EXTENT(eh) - nearex;
len = (len - 1) * sizeof(struct ext4_extent);
len = len < 0 ? 0 : len;
- ext_debug("insert %d:%llu:%d after: nearest 0x%p, "
+ ext_debug("insert %d:%llu:[%d]%d after: nearest 0x%p, "
"move %d from 0x%p to 0x%p\n",
le32_to_cpu(newext->ee_block),
ext_pblock(newext),
+ ext4_ext_is_uninitialized(newext),
ext4_ext_get_actual_len(newext),
nearex, len, nearex + 1, nearex + 2);
memmove(nearex + 2, nearex + 1, len);
@@ -1676,10 +1703,11 @@ has_space:
BUG_ON(newext->ee_block == nearex->ee_block);
len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent);
len = len < 0 ? 0 : len;
- ext_debug("insert %d:%llu:%d before: nearest 0x%p, "
+ ext_debug("insert %d:%llu:[%d]%d before: nearest 0x%p, "
"move %d from 0x%p to 0x%p\n",
le32_to_cpu(newext->ee_block),
ext_pblock(newext),
+ ext4_ext_is_uninitialized(newext),
ext4_ext_get_actual_len(newext),
nearex, len, nearex + 1, nearex + 2);
memmove(nearex + 1, nearex, len);
@@ -2094,7 +2122,8 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
else
uninitialized = 0;
- ext_debug("remove ext %lu:%u\n", ex_ee_block, ex_ee_len);
+ ext_debug("remove ext %u:[%d]%d\n", ex_ee_block,
+ uninitialized, ex_ee_len);
path[depth].p_ext = ex;
a = ex_ee_block > start ? ex_ee_block : start;
@@ -2138,7 +2167,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
}
credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
- err = ext4_ext_journal_restart(handle, credits);
+ err = ext4_ext_truncate_extend_restart(handle, inode, credits);
if (err)
goto out;
@@ -2327,7 +2356,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
if (err == 0) {
ext_inode_hdr(inode)->eh_depth = 0;
ext_inode_hdr(inode)->eh_max =
- cpu_to_le16(ext4_ext_space_root(inode));
+ cpu_to_le16(ext4_ext_space_root(inode, 0));
err = ext4_ext_dirty(handle, inode, path);
}
}
@@ -2743,6 +2772,7 @@ insert:
} else if (err)
goto fix_extent_len;
out:
+ ext4_ext_show_leaf(inode, path);
return err ? err : allocated;
fix_extent_len:
@@ -2786,7 +2816,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
struct ext4_allocation_request ar;
__clear_bit(BH_New, &bh_result->b_state);
- ext_debug("blocks %u/%u requested for inode %u\n",
+ ext_debug("blocks %u/%u requested for inode %lu\n",
iblock, max_blocks, inode->i_ino);
/* check in cache */
@@ -2849,7 +2879,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
newblock = iblock - ee_block + ee_start;
/* number of remaining blocks in the extent */
allocated = ee_len - (iblock - ee_block);
- ext_debug("%u fit into %lu:%d -> %llu\n", iblock,
+ ext_debug("%u fit into %u:%d -> %llu\n", iblock,
ee_block, ee_len, newblock);
/* Do not put uninitialized extent in the cache */
@@ -2950,7 +2980,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
newblock = ext4_mb_new_blocks(handle, &ar, &err);
if (!newblock)
goto out2;
- ext_debug("allocate new block: goal %llu, found %llu/%lu\n",
+ ext_debug("allocate new block: goal %llu, found %llu/%u\n",
ar.goal, newblock, allocated);
/* try to insert new extent into found leaf and return */
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 3f1873fef1c..5ca3eca70a1 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -58,10 +58,7 @@ static ssize_t
ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos)
{
- struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_path.dentry->d_inode;
- ssize_t ret;
- int err;
+ struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
/*
* If we have encountered a bitmap-format file, the size limit
@@ -81,53 +78,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
}
}
- ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
- /*
- * Skip flushing if there was an error, or if nothing was written.
- */
- if (ret <= 0)
- return ret;
-
- /*
- * If the inode is IS_SYNC, or is O_SYNC and we are doing data
- * journalling then we need to make sure that we force the transaction
- * to disk to keep all metadata uptodate synchronously.
- */
- if (file->f_flags & O_SYNC) {
- /*
- * If we are non-data-journaled, then the dirty data has
- * already been flushed to backing store by generic_osync_inode,
- * and the inode has been flushed too if there have been any
- * modifications other than mere timestamp updates.
- *
- * Open question --- do we care about flushing timestamps too
- * if the inode is IS_SYNC?
- */
- if (!ext4_should_journal_data(inode))
- return ret;
-
- goto force_commit;
- }
-
- /*
- * So we know that there has been no forced data flush. If the inode
- * is marked IS_SYNC, we need to force one ourselves.
- */
- if (!IS_SYNC(inode))
- return ret;
-
- /*
- * Open question #2 --- should we force data to disk here too? If we
- * don't, the only impact is that data=writeback filesystems won't
- * flush data to disk automatically on IS_SYNC, only metadata (but
- * historically, that is what ext2 has done.)
- */
-
-force_commit:
- err = ext4_force_commit(inode->i_sb);
- if (err)
- return err;
- return ret;
+ return generic_file_aio_write(iocb, iov, nr_segs, pos);
}
static struct vm_operations_struct ext4_file_vm_ops = {
@@ -207,7 +158,7 @@ const struct inode_operations ext4_file_inode_operations = {
.listxattr = ext4_listxattr,
.removexattr = generic_removexattr,
#endif
- .permission = ext4_permission,
+ .check_acl = ext4_check_acl,
.fallocate = ext4_fallocate,
.fiemap = ext4_fiemap,
};
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 83cf6415f59..07475740b51 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -50,7 +50,7 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
{
struct inode *inode = dentry->d_inode;
journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
- int ret = 0;
+ int err, ret = 0;
J_ASSERT(ext4_journal_current_handle() == NULL);
@@ -79,6 +79,9 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
goto out;
}
+ if (!journal)
+ ret = sync_mapping_buffers(inode->i_mapping);
+
if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
goto out;
@@ -91,10 +94,12 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
.sync_mode = WB_SYNC_ALL,
.nr_to_write = 0, /* sys_fsync did this */
};
- ret = sync_inode(inode, &wbc);
- if (journal && (journal->j_flags & JBD2_BARRIER))
- blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
+ err = sync_inode(inode, &wbc);
+ if (ret == 0)
+ ret = err;
}
out:
+ if (journal && (journal->j_flags & JBD2_BARRIER))
+ blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
return ret;
}
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 29e6dc7299b..f3624ead4f6 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1189,7 +1189,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8);
printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n",
- i, ext4_free_inodes_count(sb, gdp), x);
+ (unsigned long) i, ext4_free_inodes_count(sb, gdp), x);
bitmap_count += x;
}
brelse(bitmap_bh);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index f9c642b22ef..4abd683b963 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -192,11 +192,24 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
* so before we call here everything must be consistently dirtied against
* this transaction.
*/
-static int ext4_journal_test_restart(handle_t *handle, struct inode *inode)
+ int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
+ int nblocks)
{
+ int ret;
+
+ /*
+ * Drop i_data_sem to avoid deadlock with ext4_get_blocks At this
+ * moment, get_block can be called only for blocks inside i_size since
+ * page cache has been already dropped and writes are blocked by
+ * i_mutex. So we can safely drop the i_data_sem here.
+ */
BUG_ON(EXT4_JOURNAL(inode) == NULL);
jbd_debug(2, "restarting handle %p\n", handle);
- return ext4_journal_restart(handle, blocks_for_truncate(inode));
+ up_write(&EXT4_I(inode)->i_data_sem);
+ ret = ext4_journal_restart(handle, blocks_for_truncate(inode));
+ down_write(&EXT4_I(inode)->i_data_sem);
+
+ return ret;
}
/*
@@ -341,9 +354,7 @@ static int ext4_block_to_path(struct inode *inode,
int n = 0;
int final = 0;
- if (i_block < 0) {
- ext4_warning(inode->i_sb, "ext4_block_to_path", "block < 0");
- } else if (i_block < direct_blocks) {
+ if (i_block < direct_blocks) {
offsets[n++] = i_block;
final = direct_blocks;
} else if ((i_block -= direct_blocks) < indirect_blocks) {
@@ -551,15 +562,21 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
*
* Normally this function find the preferred place for block allocation,
* returns it.
+ * Because this is only used for non-extent files, we limit the block nr
+ * to 32 bits.
*/
static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
Indirect *partial)
{
+ ext4_fsblk_t goal;
+
/*
* XXX need to get goal block from mballoc's data structures
*/
- return ext4_find_near(inode, partial);
+ goal = ext4_find_near(inode, partial);
+ goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
+ return goal;
}
/**
@@ -640,6 +657,8 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
if (*err)
goto failed_out;
+ BUG_ON(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS);
+
target -= count;
/* allocate blocks for indirect blocks */
while (index < indirect_blks && count) {
@@ -674,6 +693,7 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
ar.flags = EXT4_MB_HINT_DATA;
current_block = ext4_mb_new_blocks(handle, &ar, err);
+ BUG_ON(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS);
if (*err && (target == blks)) {
/*
@@ -762,8 +782,9 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
BUFFER_TRACE(bh, "call get_create_access");
err = ext4_journal_get_create_access(handle, bh);
if (err) {
+ /* Don't brelse(bh) here; it's done in
+ * ext4_journal_forget() below */
unlock_buffer(bh);
- brelse(bh);
goto failed;
}
@@ -1109,16 +1130,15 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
ext4_discard_preallocations(inode);
}
-static int check_block_validity(struct inode *inode, sector_t logical,
- sector_t phys, int len)
+static int check_block_validity(struct inode *inode, const char *msg,
+ sector_t logical, sector_t phys, int len)
{
if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) {
- ext4_error(inode->i_sb, "check_block_validity",
+ ext4_error(inode->i_sb, msg,
"inode #%lu logical block %llu mapped to %llu "
"(size %d)", inode->i_ino,
(unsigned long long) logical,
(unsigned long long) phys, len);
- WARN_ON(1);
return -EIO;
}
return 0;
@@ -1170,8 +1190,8 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
up_read((&EXT4_I(inode)->i_data_sem));
if (retval > 0 && buffer_mapped(bh)) {
- int ret = check_block_validity(inode, block,
- bh->b_blocknr, retval);
+ int ret = check_block_validity(inode, "file system corruption",
+ block, bh->b_blocknr, retval);
if (ret != 0)
return ret;
}
@@ -1235,8 +1255,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
* i_data's format changing. Force the migrate
* to fail by clearing migrate flags
*/
- EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags &
- ~EXT4_EXT_MIGRATE;
+ EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE;
}
}
@@ -1252,8 +1271,9 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
up_write((&EXT4_I(inode)->i_data_sem));
if (retval > 0 && buffer_mapped(bh)) {
- int ret = check_block_validity(inode, block,
- bh->b_blocknr, retval);
+ int ret = check_block_validity(inode, "file system "
+ "corruption after allocation",
+ block, bh->b_blocknr, retval);
if (ret != 0)
return ret;
}
@@ -1863,18 +1883,6 @@ static void ext4_da_page_release_reservation(struct page *page,
* Delayed allocation stuff
*/
-struct mpage_da_data {
- struct inode *inode;
- sector_t b_blocknr; /* start block number of extent */
- size_t b_size; /* size of extent */
- unsigned long b_state; /* state of the extent */
- unsigned long first_page, next_page; /* extent of pages */
- struct writeback_control *wbc;
- int io_done;
- int pages_written;
- int retval;
-};
-
/*
* mpage_da_submit_io - walks through extent of pages and try to write
* them with writepage() call back
@@ -2737,6 +2745,7 @@ static int ext4_da_writepages(struct address_space *mapping,
long pages_skipped;
int range_cyclic, cycled = 1, io_done = 0;
int needed_blocks, ret = 0, nr_to_writebump = 0;
+ loff_t range_start = wbc->range_start;
struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
trace_ext4_da_writepages(inode, wbc);
@@ -2850,6 +2859,7 @@ retry:
mpd.io_done = 1;
ret = MPAGE_DA_EXTENT_TAIL;
}
+ trace_ext4_da_write_pages(inode, &mpd);
wbc->nr_to_write -= mpd.pages_written;
ext4_journal_stop(handle);
@@ -2905,6 +2915,7 @@ out_writepages:
if (!no_nrwrite_index_update)
wbc->no_nrwrite_index_update = 0;
wbc->nr_to_write -= nr_to_writebump;
+ wbc->range_start = range_start;
trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
return ret;
}
@@ -3117,6 +3128,8 @@ out:
*/
int ext4_alloc_da_blocks(struct inode *inode)
{
+ trace_ext4_alloc_da_blocks(inode);
+
if (!EXT4_I(inode)->i_reserved_data_blocks &&
!EXT4_I(inode)->i_reserved_meta_blocks)
return 0;
@@ -3659,7 +3672,8 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
ext4_handle_dirty_metadata(handle, inode, bh);
}
ext4_mark_inode_dirty(handle, inode);
- ext4_journal_test_restart(handle, inode);
+ ext4_truncate_restart_trans(handle, inode,
+ blocks_for_truncate(inode));
if (bh) {
BUFFER_TRACE(bh, "retaking write access");
ext4_journal_get_write_access(handle, bh);
@@ -3870,7 +3884,8 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
return;
if (try_to_extend_transaction(handle, inode)) {
ext4_mark_inode_dirty(handle, inode);
- ext4_journal_test_restart(handle, inode);
+ ext4_truncate_restart_trans(handle, inode,
+ blocks_for_truncate(inode));
}
ext4_free_blocks(handle, inode, nr, 1, 1);
@@ -3958,8 +3973,7 @@ void ext4_truncate(struct inode *inode)
if (!ext4_can_truncate(inode))
return;
- if (ei->i_disksize && inode->i_size == 0 &&
- !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
+ if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE;
if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
@@ -4533,7 +4547,8 @@ static int ext4_inode_blocks_set(handle_t *handle,
*/
static int ext4_do_update_inode(handle_t *handle,
struct inode *inode,
- struct ext4_iloc *iloc)
+ struct ext4_iloc *iloc,
+ int do_sync)
{
struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
struct ext4_inode_info *ei = EXT4_I(inode);
@@ -4581,8 +4596,7 @@ static int ext4_do_update_inode(handle_t *handle,
if (ext4_inode_blocks_set(handle, raw_inode, ei))
goto out_brelse;
raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
- /* clear the migrate flag in the raw_inode */
- raw_inode->i_flags = cpu_to_le32(ei->i_flags & ~EXT4_EXT_MIGRATE);
+ raw_inode->i_flags = cpu_to_le32(ei->i_flags);
if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
cpu_to_le32(EXT4_OS_HURD))
raw_inode->i_file_acl_high =
@@ -4635,10 +4649,22 @@ static int ext4_do_update_inode(handle_t *handle,
raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
}
- BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
- rc = ext4_handle_dirty_metadata(handle, inode, bh);
- if (!err)
- err = rc;
+ /*
+ * If we're not using a journal and we were called from
+ * ext4_write_inode() to sync the inode (making do_sync true),
+ * we can just use sync_dirty_buffer() directly to do our dirty
+ * work. Testing s_journal here is a bit redundant but it's
+ * worth it to avoid potential future trouble.
+ */
+ if (EXT4_SB(inode->i_sb)->s_journal == NULL && do_sync) {
+ BUFFER_TRACE(bh, "call sync_dirty_buffer");
+ sync_dirty_buffer(bh);
+ } else {
+ BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+ rc = ext4_handle_dirty_metadata(handle, inode, bh);
+ if (!err)
+ err = rc;
+ }
ei->i_state &= ~EXT4_STATE_NEW;
out_brelse:
@@ -4684,19 +4710,32 @@ out_brelse:
*/
int ext4_write_inode(struct inode *inode, int wait)
{
+ int err;
+
if (current->flags & PF_MEMALLOC)
return 0;
- if (ext4_journal_current_handle()) {
- jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
- dump_stack();
- return -EIO;
- }
+ if (EXT4_SB(inode->i_sb)->s_journal) {
+ if (ext4_journal_current_handle()) {
+ jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
+ dump_stack();
+ return -EIO;
+ }
- if (!wait)
- return 0;
+ if (!wait)
+ return 0;
+
+ err = ext4_force_commit(inode->i_sb);
+ } else {
+ struct ext4_iloc iloc;
- return ext4_force_commit(inode->i_sb);
+ err = ext4_get_inode_loc(inode, &iloc);
+ if (err)
+ return err;
+ err = ext4_do_update_inode(EXT4_NOJOURNAL_HANDLE,
+ inode, &iloc, wait);
+ }
+ return err;
}
/*
@@ -4990,7 +5029,7 @@ int ext4_mark_iloc_dirty(handle_t *handle,
get_bh(iloc->bh);
/* ext4_do_update_inode() does jbd2_journal_dirty_metadata */
- err = ext4_do_update_inode(handle, inode, iloc);
+ err = ext4_do_update_inode(handle, inode, iloc, 0);
put_bh(iloc->bh);
return err;
}
@@ -5281,12 +5320,21 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
else
len = PAGE_CACHE_SIZE;
+ lock_page(page);
+ /*
+ * return if we have all the buffers mapped. This avoid
+ * the need to call write_begin/write_end which does a
+ * journal_start/journal_stop which can block and take
+ * long time
+ */
if (page_has_buffers(page)) {
- /* return if we have all the buffers mapped */
if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
- ext4_bh_unmapped))
+ ext4_bh_unmapped)) {
+ unlock_page(page);
goto out_unlock;
+ }
}
+ unlock_page(page);
/*
* OK, we need to fill the hole... Do write_begin write_end
* to do block allocation/reservation.We are not holding
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 7050a9cd04a..c1cdf613e72 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -243,10 +243,9 @@ setversion_out:
me.donor_start, me.len, &me.moved_len);
fput(donor_filp);
- if (!err)
- if (copy_to_user((struct move_extent *)arg,
- &me, sizeof(me)))
- return -EFAULT;
+ if (copy_to_user((struct move_extent *)arg, &me, sizeof(me)))
+ return -EFAULT;
+
return err;
}
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index cd258463e2a..e9c61896d60 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -22,6 +22,7 @@
*/
#include "mballoc.h"
+#include <linux/debugfs.h>
#include <trace/events/ext4.h>
/*
@@ -622,13 +623,13 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
/* FIXME!! need more doc */
static void ext4_mb_mark_free_simple(struct super_block *sb,
- void *buddy, unsigned first, int len,
+ void *buddy, ext4_grpblk_t first, ext4_grpblk_t len,
struct ext4_group_info *grp)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
- unsigned short min;
- unsigned short max;
- unsigned short chunk;
+ ext4_grpblk_t min;
+ ext4_grpblk_t max;
+ ext4_grpblk_t chunk;
unsigned short border;
BUG_ON(len > EXT4_BLOCKS_PER_GROUP(sb));
@@ -662,10 +663,10 @@ void ext4_mb_generate_buddy(struct super_block *sb,
void *buddy, void *bitmap, ext4_group_t group)
{
struct ext4_group_info *grp = ext4_get_group_info(sb, group);
- unsigned short max = EXT4_BLOCKS_PER_GROUP(sb);
- unsigned short i = 0;
- unsigned short first;
- unsigned short len;
+ ext4_grpblk_t max = EXT4_BLOCKS_PER_GROUP(sb);
+ ext4_grpblk_t i = 0;
+ ext4_grpblk_t first;
+ ext4_grpblk_t len;
unsigned free = 0;
unsigned fragments = 0;
unsigned long long period = get_cycles();
@@ -743,7 +744,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
char *data;
char *bitmap;
- mb_debug("init page %lu\n", page->index);
+ mb_debug(1, "init page %lu\n", page->index);
inode = page->mapping->host;
sb = inode->i_sb;
@@ -822,7 +823,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
set_bitmap_uptodate(bh[i]);
bh[i]->b_end_io = end_buffer_read_sync;
submit_bh(READ, bh[i]);
- mb_debug("read bitmap for group %u\n", first_group + i);
+ mb_debug(1, "read bitmap for group %u\n", first_group + i);
}
/* wait for I/O completion */
@@ -862,12 +863,13 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
if ((first_block + i) & 1) {
/* this is block of buddy */
BUG_ON(incore == NULL);
- mb_debug("put buddy for group %u in page %lu/%x\n",
+ mb_debug(1, "put buddy for group %u in page %lu/%x\n",
group, page->index, i * blocksize);
grinfo = ext4_get_group_info(sb, group);
grinfo->bb_fragments = 0;
memset(grinfo->bb_counters, 0,
- sizeof(unsigned short)*(sb->s_blocksize_bits+2));
+ sizeof(*grinfo->bb_counters) *
+ (sb->s_blocksize_bits+2));
/*
* incore got set to the group block bitmap below
*/
@@ -878,7 +880,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
} else {
/* this is block of bitmap */
BUG_ON(incore != NULL);
- mb_debug("put bitmap for group %u in page %lu/%x\n",
+ mb_debug(1, "put bitmap for group %u in page %lu/%x\n",
group, page->index, i * blocksize);
/* see comments in ext4_mb_put_pa() */
@@ -908,6 +910,100 @@ out:
return err;
}
+static noinline_for_stack
+int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
+{
+
+ int ret = 0;
+ void *bitmap;
+ int blocks_per_page;
+ int block, pnum, poff;
+ int num_grp_locked = 0;
+ struct ext4_group_info *this_grp;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct inode *inode = sbi->s_buddy_cache;
+ struct page *page = NULL, *bitmap_page = NULL;
+
+ mb_debug(1, "init group %u\n", group);
+ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+ this_grp = ext4_get_group_info(sb, group);
+ /*
+ * This ensures that we don't reinit the buddy cache
+ * page which map to the group from which we are already
+ * allocating. If we are looking at the buddy cache we would
+ * have taken a reference using ext4_mb_load_buddy and that
+ * would have taken the alloc_sem lock.
+ */
+ num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group);
+ if (!EXT4_MB_GRP_NEED_INIT(this_grp)) {
+ /*
+ * somebody initialized the group
+ * return without doing anything
+ */
+ ret = 0;
+ goto err;
+ }
+ /*
+ * the buddy cache inode stores the block bitmap
+ * and buddy information in consecutive blocks.
+ * So for each group we need two blocks.
+ */
+ block = group * 2;
+ pnum = block / blocks_per_page;
+ poff = block % blocks_per_page;
+ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+ if (page) {
+ BUG_ON(page->mapping != inode->i_mapping);
+ ret = ext4_mb_init_cache(page, NULL);
+ if (ret) {
+ unlock_page(page);
+ goto err;
+ }
+ unlock_page(page);
+ }
+ if (page == NULL || !PageUptodate(page)) {
+ ret = -EIO;
+ goto err;
+ }
+ mark_page_accessed(page);
+ bitmap_page = page;
+ bitmap = page_address(page) + (poff * sb->s_blocksize);
+
+ /* init buddy cache */
+ block++;
+ pnum = block / blocks_per_page;
+ poff = block % blocks_per_page;
+ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+ if (page == bitmap_page) {
+ /*
+ * If both the bitmap and buddy are in
+ * the same page we don't need to force
+ * init the buddy
+ */
+ unlock_page(page);
+ } else if (page) {
+ BUG_ON(page->mapping != inode->i_mapping);
+ ret = ext4_mb_init_cache(page, bitmap);
+ if (ret) {
+ unlock_page(page);
+ goto err;
+ }
+ unlock_page(page);
+ }
+ if (page == NULL || !PageUptodate(page)) {
+ ret = -EIO;
+ goto err;
+ }
+ mark_page_accessed(page);
+err:
+ ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked);
+ if (bitmap_page)
+ page_cache_release(bitmap_page);
+ if (page)
+ page_cache_release(page);
+ return ret;
+}
+
static noinline_for_stack int
ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
struct ext4_buddy *e4b)
@@ -922,7 +1018,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct inode *inode = sbi->s_buddy_cache;
- mb_debug("load group %u\n", group);
+ mb_debug(1, "load group %u\n", group);
blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
grp = ext4_get_group_info(sb, group);
@@ -941,8 +1037,26 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
* groups mapped by the page is blocked
* till we are done with allocation
*/
+repeat_load_buddy:
down_read(e4b->alloc_semp);
+ if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
+ /* we need to check for group need init flag
+ * with alloc_semp held so that we can be sure
+ * that new blocks didn't get added to the group
+ * when we are loading the buddy cache
+ */
+ up_read(e4b->alloc_semp);
+ /*
+ * we need full data about the group
+ * to make a good selection
+ */
+ ret = ext4_mb_init_group(sb, group);
+ if (ret)
+ return ret;
+ goto repeat_load_buddy;
+ }
+
/*
* the buddy cache inode stores the block bitmap
* and buddy information in consecutive blocks.
@@ -1360,7 +1474,7 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
ac->alloc_semp = e4b->alloc_semp;
e4b->alloc_semp = NULL;
/* store last allocated for subsequent stream allocation */
- if ((ac->ac_flags & EXT4_MB_HINT_DATA)) {
+ if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
spin_lock(&sbi->s_md_lock);
sbi->s_mb_last_group = ac->ac_f_ex.fe_group;
sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
@@ -1837,97 +1951,6 @@ void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
}
-static noinline_for_stack
-int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
-{
-
- int ret;
- void *bitmap;
- int blocks_per_page;
- int block, pnum, poff;
- int num_grp_locked = 0;
- struct ext4_group_info *this_grp;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct inode *inode = sbi->s_buddy_cache;
- struct page *page = NULL, *bitmap_page = NULL;
-
- mb_debug("init group %lu\n", group);
- blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
- this_grp = ext4_get_group_info(sb, group);
- /*
- * This ensures we don't add group
- * to this buddy cache via resize
- */
- num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group);
- if (!EXT4_MB_GRP_NEED_INIT(this_grp)) {
- /*
- * somebody initialized the group
- * return without doing anything
- */
- ret = 0;
- goto err;
- }
- /*
- * the buddy cache inode stores the block bitmap
- * and buddy information in consecutive blocks.
- * So for each group we need two blocks.
- */
- block = group * 2;
- pnum = block / blocks_per_page;
- poff = block % blocks_per_page;
- page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
- if (page) {
- BUG_ON(page->mapping != inode->i_mapping);
- ret = ext4_mb_init_cache(page, NULL);
- if (ret) {
- unlock_page(page);
- goto err;
- }
- unlock_page(page);
- }
- if (page == NULL || !PageUptodate(page)) {
- ret = -EIO;
- goto err;
- }
- mark_page_accessed(page);
- bitmap_page = page;
- bitmap = page_address(page) + (poff * sb->s_blocksize);
-
- /* init buddy cache */
- block++;
- pnum = block / blocks_per_page;
- poff = block % blocks_per_page;
- page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
- if (page == bitmap_page) {
- /*
- * If both the bitmap and buddy are in
- * the same page we don't need to force
- * init the buddy
- */
- unlock_page(page);
- } else if (page) {
- BUG_ON(page->mapping != inode->i_mapping);
- ret = ext4_mb_init_cache(page, bitmap);
- if (ret) {
- unlock_page(page);
- goto err;
- }
- unlock_page(page);
- }
- if (page == NULL || !PageUptodate(page)) {
- ret = -EIO;
- goto err;
- }
- mark_page_accessed(page);
-err:
- ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked);
- if (bitmap_page)
- page_cache_release(bitmap_page);
- if (page)
- page_cache_release(page);
- return ret;
-}
-
static noinline_for_stack int
ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
{
@@ -1938,11 +1961,14 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
struct ext4_sb_info *sbi;
struct super_block *sb;
struct ext4_buddy e4b;
- loff_t size, isize;
sb = ac->ac_sb;
sbi = EXT4_SB(sb);
ngroups = ext4_get_groups_count(sb);
+ /* non-extent files are limited to low blocks/groups */
+ if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL))
+ ngroups = sbi->s_blockfile_groups;
+
BUG_ON(ac->ac_status == AC_STATUS_FOUND);
/* first, try the goal */
@@ -1974,20 +2000,16 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
}
bsbits = ac->ac_sb->s_blocksize_bits;
- /* if stream allocation is enabled, use global goal */
- size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
- isize = i_size_read(ac->ac_inode) >> bsbits;
- if (size < isize)
- size = isize;
- if (size < sbi->s_mb_stream_request &&
- (ac->ac_flags & EXT4_MB_HINT_DATA)) {
+ /* if stream allocation is enabled, use global goal */
+ if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
/* TBD: may be hot point */
spin_lock(&sbi->s_md_lock);
ac->ac_g_ex.fe_group = sbi->s_mb_last_group;
ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
spin_unlock(&sbi->s_md_lock);
}
+
/* Let's just scan groups to find more-less suitable blocks */
cr = ac->ac_2order ? 0 : 1;
/*
@@ -2015,27 +2037,6 @@ repeat:
if (grp->bb_free == 0)
continue;
- /*
- * if the group is already init we check whether it is
- * a good group and if not we don't load the buddy
- */
- if (EXT4_MB_GRP_NEED_INIT(grp)) {
- /*
- * we need full data about the group
- * to make a good selection
- */
- err = ext4_mb_init_group(sb, group);
- if (err)
- goto out;
- }
-
- /*
- * If the particular group doesn't satisfy our
- * criteria we continue with the next group
- */
- if (!ext4_mb_good_group(ac, group, cr))
- continue;
-
err = ext4_mb_load_buddy(sb, group, &e4b);
if (err)
goto out;
@@ -2156,7 +2157,7 @@ static int ext4_mb_seq_history_show(struct seq_file *seq, void *v)
if (v == SEQ_START_TOKEN) {
seq_printf(seq, "%-5s %-8s %-23s %-23s %-23s %-5s "
- "%-5s %-2s %-5s %-5s %-5s %-6s\n",
+ "%-5s %-2s %-6s %-5s %-5s %-6s\n",
"pid", "inode", "original", "goal", "result", "found",
"grps", "cr", "flags", "merge", "tail", "broken");
return 0;
@@ -2164,7 +2165,7 @@ static int ext4_mb_seq_history_show(struct seq_file *seq, void *v)
if (hs->op == EXT4_MB_HISTORY_ALLOC) {
fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u "
- "%-5u %-5s %-5u %-6u\n";
+ "0x%04x %-5s %-5u %-6u\n";
sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group,
hs->result.fe_start, hs->result.fe_len,
hs->result.fe_logical);
@@ -2205,7 +2206,7 @@ static void ext4_mb_seq_history_stop(struct seq_file *seq, void *v)
{
}
-static struct seq_operations ext4_mb_seq_history_ops = {
+static const struct seq_operations ext4_mb_seq_history_ops = {
.start = ext4_mb_seq_history_start,
.next = ext4_mb_seq_history_next,
.stop = ext4_mb_seq_history_stop,
@@ -2287,7 +2288,7 @@ static ssize_t ext4_mb_seq_history_write(struct file *file,
return count;
}
-static struct file_operations ext4_mb_seq_history_fops = {
+static const struct file_operations ext4_mb_seq_history_fops = {
.owner = THIS_MODULE,
.open = ext4_mb_seq_history_open,
.read = seq_read,
@@ -2328,7 +2329,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
struct ext4_buddy e4b;
struct sg {
struct ext4_group_info info;
- unsigned short counters[16];
+ ext4_grpblk_t counters[16];
} sg;
group--;
@@ -2366,7 +2367,7 @@ static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v)
{
}
-static struct seq_operations ext4_mb_seq_groups_ops = {
+static const struct seq_operations ext4_mb_seq_groups_ops = {
.start = ext4_mb_seq_groups_start,
.next = ext4_mb_seq_groups_next,
.stop = ext4_mb_seq_groups_stop,
@@ -2387,7 +2388,7 @@ static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file)
}
-static struct file_operations ext4_mb_seq_groups_fops = {
+static const struct file_operations ext4_mb_seq_groups_fops = {
.owner = THIS_MODULE,
.open = ext4_mb_seq_groups_open,
.read = seq_read,
@@ -2532,7 +2533,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
init_rwsem(&meta_group_info[i]->alloc_sem);
- meta_group_info[i]->bb_free_root.rb_node = NULL;;
+ meta_group_info[i]->bb_free_root.rb_node = NULL;
#ifdef DOUBLE_CHECK
{
@@ -2558,26 +2559,15 @@ exit_meta_group_info:
return -ENOMEM;
} /* ext4_mb_add_groupinfo */
-/*
- * Update an existing group.
- * This function is used for online resize
- */
-void ext4_mb_update_group_info(struct ext4_group_info *grp, ext4_grpblk_t add)
-{
- grp->bb_free += add;
-}
-
static int ext4_mb_init_backend(struct super_block *sb)
{
ext4_group_t ngroups = ext4_get_groups_count(sb);
ext4_group_t i;
- int metalen;
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_super_block *es = sbi->s_es;
int num_meta_group_infos;
int num_meta_group_infos_max;
int array_size;
- struct ext4_group_info **meta_group_info;
struct ext4_group_desc *desc;
/* This is the number of blocks used by GDT */
@@ -2622,22 +2612,6 @@ static int ext4_mb_init_backend(struct super_block *sb)
goto err_freesgi;
}
EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
-
- metalen = sizeof(*meta_group_info) << EXT4_DESC_PER_BLOCK_BITS(sb);
- for (i = 0; i < num_meta_group_infos; i++) {
- if ((i + 1) == num_meta_group_infos)
- metalen = sizeof(*meta_group_info) *
- (ngroups -
- (i << EXT4_DESC_PER_BLOCK_BITS(sb)));
- meta_group_info = kmalloc(metalen, GFP_KERNEL);
- if (meta_group_info == NULL) {
- printk(KERN_ERR "EXT4-fs: can't allocate mem for a "
- "buddy group\n");
- goto err_freemeta;
- }
- sbi->s_group_info[i] = meta_group_info;
- }
-
for (i = 0; i < ngroups; i++) {
desc = ext4_get_group_desc(sb, i, NULL);
if (desc == NULL) {
@@ -2655,7 +2629,6 @@ err_freebuddy:
while (i-- > 0)
kfree(ext4_get_group_info(sb, i));
i = num_meta_group_infos;
-err_freemeta:
while (i-- > 0)
kfree(sbi->s_group_info[i]);
iput(sbi->s_buddy_cache);
@@ -2672,14 +2645,14 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
unsigned max;
int ret;
- i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short);
+ i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets);
sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
if (sbi->s_mb_offsets == NULL) {
return -ENOMEM;
}
- i = (sb->s_blocksize_bits + 2) * sizeof(unsigned int);
+ i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs);
sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
if (sbi->s_mb_maxs == NULL) {
kfree(sbi->s_mb_offsets);
@@ -2758,7 +2731,7 @@ static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
kmem_cache_free(ext4_pspace_cachep, pa);
}
if (count)
- mb_debug("mballoc: %u PAs left\n", count);
+ mb_debug(1, "mballoc: %u PAs left\n", count);
}
@@ -2839,7 +2812,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
list_for_each_safe(l, ltmp, &txn->t_private_list) {
entry = list_entry(l, struct ext4_free_data, list);
- mb_debug("gonna free %u blocks in group %u (0x%p):",
+ mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
entry->count, entry->group, entry);
err = ext4_mb_load_buddy(sb, entry->group, &e4b);
@@ -2874,9 +2847,43 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
ext4_mb_release_desc(&e4b);
}
- mb_debug("freed %u blocks in %u structures\n", count, count2);
+ mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
+}
+
+#ifdef CONFIG_EXT4_DEBUG
+u8 mb_enable_debug __read_mostly;
+
+static struct dentry *debugfs_dir;
+static struct dentry *debugfs_debug;
+
+static void __init ext4_create_debugfs_entry(void)
+{
+ debugfs_dir = debugfs_create_dir("ext4", NULL);
+ if (debugfs_dir)
+ debugfs_debug = debugfs_create_u8("mballoc-debug",
+ S_IRUGO | S_IWUSR,
+ debugfs_dir,
+ &mb_enable_debug);
+}
+
+static void ext4_remove_debugfs_entry(void)
+{
+ debugfs_remove(debugfs_debug);
+ debugfs_remove(debugfs_dir);
}
+#else
+
+static void __init ext4_create_debugfs_entry(void)
+{
+}
+
+static void ext4_remove_debugfs_entry(void)
+{
+}
+
+#endif
+
int __init init_ext4_mballoc(void)
{
ext4_pspace_cachep =
@@ -2904,6 +2911,7 @@ int __init init_ext4_mballoc(void)
kmem_cache_destroy(ext4_ac_cachep);
return -ENOMEM;
}
+ ext4_create_debugfs_entry();
return 0;
}
@@ -2917,6 +2925,7 @@ void exit_ext4_mballoc(void)
kmem_cache_destroy(ext4_pspace_cachep);
kmem_cache_destroy(ext4_ac_cachep);
kmem_cache_destroy(ext4_free_ext_cachep);
+ ext4_remove_debugfs_entry();
}
@@ -3061,7 +3070,7 @@ static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac)
ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_stripe;
else
ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
- mb_debug("#%u: goal %u blocks for locality group\n",
+ mb_debug(1, "#%u: goal %u blocks for locality group\n",
current->pid, ac->ac_g_ex.fe_len);
}
@@ -3180,23 +3189,18 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end ||
ac->ac_o_ex.fe_logical < pa->pa_lstart));
- /* skip PA normalized request doesn't overlap with */
- if (pa->pa_lstart >= end) {
- spin_unlock(&pa->pa_lock);
- continue;
- }
- if (pa_end <= start) {
+ /* skip PAs this normalized request doesn't overlap with */
+ if (pa->pa_lstart >= end || pa_end <= start) {
spin_unlock(&pa->pa_lock);
continue;
}
BUG_ON(pa->pa_lstart <= start && pa_end >= end);
+ /* adjust start or end to be adjacent to this pa */
if (pa_end <= ac->ac_o_ex.fe_logical) {
BUG_ON(pa_end < start);
start = pa_end;
- }
-
- if (pa->pa_lstart > ac->ac_o_ex.fe_logical) {
+ } else if (pa->pa_lstart > ac->ac_o_ex.fe_logical) {
BUG_ON(pa->pa_lstart > end);
end = pa->pa_lstart;
}
@@ -3251,7 +3255,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
}
- mb_debug("goal: %u(was %u) blocks at %u\n", (unsigned) size,
+ mb_debug(1, "goal: %u(was %u) blocks at %u\n", (unsigned) size,
(unsigned) orig_size, (unsigned) start);
}
@@ -3300,7 +3304,7 @@ static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
BUG_ON(pa->pa_free < len);
pa->pa_free -= len;
- mb_debug("use %llu/%u from inode pa %p\n", start, len, pa);
+ mb_debug(1, "use %llu/%u from inode pa %p\n", start, len, pa);
}
/*
@@ -3324,7 +3328,7 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
* in on-disk bitmap -- see ext4_mb_release_context()
* Other CPUs are prevented from allocating from this pa by lg_mutex
*/
- mb_debug("use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa);
+ mb_debug(1, "use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa);
}
/*
@@ -3382,6 +3386,11 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
ac->ac_o_ex.fe_logical >= pa->pa_lstart + pa->pa_len)
continue;
+ /* non-extent files can't have physical blocks past 2^32 */
+ if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL) &&
+ pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS)
+ continue;
+
/* found preallocated blocks, use them */
spin_lock(&pa->pa_lock);
if (pa->pa_deleted == 0 && pa->pa_free) {
@@ -3503,7 +3512,7 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
preallocated += len;
count++;
}
- mb_debug("prellocated %u for group %u\n", preallocated, group);
+ mb_debug(1, "prellocated %u for group %u\n", preallocated, group);
}
static void ext4_mb_pa_callback(struct rcu_head *head)
@@ -3638,7 +3647,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
pa->pa_deleted = 0;
pa->pa_type = MB_INODE_PA;
- mb_debug("new inode pa %p: %llu/%u for %u\n", pa,
+ mb_debug(1, "new inode pa %p: %llu/%u for %u\n", pa,
pa->pa_pstart, pa->pa_len, pa->pa_lstart);
trace_ext4_mb_new_inode_pa(ac, pa);
@@ -3698,7 +3707,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
pa->pa_deleted = 0;
pa->pa_type = MB_GROUP_PA;
- mb_debug("new group pa %p: %llu/%u for %u\n", pa,
+ mb_debug(1, "new group pa %p: %llu/%u for %u\n", pa,
pa->pa_pstart, pa->pa_len, pa->pa_lstart);
trace_ext4_mb_new_group_pa(ac, pa);
@@ -3777,7 +3786,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit +
le32_to_cpu(sbi->s_es->s_first_data_block);
- mb_debug(" free preallocated %u/%u in group %u\n",
+ mb_debug(1, " free preallocated %u/%u in group %u\n",
(unsigned) start, (unsigned) next - bit,
(unsigned) group);
free += next - bit;
@@ -3868,7 +3877,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
int busy = 0;
int free = 0;
- mb_debug("discard preallocation for group %u\n", group);
+ mb_debug(1, "discard preallocation for group %u\n", group);
if (list_empty(&grp->bb_prealloc_list))
return 0;
@@ -3992,7 +4001,7 @@ void ext4_discard_preallocations(struct inode *inode)
return;
}
- mb_debug("discard preallocation for inode %lu\n", inode->i_ino);
+ mb_debug(1, "discard preallocation for inode %lu\n", inode->i_ino);
trace_ext4_discard_preallocations(inode);
INIT_LIST_HEAD(&list);
@@ -4097,7 +4106,7 @@ static void ext4_mb_return_to_preallocation(struct inode *inode,
{
BUG_ON(!list_empty(&EXT4_I(inode)->i_prealloc_list));
}
-#ifdef MB_DEBUG
+#ifdef CONFIG_EXT4_DEBUG
static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
{
struct super_block *sb = ac->ac_sb;
@@ -4139,14 +4148,14 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
ext4_get_group_no_and_offset(sb, pa->pa_pstart,
NULL, &start);
spin_unlock(&pa->pa_lock);
- printk(KERN_ERR "PA:%lu:%d:%u \n", i,
- start, pa->pa_len);
+ printk(KERN_ERR "PA:%u:%d:%u \n", i,
+ start, pa->pa_len);
}
ext4_unlock_group(sb, i);
if (grp->bb_free == 0)
continue;
- printk(KERN_ERR "%lu: %d/%d \n",
+ printk(KERN_ERR "%u: %d/%d \n",
i, grp->bb_free, grp->bb_fragments);
}
printk(KERN_ERR "\n");
@@ -4174,16 +4183,26 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
return;
+ if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
+ return;
+
size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
- isize = i_size_read(ac->ac_inode) >> bsbits;
+ isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
+ >> bsbits;
size = max(size, isize);
- /* don't use group allocation for large files */
- if (size >= sbi->s_mb_stream_request)
+ if ((size == isize) &&
+ !ext4_fs_is_busy(sbi) &&
+ (atomic_read(&ac->ac_inode->i_writecount) == 0)) {
+ ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
return;
+ }
- if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
+ /* don't use group allocation for large files */
+ if (size >= sbi->s_mb_stream_request) {
+ ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
return;
+ }
BUG_ON(ac->ac_lg != NULL);
/*
@@ -4246,7 +4265,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
* locality group. this is a policy, actually */
ext4_mb_group_or_file(ac);
- mb_debug("init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, "
+ mb_debug(1, "init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, "
"left: %u/%u, right %u/%u to %swritable\n",
(unsigned) ar->len, (unsigned) ar->logical,
(unsigned) ar->goal, ac->ac_flags, ac->ac_2order,
@@ -4268,7 +4287,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
struct ext4_prealloc_space *pa, *tmp;
struct ext4_allocation_context *ac;
- mb_debug("discard locality group preallocation\n");
+ mb_debug(1, "discard locality group preallocation\n");
INIT_LIST_HEAD(&discard_list);
ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index c96bb19f58f..188d3d709b2 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -37,11 +37,19 @@
/*
*/
-#define MB_DEBUG__
-#ifdef MB_DEBUG
-#define mb_debug(fmt, a...) printk(fmt, ##a)
+#ifdef CONFIG_EXT4_DEBUG
+extern u8 mb_enable_debug;
+
+#define mb_debug(n, fmt, a...) \
+ do { \
+ if ((n) <= mb_enable_debug) { \
+ printk(KERN_DEBUG "(%s, %d): %s: ", \
+ __FILE__, __LINE__, __func__); \
+ printk(fmt, ## a); \
+ } \
+ } while (0)
#else
-#define mb_debug(fmt, a...)
+#define mb_debug(n, fmt, a...)
#endif
/*
@@ -128,8 +136,8 @@ struct ext4_prealloc_space {
unsigned pa_deleted;
ext4_fsblk_t pa_pstart; /* phys. block */
ext4_lblk_t pa_lstart; /* log. block */
- unsigned short pa_len; /* len of preallocated chunk */
- unsigned short pa_free; /* how many blocks are free */
+ ext4_grpblk_t pa_len; /* len of preallocated chunk */
+ ext4_grpblk_t pa_free; /* how many blocks are free */
unsigned short pa_type; /* pa type. inode or group */
spinlock_t *pa_obj_lock;
struct inode *pa_inode; /* hack, for history only */
@@ -144,7 +152,7 @@ struct ext4_free_extent {
ext4_lblk_t fe_logical;
ext4_grpblk_t fe_start;
ext4_group_t fe_group;
- int fe_len;
+ ext4_grpblk_t fe_len;
};
/*
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 313a50b3974..bf519f239ae 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -353,17 +353,16 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
down_write(&EXT4_I(inode)->i_data_sem);
/*
- * if EXT4_EXT_MIGRATE is cleared a block allocation
+ * if EXT4_STATE_EXT_MIGRATE is cleared a block allocation
* happened after we started the migrate. We need to
* fail the migrate
*/
- if (!(EXT4_I(inode)->i_flags & EXT4_EXT_MIGRATE)) {
+ if (!(EXT4_I(inode)->i_state & EXT4_STATE_EXT_MIGRATE)) {
retval = -EAGAIN;
up_write(&EXT4_I(inode)->i_data_sem);
goto err_out;
} else
- EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags &
- ~EXT4_EXT_MIGRATE;
+ EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE;
/*
* We have the extent map build with the tmp inode.
* Now copy the i_data across
@@ -517,14 +516,15 @@ int ext4_ext_migrate(struct inode *inode)
* when we add extents we extent the journal
*/
/*
- * Even though we take i_mutex we can still cause block allocation
- * via mmap write to holes. If we have allocated new blocks we fail
- * migrate. New block allocation will clear EXT4_EXT_MIGRATE flag.
- * The flag is updated with i_data_sem held to prevent racing with
- * block allocation.
+ * Even though we take i_mutex we can still cause block
+ * allocation via mmap write to holes. If we have allocated
+ * new blocks we fail migrate. New block allocation will
+ * clear EXT4_STATE_EXT_MIGRATE flag. The flag is updated
+ * with i_data_sem held to prevent racing with block
+ * allocation.
*/
down_read((&EXT4_I(inode)->i_data_sem));
- EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags | EXT4_EXT_MIGRATE;
+ EXT4_I(inode)->i_state |= EXT4_STATE_EXT_MIGRATE;
up_read((&EXT4_I(inode)->i_data_sem));
handle = ext4_journal_start(inode, 1);
@@ -618,7 +618,7 @@ err_out:
tmp_inode->i_nlink = 0;
ext4_journal_stop(handle);
-
+ unlock_new_inode(tmp_inode);
iput(tmp_inode);
return retval;
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index bbf2dd9404d..c07a2915e40 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -19,14 +19,31 @@
#include "ext4_extents.h"
#include "ext4.h"
-#define get_ext_path(path, inode, block, ret) \
- do { \
- path = ext4_ext_find_extent(inode, block, path); \
- if (IS_ERR(path)) { \
- ret = PTR_ERR(path); \
- path = NULL; \
- } \
- } while (0)
+/**
+ * get_ext_path - Find an extent path for designated logical block number.
+ *
+ * @inode: an inode which is searched
+ * @lblock: logical block number to find an extent path
+ * @path: pointer to an extent path pointer (for output)
+ *
+ * ext4_ext_find_extent wrapper. Return 0 on success, or a negative error value
+ * on failure.
+ */
+static inline int
+get_ext_path(struct inode *inode, ext4_lblk_t lblock,
+ struct ext4_ext_path **path)
+{
+ int ret = 0;
+
+ *path = ext4_ext_find_extent(inode, lblock, *path);
+ if (IS_ERR(*path)) {
+ ret = PTR_ERR(*path);
+ *path = NULL;
+ } else if ((*path)[ext_depth(inode)].p_ext == NULL)
+ ret = -ENODATA;
+
+ return ret;
+}
/**
* copy_extent_status - Copy the extent's initialization status
@@ -113,6 +130,31 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
}
/**
+ * mext_check_null_inode - NULL check for two inodes
+ *
+ * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
+ */
+static int
+mext_check_null_inode(struct inode *inode1, struct inode *inode2,
+ const char *function)
+{
+ int ret = 0;
+
+ if (inode1 == NULL) {
+ ext4_error(inode2->i_sb, function,
+ "Both inodes should not be NULL: "
+ "inode1 NULL inode2 %lu", inode2->i_ino);
+ ret = -EIO;
+ } else if (inode2 == NULL) {
+ ext4_error(inode1->i_sb, function,
+ "Both inodes should not be NULL: "
+ "inode1 %lu inode2 NULL", inode1->i_ino);
+ ret = -EIO;
+ }
+ return ret;
+}
+
+/**
* mext_double_down_read - Acquire two inodes' read semaphore
*
* @orig_inode: original inode structure
@@ -124,8 +166,6 @@ mext_double_down_read(struct inode *orig_inode, struct inode *donor_inode)
{
struct inode *first = orig_inode, *second = donor_inode;
- BUG_ON(orig_inode == NULL || donor_inode == NULL);
-
/*
* Use the inode number to provide the stable locking order instead
* of its address, because the C language doesn't guarantee you can
@@ -152,8 +192,6 @@ mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode)
{
struct inode *first = orig_inode, *second = donor_inode;
- BUG_ON(orig_inode == NULL || donor_inode == NULL);
-
/*
* Use the inode number to provide the stable locking order instead
* of its address, because the C language doesn't guarantee you can
@@ -178,8 +216,6 @@ mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode)
static void
mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode)
{
- BUG_ON(orig_inode == NULL || donor_inode == NULL);
-
up_read(&EXT4_I(orig_inode)->i_data_sem);
up_read(&EXT4_I(donor_inode)->i_data_sem);
}
@@ -194,8 +230,6 @@ mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode)
static void
mext_double_up_write(struct inode *orig_inode, struct inode *donor_inode)
{
- BUG_ON(orig_inode == NULL || donor_inode == NULL);
-
up_write(&EXT4_I(orig_inode)->i_data_sem);
up_write(&EXT4_I(donor_inode)->i_data_sem);
}
@@ -283,8 +317,8 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
}
if (new_flag) {
- get_ext_path(orig_path, orig_inode, eblock, err);
- if (orig_path == NULL)
+ err = get_ext_path(orig_inode, eblock, &orig_path);
+ if (err)
goto out;
if (ext4_ext_insert_extent(handle, orig_inode,
@@ -293,9 +327,9 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
}
if (end_flag) {
- get_ext_path(orig_path, orig_inode,
- le32_to_cpu(end_ext->ee_block) - 1, err);
- if (orig_path == NULL)
+ err = get_ext_path(orig_inode,
+ le32_to_cpu(end_ext->ee_block) - 1, &orig_path);
+ if (err)
goto out;
if (ext4_ext_insert_extent(handle, orig_inode,
@@ -519,7 +553,15 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
* oext |-----------|
* new_ext |-------|
*/
- BUG_ON(le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end);
+ if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) {
+ ext4_error(orig_inode->i_sb, __func__,
+ "new_ext_end(%u) should be less than or equal to "
+ "oext->ee_block(%u) + oext_alen(%d) - 1",
+ new_ext_end, le32_to_cpu(oext->ee_block),
+ oext_alen);
+ ret = -EIO;
+ goto out;
+ }
/*
* Case: new_ext is smaller than original extent
@@ -543,6 +585,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
ret = mext_insert_extents(handle, orig_inode, orig_path, o_start,
o_end, &start_ext, &new_ext, &end_ext);
+out:
return ret;
}
@@ -554,8 +597,10 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
* @orig_off: block offset of original inode
* @donor_off: block offset of donor inode
* @max_count: the maximun length of extents
+ *
+ * Return 0 on success, or a negative error value on failure.
*/
-static void
+static int
mext_calc_swap_extents(struct ext4_extent *tmp_dext,
struct ext4_extent *tmp_oext,
ext4_lblk_t orig_off, ext4_lblk_t donor_off,
@@ -564,6 +609,19 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
ext4_lblk_t diff, orig_diff;
struct ext4_extent dext_old, oext_old;
+ BUG_ON(orig_off != donor_off);
+
+ /* original and donor extents have to cover the same block offset */
+ if (orig_off < le32_to_cpu(tmp_oext->ee_block) ||
+ le32_to_cpu(tmp_oext->ee_block) +
+ ext4_ext_get_actual_len(tmp_oext) - 1 < orig_off)
+ return -ENODATA;
+
+ if (orig_off < le32_to_cpu(tmp_dext->ee_block) ||
+ le32_to_cpu(tmp_dext->ee_block) +
+ ext4_ext_get_actual_len(tmp_dext) - 1 < orig_off)
+ return -ENODATA;
+
dext_old = *tmp_dext;
oext_old = *tmp_oext;
@@ -591,6 +649,8 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
copy_extent_status(&oext_old, tmp_dext);
copy_extent_status(&dext_old, tmp_oext);
+
+ return 0;
}
/**
@@ -631,13 +691,13 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
mext_double_down_write(orig_inode, donor_inode);
/* Get the original extent for the block "orig_off" */
- get_ext_path(orig_path, orig_inode, orig_off, err);
- if (orig_path == NULL)
+ err = get_ext_path(orig_inode, orig_off, &orig_path);
+ if (err)
goto out;
/* Get the donor extent for the head */
- get_ext_path(donor_path, donor_inode, donor_off, err);
- if (donor_path == NULL)
+ err = get_ext_path(donor_inode, donor_off, &donor_path);
+ if (err)
goto out;
depth = ext_depth(orig_inode);
oext = orig_path[depth].p_ext;
@@ -647,13 +707,28 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
dext = donor_path[depth].p_ext;
tmp_dext = *dext;
- mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
+ err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
donor_off, count);
+ if (err)
+ goto out;
/* Loop for the donor extents */
while (1) {
/* The extent for donor must be found. */
- BUG_ON(!dext || donor_off != le32_to_cpu(tmp_dext.ee_block));
+ if (!dext) {
+ ext4_error(donor_inode->i_sb, __func__,
+ "The extent for donor must be found");
+ err = -EIO;
+ goto out;
+ } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) {
+ ext4_error(donor_inode->i_sb, __func__,
+ "Donor offset(%u) and the first block of donor "
+ "extent(%u) should be equal",
+ donor_off,
+ le32_to_cpu(tmp_dext.ee_block));
+ err = -EIO;
+ goto out;
+ }
/* Set donor extent to orig extent */
err = mext_leaf_block(handle, orig_inode,
@@ -678,8 +753,8 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
if (orig_path)
ext4_ext_drop_refs(orig_path);
- get_ext_path(orig_path, orig_inode, orig_off, err);
- if (orig_path == NULL)
+ err = get_ext_path(orig_inode, orig_off, &orig_path);
+ if (err)
goto out;
depth = ext_depth(orig_inode);
oext = orig_path[depth].p_ext;
@@ -692,9 +767,8 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
if (donor_path)
ext4_ext_drop_refs(donor_path);
- get_ext_path(donor_path, donor_inode,
- donor_off, err);
- if (donor_path == NULL)
+ err = get_ext_path(donor_inode, donor_off, &donor_path);
+ if (err)
goto out;
depth = ext_depth(donor_inode);
dext = donor_path[depth].p_ext;
@@ -705,9 +779,10 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
}
tmp_dext = *dext;
- mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
- donor_off,
- count - replaced_count);
+ err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
+ donor_off, count - replaced_count);
+ if (err)
+ goto out;
}
out:
@@ -740,7 +815,7 @@ out:
* on success, or a negative error value on failure.
*/
static int
-move_extent_par_page(struct file *o_filp, struct inode *donor_inode,
+move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
pgoff_t orig_page_offset, int data_offset_in_page,
int block_len_in_page, int uninit)
{
@@ -871,6 +946,7 @@ out:
if (PageLocked(page))
unlock_page(page);
page_cache_release(page);
+ ext4_journal_stop(handle);
}
out2:
ext4_journal_stop(handle);
@@ -897,6 +973,10 @@ mext_check_arguments(struct inode *orig_inode,
struct inode *donor_inode, __u64 orig_start,
__u64 donor_start, __u64 *len, __u64 moved_len)
{
+ ext4_lblk_t orig_blocks, donor_blocks;
+ unsigned int blkbits = orig_inode->i_blkbits;
+ unsigned int blocksize = 1 << blkbits;
+
/* Regular file check */
if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
ext4_debug("ext4 move extent: The argument files should be "
@@ -960,54 +1040,58 @@ mext_check_arguments(struct inode *orig_inode,
return -EINVAL;
}
- if ((orig_start > MAX_DEFRAG_SIZE) ||
- (donor_start > MAX_DEFRAG_SIZE) ||
- (*len > MAX_DEFRAG_SIZE) ||
- (orig_start + *len > MAX_DEFRAG_SIZE)) {
- ext4_debug("ext4 move extent: Can't handle over [%lu] blocks "
- "[ino:orig %lu, donor %lu]\n", MAX_DEFRAG_SIZE,
+ if ((orig_start > EXT_MAX_BLOCK) ||
+ (donor_start > EXT_MAX_BLOCK) ||
+ (*len > EXT_MAX_BLOCK) ||
+ (orig_start + *len > EXT_MAX_BLOCK)) {
+ ext4_debug("ext4 move extent: Can't handle over [%u] blocks "
+ "[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCK,
orig_inode->i_ino, donor_inode->i_ino);
return -EINVAL;
}
if (orig_inode->i_size > donor_inode->i_size) {
- if (orig_start >= donor_inode->i_size) {
+ donor_blocks = (donor_inode->i_size + blocksize - 1) >> blkbits;
+ /* TODO: eliminate this artificial restriction */
+ if (orig_start >= donor_blocks) {
ext4_debug("ext4 move extent: orig start offset "
- "[%llu] should be less than donor file size "
- "[%lld] [ino:orig %lu, donor_inode %lu]\n",
- orig_start, donor_inode->i_size,
+ "[%llu] should be less than donor file blocks "
+ "[%u] [ino:orig %lu, donor %lu]\n",
+ orig_start, donor_blocks,
orig_inode->i_ino, donor_inode->i_ino);
return -EINVAL;
}
- if (orig_start + *len > donor_inode->i_size) {
+ /* TODO: eliminate this artificial restriction */
+ if (orig_start + *len > donor_blocks) {
ext4_debug("ext4 move extent: End offset [%llu] should "
- "be less than donor file size [%lld]."
- "So adjust length from %llu to %lld "
+ "be less than donor file blocks [%u]."
+ "So adjust length from %llu to %llu "
"[ino:orig %lu, donor %lu]\n",
- orig_start + *len, donor_inode->i_size,
- *len, donor_inode->i_size - orig_start,
+ orig_start + *len, donor_blocks,
+ *len, donor_blocks - orig_start,
orig_inode->i_ino, donor_inode->i_ino);
- *len = donor_inode->i_size - orig_start;
+ *len = donor_blocks - orig_start;
}
} else {
- if (orig_start >= orig_inode->i_size) {
+ orig_blocks = (orig_inode->i_size + blocksize - 1) >> blkbits;
+ if (orig_start >= orig_blocks) {
ext4_debug("ext4 move extent: start offset [%llu] "
- "should be less than original file size "
- "[%lld] [inode:orig %lu, donor %lu]\n",
- orig_start, orig_inode->i_size,
+ "should be less than original file blocks "
+ "[%u] [ino:orig %lu, donor %lu]\n",
+ orig_start, orig_blocks,
orig_inode->i_ino, donor_inode->i_ino);
return -EINVAL;
}
- if (orig_start + *len > orig_inode->i_size) {
+ if (orig_start + *len > orig_blocks) {
ext4_debug("ext4 move extent: Adjust length "
- "from %llu to %lld. Because it should be "
- "less than original file size "
+ "from %llu to %llu. Because it should be "
+ "less than original file blocks "
"[ino:orig %lu, donor %lu]\n",
- *len, orig_inode->i_size - orig_start,
+ *len, orig_blocks - orig_start,
orig_inode->i_ino, donor_inode->i_ino);
- *len = orig_inode->i_size - orig_start;
+ *len = orig_blocks - orig_start;
}
}
@@ -1027,18 +1111,23 @@ mext_check_arguments(struct inode *orig_inode,
* @inode1: the inode structure
* @inode2: the inode structure
*
- * Lock two inodes' i_mutex by i_ino order. This function is moved from
- * fs/inode.c.
+ * Lock two inodes' i_mutex by i_ino order.
+ * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
*/
-static void
+static int
mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
{
- if (inode1 == NULL || inode2 == NULL || inode1 == inode2) {
- if (inode1)
- mutex_lock(&inode1->i_mutex);
- else if (inode2)
- mutex_lock(&inode2->i_mutex);
- return;
+ int ret = 0;
+
+ BUG_ON(inode1 == NULL && inode2 == NULL);
+
+ ret = mext_check_null_inode(inode1, inode2, __func__);
+ if (ret < 0)
+ goto out;
+
+ if (inode1 == inode2) {
+ mutex_lock(&inode1->i_mutex);
+ goto out;
}
if (inode1->i_ino < inode2->i_ino) {
@@ -1048,6 +1137,9 @@ mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT);
mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD);
}
+
+out:
+ return ret;
}
/**
@@ -1056,17 +1148,28 @@ mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
* @inode1: the inode that is released first
* @inode2: the inode that is released second
*
- * This function is moved from fs/inode.c.
+ * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
*/
-static void
+static int
mext_inode_double_unlock(struct inode *inode1, struct inode *inode2)
{
+ int ret = 0;
+
+ BUG_ON(inode1 == NULL && inode2 == NULL);
+
+ ret = mext_check_null_inode(inode1, inode2, __func__);
+ if (ret < 0)
+ goto out;
+
if (inode1)
mutex_unlock(&inode1->i_mutex);
if (inode2 && inode2 != inode1)
mutex_unlock(&inode2->i_mutex);
+
+out:
+ return ret;
}
/**
@@ -1123,70 +1226,76 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0;
ext4_lblk_t rest_blocks;
pgoff_t orig_page_offset = 0, seq_end_page;
- int ret, depth, last_extent = 0;
+ int ret1, ret2, depth, last_extent = 0;
int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
int data_offset_in_page;
int block_len_in_page;
int uninit;
/* protect orig and donor against a truncate */
- mext_inode_double_lock(orig_inode, donor_inode);
+ ret1 = mext_inode_double_lock(orig_inode, donor_inode);
+ if (ret1 < 0)
+ return ret1;
mext_double_down_read(orig_inode, donor_inode);
/* Check the filesystem environment whether move_extent can be done */
- ret = mext_check_arguments(orig_inode, donor_inode, orig_start,
+ ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start,
donor_start, &len, *moved_len);
mext_double_up_read(orig_inode, donor_inode);
- if (ret)
- goto out2;
+ if (ret1)
+ goto out;
file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits;
block_end = block_start + len - 1;
if (file_end < block_end)
len -= block_end - file_end;
- get_ext_path(orig_path, orig_inode, block_start, ret);
- if (orig_path == NULL)
- goto out2;
+ ret1 = get_ext_path(orig_inode, block_start, &orig_path);
+ if (ret1)
+ goto out;
/* Get path structure to check the hole */
- get_ext_path(holecheck_path, orig_inode, block_start, ret);
- if (holecheck_path == NULL)
+ ret1 = get_ext_path(orig_inode, block_start, &holecheck_path);
+ if (ret1)
goto out;
depth = ext_depth(orig_inode);
ext_cur = holecheck_path[depth].p_ext;
- if (ext_cur == NULL) {
- ret = -EINVAL;
- goto out;
- }
/*
- * Get proper extent whose ee_block is beyond block_start
- * if block_start was within the hole.
+ * Get proper starting location of block replacement if block_start was
+ * within the hole.
*/
if (le32_to_cpu(ext_cur->ee_block) +
ext4_ext_get_actual_len(ext_cur) - 1 < block_start) {
+ /*
+ * The hole exists between extents or the tail of
+ * original file.
+ */
last_extent = mext_next_extent(orig_inode,
holecheck_path, &ext_cur);
if (last_extent < 0) {
- ret = last_extent;
+ ret1 = last_extent;
goto out;
}
last_extent = mext_next_extent(orig_inode, orig_path,
&ext_dummy);
if (last_extent < 0) {
- ret = last_extent;
+ ret1 = last_extent;
goto out;
}
- }
- seq_start = block_start;
+ seq_start = le32_to_cpu(ext_cur->ee_block);
+ } else if (le32_to_cpu(ext_cur->ee_block) > block_start)
+ /* The hole exists at the beginning of original file. */
+ seq_start = le32_to_cpu(ext_cur->ee_block);
+ else
+ seq_start = block_start;
/* No blocks within the specified range. */
if (le32_to_cpu(ext_cur->ee_block) > block_end) {
ext4_debug("ext4 move extent: The specified range of file "
"may be the hole\n");
- ret = -EINVAL;
+ ret1 = -EINVAL;
goto out;
}
@@ -1206,7 +1315,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
last_extent = mext_next_extent(orig_inode, holecheck_path,
&ext_cur);
if (last_extent < 0) {
- ret = last_extent;
+ ret1 = last_extent;
break;
}
add_blocks = ext4_ext_get_actual_len(ext_cur);
@@ -1258,16 +1367,23 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
while (orig_page_offset <= seq_end_page) {
/* Swap original branches with new branches */
- ret = move_extent_par_page(o_filp, donor_inode,
+ ret1 = move_extent_per_page(o_filp, donor_inode,
orig_page_offset,
data_offset_in_page,
block_len_in_page, uninit);
- if (ret < 0)
+ if (ret1 < 0)
goto out;
orig_page_offset++;
/* Count how many blocks we have exchanged */
*moved_len += block_len_in_page;
- BUG_ON(*moved_len > len);
+ if (*moved_len > len) {
+ ext4_error(orig_inode->i_sb, __func__,
+ "We replaced blocks too much! "
+ "sum of replaced: %llu requested: %llu",
+ *moved_len, len);
+ ret1 = -EIO;
+ goto out;
+ }
data_offset_in_page = 0;
rest_blocks -= block_len_in_page;
@@ -1280,17 +1396,16 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
/* Decrease buffer counter */
if (holecheck_path)
ext4_ext_drop_refs(holecheck_path);
- get_ext_path(holecheck_path, orig_inode,
- seq_start, ret);
- if (holecheck_path == NULL)
+ ret1 = get_ext_path(orig_inode, seq_start, &holecheck_path);
+ if (ret1)
break;
depth = holecheck_path->p_depth;
/* Decrease buffer counter */
if (orig_path)
ext4_ext_drop_refs(orig_path);
- get_ext_path(orig_path, orig_inode, seq_start, ret);
- if (orig_path == NULL)
+ ret1 = get_ext_path(orig_inode, seq_start, &orig_path);
+ if (ret1)
break;
ext_cur = holecheck_path[depth].p_ext;
@@ -1307,14 +1422,13 @@ out:
ext4_ext_drop_refs(holecheck_path);
kfree(holecheck_path);
}
-out2:
- mext_inode_double_unlock(orig_inode, donor_inode);
- if (ret)
- return ret;
+ ret2 = mext_inode_double_unlock(orig_inode, donor_inode);
- /* All of the specified blocks must be exchanged in succeed */
- BUG_ON(*moved_len != len);
+ if (ret1)
+ return ret1;
+ else if (ret2)
+ return ret2;
return 0;
}
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index de04013d16f..42f81d285cd 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1518,8 +1518,12 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
return retval;
if (blocks == 1 && !dx_fallback &&
- EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX))
- return make_indexed_dir(handle, dentry, inode, bh);
+ EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) {
+ retval = make_indexed_dir(handle, dentry, inode, bh);
+ if (retval == -ENOSPC)
+ brelse(bh);
+ return retval;
+ }
brelse(bh);
}
bh = ext4_append(handle, dir, &block, &retval);
@@ -1528,7 +1532,10 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
de = (struct ext4_dir_entry_2 *) bh->b_data;
de->inode = 0;
de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize);
- return add_dirent_to_buf(handle, dentry, inode, de, bh);
+ retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
+ if (retval == -ENOSPC)
+ brelse(bh);
+ return retval;
}
/*
@@ -1590,9 +1597,9 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
goto cleanup;
node2 = (struct dx_node *)(bh2->b_data);
entries2 = node2->entries;
+ memset(&node2->fake, 0, sizeof(struct fake_dirent));
node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize,
sb->s_blocksize);
- node2->fake.inode = 0;
BUFFER_TRACE(frame->bh, "get_write_access");
err = ext4_journal_get_write_access(handle, frame->bh);
if (err)
@@ -1657,7 +1664,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
if (!de)
goto cleanup;
err = add_dirent_to_buf(handle, dentry, inode, de, bh);
- bh = NULL;
+ if (err != -ENOSPC)
+ bh = NULL;
goto cleanup;
journal_error:
@@ -2310,7 +2318,7 @@ static int ext4_link(struct dentry *old_dentry,
struct inode *inode = old_dentry->d_inode;
int err, retries = 0;
- if (EXT4_DIR_LINK_MAX(inode))
+ if (inode->i_nlink >= EXT4_LINK_MAX)
return -EMLINK;
/*
@@ -2413,7 +2421,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
goto end_rename;
retval = -EMLINK;
if (!new_inode && new_dir != old_dir &&
- new_dir->i_nlink >= EXT4_LINK_MAX)
+ EXT4_DIR_LINK_MAX(new_dir))
goto end_rename;
}
if (!new_bh) {
@@ -2536,7 +2544,7 @@ const struct inode_operations ext4_dir_inode_operations = {
.listxattr = ext4_listxattr,
.removexattr = generic_removexattr,
#endif
- .permission = ext4_permission,
+ .check_acl = ext4_check_acl,
.fiemap = ext4_fiemap,
};
@@ -2548,5 +2556,5 @@ const struct inode_operations ext4_special_inode_operations = {
.listxattr = ext4_listxattr,
.removexattr = generic_removexattr,
#endif
- .permission = ext4_permission,
+ .check_acl = ext4_check_acl,
};
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 68b0351fc64..3cfc343c41b 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -746,7 +746,6 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
struct inode *inode = NULL;
handle_t *handle;
int gdb_off, gdb_num;
- int num_grp_locked = 0;
int err, err2;
gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);
@@ -856,7 +855,6 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
* using the new disk blocks.
*/
- num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, input->group);
/* Update group descriptor block for new group */
gdp = (struct ext4_group_desc *)((char *)primary->b_data +
gdb_off * EXT4_DESC_SIZE(sb));
@@ -875,10 +873,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
* descriptor
*/
err = ext4_mb_add_groupinfo(sb, input->group, gdp);
- if (err) {
- ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked);
+ if (err)
goto exit_journal;
- }
/*
* Make the new blocks and inodes valid next. We do this before
@@ -920,7 +916,6 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
/* Update the global fs size fields */
sbi->s_groups_count++;
- ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked);
ext4_handle_dirty_metadata(handle, NULL, primary);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 8f4f079e6b9..a6b1ab73472 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -45,6 +45,7 @@
#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
+#include "mballoc.h"
#define CREATE_TRACE_POINTS
#include <trace/events/ext4.h>
@@ -344,7 +345,8 @@ static const char *ext4_decode_error(struct super_block *sb, int errno,
errstr = "Out of memory";
break;
case -EROFS:
- if (!sb || EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT)
+ if (!sb || (EXT4_SB(sb)->s_journal &&
+ EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT))
errstr = "Journal has aborted";
else
errstr = "Readonly filesystem";
@@ -1279,11 +1281,9 @@ static int parse_options(char *options, struct super_block *sb,
*journal_devnum = option;
break;
case Opt_journal_checksum:
- set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM);
- break;
+ break; /* Kept for backwards compatibility */
case Opt_journal_async_commit:
set_opt(sbi->s_mount_opt, JOURNAL_ASYNC_COMMIT);
- set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM);
break;
case Opt_noload:
set_opt(sbi->s_mount_opt, NOLOAD);
@@ -1695,12 +1695,12 @@ static int ext4_fill_flex_info(struct super_block *sb)
gdp = ext4_get_group_desc(sb, i, NULL);
flex_group = ext4_flex_group(sbi, i);
- atomic_set(&sbi->s_flex_groups[flex_group].free_inodes,
- ext4_free_inodes_count(sb, gdp));
- atomic_set(&sbi->s_flex_groups[flex_group].free_blocks,
- ext4_free_blks_count(sb, gdp));
- atomic_set(&sbi->s_flex_groups[flex_group].used_dirs,
- ext4_used_dirs_count(sb, gdp));
+ atomic_add(ext4_free_inodes_count(sb, gdp),
+ &sbi->s_flex_groups[flex_group].free_inodes);
+ atomic_add(ext4_free_blks_count(sb, gdp),
+ &sbi->s_flex_groups[flex_group].free_blocks);
+ atomic_add(ext4_used_dirs_count(sb, gdp),
+ &sbi->s_flex_groups[flex_group].used_dirs);
}
return 1;
@@ -2253,6 +2253,49 @@ static struct kobj_type ext4_ktype = {
.release = ext4_sb_release,
};
+/*
+ * Check whether this filesystem can be mounted based on
+ * the features present and the RDONLY/RDWR mount requested.
+ * Returns 1 if this filesystem can be mounted as requested,
+ * 0 if it cannot be.
+ */
+static int ext4_feature_set_ok(struct super_block *sb, int readonly)
+{
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP)) {
+ ext4_msg(sb, KERN_ERR,
+ "Couldn't mount because of "
+ "unsupported optional features (%x)",
+ (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
+ ~EXT4_FEATURE_INCOMPAT_SUPP));
+ return 0;
+ }
+
+ if (readonly)
+ return 1;
+
+ /* Check that feature set is OK for a read-write mount */
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP)) {
+ ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of "
+ "unsupported optional features (%x)",
+ (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
+ ~EXT4_FEATURE_RO_COMPAT_SUPP));
+ return 0;
+ }
+ /*
+ * Large file size enabled file system can only be mounted
+ * read-write on 32-bit systems if kernel is built with CONFIG_LBDAF
+ */
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
+ if (sizeof(blkcnt_t) < sizeof(u64)) {
+ ext4_msg(sb, KERN_ERR, "Filesystem with huge files "
+ "cannot be mounted RDWR without "
+ "CONFIG_LBDAF");
+ return 0;
+ }
+ }
+ return 1;
+}
+
static int ext4_fill_super(struct super_block *sb, void *data, int silent)
__releases(kernel_lock)
__acquires(kernel_lock)
@@ -2274,7 +2317,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
unsigned int db_count;
unsigned int i;
int needs_recovery, has_huge_files;
- int features;
__u64 blocks_count;
int err;
unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
@@ -2401,39 +2443,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
* previously didn't change the revision level when setting the flags,
* so there is a chance incompat flags are set on a rev 0 filesystem.
*/
- features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP);
- if (features) {
- ext4_msg(sb, KERN_ERR,
- "Couldn't mount because of "
- "unsupported optional features (%x)",
- (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
- ~EXT4_FEATURE_INCOMPAT_SUPP));
- goto failed_mount;
- }
- features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP);
- if (!(sb->s_flags & MS_RDONLY) && features) {
- ext4_msg(sb, KERN_ERR,
- "Couldn't mount RDWR because of "
- "unsupported optional features (%x)",
- (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
- ~EXT4_FEATURE_RO_COMPAT_SUPP));
+ if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY)))
goto failed_mount;
- }
- has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
- if (has_huge_files) {
- /*
- * Large file size enabled file system can only be
- * mount if kernel is build with CONFIG_LBDAF
- */
- if (sizeof(root->i_blocks) < sizeof(u64) &&
- !(sb->s_flags & MS_RDONLY)) {
- ext4_msg(sb, KERN_ERR, "Filesystem with huge "
- "files cannot be mounted read-write "
- "without CONFIG_LBDAF");
- goto failed_mount;
- }
- }
+
blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
if (blocksize < EXT4_MIN_BLOCK_SIZE ||
@@ -2469,6 +2481,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
}
+ has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits,
has_huge_files);
sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files);
@@ -2549,12 +2563,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount;
}
- if (ext4_blocks_count(es) >
- (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
+ /*
+ * Test whether we have more sectors than will fit in sector_t,
+ * and whether the max offset is addressable by the page cache.
+ */
+ if ((ext4_blocks_count(es) >
+ (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) ||
+ (ext4_blocks_count(es) >
+ (pgoff_t)(~0ULL) >> (PAGE_CACHE_SHIFT - sb->s_blocksize_bits))) {
ext4_msg(sb, KERN_ERR, "filesystem"
- " too large to mount safely");
+ " too large to mount safely on this system");
if (sizeof(sector_t) < 8)
ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled");
+ ret = -EFBIG;
goto failed_mount;
}
@@ -2595,6 +2616,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount;
}
sbi->s_groups_count = blocks_count;
+ sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count,
+ (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
EXT4_DESC_PER_BLOCK(sb);
sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *),
@@ -2729,20 +2752,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount4;
}
- if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
- jbd2_journal_set_features(sbi->s_journal,
- JBD2_FEATURE_COMPAT_CHECKSUM, 0,
+ jbd2_journal_set_features(sbi->s_journal,
+ JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0);
+ if (test_opt(sb, JOURNAL_ASYNC_COMMIT))
+ jbd2_journal_set_features(sbi->s_journal, 0, 0,
JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
- } else if (test_opt(sb, JOURNAL_CHECKSUM)) {
- jbd2_journal_set_features(sbi->s_journal,
- JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0);
+ else
jbd2_journal_clear_features(sbi->s_journal, 0, 0,
JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
- } else {
- jbd2_journal_clear_features(sbi->s_journal,
- JBD2_FEATURE_COMPAT_CHECKSUM, 0,
- JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
- }
/* We have now updated the journal if required, so we can
* validate the data journaling mode. */
@@ -3208,7 +3225,18 @@ static int ext4_commit_super(struct super_block *sb, int sync)
clear_buffer_write_io_error(sbh);
set_buffer_uptodate(sbh);
}
- es->s_wtime = cpu_to_le32(get_seconds());
+ /*
+ * If the file system is mounted read-only, don't update the
+ * superblock write time. This avoids updating the superblock
+ * write time when we are mounting the root file system
+ * read/only but we need to replay the journal; at that point,
+ * for people who are east of GMT and who make their clock
+ * tick in localtime for Windows bug-for-bug compatibility,
+ * the clock is set in the future, and this will cause e2fsck
+ * to complain and force a full file system check.
+ */
+ if (!(sb->s_flags & MS_RDONLY))
+ es->s_wtime = cpu_to_le32(get_seconds());
es->s_kbytes_written =
cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
@@ -3477,18 +3505,11 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
if (sbi->s_journal)
ext4_mark_recovery_complete(sb, es);
} else {
- int ret;
- if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb,
- ~EXT4_FEATURE_RO_COMPAT_SUPP))) {
- ext4_msg(sb, KERN_WARNING, "couldn't "
- "remount RDWR because of unsupported "
- "optional features (%x)",
- (le32_to_cpu(sbi->s_es->s_feature_ro_compat) &
- ~EXT4_FEATURE_RO_COMPAT_SUPP));
+ /* Make sure we can mount this feature set readwrite */
+ if (!ext4_feature_set_ok(sb, 0)) {
err = -EROFS;
goto restore_opts;
}
-
/*
* Make sure the group descriptor checksums
* are sane. If they aren't, refuse to remount r/w.
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 62b31c24699..fed5b01d7a8 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -810,12 +810,23 @@ inserted:
get_bh(new_bh);
} else {
/* We need to allocate a new block */
- ext4_fsblk_t goal = ext4_group_first_block_no(sb,
+ ext4_fsblk_t goal, block;
+
+ goal = ext4_group_first_block_no(sb,
EXT4_I(inode)->i_block_group);
- ext4_fsblk_t block = ext4_new_meta_blocks(handle, inode,
+
+ /* non-extent files can't have physical blocks past 2^32 */
+ if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+ goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
+
+ block = ext4_new_meta_blocks(handle, inode,
goal, NULL, &error);
if (error)
goto cleanup;
+
+ if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+ BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS);
+
ea_idebug(inode, "creating block %d", block);
new_bh = sb_getblk(sb, block);
diff --git a/fs/fat/file.c b/fs/fat/file.c
index f042b965c95..e8c159de236 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -176,8 +176,26 @@ static int fat_cont_expand(struct inode *inode, loff_t size)
inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
mark_inode_dirty(inode);
- if (IS_SYNC(inode))
- err = sync_page_range_nolock(inode, mapping, start, count);
+ if (IS_SYNC(inode)) {
+ int err2;
+
+ /*
+ * Opencode syncing since we don't have a file open to use
+ * standard fsync path.
+ */
+ err = filemap_fdatawrite_range(mapping, start,
+ start + count - 1);
+ err2 = sync_mapping_buffers(mapping);
+ if (!err)
+ err = err2;
+ err2 = write_inode_now(inode, 1);
+ if (!err)
+ err = err2;
+ if (!err) {
+ err = filemap_fdatawait_range(mapping, start,
+ start + count - 1);
+ }
+ }
out:
return err;
}
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index a6c20473dfd..4e35be873e0 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -119,8 +119,8 @@ int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster)
MSDOS_I(inode)->i_start = new_dclus;
MSDOS_I(inode)->i_logstart = new_dclus;
/*
- * Since generic_osync_inode() synchronize later if
- * this is not directory, we don't here.
+ * Since generic_write_sync() synchronizes regular files later,
+ * we sync here only directories.
*/
if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) {
ret = fat_sync_inode(inode);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index c54226be529..8e1e5e19d21 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -19,171 +19,245 @@
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/mm.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/buffer_head.h>
#include "internal.h"
+#define inode_to_bdi(inode) ((inode)->i_mapping->backing_dev_info)
-/**
- * writeback_acquire - attempt to get exclusive writeback access to a device
- * @bdi: the device's backing_dev_info structure
- *
- * It is a waste of resources to have more than one pdflush thread blocked on
- * a single request queue. Exclusion at the request_queue level is obtained
- * via a flag in the request_queue's backing_dev_info.state.
- *
- * Non-request_queue-backed address_spaces will share default_backing_dev_info,
- * unless they implement their own. Which is somewhat inefficient, as this
- * may prevent concurrent writeback against multiple devices.
+/*
+ * We don't actually have pdflush, but this one is exported though /proc...
+ */
+int nr_pdflush_threads;
+
+/*
+ * Passed into wb_writeback(), essentially a subset of writeback_control
+ */
+struct wb_writeback_args {
+ long nr_pages;
+ struct super_block *sb;
+ enum writeback_sync_modes sync_mode;
+ int for_kupdate;
+ int range_cyclic;
+};
+
+/*
+ * Work items for the bdi_writeback threads
*/
-static int writeback_acquire(struct backing_dev_info *bdi)
+struct bdi_work {
+ struct list_head list; /* pending work list */
+ struct rcu_head rcu_head; /* for RCU free/clear of work */
+
+ unsigned long seen; /* threads that have seen this work */
+ atomic_t pending; /* number of threads still to do work */
+
+ struct wb_writeback_args args; /* writeback arguments */
+
+ unsigned long state; /* flag bits, see WS_* */
+};
+
+enum {
+ WS_USED_B = 0,
+ WS_ONSTACK_B,
+};
+
+#define WS_USED (1 << WS_USED_B)
+#define WS_ONSTACK (1 << WS_ONSTACK_B)
+
+static inline bool bdi_work_on_stack(struct bdi_work *work)
+{
+ return test_bit(WS_ONSTACK_B, &work->state);
+}
+
+static inline void bdi_work_init(struct bdi_work *work,
+ struct wb_writeback_args *args)
{
- return !test_and_set_bit(BDI_pdflush, &bdi->state);
+ INIT_RCU_HEAD(&work->rcu_head);
+ work->args = *args;
+ work->state = WS_USED;
}
/**
* writeback_in_progress - determine whether there is writeback in progress
* @bdi: the device's backing_dev_info structure.
*
- * Determine whether there is writeback in progress against a backing device.
+ * Determine whether there is writeback waiting to be handled against a
+ * backing device.
*/
int writeback_in_progress(struct backing_dev_info *bdi)
{
- return test_bit(BDI_pdflush, &bdi->state);
+ return !list_empty(&bdi->work_list);
}
-/**
- * writeback_release - relinquish exclusive writeback access against a device.
- * @bdi: the device's backing_dev_info structure
- */
-static void writeback_release(struct backing_dev_info *bdi)
+static void bdi_work_clear(struct bdi_work *work)
{
- BUG_ON(!writeback_in_progress(bdi));
- clear_bit(BDI_pdflush, &bdi->state);
+ clear_bit(WS_USED_B, &work->state);
+ smp_mb__after_clear_bit();
+ /*
+ * work can have disappeared at this point. bit waitq functions
+ * should be able to tolerate this, provided bdi_sched_wait does
+ * not dereference it's pointer argument.
+ */
+ wake_up_bit(&work->state, WS_USED_B);
}
-static noinline void block_dump___mark_inode_dirty(struct inode *inode)
+static void bdi_work_free(struct rcu_head *head)
{
- if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
- struct dentry *dentry;
- const char *name = "?";
+ struct bdi_work *work = container_of(head, struct bdi_work, rcu_head);
- dentry = d_find_alias(inode);
- if (dentry) {
- spin_lock(&dentry->d_lock);
- name = (const char *) dentry->d_name.name;
- }
- printk(KERN_DEBUG
- "%s(%d): dirtied inode %lu (%s) on %s\n",
- current->comm, task_pid_nr(current), inode->i_ino,
- name, inode->i_sb->s_id);
- if (dentry) {
- spin_unlock(&dentry->d_lock);
- dput(dentry);
- }
- }
+ if (!bdi_work_on_stack(work))
+ kfree(work);
+ else
+ bdi_work_clear(work);
}
-/**
- * __mark_inode_dirty - internal function
- * @inode: inode to mark
- * @flags: what kind of dirty (i.e. I_DIRTY_SYNC)
- * Mark an inode as dirty. Callers should use mark_inode_dirty or
- * mark_inode_dirty_sync.
- *
- * Put the inode on the super block's dirty list.
- *
- * CAREFUL! We mark it dirty unconditionally, but move it onto the
- * dirty list only if it is hashed or if it refers to a blockdev.
- * If it was not hashed, it will never be added to the dirty list
- * even if it is later hashed, as it will have been marked dirty already.
- *
- * In short, make sure you hash any inodes _before_ you start marking
- * them dirty.
- *
- * This function *must* be atomic for the I_DIRTY_PAGES case -
- * set_page_dirty() is called under spinlock in several places.
- *
- * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
- * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of
- * the kernel-internal blockdev inode represents the dirtying time of the
- * blockdev's pages. This is why for I_DIRTY_PAGES we always use
- * page->mapping->host, so the page-dirtying time is recorded in the internal
- * blockdev inode.
- */
-void __mark_inode_dirty(struct inode *inode, int flags)
+static void wb_work_complete(struct bdi_work *work)
{
- struct super_block *sb = inode->i_sb;
+ const enum writeback_sync_modes sync_mode = work->args.sync_mode;
+ int onstack = bdi_work_on_stack(work);
/*
- * Don't do this for I_DIRTY_PAGES - that doesn't actually
- * dirty the inode itself
+ * For allocated work, we can clear the done/seen bit right here.
+ * For on-stack work, we need to postpone both the clear and free
+ * to after the RCU grace period, since the stack could be invalidated
+ * as soon as bdi_work_clear() has done the wakeup.
*/
- if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
- if (sb->s_op->dirty_inode)
- sb->s_op->dirty_inode(inode);
- }
+ if (!onstack)
+ bdi_work_clear(work);
+ if (sync_mode == WB_SYNC_NONE || onstack)
+ call_rcu(&work->rcu_head, bdi_work_free);
+}
+static void wb_clear_pending(struct bdi_writeback *wb, struct bdi_work *work)
+{
/*
- * make sure that changes are seen by all cpus before we test i_state
- * -- mikulas
+ * The caller has retrieved the work arguments from this work,
+ * drop our reference. If this is the last ref, delete and free it
*/
- smp_mb();
+ if (atomic_dec_and_test(&work->pending)) {
+ struct backing_dev_info *bdi = wb->bdi;
- /* avoid the locking if we can */
- if ((inode->i_state & flags) == flags)
- return;
+ spin_lock(&bdi->wb_lock);
+ list_del_rcu(&work->list);
+ spin_unlock(&bdi->wb_lock);
- if (unlikely(block_dump))
- block_dump___mark_inode_dirty(inode);
+ wb_work_complete(work);
+ }
+}
- spin_lock(&inode_lock);
- if ((inode->i_state & flags) != flags) {
- const int was_dirty = inode->i_state & I_DIRTY;
+static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work)
+{
+ work->seen = bdi->wb_mask;
+ BUG_ON(!work->seen);
+ atomic_set(&work->pending, bdi->wb_cnt);
+ BUG_ON(!bdi->wb_cnt);
- inode->i_state |= flags;
+ /*
+ * list_add_tail_rcu() contains the necessary barriers to
+ * make sure the above stores are seen before the item is
+ * noticed on the list
+ */
+ spin_lock(&bdi->wb_lock);
+ list_add_tail_rcu(&work->list, &bdi->work_list);
+ spin_unlock(&bdi->wb_lock);
- /*
- * If the inode is being synced, just update its dirty state.
- * The unlocker will place the inode on the appropriate
- * superblock list, based upon its state.
- */
- if (inode->i_state & I_SYNC)
- goto out;
+ /*
+ * If the default thread isn't there, make sure we add it. When
+ * it gets created and wakes up, we'll run this work.
+ */
+ if (unlikely(list_empty_careful(&bdi->wb_list)))
+ wake_up_process(default_backing_dev_info.wb.task);
+ else {
+ struct bdi_writeback *wb = &bdi->wb;
- /*
- * Only add valid (hashed) inodes to the superblock's
- * dirty list. Add blockdev inodes as well.
- */
- if (!S_ISBLK(inode->i_mode)) {
- if (hlist_unhashed(&inode->i_hash))
- goto out;
- }
- if (inode->i_state & (I_FREEING|I_CLEAR))
- goto out;
+ if (wb->task)
+ wake_up_process(wb->task);
+ }
+}
- /*
- * If the inode was already on s_dirty/s_io/s_more_io, don't
- * reposition it (that would break s_dirty time-ordering).
- */
- if (!was_dirty) {
- inode->dirtied_when = jiffies;
- list_move(&inode->i_list, &sb->s_dirty);
- }
+/*
+ * Used for on-stack allocated work items. The caller needs to wait until
+ * the wb threads have acked the work before it's safe to continue.
+ */
+static void bdi_wait_on_work_clear(struct bdi_work *work)
+{
+ wait_on_bit(&work->state, WS_USED_B, bdi_sched_wait,
+ TASK_UNINTERRUPTIBLE);
+}
+
+static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
+ struct wb_writeback_args *args)
+{
+ struct bdi_work *work;
+
+ /*
+ * This is WB_SYNC_NONE writeback, so if allocation fails just
+ * wakeup the thread for old dirty data writeback
+ */
+ work = kmalloc(sizeof(*work), GFP_ATOMIC);
+ if (work) {
+ bdi_work_init(work, args);
+ bdi_queue_work(bdi, work);
+ } else {
+ struct bdi_writeback *wb = &bdi->wb;
+
+ if (wb->task)
+ wake_up_process(wb->task);
}
-out:
- spin_unlock(&inode_lock);
}
-EXPORT_SYMBOL(__mark_inode_dirty);
+/**
+ * bdi_sync_writeback - start and wait for writeback
+ * @bdi: the backing device to write from
+ * @sb: write inodes from this super_block
+ *
+ * Description:
+ * This does WB_SYNC_ALL data integrity writeback and waits for the
+ * IO to complete. Callers must hold the sb s_umount semaphore for
+ * reading, to avoid having the super disappear before we are done.
+ */
+static void bdi_sync_writeback(struct backing_dev_info *bdi,
+ struct super_block *sb)
+{
+ struct wb_writeback_args args = {
+ .sb = sb,
+ .sync_mode = WB_SYNC_ALL,
+ .nr_pages = LONG_MAX,
+ .range_cyclic = 0,
+ };
+ struct bdi_work work;
-static int write_inode(struct inode *inode, int sync)
+ bdi_work_init(&work, &args);
+ work.state |= WS_ONSTACK;
+
+ bdi_queue_work(bdi, &work);
+ bdi_wait_on_work_clear(&work);
+}
+
+/**
+ * bdi_start_writeback - start writeback
+ * @bdi: the backing device to write from
+ * @nr_pages: the number of pages to write
+ *
+ * Description:
+ * This does WB_SYNC_NONE opportunistic writeback. The IO is only
+ * started when this function returns, we make no guarentees on
+ * completion. Caller need not hold sb s_umount semaphore.
+ *
+ */
+void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
{
- if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode))
- return inode->i_sb->s_op->write_inode(inode, sync);
- return 0;
+ struct wb_writeback_args args = {
+ .sync_mode = WB_SYNC_NONE,
+ .nr_pages = nr_pages,
+ .range_cyclic = 1,
+ };
+
+ bdi_alloc_queue_work(bdi, &args);
}
/*
@@ -191,31 +265,32 @@ static int write_inode(struct inode *inode, int sync)
* furthest end of its superblock's dirty-inode list.
*
* Before stamping the inode's ->dirtied_when, we check to see whether it is
- * already the most-recently-dirtied inode on the s_dirty list. If that is
+ * already the most-recently-dirtied inode on the b_dirty list. If that is
* the case then the inode must have been redirtied while it was being written
* out and we don't reset its dirtied_when.
*/
static void redirty_tail(struct inode *inode)
{
- struct super_block *sb = inode->i_sb;
+ struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
- if (!list_empty(&sb->s_dirty)) {
- struct inode *tail_inode;
+ if (!list_empty(&wb->b_dirty)) {
+ struct inode *tail;
- tail_inode = list_entry(sb->s_dirty.next, struct inode, i_list);
- if (time_before(inode->dirtied_when,
- tail_inode->dirtied_when))
+ tail = list_entry(wb->b_dirty.next, struct inode, i_list);
+ if (time_before(inode->dirtied_when, tail->dirtied_when))
inode->dirtied_when = jiffies;
}
- list_move(&inode->i_list, &sb->s_dirty);
+ list_move(&inode->i_list, &wb->b_dirty);
}
/*
- * requeue inode for re-scanning after sb->s_io list is exhausted.
+ * requeue inode for re-scanning after bdi->b_io list is exhausted.
*/
static void requeue_io(struct inode *inode)
{
- list_move(&inode->i_list, &inode->i_sb->s_more_io);
+ struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
+
+ list_move(&inode->i_list, &wb->b_more_io);
}
static void inode_sync_complete(struct inode *inode)
@@ -262,20 +337,18 @@ static void move_expired_inodes(struct list_head *delaying_queue,
/*
* Queue all expired dirty inodes for io, eldest first.
*/
-static void queue_io(struct super_block *sb,
- unsigned long *older_than_this)
+static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
{
- list_splice_init(&sb->s_more_io, sb->s_io.prev);
- move_expired_inodes(&sb->s_dirty, &sb->s_io, older_than_this);
+ list_splice_init(&wb->b_more_io, wb->b_io.prev);
+ move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
}
-int sb_has_dirty_inodes(struct super_block *sb)
+static int write_inode(struct inode *inode, int sync)
{
- return !list_empty(&sb->s_dirty) ||
- !list_empty(&sb->s_io) ||
- !list_empty(&sb->s_more_io);
+ if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode))
+ return inode->i_sb->s_op->write_inode(inode, sync);
+ return 0;
}
-EXPORT_SYMBOL(sb_has_dirty_inodes);
/*
* Wait for writeback on an inode to complete.
@@ -322,11 +395,11 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
if (inode->i_state & I_SYNC) {
/*
* If this inode is locked for writeback and we are not doing
- * writeback-for-data-integrity, move it to s_more_io so that
+ * writeback-for-data-integrity, move it to b_more_io so that
* writeback can proceed with the other inodes on s_io.
*
* We'll have another go at writing back this inode when we
- * completed a full scan of s_io.
+ * completed a full scan of b_io.
*/
if (!wait) {
requeue_io(inode);
@@ -371,11 +444,11 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
/*
* We didn't write back all the pages. nfs_writepages()
* sometimes bales out without doing anything. Redirty
- * the inode; Move it from s_io onto s_more_io/s_dirty.
+ * the inode; Move it from b_io onto b_more_io/b_dirty.
*/
/*
* akpm: if the caller was the kupdate function we put
- * this inode at the head of s_dirty so it gets first
+ * this inode at the head of b_dirty so it gets first
* consideration. Otherwise, move it to the tail, for
* the reasons described there. I'm not really sure
* how much sense this makes. Presumably I had a good
@@ -385,7 +458,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
if (wbc->for_kupdate) {
/*
* For the kupdate function we move the inode
- * to s_more_io so it will get more writeout as
+ * to b_more_io so it will get more writeout as
* soon as the queue becomes uncongested.
*/
inode->i_state |= I_DIRTY_PAGES;
@@ -434,50 +507,84 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
}
/*
- * Write out a superblock's list of dirty inodes. A wait will be performed
- * upon no inodes, all inodes or the final one, depending upon sync_mode.
- *
- * If older_than_this is non-NULL, then only write out inodes which
- * had their first dirtying at a time earlier than *older_than_this.
- *
- * If we're a pdflush thread, then implement pdflush collision avoidance
- * against the entire list.
+ * For WB_SYNC_NONE writeback, the caller does not have the sb pinned
+ * before calling writeback. So make sure that we do pin it, so it doesn't
+ * go away while we are writing inodes from it.
*
- * If `bdi' is non-zero then we're being asked to writeback a specific queue.
- * This function assumes that the blockdev superblock's inodes are backed by
- * a variety of queues, so all inodes are searched. For other superblocks,
- * assume that all inodes are backed by the same queue.
- *
- * FIXME: this linear search could get expensive with many fileystems. But
- * how to fix? We need to go from an address_space to all inodes which share
- * a queue with that address_space. (Easy: have a global "dirty superblocks"
- * list).
- *
- * The inodes to be written are parked on sb->s_io. They are moved back onto
- * sb->s_dirty as they are selected for writing. This way, none can be missed
- * on the writer throttling path, and we get decent balancing between many
- * throttled threads: we don't want them all piling up on inode_sync_wait.
+ * Returns 0 if the super was successfully pinned (or pinning wasn't needed),
+ * 1 if we failed.
*/
-void generic_sync_sb_inodes(struct super_block *sb,
+static int pin_sb_for_writeback(struct writeback_control *wbc,
+ struct inode *inode)
+{
+ struct super_block *sb = inode->i_sb;
+
+ /*
+ * Caller must already hold the ref for this
+ */
+ if (wbc->sync_mode == WB_SYNC_ALL) {
+ WARN_ON(!rwsem_is_locked(&sb->s_umount));
+ return 0;
+ }
+
+ spin_lock(&sb_lock);
+ sb->s_count++;
+ if (down_read_trylock(&sb->s_umount)) {
+ if (sb->s_root) {
+ spin_unlock(&sb_lock);
+ return 0;
+ }
+ /*
+ * umounted, drop rwsem again and fall through to failure
+ */
+ up_read(&sb->s_umount);
+ }
+
+ sb->s_count--;
+ spin_unlock(&sb_lock);
+ return 1;
+}
+
+static void unpin_sb_for_writeback(struct writeback_control *wbc,
+ struct inode *inode)
+{
+ struct super_block *sb = inode->i_sb;
+
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ return;
+
+ up_read(&sb->s_umount);
+ put_super(sb);
+}
+
+static void writeback_inodes_wb(struct bdi_writeback *wb,
struct writeback_control *wbc)
{
+ struct super_block *sb = wbc->sb;
+ const int is_blkdev_sb = sb_is_blkdev_sb(sb);
const unsigned long start = jiffies; /* livelock avoidance */
- int sync = wbc->sync_mode == WB_SYNC_ALL;
spin_lock(&inode_lock);
- if (!wbc->for_kupdate || list_empty(&sb->s_io))
- queue_io(sb, wbc->older_than_this);
- while (!list_empty(&sb->s_io)) {
- struct inode *inode = list_entry(sb->s_io.prev,
+ if (!wbc->for_kupdate || list_empty(&wb->b_io))
+ queue_io(wb, wbc->older_than_this);
+
+ while (!list_empty(&wb->b_io)) {
+ struct inode *inode = list_entry(wb->b_io.prev,
struct inode, i_list);
- struct address_space *mapping = inode->i_mapping;
- struct backing_dev_info *bdi = mapping->backing_dev_info;
long pages_skipped;
- if (!bdi_cap_writeback_dirty(bdi)) {
+ /*
+ * super block given and doesn't match, skip this inode
+ */
+ if (sb && sb != inode->i_sb) {
+ redirty_tail(inode);
+ continue;
+ }
+
+ if (!bdi_cap_writeback_dirty(wb->bdi)) {
redirty_tail(inode);
- if (sb_is_blkdev_sb(sb)) {
+ if (is_blkdev_sb) {
/*
* Dirty memory-backed blockdev: the ramdisk
* driver does this. Skip just this inode
@@ -497,21 +604,14 @@ void generic_sync_sb_inodes(struct super_block *sb,
continue;
}
- if (wbc->nonblocking && bdi_write_congested(bdi)) {
+ if (wbc->nonblocking && bdi_write_congested(wb->bdi)) {
wbc->encountered_congestion = 1;
- if (!sb_is_blkdev_sb(sb))
+ if (!is_blkdev_sb)
break; /* Skip a congested fs */
requeue_io(inode);
continue; /* Skip a congested blockdev */
}
- if (wbc->bdi && bdi != wbc->bdi) {
- if (!sb_is_blkdev_sb(sb))
- break; /* fs has the wrong queue */
- requeue_io(inode);
- continue; /* blockdev has wrong queue */
- }
-
/*
* Was this inode dirtied after sync_sb_inodes was called?
* This keeps sync from extra jobs and livelock.
@@ -519,16 +619,16 @@ void generic_sync_sb_inodes(struct super_block *sb,
if (inode_dirtied_after(inode, start))
break;
- /* Is another pdflush already flushing this queue? */
- if (current_is_pdflush() && !writeback_acquire(bdi))
- break;
+ if (pin_sb_for_writeback(wbc, inode)) {
+ requeue_io(inode);
+ continue;
+ }
BUG_ON(inode->i_state & (I_FREEING | I_CLEAR));
__iget(inode);
pages_skipped = wbc->pages_skipped;
writeback_single_inode(inode, wbc);
- if (current_is_pdflush())
- writeback_release(bdi);
+ unpin_sb_for_writeback(wbc, inode);
if (wbc->pages_skipped != pages_skipped) {
/*
* writeback is not making progress due to locked
@@ -544,144 +644,520 @@ void generic_sync_sb_inodes(struct super_block *sb,
wbc->more_io = 1;
break;
}
- if (!list_empty(&sb->s_more_io))
+ if (!list_empty(&wb->b_more_io))
wbc->more_io = 1;
}
- if (sync) {
- struct inode *inode, *old_inode = NULL;
+ spin_unlock(&inode_lock);
+ /* Leave any unwritten inodes on b_io */
+}
+
+void writeback_inodes_wbc(struct writeback_control *wbc)
+{
+ struct backing_dev_info *bdi = wbc->bdi;
+ writeback_inodes_wb(&bdi->wb, wbc);
+}
+
+/*
+ * The maximum number of pages to writeout in a single bdi flush/kupdate
+ * operation. We do this so we don't hold I_SYNC against an inode for
+ * enormous amounts of time, which would block a userspace task which has
+ * been forced to throttle against that inode. Also, the code reevaluates
+ * the dirty each time it has written this many pages.
+ */
+#define MAX_WRITEBACK_PAGES 1024
+
+static inline bool over_bground_thresh(void)
+{
+ unsigned long background_thresh, dirty_thresh;
+
+ get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
+
+ return (global_page_state(NR_FILE_DIRTY) +
+ global_page_state(NR_UNSTABLE_NFS) >= background_thresh);
+}
+
+/*
+ * Explicit flushing or periodic writeback of "old" data.
+ *
+ * Define "old": the first time one of an inode's pages is dirtied, we mark the
+ * dirtying-time in the inode's address_space. So this periodic writeback code
+ * just walks the superblock inode list, writing back any inodes which are
+ * older than a specific point in time.
+ *
+ * Try to run once per dirty_writeback_interval. But if a writeback event
+ * takes longer than a dirty_writeback_interval interval, then leave a
+ * one-second gap.
+ *
+ * older_than_this takes precedence over nr_to_write. So we'll only write back
+ * all dirty pages if they are all attached to "old" mappings.
+ */
+static long wb_writeback(struct bdi_writeback *wb,
+ struct wb_writeback_args *args)
+{
+ struct writeback_control wbc = {
+ .bdi = wb->bdi,
+ .sb = args->sb,
+ .sync_mode = args->sync_mode,
+ .older_than_this = NULL,
+ .for_kupdate = args->for_kupdate,
+ .range_cyclic = args->range_cyclic,
+ };
+ unsigned long oldest_jif;
+ long wrote = 0;
+
+ if (wbc.for_kupdate) {
+ wbc.older_than_this = &oldest_jif;
+ oldest_jif = jiffies -
+ msecs_to_jiffies(dirty_expire_interval * 10);
+ }
+ if (!wbc.range_cyclic) {
+ wbc.range_start = 0;
+ wbc.range_end = LLONG_MAX;
+ }
+
+ for (;;) {
/*
- * Data integrity sync. Must wait for all pages under writeback,
- * because there may have been pages dirtied before our sync
- * call, but which had writeout started before we write it out.
- * In which case, the inode may not be on the dirty list, but
- * we still have to wait for that writeout.
+ * Don't flush anything for non-integrity writeback where
+ * no nr_pages was given
*/
- list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
- struct address_space *mapping;
+ if (!args->for_kupdate && args->nr_pages <= 0 &&
+ args->sync_mode == WB_SYNC_NONE)
+ break;
- if (inode->i_state &
- (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
- continue;
- mapping = inode->i_mapping;
- if (mapping->nrpages == 0)
+ /*
+ * If no specific pages were given and this is just a
+ * periodic background writeout and we are below the
+ * background dirty threshold, don't do anything
+ */
+ if (args->for_kupdate && args->nr_pages <= 0 &&
+ !over_bground_thresh())
+ break;
+
+ wbc.more_io = 0;
+ wbc.encountered_congestion = 0;
+ wbc.nr_to_write = MAX_WRITEBACK_PAGES;
+ wbc.pages_skipped = 0;
+ writeback_inodes_wb(wb, &wbc);
+ args->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
+ wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
+
+ /*
+ * If we ran out of stuff to write, bail unless more_io got set
+ */
+ if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
+ if (wbc.more_io && !wbc.for_kupdate)
continue;
- __iget(inode);
- spin_unlock(&inode_lock);
- /*
- * We hold a reference to 'inode' so it couldn't have
- * been removed from s_inodes list while we dropped the
- * inode_lock. We cannot iput the inode now as we can
- * be holding the last reference and we cannot iput it
- * under inode_lock. So we keep the reference and iput
- * it later.
- */
- iput(old_inode);
- old_inode = inode;
+ break;
+ }
+ }
+
+ return wrote;
+}
+
+/*
+ * Return the next bdi_work struct that hasn't been processed by this
+ * wb thread yet. ->seen is initially set for each thread that exists
+ * for this device, when a thread first notices a piece of work it
+ * clears its bit. Depending on writeback type, the thread will notify
+ * completion on either receiving the work (WB_SYNC_NONE) or after
+ * it is done (WB_SYNC_ALL).
+ */
+static struct bdi_work *get_next_work_item(struct backing_dev_info *bdi,
+ struct bdi_writeback *wb)
+{
+ struct bdi_work *work, *ret = NULL;
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(work, &bdi->work_list, list) {
+ if (!test_bit(wb->nr, &work->seen))
+ continue;
+ clear_bit(wb->nr, &work->seen);
+
+ ret = work;
+ break;
+ }
+
+ rcu_read_unlock();
+ return ret;
+}
+
+static long wb_check_old_data_flush(struct bdi_writeback *wb)
+{
+ unsigned long expired;
+ long nr_pages;
+
+ expired = wb->last_old_flush +
+ msecs_to_jiffies(dirty_writeback_interval * 10);
+ if (time_before(jiffies, expired))
+ return 0;
+
+ wb->last_old_flush = jiffies;
+ nr_pages = global_page_state(NR_FILE_DIRTY) +
+ global_page_state(NR_UNSTABLE_NFS) +
+ (inodes_stat.nr_inodes - inodes_stat.nr_unused);
+
+ if (nr_pages) {
+ struct wb_writeback_args args = {
+ .nr_pages = nr_pages,
+ .sync_mode = WB_SYNC_NONE,
+ .for_kupdate = 1,
+ .range_cyclic = 1,
+ };
+
+ return wb_writeback(wb, &args);
+ }
+
+ return 0;
+}
+
+/*
+ * Retrieve work items and do the writeback they describe
+ */
+long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
+{
+ struct backing_dev_info *bdi = wb->bdi;
+ struct bdi_work *work;
+ long wrote = 0;
- filemap_fdatawait(mapping);
+ while ((work = get_next_work_item(bdi, wb)) != NULL) {
+ struct wb_writeback_args args = work->args;
- cond_resched();
+ /*
+ * Override sync mode, in case we must wait for completion
+ */
+ if (force_wait)
+ work->args.sync_mode = args.sync_mode = WB_SYNC_ALL;
- spin_lock(&inode_lock);
+ /*
+ * If this isn't a data integrity operation, just notify
+ * that we have seen this work and we are now starting it.
+ */
+ if (args.sync_mode == WB_SYNC_NONE)
+ wb_clear_pending(wb, work);
+
+ wrote += wb_writeback(wb, &args);
+
+ /*
+ * This is a data integrity writeback, so only do the
+ * notification when we have completed the work.
+ */
+ if (args.sync_mode == WB_SYNC_ALL)
+ wb_clear_pending(wb, work);
+ }
+
+ /*
+ * Check for periodic writeback, kupdated() style
+ */
+ wrote += wb_check_old_data_flush(wb);
+
+ return wrote;
+}
+
+/*
+ * Handle writeback of dirty data for the device backed by this bdi. Also
+ * wakes up periodically and does kupdated style flushing.
+ */
+int bdi_writeback_task(struct bdi_writeback *wb)
+{
+ unsigned long last_active = jiffies;
+ unsigned long wait_jiffies = -1UL;
+ long pages_written;
+
+ while (!kthread_should_stop()) {
+ pages_written = wb_do_writeback(wb, 0);
+
+ if (pages_written)
+ last_active = jiffies;
+ else if (wait_jiffies != -1UL) {
+ unsigned long max_idle;
+
+ /*
+ * Longest period of inactivity that we tolerate. If we
+ * see dirty data again later, the task will get
+ * recreated automatically.
+ */
+ max_idle = max(5UL * 60 * HZ, wait_jiffies);
+ if (time_after(jiffies, max_idle + last_active))
+ break;
}
- spin_unlock(&inode_lock);
- iput(old_inode);
- } else
- spin_unlock(&inode_lock);
- return; /* Leave any unwritten inodes on s_io */
+ wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
+ schedule_timeout_interruptible(wait_jiffies);
+ try_to_freeze();
+ }
+
+ return 0;
}
-EXPORT_SYMBOL_GPL(generic_sync_sb_inodes);
-static void sync_sb_inodes(struct super_block *sb,
- struct writeback_control *wbc)
+/*
+ * Schedule writeback for all backing devices. This does WB_SYNC_NONE
+ * writeback, for integrity writeback see bdi_sync_writeback().
+ */
+static void bdi_writeback_all(struct super_block *sb, long nr_pages)
{
- generic_sync_sb_inodes(sb, wbc);
+ struct wb_writeback_args args = {
+ .sb = sb,
+ .nr_pages = nr_pages,
+ .sync_mode = WB_SYNC_NONE,
+ };
+ struct backing_dev_info *bdi;
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
+ if (!bdi_has_dirty_io(bdi))
+ continue;
+
+ bdi_alloc_queue_work(bdi, &args);
+ }
+
+ rcu_read_unlock();
}
/*
- * Start writeback of dirty pagecache data against all unlocked inodes.
+ * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
+ * the whole world.
+ */
+void wakeup_flusher_threads(long nr_pages)
+{
+ if (nr_pages == 0)
+ nr_pages = global_page_state(NR_FILE_DIRTY) +
+ global_page_state(NR_UNSTABLE_NFS);
+ bdi_writeback_all(NULL, nr_pages);
+}
+
+static noinline void block_dump___mark_inode_dirty(struct inode *inode)
+{
+ if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
+ struct dentry *dentry;
+ const char *name = "?";
+
+ dentry = d_find_alias(inode);
+ if (dentry) {
+ spin_lock(&dentry->d_lock);
+ name = (const char *) dentry->d_name.name;
+ }
+ printk(KERN_DEBUG
+ "%s(%d): dirtied inode %lu (%s) on %s\n",
+ current->comm, task_pid_nr(current), inode->i_ino,
+ name, inode->i_sb->s_id);
+ if (dentry) {
+ spin_unlock(&dentry->d_lock);
+ dput(dentry);
+ }
+ }
+}
+
+/**
+ * __mark_inode_dirty - internal function
+ * @inode: inode to mark
+ * @flags: what kind of dirty (i.e. I_DIRTY_SYNC)
+ * Mark an inode as dirty. Callers should use mark_inode_dirty or
+ * mark_inode_dirty_sync.
+ *
+ * Put the inode on the super block's dirty list.
*
- * Note:
- * We don't need to grab a reference to superblock here. If it has non-empty
- * ->s_dirty it's hadn't been killed yet and kill_super() won't proceed
- * past sync_inodes_sb() until the ->s_dirty/s_io/s_more_io lists are all
- * empty. Since __sync_single_inode() regains inode_lock before it finally moves
- * inode from superblock lists we are OK.
+ * CAREFUL! We mark it dirty unconditionally, but move it onto the
+ * dirty list only if it is hashed or if it refers to a blockdev.
+ * If it was not hashed, it will never be added to the dirty list
+ * even if it is later hashed, as it will have been marked dirty already.
*
- * If `older_than_this' is non-zero then only flush inodes which have a
- * flushtime older than *older_than_this.
+ * In short, make sure you hash any inodes _before_ you start marking
+ * them dirty.
*
- * If `bdi' is non-zero then we will scan the first inode against each
- * superblock until we find the matching ones. One group will be the dirty
- * inodes against a filesystem. Then when we hit the dummy blockdev superblock,
- * sync_sb_inodes will seekout the blockdev which matches `bdi'. Maybe not
- * super-efficient but we're about to do a ton of I/O...
+ * This function *must* be atomic for the I_DIRTY_PAGES case -
+ * set_page_dirty() is called under spinlock in several places.
+ *
+ * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
+ * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of
+ * the kernel-internal blockdev inode represents the dirtying time of the
+ * blockdev's pages. This is why for I_DIRTY_PAGES we always use
+ * page->mapping->host, so the page-dirtying time is recorded in the internal
+ * blockdev inode.
*/
-void
-writeback_inodes(struct writeback_control *wbc)
+void __mark_inode_dirty(struct inode *inode, int flags)
{
- struct super_block *sb;
+ struct super_block *sb = inode->i_sb;
- might_sleep();
- spin_lock(&sb_lock);
-restart:
- list_for_each_entry_reverse(sb, &super_blocks, s_list) {
- if (sb_has_dirty_inodes(sb)) {
- /* we're making our own get_super here */
- sb->s_count++;
- spin_unlock(&sb_lock);
- /*
- * If we can't get the readlock, there's no sense in
- * waiting around, most of the time the FS is going to
- * be unmounted by the time it is released.
- */
- if (down_read_trylock(&sb->s_umount)) {
- if (sb->s_root)
- sync_sb_inodes(sb, wbc);
- up_read(&sb->s_umount);
+ /*
+ * Don't do this for I_DIRTY_PAGES - that doesn't actually
+ * dirty the inode itself
+ */
+ if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
+ if (sb->s_op->dirty_inode)
+ sb->s_op->dirty_inode(inode);
+ }
+
+ /*
+ * make sure that changes are seen by all cpus before we test i_state
+ * -- mikulas
+ */
+ smp_mb();
+
+ /* avoid the locking if we can */
+ if ((inode->i_state & flags) == flags)
+ return;
+
+ if (unlikely(block_dump))
+ block_dump___mark_inode_dirty(inode);
+
+ spin_lock(&inode_lock);
+ if ((inode->i_state & flags) != flags) {
+ const int was_dirty = inode->i_state & I_DIRTY;
+
+ inode->i_state |= flags;
+
+ /*
+ * If the inode is being synced, just update its dirty state.
+ * The unlocker will place the inode on the appropriate
+ * superblock list, based upon its state.
+ */
+ if (inode->i_state & I_SYNC)
+ goto out;
+
+ /*
+ * Only add valid (hashed) inodes to the superblock's
+ * dirty list. Add blockdev inodes as well.
+ */
+ if (!S_ISBLK(inode->i_mode)) {
+ if (hlist_unhashed(&inode->i_hash))
+ goto out;
+ }
+ if (inode->i_state & (I_FREEING|I_CLEAR))
+ goto out;
+
+ /*
+ * If the inode was already on b_dirty/b_io/b_more_io, don't
+ * reposition it (that would break b_dirty time-ordering).
+ */
+ if (!was_dirty) {
+ struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
+ struct backing_dev_info *bdi = wb->bdi;
+
+ if (bdi_cap_writeback_dirty(bdi) &&
+ !test_bit(BDI_registered, &bdi->state)) {
+ WARN_ON(1);
+ printk(KERN_ERR "bdi-%s not registered\n",
+ bdi->name);
}
- spin_lock(&sb_lock);
- if (__put_super_and_need_restart(sb))
- goto restart;
+
+ inode->dirtied_when = jiffies;
+ list_move(&inode->i_list, &wb->b_dirty);
}
- if (wbc->nr_to_write <= 0)
- break;
}
- spin_unlock(&sb_lock);
+out:
+ spin_unlock(&inode_lock);
}
+EXPORT_SYMBOL(__mark_inode_dirty);
/*
- * writeback and wait upon the filesystem's dirty inodes. The caller will
- * do this in two passes - one to write, and one to wait.
+ * Write out a superblock's list of dirty inodes. A wait will be performed
+ * upon no inodes, all inodes or the final one, depending upon sync_mode.
+ *
+ * If older_than_this is non-NULL, then only write out inodes which
+ * had their first dirtying at a time earlier than *older_than_this.
+ *
+ * If we're a pdlfush thread, then implement pdflush collision avoidance
+ * against the entire list.
*
- * A finite limit is set on the number of pages which will be written.
- * To prevent infinite livelock of sys_sync().
+ * If `bdi' is non-zero then we're being asked to writeback a specific queue.
+ * This function assumes that the blockdev superblock's inodes are backed by
+ * a variety of queues, so all inodes are searched. For other superblocks,
+ * assume that all inodes are backed by the same queue.
*
- * We add in the number of potentially dirty inodes, because each inode write
- * can dirty pagecache in the underlying blockdev.
+ * The inodes to be written are parked on bdi->b_io. They are moved back onto
+ * bdi->b_dirty as they are selected for writing. This way, none can be missed
+ * on the writer throttling path, and we get decent balancing between many
+ * throttled threads: we don't want them all piling up on inode_sync_wait.
*/
-void sync_inodes_sb(struct super_block *sb, int wait)
+static void wait_sb_inodes(struct super_block *sb)
{
- struct writeback_control wbc = {
- .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
- .range_start = 0,
- .range_end = LLONG_MAX,
- };
+ struct inode *inode, *old_inode = NULL;
+
+ /*
+ * We need to be protected against the filesystem going from
+ * r/o to r/w or vice versa.
+ */
+ WARN_ON(!rwsem_is_locked(&sb->s_umount));
+
+ spin_lock(&inode_lock);
+
+ /*
+ * Data integrity sync. Must wait for all pages under writeback,
+ * because there may have been pages dirtied before our sync
+ * call, but which had writeout started before we write it out.
+ * In which case, the inode may not be on the dirty list, but
+ * we still have to wait for that writeout.
+ */
+ list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+ struct address_space *mapping;
+
+ if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
+ continue;
+ mapping = inode->i_mapping;
+ if (mapping->nrpages == 0)
+ continue;
+ __iget(inode);
+ spin_unlock(&inode_lock);
+ /*
+ * We hold a reference to 'inode' so it couldn't have
+ * been removed from s_inodes list while we dropped the
+ * inode_lock. We cannot iput the inode now as we can
+ * be holding the last reference and we cannot iput it
+ * under inode_lock. So we keep the reference and iput
+ * it later.
+ */
+ iput(old_inode);
+ old_inode = inode;
+
+ filemap_fdatawait(mapping);
+
+ cond_resched();
- if (!wait) {
- unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
- unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
+ spin_lock(&inode_lock);
+ }
+ spin_unlock(&inode_lock);
+ iput(old_inode);
+}
- wbc.nr_to_write = nr_dirty + nr_unstable +
+/**
+ * writeback_inodes_sb - writeback dirty inodes from given super_block
+ * @sb: the superblock
+ *
+ * Start writeback on some inodes on this super_block. No guarantees are made
+ * on how many (if any) will be written, and this function does not wait
+ * for IO completion of submitted IO. The number of pages submitted is
+ * returned.
+ */
+void writeback_inodes_sb(struct super_block *sb)
+{
+ unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
+ unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
+ long nr_to_write;
+
+ nr_to_write = nr_dirty + nr_unstable +
(inodes_stat.nr_inodes - inodes_stat.nr_unused);
- } else
- wbc.nr_to_write = LONG_MAX; /* doesn't actually matter */
- sync_sb_inodes(sb, &wbc);
+ bdi_writeback_all(sb, nr_to_write);
+}
+EXPORT_SYMBOL(writeback_inodes_sb);
+
+/**
+ * sync_inodes_sb - sync sb inode pages
+ * @sb: the superblock
+ *
+ * This function writes and waits on any dirty inode belonging to this
+ * super_block. The number of pages synced is returned.
+ */
+void sync_inodes_sb(struct super_block *sb)
+{
+ bdi_sync_writeback(sb->s_bdi, sb);
+ wait_sb_inodes(sb);
}
+EXPORT_SYMBOL(sync_inodes_sb);
/**
* write_inode_now - write an inode to disk
@@ -737,57 +1213,3 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc)
return ret;
}
EXPORT_SYMBOL(sync_inode);
-
-/**
- * generic_osync_inode - flush all dirty data for a given inode to disk
- * @inode: inode to write
- * @mapping: the address_space that should be flushed
- * @what: what to write and wait upon
- *
- * This can be called by file_write functions for files which have the
- * O_SYNC flag set, to flush dirty writes to disk.
- *
- * @what is a bitmask, specifying which part of the inode's data should be
- * written and waited upon.
- *
- * OSYNC_DATA: i_mapping's dirty data
- * OSYNC_METADATA: the buffers at i_mapping->private_list
- * OSYNC_INODE: the inode itself
- */
-
-int generic_osync_inode(struct inode *inode, struct address_space *mapping, int what)
-{
- int err = 0;
- int need_write_inode_now = 0;
- int err2;
-
- if (what & OSYNC_DATA)
- err = filemap_fdatawrite(mapping);
- if (what & (OSYNC_METADATA|OSYNC_DATA)) {
- err2 = sync_mapping_buffers(mapping);
- if (!err)
- err = err2;
- }
- if (what & OSYNC_DATA) {
- err2 = filemap_fdatawait(mapping);
- if (!err)
- err = err2;
- }
-
- spin_lock(&inode_lock);
- if ((inode->i_state & I_DIRTY) &&
- ((what & OSYNC_INODE) || (inode->i_state & I_DIRTY_DATASYNC)))
- need_write_inode_now = 1;
- spin_unlock(&inode_lock);
-
- if (need_write_inode_now) {
- err2 = write_inode_now(inode, 1);
- if (!err)
- err = err2;
- }
- else
- inode_sync_wait(inode);
-
- return err;
-}
-EXPORT_SYMBOL(generic_osync_inode);
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 99c99dfb037..3773fd63d2f 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -61,6 +61,121 @@ static ssize_t fuse_conn_waiting_read(struct file *file, char __user *buf,
return simple_read_from_buffer(buf, len, ppos, tmp, size);
}
+static ssize_t fuse_conn_limit_read(struct file *file, char __user *buf,
+ size_t len, loff_t *ppos, unsigned val)
+{
+ char tmp[32];
+ size_t size = sprintf(tmp, "%u\n", val);
+
+ return simple_read_from_buffer(buf, len, ppos, tmp, size);
+}
+
+static ssize_t fuse_conn_limit_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos, unsigned *val,
+ unsigned global_limit)
+{
+ unsigned long t;
+ char tmp[32];
+ unsigned limit = (1 << 16) - 1;
+ int err;
+
+ if (*ppos || count >= sizeof(tmp) - 1)
+ return -EINVAL;
+
+ if (copy_from_user(tmp, buf, count))
+ return -EINVAL;
+
+ tmp[count] = '\0';
+
+ err = strict_strtoul(tmp, 0, &t);
+ if (err)
+ return err;
+
+ if (!capable(CAP_SYS_ADMIN))
+ limit = min(limit, global_limit);
+
+ if (t > limit)
+ return -EINVAL;
+
+ *val = t;
+
+ return count;
+}
+
+static ssize_t fuse_conn_max_background_read(struct file *file,
+ char __user *buf, size_t len,
+ loff_t *ppos)
+{
+ struct fuse_conn *fc;
+ unsigned val;
+
+ fc = fuse_ctl_file_conn_get(file);
+ if (!fc)
+ return 0;
+
+ val = fc->max_background;
+ fuse_conn_put(fc);
+
+ return fuse_conn_limit_read(file, buf, len, ppos, val);
+}
+
+static ssize_t fuse_conn_max_background_write(struct file *file,
+ const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ unsigned val;
+ ssize_t ret;
+
+ ret = fuse_conn_limit_write(file, buf, count, ppos, &val,
+ max_user_bgreq);
+ if (ret > 0) {
+ struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
+ if (fc) {
+ fc->max_background = val;
+ fuse_conn_put(fc);
+ }
+ }
+
+ return ret;
+}
+
+static ssize_t fuse_conn_congestion_threshold_read(struct file *file,
+ char __user *buf, size_t len,
+ loff_t *ppos)
+{
+ struct fuse_conn *fc;
+ unsigned val;
+
+ fc = fuse_ctl_file_conn_get(file);
+ if (!fc)
+ return 0;
+
+ val = fc->congestion_threshold;
+ fuse_conn_put(fc);
+
+ return fuse_conn_limit_read(file, buf, len, ppos, val);
+}
+
+static ssize_t fuse_conn_congestion_threshold_write(struct file *file,
+ const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ unsigned val;
+ ssize_t ret;
+
+ ret = fuse_conn_limit_write(file, buf, count, ppos, &val,
+ max_user_congthresh);
+ if (ret > 0) {
+ struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
+ if (fc) {
+ fc->congestion_threshold = val;
+ fuse_conn_put(fc);
+ }
+ }
+
+ return ret;
+}
+
static const struct file_operations fuse_ctl_abort_ops = {
.open = nonseekable_open,
.write = fuse_conn_abort_write,
@@ -71,6 +186,18 @@ static const struct file_operations fuse_ctl_waiting_ops = {
.read = fuse_conn_waiting_read,
};
+static const struct file_operations fuse_conn_max_background_ops = {
+ .open = nonseekable_open,
+ .read = fuse_conn_max_background_read,
+ .write = fuse_conn_max_background_write,
+};
+
+static const struct file_operations fuse_conn_congestion_threshold_ops = {
+ .open = nonseekable_open,
+ .read = fuse_conn_congestion_threshold_read,
+ .write = fuse_conn_congestion_threshold_write,
+};
+
static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
struct fuse_conn *fc,
const char *name,
@@ -127,9 +254,14 @@ int fuse_ctl_add_conn(struct fuse_conn *fc)
goto err;
if (!fuse_ctl_add_dentry(parent, fc, "waiting", S_IFREG | 0400, 1,
- NULL, &fuse_ctl_waiting_ops) ||
+ NULL, &fuse_ctl_waiting_ops) ||
!fuse_ctl_add_dentry(parent, fc, "abort", S_IFREG | 0200, 1,
- NULL, &fuse_ctl_abort_ops))
+ NULL, &fuse_ctl_abort_ops) ||
+ !fuse_ctl_add_dentry(parent, fc, "max_background", S_IFREG | 0600,
+ 1, NULL, &fuse_conn_max_background_ops) ||
+ !fuse_ctl_add_dentry(parent, fc, "congestion_threshold",
+ S_IFREG | 0600, 1, NULL,
+ &fuse_conn_congestion_threshold_ops))
goto err;
return 0;
@@ -156,7 +288,7 @@ void fuse_ctl_remove_conn(struct fuse_conn *fc)
d_drop(dentry);
dput(dentry);
}
- fuse_control_sb->s_root->d_inode->i_nlink--;
+ drop_nlink(fuse_control_sb->s_root->d_inode);
}
static int fuse_ctl_fill_super(struct super_block *sb, void *data, int silent)
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 6484eb75acd..51d9e33d634 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -250,7 +250,7 @@ static void queue_request(struct fuse_conn *fc, struct fuse_req *req)
static void flush_bg_queue(struct fuse_conn *fc)
{
- while (fc->active_background < FUSE_MAX_BACKGROUND &&
+ while (fc->active_background < fc->max_background &&
!list_empty(&fc->bg_queue)) {
struct fuse_req *req;
@@ -280,11 +280,11 @@ __releases(&fc->lock)
list_del(&req->intr_entry);
req->state = FUSE_REQ_FINISHED;
if (req->background) {
- if (fc->num_background == FUSE_MAX_BACKGROUND) {
+ if (fc->num_background == fc->max_background) {
fc->blocked = 0;
wake_up_all(&fc->blocked_waitq);
}
- if (fc->num_background == FUSE_CONGESTION_THRESHOLD &&
+ if (fc->num_background == fc->congestion_threshold &&
fc->connected && fc->bdi_initialized) {
clear_bdi_congested(&fc->bdi, BLK_RW_SYNC);
clear_bdi_congested(&fc->bdi, BLK_RW_ASYNC);
@@ -410,9 +410,9 @@ static void fuse_request_send_nowait_locked(struct fuse_conn *fc,
{
req->background = 1;
fc->num_background++;
- if (fc->num_background == FUSE_MAX_BACKGROUND)
+ if (fc->num_background == fc->max_background)
fc->blocked = 1;
- if (fc->num_background == FUSE_CONGESTION_THRESHOLD &&
+ if (fc->num_background == fc->congestion_threshold &&
fc->bdi_initialized) {
set_bdi_congested(&fc->bdi, BLK_RW_SYNC);
set_bdi_congested(&fc->bdi, BLK_RW_ASYNC);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 52b641fc0fa..fc9c79feb5f 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -25,12 +25,6 @@
/** Max number of pages that can be used in a single read request */
#define FUSE_MAX_PAGES_PER_REQ 32
-/** Maximum number of outstanding background requests */
-#define FUSE_MAX_BACKGROUND 12
-
-/** Congestion starts at 75% of maximum */
-#define FUSE_CONGESTION_THRESHOLD (FUSE_MAX_BACKGROUND * 75 / 100)
-
/** Bias for fi->writectr, meaning new writepages must not be sent */
#define FUSE_NOWRITE INT_MIN
@@ -38,7 +32,7 @@
#define FUSE_NAME_MAX 1024
/** Number of dentries for each connection in the control filesystem */
-#define FUSE_CTL_NUM_DENTRIES 3
+#define FUSE_CTL_NUM_DENTRIES 5
/** If the FUSE_DEFAULT_PERMISSIONS flag is given, the filesystem
module will check permissions based on the file mode. Otherwise no
@@ -55,6 +49,10 @@ extern struct list_head fuse_conn_list;
/** Global mutex protecting fuse_conn_list and the control filesystem */
extern struct mutex fuse_mutex;
+/** Module parameters */
+extern unsigned max_user_bgreq;
+extern unsigned max_user_congthresh;
+
/** FUSE inode */
struct fuse_inode {
/** Inode data */
@@ -349,6 +347,12 @@ struct fuse_conn {
/** rbtree of fuse_files waiting for poll events indexed by ph */
struct rb_root polled_files;
+ /** Maximum number of outstanding background requests */
+ unsigned max_background;
+
+ /** Number of background requests at which congestion starts */
+ unsigned congestion_threshold;
+
/** Number of requests currently in the background */
unsigned num_background;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index f91ccc4a189..6da947daabd 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -14,6 +14,7 @@
#include <linux/seq_file.h>
#include <linux/init.h>
#include <linux/module.h>
+#include <linux/moduleparam.h>
#include <linux/parser.h>
#include <linux/statfs.h>
#include <linux/random.h>
@@ -28,10 +29,34 @@ static struct kmem_cache *fuse_inode_cachep;
struct list_head fuse_conn_list;
DEFINE_MUTEX(fuse_mutex);
+static int set_global_limit(const char *val, struct kernel_param *kp);
+
+unsigned max_user_bgreq;
+module_param_call(max_user_bgreq, set_global_limit, param_get_uint,
+ &max_user_bgreq, 0644);
+__MODULE_PARM_TYPE(max_user_bgreq, "uint");
+MODULE_PARM_DESC(max_user_bgreq,
+ "Global limit for the maximum number of backgrounded requests an "
+ "unprivileged user can set");
+
+unsigned max_user_congthresh;
+module_param_call(max_user_congthresh, set_global_limit, param_get_uint,
+ &max_user_congthresh, 0644);
+__MODULE_PARM_TYPE(max_user_congthresh, "uint");
+MODULE_PARM_DESC(max_user_congthresh,
+ "Global limit for the maximum congestion threshold an "
+ "unprivileged user can set");
+
#define FUSE_SUPER_MAGIC 0x65735546
#define FUSE_DEFAULT_BLKSIZE 512
+/** Maximum number of outstanding background requests */
+#define FUSE_DEFAULT_MAX_BACKGROUND 12
+
+/** Congestion starts at 75% of maximum */
+#define FUSE_DEFAULT_CONGESTION_THRESHOLD (FUSE_DEFAULT_MAX_BACKGROUND * 3 / 4)
+
struct fuse_mount_data {
int fd;
unsigned rootmode;
@@ -517,6 +542,8 @@ void fuse_conn_init(struct fuse_conn *fc)
INIT_LIST_HEAD(&fc->bg_queue);
INIT_LIST_HEAD(&fc->entry);
atomic_set(&fc->num_waiting, 0);
+ fc->max_background = FUSE_DEFAULT_MAX_BACKGROUND;
+ fc->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD;
fc->khctr = 0;
fc->polled_files = RB_ROOT;
fc->reqctr = 0;
@@ -727,6 +754,54 @@ static const struct super_operations fuse_super_operations = {
.show_options = fuse_show_options,
};
+static void sanitize_global_limit(unsigned *limit)
+{
+ if (*limit == 0)
+ *limit = ((num_physpages << PAGE_SHIFT) >> 13) /
+ sizeof(struct fuse_req);
+
+ if (*limit >= 1 << 16)
+ *limit = (1 << 16) - 1;
+}
+
+static int set_global_limit(const char *val, struct kernel_param *kp)
+{
+ int rv;
+
+ rv = param_set_uint(val, kp);
+ if (rv)
+ return rv;
+
+ sanitize_global_limit((unsigned *)kp->arg);
+
+ return 0;
+}
+
+static void process_init_limits(struct fuse_conn *fc, struct fuse_init_out *arg)
+{
+ int cap_sys_admin = capable(CAP_SYS_ADMIN);
+
+ if (arg->minor < 13)
+ return;
+
+ sanitize_global_limit(&max_user_bgreq);
+ sanitize_global_limit(&max_user_congthresh);
+
+ if (arg->max_background) {
+ fc->max_background = arg->max_background;
+
+ if (!cap_sys_admin && fc->max_background > max_user_bgreq)
+ fc->max_background = max_user_bgreq;
+ }
+ if (arg->congestion_threshold) {
+ fc->congestion_threshold = arg->congestion_threshold;
+
+ if (!cap_sys_admin &&
+ fc->congestion_threshold > max_user_congthresh)
+ fc->congestion_threshold = max_user_congthresh;
+ }
+}
+
static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
{
struct fuse_init_out *arg = &req->misc.init_out;
@@ -736,6 +811,8 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
else {
unsigned long ra_pages;
+ process_init_limits(fc, arg);
+
if (arg->minor >= 6) {
ra_pages = arg->max_readahead / PAGE_CACHE_SIZE;
if (arg->flags & FUSE_ASYNC_READ)
@@ -801,6 +878,7 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
{
int err;
+ fc->bdi.name = "fuse";
fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
fc->bdi.unplug_io_fn = default_unplug_io_fn;
/* fuse does it's own writeback accounting */
@@ -893,6 +971,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
if (err)
goto err_put_conn;
+ sb->s_bdi = &fc->bdi;
+
/* Handle umasking inside the fuse code */
if (sb->s_flags & MS_POSIXACL)
fc->dont_mask = 1;
@@ -1147,6 +1227,9 @@ static int __init fuse_init(void)
if (res)
goto err_sysfs_cleanup;
+ sanitize_global_limit(&max_user_bgreq);
+ sanitize_global_limit(&max_user_congthresh);
+
return 0;
err_sysfs_cleanup:
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
index 3da2f1f4f73..21f7e46da4c 100644
--- a/fs/gfs2/Makefile
+++ b/fs/gfs2/Makefile
@@ -1,6 +1,6 @@
EXTRA_CFLAGS := -I$(src)
obj-$(CONFIG_GFS2_FS) += gfs2.o
-gfs2-y := acl.o bmap.o dir.o eaops.o eattr.o glock.o \
+gfs2-y := acl.o bmap.o dir.o xattr.o glock.o \
glops.o inode.o log.o lops.o main.o meta_io.o \
aops.o dentry.o export.o file.o \
ops_fstype.o ops_inode.o quota.o \
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index fa881bdc3d8..3fc4e3ac7d8 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -19,8 +19,7 @@
#include "gfs2.h"
#include "incore.h"
#include "acl.h"
-#include "eaops.h"
-#include "eattr.h"
+#include "xattr.h"
#include "glock.h"
#include "inode.h"
#include "meta_io.h"
@@ -31,8 +30,7 @@
#define ACL_DEFAULT 0
int gfs2_acl_validate_set(struct gfs2_inode *ip, int access,
- struct gfs2_ea_request *er,
- int *remove, mode_t *mode)
+ struct gfs2_ea_request *er, int *remove, mode_t *mode)
{
struct posix_acl *acl;
int error;
@@ -83,30 +81,20 @@ int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access)
return 0;
}
-static int acl_get(struct gfs2_inode *ip, int access, struct posix_acl **acl,
- struct gfs2_ea_location *el, char **data, unsigned int *len)
+static int acl_get(struct gfs2_inode *ip, const char *name,
+ struct posix_acl **acl, struct gfs2_ea_location *el,
+ char **datap, unsigned int *lenp)
{
- struct gfs2_ea_request er;
- struct gfs2_ea_location el_this;
+ char *data;
+ unsigned int len;
int error;
+ el->el_bh = NULL;
+
if (!ip->i_eattr)
return 0;
- memset(&er, 0, sizeof(struct gfs2_ea_request));
- if (access) {
- er.er_name = GFS2_POSIX_ACL_ACCESS;
- er.er_name_len = GFS2_POSIX_ACL_ACCESS_LEN;
- } else {
- er.er_name = GFS2_POSIX_ACL_DEFAULT;
- er.er_name_len = GFS2_POSIX_ACL_DEFAULT_LEN;
- }
- er.er_type = GFS2_EATYPE_SYS;
-
- if (!el)
- el = &el_this;
-
- error = gfs2_ea_find(ip, &er, el);
+ error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, name, el);
if (error)
return error;
if (!el->el_ea)
@@ -114,32 +102,31 @@ static int acl_get(struct gfs2_inode *ip, int access, struct posix_acl **acl,
if (!GFS2_EA_DATA_LEN(el->el_ea))
goto out;
- er.er_data_len = GFS2_EA_DATA_LEN(el->el_ea);
- er.er_data = kmalloc(er.er_data_len, GFP_NOFS);
+ len = GFS2_EA_DATA_LEN(el->el_ea);
+ data = kmalloc(len, GFP_NOFS);
error = -ENOMEM;
- if (!er.er_data)
+ if (!data)
goto out;
- error = gfs2_ea_get_copy(ip, el, er.er_data);
- if (error)
+ error = gfs2_ea_get_copy(ip, el, data, len);
+ if (error < 0)
goto out_kfree;
+ error = 0;
if (acl) {
- *acl = posix_acl_from_xattr(er.er_data, er.er_data_len);
+ *acl = posix_acl_from_xattr(data, len);
if (IS_ERR(*acl))
error = PTR_ERR(*acl);
}
out_kfree:
- if (error || !data)
- kfree(er.er_data);
- else {
- *data = er.er_data;
- *len = er.er_data_len;
+ if (error || !datap) {
+ kfree(data);
+ } else {
+ *datap = data;
+ *lenp = len;
}
out:
- if (error || el == &el_this)
- brelse(el->el_bh);
return error;
}
@@ -153,10 +140,12 @@ out:
int gfs2_check_acl(struct inode *inode, int mask)
{
+ struct gfs2_ea_location el;
struct posix_acl *acl = NULL;
int error;
- error = acl_get(GFS2_I(inode), ACL_ACCESS, &acl, NULL, NULL, NULL);
+ error = acl_get(GFS2_I(inode), GFS2_POSIX_ACL_ACCESS, &acl, &el, NULL, NULL);
+ brelse(el.el_bh);
if (error)
return error;
@@ -196,10 +185,12 @@ static int munge_mode(struct gfs2_inode *ip, mode_t mode)
int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
{
+ struct gfs2_ea_location el;
struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
struct posix_acl *acl = NULL, *clone;
- struct gfs2_ea_request er;
mode_t mode = ip->i_inode.i_mode;
+ char *data = NULL;
+ unsigned int len;
int error;
if (!sdp->sd_args.ar_posix_acl)
@@ -207,11 +198,8 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
if (S_ISLNK(ip->i_inode.i_mode))
return 0;
- memset(&er, 0, sizeof(struct gfs2_ea_request));
- er.er_type = GFS2_EATYPE_SYS;
-
- error = acl_get(dip, ACL_DEFAULT, &acl, NULL,
- &er.er_data, &er.er_data_len);
+ error = acl_get(dip, GFS2_POSIX_ACL_DEFAULT, &acl, &el, &data, &len);
+ brelse(el.el_bh);
if (error)
return error;
if (!acl) {
@@ -229,9 +217,8 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
acl = clone;
if (S_ISDIR(ip->i_inode.i_mode)) {
- er.er_name = GFS2_POSIX_ACL_DEFAULT;
- er.er_name_len = GFS2_POSIX_ACL_DEFAULT_LEN;
- error = gfs2_system_eaops.eo_set(ip, &er);
+ error = gfs2_xattr_set(&ip->i_inode, GFS2_EATYPE_SYS,
+ GFS2_POSIX_ACL_DEFAULT, data, len, 0);
if (error)
goto out;
}
@@ -239,21 +226,19 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
error = posix_acl_create_masq(acl, &mode);
if (error < 0)
goto out;
- if (error > 0) {
- er.er_name = GFS2_POSIX_ACL_ACCESS;
- er.er_name_len = GFS2_POSIX_ACL_ACCESS_LEN;
- posix_acl_to_xattr(acl, er.er_data, er.er_data_len);
- er.er_mode = mode;
- er.er_flags = GFS2_ERF_MODE;
- error = gfs2_system_eaops.eo_set(ip, &er);
- if (error)
- goto out;
- } else
- munge_mode(ip, mode);
+ if (error == 0)
+ goto munge;
+ posix_acl_to_xattr(acl, data, len);
+ error = gfs2_xattr_set(&ip->i_inode, GFS2_EATYPE_SYS,
+ GFS2_POSIX_ACL_ACCESS, data, len, 0);
+ if (error)
+ goto out;
+munge:
+ error = munge_mode(ip, mode);
out:
posix_acl_release(acl);
- kfree(er.er_data);
+ kfree(data);
return error;
}
@@ -265,9 +250,9 @@ int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr)
unsigned int len;
int error;
- error = acl_get(ip, ACL_ACCESS, &acl, &el, &data, &len);
+ error = acl_get(ip, GFS2_POSIX_ACL_ACCESS, &acl, &el, &data, &len);
if (error)
- return error;
+ goto out_brelse;
if (!acl)
return gfs2_setattr_simple(ip, attr);
@@ -286,8 +271,9 @@ int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr)
out:
posix_acl_release(acl);
- brelse(el.el_bh);
kfree(data);
+out_brelse:
+ brelse(el.el_bh);
return error;
}
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 03ebb439ace..7ebae9a4ecc 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -624,6 +624,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
{
struct gfs2_inode *ip = GFS2_I(mapping->host);
struct gfs2_sbd *sdp = GFS2_SB(mapping->host);
+ struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
int alloc_required;
int error = 0;
@@ -637,6 +638,14 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
error = gfs2_glock_nq(&ip->i_gh);
if (unlikely(error))
goto out_uninit;
+ if (&ip->i_inode == sdp->sd_rindex) {
+ error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE,
+ GL_NOCACHE, &m_ip->i_gh);
+ if (unlikely(error)) {
+ gfs2_glock_dq(&ip->i_gh);
+ goto out_uninit;
+ }
+ }
error = gfs2_write_alloc_required(ip, pos, len, &alloc_required);
if (error)
@@ -667,6 +676,8 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
rblocks += data_blocks ? data_blocks : 1;
if (ind_blocks || data_blocks)
rblocks += RES_STATFS + RES_QUOTA;
+ if (&ip->i_inode == sdp->sd_rindex)
+ rblocks += 2 * RES_STATFS;
error = gfs2_trans_begin(sdp, rblocks,
PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
@@ -712,6 +723,10 @@ out_alloc_put:
gfs2_alloc_put(ip);
}
out_unlock:
+ if (&ip->i_inode == sdp->sd_rindex) {
+ gfs2_glock_dq(&m_ip->i_gh);
+ gfs2_holder_uninit(&m_ip->i_gh);
+ }
gfs2_glock_dq(&ip->i_gh);
out_uninit:
gfs2_holder_uninit(&ip->i_gh);
@@ -725,14 +740,21 @@ out_uninit:
static void adjust_fs_space(struct inode *inode)
{
struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
+ struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
+ struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
+ struct buffer_head *m_bh, *l_bh;
u64 fs_total, new_free;
/* Total up the file system space, according to the latest rindex. */
fs_total = gfs2_ri_total(sdp);
+ if (gfs2_meta_inode_buffer(m_ip, &m_bh) != 0)
+ return;
spin_lock(&sdp->sd_statfs_spin);
+ gfs2_statfs_change_in(m_sc, m_bh->b_data +
+ sizeof(struct gfs2_dinode));
if (fs_total > (m_sc->sc_total + l_sc->sc_total))
new_free = fs_total - (m_sc->sc_total + l_sc->sc_total);
else
@@ -741,6 +763,13 @@ static void adjust_fs_space(struct inode *inode)
fs_warn(sdp, "File system extended by %llu blocks.\n",
(unsigned long long)new_free);
gfs2_statfs_change(sdp, new_free, new_free, 0);
+
+ if (gfs2_meta_inode_buffer(l_ip, &l_bh) != 0)
+ goto out;
+ update_statfs(sdp, m_bh, l_bh);
+ brelse(l_bh);
+out:
+ brelse(m_bh);
}
/**
@@ -763,6 +792,7 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
{
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
+ struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
u64 to = pos + copied;
void *kaddr;
unsigned char *buf = dibh->b_data + sizeof(struct gfs2_dinode);
@@ -794,6 +824,10 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
brelse(dibh);
gfs2_trans_end(sdp);
+ if (inode == sdp->sd_rindex) {
+ gfs2_glock_dq(&m_ip->i_gh);
+ gfs2_holder_uninit(&m_ip->i_gh);
+ }
gfs2_glock_dq(&ip->i_gh);
gfs2_holder_uninit(&ip->i_gh);
return copied;
@@ -823,6 +857,7 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
struct inode *inode = page->mapping->host;
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
+ struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
struct buffer_head *dibh;
struct gfs2_alloc *al = ip->i_alloc;
unsigned int from = pos & (PAGE_CACHE_SIZE - 1);
@@ -865,6 +900,10 @@ failed:
gfs2_quota_unlock(ip);
gfs2_alloc_put(ip);
}
+ if (inode == sdp->sd_rindex) {
+ gfs2_glock_dq(&m_ip->i_gh);
+ gfs2_holder_uninit(&m_ip->i_gh);
+ }
gfs2_glock_dq(&ip->i_gh);
gfs2_holder_uninit(&ip->i_gh);
return ret;
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index 022c66cd560..91beddadd38 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -107,8 +107,26 @@ static int gfs2_dhash(struct dentry *dentry, struct qstr *str)
return 0;
}
+static int gfs2_dentry_delete(struct dentry *dentry)
+{
+ struct gfs2_inode *ginode;
+
+ if (!dentry->d_inode)
+ return 0;
+
+ ginode = GFS2_I(dentry->d_inode);
+ if (!ginode->i_iopen_gh.gh_gl)
+ return 0;
+
+ if (test_bit(GLF_DEMOTE, &ginode->i_iopen_gh.gh_gl->gl_flags))
+ return 1;
+
+ return 0;
+}
+
const struct dentry_operations gfs2_dops = {
.d_revalidate = gfs2_drevalidate,
.d_hash = gfs2_dhash,
+ .d_delete = gfs2_dentry_delete,
};
diff --git a/fs/gfs2/eaops.c b/fs/gfs2/eaops.c
deleted file mode 100644
index dee9b03e5b3..00000000000
--- a/fs/gfs2/eaops.c
+++ /dev/null
@@ -1,157 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/completion.h>
-#include <linux/buffer_head.h>
-#include <linux/capability.h>
-#include <linux/xattr.h>
-#include <linux/gfs2_ondisk.h>
-#include <asm/uaccess.h>
-
-#include "gfs2.h"
-#include "incore.h"
-#include "acl.h"
-#include "eaops.h"
-#include "eattr.h"
-#include "util.h"
-
-/**
- * gfs2_ea_name2type - get the type of the ea, and truncate type from the name
- * @namep: ea name, possibly with type appended
- *
- * Returns: GFS2_EATYPE_XXX
- */
-
-unsigned int gfs2_ea_name2type(const char *name, const char **truncated_name)
-{
- unsigned int type;
-
- if (strncmp(name, "system.", 7) == 0) {
- type = GFS2_EATYPE_SYS;
- if (truncated_name)
- *truncated_name = name + sizeof("system.") - 1;
- } else if (strncmp(name, "user.", 5) == 0) {
- type = GFS2_EATYPE_USR;
- if (truncated_name)
- *truncated_name = name + sizeof("user.") - 1;
- } else if (strncmp(name, "security.", 9) == 0) {
- type = GFS2_EATYPE_SECURITY;
- if (truncated_name)
- *truncated_name = name + sizeof("security.") - 1;
- } else {
- type = GFS2_EATYPE_UNUSED;
- if (truncated_name)
- *truncated_name = NULL;
- }
-
- return type;
-}
-
-static int system_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
-{
- if (!GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len) &&
- !GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len) &&
- !capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (GFS2_SB(&ip->i_inode)->sd_args.ar_posix_acl == 0 &&
- (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len) ||
- GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)))
- return -EOPNOTSUPP;
-
- return gfs2_ea_get_i(ip, er);
-}
-
-static int system_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
-{
- int remove = 0;
- int error;
-
- if (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len)) {
- if (!(er->er_flags & GFS2_ERF_MODE)) {
- er->er_mode = ip->i_inode.i_mode;
- er->er_flags |= GFS2_ERF_MODE;
- }
- error = gfs2_acl_validate_set(ip, 1, er,
- &remove, &er->er_mode);
- if (error)
- return error;
- error = gfs2_ea_set_i(ip, er);
- if (error)
- return error;
- if (remove)
- gfs2_ea_remove_i(ip, er);
- return 0;
-
- } else if (GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)) {
- error = gfs2_acl_validate_set(ip, 0, er,
- &remove, NULL);
- if (error)
- return error;
- if (!remove)
- error = gfs2_ea_set_i(ip, er);
- else {
- error = gfs2_ea_remove_i(ip, er);
- if (error == -ENODATA)
- error = 0;
- }
- return error;
- }
-
- return -EPERM;
-}
-
-static int system_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
-{
- if (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len)) {
- int error = gfs2_acl_validate_remove(ip, 1);
- if (error)
- return error;
-
- } else if (GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)) {
- int error = gfs2_acl_validate_remove(ip, 0);
- if (error)
- return error;
-
- } else
- return -EPERM;
-
- return gfs2_ea_remove_i(ip, er);
-}
-
-static const struct gfs2_eattr_operations gfs2_user_eaops = {
- .eo_get = gfs2_ea_get_i,
- .eo_set = gfs2_ea_set_i,
- .eo_remove = gfs2_ea_remove_i,
- .eo_name = "user",
-};
-
-const struct gfs2_eattr_operations gfs2_system_eaops = {
- .eo_get = system_eo_get,
- .eo_set = system_eo_set,
- .eo_remove = system_eo_remove,
- .eo_name = "system",
-};
-
-static const struct gfs2_eattr_operations gfs2_security_eaops = {
- .eo_get = gfs2_ea_get_i,
- .eo_set = gfs2_ea_set_i,
- .eo_remove = gfs2_ea_remove_i,
- .eo_name = "security",
-};
-
-const struct gfs2_eattr_operations *gfs2_ea_ops[] = {
- NULL,
- &gfs2_user_eaops,
- &gfs2_system_eaops,
- &gfs2_security_eaops,
-};
-
diff --git a/fs/gfs2/eaops.h b/fs/gfs2/eaops.h
deleted file mode 100644
index da2f7fbbb40..00000000000
--- a/fs/gfs2/eaops.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#ifndef __EAOPS_DOT_H__
-#define __EAOPS_DOT_H__
-
-struct gfs2_ea_request;
-struct gfs2_inode;
-
-struct gfs2_eattr_operations {
- int (*eo_get) (struct gfs2_inode *ip, struct gfs2_ea_request *er);
- int (*eo_set) (struct gfs2_inode *ip, struct gfs2_ea_request *er);
- int (*eo_remove) (struct gfs2_inode *ip, struct gfs2_ea_request *er);
- char *eo_name;
-};
-
-unsigned int gfs2_ea_name2type(const char *name, const char **truncated_name);
-
-extern const struct gfs2_eattr_operations gfs2_system_eaops;
-
-extern const struct gfs2_eattr_operations *gfs2_ea_ops[];
-
-#endif /* __EAOPS_DOT_H__ */
-
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index 9200ef22171..d15876e9aa2 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -143,17 +143,14 @@ static struct dentry *gfs2_get_parent(struct dentry *child)
}
static struct dentry *gfs2_get_dentry(struct super_block *sb,
- struct gfs2_inum_host *inum)
+ struct gfs2_inum_host *inum)
{
struct gfs2_sbd *sdp = sb->s_fs_info;
- struct gfs2_holder i_gh, ri_gh, rgd_gh;
- struct gfs2_rgrpd *rgd;
+ struct gfs2_holder i_gh;
struct inode *inode;
struct dentry *dentry;
int error;
- /* System files? */
-
inode = gfs2_ilookup(sb, inum->no_addr);
if (inode) {
if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) {
@@ -168,29 +165,11 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
if (error)
return ERR_PTR(error);
- error = gfs2_rindex_hold(sdp, &ri_gh);
+ error = gfs2_check_blk_type(sdp, inum->no_addr, GFS2_BLKST_DINODE);
if (error)
goto fail;
- error = -EINVAL;
- rgd = gfs2_blk2rgrpd(sdp, inum->no_addr);
- if (!rgd)
- goto fail_rindex;
-
- error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh);
- if (error)
- goto fail_rindex;
-
- error = -ESTALE;
- if (gfs2_get_block_type(rgd, inum->no_addr) != GFS2_BLKST_DINODE)
- goto fail_rgd;
-
- gfs2_glock_dq_uninit(&rgd_gh);
- gfs2_glock_dq_uninit(&ri_gh);
-
- inode = gfs2_inode_lookup(sb, DT_UNKNOWN,
- inum->no_addr,
- 0, 0);
+ inode = gfs2_inode_lookup(sb, DT_UNKNOWN, inum->no_addr, 0, 0);
if (IS_ERR(inode)) {
error = PTR_ERR(inode);
goto fail;
@@ -224,13 +203,6 @@ out_inode:
if (!IS_ERR(dentry))
dentry->d_op = &gfs2_dops;
return dentry;
-
-fail_rgd:
- gfs2_glock_dq_uninit(&rgd_gh);
-
-fail_rindex:
- gfs2_glock_dq_uninit(&ri_gh);
-
fail:
gfs2_glock_dq_uninit(&i_gh);
return ERR_PTR(error);
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 73318a3ce6f..166f38fbd24 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -38,7 +38,6 @@
#include "rgrp.h"
#include "trans.h"
#include "util.h"
-#include "eaops.h"
/**
* gfs2_llseek - seek to a location in a file
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 297421c0427..8b674b1f3a5 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -63,6 +63,7 @@ static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int
static DECLARE_RWSEM(gfs2_umount_flush_sem);
static struct dentry *gfs2_root;
static struct workqueue_struct *glock_workqueue;
+struct workqueue_struct *gfs2_delete_workqueue;
static LIST_HEAD(lru_list);
static atomic_t lru_count = ATOMIC_INIT(0);
static DEFINE_SPINLOCK(lru_lock);
@@ -167,13 +168,33 @@ static void glock_free(struct gfs2_glock *gl)
*
*/
-static void gfs2_glock_hold(struct gfs2_glock *gl)
+void gfs2_glock_hold(struct gfs2_glock *gl)
{
GLOCK_BUG_ON(gl, atomic_read(&gl->gl_ref) == 0);
atomic_inc(&gl->gl_ref);
}
/**
+ * demote_ok - Check to see if it's ok to unlock a glock
+ * @gl: the glock
+ *
+ * Returns: 1 if it's ok
+ */
+
+static int demote_ok(const struct gfs2_glock *gl)
+{
+ const struct gfs2_glock_operations *glops = gl->gl_ops;
+
+ if (gl->gl_state == LM_ST_UNLOCKED)
+ return 0;
+ if (!list_empty(&gl->gl_holders))
+ return 0;
+ if (glops->go_demote_ok)
+ return glops->go_demote_ok(gl);
+ return 1;
+}
+
+/**
* gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list
* @gl: the glock
*
@@ -181,8 +202,13 @@ static void gfs2_glock_hold(struct gfs2_glock *gl)
static void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
{
+ int may_reclaim;
+ may_reclaim = (demote_ok(gl) &&
+ (atomic_read(&gl->gl_ref) == 1 ||
+ (gl->gl_name.ln_type == LM_TYPE_INODE &&
+ atomic_read(&gl->gl_ref) <= 2)));
spin_lock(&lru_lock);
- if (list_empty(&gl->gl_lru) && gl->gl_state != LM_ST_UNLOCKED) {
+ if (list_empty(&gl->gl_lru) && may_reclaim) {
list_add_tail(&gl->gl_lru, &lru_list);
atomic_inc(&lru_count);
}
@@ -190,6 +216,21 @@ static void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
}
/**
+ * gfs2_glock_put_nolock() - Decrement reference count on glock
+ * @gl: The glock to put
+ *
+ * This function should only be used if the caller has its own reference
+ * to the glock, in addition to the one it is dropping.
+ */
+
+void gfs2_glock_put_nolock(struct gfs2_glock *gl)
+{
+ if (atomic_dec_and_test(&gl->gl_ref))
+ GLOCK_BUG_ON(gl, 1);
+ gfs2_glock_schedule_for_reclaim(gl);
+}
+
+/**
* gfs2_glock_put() - Decrement reference count on glock
* @gl: The glock to put
*
@@ -214,9 +255,9 @@ int gfs2_glock_put(struct gfs2_glock *gl)
rv = 1;
goto out;
}
- /* 1 for being hashed, 1 for having state != LM_ST_UNLOCKED */
- if (atomic_read(&gl->gl_ref) == 2)
- gfs2_glock_schedule_for_reclaim(gl);
+ spin_lock(&gl->gl_spin);
+ gfs2_glock_schedule_for_reclaim(gl);
+ spin_unlock(&gl->gl_spin);
write_unlock(gl_lock_addr(gl->gl_hash));
out:
return rv;
@@ -398,7 +439,7 @@ static void state_change(struct gfs2_glock *gl, unsigned int new_state)
if (held2)
gfs2_glock_hold(gl);
else
- gfs2_glock_put(gl);
+ gfs2_glock_put_nolock(gl);
}
gl->gl_state = new_state;
@@ -633,12 +674,35 @@ out:
out_sched:
gfs2_glock_hold(gl);
if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
- gfs2_glock_put(gl);
+ gfs2_glock_put_nolock(gl);
out_unlock:
clear_bit(GLF_LOCK, &gl->gl_flags);
goto out;
}
+static void delete_work_func(struct work_struct *work)
+{
+ struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_delete);
+ struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_inode *ip = NULL;
+ struct inode *inode;
+ u64 no_addr = 0;
+
+ spin_lock(&gl->gl_spin);
+ ip = (struct gfs2_inode *)gl->gl_object;
+ if (ip)
+ no_addr = ip->i_no_addr;
+ spin_unlock(&gl->gl_spin);
+ if (ip) {
+ inode = gfs2_ilookup(sdp->sd_vfs, no_addr);
+ if (inode) {
+ d_prune_aliases(inode);
+ iput(inode);
+ }
+ }
+ gfs2_glock_put(gl);
+}
+
static void glock_work_func(struct work_struct *work)
{
unsigned long delay = 0;
@@ -717,6 +781,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
gl->gl_sbd = sdp;
gl->gl_aspace = NULL;
INIT_DELAYED_WORK(&gl->gl_work, glock_work_func);
+ INIT_WORK(&gl->gl_delete, delete_work_func);
/* If this glock protects actual on-disk data or metadata blocks,
create a VFS inode to manage the pages/buffers holding them. */
@@ -858,6 +923,8 @@ static void handle_callback(struct gfs2_glock *gl, unsigned int state,
gl->gl_demote_state != state) {
gl->gl_demote_state = LM_ST_UNLOCKED;
}
+ if (gl->gl_ops->go_callback)
+ gl->gl_ops->go_callback(gl);
trace_gfs2_demote_rq(gl);
}
@@ -1274,33 +1341,12 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
gfs2_glock_put(gl);
}
-/**
- * demote_ok - Check to see if it's ok to unlock a glock
- * @gl: the glock
- *
- * Returns: 1 if it's ok
- */
-
-static int demote_ok(const struct gfs2_glock *gl)
-{
- const struct gfs2_glock_operations *glops = gl->gl_ops;
-
- if (gl->gl_state == LM_ST_UNLOCKED)
- return 0;
- if (!list_empty(&gl->gl_holders))
- return 0;
- if (glops->go_demote_ok)
- return glops->go_demote_ok(gl);
- return 1;
-}
-
static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask)
{
struct gfs2_glock *gl;
int may_demote;
int nr_skipped = 0;
- int got_ref = 0;
LIST_HEAD(skipped);
if (nr == 0)
@@ -1315,37 +1361,29 @@ static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask)
list_del_init(&gl->gl_lru);
atomic_dec(&lru_count);
+ /* Check if glock is about to be freed */
+ if (atomic_read(&gl->gl_ref) == 0)
+ continue;
+
/* Test for being demotable */
if (!test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
gfs2_glock_hold(gl);
- got_ref = 1;
spin_unlock(&lru_lock);
spin_lock(&gl->gl_spin);
may_demote = demote_ok(gl);
- spin_unlock(&gl->gl_spin);
- clear_bit(GLF_LOCK, &gl->gl_flags);
if (may_demote) {
handle_callback(gl, LM_ST_UNLOCKED, 0);
nr--;
- if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
- gfs2_glock_put(gl);
- got_ref = 0;
}
+ if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
+ gfs2_glock_put_nolock(gl);
+ spin_unlock(&gl->gl_spin);
+ clear_bit(GLF_LOCK, &gl->gl_flags);
spin_lock(&lru_lock);
- if (may_demote)
- continue;
- }
- if (list_empty(&gl->gl_lru) &&
- (atomic_read(&gl->gl_ref) <= (2 + got_ref))) {
- nr_skipped++;
- list_add(&gl->gl_lru, &skipped);
- }
- if (got_ref) {
- spin_unlock(&lru_lock);
- gfs2_glock_put(gl);
- spin_lock(&lru_lock);
- got_ref = 0;
+ continue;
}
+ nr_skipped++;
+ list_add(&gl->gl_lru, &skipped);
}
list_splice(&skipped, &lru_list);
atomic_add(nr_skipped, &lru_count);
@@ -1727,6 +1765,11 @@ int __init gfs2_glock_init(void)
glock_workqueue = create_workqueue("glock_workqueue");
if (IS_ERR(glock_workqueue))
return PTR_ERR(glock_workqueue);
+ gfs2_delete_workqueue = create_workqueue("delete_workqueue");
+ if (IS_ERR(gfs2_delete_workqueue)) {
+ destroy_workqueue(glock_workqueue);
+ return PTR_ERR(gfs2_delete_workqueue);
+ }
register_shrinker(&glock_shrinker);
@@ -1737,6 +1780,7 @@ void gfs2_glock_exit(void)
{
unregister_shrinker(&glock_shrinker);
destroy_workqueue(glock_workqueue);
+ destroy_workqueue(gfs2_delete_workqueue);
}
static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi)
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index a602a28f6f0..c609894ec0d 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -143,6 +143,7 @@ struct lm_lockops {
#define GLR_TRYFAILED 13
+extern struct workqueue_struct *gfs2_delete_workqueue;
static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl)
{
struct gfs2_holder *gh;
@@ -191,6 +192,8 @@ static inline int gfs2_glock_is_blocking(struct gfs2_glock *gl)
int gfs2_glock_get(struct gfs2_sbd *sdp,
u64 number, const struct gfs2_glock_operations *glops,
int create, struct gfs2_glock **glp);
+void gfs2_glock_hold(struct gfs2_glock *gl);
+void gfs2_glock_put_nolock(struct gfs2_glock *gl);
int gfs2_glock_put(struct gfs2_glock *gl);
void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags,
struct gfs2_holder *gh);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index d5e4ab155ca..6985eef06c3 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -323,6 +323,7 @@ static void trans_go_sync(struct gfs2_glock *gl)
if (gl->gl_state != LM_ST_UNLOCKED &&
test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
+ flush_workqueue(gfs2_delete_workqueue);
gfs2_meta_syncfs(sdp);
gfs2_log_shutdown(sdp);
}
@@ -372,6 +373,25 @@ static int trans_go_demote_ok(const struct gfs2_glock *gl)
return 0;
}
+/**
+ * iopen_go_callback - schedule the dcache entry for the inode to be deleted
+ * @gl: the glock
+ *
+ * gl_spin lock is held while calling this
+ */
+static void iopen_go_callback(struct gfs2_glock *gl)
+{
+ struct gfs2_inode *ip = (struct gfs2_inode *)gl->gl_object;
+
+ if (gl->gl_demote_state == LM_ST_UNLOCKED &&
+ gl->gl_state == LM_ST_SHARED &&
+ ip && test_bit(GIF_USER, &ip->i_flags)) {
+ gfs2_glock_hold(gl);
+ if (queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0)
+ gfs2_glock_put_nolock(gl);
+ }
+}
+
const struct gfs2_glock_operations gfs2_meta_glops = {
.go_type = LM_TYPE_META,
};
@@ -406,6 +426,7 @@ const struct gfs2_glock_operations gfs2_trans_glops = {
const struct gfs2_glock_operations gfs2_iopen_glops = {
.go_type = LM_TYPE_IOPEN,
+ .go_callback = iopen_go_callback,
};
const struct gfs2_glock_operations gfs2_flock_glops = {
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 225347fbff3..6edb423f90b 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -159,6 +159,7 @@ struct gfs2_glock_operations {
int (*go_lock) (struct gfs2_holder *gh);
void (*go_unlock) (struct gfs2_holder *gh);
int (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl);
+ void (*go_callback) (struct gfs2_glock *gl);
const int go_type;
const unsigned long go_min_hold_time;
};
@@ -228,6 +229,7 @@ struct gfs2_glock {
struct list_head gl_ail_list;
atomic_t gl_ail_count;
struct delayed_work gl_work;
+ struct work_struct gl_delete;
};
#define GFS2_MIN_LVB_SIZE 32 /* Min size of LVB that gfs2 supports */
@@ -404,6 +406,12 @@ struct gfs2_statfs_change_host {
#define GFS2_DATA_WRITEBACK 1
#define GFS2_DATA_ORDERED 2
+#define GFS2_ERRORS_DEFAULT GFS2_ERRORS_WITHDRAW
+#define GFS2_ERRORS_WITHDRAW 0
+#define GFS2_ERRORS_CONTINUE 1 /* place holder for future feature */
+#define GFS2_ERRORS_RO 2 /* place holder for future feature */
+#define GFS2_ERRORS_PANIC 3
+
struct gfs2_args {
char ar_lockproto[GFS2_LOCKNAME_LEN]; /* Name of the Lock Protocol */
char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */
@@ -420,6 +428,7 @@ struct gfs2_args {
unsigned int ar_data:2; /* ordered/writeback */
unsigned int ar_meta:1; /* mount metafs */
unsigned int ar_discard:1; /* discard requests */
+ unsigned int ar_errors:2; /* errors=withdraw | panic */
int ar_commit; /* Commit interval */
};
@@ -487,7 +496,6 @@ struct gfs2_sb_host {
*/
struct lm_lockstruct {
- u32 ls_id;
unsigned int ls_jid;
unsigned int ls_first;
unsigned int ls_first_done;
@@ -539,18 +547,12 @@ struct gfs2_sbd {
struct dentry *sd_root_dir;
struct inode *sd_jindex;
- struct inode *sd_inum_inode;
struct inode *sd_statfs_inode;
- struct inode *sd_ir_inode;
struct inode *sd_sc_inode;
struct inode *sd_qc_inode;
struct inode *sd_rindex;
struct inode *sd_quota_inode;
- /* Inum stuff */
-
- struct mutex sd_inum_mutex;
-
/* StatFS stuff */
spinlock_t sd_statfs_spin;
@@ -578,7 +580,6 @@ struct gfs2_sbd {
struct gfs2_holder sd_journal_gh;
struct gfs2_holder sd_jinode_gh;
- struct gfs2_holder sd_ir_gh;
struct gfs2_holder sd_sc_gh;
struct gfs2_holder sd_qc_gh;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 2f94bd72369..fb15d3b1f40 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -24,7 +24,7 @@
#include "acl.h"
#include "bmap.h"
#include "dir.h"
-#include "eattr.h"
+#include "xattr.h"
#include "glock.h"
#include "glops.h"
#include "inode.h"
@@ -519,139 +519,6 @@ out:
return inode ? inode : ERR_PTR(error);
}
-static void gfs2_inum_range_in(struct gfs2_inum_range_host *ir, const void *buf)
-{
- const struct gfs2_inum_range *str = buf;
-
- ir->ir_start = be64_to_cpu(str->ir_start);
- ir->ir_length = be64_to_cpu(str->ir_length);
-}
-
-static void gfs2_inum_range_out(const struct gfs2_inum_range_host *ir, void *buf)
-{
- struct gfs2_inum_range *str = buf;
-
- str->ir_start = cpu_to_be64(ir->ir_start);
- str->ir_length = cpu_to_be64(ir->ir_length);
-}
-
-static int pick_formal_ino_1(struct gfs2_sbd *sdp, u64 *formal_ino)
-{
- struct gfs2_inode *ip = GFS2_I(sdp->sd_ir_inode);
- struct buffer_head *bh;
- struct gfs2_inum_range_host ir;
- int error;
-
- error = gfs2_trans_begin(sdp, RES_DINODE, 0);
- if (error)
- return error;
- mutex_lock(&sdp->sd_inum_mutex);
-
- error = gfs2_meta_inode_buffer(ip, &bh);
- if (error) {
- mutex_unlock(&sdp->sd_inum_mutex);
- gfs2_trans_end(sdp);
- return error;
- }
-
- gfs2_inum_range_in(&ir, bh->b_data + sizeof(struct gfs2_dinode));
-
- if (ir.ir_length) {
- *formal_ino = ir.ir_start++;
- ir.ir_length--;
- gfs2_trans_add_bh(ip->i_gl, bh, 1);
- gfs2_inum_range_out(&ir,
- bh->b_data + sizeof(struct gfs2_dinode));
- brelse(bh);
- mutex_unlock(&sdp->sd_inum_mutex);
- gfs2_trans_end(sdp);
- return 0;
- }
-
- brelse(bh);
-
- mutex_unlock(&sdp->sd_inum_mutex);
- gfs2_trans_end(sdp);
-
- return 1;
-}
-
-static int pick_formal_ino_2(struct gfs2_sbd *sdp, u64 *formal_ino)
-{
- struct gfs2_inode *ip = GFS2_I(sdp->sd_ir_inode);
- struct gfs2_inode *m_ip = GFS2_I(sdp->sd_inum_inode);
- struct gfs2_holder gh;
- struct buffer_head *bh;
- struct gfs2_inum_range_host ir;
- int error;
-
- error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
- if (error)
- return error;
-
- error = gfs2_trans_begin(sdp, 2 * RES_DINODE, 0);
- if (error)
- goto out;
- mutex_lock(&sdp->sd_inum_mutex);
-
- error = gfs2_meta_inode_buffer(ip, &bh);
- if (error)
- goto out_end_trans;
-
- gfs2_inum_range_in(&ir, bh->b_data + sizeof(struct gfs2_dinode));
-
- if (!ir.ir_length) {
- struct buffer_head *m_bh;
- u64 x, y;
- __be64 z;
-
- error = gfs2_meta_inode_buffer(m_ip, &m_bh);
- if (error)
- goto out_brelse;
-
- z = *(__be64 *)(m_bh->b_data + sizeof(struct gfs2_dinode));
- x = y = be64_to_cpu(z);
- ir.ir_start = x;
- ir.ir_length = GFS2_INUM_QUANTUM;
- x += GFS2_INUM_QUANTUM;
- if (x < y)
- gfs2_consist_inode(m_ip);
- z = cpu_to_be64(x);
- gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1);
- *(__be64 *)(m_bh->b_data + sizeof(struct gfs2_dinode)) = z;
-
- brelse(m_bh);
- }
-
- *formal_ino = ir.ir_start++;
- ir.ir_length--;
-
- gfs2_trans_add_bh(ip->i_gl, bh, 1);
- gfs2_inum_range_out(&ir, bh->b_data + sizeof(struct gfs2_dinode));
-
-out_brelse:
- brelse(bh);
-out_end_trans:
- mutex_unlock(&sdp->sd_inum_mutex);
- gfs2_trans_end(sdp);
-out:
- gfs2_glock_dq_uninit(&gh);
- return error;
-}
-
-static int pick_formal_ino(struct gfs2_sbd *sdp, u64 *inum)
-{
- int error;
-
- error = pick_formal_ino_1(sdp, inum);
- if (error <= 0)
- return error;
-
- error = pick_formal_ino_2(sdp, inum);
-
- return error;
-}
-
/**
* create_ok - OK to create a new on-disk inode here?
* @dip: Directory in which dinode is to be created
@@ -731,7 +598,7 @@ static int alloc_dinode(struct gfs2_inode *dip, u64 *no_addr, u64 *generation)
if (error)
goto out_ipreserv;
- *no_addr = gfs2_alloc_di(dip, generation);
+ error = gfs2_alloc_di(dip, no_addr, generation);
gfs2_trans_end(sdp);
@@ -924,7 +791,6 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip)
size_t len;
void *value;
char *name;
- struct gfs2_ea_request er;
err = security_inode_init_security(&ip->i_inode, &dip->i_inode,
&name, &value, &len);
@@ -935,16 +801,7 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip)
return err;
}
- memset(&er, 0, sizeof(struct gfs2_ea_request));
-
- er.er_type = GFS2_EATYPE_SECURITY;
- er.er_name = name;
- er.er_data = value;
- er.er_name_len = strlen(name);
- er.er_data_len = len;
-
- err = gfs2_ea_set_i(ip, &er);
-
+ err = gfs2_xattr_set(&ip->i_inode, GFS2_EATYPE_SECURITY, name, value, len, 0);
kfree(value);
kfree(name);
@@ -991,13 +848,10 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
if (error)
goto fail_gunlock;
- error = pick_formal_ino(sdp, &inum.no_formal_ino);
- if (error)
- goto fail_gunlock;
-
error = alloc_dinode(dip, &inum.no_addr, &generation);
if (error)
goto fail_gunlock;
+ inum.no_formal_ino = generation;
error = gfs2_glock_nq_num(sdp, inum.no_addr, &gfs2_inode_glops,
LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1);
@@ -1008,9 +862,8 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
if (error)
goto fail_gunlock2;
- inode = gfs2_inode_lookup(dir->i_sb, IF2DT(mode),
- inum.no_addr,
- inum.no_formal_ino, 0);
+ inode = gfs2_inode_lookup(dir->i_sb, IF2DT(mode), inum.no_addr,
+ inum.no_formal_ino, 0);
if (IS_ERR(inode))
goto fail_gunlock2;
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 7bc3c45cd67..52fb6c04898 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -84,7 +84,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
gfs2_tune_init(&sdp->sd_tune);
- mutex_init(&sdp->sd_inum_mutex);
spin_lock_init(&sdp->sd_statfs_spin);
spin_lock_init(&sdp->sd_rindex_spin);
@@ -833,21 +832,12 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
if (error)
goto fail;
- /* Read in the master inode number inode */
- sdp->sd_inum_inode = gfs2_lookup_simple(master, "inum");
- if (IS_ERR(sdp->sd_inum_inode)) {
- error = PTR_ERR(sdp->sd_inum_inode);
- fs_err(sdp, "can't read in inum inode: %d\n", error);
- goto fail_journal;
- }
-
-
/* Read in the master statfs inode */
sdp->sd_statfs_inode = gfs2_lookup_simple(master, "statfs");
if (IS_ERR(sdp->sd_statfs_inode)) {
error = PTR_ERR(sdp->sd_statfs_inode);
fs_err(sdp, "can't read in statfs inode: %d\n", error);
- goto fail_inum;
+ goto fail_journal;
}
/* Read in the resource index inode */
@@ -876,8 +866,6 @@ fail_rindex:
iput(sdp->sd_rindex);
fail_statfs:
iput(sdp->sd_statfs_inode);
-fail_inum:
- iput(sdp->sd_inum_inode);
fail_journal:
init_journal(sdp, UNDO);
fail:
@@ -905,20 +893,12 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo)
return error;
}
- sprintf(buf, "inum_range%u", sdp->sd_jdesc->jd_jid);
- sdp->sd_ir_inode = gfs2_lookup_simple(pn, buf);
- if (IS_ERR(sdp->sd_ir_inode)) {
- error = PTR_ERR(sdp->sd_ir_inode);
- fs_err(sdp, "can't find local \"ir\" file: %d\n", error);
- goto fail;
- }
-
sprintf(buf, "statfs_change%u", sdp->sd_jdesc->jd_jid);
sdp->sd_sc_inode = gfs2_lookup_simple(pn, buf);
if (IS_ERR(sdp->sd_sc_inode)) {
error = PTR_ERR(sdp->sd_sc_inode);
fs_err(sdp, "can't find local \"sc\" file: %d\n", error);
- goto fail_ir_i;
+ goto fail;
}
sprintf(buf, "quota_change%u", sdp->sd_jdesc->jd_jid);
@@ -932,27 +912,16 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo)
iput(pn);
pn = NULL;
- ip = GFS2_I(sdp->sd_ir_inode);
- error = gfs2_glock_nq_init(ip->i_gl,
- LM_ST_EXCLUSIVE, 0,
- &sdp->sd_ir_gh);
- if (error) {
- fs_err(sdp, "can't lock local \"ir\" file: %d\n", error);
- goto fail_qc_i;
- }
-
ip = GFS2_I(sdp->sd_sc_inode);
- error = gfs2_glock_nq_init(ip->i_gl,
- LM_ST_EXCLUSIVE, 0,
+ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0,
&sdp->sd_sc_gh);
if (error) {
fs_err(sdp, "can't lock local \"sc\" file: %d\n", error);
- goto fail_ir_gh;
+ goto fail_qc_i;
}
ip = GFS2_I(sdp->sd_qc_inode);
- error = gfs2_glock_nq_init(ip->i_gl,
- LM_ST_EXCLUSIVE, 0,
+ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0,
&sdp->sd_qc_gh);
if (error) {
fs_err(sdp, "can't lock local \"qc\" file: %d\n", error);
@@ -965,14 +934,10 @@ fail_qc_gh:
gfs2_glock_dq_uninit(&sdp->sd_qc_gh);
fail_ut_gh:
gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
-fail_ir_gh:
- gfs2_glock_dq_uninit(&sdp->sd_ir_gh);
fail_qc_i:
iput(sdp->sd_qc_inode);
fail_ut_i:
iput(sdp->sd_sc_inode);
-fail_ir_i:
- iput(sdp->sd_ir_inode);
fail:
if (pn)
iput(pn);
@@ -1063,7 +1028,6 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
ls->ls_ops = lm;
ls->ls_first = 1;
- ls->ls_id = 0;
for (options = args->ar_hostdata; (o = strsep(&options, ":")); ) {
substring_t tmp[MAX_OPT_ARGS];
@@ -1081,10 +1045,7 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
ls->ls_jid = option;
break;
case Opt_id:
- ret = match_int(&tmp[0], &option);
- if (ret)
- goto hostdata_error;
- ls->ls_id = option;
+ /* Obsolete, but left for backward compat purposes */
break;
case Opt_first:
ret = match_int(&tmp[0], &option);
@@ -1133,6 +1094,17 @@ void gfs2_lm_unmount(struct gfs2_sbd *sdp)
lm->lm_unmount(sdp);
}
+void gfs2_online_uevent(struct gfs2_sbd *sdp)
+{
+ struct super_block *sb = sdp->sd_vfs;
+ char ro[20];
+ char spectator[20];
+ char *envp[] = { ro, spectator, NULL };
+ sprintf(ro, "RDONLY=%d", (sb->s_flags & MS_RDONLY) ? 1 : 0);
+ sprintf(spectator, "SPECTATOR=%d", sdp->sd_args.ar_spectator ? 1 : 0);
+ kobject_uevent_env(&sdp->sd_kobj, KOBJ_ONLINE, envp);
+}
+
/**
* fill_super - Read in superblock
* @sb: The VFS superblock
@@ -1157,6 +1129,7 @@ static int fill_super(struct super_block *sb, void *data, int silent)
sdp->sd_args.ar_quota = GFS2_QUOTA_DEFAULT;
sdp->sd_args.ar_data = GFS2_DATA_DEFAULT;
sdp->sd_args.ar_commit = 60;
+ sdp->sd_args.ar_errors = GFS2_ERRORS_DEFAULT;
error = gfs2_mount_args(sdp, &sdp->sd_args, data);
if (error) {
@@ -1174,6 +1147,7 @@ static int fill_super(struct super_block *sb, void *data, int silent)
sb->s_magic = GFS2_MAGIC;
sb->s_op = &gfs2_super_ops;
sb->s_export_op = &gfs2_export_ops;
+ sb->s_xattr = gfs2_xattr_handlers;
sb->s_time_gran = 1;
sb->s_maxbytes = MAX_LFS_FILESIZE;
@@ -1236,7 +1210,7 @@ static int fill_super(struct super_block *sb, void *data, int silent)
}
gfs2_glock_dq_uninit(&mount_gh);
-
+ gfs2_online_uevent(sdp);
return 0;
fail_threads:
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index f8bd20baf99..c3ac1805405 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -26,8 +26,7 @@
#include "acl.h"
#include "bmap.h"
#include "dir.h"
-#include "eaops.h"
-#include "eattr.h"
+#include "xattr.h"
#include "glock.h"
#include "inode.h"
#include "meta_io.h"
@@ -349,7 +348,7 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
error = gfs2_trans_begin(sdp, 2*RES_DINODE + RES_LEAF + RES_RG_BIT, 0);
if (error)
- goto out_rgrp;
+ goto out_gunlock;
error = gfs2_dir_del(dip, &dentry->d_name);
if (error)
@@ -1302,60 +1301,53 @@ static int gfs2_setxattr(struct dentry *dentry, const char *name,
const void *data, size_t size, int flags)
{
struct inode *inode = dentry->d_inode;
- struct gfs2_ea_request er;
-
- memset(&er, 0, sizeof(struct gfs2_ea_request));
- er.er_type = gfs2_ea_name2type(name, &er.er_name);
- if (er.er_type == GFS2_EATYPE_UNUSED)
- return -EOPNOTSUPP;
- er.er_data = (char *)data;
- er.er_name_len = strlen(er.er_name);
- er.er_data_len = size;
- er.er_flags = flags;
-
- gfs2_assert_warn(GFS2_SB(inode), !(er.er_flags & GFS2_ERF_MODE));
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_holder gh;
+ int ret;
- return gfs2_ea_set(GFS2_I(inode), &er);
+ gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+ ret = gfs2_glock_nq(&gh);
+ if (ret == 0) {
+ ret = generic_setxattr(dentry, name, data, size, flags);
+ gfs2_glock_dq(&gh);
+ }
+ gfs2_holder_uninit(&gh);
+ return ret;
}
static ssize_t gfs2_getxattr(struct dentry *dentry, const char *name,
void *data, size_t size)
{
- struct gfs2_ea_request er;
-
- memset(&er, 0, sizeof(struct gfs2_ea_request));
- er.er_type = gfs2_ea_name2type(name, &er.er_name);
- if (er.er_type == GFS2_EATYPE_UNUSED)
- return -EOPNOTSUPP;
- er.er_data = data;
- er.er_name_len = strlen(er.er_name);
- er.er_data_len = size;
-
- return gfs2_ea_get(GFS2_I(dentry->d_inode), &er);
-}
-
-static ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
-{
- struct gfs2_ea_request er;
-
- memset(&er, 0, sizeof(struct gfs2_ea_request));
- er.er_data = (size) ? buffer : NULL;
- er.er_data_len = size;
+ struct inode *inode = dentry->d_inode;
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_holder gh;
+ int ret;
- return gfs2_ea_list(GFS2_I(dentry->d_inode), &er);
+ gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
+ ret = gfs2_glock_nq(&gh);
+ if (ret == 0) {
+ ret = generic_getxattr(dentry, name, data, size);
+ gfs2_glock_dq(&gh);
+ }
+ gfs2_holder_uninit(&gh);
+ return ret;
}
static int gfs2_removexattr(struct dentry *dentry, const char *name)
{
- struct gfs2_ea_request er;
-
- memset(&er, 0, sizeof(struct gfs2_ea_request));
- er.er_type = gfs2_ea_name2type(name, &er.er_name);
- if (er.er_type == GFS2_EATYPE_UNUSED)
- return -EOPNOTSUPP;
- er.er_name_len = strlen(er.er_name);
+ struct inode *inode = dentry->d_inode;
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_holder gh;
+ int ret;
- return gfs2_ea_remove(GFS2_I(dentry->d_inode), &er);
+ gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+ ret = gfs2_glock_nq(&gh);
+ if (ret == 0) {
+ ret = generic_removexattr(dentry, name);
+ gfs2_glock_dq(&gh);
+ }
+ gfs2_holder_uninit(&gh);
+ return ret;
}
static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index daa4ae341a2..28c590b7c9d 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -285,27 +285,19 @@ void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
}
tmp = rgd->rd_data - rgd->rd_free - rgd->rd_dinodes;
- if (count[1] + count[2] != tmp) {
+ if (count[1] != tmp) {
if (gfs2_consist_rgrpd(rgd))
fs_err(sdp, "used data mismatch: %u != %u\n",
count[1], tmp);
return;
}
- if (count[3] != rgd->rd_dinodes) {
+ if (count[2] + count[3] != rgd->rd_dinodes) {
if (gfs2_consist_rgrpd(rgd))
fs_err(sdp, "used metadata mismatch: %u != %u\n",
- count[3], rgd->rd_dinodes);
+ count[2] + count[3], rgd->rd_dinodes);
return;
}
-
- if (count[2] > count[3]) {
- if (gfs2_consist_rgrpd(rgd))
- fs_err(sdp, "unlinked inodes > inodes: %u\n",
- count[2]);
- return;
- }
-
}
static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block)
@@ -865,7 +857,8 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
goto start_new_extent;
if ((start + nr_sects) != blk) {
rv = blkdev_issue_discard(bdev, start,
- nr_sects, GFP_NOFS);
+ nr_sects, GFP_NOFS,
+ DISCARD_FL_BARRIER);
if (rv)
goto fail;
nr_sects = 0;
@@ -879,7 +872,8 @@ start_new_extent:
}
}
if (nr_sects) {
- rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS);
+ rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS,
+ DISCARD_FL_BARRIER);
if (rv)
goto fail;
}
@@ -961,7 +955,8 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al)
* Returns: The inode, if one has been found
*/
-static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked)
+static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked,
+ u64 skip)
{
struct inode *inode;
u32 goal = 0, block;
@@ -985,6 +980,8 @@ static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked)
goal++;
if (*last_unlinked != NO_BLOCK && no_addr <= *last_unlinked)
continue;
+ if (no_addr == skip)
+ continue;
*last_unlinked = no_addr;
inode = gfs2_inode_lookup(rgd->rd_sbd->sd_vfs, DT_UNKNOWN,
no_addr, -1, 1);
@@ -1104,7 +1101,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
if (try_rgrp_fit(rgd, al))
goto out;
if (rgd->rd_flags & GFS2_RDF_CHECK)
- inode = try_rgrp_unlink(rgd, last_unlinked);
+ inode = try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr);
if (!rg_locked)
gfs2_glock_dq_uninit(&al->al_rgd_gh);
if (inode)
@@ -1138,7 +1135,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
if (try_rgrp_fit(rgd, al))
goto out;
if (rgd->rd_flags & GFS2_RDF_CHECK)
- inode = try_rgrp_unlink(rgd, last_unlinked);
+ inode = try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr);
if (!rg_locked)
gfs2_glock_dq_uninit(&al->al_rgd_gh);
if (inode)
@@ -1261,7 +1258,7 @@ void gfs2_inplace_release(struct gfs2_inode *ip)
* Returns: The block type (GFS2_BLKST_*)
*/
-unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block)
+static unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block)
{
struct gfs2_bitmap *bi = NULL;
u32 length, rgrp_block, buf_block;
@@ -1464,6 +1461,16 @@ int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl)
return 0;
}
+static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd)
+{
+ struct gfs2_sbd *sdp = rgd->rd_sbd;
+ fs_warn(sdp, "rgrp %llu has an error, marking it readonly until umount\n",
+ (unsigned long long)rgd->rd_addr);
+ fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n");
+ gfs2_rgrp_dump(NULL, rgd->rd_gl);
+ rgd->rd_flags |= GFS2_RDF_ERROR;
+}
+
/**
* gfs2_alloc_block - Allocate one or more blocks
* @ip: the inode to allocate the block for
@@ -1525,22 +1532,20 @@ int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n)
return 0;
rgrp_error:
- fs_warn(sdp, "rgrp %llu has an error, marking it readonly until umount\n",
- (unsigned long long)rgd->rd_addr);
- fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n");
- gfs2_rgrp_dump(NULL, rgd->rd_gl);
- rgd->rd_flags |= GFS2_RDF_ERROR;
+ gfs2_rgrp_error(rgd);
return -EIO;
}
/**
* gfs2_alloc_di - Allocate a dinode
* @dip: the directory that the inode is going in
+ * @bn: the block number which is allocated
+ * @generation: the generation number of the inode
*
- * Returns: the block allocated
+ * Returns: 0 on success or error
*/
-u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation)
+int gfs2_alloc_di(struct gfs2_inode *dip, u64 *bn, u64 *generation)
{
struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
struct gfs2_alloc *al = dip->i_alloc;
@@ -1551,16 +1556,21 @@ u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation)
blk = rgblk_search(rgd, rgd->rd_last_alloc,
GFS2_BLKST_FREE, GFS2_BLKST_DINODE, &n);
- BUG_ON(blk == BFITNOENT);
- rgd->rd_last_alloc = blk;
+ /* Since all blocks are reserved in advance, this shouldn't happen */
+ if (blk == BFITNOENT)
+ goto rgrp_error;
+ rgd->rd_last_alloc = blk;
block = rgd->rd_data0 + blk;
+ if (rgd->rd_free == 0)
+ goto rgrp_error;
- gfs2_assert_withdraw(sdp, rgd->rd_free);
rgd->rd_free--;
rgd->rd_dinodes++;
*generation = rgd->rd_igeneration++;
+ if (*generation == 0)
+ *generation = rgd->rd_igeneration++;
gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
@@ -1573,7 +1583,12 @@ u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation)
rgd->rd_free_clone--;
spin_unlock(&sdp->sd_rindex_spin);
trace_gfs2_block_alloc(dip, block, 1, GFS2_BLKST_DINODE);
- return block;
+ *bn = block;
+ return 0;
+
+rgrp_error:
+ gfs2_rgrp_error(rgd);
+ return -EIO;
}
/**
@@ -1681,6 +1696,46 @@ void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
}
/**
+ * gfs2_check_blk_type - Check the type of a block
+ * @sdp: The superblock
+ * @no_addr: The block number to check
+ * @type: The block type we are looking for
+ *
+ * Returns: 0 if the block type matches the expected type
+ * -ESTALE if it doesn't match
+ * or -ve errno if something went wrong while checking
+ */
+
+int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type)
+{
+ struct gfs2_rgrpd *rgd;
+ struct gfs2_holder ri_gh, rgd_gh;
+ int error;
+
+ error = gfs2_rindex_hold(sdp, &ri_gh);
+ if (error)
+ goto fail;
+
+ error = -EINVAL;
+ rgd = gfs2_blk2rgrpd(sdp, no_addr);
+ if (!rgd)
+ goto fail_rindex;
+
+ error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh);
+ if (error)
+ goto fail_rindex;
+
+ if (gfs2_get_block_type(rgd, no_addr) != type)
+ error = -ESTALE;
+
+ gfs2_glock_dq_uninit(&rgd_gh);
+fail_rindex:
+ gfs2_glock_dq_uninit(&ri_gh);
+fail:
+ return error;
+}
+
+/**
* gfs2_rlist_add - add a RG to a list of RGs
* @sdp: the filesystem
* @rlist: the list of resource groups
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 1e76ff0f3e0..b4106ddaaa9 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -44,15 +44,15 @@ gfs2_inplace_reserve_i((ip), __FILE__, __LINE__)
extern void gfs2_inplace_release(struct gfs2_inode *ip);
-extern unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block);
-
extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n);
-extern u64 gfs2_alloc_di(struct gfs2_inode *ip, u64 *generation);
+extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation);
extern void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen);
extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen);
extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip);
extern void gfs2_unlink_di(struct inode *inode);
+extern int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr,
+ unsigned int type);
struct gfs2_rgrp_list {
unsigned int rl_rgrps;
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 0a680133647..0ec3ec672de 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -38,7 +38,7 @@
#include "trans.h"
#include "util.h"
#include "sys.h"
-#include "eattr.h"
+#include "xattr.h"
#define args_neq(a1, a2, x) ((a1)->ar_##x != (a2)->ar_##x)
@@ -68,6 +68,8 @@ enum {
Opt_discard,
Opt_nodiscard,
Opt_commit,
+ Opt_err_withdraw,
+ Opt_err_panic,
Opt_error,
};
@@ -97,6 +99,8 @@ static const match_table_t tokens = {
{Opt_discard, "discard"},
{Opt_nodiscard, "nodiscard"},
{Opt_commit, "commit=%d"},
+ {Opt_err_withdraw, "errors=withdraw"},
+ {Opt_err_panic, "errors=panic"},
{Opt_error, NULL}
};
@@ -152,6 +156,11 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options)
args->ar_localcaching = 1;
break;
case Opt_debug:
+ if (args->ar_errors == GFS2_ERRORS_PANIC) {
+ fs_info(sdp, "-o debug and -o errors=panic "
+ "are mutually exclusive.\n");
+ return -EINVAL;
+ }
args->ar_debug = 1;
break;
case Opt_nodebug:
@@ -205,6 +214,17 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options)
return rv ? rv : -EINVAL;
}
break;
+ case Opt_err_withdraw:
+ args->ar_errors = GFS2_ERRORS_WITHDRAW;
+ break;
+ case Opt_err_panic:
+ if (args->ar_debug) {
+ fs_info(sdp, "-o debug and -o errors=panic "
+ "are mutually exclusive.\n");
+ return -EINVAL;
+ }
+ args->ar_errors = GFS2_ERRORS_PANIC;
+ break;
case Opt_error:
default:
fs_info(sdp, "invalid mount option: %s\n", o);
@@ -353,7 +373,7 @@ fail:
return error;
}
-static void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf)
+void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf)
{
const struct gfs2_statfs_change *str = buf;
@@ -441,6 +461,29 @@ void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
brelse(l_bh);
}
+void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh,
+ struct buffer_head *l_bh)
+{
+ struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
+ struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
+ struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
+ struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
+
+ gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1);
+
+ spin_lock(&sdp->sd_statfs_spin);
+ m_sc->sc_total += l_sc->sc_total;
+ m_sc->sc_free += l_sc->sc_free;
+ m_sc->sc_dinodes += l_sc->sc_dinodes;
+ memset(l_sc, 0, sizeof(struct gfs2_statfs_change));
+ memset(l_bh->b_data + sizeof(struct gfs2_dinode),
+ 0, sizeof(struct gfs2_statfs_change));
+ spin_unlock(&sdp->sd_statfs_spin);
+
+ gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1);
+ gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode));
+}
+
int gfs2_statfs_sync(struct gfs2_sbd *sdp)
{
struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
@@ -477,19 +520,7 @@ int gfs2_statfs_sync(struct gfs2_sbd *sdp)
if (error)
goto out_bh2;
- gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1);
-
- spin_lock(&sdp->sd_statfs_spin);
- m_sc->sc_total += l_sc->sc_total;
- m_sc->sc_free += l_sc->sc_free;
- m_sc->sc_dinodes += l_sc->sc_dinodes;
- memset(l_sc, 0, sizeof(struct gfs2_statfs_change));
- memset(l_bh->b_data + sizeof(struct gfs2_dinode),
- 0, sizeof(struct gfs2_statfs_change));
- spin_unlock(&sdp->sd_statfs_spin);
-
- gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1);
- gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode));
+ update_statfs(sdp, m_bh, l_bh);
gfs2_trans_end(sdp);
@@ -680,6 +711,7 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
struct gfs2_holder t_gh;
int error;
+ flush_workqueue(gfs2_delete_workqueue);
gfs2_quota_sync(sdp);
gfs2_statfs_sync(sdp);
@@ -756,7 +788,6 @@ restart:
/* Release stuff */
iput(sdp->sd_jindex);
- iput(sdp->sd_inum_inode);
iput(sdp->sd_statfs_inode);
iput(sdp->sd_rindex);
iput(sdp->sd_quota_inode);
@@ -767,10 +798,8 @@ restart:
if (!sdp->sd_args.ar_spectator) {
gfs2_glock_dq_uninit(&sdp->sd_journal_gh);
gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);
- gfs2_glock_dq_uninit(&sdp->sd_ir_gh);
gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
gfs2_glock_dq_uninit(&sdp->sd_qc_gh);
- iput(sdp->sd_ir_inode);
iput(sdp->sd_sc_inode);
iput(sdp->sd_qc_inode);
}
@@ -1072,6 +1101,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
gt->gt_log_flush_secs = args.ar_commit;
spin_unlock(&gt->gt_spin);
+ gfs2_online_uevent(sdp);
return 0;
}
@@ -1213,6 +1243,22 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
lfsecs = sdp->sd_tune.gt_log_flush_secs;
if (lfsecs != 60)
seq_printf(s, ",commit=%d", lfsecs);
+ if (args->ar_errors != GFS2_ERRORS_DEFAULT) {
+ const char *state;
+
+ switch (args->ar_errors) {
+ case GFS2_ERRORS_WITHDRAW:
+ state = "withdraw";
+ break;
+ case GFS2_ERRORS_PANIC:
+ state = "panic";
+ break;
+ default:
+ state = "unknown";
+ break;
+ }
+ seq_printf(s, ",errors=%s", state);
+ }
return 0;
}
@@ -1240,6 +1286,10 @@ static void gfs2_delete_inode(struct inode *inode)
goto out;
}
+ error = gfs2_check_blk_type(sdp, ip->i_no_addr, GFS2_BLKST_UNLINKED);
+ if (error)
+ goto out_truncate;
+
gfs2_glock_dq_wait(&ip->i_iopen_gh);
gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh);
error = gfs2_glock_nq(&ip->i_iopen_gh);
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index b56413e3e40..235db368288 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -25,7 +25,7 @@ static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp)
return x;
}
-void gfs2_jindex_free(struct gfs2_sbd *sdp);
+extern void gfs2_jindex_free(struct gfs2_sbd *sdp);
extern int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *data);
@@ -36,10 +36,14 @@ extern int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename,
struct gfs2_inode **ipp);
extern int gfs2_make_fs_rw(struct gfs2_sbd *sdp);
-
+extern void gfs2_online_uevent(struct gfs2_sbd *sdp);
extern int gfs2_statfs_init(struct gfs2_sbd *sdp);
extern void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
s64 dinodes);
+extern void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc,
+ const void *buf);
+extern void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh,
+ struct buffer_head *l_bh);
extern int gfs2_statfs_sync(struct gfs2_sbd *sdp);
extern int gfs2_freeze_fs(struct gfs2_sbd *sdp);
@@ -50,6 +54,7 @@ extern struct file_system_type gfs2meta_fs_type;
extern const struct export_operations gfs2_export_ops;
extern const struct super_operations gfs2_super_ops;
extern const struct dentry_operations gfs2_dops;
+extern struct xattr_handler *gfs2_xattr_handlers[];
#endif /* __SUPER_DOT_H__ */
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 23419dc3027..446329728d5 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -16,6 +16,7 @@
#include <linux/kobject.h>
#include <asm/uaccess.h>
#include <linux/gfs2_ondisk.h>
+#include <linux/genhd.h>
#include "gfs2.h"
#include "incore.h"
@@ -319,12 +320,6 @@ static ssize_t block_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
return ret;
}
-static ssize_t lkid_show(struct gfs2_sbd *sdp, char *buf)
-{
- struct lm_lockstruct *ls = &sdp->sd_lockstruct;
- return sprintf(buf, "%u\n", ls->ls_id);
-}
-
static ssize_t lkfirst_show(struct gfs2_sbd *sdp, char *buf)
{
struct lm_lockstruct *ls = &sdp->sd_lockstruct;
@@ -386,22 +381,20 @@ static ssize_t jid_show(struct gfs2_sbd *sdp, char *buf)
#define GDLM_ATTR(_name,_mode,_show,_store) \
static struct gfs2_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store)
-GDLM_ATTR(proto_name, 0444, proto_name_show, NULL);
-GDLM_ATTR(block, 0644, block_show, block_store);
-GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store);
-GDLM_ATTR(id, 0444, lkid_show, NULL);
-GDLM_ATTR(jid, 0444, jid_show, NULL);
-GDLM_ATTR(first, 0444, lkfirst_show, NULL);
-GDLM_ATTR(first_done, 0444, first_done_show, NULL);
-GDLM_ATTR(recover, 0200, NULL, recover_store);
-GDLM_ATTR(recover_done, 0444, recover_done_show, NULL);
-GDLM_ATTR(recover_status, 0444, recover_status_show, NULL);
+GDLM_ATTR(proto_name, 0444, proto_name_show, NULL);
+GDLM_ATTR(block, 0644, block_show, block_store);
+GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store);
+GDLM_ATTR(jid, 0444, jid_show, NULL);
+GDLM_ATTR(first, 0444, lkfirst_show, NULL);
+GDLM_ATTR(first_done, 0444, first_done_show, NULL);
+GDLM_ATTR(recover, 0600, NULL, recover_store);
+GDLM_ATTR(recover_done, 0444, recover_done_show, NULL);
+GDLM_ATTR(recover_status, 0444, recover_status_show, NULL);
static struct attribute *lock_module_attrs[] = {
&gdlm_attr_proto_name.attr,
&gdlm_attr_block.attr,
&gdlm_attr_withdraw.attr,
- &gdlm_attr_id.attr,
&gdlm_attr_jid.attr,
&gdlm_attr_first.attr,
&gdlm_attr_first_done.attr,
@@ -519,7 +512,14 @@ static struct attribute_group lock_module_group = {
int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
{
+ struct super_block *sb = sdp->sd_vfs;
int error;
+ char ro[20];
+ char spectator[20];
+ char *envp[] = { ro, spectator, NULL };
+
+ sprintf(ro, "RDONLY=%d", (sb->s_flags & MS_RDONLY) ? 1 : 0);
+ sprintf(spectator, "SPECTATOR=%d", sdp->sd_args.ar_spectator ? 1 : 0);
sdp->sd_kobj.kset = gfs2_kset;
error = kobject_init_and_add(&sdp->sd_kobj, &gfs2_ktype, NULL,
@@ -535,9 +535,17 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
if (error)
goto fail_tune;
- kobject_uevent(&sdp->sd_kobj, KOBJ_ADD);
+ error = sysfs_create_link(&sdp->sd_kobj,
+ &disk_to_dev(sb->s_bdev->bd_disk)->kobj,
+ "device");
+ if (error)
+ goto fail_lock_module;
+
+ kobject_uevent_env(&sdp->sd_kobj, KOBJ_ADD, envp);
return 0;
+fail_lock_module:
+ sysfs_remove_group(&sdp->sd_kobj, &lock_module_group);
fail_tune:
sysfs_remove_group(&sdp->sd_kobj, &tune_group);
fail_reg:
@@ -549,12 +557,12 @@ fail:
void gfs2_sys_fs_del(struct gfs2_sbd *sdp)
{
+ sysfs_remove_link(&sdp->sd_kobj, "device");
sysfs_remove_group(&sdp->sd_kobj, &tune_group);
sysfs_remove_group(&sdp->sd_kobj, &lock_module_group);
kobject_put(&sdp->sd_kobj);
}
-
static int gfs2_uevent(struct kset *kset, struct kobject *kobj,
struct kobj_uevent_env *env)
{
@@ -563,6 +571,8 @@ static int gfs2_uevent(struct kset *kset, struct kobject *kobj,
add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name);
add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name);
+ if (!sdp->sd_args.ar_spectator)
+ add_uevent_var(env, "JOURNALID=%u", sdp->sd_lockstruct.ls_jid);
if (gfs2_uuid_valid(uuid)) {
add_uevent_var(env, "UUID=%02X%02X%02X%02X-%02X%02X-%02X%02X-"
"%02X%02X-%02X%02X%02X%02X%02X%02X",
@@ -578,7 +588,6 @@ static struct kset_uevent_ops gfs2_uevent_ops = {
.uevent = gfs2_uevent,
};
-
int gfs2_sys_init(void)
{
gfs2_kset = kset_create_and_add("gfs2", &gfs2_uevent_ops, fs_kobj);
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index 9d12b1118ba..f6a7efa34eb 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -38,24 +38,30 @@ int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...)
const struct lm_lockops *lm = ls->ls_ops;
va_list args;
- if (test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags))
+ if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW &&
+ test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags))
return 0;
va_start(args, fmt);
vprintk(fmt, args);
va_end(args);
- fs_err(sdp, "about to withdraw this file system\n");
- BUG_ON(sdp->sd_args.ar_debug);
+ if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW) {
+ fs_err(sdp, "about to withdraw this file system\n");
+ BUG_ON(sdp->sd_args.ar_debug);
- kobject_uevent(&sdp->sd_kobj, KOBJ_OFFLINE);
+ kobject_uevent(&sdp->sd_kobj, KOBJ_OFFLINE);
- if (lm->lm_unmount) {
- fs_err(sdp, "telling LM to unmount\n");
- lm->lm_unmount(sdp);
+ if (lm->lm_unmount) {
+ fs_err(sdp, "telling LM to unmount\n");
+ lm->lm_unmount(sdp);
+ }
+ fs_err(sdp, "withdrawn\n");
+ dump_stack();
}
- fs_err(sdp, "withdrawn\n");
- dump_stack();
+
+ if (sdp->sd_args.ar_errors == GFS2_ERRORS_PANIC)
+ panic("GFS2: fsid=%s: panic requested.\n", sdp->sd_fsname);
return -1;
}
@@ -93,17 +99,24 @@ int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion,
gfs2_tune_get(sdp, gt_complain_secs) * HZ))
return -2;
- printk(KERN_WARNING
- "GFS2: fsid=%s: warning: assertion \"%s\" failed\n"
- "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
- sdp->sd_fsname, assertion,
- sdp->sd_fsname, function, file, line);
+ if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW)
+ printk(KERN_WARNING
+ "GFS2: fsid=%s: warning: assertion \"%s\" failed\n"
+ "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
+ sdp->sd_fsname, assertion,
+ sdp->sd_fsname, function, file, line);
if (sdp->sd_args.ar_debug)
BUG();
else
dump_stack();
+ if (sdp->sd_args.ar_errors == GFS2_ERRORS_PANIC)
+ panic("GFS2: fsid=%s: warning: assertion \"%s\" failed\n"
+ "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
+ sdp->sd_fsname, assertion,
+ sdp->sd_fsname, function, file, line);
+
sdp->sd_last_warning = jiffies;
return -1;
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/xattr.c
index 07ea9529add..8a0f8ef6ee2 100644
--- a/fs/gfs2/eattr.c
+++ b/fs/gfs2/xattr.c
@@ -18,8 +18,7 @@
#include "gfs2.h"
#include "incore.h"
#include "acl.h"
-#include "eaops.h"
-#include "eattr.h"
+#include "xattr.h"
#include "glock.h"
#include "inode.h"
#include "meta_io.h"
@@ -38,26 +37,32 @@
* Returns: 1 if the EA should be stuffed
*/
-static int ea_calc_size(struct gfs2_sbd *sdp, struct gfs2_ea_request *er,
+static int ea_calc_size(struct gfs2_sbd *sdp, unsigned int nsize, size_t dsize,
unsigned int *size)
{
- *size = GFS2_EAREQ_SIZE_STUFFED(er);
- if (*size <= sdp->sd_jbsize)
+ unsigned int jbsize = sdp->sd_jbsize;
+
+ /* Stuffed */
+ *size = ALIGN(sizeof(struct gfs2_ea_header) + nsize + dsize, 8);
+
+ if (*size <= jbsize)
return 1;
- *size = GFS2_EAREQ_SIZE_UNSTUFFED(sdp, er);
+ /* Unstuffed */
+ *size = ALIGN(sizeof(struct gfs2_ea_header) + nsize +
+ (sizeof(__be64) * DIV_ROUND_UP(dsize, jbsize)), 8);
return 0;
}
-static int ea_check_size(struct gfs2_sbd *sdp, struct gfs2_ea_request *er)
+static int ea_check_size(struct gfs2_sbd *sdp, unsigned int nsize, size_t dsize)
{
unsigned int size;
- if (er->er_data_len > GFS2_EA_MAX_DATA_LEN)
+ if (dsize > GFS2_EA_MAX_DATA_LEN)
return -ERANGE;
- ea_calc_size(sdp, er, &size);
+ ea_calc_size(sdp, nsize, dsize, &size);
/* This can only happen with 512 byte blocks */
if (size > sdp->sd_jbsize)
@@ -151,7 +156,9 @@ out:
}
struct ea_find {
- struct gfs2_ea_request *ef_er;
+ int type;
+ const char *name;
+ size_t namel;
struct gfs2_ea_location *ef_el;
};
@@ -160,14 +167,13 @@ static int ea_find_i(struct gfs2_inode *ip, struct buffer_head *bh,
void *private)
{
struct ea_find *ef = private;
- struct gfs2_ea_request *er = ef->ef_er;
if (ea->ea_type == GFS2_EATYPE_UNUSED)
return 0;
- if (ea->ea_type == er->er_type) {
- if (ea->ea_name_len == er->er_name_len &&
- !memcmp(GFS2_EA2NAME(ea), er->er_name, ea->ea_name_len)) {
+ if (ea->ea_type == ef->type) {
+ if (ea->ea_name_len == ef->namel &&
+ !memcmp(GFS2_EA2NAME(ea), ef->name, ea->ea_name_len)) {
struct gfs2_ea_location *el = ef->ef_el;
get_bh(bh);
el->el_bh = bh;
@@ -180,13 +186,15 @@ static int ea_find_i(struct gfs2_inode *ip, struct buffer_head *bh,
return 0;
}
-int gfs2_ea_find(struct gfs2_inode *ip, struct gfs2_ea_request *er,
+int gfs2_ea_find(struct gfs2_inode *ip, int type, const char *name,
struct gfs2_ea_location *el)
{
struct ea_find ef;
int error;
- ef.ef_er = er;
+ ef.type = type;
+ ef.name = name;
+ ef.namel = strlen(name);
ef.ef_el = el;
memset(el, 0, sizeof(struct gfs2_ea_location));
@@ -344,6 +352,20 @@ struct ea_list {
unsigned int ei_size;
};
+static inline unsigned int gfs2_ea_strlen(struct gfs2_ea_header *ea)
+{
+ switch (ea->ea_type) {
+ case GFS2_EATYPE_USR:
+ return 5 + ea->ea_name_len + 1;
+ case GFS2_EATYPE_SYS:
+ return 7 + ea->ea_name_len + 1;
+ case GFS2_EATYPE_SECURITY:
+ return 9 + ea->ea_name_len + 1;
+ default:
+ return 0;
+ }
+}
+
static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh,
struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
void *private)
@@ -392,21 +414,25 @@ static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh,
}
/**
- * gfs2_ea_list -
- * @ip:
- * @er:
+ * gfs2_listxattr - List gfs2 extended attributes
+ * @dentry: The dentry whose inode we are interested in
+ * @buffer: The buffer to write the results
+ * @size: The size of the buffer
*
* Returns: actual size of data on success, -errno on error
*/
-int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
{
+ struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
+ struct gfs2_ea_request er;
struct gfs2_holder i_gh;
int error;
- if (!er->er_data || !er->er_data_len) {
- er->er_data = NULL;
- er->er_data_len = 0;
+ memset(&er, 0, sizeof(struct gfs2_ea_request));
+ if (size) {
+ er.er_data = buffer;
+ er.er_data_len = size;
}
error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
@@ -414,7 +440,7 @@ int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er)
return error;
if (ip->i_eattr) {
- struct ea_list ei = { .ei_er = er, .ei_size = 0 };
+ struct ea_list ei = { .ei_er = &er, .ei_size = 0 };
error = ea_foreach(ip, ea_list_i, &ei);
if (!error)
@@ -491,84 +517,61 @@ out:
}
int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el,
- char *data)
+ char *data, size_t size)
{
+ int ret;
+ size_t len = GFS2_EA_DATA_LEN(el->el_ea);
+ if (len > size)
+ return -ERANGE;
+
if (GFS2_EA_IS_STUFFED(el->el_ea)) {
- memcpy(data, GFS2_EA2DATA(el->el_ea), GFS2_EA_DATA_LEN(el->el_ea));
- return 0;
- } else
- return ea_get_unstuffed(ip, el->el_ea, data);
+ memcpy(data, GFS2_EA2DATA(el->el_ea), len);
+ return len;
+ }
+ ret = ea_get_unstuffed(ip, el->el_ea, data);
+ if (ret < 0)
+ return ret;
+ return len;
}
/**
- * gfs2_ea_get_i -
- * @ip: The GFS2 inode
- * @er: The request structure
+ * gfs2_xattr_get - Get a GFS2 extended attribute
+ * @inode: The inode
+ * @type: The type of extended attribute
+ * @name: The name of the extended attribute
+ * @buffer: The buffer to write the result into
+ * @size: The size of the buffer
*
* Returns: actual size of data on success, -errno on error
*/
-int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+int gfs2_xattr_get(struct inode *inode, int type, const char *name,
+ void *buffer, size_t size)
{
+ struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_ea_location el;
int error;
if (!ip->i_eattr)
return -ENODATA;
+ if (strlen(name) > GFS2_EA_MAX_NAME_LEN)
+ return -EINVAL;
- error = gfs2_ea_find(ip, er, &el);
+ error = gfs2_ea_find(ip, type, name, &el);
if (error)
return error;
if (!el.el_ea)
return -ENODATA;
-
- if (er->er_data_len) {
- if (GFS2_EA_DATA_LEN(el.el_ea) > er->er_data_len)
- error = -ERANGE;
- else
- error = gfs2_ea_get_copy(ip, &el, er->er_data);
- }
- if (!error)
+ if (size)
+ error = gfs2_ea_get_copy(ip, &el, buffer, size);
+ else
error = GFS2_EA_DATA_LEN(el.el_ea);
-
brelse(el.el_bh);
return error;
}
/**
- * gfs2_ea_get -
- * @ip: The GFS2 inode
- * @er: The request structure
- *
- * Returns: actual size of data on success, -errno on error
- */
-
-int gfs2_ea_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
-{
- struct gfs2_holder i_gh;
- int error;
-
- if (!er->er_name_len ||
- er->er_name_len > GFS2_EA_MAX_NAME_LEN)
- return -EINVAL;
- if (!er->er_data || !er->er_data_len) {
- er->er_data = NULL;
- er->er_data_len = 0;
- }
-
- error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
- if (error)
- return error;
-
- error = gfs2_ea_ops[er->er_type]->eo_get(ip, er);
-
- gfs2_glock_dq_uninit(&i_gh);
-
- return error;
-}
-
-/**
* ea_alloc_blk - allocates a new block for extended attributes.
* @ip: A pointer to the inode that's getting extended attributes
* @bhp: Pointer to pointer to a struct buffer_head
@@ -713,12 +716,6 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
error = gfs2_meta_inode_buffer(ip, &dibh);
if (!error) {
- if (er->er_flags & GFS2_ERF_MODE) {
- gfs2_assert_withdraw(GFS2_SB(&ip->i_inode),
- (ip->i_inode.i_mode & S_IFMT) ==
- (er->er_mode & S_IFMT));
- ip->i_inode.i_mode = er->er_mode;
- }
ip->i_inode.i_ctime = CURRENT_TIME;
gfs2_trans_add_bh(ip->i_gl, dibh, 1);
gfs2_dinode_out(ip, dibh->b_data);
@@ -762,15 +759,23 @@ static int ea_init_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
* Returns: errno
*/
-static int ea_init(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+static int ea_init(struct gfs2_inode *ip, int type, const char *name,
+ const void *data, size_t size)
{
+ struct gfs2_ea_request er;
unsigned int jbsize = GFS2_SB(&ip->i_inode)->sd_jbsize;
unsigned int blks = 1;
- if (GFS2_EAREQ_SIZE_STUFFED(er) > jbsize)
- blks += DIV_ROUND_UP(er->er_data_len, jbsize);
+ er.er_type = type;
+ er.er_name = name;
+ er.er_name_len = strlen(name);
+ er.er_data = (void *)data;
+ er.er_data_len = size;
+
+ if (GFS2_EAREQ_SIZE_STUFFED(&er) > jbsize)
+ blks += DIV_ROUND_UP(er.er_data_len, jbsize);
- return ea_alloc_skeleton(ip, er, blks, ea_init_i, NULL);
+ return ea_alloc_skeleton(ip, &er, blks, ea_init_i, NULL);
}
static struct gfs2_ea_header *ea_split_ea(struct gfs2_ea_header *ea)
@@ -848,12 +853,6 @@ static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh,
error = gfs2_meta_inode_buffer(ip, &dibh);
if (error)
goto out;
-
- if (er->er_flags & GFS2_ERF_MODE) {
- gfs2_assert_withdraw(GFS2_SB(&ip->i_inode),
- (ip->i_inode.i_mode & S_IFMT) == (er->er_mode & S_IFMT));
- ip->i_inode.i_mode = er->er_mode;
- }
ip->i_inode.i_ctime = CURRENT_TIME;
gfs2_trans_add_bh(ip->i_gl, dibh, 1);
gfs2_dinode_out(ip, dibh->b_data);
@@ -894,7 +893,8 @@ static int ea_set_simple(struct gfs2_inode *ip, struct buffer_head *bh,
int stuffed;
int error;
- stuffed = ea_calc_size(GFS2_SB(&ip->i_inode), es->es_er, &size);
+ stuffed = ea_calc_size(GFS2_SB(&ip->i_inode), es->es_er->er_name_len,
+ es->es_er->er_data_len, &size);
if (ea->ea_type == GFS2_EATYPE_UNUSED) {
if (GFS2_EA_REC_LEN(ea) < size)
@@ -1005,15 +1005,22 @@ out:
return error;
}
-static int ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
- struct gfs2_ea_location *el)
+static int ea_set_i(struct gfs2_inode *ip, int type, const char *name,
+ const void *value, size_t size, struct gfs2_ea_location *el)
{
+ struct gfs2_ea_request er;
struct ea_set es;
unsigned int blks = 2;
int error;
+ er.er_type = type;
+ er.er_name = name;
+ er.er_data = (void *)value;
+ er.er_name_len = strlen(name);
+ er.er_data_len = size;
+
memset(&es, 0, sizeof(struct ea_set));
- es.es_er = er;
+ es.es_er = &er;
es.es_el = el;
error = ea_foreach(ip, ea_set_simple, &es);
@@ -1024,10 +1031,10 @@ static int ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
if (!(ip->i_diskflags & GFS2_DIF_EA_INDIRECT))
blks++;
- if (GFS2_EAREQ_SIZE_STUFFED(er) > GFS2_SB(&ip->i_inode)->sd_jbsize)
- blks += DIV_ROUND_UP(er->er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize);
+ if (GFS2_EAREQ_SIZE_STUFFED(&er) > GFS2_SB(&ip->i_inode)->sd_jbsize)
+ blks += DIV_ROUND_UP(er.er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize);
- return ea_alloc_skeleton(ip, er, blks, ea_set_block, el);
+ return ea_alloc_skeleton(ip, &er, blks, ea_set_block, el);
}
static int ea_set_remove_unstuffed(struct gfs2_inode *ip,
@@ -1039,75 +1046,7 @@ static int ea_set_remove_unstuffed(struct gfs2_inode *ip,
GFS2_EA2NEXT(el->el_prev) == el->el_ea);
}
- return ea_remove_unstuffed(ip, el->el_bh, el->el_ea, el->el_prev,0);
-}
-
-int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
-{
- struct gfs2_ea_location el;
- int error;
-
- if (!ip->i_eattr) {
- if (er->er_flags & XATTR_REPLACE)
- return -ENODATA;
- return ea_init(ip, er);
- }
-
- error = gfs2_ea_find(ip, er, &el);
- if (error)
- return error;
-
- if (el.el_ea) {
- if (ip->i_diskflags & GFS2_DIF_APPENDONLY) {
- brelse(el.el_bh);
- return -EPERM;
- }
-
- error = -EEXIST;
- if (!(er->er_flags & XATTR_CREATE)) {
- int unstuffed = !GFS2_EA_IS_STUFFED(el.el_ea);
- error = ea_set_i(ip, er, &el);
- if (!error && unstuffed)
- ea_set_remove_unstuffed(ip, &el);
- }
-
- brelse(el.el_bh);
- } else {
- error = -ENODATA;
- if (!(er->er_flags & XATTR_REPLACE))
- error = ea_set_i(ip, er, NULL);
- }
-
- return error;
-}
-
-int gfs2_ea_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
-{
- struct gfs2_holder i_gh;
- int error;
-
- if (!er->er_name_len || er->er_name_len > GFS2_EA_MAX_NAME_LEN)
- return -EINVAL;
- if (!er->er_data || !er->er_data_len) {
- er->er_data = NULL;
- er->er_data_len = 0;
- }
- error = ea_check_size(GFS2_SB(&ip->i_inode), er);
- if (error)
- return error;
-
- error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
- if (error)
- return error;
-
- if (IS_IMMUTABLE(&ip->i_inode))
- error = -EPERM;
- else
- error = gfs2_ea_ops[er->er_type]->eo_set(ip, er);
-
- gfs2_glock_dq_uninit(&i_gh);
-
- return error;
+ return ea_remove_unstuffed(ip, el->el_bh, el->el_ea, el->el_prev, 0);
}
static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
@@ -1131,8 +1070,9 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
if (GFS2_EA_IS_LAST(ea))
prev->ea_flags |= GFS2_EAFLAG_LAST;
- } else
+ } else {
ea->ea_type = GFS2_EATYPE_UNUSED;
+ }
error = gfs2_meta_inode_buffer(ip, &dibh);
if (!error) {
@@ -1147,15 +1087,29 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
return error;
}
-int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+/**
+ * gfs2_xattr_remove - Remove a GFS2 extended attribute
+ * @inode: The inode
+ * @type: The type of the extended attribute
+ * @name: The name of the extended attribute
+ *
+ * This is not called directly by the VFS since we use the (common)
+ * scheme of making a "set with NULL data" mean a remove request. Note
+ * that this is different from a set with zero length data.
+ *
+ * Returns: 0, or errno on failure
+ */
+
+static int gfs2_xattr_remove(struct inode *inode, int type, const char *name)
{
+ struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_ea_location el;
int error;
if (!ip->i_eattr)
return -ENODATA;
- error = gfs2_ea_find(ip, er, &el);
+ error = gfs2_ea_find(ip, type, name, &el);
if (error)
return error;
if (!el.el_ea)
@@ -1164,8 +1118,7 @@ int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
if (GFS2_EA_IS_STUFFED(el.el_ea))
error = ea_remove_stuffed(ip, &el);
else
- error = ea_remove_unstuffed(ip, el.el_bh, el.el_ea, el.el_prev,
- 0);
+ error = ea_remove_unstuffed(ip, el.el_bh, el.el_ea, el.el_prev, 0);
brelse(el.el_bh);
@@ -1173,31 +1126,70 @@ int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
}
/**
- * gfs2_ea_remove - sets (or creates or replaces) an extended attribute
- * @ip: pointer to the inode of the target file
- * @er: request information
+ * gfs2_xattr_set - Set (or remove) a GFS2 extended attribute
+ * @inode: The inode
+ * @type: The type of the extended attribute
+ * @name: The name of the extended attribute
+ * @value: The value of the extended attribute (NULL for remove)
+ * @size: The size of the @value argument
+ * @flags: Create or Replace
*
- * Returns: errno
+ * See gfs2_xattr_remove() for details of the removal of xattrs.
+ *
+ * Returns: 0 or errno on failure
*/
-int gfs2_ea_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+int gfs2_xattr_set(struct inode *inode, int type, const char *name,
+ const void *value, size_t size, int flags)
{
- struct gfs2_holder i_gh;
+ struct gfs2_sbd *sdp = GFS2_SB(inode);
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_ea_location el;
+ unsigned int namel = strlen(name);
int error;
- if (!er->er_name_len || er->er_name_len > GFS2_EA_MAX_NAME_LEN)
- return -EINVAL;
+ if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
+ return -EPERM;
+ if (namel > GFS2_EA_MAX_NAME_LEN)
+ return -ERANGE;
- error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
+ if (value == NULL)
+ return gfs2_xattr_remove(inode, type, name);
+
+ if (ea_check_size(sdp, namel, size))
+ return -ERANGE;
+
+ if (!ip->i_eattr) {
+ if (flags & XATTR_REPLACE)
+ return -ENODATA;
+ return ea_init(ip, type, name, value, size);
+ }
+
+ error = gfs2_ea_find(ip, type, name, &el);
if (error)
return error;
- if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode))
- error = -EPERM;
- else
- error = gfs2_ea_ops[er->er_type]->eo_remove(ip, er);
+ if (el.el_ea) {
+ if (ip->i_diskflags & GFS2_DIF_APPENDONLY) {
+ brelse(el.el_bh);
+ return -EPERM;
+ }
- gfs2_glock_dq_uninit(&i_gh);
+ error = -EEXIST;
+ if (!(flags & XATTR_CREATE)) {
+ int unstuffed = !GFS2_EA_IS_STUFFED(el.el_ea);
+ error = ea_set_i(ip, type, name, value, size, &el);
+ if (!error && unstuffed)
+ ea_set_remove_unstuffed(ip, &el);
+ }
+
+ brelse(el.el_bh);
+ return error;
+ }
+
+ error = -ENODATA;
+ if (!(flags & XATTR_REPLACE))
+ error = ea_set_i(ip, type, name, value, size, NULL);
return error;
}
@@ -1503,3 +1495,64 @@ out_alloc:
return error;
}
+static int gfs2_xattr_user_get(struct inode *inode, const char *name,
+ void *buffer, size_t size)
+{
+ return gfs2_xattr_get(inode, GFS2_EATYPE_USR, name, buffer, size);
+}
+
+static int gfs2_xattr_user_set(struct inode *inode, const char *name,
+ const void *value, size_t size, int flags)
+{
+ return gfs2_xattr_set(inode, GFS2_EATYPE_USR, name, value, size, flags);
+}
+
+static int gfs2_xattr_system_get(struct inode *inode, const char *name,
+ void *buffer, size_t size)
+{
+ return gfs2_xattr_get(inode, GFS2_EATYPE_SYS, name, buffer, size);
+}
+
+static int gfs2_xattr_system_set(struct inode *inode, const char *name,
+ const void *value, size_t size, int flags)
+{
+ return gfs2_xattr_set(inode, GFS2_EATYPE_SYS, name, value, size, flags);
+}
+
+static int gfs2_xattr_security_get(struct inode *inode, const char *name,
+ void *buffer, size_t size)
+{
+ return gfs2_xattr_get(inode, GFS2_EATYPE_SECURITY, name, buffer, size);
+}
+
+static int gfs2_xattr_security_set(struct inode *inode, const char *name,
+ const void *value, size_t size, int flags)
+{
+ return gfs2_xattr_set(inode, GFS2_EATYPE_SECURITY, name, value, size, flags);
+}
+
+static struct xattr_handler gfs2_xattr_user_handler = {
+ .prefix = XATTR_USER_PREFIX,
+ .get = gfs2_xattr_user_get,
+ .set = gfs2_xattr_user_set,
+};
+
+static struct xattr_handler gfs2_xattr_security_handler = {
+ .prefix = XATTR_SECURITY_PREFIX,
+ .get = gfs2_xattr_security_get,
+ .set = gfs2_xattr_security_set,
+};
+
+static struct xattr_handler gfs2_xattr_system_handler = {
+ .prefix = XATTR_SYSTEM_PREFIX,
+ .get = gfs2_xattr_system_get,
+ .set = gfs2_xattr_system_set,
+};
+
+struct xattr_handler *gfs2_xattr_handlers[] = {
+ &gfs2_xattr_user_handler,
+ &gfs2_xattr_security_handler,
+ &gfs2_xattr_system_handler,
+ NULL,
+};
+
diff --git a/fs/gfs2/eattr.h b/fs/gfs2/xattr.h
index c82dbe01d71..cbdfd774373 100644
--- a/fs/gfs2/eattr.h
+++ b/fs/gfs2/xattr.h
@@ -19,7 +19,7 @@ struct iattr;
#define GFS2_EA_SIZE(ea) \
ALIGN(sizeof(struct gfs2_ea_header) + (ea)->ea_name_len + \
((GFS2_EA_IS_STUFFED(ea)) ? GFS2_EA_DATA_LEN(ea) : \
- (sizeof(__be64) * (ea)->ea_num_ptrs)), 8)
+ (sizeof(__be64) * (ea)->ea_num_ptrs)), 8)
#define GFS2_EA_IS_STUFFED(ea) (!(ea)->ea_num_ptrs)
#define GFS2_EA_IS_LAST(ea) ((ea)->ea_flags & GFS2_EAFLAG_LAST)
@@ -27,10 +27,6 @@ ALIGN(sizeof(struct gfs2_ea_header) + (ea)->ea_name_len + \
#define GFS2_EAREQ_SIZE_STUFFED(er) \
ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + (er)->er_data_len, 8)
-#define GFS2_EAREQ_SIZE_UNSTUFFED(sdp, er) \
-ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + \
- sizeof(__be64) * DIV_ROUND_UP((er)->er_data_len, (sdp)->sd_jbsize), 8)
-
#define GFS2_EA2NAME(ea) ((char *)((struct gfs2_ea_header *)(ea) + 1))
#define GFS2_EA2DATA(ea) (GFS2_EA2NAME(ea) + (ea)->ea_name_len)
@@ -43,16 +39,12 @@ ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + \
#define GFS2_EA_BH2FIRST(bh) \
((struct gfs2_ea_header *)((bh)->b_data + sizeof(struct gfs2_meta_header)))
-#define GFS2_ERF_MODE 0x80000000
-
struct gfs2_ea_request {
const char *er_name;
char *er_data;
unsigned int er_name_len;
unsigned int er_data_len;
unsigned int er_type; /* GFS2_EATYPE_... */
- int er_flags;
- mode_t er_mode;
};
struct gfs2_ea_location {
@@ -61,40 +53,20 @@ struct gfs2_ea_location {
struct gfs2_ea_header *el_prev;
};
-int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er);
-int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er);
-int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er);
-
-int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er);
-int gfs2_ea_get(struct gfs2_inode *ip, struct gfs2_ea_request *er);
-int gfs2_ea_set(struct gfs2_inode *ip, struct gfs2_ea_request *er);
-int gfs2_ea_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er);
-
-int gfs2_ea_dealloc(struct gfs2_inode *ip);
+extern int gfs2_xattr_get(struct inode *inode, int type, const char *name,
+ void *buffer, size_t size);
+extern int gfs2_xattr_set(struct inode *inode, int type, const char *name,
+ const void *value, size_t size, int flags);
+extern ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size);
+extern int gfs2_ea_dealloc(struct gfs2_inode *ip);
/* Exported to acl.c */
-int gfs2_ea_find(struct gfs2_inode *ip,
- struct gfs2_ea_request *er,
- struct gfs2_ea_location *el);
-int gfs2_ea_get_copy(struct gfs2_inode *ip,
- struct gfs2_ea_location *el,
- char *data);
-int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
- struct iattr *attr, char *data);
-
-static inline unsigned int gfs2_ea_strlen(struct gfs2_ea_header *ea)
-{
- switch (ea->ea_type) {
- case GFS2_EATYPE_USR:
- return 5 + ea->ea_name_len + 1;
- case GFS2_EATYPE_SYS:
- return 7 + ea->ea_name_len + 1;
- case GFS2_EATYPE_SECURITY:
- return 9 + ea->ea_name_len + 1;
- default:
- return 0;
- }
-}
+extern int gfs2_ea_find(struct gfs2_inode *ip, int type, const char *name,
+ struct gfs2_ea_location *el);
+extern int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el,
+ char *data, size_t size);
+extern int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
+ struct iattr *attr, char *data);
#endif /* __EATTR_DOT_H__ */
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 941c8425c10..a93b885311d 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -44,6 +44,7 @@ static const struct inode_operations hugetlbfs_dir_inode_operations;
static const struct inode_operations hugetlbfs_inode_operations;
static struct backing_dev_info hugetlbfs_backing_dev_info = {
+ .name = "hugetlbfs",
.ra_pages = 0, /* No readahead */
.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
};
@@ -935,26 +936,28 @@ static int can_do_hugetlb_shm(void)
return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group);
}
-struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag)
+struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag,
+ struct user_struct **user)
{
int error = -ENOMEM;
- int unlock_shm = 0;
struct file *file;
struct inode *inode;
struct dentry *dentry, *root;
struct qstr quick_string;
- struct user_struct *user = current_user();
+ *user = NULL;
if (!hugetlbfs_vfsmount)
return ERR_PTR(-ENOENT);
if (!can_do_hugetlb_shm()) {
- if (user_shm_lock(size, user)) {
- unlock_shm = 1;
+ *user = current_user();
+ if (user_shm_lock(size, *user)) {
WARN_ONCE(1,
"Using mlock ulimits for SHM_HUGETLB deprecated\n");
- } else
+ } else {
+ *user = NULL;
return ERR_PTR(-EPERM);
+ }
}
root = hugetlbfs_vfsmount->mnt_root;
@@ -996,8 +999,10 @@ out_inode:
out_dentry:
dput(dentry);
out_shm_unlock:
- if (unlock_shm)
- user_shm_unlock(size, user);
+ if (*user) {
+ user_shm_unlock(size, *user);
+ *user = NULL;
+ }
return ERR_PTR(error);
}
diff --git a/fs/inode.c b/fs/inode.c
index 901bad1e5f1..b2ba83d2c4e 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -120,12 +120,11 @@ static void wake_up_inode(struct inode *inode)
* These are initializations that need to be done on every inode
* allocation as the fields are not initialised by slab allocation.
*/
-struct inode *inode_init_always(struct super_block *sb, struct inode *inode)
+int inode_init_always(struct super_block *sb, struct inode *inode)
{
static const struct address_space_operations empty_aops;
static struct inode_operations empty_iops;
static const struct file_operations empty_fops;
-
struct address_space *const mapping = &inode->i_data;
inode->i_sb = sb;
@@ -152,7 +151,7 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode)
inode->dirtied_when = 0;
if (security_inode_alloc(inode))
- goto out_free_inode;
+ goto out;
/* allocate and initialize an i_integrity */
if (ima_inode_alloc(inode))
@@ -183,9 +182,7 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode)
if (sb->s_bdev) {
struct backing_dev_info *bdi;
- bdi = sb->s_bdev->bd_inode_backing_dev_info;
- if (!bdi)
- bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
+ bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
mapping->backing_dev_info = bdi;
}
inode->i_private = NULL;
@@ -198,16 +195,12 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode)
inode->i_fsnotify_mask = 0;
#endif
- return inode;
+ return 0;
out_free_security:
security_inode_free(inode);
-out_free_inode:
- if (inode->i_sb->s_op->destroy_inode)
- inode->i_sb->s_op->destroy_inode(inode);
- else
- kmem_cache_free(inode_cachep, (inode));
- return NULL;
+out:
+ return -ENOMEM;
}
EXPORT_SYMBOL(inode_init_always);
@@ -220,12 +213,21 @@ static struct inode *alloc_inode(struct super_block *sb)
else
inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL);
- if (inode)
- return inode_init_always(sb, inode);
- return NULL;
+ if (!inode)
+ return NULL;
+
+ if (unlikely(inode_init_always(sb, inode))) {
+ if (inode->i_sb->s_op->destroy_inode)
+ inode->i_sb->s_op->destroy_inode(inode);
+ else
+ kmem_cache_free(inode_cachep, inode);
+ return NULL;
+ }
+
+ return inode;
}
-void destroy_inode(struct inode *inode)
+void __destroy_inode(struct inode *inode)
{
BUG_ON(inode_has_buffers(inode));
ima_inode_free(inode);
@@ -237,13 +239,17 @@ void destroy_inode(struct inode *inode)
if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED)
posix_acl_release(inode->i_default_acl);
#endif
+}
+EXPORT_SYMBOL(__destroy_inode);
+
+void destroy_inode(struct inode *inode)
+{
+ __destroy_inode(inode);
if (inode->i_sb->s_op->destroy_inode)
inode->i_sb->s_op->destroy_inode(inode);
else
kmem_cache_free(inode_cachep, (inode));
}
-EXPORT_SYMBOL(destroy_inode);
-
/*
* These are initializations that only need to be done
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index 61f32f3868c..b0435dd0654 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -456,7 +456,7 @@ int cleanup_journal_tail(journal_t *journal)
{
transaction_t * transaction;
tid_t first_tid;
- unsigned long blocknr, freed;
+ unsigned int blocknr, freed;
if (is_journal_aborted(journal))
return 1;
@@ -502,8 +502,8 @@ int cleanup_journal_tail(journal_t *journal)
freed = freed + journal->j_last - journal->j_first;
jbd_debug(1,
- "Cleaning journal tail from %d to %d (offset %lu), "
- "freeing %lu\n",
+ "Cleaning journal tail from %d to %d (offset %u), "
+ "freeing %u\n",
journal->j_tail_sequence, first_tid, blocknr, freed);
journal->j_free += freed;
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 618e21c0b7a..4bd882548c4 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -308,7 +308,7 @@ void journal_commit_transaction(journal_t *journal)
int bufs;
int flags;
int err;
- unsigned long blocknr;
+ unsigned int blocknr;
ktime_t start_time;
u64 commit_time;
char *tagp = NULL;
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 737f7246a4b..bd3c073b485 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -276,7 +276,7 @@ static void journal_kill_thread(journal_t *journal)
int journal_write_metadata_buffer(transaction_t *transaction,
struct journal_head *jh_in,
struct journal_head **jh_out,
- unsigned long blocknr)
+ unsigned int blocknr)
{
int need_copy_out = 0;
int done_copy_out = 0;
@@ -287,6 +287,7 @@ int journal_write_metadata_buffer(transaction_t *transaction,
struct page *new_page;
unsigned int new_offset;
struct buffer_head *bh_in = jh2bh(jh_in);
+ journal_t *journal = transaction->t_journal;
/*
* The buffer really shouldn't be locked: only the current committing
@@ -300,6 +301,11 @@ int journal_write_metadata_buffer(transaction_t *transaction,
J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));
new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL);
+ /* keep subsequent assertions sane */
+ new_bh->b_state = 0;
+ init_buffer(new_bh, NULL, NULL);
+ atomic_set(&new_bh->b_count, 1);
+ new_jh = journal_add_journal_head(new_bh); /* This sleeps */
/*
* If a new transaction has already done a buffer copy-out, then
@@ -361,14 +367,6 @@ repeat:
kunmap_atomic(mapped_data, KM_USER0);
}
- /* keep subsequent assertions sane */
- new_bh->b_state = 0;
- init_buffer(new_bh, NULL, NULL);
- atomic_set(&new_bh->b_count, 1);
- jbd_unlock_bh_state(bh_in);
-
- new_jh = journal_add_journal_head(new_bh); /* This sleeps */
-
set_bh_page(new_bh, new_page, new_offset);
new_jh->b_transaction = NULL;
new_bh->b_size = jh2bh(jh_in)->b_size;
@@ -385,7 +383,11 @@ repeat:
* copying is moved to the transaction's shadow queue.
*/
JBUFFER_TRACE(jh_in, "file as BJ_Shadow");
- journal_file_buffer(jh_in, transaction, BJ_Shadow);
+ spin_lock(&journal->j_list_lock);
+ __journal_file_buffer(jh_in, transaction, BJ_Shadow);
+ spin_unlock(&journal->j_list_lock);
+ jbd_unlock_bh_state(bh_in);
+
JBUFFER_TRACE(new_jh, "file as BJ_IO");
journal_file_buffer(new_jh, transaction, BJ_IO);
@@ -565,9 +567,9 @@ int log_wait_commit(journal_t *journal, tid_t tid)
* Log buffer allocation routines:
*/
-int journal_next_log_block(journal_t *journal, unsigned long *retp)
+int journal_next_log_block(journal_t *journal, unsigned int *retp)
{
- unsigned long blocknr;
+ unsigned int blocknr;
spin_lock(&journal->j_state_lock);
J_ASSERT(journal->j_free > 1);
@@ -588,11 +590,11 @@ int journal_next_log_block(journal_t *journal, unsigned long *retp)
* this is a no-op. If needed, we can use j_blk_offset - everything is
* ready.
*/
-int journal_bmap(journal_t *journal, unsigned long blocknr,
- unsigned long *retp)
+int journal_bmap(journal_t *journal, unsigned int blocknr,
+ unsigned int *retp)
{
int err = 0;
- unsigned long ret;
+ unsigned int ret;
if (journal->j_inode) {
ret = bmap(journal->j_inode, blocknr);
@@ -602,7 +604,7 @@ int journal_bmap(journal_t *journal, unsigned long blocknr,
char b[BDEVNAME_SIZE];
printk(KERN_ALERT "%s: journal block not found "
- "at offset %lu on %s\n",
+ "at offset %u on %s\n",
__func__,
blocknr,
bdevname(journal->j_dev, b));
@@ -628,7 +630,7 @@ int journal_bmap(journal_t *journal, unsigned long blocknr,
struct journal_head *journal_get_descriptor_buffer(journal_t *journal)
{
struct buffer_head *bh;
- unsigned long blocknr;
+ unsigned int blocknr;
int err;
err = journal_next_log_block(journal, &blocknr);
@@ -772,7 +774,7 @@ journal_t * journal_init_inode (struct inode *inode)
journal_t *journal = journal_init_common();
int err;
int n;
- unsigned long blocknr;
+ unsigned int blocknr;
if (!journal)
return NULL;
@@ -844,10 +846,16 @@ static void journal_fail_superblock (journal_t *journal)
static int journal_reset(journal_t *journal)
{
journal_superblock_t *sb = journal->j_superblock;
- unsigned long first, last;
+ unsigned int first, last;
first = be32_to_cpu(sb->s_first);
last = be32_to_cpu(sb->s_maxlen);
+ if (first + JFS_MIN_JOURNAL_BLOCKS > last + 1) {
+ printk(KERN_ERR "JBD: Journal too short (blocks %u-%u).\n",
+ first, last);
+ journal_fail_superblock(journal);
+ return -EINVAL;
+ }
journal->j_first = first;
journal->j_last = last;
@@ -877,7 +885,7 @@ static int journal_reset(journal_t *journal)
**/
int journal_create(journal_t *journal)
{
- unsigned long blocknr;
+ unsigned int blocknr;
struct buffer_head *bh;
journal_superblock_t *sb;
int i, err;
@@ -961,14 +969,14 @@ void journal_update_superblock(journal_t *journal, int wait)
if (sb->s_start == 0 && journal->j_tail_sequence ==
journal->j_transaction_sequence) {
jbd_debug(1,"JBD: Skipping superblock update on recovered sb "
- "(start %ld, seq %d, errno %d)\n",
+ "(start %u, seq %d, errno %d)\n",
journal->j_tail, journal->j_tail_sequence,
journal->j_errno);
goto out;
}
spin_lock(&journal->j_state_lock);
- jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n",
+ jbd_debug(1,"JBD: updating superblock (start %u, seq %d, errno %d)\n",
journal->j_tail, journal->j_tail_sequence, journal->j_errno);
sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
@@ -1363,7 +1371,7 @@ int journal_flush(journal_t *journal)
{
int err = 0;
transaction_t *transaction = NULL;
- unsigned long old_tail;
+ unsigned int old_tail;
spin_lock(&journal->j_state_lock);
diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c
index db5e982c5dd..cb1a49ae605 100644
--- a/fs/jbd/recovery.c
+++ b/fs/jbd/recovery.c
@@ -70,7 +70,7 @@ static int do_readahead(journal_t *journal, unsigned int start)
{
int err;
unsigned int max, nbufs, next;
- unsigned long blocknr;
+ unsigned int blocknr;
struct buffer_head *bh;
struct buffer_head * bufs[MAXBUF];
@@ -132,7 +132,7 @@ static int jread(struct buffer_head **bhp, journal_t *journal,
unsigned int offset)
{
int err;
- unsigned long blocknr;
+ unsigned int blocknr;
struct buffer_head *bh;
*bhp = NULL;
@@ -314,7 +314,7 @@ static int do_one_pass(journal_t *journal,
struct recovery_info *info, enum passtype pass)
{
unsigned int first_commit_ID, next_commit_ID;
- unsigned long next_log_block;
+ unsigned int next_log_block;
int err, success = 0;
journal_superblock_t * sb;
journal_header_t * tmp;
@@ -367,14 +367,14 @@ static int do_one_pass(journal_t *journal,
if (tid_geq(next_commit_ID, info->end_transaction))
break;
- jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n",
+ jbd_debug(2, "Scanning for sequence ID %u at %u/%u\n",
next_commit_ID, next_log_block, journal->j_last);
/* Skip over each chunk of the transaction looking
* either the next descriptor block or the final commit
* record. */
- jbd_debug(3, "JBD: checking block %ld\n", next_log_block);
+ jbd_debug(3, "JBD: checking block %u\n", next_log_block);
err = jread(&bh, journal, next_log_block);
if (err)
goto failed;
@@ -429,7 +429,7 @@ static int do_one_pass(journal_t *journal,
tagp = &bh->b_data[sizeof(journal_header_t)];
while ((tagp - bh->b_data +sizeof(journal_block_tag_t))
<= journal->j_blocksize) {
- unsigned long io_block;
+ unsigned int io_block;
tag = (journal_block_tag_t *) tagp;
flags = be32_to_cpu(tag->t_flags);
@@ -443,10 +443,10 @@ static int do_one_pass(journal_t *journal,
success = err;
printk (KERN_ERR
"JBD: IO error %d recovering "
- "block %ld in log\n",
+ "block %u in log\n",
err, io_block);
} else {
- unsigned long blocknr;
+ unsigned int blocknr;
J_ASSERT(obh != NULL);
blocknr = be32_to_cpu(tag->t_blocknr);
@@ -581,7 +581,7 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
max = be32_to_cpu(header->r_count);
while (offset < max) {
- unsigned long blocknr;
+ unsigned int blocknr;
int err;
blocknr = be32_to_cpu(* ((__be32 *) (bh->b_data+offset)));
diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c
index da6cd9bdaab..ad717328343 100644
--- a/fs/jbd/revoke.c
+++ b/fs/jbd/revoke.c
@@ -101,7 +101,7 @@ struct jbd_revoke_record_s
{
struct list_head hash;
tid_t sequence; /* Used for recovery only */
- unsigned long blocknr;
+ unsigned int blocknr;
};
@@ -126,7 +126,7 @@ static void flush_descriptor(journal_t *, struct journal_head *, int, int);
/* Utility functions to maintain the revoke table */
/* Borrowed from buffer.c: this is a tried and tested block hash function */
-static inline int hash(journal_t *journal, unsigned long block)
+static inline int hash(journal_t *journal, unsigned int block)
{
struct jbd_revoke_table_s *table = journal->j_revoke;
int hash_shift = table->hash_shift;
@@ -136,7 +136,7 @@ static inline int hash(journal_t *journal, unsigned long block)
(block << (hash_shift - 12))) & (table->hash_size - 1);
}
-static int insert_revoke_hash(journal_t *journal, unsigned long blocknr,
+static int insert_revoke_hash(journal_t *journal, unsigned int blocknr,
tid_t seq)
{
struct list_head *hash_list;
@@ -166,7 +166,7 @@ oom:
/* Find a revoke record in the journal's hash table. */
static struct jbd_revoke_record_s *find_revoke_record(journal_t *journal,
- unsigned long blocknr)
+ unsigned int blocknr)
{
struct list_head *hash_list;
struct jbd_revoke_record_s *record;
@@ -332,7 +332,7 @@ void journal_destroy_revoke(journal_t *journal)
* by one.
*/
-int journal_revoke(handle_t *handle, unsigned long blocknr,
+int journal_revoke(handle_t *handle, unsigned int blocknr,
struct buffer_head *bh_in)
{
struct buffer_head *bh = NULL;
@@ -401,7 +401,7 @@ int journal_revoke(handle_t *handle, unsigned long blocknr,
}
}
- jbd_debug(2, "insert revoke for block %lu, bh_in=%p\n", blocknr, bh_in);
+ jbd_debug(2, "insert revoke for block %u, bh_in=%p\n", blocknr, bh_in);
err = insert_revoke_hash(journal, blocknr,
handle->h_transaction->t_tid);
BUFFER_TRACE(bh_in, "exit");
@@ -644,7 +644,7 @@ static void flush_descriptor(journal_t *journal,
*/
int journal_set_revoke(journal_t *journal,
- unsigned long blocknr,
+ unsigned int blocknr,
tid_t sequence)
{
struct jbd_revoke_record_s *record;
@@ -668,7 +668,7 @@ int journal_set_revoke(journal_t *journal,
*/
int journal_test_revoke(journal_t *journal,
- unsigned long blocknr,
+ unsigned int blocknr,
tid_t sequence)
{
struct jbd_revoke_record_s *record;
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 73242ba7c7b..006f9ad838a 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -56,7 +56,8 @@ get_transaction(journal_t *journal, transaction_t *transaction)
spin_lock_init(&transaction->t_handle_lock);
/* Set up the commit timer for the new transaction. */
- journal->j_commit_timer.expires = round_jiffies(transaction->t_expires);
+ journal->j_commit_timer.expires =
+ round_jiffies_up(transaction->t_expires);
add_timer(&journal->j_commit_timer);
J_ASSERT(journal->j_running_transaction == NULL);
@@ -228,6 +229,8 @@ repeat_locked:
__log_space_left(journal));
spin_unlock(&transaction->t_handle_lock);
spin_unlock(&journal->j_state_lock);
+
+ lock_map_acquire(&handle->h_lockdep_map);
out:
if (unlikely(new_transaction)) /* It's usually NULL */
kfree(new_transaction);
@@ -292,9 +295,6 @@ handle_t *journal_start(journal_t *journal, int nblocks)
handle = ERR_PTR(err);
goto out;
}
-
- lock_map_acquire(&handle->h_lockdep_map);
-
out:
return handle;
}
@@ -416,6 +416,7 @@ int journal_restart(handle_t *handle, int nblocks)
__log_start_commit(journal, transaction->t_tid);
spin_unlock(&journal->j_state_lock);
+ lock_map_release(&handle->h_lockdep_map);
handle->h_buffer_credits = nblocks;
ret = start_this_handle(journal, handle);
return ret;
@@ -489,34 +490,15 @@ void journal_unlock_updates (journal_t *journal)
wake_up(&journal->j_wait_transaction_locked);
}
-/*
- * Report any unexpected dirty buffers which turn up. Normally those
- * indicate an error, but they can occur if the user is running (say)
- * tune2fs to modify the live filesystem, so we need the option of
- * continuing as gracefully as possible. #
- *
- * The caller should already hold the journal lock and
- * j_list_lock spinlock: most callers will need those anyway
- * in order to probe the buffer's journaling state safely.
- */
-static void jbd_unexpected_dirty_buffer(struct journal_head *jh)
+static void warn_dirty_buffer(struct buffer_head *bh)
{
- int jlist;
-
- /* If this buffer is one which might reasonably be dirty
- * --- ie. data, or not part of this journal --- then
- * we're OK to leave it alone, but otherwise we need to
- * move the dirty bit to the journal's own internal
- * JBDDirty bit. */
- jlist = jh->b_jlist;
-
- if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
- jlist == BJ_Shadow || jlist == BJ_Forget) {
- struct buffer_head *bh = jh2bh(jh);
+ char b[BDEVNAME_SIZE];
- if (test_clear_buffer_dirty(bh))
- set_buffer_jbddirty(bh);
- }
+ printk(KERN_WARNING
+ "JBD: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). "
+ "There's a risk of filesystem corruption in case of system "
+ "crash.\n",
+ bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
}
/*
@@ -583,14 +565,16 @@ repeat:
if (jh->b_next_transaction)
J_ASSERT_JH(jh, jh->b_next_transaction ==
transaction);
+ warn_dirty_buffer(bh);
}
/*
* In any case we need to clean the dirty flag and we must
* do it under the buffer lock to be sure we don't race
* with running write-out.
*/
- JBUFFER_TRACE(jh, "Unexpected dirty buffer");
- jbd_unexpected_dirty_buffer(jh);
+ JBUFFER_TRACE(jh, "Journalling dirty buffer");
+ clear_buffer_dirty(bh);
+ set_buffer_jbddirty(bh);
}
unlock_buffer(bh);
@@ -826,6 +810,15 @@ int journal_get_create_access(handle_t *handle, struct buffer_head *bh)
J_ASSERT_JH(jh, buffer_locked(jh2bh(jh)));
if (jh->b_transaction == NULL) {
+ /*
+ * Previous journal_forget() could have left the buffer
+ * with jbddirty bit set because it was being committed. When
+ * the commit finished, we've filed the buffer for
+ * checkpointing and marked it dirty. Now we are reallocating
+ * the buffer so the transaction freeing it must have
+ * committed and so it's safe to clear the dirty bit.
+ */
+ clear_buffer_dirty(jh2bh(jh));
jh->b_transaction = transaction;
/* first access by this transaction */
@@ -1782,8 +1775,13 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
if (jh->b_cp_transaction) {
JBUFFER_TRACE(jh, "on running+cp transaction");
+ /*
+ * We don't want to write the buffer anymore, clear the
+ * bit so that we don't confuse checks in
+ * __journal_file_buffer
+ */
+ clear_buffer_dirty(bh);
__journal_file_buffer(jh, transaction, BJ_Forget);
- clear_buffer_jbddirty(bh);
may_free = 0;
} else {
JBUFFER_TRACE(jh, "on running transaction");
@@ -2041,12 +2039,17 @@ void __journal_file_buffer(struct journal_head *jh,
if (jh->b_transaction && jh->b_jlist == jlist)
return;
- /* The following list of buffer states needs to be consistent
- * with __jbd_unexpected_dirty_buffer()'s handling of dirty
- * state. */
-
if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
jlist == BJ_Shadow || jlist == BJ_Forget) {
+ /*
+ * For metadata buffers, we track dirty bit in buffer_jbddirty
+ * instead of buffer_dirty. We should not see a dirty bit set
+ * here because we clear it in do_get_write_access but e.g.
+ * tune2fs can modify the sb and set the dirty bit at any time
+ * so we try to gracefully handle that.
+ */
+ if (buffer_dirty(bh))
+ warn_dirty_buffer(bh);
if (test_clear_buffer_dirty(bh) ||
test_clear_buffer_jbddirty(bh))
was_dirty = 1;
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 7b4088b2364..26d991ddc1e 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -25,6 +25,7 @@
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
+#include <linux/blkdev.h>
#include <trace/events/jbd2.h>
/*
@@ -133,8 +134,8 @@ static int journal_submit_commit_record(journal_t *journal,
bh->b_end_io = journal_end_buffer_io_sync;
if (journal->j_flags & JBD2_BARRIER &&
- !JBD2_HAS_INCOMPAT_FEATURE(journal,
- JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
+ !JBD2_HAS_INCOMPAT_FEATURE(journal,
+ JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
set_buffer_ordered(bh);
barrier_done = 1;
}
@@ -220,7 +221,6 @@ static int journal_submit_inode_data_buffers(struct address_space *mapping)
.nr_to_write = mapping->nrpages * 2,
.range_start = 0,
.range_end = i_size_read(mapping->host),
- .for_writepages = 1,
};
ret = generic_writepages(mapping, &wbc);
@@ -707,11 +707,13 @@ start_journal_io:
/* Done it all: now write the commit record asynchronously. */
if (JBD2_HAS_INCOMPAT_FEATURE(journal,
- JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
+ JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
err = journal_submit_commit_record(journal, commit_transaction,
&cbh, crc32_sum);
if (err)
__jbd2_journal_abort_hard(journal);
+ if (journal->j_flags & JBD2_BARRIER)
+ blkdev_issue_flush(journal->j_dev, NULL);
}
/*
@@ -834,7 +836,7 @@ wait_for_iobuf:
jbd_debug(3, "JBD: commit phase 5\n");
if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
- JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
+ JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
err = journal_submit_commit_record(journal, commit_transaction,
&cbh, crc32_sum);
if (err)
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index e378cb38397..a8a358bc0f2 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1187,6 +1187,12 @@ static int journal_reset(journal_t *journal)
first = be32_to_cpu(sb->s_first);
last = be32_to_cpu(sb->s_maxlen);
+ if (first + JBD2_MIN_JOURNAL_BLOCKS > last + 1) {
+ printk(KERN_ERR "JBD: Journal too short (blocks %llu-%llu).\n",
+ first, last);
+ journal_fail_superblock(journal);
+ return -EINVAL;
+ }
journal->j_first = first;
journal->j_last = last;
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 6213ac728f3..a0512700542 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -57,7 +57,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
INIT_LIST_HEAD(&transaction->t_private_list);
/* Set up the commit timer for the new transaction. */
- journal->j_commit_timer.expires = round_jiffies(transaction->t_expires);
+ journal->j_commit_timer.expires = round_jiffies_up(transaction->t_expires);
add_timer(&journal->j_commit_timer);
J_ASSERT(journal->j_running_transaction == NULL);
@@ -238,6 +238,8 @@ repeat_locked:
__jbd2_log_space_left(journal));
spin_unlock(&transaction->t_handle_lock);
spin_unlock(&journal->j_state_lock);
+
+ lock_map_acquire(&handle->h_lockdep_map);
out:
if (unlikely(new_transaction)) /* It's usually NULL */
kfree(new_transaction);
@@ -303,8 +305,6 @@ handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
handle = ERR_PTR(err);
goto out;
}
-
- lock_map_acquire(&handle->h_lockdep_map);
out:
return handle;
}
@@ -426,6 +426,7 @@ int jbd2_journal_restart(handle_t *handle, int nblocks)
__jbd2_log_start_commit(journal, transaction->t_tid);
spin_unlock(&journal->j_state_lock);
+ lock_map_release(&handle->h_lockdep_map);
handle->h_buffer_credits = nblocks;
ret = start_this_handle(journal, handle);
return ret;
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 8fcb6239218..7edb62e9741 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -258,7 +258,7 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
return rc;
}
-static int jffs2_check_acl(struct inode *inode, int mask)
+int jffs2_check_acl(struct inode *inode, int mask)
{
struct posix_acl *acl;
int rc;
@@ -274,11 +274,6 @@ static int jffs2_check_acl(struct inode *inode, int mask)
return -EAGAIN;
}
-int jffs2_permission(struct inode *inode, int mask)
-{
- return generic_permission(inode, mask, jffs2_check_acl);
-}
-
int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, int *i_mode)
{
struct posix_acl *acl, *clone;
diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h
index fc929f2a14f..f0ba63e3c36 100644
--- a/fs/jffs2/acl.h
+++ b/fs/jffs2/acl.h
@@ -26,7 +26,7 @@ struct jffs2_acl_header {
#ifdef CONFIG_JFFS2_FS_POSIX_ACL
-extern int jffs2_permission(struct inode *, int);
+extern int jffs2_check_acl(struct inode *, int);
extern int jffs2_acl_chmod(struct inode *);
extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *);
extern int jffs2_init_acl_post(struct inode *);
@@ -36,7 +36,7 @@ extern struct xattr_handler jffs2_acl_default_xattr_handler;
#else
-#define jffs2_permission (NULL)
+#define jffs2_check_acl (NULL)
#define jffs2_acl_chmod(inode) (0)
#define jffs2_init_acl_pre(dir_i,inode,mode) (0)
#define jffs2_init_acl_post(inode) (0)
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 6f60cc910f4..7aa4417e085 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -55,7 +55,7 @@ const struct inode_operations jffs2_dir_inode_operations =
.rmdir = jffs2_rmdir,
.mknod = jffs2_mknod,
.rename = jffs2_rename,
- .permission = jffs2_permission,
+ .check_acl = jffs2_check_acl,
.setattr = jffs2_setattr,
.setxattr = jffs2_setxattr,
.getxattr = jffs2_getxattr,
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index 5edc2bf2058..b7b74e29914 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -56,7 +56,7 @@ const struct file_operations jffs2_file_operations =
const struct inode_operations jffs2_file_inode_operations =
{
- .permission = jffs2_permission,
+ .check_acl = jffs2_check_acl,
.setattr = jffs2_setattr,
.setxattr = jffs2_setxattr,
.getxattr = jffs2_getxattr,
@@ -99,7 +99,7 @@ static int jffs2_do_readpage_nolock (struct inode *inode, struct page *pg)
kunmap(pg);
D2(printk(KERN_DEBUG "readpage finished\n"));
- return 0;
+ return ret;
}
int jffs2_do_readpage_unlock(struct inode *inode, struct page *pg)
diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c
index b7339c3b6ad..4ec11e8bda8 100644
--- a/fs/jffs2/symlink.c
+++ b/fs/jffs2/symlink.c
@@ -21,7 +21,7 @@ const struct inode_operations jffs2_symlink_inode_operations =
{
.readlink = generic_readlink,
.follow_link = jffs2_follow_link,
- .permission = jffs2_permission,
+ .check_acl = jffs2_check_acl,
.setattr = jffs2_setattr,
.setxattr = jffs2_setxattr,
.getxattr = jffs2_getxattr,
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index d9a721e6db7..5ef7bac265e 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -1268,10 +1268,20 @@ int jffs2_nor_wbuf_flash_setup(struct jffs2_sb_info *c) {
if (!c->wbuf)
return -ENOMEM;
+#ifdef CONFIG_JFFS2_FS_WBUF_VERIFY
+ c->wbuf_verify = kmalloc(c->wbuf_pagesize, GFP_KERNEL);
+ if (!c->wbuf_verify) {
+ kfree(c->wbuf);
+ return -ENOMEM;
+ }
+#endif
return 0;
}
void jffs2_nor_wbuf_flash_cleanup(struct jffs2_sb_info *c) {
+#ifdef CONFIG_JFFS2_FS_WBUF_VERIFY
+ kfree(c->wbuf_verify);
+#endif
kfree(c->wbuf);
}
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index 91fa3ad6e8c..d66477c3430 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -67,10 +67,8 @@ static struct posix_acl *jfs_get_acl(struct inode *inode, int type)
acl = posix_acl_from_xattr(value, size);
}
kfree(value);
- if (!IS_ERR(acl)) {
+ if (!IS_ERR(acl))
set_cached_acl(inode, type, acl);
- posix_acl_release(acl);
- }
return acl;
}
@@ -116,7 +114,7 @@ out:
return rc;
}
-static int jfs_check_acl(struct inode *inode, int mask)
+int jfs_check_acl(struct inode *inode, int mask)
{
struct posix_acl *acl = jfs_get_acl(inode, ACL_TYPE_ACCESS);
@@ -131,11 +129,6 @@ static int jfs_check_acl(struct inode *inode, int mask)
return -EAGAIN;
}
-int jfs_permission(struct inode *inode, int mask)
-{
- return generic_permission(inode, mask, jfs_check_acl);
-}
-
int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir)
{
struct posix_acl *acl = NULL;
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 7f6063acaa3..2b70fa78e4a 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -96,7 +96,7 @@ const struct inode_operations jfs_file_inode_operations = {
.removexattr = jfs_removexattr,
#ifdef CONFIG_JFS_POSIX_ACL
.setattr = jfs_setattr,
- .permission = jfs_permission,
+ .check_acl = jfs_check_acl,
#endif
};
diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h
index 88475f10a38..b07bd417ef8 100644
--- a/fs/jfs/jfs_acl.h
+++ b/fs/jfs/jfs_acl.h
@@ -20,7 +20,7 @@
#ifdef CONFIG_JFS_POSIX_ACL
-int jfs_permission(struct inode *, int);
+int jfs_check_acl(struct inode *, int);
int jfs_init_acl(tid_t, struct inode *, struct inode *);
int jfs_setattr(struct dentry *, struct iattr *);
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 514ee2edb92..c79a4270f08 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -1543,7 +1543,7 @@ const struct inode_operations jfs_dir_inode_operations = {
.removexattr = jfs_removexattr,
#ifdef CONFIG_JFS_POSIX_ACL
.setattr = jfs_setattr,
- .permission = jfs_permission,
+ .check_acl = jfs_check_acl,
#endif
};
diff --git a/fs/libfs.c b/fs/libfs.c
index ddfa89948c3..dcec3d3ea64 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -217,7 +217,7 @@ int get_sb_pseudo(struct file_system_type *fs_type, char *name,
return PTR_ERR(s);
s->s_flags = MS_NOUSER;
- s->s_maxbytes = ~0ULL;
+ s->s_maxbytes = MAX_LFS_FILESIZE;
s->s_blocksize = PAGE_SIZE;
s->s_blocksize_bits = PAGE_SHIFT;
s->s_magic = magic;
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 99d737bd432..7cb076ac6b4 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -87,18 +87,6 @@ static unsigned int nlm_hash_address(const struct sockaddr *sap)
return hash & (NLM_HOST_NRHASH - 1);
}
-static void nlm_clear_port(struct sockaddr *sap)
-{
- switch (sap->sa_family) {
- case AF_INET:
- ((struct sockaddr_in *)sap)->sin_port = 0;
- break;
- case AF_INET6:
- ((struct sockaddr_in6 *)sap)->sin6_port = 0;
- break;
- }
-}
-
/*
* Common host lookup routine for server & client
*/
@@ -177,7 +165,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
host->h_addrbuf = nsm->sm_addrbuf;
memcpy(nlm_addr(host), ni->sap, ni->salen);
host->h_addrlen = ni->salen;
- nlm_clear_port(nlm_addr(host));
+ rpc_set_port(nlm_addr(host), 0);
memcpy(nlm_srcaddr(host), ni->src_sap, ni->src_len);
host->h_version = ni->version;
host->h_proto = ni->protocol;
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 7fce1b52584..30c933188dd 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -61,43 +61,6 @@ static inline struct sockaddr *nsm_addr(const struct nsm_handle *nsm)
return (struct sockaddr *)&nsm->sm_addr;
}
-static void nsm_display_ipv4_address(const struct sockaddr *sap, char *buf,
- const size_t len)
-{
- const struct sockaddr_in *sin = (struct sockaddr_in *)sap;
- snprintf(buf, len, "%pI4", &sin->sin_addr.s_addr);
-}
-
-static void nsm_display_ipv6_address(const struct sockaddr *sap, char *buf,
- const size_t len)
-{
- const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
-
- if (ipv6_addr_v4mapped(&sin6->sin6_addr))
- snprintf(buf, len, "%pI4", &sin6->sin6_addr.s6_addr32[3]);
- else if (sin6->sin6_scope_id != 0)
- snprintf(buf, len, "%pI6%%%u", &sin6->sin6_addr,
- sin6->sin6_scope_id);
- else
- snprintf(buf, len, "%pI6", &sin6->sin6_addr);
-}
-
-static void nsm_display_address(const struct sockaddr *sap,
- char *buf, const size_t len)
-{
- switch (sap->sa_family) {
- case AF_INET:
- nsm_display_ipv4_address(sap, buf, len);
- break;
- case AF_INET6:
- nsm_display_ipv6_address(sap, buf, len);
- break;
- default:
- snprintf(buf, len, "unsupported address family");
- break;
- }
-}
-
static struct rpc_clnt *nsm_create(void)
{
struct sockaddr_in sin = {
@@ -307,8 +270,11 @@ static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap,
memcpy(nsm_addr(new), sap, salen);
new->sm_addrlen = salen;
nsm_init_private(new);
- nsm_display_address((const struct sockaddr *)&new->sm_addr,
- new->sm_addrbuf, sizeof(new->sm_addrbuf));
+
+ if (rpc_ntop(nsm_addr(new), new->sm_addrbuf,
+ sizeof(new->sm_addrbuf)) == 0)
+ (void)snprintf(new->sm_addrbuf, sizeof(new->sm_addrbuf),
+ "unsupported address family");
memcpy(new->sm_name, hostname, hostname_len);
new->sm_name[hostname_len] = '\0';
diff --git a/fs/locks.c b/fs/locks.c
index b6440f52178..19ee18a6829 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -768,7 +768,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
* give it the opportunity to lock the file.
*/
if (found)
- cond_resched_bkl();
+ cond_resched();
find_conflict:
for_each_lock(inode, before) {
@@ -1591,7 +1591,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
if (can_sleep)
lock->fl_flags |= FL_SLEEP;
- error = security_file_lock(filp, cmd);
+ error = security_file_lock(filp, lock->fl_type);
if (error)
goto out_free;
diff --git a/fs/namei.c b/fs/namei.c
index f3c5b278895..d11f404667e 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -169,19 +169,10 @@ void putname(const char *name)
EXPORT_SYMBOL(putname);
#endif
-
-/**
- * generic_permission - check for access rights on a Posix-like filesystem
- * @inode: inode to check access rights for
- * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
- * @check_acl: optional callback to check for Posix ACLs
- *
- * Used to check for read/write/execute permissions on a file.
- * We use "fsuid" for this, letting us set arbitrary permissions
- * for filesystem access without changing the "normal" uids which
- * are used for other things..
+/*
+ * This does basic POSIX ACL permission checking
*/
-int generic_permission(struct inode *inode, int mask,
+static int acl_permission_check(struct inode *inode, int mask,
int (*check_acl)(struct inode *inode, int mask))
{
umode_t mode = inode->i_mode;
@@ -193,9 +184,7 @@ int generic_permission(struct inode *inode, int mask,
else {
if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) {
int error = check_acl(inode, mask);
- if (error == -EACCES)
- goto check_capabilities;
- else if (error != -EAGAIN)
+ if (error != -EAGAIN)
return error;
}
@@ -208,8 +197,32 @@ int generic_permission(struct inode *inode, int mask,
*/
if ((mask & ~mode) == 0)
return 0;
+ return -EACCES;
+}
+
+/**
+ * generic_permission - check for access rights on a Posix-like filesystem
+ * @inode: inode to check access rights for
+ * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
+ * @check_acl: optional callback to check for Posix ACLs
+ *
+ * Used to check for read/write/execute permissions on a file.
+ * We use "fsuid" for this, letting us set arbitrary permissions
+ * for filesystem access without changing the "normal" uids which
+ * are used for other things..
+ */
+int generic_permission(struct inode *inode, int mask,
+ int (*check_acl)(struct inode *inode, int mask))
+{
+ int ret;
+
+ /*
+ * Do the basic POSIX ACL permission checks.
+ */
+ ret = acl_permission_check(inode, mask, check_acl);
+ if (ret != -EACCES)
+ return ret;
- check_capabilities:
/*
* Read/write DACs are always overridable.
* Executable DACs are overridable if at least one exec bit is set.
@@ -262,7 +275,7 @@ int inode_permission(struct inode *inode, int mask)
if (inode->i_op->permission)
retval = inode->i_op->permission(inode, mask);
else
- retval = generic_permission(inode, mask, NULL);
+ retval = generic_permission(inode, mask, inode->i_op->check_acl);
if (retval)
return retval;
@@ -432,29 +445,22 @@ static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name,
*/
static int exec_permission_lite(struct inode *inode)
{
- umode_t mode = inode->i_mode;
+ int ret;
- if (inode->i_op->permission)
- return -EAGAIN;
-
- if (current_fsuid() == inode->i_uid)
- mode >>= 6;
- else if (in_group_p(inode->i_gid))
- mode >>= 3;
-
- if (mode & MAY_EXEC)
- goto ok;
-
- if ((inode->i_mode & S_IXUGO) && capable(CAP_DAC_OVERRIDE))
- goto ok;
-
- if (S_ISDIR(inode->i_mode) && capable(CAP_DAC_OVERRIDE))
+ if (inode->i_op->permission) {
+ ret = inode->i_op->permission(inode, MAY_EXEC);
+ if (!ret)
+ goto ok;
+ return ret;
+ }
+ ret = acl_permission_check(inode, MAY_EXEC, inode->i_op->check_acl);
+ if (!ret)
goto ok;
- if (S_ISDIR(inode->i_mode) && capable(CAP_DAC_READ_SEARCH))
+ if (capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH))
goto ok;
- return -EACCES;
+ return ret;
ok:
return security_inode_permission(inode, MAY_EXEC);
}
@@ -853,12 +859,6 @@ static int __link_path_walk(const char *name, struct nameidata *nd)
nd->flags |= LOOKUP_CONTINUE;
err = exec_permission_lite(inode);
- if (err == -EAGAIN)
- err = inode_permission(nd->path.dentry->d_inode,
- MAY_EXEC);
- if (!err)
- err = ima_path_check(&nd->path, MAY_EXEC,
- IMA_COUNT_UPDATE);
if (err)
break;
@@ -1533,37 +1533,42 @@ int may_open(struct path *path, int acc_mode, int flag)
if (error)
return error;
- error = ima_path_check(path,
- acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC),
+ error = ima_path_check(path, acc_mode ?
+ acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC) :
+ ACC_MODE(flag) & (MAY_READ | MAY_WRITE),
IMA_COUNT_UPDATE);
+
if (error)
return error;
/*
* An append-only file must be opened in append mode for writing.
*/
if (IS_APPEND(inode)) {
+ error = -EPERM;
if ((flag & FMODE_WRITE) && !(flag & O_APPEND))
- return -EPERM;
+ goto err_out;
if (flag & O_TRUNC)
- return -EPERM;
+ goto err_out;
}
/* O_NOATIME can only be set by the owner or superuser */
if (flag & O_NOATIME)
- if (!is_owner_or_cap(inode))
- return -EPERM;
+ if (!is_owner_or_cap(inode)) {
+ error = -EPERM;
+ goto err_out;
+ }
/*
* Ensure there are no outstanding leases on the file.
*/
error = break_lease(inode, flag);
if (error)
- return error;
+ goto err_out;
if (flag & O_TRUNC) {
error = get_write_access(inode);
if (error)
- return error;
+ goto err_out;
/*
* Refuse to truncate files with mandatory locks held on them.
@@ -1581,12 +1586,17 @@ int may_open(struct path *path, int acc_mode, int flag)
}
put_write_access(inode);
if (error)
- return error;
+ goto err_out;
} else
if (flag & FMODE_WRITE)
vfs_dq_init(inode);
return 0;
+err_out:
+ ima_counts_put(path, acc_mode ?
+ acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC) :
+ ACC_MODE(flag) & (MAY_READ | MAY_WRITE));
+ return error;
}
/*
diff --git a/fs/namespace.c b/fs/namespace.c
index 277c28a63ea..7230787d18b 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -316,7 +316,8 @@ EXPORT_SYMBOL_GPL(mnt_clone_write);
*/
int mnt_want_write_file(struct file *file)
{
- if (!(file->f_mode & FMODE_WRITE))
+ struct inode *inode = file->f_dentry->d_inode;
+ if (!(file->f_mode & FMODE_WRITE) || special_file(inode->i_mode))
return mnt_want_write(file->f_path.mnt);
else
return mnt_clone_write(file->f_path.mnt);
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index 845159814de..da7fda639ea 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -6,7 +6,8 @@ obj-$(CONFIG_NFS_FS) += nfs.o
nfs-y := client.o dir.o file.o getroot.o inode.o super.o nfs2xdr.o \
direct.o pagelist.o proc.o read.o symlink.o unlink.o \
- write.o namespace.o mount_clnt.o
+ write.o namespace.o mount_clnt.o \
+ dns_resolve.o cache_lib.o
nfs-$(CONFIG_ROOT_NFS) += nfsroot.o
nfs-$(CONFIG_NFS_V3) += nfs3proc.o nfs3xdr.o
nfs-$(CONFIG_NFS_V3_ACL) += nfs3acl.o
diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c
new file mode 100644
index 00000000000..b4ffd0146ea
--- /dev/null
+++ b/fs/nfs/cache_lib.c
@@ -0,0 +1,140 @@
+/*
+ * linux/fs/nfs/cache_lib.c
+ *
+ * Helper routines for the NFS client caches
+ *
+ * Copyright (c) 2009 Trond Myklebust <Trond.Myklebust@netapp.com>
+ */
+#include <linux/kmod.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/sunrpc/cache.h>
+#include <linux/sunrpc/rpc_pipe_fs.h>
+
+#include "cache_lib.h"
+
+#define NFS_CACHE_UPCALL_PATHLEN 256
+#define NFS_CACHE_UPCALL_TIMEOUT 15
+
+static char nfs_cache_getent_prog[NFS_CACHE_UPCALL_PATHLEN] =
+ "/sbin/nfs_cache_getent";
+static unsigned long nfs_cache_getent_timeout = NFS_CACHE_UPCALL_TIMEOUT;
+
+module_param_string(cache_getent, nfs_cache_getent_prog,
+ sizeof(nfs_cache_getent_prog), 0600);
+MODULE_PARM_DESC(cache_getent, "Path to the client cache upcall program");
+module_param_named(cache_getent_timeout, nfs_cache_getent_timeout, ulong, 0600);
+MODULE_PARM_DESC(cache_getent_timeout, "Timeout (in seconds) after which "
+ "the cache upcall is assumed to have failed");
+
+int nfs_cache_upcall(struct cache_detail *cd, char *entry_name)
+{
+ static char *envp[] = { "HOME=/",
+ "TERM=linux",
+ "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
+ NULL
+ };
+ char *argv[] = {
+ nfs_cache_getent_prog,
+ cd->name,
+ entry_name,
+ NULL
+ };
+ int ret = -EACCES;
+
+ if (nfs_cache_getent_prog[0] == '\0')
+ goto out;
+ ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
+ /*
+ * Disable the upcall mechanism if we're getting an ENOENT or
+ * EACCES error. The admin can re-enable it on the fly by using
+ * sysfs to set the 'cache_getent' parameter once the problem
+ * has been fixed.
+ */
+ if (ret == -ENOENT || ret == -EACCES)
+ nfs_cache_getent_prog[0] = '\0';
+out:
+ return ret > 0 ? 0 : ret;
+}
+
+/*
+ * Deferred request handling
+ */
+void nfs_cache_defer_req_put(struct nfs_cache_defer_req *dreq)
+{
+ if (atomic_dec_and_test(&dreq->count))
+ kfree(dreq);
+}
+
+static void nfs_dns_cache_revisit(struct cache_deferred_req *d, int toomany)
+{
+ struct nfs_cache_defer_req *dreq;
+
+ dreq = container_of(d, struct nfs_cache_defer_req, deferred_req);
+
+ complete_all(&dreq->completion);
+ nfs_cache_defer_req_put(dreq);
+}
+
+static struct cache_deferred_req *nfs_dns_cache_defer(struct cache_req *req)
+{
+ struct nfs_cache_defer_req *dreq;
+
+ dreq = container_of(req, struct nfs_cache_defer_req, req);
+ dreq->deferred_req.revisit = nfs_dns_cache_revisit;
+ atomic_inc(&dreq->count);
+
+ return &dreq->deferred_req;
+}
+
+struct nfs_cache_defer_req *nfs_cache_defer_req_alloc(void)
+{
+ struct nfs_cache_defer_req *dreq;
+
+ dreq = kzalloc(sizeof(*dreq), GFP_KERNEL);
+ if (dreq) {
+ init_completion(&dreq->completion);
+ atomic_set(&dreq->count, 1);
+ dreq->req.defer = nfs_dns_cache_defer;
+ }
+ return dreq;
+}
+
+int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq)
+{
+ if (wait_for_completion_timeout(&dreq->completion,
+ nfs_cache_getent_timeout * HZ) == 0)
+ return -ETIMEDOUT;
+ return 0;
+}
+
+int nfs_cache_register(struct cache_detail *cd)
+{
+ struct nameidata nd;
+ struct vfsmount *mnt;
+ int ret;
+
+ mnt = rpc_get_mount();
+ if (IS_ERR(mnt))
+ return PTR_ERR(mnt);
+ ret = vfs_path_lookup(mnt->mnt_root, mnt, "/cache", 0, &nd);
+ if (ret)
+ goto err;
+ ret = sunrpc_cache_register_pipefs(nd.path.dentry,
+ cd->name, 0600, cd);
+ path_put(&nd.path);
+ if (!ret)
+ return ret;
+err:
+ rpc_put_mount();
+ return ret;
+}
+
+void nfs_cache_unregister(struct cache_detail *cd)
+{
+ sunrpc_cache_unregister_pipefs(cd);
+ rpc_put_mount();
+}
+
diff --git a/fs/nfs/cache_lib.h b/fs/nfs/cache_lib.h
new file mode 100644
index 00000000000..76f856e284e
--- /dev/null
+++ b/fs/nfs/cache_lib.h
@@ -0,0 +1,27 @@
+/*
+ * Helper routines for the NFS client caches
+ *
+ * Copyright (c) 2009 Trond Myklebust <Trond.Myklebust@netapp.com>
+ */
+
+#include <linux/completion.h>
+#include <linux/sunrpc/cache.h>
+#include <asm/atomic.h>
+
+/*
+ * Deferred request handling
+ */
+struct nfs_cache_defer_req {
+ struct cache_req req;
+ struct cache_deferred_req deferred_req;
+ struct completion completion;
+ atomic_t count;
+};
+
+extern int nfs_cache_upcall(struct cache_detail *cd, char *entry_name);
+extern struct nfs_cache_defer_req *nfs_cache_defer_req_alloc(void);
+extern void nfs_cache_defer_req_put(struct nfs_cache_defer_req *dreq);
+extern int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq);
+
+extern int nfs_cache_register(struct cache_detail *cd);
+extern void nfs_cache_unregister(struct cache_detail *cd);
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 7f604c7941f..293fa0528a6 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -43,21 +43,29 @@ static struct svc_program nfs4_callback_program;
unsigned int nfs_callback_set_tcpport;
unsigned short nfs_callback_tcpport;
unsigned short nfs_callback_tcpport6;
-static const int nfs_set_port_min = 0;
-static const int nfs_set_port_max = 65535;
+#define NFS_CALLBACK_MAXPORTNR (65535U)
-static int param_set_port(const char *val, struct kernel_param *kp)
+static int param_set_portnr(const char *val, struct kernel_param *kp)
{
- char *endp;
- int num = simple_strtol(val, &endp, 0);
- if (endp == val || *endp || num < nfs_set_port_min || num > nfs_set_port_max)
+ unsigned long num;
+ int ret;
+
+ if (!val)
+ return -EINVAL;
+ ret = strict_strtoul(val, 0, &num);
+ if (ret == -EINVAL || num > NFS_CALLBACK_MAXPORTNR)
return -EINVAL;
- *((int *)kp->arg) = num;
+ *((unsigned int *)kp->arg) = num;
return 0;
}
-module_param_call(callback_tcpport, param_set_port, param_get_int,
- &nfs_callback_set_tcpport, 0644);
+static int param_get_portnr(char *buffer, struct kernel_param *kp)
+{
+ return param_get_uint(buffer, kp);
+}
+#define param_check_portnr(name, p) __param_check(name, p, unsigned int);
+
+module_param_named(callback_tcpport, nfs_callback_set_tcpport, portnr, 0644);
/*
* This is the NFSv4 callback kernel thread.
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 8d25ccb2d51..e350bd6a233 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -809,6 +809,9 @@ static int nfs_init_server(struct nfs_server *server,
/* Initialise the client representation from the mount data */
server->flags = data->flags;
server->options = data->options;
+ server->caps |= NFS_CAP_HARDLINKS|NFS_CAP_SYMLINKS|NFS_CAP_FILEID|
+ NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER|NFS_CAP_OWNER_GROUP|
+ NFS_CAP_ATIME|NFS_CAP_CTIME|NFS_CAP_MTIME;
if (data->rsize)
server->rsize = nfs_block_size(data->rsize, NULL);
@@ -879,6 +882,7 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *
server->rsize = NFS_MAX_FILE_IO_SIZE;
server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ server->backing_dev_info.name = "nfs";
server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD;
if (server->wsize > max_rpc_payload)
@@ -1074,10 +1078,6 @@ struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data,
(unsigned long long) server->fsid.major,
(unsigned long long) server->fsid.minor);
- BUG_ON(!server->nfs_client);
- BUG_ON(!server->nfs_client->rpc_ops);
- BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
-
spin_lock(&nfs_client_lock);
list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks);
list_add_tail(&server->master_link, &nfs_volume_list);
@@ -1274,7 +1274,7 @@ static int nfs4_init_server(struct nfs_server *server,
/* Initialise the client representation from the mount data */
server->flags = data->flags;
- server->caps |= NFS_CAP_ATOMIC_OPEN;
+ server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR;
server->options = data->options;
/* Get a client record */
@@ -1359,10 +1359,6 @@ struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data,
if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN)
server->namelen = NFS4_MAXNAMLEN;
- BUG_ON(!server->nfs_client);
- BUG_ON(!server->nfs_client->rpc_ops);
- BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
-
spin_lock(&nfs_client_lock);
list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks);
list_add_tail(&server->master_link, &nfs_volume_list);
@@ -1400,7 +1396,7 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
/* Initialise the client representation from the parent server */
nfs_server_copy_userdata(server, parent_server);
- server->caps |= NFS_CAP_ATOMIC_OPEN;
+ server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR;
/* Get a client representation.
* Note: NFSv4 always uses TCP, */
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 489fc01a320..6c3210099d5 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -255,7 +255,7 @@ static void nfs_direct_read_release(void *calldata)
if (put_dreq(dreq))
nfs_direct_complete(dreq);
- nfs_readdata_release(calldata);
+ nfs_readdata_free(data);
}
static const struct rpc_call_ops nfs_read_direct_ops = {
@@ -314,14 +314,14 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
data->npages, 1, 0, data->pagevec, NULL);
up_read(&current->mm->mmap_sem);
if (result < 0) {
- nfs_readdata_release(data);
+ nfs_readdata_free(data);
break;
}
if ((unsigned)result < data->npages) {
bytes = result * PAGE_SIZE;
if (bytes <= pgbase) {
nfs_direct_release_pages(data->pagevec, result);
- nfs_readdata_release(data);
+ nfs_readdata_free(data);
break;
}
bytes -= pgbase;
@@ -334,7 +334,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
data->inode = inode;
data->cred = msg.rpc_cred;
data->args.fh = NFS_FH(inode);
- data->args.context = get_nfs_open_context(ctx);
+ data->args.context = ctx;
data->args.offset = pos;
data->args.pgbase = pgbase;
data->args.pages = data->pagevec;
@@ -441,7 +441,7 @@ static void nfs_direct_free_writedata(struct nfs_direct_req *dreq)
struct nfs_write_data *data = list_entry(dreq->rewrite_list.next, struct nfs_write_data, pages);
list_del(&data->pages);
nfs_direct_release_pages(data->pagevec, data->npages);
- nfs_writedata_release(data);
+ nfs_writedata_free(data);
}
}
@@ -534,7 +534,7 @@ static void nfs_direct_commit_release(void *calldata)
dprintk("NFS: %5u commit returned %d\n", data->task.tk_pid, status);
nfs_direct_write_complete(dreq, data->inode);
- nfs_commitdata_release(calldata);
+ nfs_commit_free(data);
}
static const struct rpc_call_ops nfs_commit_direct_ops = {
@@ -570,7 +570,7 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
data->args.fh = NFS_FH(data->inode);
data->args.offset = 0;
data->args.count = 0;
- data->args.context = get_nfs_open_context(dreq->ctx);
+ data->args.context = dreq->ctx;
data->res.count = 0;
data->res.fattr = &data->fattr;
data->res.verf = &data->verf;
@@ -734,14 +734,14 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
data->npages, 0, 0, data->pagevec, NULL);
up_read(&current->mm->mmap_sem);
if (result < 0) {
- nfs_writedata_release(data);
+ nfs_writedata_free(data);
break;
}
if ((unsigned)result < data->npages) {
bytes = result * PAGE_SIZE;
if (bytes <= pgbase) {
nfs_direct_release_pages(data->pagevec, result);
- nfs_writedata_release(data);
+ nfs_writedata_free(data);
break;
}
bytes -= pgbase;
@@ -756,7 +756,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
data->inode = inode;
data->cred = msg.rpc_cred;
data->args.fh = NFS_FH(inode);
- data->args.context = get_nfs_open_context(ctx);
+ data->args.context = ctx;
data->args.offset = pos;
data->args.pgbase = pgbase;
data->args.pages = data->pagevec;
@@ -934,9 +934,6 @@ out:
* back into its cache. We let the server do generic write
* parameter checking and report problems.
*
- * We also avoid an unnecessary invocation of generic_osync_inode(),
- * as it is fairly meaningless to sync the metadata of an NFS file.
- *
* We eliminate local atime updates, see direct read above.
*
* We avoid unnecessary page cache invalidations for normal cached
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
new file mode 100644
index 00000000000..f4d54ba97cc
--- /dev/null
+++ b/fs/nfs/dns_resolve.c
@@ -0,0 +1,335 @@
+/*
+ * linux/fs/nfs/dns_resolve.c
+ *
+ * Copyright (c) 2009 Trond Myklebust <Trond.Myklebust@netapp.com>
+ *
+ * Resolves DNS hostnames into valid ip addresses
+ */
+
+#include <linux/hash.h>
+#include <linux/string.h>
+#include <linux/kmod.h>
+#include <linux/module.h>
+#include <linux/socket.h>
+#include <linux/seq_file.h>
+#include <linux/inet.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/cache.h>
+#include <linux/sunrpc/svcauth.h>
+
+#include "dns_resolve.h"
+#include "cache_lib.h"
+
+#define NFS_DNS_HASHBITS 4
+#define NFS_DNS_HASHTBL_SIZE (1 << NFS_DNS_HASHBITS)
+
+static struct cache_head *nfs_dns_table[NFS_DNS_HASHTBL_SIZE];
+
+struct nfs_dns_ent {
+ struct cache_head h;
+
+ char *hostname;
+ size_t namelen;
+
+ struct sockaddr_storage addr;
+ size_t addrlen;
+};
+
+
+static void nfs_dns_ent_init(struct cache_head *cnew,
+ struct cache_head *ckey)
+{
+ struct nfs_dns_ent *new;
+ struct nfs_dns_ent *key;
+
+ new = container_of(cnew, struct nfs_dns_ent, h);
+ key = container_of(ckey, struct nfs_dns_ent, h);
+
+ kfree(new->hostname);
+ new->hostname = kstrndup(key->hostname, key->namelen, GFP_KERNEL);
+ if (new->hostname) {
+ new->namelen = key->namelen;
+ memcpy(&new->addr, &key->addr, key->addrlen);
+ new->addrlen = key->addrlen;
+ } else {
+ new->namelen = 0;
+ new->addrlen = 0;
+ }
+}
+
+static void nfs_dns_ent_put(struct kref *ref)
+{
+ struct nfs_dns_ent *item;
+
+ item = container_of(ref, struct nfs_dns_ent, h.ref);
+ kfree(item->hostname);
+ kfree(item);
+}
+
+static struct cache_head *nfs_dns_ent_alloc(void)
+{
+ struct nfs_dns_ent *item = kmalloc(sizeof(*item), GFP_KERNEL);
+
+ if (item != NULL) {
+ item->hostname = NULL;
+ item->namelen = 0;
+ item->addrlen = 0;
+ return &item->h;
+ }
+ return NULL;
+};
+
+static unsigned int nfs_dns_hash(const struct nfs_dns_ent *key)
+{
+ return hash_str(key->hostname, NFS_DNS_HASHBITS);
+}
+
+static void nfs_dns_request(struct cache_detail *cd,
+ struct cache_head *ch,
+ char **bpp, int *blen)
+{
+ struct nfs_dns_ent *key = container_of(ch, struct nfs_dns_ent, h);
+
+ qword_add(bpp, blen, key->hostname);
+ (*bpp)[-1] = '\n';
+}
+
+static int nfs_dns_upcall(struct cache_detail *cd,
+ struct cache_head *ch)
+{
+ struct nfs_dns_ent *key = container_of(ch, struct nfs_dns_ent, h);
+ int ret;
+
+ ret = nfs_cache_upcall(cd, key->hostname);
+ if (ret)
+ ret = sunrpc_cache_pipe_upcall(cd, ch, nfs_dns_request);
+ return ret;
+}
+
+static int nfs_dns_match(struct cache_head *ca,
+ struct cache_head *cb)
+{
+ struct nfs_dns_ent *a;
+ struct nfs_dns_ent *b;
+
+ a = container_of(ca, struct nfs_dns_ent, h);
+ b = container_of(cb, struct nfs_dns_ent, h);
+
+ if (a->namelen == 0 || a->namelen != b->namelen)
+ return 0;
+ return memcmp(a->hostname, b->hostname, a->namelen) == 0;
+}
+
+static int nfs_dns_show(struct seq_file *m, struct cache_detail *cd,
+ struct cache_head *h)
+{
+ struct nfs_dns_ent *item;
+ long ttl;
+
+ if (h == NULL) {
+ seq_puts(m, "# ip address hostname ttl\n");
+ return 0;
+ }
+ item = container_of(h, struct nfs_dns_ent, h);
+ ttl = (long)item->h.expiry_time - (long)get_seconds();
+ if (ttl < 0)
+ ttl = 0;
+
+ if (!test_bit(CACHE_NEGATIVE, &h->flags)) {
+ char buf[INET6_ADDRSTRLEN+IPV6_SCOPE_ID_LEN+1];
+
+ rpc_ntop((struct sockaddr *)&item->addr, buf, sizeof(buf));
+ seq_printf(m, "%15s ", buf);
+ } else
+ seq_puts(m, "<none> ");
+ seq_printf(m, "%15s %ld\n", item->hostname, ttl);
+ return 0;
+}
+
+struct nfs_dns_ent *nfs_dns_lookup(struct cache_detail *cd,
+ struct nfs_dns_ent *key)
+{
+ struct cache_head *ch;
+
+ ch = sunrpc_cache_lookup(cd,
+ &key->h,
+ nfs_dns_hash(key));
+ if (!ch)
+ return NULL;
+ return container_of(ch, struct nfs_dns_ent, h);
+}
+
+struct nfs_dns_ent *nfs_dns_update(struct cache_detail *cd,
+ struct nfs_dns_ent *new,
+ struct nfs_dns_ent *key)
+{
+ struct cache_head *ch;
+
+ ch = sunrpc_cache_update(cd,
+ &new->h, &key->h,
+ nfs_dns_hash(key));
+ if (!ch)
+ return NULL;
+ return container_of(ch, struct nfs_dns_ent, h);
+}
+
+static int nfs_dns_parse(struct cache_detail *cd, char *buf, int buflen)
+{
+ char buf1[NFS_DNS_HOSTNAME_MAXLEN+1];
+ struct nfs_dns_ent key, *item;
+ unsigned long ttl;
+ ssize_t len;
+ int ret = -EINVAL;
+
+ if (buf[buflen-1] != '\n')
+ goto out;
+ buf[buflen-1] = '\0';
+
+ len = qword_get(&buf, buf1, sizeof(buf1));
+ if (len <= 0)
+ goto out;
+ key.addrlen = rpc_pton(buf1, len,
+ (struct sockaddr *)&key.addr,
+ sizeof(key.addr));
+
+ len = qword_get(&buf, buf1, sizeof(buf1));
+ if (len <= 0)
+ goto out;
+
+ key.hostname = buf1;
+ key.namelen = len;
+ memset(&key.h, 0, sizeof(key.h));
+
+ ttl = get_expiry(&buf);
+ if (ttl == 0)
+ goto out;
+ key.h.expiry_time = ttl + get_seconds();
+
+ ret = -ENOMEM;
+ item = nfs_dns_lookup(cd, &key);
+ if (item == NULL)
+ goto out;
+
+ if (key.addrlen == 0)
+ set_bit(CACHE_NEGATIVE, &key.h.flags);
+
+ item = nfs_dns_update(cd, &key, item);
+ if (item == NULL)
+ goto out;
+
+ ret = 0;
+ cache_put(&item->h, cd);
+out:
+ return ret;
+}
+
+static struct cache_detail nfs_dns_resolve = {
+ .owner = THIS_MODULE,
+ .hash_size = NFS_DNS_HASHTBL_SIZE,
+ .hash_table = nfs_dns_table,
+ .name = "dns_resolve",
+ .cache_put = nfs_dns_ent_put,
+ .cache_upcall = nfs_dns_upcall,
+ .cache_parse = nfs_dns_parse,
+ .cache_show = nfs_dns_show,
+ .match = nfs_dns_match,
+ .init = nfs_dns_ent_init,
+ .update = nfs_dns_ent_init,
+ .alloc = nfs_dns_ent_alloc,
+};
+
+static int do_cache_lookup(struct cache_detail *cd,
+ struct nfs_dns_ent *key,
+ struct nfs_dns_ent **item,
+ struct nfs_cache_defer_req *dreq)
+{
+ int ret = -ENOMEM;
+
+ *item = nfs_dns_lookup(cd, key);
+ if (*item) {
+ ret = cache_check(cd, &(*item)->h, &dreq->req);
+ if (ret)
+ *item = NULL;
+ }
+ return ret;
+}
+
+static int do_cache_lookup_nowait(struct cache_detail *cd,
+ struct nfs_dns_ent *key,
+ struct nfs_dns_ent **item)
+{
+ int ret = -ENOMEM;
+
+ *item = nfs_dns_lookup(cd, key);
+ if (!*item)
+ goto out_err;
+ ret = -ETIMEDOUT;
+ if (!test_bit(CACHE_VALID, &(*item)->h.flags)
+ || (*item)->h.expiry_time < get_seconds()
+ || cd->flush_time > (*item)->h.last_refresh)
+ goto out_put;
+ ret = -ENOENT;
+ if (test_bit(CACHE_NEGATIVE, &(*item)->h.flags))
+ goto out_put;
+ return 0;
+out_put:
+ cache_put(&(*item)->h, cd);
+out_err:
+ *item = NULL;
+ return ret;
+}
+
+static int do_cache_lookup_wait(struct cache_detail *cd,
+ struct nfs_dns_ent *key,
+ struct nfs_dns_ent **item)
+{
+ struct nfs_cache_defer_req *dreq;
+ int ret = -ENOMEM;
+
+ dreq = nfs_cache_defer_req_alloc();
+ if (!dreq)
+ goto out;
+ ret = do_cache_lookup(cd, key, item, dreq);
+ if (ret == -EAGAIN) {
+ ret = nfs_cache_wait_for_upcall(dreq);
+ if (!ret)
+ ret = do_cache_lookup_nowait(cd, key, item);
+ }
+ nfs_cache_defer_req_put(dreq);
+out:
+ return ret;
+}
+
+ssize_t nfs_dns_resolve_name(char *name, size_t namelen,
+ struct sockaddr *sa, size_t salen)
+{
+ struct nfs_dns_ent key = {
+ .hostname = name,
+ .namelen = namelen,
+ };
+ struct nfs_dns_ent *item = NULL;
+ ssize_t ret;
+
+ ret = do_cache_lookup_wait(&nfs_dns_resolve, &key, &item);
+ if (ret == 0) {
+ if (salen >= item->addrlen) {
+ memcpy(sa, &item->addr, item->addrlen);
+ ret = item->addrlen;
+ } else
+ ret = -EOVERFLOW;
+ cache_put(&item->h, &nfs_dns_resolve);
+ } else if (ret == -ENOENT)
+ ret = -ESRCH;
+ return ret;
+}
+
+int nfs_dns_resolver_init(void)
+{
+ return nfs_cache_register(&nfs_dns_resolve);
+}
+
+void nfs_dns_resolver_destroy(void)
+{
+ nfs_cache_unregister(&nfs_dns_resolve);
+}
+
diff --git a/fs/nfs/dns_resolve.h b/fs/nfs/dns_resolve.h
new file mode 100644
index 00000000000..a3f0938babf
--- /dev/null
+++ b/fs/nfs/dns_resolve.h
@@ -0,0 +1,14 @@
+/*
+ * Resolve DNS hostnames into valid ip addresses
+ */
+#ifndef __LINUX_FS_NFS_DNS_RESOLVE_H
+#define __LINUX_FS_NFS_DNS_RESOLVE_H
+
+#define NFS_DNS_HOSTNAME_MAXLEN (128)
+
+extern int nfs_dns_resolver_init(void);
+extern void nfs_dns_resolver_destroy(void);
+extern ssize_t nfs_dns_resolve_name(char *name, size_t namelen,
+ struct sockaddr *sa, size_t salen);
+
+#endif
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 05062329b67..5021b75d2d1 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -328,6 +328,42 @@ nfs_file_fsync(struct file *file, struct dentry *dentry, int datasync)
}
/*
+ * Decide whether a read/modify/write cycle may be more efficient
+ * then a modify/write/read cycle when writing to a page in the
+ * page cache.
+ *
+ * The modify/write/read cycle may occur if a page is read before
+ * being completely filled by the writer. In this situation, the
+ * page must be completely written to stable storage on the server
+ * before it can be refilled by reading in the page from the server.
+ * This can lead to expensive, small, FILE_SYNC mode writes being
+ * done.
+ *
+ * It may be more efficient to read the page first if the file is
+ * open for reading in addition to writing, the page is not marked
+ * as Uptodate, it is not dirty or waiting to be committed,
+ * indicating that it was previously allocated and then modified,
+ * that there were valid bytes of data in that range of the file,
+ * and that the new data won't completely replace the old data in
+ * that range of the file.
+ */
+static int nfs_want_read_modify_write(struct file *file, struct page *page,
+ loff_t pos, unsigned len)
+{
+ unsigned int pglen = nfs_page_length(page);
+ unsigned int offset = pos & (PAGE_CACHE_SIZE - 1);
+ unsigned int end = offset + len;
+
+ if ((file->f_mode & FMODE_READ) && /* open for read? */
+ !PageUptodate(page) && /* Uptodate? */
+ !PagePrivate(page) && /* i/o request already? */
+ pglen && /* valid bytes of file? */
+ (end < pglen || offset)) /* replace all valid bytes? */
+ return 1;
+ return 0;
+}
+
+/*
* This does the "real" work of the write. We must allocate and lock the
* page to be sent back to the generic routine, which then copies the
* data from user space.
@@ -340,15 +376,16 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
struct page **pagep, void **fsdata)
{
int ret;
- pgoff_t index;
+ pgoff_t index = pos >> PAGE_CACHE_SHIFT;
struct page *page;
- index = pos >> PAGE_CACHE_SHIFT;
+ int once_thru = 0;
dfprintk(PAGECACHE, "NFS: write_begin(%s/%s(%ld), %u@%lld)\n",
file->f_path.dentry->d_parent->d_name.name,
file->f_path.dentry->d_name.name,
mapping->host->i_ino, len, (long long) pos);
+start:
/*
* Prevent starvation issues if someone is doing a consistency
* sync-to-disk
@@ -367,6 +404,13 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
if (ret) {
unlock_page(page);
page_cache_release(page);
+ } else if (!once_thru &&
+ nfs_want_read_modify_write(file, page, pos, len)) {
+ once_thru = 1;
+ ret = nfs_readpage(file, page);
+ page_cache_release(page);
+ if (!ret)
+ goto start;
}
return ret;
}
@@ -479,6 +523,7 @@ const struct address_space_operations nfs_file_aops = {
.invalidatepage = nfs_invalidate_page,
.releasepage = nfs_release_page,
.direct_IO = nfs_direct_IO,
+ .migratepage = nfs_migrate_page,
.launder_page = nfs_launder_page,
};
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 86147b0ab2c..21a84d45916 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -101,7 +101,7 @@ static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *);
static unsigned int fnvhash32(const void *, size_t);
-static struct rpc_pipe_ops idmap_upcall_ops = {
+static const struct rpc_pipe_ops idmap_upcall_ops = {
.upcall = idmap_pipe_upcall,
.downcall = idmap_pipe_downcall,
.destroy_msg = idmap_pipe_destroy_msg,
@@ -119,8 +119,8 @@ nfs_idmap_new(struct nfs_client *clp)
if (idmap == NULL)
return -ENOMEM;
- idmap->idmap_dentry = rpc_mkpipe(clp->cl_rpcclient->cl_dentry, "idmap",
- idmap, &idmap_upcall_ops, 0);
+ idmap->idmap_dentry = rpc_mkpipe(clp->cl_rpcclient->cl_path.dentry,
+ "idmap", idmap, &idmap_upcall_ops, 0);
if (IS_ERR(idmap->idmap_dentry)) {
error = PTR_ERR(idmap->idmap_dentry);
kfree(idmap);
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index bd7938eda6a..060022b4651 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -46,6 +46,7 @@
#include "iostat.h"
#include "internal.h"
#include "fscache.h"
+#include "dns_resolve.h"
#define NFSDBG_FACILITY NFSDBG_VFS
@@ -286,6 +287,11 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
/* We can't support update_atime(), since the server will reset it */
inode->i_flags |= S_NOATIME|S_NOCMTIME;
inode->i_mode = fattr->mode;
+ if ((fattr->valid & NFS_ATTR_FATTR_MODE) == 0
+ && nfs_server_capable(inode, NFS_CAP_MODE))
+ nfsi->cache_validity |= NFS_INO_INVALID_ATTR
+ | NFS_INO_INVALID_ACCESS
+ | NFS_INO_INVALID_ACL;
/* Why so? Because we want revalidate for devices/FIFOs, and
* that's precisely what we have in nfs_file_inode_operations.
*/
@@ -330,20 +336,46 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
nfsi->attr_gencount = fattr->gencount;
if (fattr->valid & NFS_ATTR_FATTR_ATIME)
inode->i_atime = fattr->atime;
+ else if (nfs_server_capable(inode, NFS_CAP_ATIME))
+ nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
if (fattr->valid & NFS_ATTR_FATTR_MTIME)
inode->i_mtime = fattr->mtime;
+ else if (nfs_server_capable(inode, NFS_CAP_MTIME))
+ nfsi->cache_validity |= NFS_INO_INVALID_ATTR
+ | NFS_INO_INVALID_DATA;
if (fattr->valid & NFS_ATTR_FATTR_CTIME)
inode->i_ctime = fattr->ctime;
+ else if (nfs_server_capable(inode, NFS_CAP_CTIME))
+ nfsi->cache_validity |= NFS_INO_INVALID_ATTR
+ | NFS_INO_INVALID_ACCESS
+ | NFS_INO_INVALID_ACL;
if (fattr->valid & NFS_ATTR_FATTR_CHANGE)
nfsi->change_attr = fattr->change_attr;
+ else if (nfs_server_capable(inode, NFS_CAP_CHANGE_ATTR))
+ nfsi->cache_validity |= NFS_INO_INVALID_ATTR
+ | NFS_INO_INVALID_DATA;
if (fattr->valid & NFS_ATTR_FATTR_SIZE)
inode->i_size = nfs_size_to_loff_t(fattr->size);
+ else
+ nfsi->cache_validity |= NFS_INO_INVALID_ATTR
+ | NFS_INO_INVALID_DATA
+ | NFS_INO_REVAL_PAGECACHE;
if (fattr->valid & NFS_ATTR_FATTR_NLINK)
inode->i_nlink = fattr->nlink;
+ else if (nfs_server_capable(inode, NFS_CAP_NLINK))
+ nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
if (fattr->valid & NFS_ATTR_FATTR_OWNER)
inode->i_uid = fattr->uid;
+ else if (nfs_server_capable(inode, NFS_CAP_OWNER))
+ nfsi->cache_validity |= NFS_INO_INVALID_ATTR
+ | NFS_INO_INVALID_ACCESS
+ | NFS_INO_INVALID_ACL;
if (fattr->valid & NFS_ATTR_FATTR_GROUP)
inode->i_gid = fattr->gid;
+ else if (nfs_server_capable(inode, NFS_CAP_OWNER_GROUP))
+ nfsi->cache_validity |= NFS_INO_INVALID_ATTR
+ | NFS_INO_INVALID_ACCESS
+ | NFS_INO_INVALID_ACL;
if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
inode->i_blocks = fattr->du.nfs2.blocks;
if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
@@ -1145,6 +1177,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
loff_t cur_isize, new_isize;
unsigned long invalid = 0;
unsigned long now = jiffies;
+ unsigned long save_cache_validity;
dfprintk(VFS, "NFS: %s(%s/%ld ct=%d info=0x%x)\n",
__func__, inode->i_sb->s_id, inode->i_ino,
@@ -1171,10 +1204,11 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
*/
nfsi->read_cache_jiffies = fattr->time_start;
- if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) || (fattr->valid & (NFS_ATTR_FATTR_MTIME|NFS_ATTR_FATTR_CTIME)))
- nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR
- | NFS_INO_INVALID_ATIME
- | NFS_INO_REVAL_PAGECACHE);
+ save_cache_validity = nfsi->cache_validity;
+ nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR
+ | NFS_INO_INVALID_ATIME
+ | NFS_INO_REVAL_FORCED
+ | NFS_INO_REVAL_PAGECACHE);
/* Do atomic weak cache consistency updates */
nfs_wcc_update_inode(inode, fattr);
@@ -1189,7 +1223,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
nfs_force_lookup_revalidate(inode);
nfsi->change_attr = fattr->change_attr;
}
- }
+ } else if (server->caps & NFS_CAP_CHANGE_ATTR)
+ invalid |= save_cache_validity;
if (fattr->valid & NFS_ATTR_FATTR_MTIME) {
/* NFSv2/v3: Check if the mtime agrees */
@@ -1201,7 +1236,12 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
nfs_force_lookup_revalidate(inode);
memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
}
- }
+ } else if (server->caps & NFS_CAP_MTIME)
+ invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
+ | NFS_INO_INVALID_DATA
+ | NFS_INO_REVAL_PAGECACHE
+ | NFS_INO_REVAL_FORCED);
+
if (fattr->valid & NFS_ATTR_FATTR_CTIME) {
/* If ctime has changed we should definitely clear access+acl caches */
if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) {
@@ -1215,7 +1255,11 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
}
memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
}
- }
+ } else if (server->caps & NFS_CAP_CTIME)
+ invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
+ | NFS_INO_INVALID_ACCESS
+ | NFS_INO_INVALID_ACL
+ | NFS_INO_REVAL_FORCED);
/* Check if our cached file size is stale */
if (fattr->valid & NFS_ATTR_FATTR_SIZE) {
@@ -1231,30 +1275,50 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
dprintk("NFS: isize change on server for file %s/%ld\n",
inode->i_sb->s_id, inode->i_ino);
}
- }
+ } else
+ invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
+ | NFS_INO_REVAL_PAGECACHE
+ | NFS_INO_REVAL_FORCED);
if (fattr->valid & NFS_ATTR_FATTR_ATIME)
memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime));
+ else if (server->caps & NFS_CAP_ATIME)
+ invalid |= save_cache_validity & (NFS_INO_INVALID_ATIME
+ | NFS_INO_REVAL_FORCED);
if (fattr->valid & NFS_ATTR_FATTR_MODE) {
if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) {
invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
inode->i_mode = fattr->mode;
}
- }
+ } else if (server->caps & NFS_CAP_MODE)
+ invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
+ | NFS_INO_INVALID_ACCESS
+ | NFS_INO_INVALID_ACL
+ | NFS_INO_REVAL_FORCED);
+
if (fattr->valid & NFS_ATTR_FATTR_OWNER) {
if (inode->i_uid != fattr->uid) {
invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
inode->i_uid = fattr->uid;
}
- }
+ } else if (server->caps & NFS_CAP_OWNER)
+ invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
+ | NFS_INO_INVALID_ACCESS
+ | NFS_INO_INVALID_ACL
+ | NFS_INO_REVAL_FORCED);
+
if (fattr->valid & NFS_ATTR_FATTR_GROUP) {
if (inode->i_gid != fattr->gid) {
invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
inode->i_gid = fattr->gid;
}
- }
+ } else if (server->caps & NFS_CAP_OWNER_GROUP)
+ invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
+ | NFS_INO_INVALID_ACCESS
+ | NFS_INO_INVALID_ACL
+ | NFS_INO_REVAL_FORCED);
if (fattr->valid & NFS_ATTR_FATTR_NLINK) {
if (inode->i_nlink != fattr->nlink) {
@@ -1263,7 +1327,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
invalid |= NFS_INO_INVALID_DATA;
inode->i_nlink = fattr->nlink;
}
- }
+ } else if (server->caps & NFS_CAP_NLINK)
+ invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
+ | NFS_INO_REVAL_FORCED);
if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
/*
@@ -1293,9 +1359,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
|| S_ISLNK(inode->i_mode)))
invalid &= ~NFS_INO_INVALID_DATA;
if (!nfs_have_delegation(inode, FMODE_READ) ||
- (nfsi->cache_validity & NFS_INO_REVAL_FORCED))
+ (save_cache_validity & NFS_INO_REVAL_FORCED))
nfsi->cache_validity |= invalid;
- nfsi->cache_validity &= ~NFS_INO_REVAL_FORCED;
return 0;
out_changed:
@@ -1442,6 +1507,10 @@ static int __init init_nfs_fs(void)
{
int err;
+ err = nfs_dns_resolver_init();
+ if (err < 0)
+ goto out8;
+
err = nfs_fscache_register();
if (err < 0)
goto out7;
@@ -1500,6 +1569,8 @@ out5:
out6:
nfs_fscache_unregister();
out7:
+ nfs_dns_resolver_destroy();
+out8:
return err;
}
@@ -1511,6 +1582,7 @@ static void __exit exit_nfs_fs(void)
nfs_destroy_inodecache();
nfs_destroy_nfspagecache();
nfs_fscache_unregister();
+ nfs_dns_resolver_destroy();
#ifdef CONFIG_PROC_FS
rpc_proc_unregister("nfs");
#endif
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 7dd90a6769d..e21b1bb9972 100644
--- a/fs/nfs/internal.h
+++ b/