aboutsummaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/Kconfig3
-rw-r--r--fs/9p/fid.c17
-rw-r--r--fs/9p/v9fs.c34
-rw-r--r--fs/9p/v9fs.h10
-rw-r--r--fs/9p/vfs_dir.c92
-rw-r--r--fs/9p/vfs_file.c5
-rw-r--r--fs/9p/vfs_inode.c9
-rw-r--r--fs/9p/vfs_inode_dotl.c21
-rw-r--r--fs/Kconfig10
-rw-r--r--fs/adfs/Kconfig4
-rw-r--r--fs/affs/Kconfig4
-rw-r--r--fs/afs/Kconfig7
-rw-r--r--fs/afs/afs.h11
-rw-r--r--fs/afs/fsclient.c14
-rw-r--r--fs/afs/inode.c6
-rw-r--r--fs/afs/super.c6
-rw-r--r--fs/aio.c7
-rw-r--r--fs/befs/Kconfig4
-rw-r--r--fs/bfs/Kconfig4
-rw-r--r--fs/binfmt_elf.c14
-rw-r--r--fs/binfmt_elf_fdpic.c7
-rw-r--r--fs/block_dev.c1
-rw-r--r--fs/btrfs/Kconfig3
-rw-r--r--fs/btrfs/extent-tree.c34
-rw-r--r--fs/btrfs/extent_map.c14
-rw-r--r--fs/btrfs/extent_map.h1
-rw-r--r--fs/btrfs/file-item.c4
-rw-r--r--fs/btrfs/file.c35
-rw-r--r--fs/btrfs/free-space-cache.c20
-rw-r--r--fs/btrfs/inode.c137
-rw-r--r--fs/btrfs/ioctl.c134
-rw-r--r--fs/btrfs/ordered-data.c13
-rw-r--r--fs/btrfs/qgroup.c20
-rw-r--r--fs/btrfs/relocation.c4
-rw-r--r--fs/btrfs/scrub.c25
-rw-r--r--fs/btrfs/send.c4
-rw-r--r--fs/btrfs/super.c2
-rw-r--r--fs/btrfs/transaction.c47
-rw-r--r--fs/btrfs/tree-log.c10
-rw-r--r--fs/btrfs/volumes.c26
-rw-r--r--fs/buffer.c6
-rw-r--r--fs/ceph/Kconfig4
-rw-r--r--fs/ceph/caps.c17
-rw-r--r--fs/ceph/inode.c23
-rw-r--r--fs/ceph/mds_client.c4
-rw-r--r--fs/ceph/mds_client.h4
-rw-r--r--fs/ceph/super.h4
-rw-r--r--fs/cifs/Kconfig8
-rw-r--r--fs/cifs/cifs_dfs_ref.c2
-rw-r--r--fs/cifs/cifs_fs_sb.h8
-rw-r--r--fs/cifs/cifs_spnego.c6
-rw-r--r--fs/cifs/cifsacl.c47
-rw-r--r--fs/cifs/cifsfs.c14
-rw-r--r--fs/cifs/cifsglob.h22
-rw-r--r--fs/cifs/cifspdu.h1
-rw-r--r--fs/cifs/cifsproto.h9
-rw-r--r--fs/cifs/cifssmb.c10
-rw-r--r--fs/cifs/connect.c68
-rw-r--r--fs/cifs/dir.c18
-rw-r--r--fs/cifs/file.c8
-rw-r--r--fs/cifs/inode.c50
-rw-r--r--fs/cifs/link.c2
-rw-r--r--fs/cifs/misc.c2
-rw-r--r--fs/coda/cache.c4
-rw-r--r--fs/coda/coda_fs_i.h2
-rw-r--r--fs/coda/coda_linux.c8
-rw-r--r--fs/coda/inode.c6
-rw-r--r--fs/coda/psdev.c7
-rw-r--r--fs/coda/upcall.c10
-rw-r--r--fs/compat.c52
-rw-r--r--fs/configfs/dir.c5
-rw-r--r--fs/debugfs/inode.c1
-rw-r--r--fs/devpts/inode.c18
-rw-r--r--fs/dlm/dlm_internal.h3
-rw-r--r--fs/dlm/lock.c15
-rw-r--r--fs/dlm/user.c8
-rw-r--r--fs/ecryptfs/Kconfig4
-rw-r--r--fs/efs/Kconfig4
-rw-r--r--fs/ext2/balloc.c28
-rw-r--r--fs/ext2/inode.c12
-rw-r--r--fs/ext2/super.c2
-rw-r--r--fs/ext2/xattr.c4
-rw-r--r--fs/ext3/inode.c16
-rw-r--r--fs/ext3/namei.c1
-rw-r--r--fs/ext3/resize.c12
-rw-r--r--fs/ext3/super.c52
-rw-r--r--fs/ext3/xattr.c4
-rw-r--r--fs/ext4/acl.c7
-rw-r--r--fs/ext4/balloc.c13
-rw-r--r--fs/ext4/dir.c1
-rw-r--r--fs/ext4/ext4.h123
-rw-r--r--fs/ext4/ext4_extents.h6
-rw-r--r--fs/ext4/ext4_jbd2.c102
-rw-r--r--fs/ext4/ext4_jbd2.h51
-rw-r--r--fs/ext4/extents.c312
-rw-r--r--fs/ext4/extents_status.c631
-rw-r--r--fs/ext4/extents_status.h83
-rw-r--r--fs/ext4/file.c16
-rw-r--r--fs/ext4/hash.c6
-rw-r--r--fs/ext4/ialloc.c29
-rw-r--r--fs/ext4/indirect.c259
-rw-r--r--fs/ext4/inline.c12
-rw-r--r--fs/ext4/inode.c666
-rw-r--r--fs/ext4/ioctl.c13
-rw-r--r--fs/ext4/mballoc.c69
-rw-r--r--fs/ext4/mballoc.h4
-rw-r--r--fs/ext4/migrate.c15
-rw-r--r--fs/ext4/mmp.c4
-rw-r--r--fs/ext4/move_extent.c10
-rw-r--r--fs/ext4/namei.c498
-rw-r--r--fs/ext4/page-io.c85
-rw-r--r--fs/ext4/resize.c36
-rw-r--r--fs/ext4/super.c485
-rw-r--r--fs/ext4/xattr.c23
-rw-r--r--fs/ext4/xattr.h68
-rw-r--r--fs/f2fs/acl.c13
-rw-r--r--fs/f2fs/checkpoint.c66
-rw-r--r--fs/f2fs/data.c17
-rw-r--r--fs/f2fs/debug.c54
-rw-r--r--fs/f2fs/dir.c31
-rw-r--r--fs/f2fs/f2fs.h62
-rw-r--r--fs/f2fs/file.c51
-rw-r--r--fs/f2fs/gc.c158
-rw-r--r--fs/f2fs/gc.h21
-rw-r--r--fs/f2fs/inode.c56
-rw-r--r--fs/f2fs/node.c39
-rw-r--r--fs/f2fs/recovery.c24
-rw-r--r--fs/f2fs/segment.c31
-rw-r--r--fs/f2fs/segment.h23
-rw-r--r--fs/f2fs/super.c177
-rw-r--r--fs/f2fs/xattr.c2
-rw-r--r--fs/file.c2
-rw-r--r--fs/fuse/Kconfig16
-rw-r--r--fs/fuse/cuse.c46
-rw-r--r--fs/fuse/dev.c133
-rw-r--r--fs/fuse/dir.c259
-rw-r--r--fs/fuse/file.c243
-rw-r--r--fs/fuse/fuse_i.h74
-rw-r--r--fs/fuse/inode.c16
-rw-r--r--fs/gfs2/acl.c2
-rw-r--r--fs/gfs2/aops.c17
-rw-r--r--fs/gfs2/bmap.c32
-rw-r--r--fs/gfs2/dir.c32
-rw-r--r--fs/gfs2/file.c6
-rw-r--r--fs/gfs2/glock.c116
-rw-r--r--fs/gfs2/glops.c4
-rw-r--r--fs/gfs2/incore.h11
-rw-r--r--fs/gfs2/inode.c40
-rw-r--r--fs/gfs2/lock_dlm.c7
-rw-r--r--fs/gfs2/log.c76
-rw-r--r--fs/gfs2/log.h12
-rw-r--r--fs/gfs2/lops.c83
-rw-r--r--fs/gfs2/lops.h14
-rw-r--r--fs/gfs2/meta_io.c35
-rw-r--r--fs/gfs2/meta_io.h3
-rw-r--r--fs/gfs2/ops_fstype.c4
-rw-r--r--fs/gfs2/quota.c142
-rw-r--r--fs/gfs2/quota.h15
-rw-r--r--fs/gfs2/rgrp.c18
-rw-r--r--fs/gfs2/super.c76
-rw-r--r--fs/gfs2/super.h3
-rw-r--r--fs/gfs2/sys.c62
-rw-r--r--fs/gfs2/trans.c124
-rw-r--r--fs/gfs2/trans.h3
-rw-r--r--fs/gfs2/util.c3
-rw-r--r--fs/gfs2/xattr.c40
-rw-r--r--fs/hfs/Kconfig4
-rw-r--r--fs/hpfs/inode.c2
-rw-r--r--fs/jbd2/commit.c8
-rw-r--r--fs/jbd2/journal.c66
-rw-r--r--fs/jbd2/transaction.c29
-rw-r--r--fs/jffs2/Kconfig10
-rw-r--r--fs/jfs/super.c2
-rw-r--r--fs/lockd/clntproc.c3
-rw-r--r--fs/logfs/Kconfig4
-rw-r--r--fs/ncpfs/inode.c55
-rw-r--r--fs/ncpfs/ioctl.c25
-rw-r--r--fs/ncpfs/ncp_fs_sb.h6
-rw-r--r--fs/nfs/blocklayout/blocklayout.c1
-rw-r--r--fs/nfs/callback_proc.c61
-rw-r--r--fs/nfs/delegation.c154
-rw-r--r--fs/nfs/delegation.h1
-rw-r--r--fs/nfs/getroot.c3
-rw-r--r--fs/nfs/idmap.c53
-rw-r--r--fs/nfs/inode.c17
-rw-r--r--fs/nfs/internal.h1
-rw-r--r--fs/nfs/namespace.c20
-rw-r--r--fs/nfs/nfs2xdr.c19
-rw-r--r--fs/nfs/nfs3xdr.c18
-rw-r--r--fs/nfs/nfs4_fs.h4
-rw-r--r--fs/nfs/nfs4client.c62
-rw-r--r--fs/nfs/nfs4proc.c133
-rw-r--r--fs/nfs/nfs4state.c33
-rw-r--r--fs/nfs/nfs4xdr.c16
-rw-r--r--fs/nfs/objlayout/objio_osd.c1
-rw-r--r--fs/nfs/pnfs.c150
-rw-r--r--fs/nfs/pnfs.h7
-rw-r--r--fs/nfs/super.c71
-rw-r--r--fs/nfs/unlink.c5
-rw-r--r--fs/nfs_common/nfsacl.c41
-rw-r--r--fs/nfsd/Kconfig4
-rw-r--r--fs/nfsd/acl.h2
-rw-r--r--fs/nfsd/auth.c12
-rw-r--r--fs/nfsd/auth.h6
-rw-r--r--fs/nfsd/export.c22
-rw-r--r--fs/nfsd/idmap.h8
-rw-r--r--fs/nfsd/nfs3xdr.c14
-rw-r--r--fs/nfsd/nfs4acl.c63
-rw-r--r--fs/nfsd/nfs4idmap.c38
-rw-r--r--fs/nfsd/nfs4recover.c4
-rw-r--r--fs/nfsd/nfs4state.c12
-rw-r--r--fs/nfsd/nfs4xdr.c54
-rw-r--r--fs/nfsd/nfsd.h6
-rw-r--r--fs/nfsd/nfssvc.c6
-rw-r--r--fs/nfsd/nfsxdr.c14
-rw-r--r--fs/nfsd/state.h4
-rw-r--r--fs/nfsd/vfs.c8
-rw-r--r--fs/nilfs2/Kconfig3
-rw-r--r--fs/nilfs2/file.c2
-rw-r--r--fs/nilfs2/ioctl.c5
-rw-r--r--fs/notify/inotify/inotify_user.c4
-rw-r--r--fs/ocfs2/acl.c31
-rw-r--r--fs/ocfs2/alloc.c3
-rw-r--r--fs/ocfs2/aops.c1
-rw-r--r--fs/ocfs2/cluster/heartbeat.c6
-rw-r--r--fs/ocfs2/cluster/tcp.c8
-rw-r--r--fs/ocfs2/dir.c1
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c4
-rw-r--r--fs/ocfs2/dlmglue.c13
-rw-r--r--fs/ocfs2/extent_map.c3
-rw-r--r--fs/ocfs2/file.c11
-rw-r--r--fs/ocfs2/inode.c12
-rw-r--r--fs/ocfs2/journal.c10
-rw-r--r--fs/ocfs2/localalloc.c8
-rw-r--r--fs/ocfs2/namei.c4
-rw-r--r--fs/ocfs2/refcounttree.c2
-rw-r--r--fs/ocfs2/stack_o2cb.c2
-rw-r--r--fs/ocfs2/super.c6
-rw-r--r--fs/ocfs2/sysfile.c3
-rw-r--r--fs/proc/Makefile3
-rw-r--r--fs/proc/array.c4
-rw-r--r--fs/proc/meminfo.c6
-rw-r--r--fs/proc/proc_net.c14
-rw-r--r--fs/pstore/inode.c18
-rw-r--r--fs/pstore/platform.c35
-rw-r--r--fs/pstore/ram.c10
-rw-r--r--fs/qnx6/inode.c2
-rw-r--r--fs/ramfs/inode.c1
-rw-r--r--fs/select.c1
-rw-r--r--fs/sysfs/group.c42
-rw-r--r--fs/sysfs/mount.c2
-rw-r--r--fs/sysfs/symlink.c45
-rw-r--r--fs/sysfs/sysfs.h2
-rw-r--r--fs/timerfd.c85
-rw-r--r--fs/ubifs/debug.c8
-rw-r--r--fs/ubifs/file.c1
-rw-r--r--fs/ubifs/lpt_commit.c14
-rw-r--r--fs/ubifs/orphan.c12
-rw-r--r--fs/ubifs/tnc_commit.c2
-rw-r--r--fs/ubifs/ubifs.h6
-rw-r--r--fs/udf/inode.c86
-rw-r--r--fs/udf/super.c11
-rw-r--r--fs/udf/udf_i.h16
-rw-r--r--fs/udf/udf_sb.h5
-rw-r--r--fs/udf/udfdecl.h5
-rw-r--r--fs/ufs/Kconfig2
-rw-r--r--fs/xfs/Kconfig4
-rw-r--r--fs/xfs/xfs_alloc.c2
-rw-r--r--fs/xfs/xfs_aops.c2
-rw-r--r--fs/xfs/xfs_attr.c9
-rw-r--r--fs/xfs/xfs_bmap.c124
-rw-r--r--fs/xfs/xfs_buf.c22
-rw-r--r--fs/xfs/xfs_buf_item.c130
-rw-r--r--fs/xfs/xfs_buf_item.h14
-rw-r--r--fs/xfs/xfs_dfrag.c4
-rw-r--r--fs/xfs/xfs_dquot.c12
-rw-r--r--fs/xfs/xfs_fsops.c4
-rw-r--r--fs/xfs/xfs_ialloc.c4
-rw-r--r--fs/xfs/xfs_inode.c6
-rw-r--r--fs/xfs/xfs_inode.h1
-rw-r--r--fs/xfs/xfs_inode_item.c16
-rw-r--r--fs/xfs/xfs_inode_item.h4
-rw-r--r--fs/xfs/xfs_iomap.c86
-rw-r--r--fs/xfs/xfs_log.c10
-rw-r--r--fs/xfs/xfs_mount.c14
-rw-r--r--fs/xfs/xfs_mount.h9
-rw-r--r--fs/xfs/xfs_qm.c7
-rw-r--r--fs/xfs/xfs_qm_bhv.c2
-rw-r--r--fs/xfs/xfs_qm_syscalls.c28
-rw-r--r--fs/xfs/xfs_super.c29
-rw-r--r--fs/xfs/xfs_trace.h1
-rw-r--r--fs/xfs/xfs_trans.c376
-rw-r--r--fs/xfs/xfs_trans.h18
-rw-r--r--fs/xfs/xfs_trans_ail.c14
-rw-r--r--fs/xfs/xfs_trans_dquot.c10
-rw-r--r--fs/xfs/xfs_trans_inode.c41
-rw-r--r--fs/xfs/xfs_types.h1
-rw-r--r--fs/xfs/xfs_vnodeops.c12
298 files changed, 6263 insertions, 4354 deletions
diff --git a/fs/9p/Kconfig b/fs/9p/Kconfig
index 0a93dc1cb4a..55abfd62654 100644
--- a/fs/9p/Kconfig
+++ b/fs/9p/Kconfig
@@ -11,8 +11,7 @@ config 9P_FS
if 9P_FS
config 9P_FSCACHE
- bool "Enable 9P client caching support (EXPERIMENTAL)"
- depends on EXPERIMENTAL
+ bool "Enable 9P client caching support"
depends on 9P_FS=m && FSCACHE || 9P_FS=y && FSCACHE=y
help
Choose Y here to enable persistent, read-only local
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index da8eefbe830..afd4724b2d9 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -74,19 +74,20 @@ int v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid)
*
*/
-static struct p9_fid *v9fs_fid_find(struct dentry *dentry, u32 uid, int any)
+static struct p9_fid *v9fs_fid_find(struct dentry *dentry, kuid_t uid, int any)
{
struct v9fs_dentry *dent;
struct p9_fid *fid, *ret;
p9_debug(P9_DEBUG_VFS, " dentry: %s (%p) uid %d any %d\n",
- dentry->d_name.name, dentry, uid, any);
+ dentry->d_name.name, dentry, from_kuid(&init_user_ns, uid),
+ any);
dent = (struct v9fs_dentry *) dentry->d_fsdata;
ret = NULL;
if (dent) {
spin_lock(&dent->lock);
list_for_each_entry(fid, &dent->fidlist, dlist) {
- if (any || fid->uid == uid) {
+ if (any || uid_eq(fid->uid, uid)) {
ret = fid;
break;
}
@@ -126,7 +127,7 @@ err_out:
}
static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry,
- uid_t uid, int any)
+ kuid_t uid, int any)
{
struct dentry *ds;
char **wnames, *uname;
@@ -233,7 +234,7 @@ err_out:
struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
{
- uid_t uid;
+ kuid_t uid;
int any, access;
struct v9fs_session_info *v9ses;
@@ -253,7 +254,7 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
break;
default:
- uid = ~0;
+ uid = INVALID_UID;
any = 0;
break;
}
@@ -272,7 +273,7 @@ struct p9_fid *v9fs_fid_clone(struct dentry *dentry)
return ret;
}
-static struct p9_fid *v9fs_fid_clone_with_uid(struct dentry *dentry, uid_t uid)
+static struct p9_fid *v9fs_fid_clone_with_uid(struct dentry *dentry, kuid_t uid)
{
struct p9_fid *fid, *ret;
@@ -289,7 +290,7 @@ struct p9_fid *v9fs_writeback_fid(struct dentry *dentry)
int err;
struct p9_fid *fid;
- fid = v9fs_fid_clone_with_uid(dentry, 0);
+ fid = v9fs_fid_clone_with_uid(dentry, GLOBAL_ROOT_UID);
if (IS_ERR(fid))
goto error_out;
/*
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index d934f04e773..58e6cbce415 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -161,7 +161,13 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
ret = r;
continue;
}
- v9ses->dfltuid = option;
+ v9ses->dfltuid = make_kuid(current_user_ns(), option);
+ if (!uid_valid(v9ses->dfltuid)) {
+ p9_debug(P9_DEBUG_ERROR,
+ "uid field, but not a uid?\n");
+ ret = -EINVAL;
+ continue;
+ }
break;
case Opt_dfltgid:
r = match_int(&args[0], &option);
@@ -171,7 +177,13 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
ret = r;
continue;
}
- v9ses->dfltgid = option;
+ v9ses->dfltgid = make_kgid(current_user_ns(), option);
+ if (!gid_valid(v9ses->dfltgid)) {
+ p9_debug(P9_DEBUG_ERROR,
+ "gid field, but not a gid?\n");
+ ret = -EINVAL;
+ continue;
+ }
break;
case Opt_afid:
r = match_int(&args[0], &option);
@@ -248,8 +260,9 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
else if (strcmp(s, "client") == 0) {
v9ses->flags |= V9FS_ACCESS_CLIENT;
} else {
+ uid_t uid;
v9ses->flags |= V9FS_ACCESS_SINGLE;
- v9ses->uid = simple_strtoul(s, &e, 10);
+ uid = simple_strtoul(s, &e, 10);
if (*e != '\0') {
ret = -EINVAL;
pr_info("Unknown access argument %s\n",
@@ -257,6 +270,13 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
kfree(s);
goto free_and_return;
}
+ v9ses->uid = make_kuid(current_user_ns(), uid);
+ if (!uid_valid(v9ses->uid)) {
+ ret = -EINVAL;
+ pr_info("Uknown uid %s\n", s);
+ kfree(s);
+ goto free_and_return;
+ }
}
kfree(s);
@@ -319,7 +339,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
list_add(&v9ses->slist, &v9fs_sessionlist);
spin_unlock(&v9fs_sessionlist_lock);
- v9ses->uid = ~0;
+ v9ses->uid = INVALID_UID;
v9ses->dfltuid = V9FS_DEFUID;
v9ses->dfltgid = V9FS_DEFGID;
@@ -364,7 +384,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
v9ses->flags &= ~V9FS_ACCESS_MASK;
v9ses->flags |= V9FS_ACCESS_ANY;
- v9ses->uid = ~0;
+ v9ses->uid = INVALID_UID;
}
if (!v9fs_proto_dotl(v9ses) ||
!((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_CLIENT)) {
@@ -375,7 +395,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
v9ses->flags &= ~V9FS_ACL_MASK;
}
- fid = p9_client_attach(v9ses->clnt, NULL, v9ses->uname, ~0,
+ fid = p9_client_attach(v9ses->clnt, NULL, v9ses->uname, INVALID_UID,
v9ses->aname);
if (IS_ERR(fid)) {
retval = PTR_ERR(fid);
@@ -387,7 +407,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
if ((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_SINGLE)
fid->uid = v9ses->uid;
else
- fid->uid = ~0;
+ fid->uid = INVALID_UID;
#ifdef CONFIG_9P_FSCACHE
/* register the session for caching */
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 34c59f14a1c..a8e127c8962 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -109,9 +109,9 @@ struct v9fs_session_info {
char *uname; /* user name to mount as */
char *aname; /* name of remote hierarchy being mounted */
unsigned int maxdata; /* max data for client interface */
- unsigned int dfltuid; /* default uid/muid for legacy support */
- unsigned int dfltgid; /* default gid for legacy support */
- u32 uid; /* if ACCESS_SINGLE, the uid that has access */
+ kuid_t dfltuid; /* default uid/muid for legacy support */
+ kgid_t dfltgid; /* default gid for legacy support */
+ kuid_t uid; /* if ACCESS_SINGLE, the uid that has access */
struct p9_client *clnt; /* 9p client */
struct list_head slist; /* list of sessions registered with v9fs */
struct backing_dev_info bdi;
@@ -165,8 +165,8 @@ extern struct inode *v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses,
#define V9FS_PORT 564
#define V9FS_DEFUSER "nobody"
#define V9FS_DEFANAME ""
-#define V9FS_DEFUID (-2)
-#define V9FS_DEFGID (-2)
+#define V9FS_DEFUID KUIDT_INIT(-2)
+#define V9FS_DEFGID KGIDT_INIT(-2)
static inline struct v9fs_session_info *v9fs_inode2v9ses(struct inode *inode)
{
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index ff911e77965..be1e34adc3c 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -52,10 +52,9 @@
*/
struct p9_rdir {
- struct mutex mutex;
int head;
int tail;
- uint8_t *buf;
+ uint8_t buf[];
};
/**
@@ -93,33 +92,12 @@ static void p9stat_init(struct p9_wstat *stbuf)
*
*/
-static int v9fs_alloc_rdir_buf(struct file *filp, int buflen)
+static struct p9_rdir *v9fs_alloc_rdir_buf(struct file *filp, int buflen)
{
- struct p9_rdir *rdir;
- struct p9_fid *fid;
- int err = 0;
-
- fid = filp->private_data;
- if (!fid->rdir) {
- rdir = kmalloc(sizeof(struct p9_rdir) + buflen, GFP_KERNEL);
-
- if (rdir == NULL) {
- err = -ENOMEM;
- goto exit;
- }
- spin_lock(&filp->f_dentry->d_lock);
- if (!fid->rdir) {
- rdir->buf = (uint8_t *)rdir + sizeof(struct p9_rdir);
- mutex_init(&rdir->mutex);
- rdir->head = rdir->tail = 0;
- fid->rdir = (void *) rdir;
- rdir = NULL;
- }
- spin_unlock(&filp->f_dentry->d_lock);
- kfree(rdir);
- }
-exit:
- return err;
+ struct p9_fid *fid = filp->private_data;
+ if (!fid->rdir)
+ fid->rdir = kzalloc(sizeof(struct p9_rdir) + buflen, GFP_KERNEL);
+ return fid->rdir;
}
/**
@@ -145,20 +123,16 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
buflen = fid->clnt->msize - P9_IOHDRSZ;
- err = v9fs_alloc_rdir_buf(filp, buflen);
- if (err)
- goto exit;
- rdir = (struct p9_rdir *) fid->rdir;
+ rdir = v9fs_alloc_rdir_buf(filp, buflen);
+ if (!rdir)
+ return -ENOMEM;
- err = mutex_lock_interruptible(&rdir->mutex);
- if (err)
- return err;
- while (err == 0) {
+ while (1) {
if (rdir->tail == rdir->head) {
err = v9fs_file_readn(filp, rdir->buf, NULL,
buflen, filp->f_pos);
if (err <= 0)
- goto unlock_and_exit;
+ return err;
rdir->head = 0;
rdir->tail = err;
@@ -169,9 +143,8 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
rdir->tail - rdir->head, &st);
if (err) {
p9_debug(P9_DEBUG_VFS, "returned %d\n", err);
- err = -EIO;
p9stat_free(&st);
- goto unlock_and_exit;
+ return -EIO;
}
reclen = st.size+2;
@@ -180,19 +153,13 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
p9stat_free(&st);
- if (over) {
- err = 0;
- goto unlock_and_exit;
- }
+ if (over)
+ return 0;
+
rdir->head += reclen;
filp->f_pos += reclen;
}
}
-
-unlock_and_exit:
- mutex_unlock(&rdir->mutex);
-exit:
- return err;
}
/**
@@ -218,21 +185,16 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent,
buflen = fid->clnt->msize - P9_READDIRHDRSZ;
- err = v9fs_alloc_rdir_buf(filp, buflen);
- if (err)
- goto exit;
- rdir = (struct p9_rdir *) fid->rdir;
+ rdir = v9fs_alloc_rdir_buf(filp, buflen);
+ if (!rdir)
+ return -ENOMEM;
- err = mutex_lock_interruptible(&rdir->mutex);
- if (err)
- return err;
-
- while (err == 0) {
+ while (1) {
if (rdir->tail == rdir->head) {
err = p9_client_readdir(fid, rdir->buf, buflen,
filp->f_pos);
if (err <= 0)
- goto unlock_and_exit;
+ return err;
rdir->head = 0;
rdir->tail = err;
@@ -245,8 +207,7 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent,
&curdirent);
if (err < 0) {
p9_debug(P9_DEBUG_VFS, "returned %d\n", err);
- err = -EIO;
- goto unlock_and_exit;
+ return -EIO;
}
/* d_off in dirent structure tracks the offset into
@@ -261,20 +222,13 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent,
curdirent.d_type);
oldoffset = curdirent.d_off;
- if (over) {
- err = 0;
- goto unlock_and_exit;
- }
+ if (over)
+ return 0;
filp->f_pos = curdirent.d_off;
rdir->head += err;
}
}
-
-unlock_and_exit:
- mutex_unlock(&rdir->mutex);
-exit:
- return err;
}
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 3356e3ed511..d384a8b77ee 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -80,10 +80,6 @@ int v9fs_file_open(struct inode *inode, struct file *file)
p9_client_clunk(fid);
return err;
}
- if (file->f_flags & O_TRUNC) {
- i_size_write(inode, 0);
- inode->i_blocks = 0;
- }
if ((file->f_flags & O_APPEND) &&
(!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses)))
generic_file_llseek(file, 0, SEEK_END);
@@ -620,6 +616,7 @@ v9fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
lock_page(page);
if (page->mapping != inode->i_mapping)
goto out_unlock;
+ wait_for_stable_page(page);
return VM_FAULT_LOCKED;
out_unlock:
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 890bed538f9..b5340c829de 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -192,9 +192,6 @@ int v9fs_uflags2omode(int uflags, int extended)
break;
}
- if (uflags & O_TRUNC)
- ret |= P9_OTRUNC;
-
if (extended) {
if (uflags & O_EXCL)
ret |= P9_OEXCL;
@@ -228,9 +225,9 @@ v9fs_blank_wstat(struct p9_wstat *wstat)
wstat->uid = NULL;
wstat->gid = NULL;
wstat->muid = NULL;
- wstat->n_uid = ~0;
- wstat->n_gid = ~0;
- wstat->n_muid = ~0;
+ wstat->n_uid = INVALID_UID;
+ wstat->n_gid = INVALID_GID;
+ wstat->n_muid = INVALID_UID;
wstat->extension = NULL;
}
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 7c295588150..61e4fa70a6f 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -57,7 +57,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
* group of the new file system object.
*/
-static gid_t v9fs_get_fsgid_for_create(struct inode *dir_inode)
+static kgid_t v9fs_get_fsgid_for_create(struct inode *dir_inode)
{
BUG_ON(dir_inode == NULL);
@@ -186,7 +186,6 @@ static int v9fs_mapped_dotl_flags(int flags)
{ O_CREAT, P9_DOTL_CREATE },
{ O_EXCL, P9_DOTL_EXCL },
{ O_NOCTTY, P9_DOTL_NOCTTY },
- { O_TRUNC, P9_DOTL_TRUNC },
{ O_APPEND, P9_DOTL_APPEND },
{ O_NONBLOCK, P9_DOTL_NONBLOCK },
{ O_DSYNC, P9_DOTL_DSYNC },
@@ -246,7 +245,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
int *opened)
{
int err = 0;
- gid_t gid;
+ kgid_t gid;
umode_t mode;
char *name = NULL;
struct p9_qid qid;
@@ -268,8 +267,14 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
}
/* Only creates */
- if (!(flags & O_CREAT) || dentry->d_inode)
- return finish_no_open(file, res);
+ if (!(flags & O_CREAT))
+ return finish_no_open(file, res);
+ else if (dentry->d_inode) {
+ if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
+ return -EEXIST;
+ else
+ return finish_no_open(file, res);
+ }
v9ses = v9fs_inode2v9ses(dir);
@@ -391,7 +396,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
int err;
struct v9fs_session_info *v9ses;
struct p9_fid *fid = NULL, *dfid = NULL;
- gid_t gid;
+ kgid_t gid;
char *name;
umode_t mode;
struct inode *inode;
@@ -693,7 +698,7 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
const char *symname)
{
int err;
- gid_t gid;
+ kgid_t gid;
char *name;
struct p9_qid qid;
struct inode *inode;
@@ -833,7 +838,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
dev_t rdev)
{
int err;
- gid_t gid;
+ kgid_t gid;
char *name;
umode_t mode;
struct v9fs_session_info *v9ses;
diff --git a/fs/Kconfig b/fs/Kconfig
index cfe512fd1ca..780725a463b 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -68,16 +68,6 @@ source "fs/quota/Kconfig"
source "fs/autofs4/Kconfig"
source "fs/fuse/Kconfig"
-config CUSE
- tristate "Character device in Userspace support"
- depends on FUSE_FS
- help
- This FUSE extension allows character devices to be
- implemented in userspace.
-
- If you want to develop or use userspace character device
- based on CUSE, answer Y or M.
-
config GENERIC_ACL
bool
select FS_POSIX_ACL
diff --git a/fs/adfs/Kconfig b/fs/adfs/Kconfig
index e55182a7460..c5a7787dd5e 100644
--- a/fs/adfs/Kconfig
+++ b/fs/adfs/Kconfig
@@ -1,6 +1,6 @@
config ADFS_FS
- tristate "ADFS file system support (EXPERIMENTAL)"
- depends on BLOCK && EXPERIMENTAL
+ tristate "ADFS file system support"
+ depends on BLOCK
help
The Acorn Disc Filing System is the standard file system of the
RiscOS operating system which runs on Acorn's ARM-based Risc PC
diff --git a/fs/affs/Kconfig b/fs/affs/Kconfig
index cfad9afb476..a04d9e848d0 100644
--- a/fs/affs/Kconfig
+++ b/fs/affs/Kconfig
@@ -1,6 +1,6 @@
config AFFS_FS
- tristate "Amiga FFS file system support (EXPERIMENTAL)"
- depends on BLOCK && EXPERIMENTAL
+ tristate "Amiga FFS file system support"
+ depends on BLOCK
help
The Fast File System (FFS) is the common file system used on hard
disks by Amiga(tm) systems since AmigaOS Version 1.3 (34.20). Say Y
diff --git a/fs/afs/Kconfig b/fs/afs/Kconfig
index 8f975f25b48..ebba3b18e5d 100644
--- a/fs/afs/Kconfig
+++ b/fs/afs/Kconfig
@@ -1,6 +1,6 @@
config AFS_FS
- tristate "Andrew File System support (AFS) (EXPERIMENTAL)"
- depends on INET && EXPERIMENTAL
+ tristate "Andrew File System support (AFS)"
+ depends on INET
select AF_RXRPC
select DNS_RESOLVER
help
@@ -22,8 +22,7 @@ config AFS_DEBUG
If unsure, say N.
config AFS_FSCACHE
- bool "Provide AFS client caching support (EXPERIMENTAL)"
- depends on EXPERIMENTAL
+ bool "Provide AFS client caching support"
depends on AFS_FS=m && FSCACHE || AFS_FS=y && FSCACHE=y
help
Say Y here if you want AFS data to be cached locally on disk through
diff --git a/fs/afs/afs.h b/fs/afs/afs.h
index c548aa346f0..3c462ff6db6 100644
--- a/fs/afs/afs.h
+++ b/fs/afs/afs.h
@@ -119,8 +119,8 @@ struct afs_file_status {
u64 size; /* file size */
afs_dataversion_t data_version; /* current data version */
u32 author; /* author ID */
- u32 owner; /* owner ID */
- u32 group; /* group ID */
+ kuid_t owner; /* owner ID */
+ kgid_t group; /* group ID */
afs_access_t caller_access; /* access rights for authenticated caller */
afs_access_t anon_access; /* access rights for unauthenticated caller */
umode_t mode; /* UNIX mode */
@@ -133,13 +133,6 @@ struct afs_file_status {
/*
* AFS file status change request
*/
-struct afs_store_status {
- u32 mask; /* which bits of the struct are set */
- u32 mtime_client; /* last time client changed data */
- u32 owner; /* owner ID */
- u32 group; /* group ID */
- umode_t mode; /* UNIX mode */
-};
#define AFS_SET_MTIME 0x01 /* set the mtime */
#define AFS_SET_OWNER 0x02 /* set the owner ID */
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index b960ff05ea0..c2e930ec288 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -42,6 +42,8 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp,
umode_t mode;
u64 data_version, size;
u32 changed = 0; /* becomes non-zero if ctime-type changes seen */
+ kuid_t owner;
+ kgid_t group;
#define EXTRACT(DST) \
do { \
@@ -56,7 +58,9 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp,
size = ntohl(*bp++);
data_version = ntohl(*bp++);
EXTRACT(status->author);
- EXTRACT(status->owner);
+ owner = make_kuid(&init_user_ns, ntohl(*bp++));
+ changed |= !uid_eq(owner, status->owner);
+ status->owner = owner;
EXTRACT(status->caller_access); /* call ticket dependent */
EXTRACT(status->anon_access);
EXTRACT(status->mode);
@@ -65,7 +69,9 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp,
bp++; /* seg size */
status->mtime_client = ntohl(*bp++);
status->mtime_server = ntohl(*bp++);
- EXTRACT(status->group);
+ group = make_kgid(&init_user_ns, ntohl(*bp++));
+ changed |= !gid_eq(group, status->group);
+ status->group = group;
bp++; /* sync counter */
data_version |= (u64) ntohl(*bp++) << 32;
EXTRACT(status->lock_count);
@@ -181,12 +187,12 @@ static void xdr_encode_AFS_StoreStatus(__be32 **_bp, struct iattr *attr)
if (attr->ia_valid & ATTR_UID) {
mask |= AFS_SET_OWNER;
- owner = attr->ia_uid;
+ owner = from_kuid(&init_user_ns, attr->ia_uid);
}
if (attr->ia_valid & ATTR_GID) {
mask |= AFS_SET_GROUP;
- group = attr->ia_gid;
+ group = from_kgid(&init_user_ns, attr->ia_gid);
}
if (attr->ia_valid & ATTR_MODE) {
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 95cffd38239..789bc253b5f 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -69,7 +69,7 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key)
set_nlink(inode, vnode->status.nlink);
inode->i_uid = vnode->status.owner;
- inode->i_gid = 0;
+ inode->i_gid = GLOBAL_ROOT_GID;
inode->i_size = vnode->status.size;
inode->i_ctime.tv_sec = vnode->status.mtime_server;
inode->i_ctime.tv_nsec = 0;
@@ -175,8 +175,8 @@ struct inode *afs_iget_autocell(struct inode *dir, const char *dev_name,
inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
inode->i_op = &afs_autocell_inode_operations;
set_nlink(inode, 2);
- inode->i_uid = 0;
- inode->i_gid = 0;
+ inode->i_uid = GLOBAL_ROOT_UID;
+ inode->i_gid = GLOBAL_ROOT_GID;
inode->i_ctime.tv_sec = get_seconds();
inode->i_ctime.tv_nsec = 0;
inode->i_atime = inode->i_mtime = inode->i_ctime;
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 43165009428..7c31ec39957 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -24,6 +24,8 @@
#include <linux/parser.h>
#include <linux/statfs.h>
#include <linux/sched.h>
+#include <linux/nsproxy.h>
+#include <net/net_namespace.h>
#include "internal.h"
#define AFS_FS_MAGIC 0x6B414653 /* 'kAFS' */
@@ -363,6 +365,10 @@ static struct dentry *afs_mount(struct file_system_type *fs_type,
memset(&params, 0, sizeof(params));
+ ret = -EINVAL;
+ if (current->nsproxy->net_ns != &init_net)
+ goto error;
+
/* parse the options and device name */
if (options) {
ret = afs_parse_options(&params, options, &dev_name);
diff --git a/fs/aio.c b/fs/aio.c
index 71f613cf4a8..064bfbe3756 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -101,7 +101,7 @@ static int aio_setup_ring(struct kioctx *ctx)
struct aio_ring *ring;
struct aio_ring_info *info = &ctx->ring_info;
unsigned nr_events = ctx->max_reqs;
- unsigned long size;
+ unsigned long size, populate;
int nr_pages;
/* Compensate for the ring buffer's head/tail overlap entry */
@@ -129,7 +129,8 @@ static int aio_setup_ring(struct kioctx *ctx)
down_write(&ctx->mm->mmap_sem);
info->mmap_base = do_mmap_pgoff(NULL, 0, info->mmap_size,
PROT_READ|PROT_WRITE,
- MAP_ANONYMOUS|MAP_PRIVATE, 0);
+ MAP_ANONYMOUS|MAP_PRIVATE, 0,
+ &populate);
if (IS_ERR((void *)info->mmap_base)) {
up_write(&ctx->mm->mmap_sem);
info->mmap_size = 0;
@@ -147,6 +148,8 @@ static int aio_setup_ring(struct kioctx *ctx)
aio_free_ring(ctx);
return -EAGAIN;
}
+ if (populate)
+ mm_populate(info->mmap_base, populate);
ctx->user_id = info->mmap_base;
diff --git a/fs/befs/Kconfig b/fs/befs/Kconfig
index 7835d30f211..edc5cc2aefa 100644
--- a/fs/befs/Kconfig
+++ b/fs/befs/Kconfig
@@ -1,6 +1,6 @@
config BEFS_FS
- tristate "BeOS file system (BeFS) support (read only) (EXPERIMENTAL)"
- depends on BLOCK && EXPERIMENTAL
+ tristate "BeOS file system (BeFS) support (read only)"
+ depends on BLOCK
select NLS
help
The BeOS File System (BeFS) is the native file system of Be, Inc's
diff --git a/fs/bfs/Kconfig b/fs/bfs/Kconfig
index c2336c62024..3728a6479c6 100644
--- a/fs/bfs/Kconfig
+++ b/fs/bfs/Kconfig
@@ -1,6 +1,6 @@
config BFS_FS
- tristate "BFS file system support (EXPERIMENTAL)"
- depends on BLOCK && EXPERIMENTAL
+ tristate "BFS file system support"
+ depends on BLOCK
help
Boot File System (BFS) is a file system used under SCO UnixWare to
allow the bootloader access to the kernel image and other important
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 11e078a747a..a5702d74d2b 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -33,6 +33,7 @@
#include <linux/elf.h>
#include <linux/utsname.h>
#include <linux/coredump.h>
+#include <linux/sched.h>
#include <asm/uaccess.h>
#include <asm/param.h>
#include <asm/page.h>
@@ -1248,7 +1249,7 @@ static int writenote(struct memelfnote *men, struct file *file,
#undef DUMP_WRITE
static void fill_elf_header(struct elfhdr *elf, int segs,
- u16 machine, u32 flags, u8 osabi)
+ u16 machine, u32 flags)
{
memset(elf, 0, sizeof(*elf));
@@ -1320,8 +1321,11 @@ static void fill_prstatus(struct elf_prstatus *prstatus,
cputime_to_timeval(cputime.utime, &prstatus->pr_utime);
cputime_to_timeval(cputime.stime, &prstatus->pr_stime);
} else {
- cputime_to_timeval(p->utime, &prstatus->pr_utime);
- cputime_to_timeval(p->stime, &prstatus->pr_stime);
+ cputime_t utime, stime;
+
+ task_cputime(p, &utime, &stime);
+ cputime_to_timeval(utime, &prstatus->pr_utime);
+ cputime_to_timeval(stime, &prstatus->pr_stime);
}
cputime_to_timeval(p->signal->cutime, &prstatus->pr_cutime);
cputime_to_timeval(p->signal->cstime, &prstatus->pr_cstime);
@@ -1630,7 +1634,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
* Initialize the ELF file header.
*/
fill_elf_header(elf, phdrs,
- view->e_machine, view->e_flags, view->ei_osabi);
+ view->e_machine, view->e_flags);
/*
* Allocate a structure for each thread.
@@ -1870,7 +1874,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
elf_core_copy_regs(&info->prstatus->pr_reg, regs);
/* Set up header */
- fill_elf_header(elf, phdrs, ELF_ARCH, ELF_CORE_EFLAGS, ELF_OSABI);
+ fill_elf_header(elf, phdrs, ELF_ARCH, ELF_CORE_EFLAGS);
/*
* Set up the notes in similar form to SVR4 core dumps made
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 30de01ca3ee..9c13e023e2b 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1375,8 +1375,11 @@ static void fill_prstatus(struct elf_prstatus *prstatus,
cputime_to_timeval(cputime.utime, &prstatus->pr_utime);
cputime_to_timeval(cputime.stime, &prstatus->pr_stime);
} else {
- cputime_to_timeval(p->utime, &prstatus->pr_utime);
- cputime_to_timeval(p->stime, &prstatus->pr_stime);
+ cputime_t utime, stime;
+
+ task_cputime(p, &utime, &stime);
+ cputime_to_timeval(utime, &prstatus->pr_utime);
+ cputime_to_timeval(stime, &prstatus->pr_stime);
}
cputime_to_timeval(p->signal->cutime, &prstatus->pr_cutime);
cputime_to_timeval(p->signal->cstime, &prstatus->pr_cstime);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 7d6bdfc6b7b..53f5fae5cfb 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -994,6 +994,7 @@ int revalidate_disk(struct gendisk *disk)
mutex_lock(&bdev->bd_mutex);
check_disk_size_change(disk, bdev);
+ bdev->bd_invalidated = 0;
mutex_unlock(&bdev->bd_mutex);
bdput(bdev);
return ret;
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index d33f01c08b6..ccd25ba7a9a 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -1,6 +1,5 @@
config BTRFS_FS
- tristate "Btrfs filesystem (EXPERIMENTAL) Unstable disk format"
- depends on EXPERIMENTAL
+ tristate "Btrfs filesystem Unstable disk format"
select LIBCRC32C
select ZLIB_INFLATE
select ZLIB_DEFLATE
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 521e9d4424f..1e59ed575cc 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3997,7 +3997,7 @@ again:
* We make the other tasks wait for the flush only when we can flush
* all things.
*/
- if (ret && flush == BTRFS_RESERVE_FLUSH_ALL) {
+ if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
flushing = true;
space_info->flush = 1;
}
@@ -4534,7 +4534,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
unsigned nr_extents = 0;
int extra_reserve = 0;
enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
- int ret;
+ int ret = 0;
bool delalloc_lock = true;
/* If we are a free space inode we need to not flush since we will be in
@@ -4579,20 +4579,18 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
csum_bytes = BTRFS_I(inode)->csum_bytes;
spin_unlock(&BTRFS_I(inode)->lock);
- if (root->fs_info->quota_enabled) {
+ if (root->fs_info->quota_enabled)
ret = btrfs_qgroup_reserve(root, num_bytes +
nr_extents * root->leafsize);
- if (ret) {
- spin_lock(&BTRFS_I(inode)->lock);
- calc_csum_metadata_size(inode, num_bytes, 0);
- spin_unlock(&BTRFS_I(inode)->lock);
- if (delalloc_lock)
- mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
- return ret;
- }
- }
- ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
+ /*
+ * ret != 0 here means the qgroup reservation failed, we go straight to
+ * the shared error handling then.
+ */
+ if (ret == 0)
+ ret = reserve_metadata_bytes(root, block_rsv,
+ to_reserve, flush);
+
if (ret) {
u64 to_free = 0;
unsigned dropped;
@@ -5560,7 +5558,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
int empty_cluster = 2 * 1024 * 1024;
struct btrfs_space_info *space_info;
int loop = 0;
- int index = 0;
+ int index = __get_raid_index(data);
int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ?
RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
bool found_uncached_bg = false;
@@ -6524,7 +6522,7 @@ reada:
}
/*
- * hepler to process tree block while walking down the tree.
+ * helper to process tree block while walking down the tree.
*
* when wc->stage == UPDATE_BACKREF, this function updates
* back refs for pointers in the block.
@@ -6599,7 +6597,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
}
/*
- * hepler to process tree block pointer.
+ * helper to process tree block pointer.
*
* when wc->stage == DROP_REFERENCE, this function checks
* reference count of the block pointed to. if the block
@@ -6737,7 +6735,7 @@ skip:
}
/*
- * hepler to process tree block while walking up the tree.
+ * helper to process tree block while walking up the tree.
*
* when wc->stage == DROP_REFERENCE, this function drops
* reference count on the block.
@@ -6788,11 +6786,13 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
&wc->flags[level]);
if (ret < 0) {
btrfs_tree_unlock_rw(eb, path->locks[level]);
+ path->locks[level] = 0;
return ret;
}
BUG_ON(wc->refs[level] == 0);
if (wc->refs[level] == 1) {
btrfs_tree_unlock_rw(eb, path->locks[level]);
+ path->locks[level] = 0;
return 1;
}
}
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index f169d6b11d7..fdb7a8db3b5 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -171,6 +171,10 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next)
if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags))
return 0;
+ if (test_bit(EXTENT_FLAG_LOGGING, &prev->flags) ||
+ test_bit(EXTENT_FLAG_LOGGING, &next->flags))
+ return 0;
+
if (extent_map_end(prev) == next->start &&
prev->flags == next->flags &&
prev->bdev == next->bdev &&
@@ -255,7 +259,8 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len,
if (!em)
goto out;
- list_move(&em->list, &tree->modified_extents);
+ if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
+ list_move(&em->list, &tree->modified_extents);
em->generation = gen;
clear_bit(EXTENT_FLAG_PINNED, &em->flags);
em->mod_start = em->start;
@@ -280,6 +285,13 @@ out:
}
+void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em)
+{
+ clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
+ if (em->in_tree)
+ try_merge_map(tree, em);
+}
+
/**
* add_extent_mapping - add new extent map to the extent tree
* @tree: tree to insert new map in
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 922943ce29e..c6598c89cff 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -69,6 +69,7 @@ void free_extent_map(struct extent_map *em);
int __init extent_map_init(void);
void extent_map_exit(void);
int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen);
+void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em);
struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
u64 start, u64 len);
#endif
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index bd38cef4235..94aa53b3872 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -460,8 +460,8 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
if (!contig)
offset = page_offset(bvec->bv_page) + bvec->bv_offset;
- if (!contig && (offset >= ordered->file_offset + ordered->len ||
- offset < ordered->file_offset)) {
+ if (offset >= ordered->file_offset + ordered->len ||
+ offset < ordered->file_offset) {
unsigned long bytes_left;
sums->len = this_sum_bytes;
this_sum_bytes = 0;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 4118e0b6e33..4b241fe9d2f 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -293,15 +293,24 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
struct btrfs_key key;
struct btrfs_ioctl_defrag_range_args range;
int num_defrag;
+ int index;
+ int ret;
/* get the inode */
key.objectid = defrag->root;
btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
key.offset = (u64)-1;
+
+ index = srcu_read_lock(&fs_info->subvol_srcu);
+
inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
if (IS_ERR(inode_root)) {
- kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
- return PTR_ERR(inode_root);
+ ret = PTR_ERR(inode_root);
+ goto cleanup;
+ }
+ if (btrfs_root_refs(&inode_root->root_item) == 0) {
+ ret = -ENOENT;
+ goto cleanup;
}
key.objectid = defrag->ino;
@@ -309,9 +318,10 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
key.offset = 0;
inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
if (IS_ERR(inode)) {
- kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
- return PTR_ERR(inode);
+ ret = PTR_ERR(inode);
+ goto cleanup;
}
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
/* do a chunk of defrag */
clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
@@ -346,6 +356,10 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
iput(inode);
return 0;
+cleanup:
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+ kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+ return ret;
}
/*
@@ -1594,9 +1608,10 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
if (err < 0 && num_written > 0)
num_written = err;
}
-out:
+
if (sync)
atomic_dec(&BTRFS_I(inode)->sync_writers);
+out:
sb_end_write(inode->i_sb);
current->backing_dev_info = NULL;
return num_written ? num_written : err;
@@ -2241,6 +2256,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
if (lockend <= lockstart)
lockend = lockstart + root->sectorsize;
+ lockend--;
len = lockend - lockstart + 1;
len = max_t(u64, len, root->sectorsize);
@@ -2307,9 +2323,12 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
}
}
- *offset = start;
- free_extent_map(em);
- break;
+ if (!test_bit(EXTENT_FLAG_PREALLOC,
+ &em->flags)) {
+ *offset = start;
+ free_extent_map(em);
+ break;
+ }
}
}
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 59ea2e4349c..0be7a8742a4 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1862,11 +1862,13 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct btrfs_free_space *info;
- int ret = 0;
+ int ret;
+ bool re_search = false;
spin_lock(&ctl->tree_lock);
again:
+ ret = 0;
if (!bytes)
goto out_lock;
@@ -1879,17 +1881,17 @@ again:
info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
1, 0);
if (!info) {
- /* the tree logging code might be calling us before we
- * have fully loaded the free space rbtree for this
- * block group. So it is possible the entry won't
- * be in the rbtree yet at all. The caching code
- * will make sure not to put it in the rbtree if
- * the logging code has pinned it.
+ /*
+ * If we found a partial bit of our free space in a
+ * bitmap but then couldn't find the other part this may
+ * be a problem, so WARN about it.
*/
+ WARN_ON(re_search);
goto out_lock;
}
}
+ re_search = false;
if (!info->bitmap) {
unlink_free_space(ctl, info);
if (offset == info->offset) {
@@ -1935,8 +1937,10 @@ again:
}
ret = remove_from_bitmap(ctl, info, &offset, &bytes);
- if (ret == -EAGAIN)
+ if (ret == -EAGAIN) {
+ re_search = true;
goto again;
+ }
BUG_ON(ret); /* logic error */
out_lock:
spin_unlock(&ctl->tree_lock);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 02d946a61dd..55c07b65037 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -88,7 +88,7 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
[S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK,
};
-static int btrfs_setsize(struct inode *inode, loff_t newsize);
+static int btrfs_setsize(struct inode *inode, struct iattr *attr);
static int btrfs_truncate(struct inode *inode);
static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
static noinline int cow_file_range(struct inode *inode,
@@ -2478,6 +2478,18 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
continue;
}
nr_truncate++;
+
+ /* 1 for the orphan item deletion. */
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+ ret = btrfs_orphan_add(trans, inode);
+ btrfs_end_transaction(trans, root);
+ if (ret)
+ goto out;
+
ret = btrfs_truncate(inode);
} else {
nr_unlink++;
@@ -3665,6 +3677,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
block_end - cur_offset, 0);
if (IS_ERR(em)) {
err = PTR_ERR(em);
+ em = NULL;
break;
}
last_byte = min(extent_map_end(em), block_end);
@@ -3748,16 +3761,27 @@ next:
return err;
}
-static int btrfs_setsize(struct inode *inode, loff_t newsize)
+static int btrfs_setsize(struct inode *inode, struct iattr *attr)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
loff_t oldsize = i_size_read(inode);
+ loff_t newsize = attr->ia_size;
+ int mask = attr->ia_valid;
int ret;
if (newsize == oldsize)
return 0;
+ /*
+ * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
+ * special case where we need to update the times despite not having
+ * these flags set. For all other operations the VFS set these flags
+ * explicitly if it wants a timestamp update.
+ */
+ if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME))))
+ inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb);
+
if (newsize > oldsize) {
truncate_pagecache(inode, oldsize, newsize);
ret = btrfs_cont_expand(inode, oldsize, newsize);
@@ -3783,9 +3807,34 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize)
set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
&BTRFS_I(inode)->runtime_flags);
+ /*
+ * 1 for the orphan item we're going to add
+ * 1 for the orphan item deletion.
+ */
+ trans = btrfs_start_transaction(root, 2);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ /*
+ * We need to do this in case we fail at _any_ point during the
+ * actual truncate. Once we do the truncate_setsize we could
+ * invalidate pages which forces any outstanding ordered io to
+ * be instantly completed which will give us extents that need
+ * to be truncated. If we fail to get an orphan inode down we
+ * could have left over extents that were never meant to live,
+ * so we need to garuntee from this point on that everything
+ * will be consistent.
+ */
+ ret = btrfs_orphan_add(trans, inode);
+ btrfs_end_transaction(trans, root);
+ if (ret)
+ return ret;
+
/* we don't support swapfiles, so vmtruncate shouldn't fail */
truncate_setsize(inode, newsize);
ret = btrfs_truncate(inode);
+ if (ret && inode->i_nlink)
+ btrfs_orphan_del(NULL, inode);
}
return ret;
@@ -3805,7 +3854,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
return err;
if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
- err = btrfs_setsize(inode, attr->ia_size);
+ err = btrfs_setsize(inode, attr);
if (err)
return err;
}
@@ -5572,10 +5621,13 @@ struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *pag
return em;
if (em) {
/*
- * if our em maps to a hole, there might
- * actually be delalloc bytes behind it
+ * if our em maps to
+ * - a hole or
+ * - a pre-alloc extent,
+ * there might actually be delalloc bytes behind it.
*/
- if (em->block_start != EXTENT_MAP_HOLE)
+ if (em->block_start != EXTENT_MAP_HOLE &&
+ !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
return em;
else
hole_em = em;
@@ -5657,6 +5709,8 @@ struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *pag
*/
em->block_start = hole_em->block_start;
em->block_len = hole_len;
+ if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags))
+ set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
} else {
em->start = range_start;
em->len = found;
@@ -6915,11 +6969,9 @@ static int btrfs_truncate(struct inode *inode)
/*
* 1 for the truncate slack space
- * 1 for the orphan item we're going to add
- * 1 for the orphan item deletion
* 1 for updating the inode.
*/
- trans = btrfs_start_transaction(root, 4);
+ trans = btrfs_start_transaction(root, 2);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
goto out;
@@ -6930,12 +6982,6 @@ static int btrfs_truncate(struct inode *inode)
min_size);
BUG_ON(ret);
- ret = btrfs_orphan_add(trans, inode);
- if (ret) {
- btrfs_end_transaction(trans, root);
- goto out;
- }
-
/*
* setattr is responsible for setting the ordered_data_close flag,
* but that is only tested during the last file release. That
@@ -7004,12 +7050,6 @@ static int btrfs_truncate(struct inode *inode)
ret = btrfs_orphan_del(trans, inode);
if (ret)
err = ret;
- } else if (ret && inode->i_nlink > 0) {
- /*
- * Failed to do the truncate, remove us from the in memory
- * orphan list.
- */
- ret = btrfs_orphan_del(NULL, inode);
}
if (trans) {
@@ -7531,41 +7571,61 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
*/
int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
{
- struct list_head *head = &root->fs_info->delalloc_inodes;
struct btrfs_inode *binode;
struct inode *inode;
struct btrfs_delalloc_work *work, *next;
struct list_head works;
+ struct list_head splice;
int ret = 0;
if (root->fs_info->sb->s_flags & MS_RDONLY)
return -EROFS;
INIT_LIST_HEAD(&works);
-
+ INIT_LIST_HEAD(&splice);
+again:
spin_lock(&root->fs_info->delalloc_lock);
- while (!list_empty(head)) {
- binode = list_entry(head->next, struct btrfs_inode,
+ list_splice_init(&root->fs_info->delalloc_inodes, &splice);
+ while (!list_empty(&splice)) {
+ binode = list_entry(splice.next, struct btrfs_inode,
delalloc_inodes);
+
+ list_del_init(&binode->delalloc_inodes);
+
inode = igrab(&binode->vfs_inode);
if (!inode)
- list_del_init(&binode->delalloc_inodes);
+ continue;
+
+ list_add_tail(&binode->delalloc_inodes,
+ &root->fs_info->delalloc_inodes);
spin_unlock(&root->fs_info->delalloc_lock);
- if (inode) {
- work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
- if (!work) {
- ret = -ENOMEM;
- goto out;
- }
- list_add_tail(&work->list, &works);
- btrfs_queue_worker(&root->fs_info->flush_workers,
- &work->work);
+
+ work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
+ if (unlikely(!work)) {
+ ret = -ENOMEM;
+ goto out;
}
+ list_add_tail(&work->list, &works);
+ btrfs_queue_worker(&root->fs_info->flush_workers,
+ &work->work);
+
cond_resched();
spin_lock(&root->fs_info->delalloc_lock);
}
spin_unlock(&root->fs_info->delalloc_lock);
+ list_for_each_entry_safe(work, next, &works, list) {
+ list_del_init(&work->list);
+ btrfs_wait_and_free_delalloc_work(work);
+ }
+
+ spin_lock(&root->fs_info->delalloc_lock);
+ if (!list_empty(&root->fs_info->delalloc_inodes)) {
+ spin_unlock(&root->fs_info->delalloc_lock);
+ goto again;
+ }
+ spin_unlock(&root->fs_info->delalloc_lock);
+
/* the filemap_flush will queue IO into the worker threads, but
* we have to make sure the IO is actually started and that
* ordered extents get created before we return
@@ -7578,11 +7638,18 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
atomic_read(&root->fs_info->async_delalloc_pages) == 0));
}
atomic_dec(&root->fs_info->async_submit_draining);
+ return 0;
out:
list_for_each_entry_safe(work, next, &works, list) {
list_del_init(&work->list);
btrfs_wait_and_free_delalloc_work(work);
}
+
+ if (!list_empty_careful(&splice)) {
+ spin_lock(&root->fs_info->delalloc_lock);
+ list_splice_tail(&splice, &root->fs_info->delalloc_inodes);
+ spin_unlock(&root->fs_info->delalloc_lock);
+ }
return ret;
}
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 61045adc307..c3f09f71bed 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -515,7 +515,6 @@ static noinline int create_subvol(struct btrfs_root *root,
BUG_ON(ret);
- d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry));
fail:
if (async_transid) {
*async_transid = trans->transid;
@@ -525,6 +524,10 @@ fail:
}
if (err && !ret)
ret = err;
+
+ if (!ret)
+ d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry));
+
return ret;
}
@@ -1339,7 +1342,8 @@ static noinline int btrfs_ioctl_resize(struct file *file,
if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
1)) {
pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
- return -EINPROGRESS;
+ mnt_drop_write_file(file);
+ return -EINVAL;
}
mutex_lock(&root->fs_info->volume_mutex);
@@ -1362,6 +1366,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
printk(KERN_INFO "btrfs: resizing devid %llu\n",
(unsigned long long)devid);
}
+
device = btrfs_find_device(root->fs_info, devid, NULL, NULL);
if (!device) {
printk(KERN_INFO "btrfs: resizer unable to find device %llu\n",
@@ -1369,9 +1374,10 @@ static noinline int btrfs_ioctl_resize(struct file *file,
ret = -EINVAL;
goto out_free;
}
- if (device->fs_devices && device->fs_devices->seeding) {
+
+ if (!device->writeable) {
printk(KERN_INFO "btrfs: resizer unable to apply on "
- "seeding device %llu\n",
+ "readonly device %llu\n",
(unsigned long long)devid);
ret = -EINVAL;
goto out_free;
@@ -1443,8 +1449,8 @@ out_free:
kfree(vol_args);
out:
mutex_unlock(&root->fs_info->volume_mutex);
- mnt_drop_write_file(file);
atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
+ mnt_drop_write_file(file);
return ret;
}
@@ -2095,13 +2101,13 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
err = inode_permission(inode, MAY_WRITE | MAY_EXEC);
if (err)
goto out_dput;
-
- /* check if subvolume may be deleted by a non-root user */
- err = btrfs_may_delete(dir, dentry, 1);
- if (err)
- goto out_dput;
}
+ /* check if subvolume may be deleted by a user */
+ err = btrfs_may_delete(dir, dentry, 1);
+ if (err)
+ goto out_dput;
+
if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) {
err = -EINVAL;
goto out_dput;
@@ -2183,19 +2189,20 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
struct btrfs_ioctl_defrag_range_args *range;
int ret;
- if (btrfs_root_readonly(root))
- return -EROFS;
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
1)) {
pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
- return -EINPROGRESS;
+ mnt_drop_write_file(file);
+ return -EINVAL;
}
- ret = mnt_want_write_file(file);
- if (ret) {
- atomic_set(&root->fs_info->mutually_exclusive_operation_running,
- 0);
- return ret;
+
+ if (btrfs_root_readonly(root)) {
+ ret = -EROFS;
+ goto out;
}
switch (inode->i_mode & S_IFMT) {
@@ -2247,8 +2254,8 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
ret = -EINVAL;
}
out:
- mnt_drop_write_file(file);
atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
+ mnt_drop_write_file(file);
return ret;
}
@@ -2263,7 +2270,7 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
1)) {
pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
- return -EINPROGRESS;
+ return -EINVAL;
}
mutex_lock(&root->fs_info->volume_mutex);
@@ -2300,7 +2307,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
1)) {
pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
mnt_drop_write_file(file);
- return -EINPROGRESS;
+ return -EINVAL;
}
mutex_lock(&root->fs_info->volume_mutex);
@@ -2316,8 +2323,8 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
kfree(vol_args);
out:
mutex_unlock(&root->fs_info->volume_mutex);
- mnt_drop_write_file(file);
atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
+ mnt_drop_write_file(file);
return ret;
}
@@ -3437,8 +3444,8 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_ioctl_balance_args *bargs;
struct btrfs_balance_control *bctl;
+ bool need_unlock; /* for mut. excl. ops lock */
int ret;
- int need_to_clear_lock = 0;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -3447,14 +3454,61 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
if (ret)
return ret;
- mutex_lock(&fs_info->volume_mutex);
+again:
+ if (!atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) {
+ mutex_lock(&fs_info->volume_mutex);
+ mutex_lock(&fs_info->balance_mutex);
+ need_unlock = true;
+ goto locked;
+ }
+
+ /*
+ * mut. excl. ops lock is locked. Three possibilites:
+ * (1) some other op is running
+ * (2) balance is running
+ * (3) balance is paused -- special case (think resume)
+ */
mutex_lock(&fs_info->balance_mutex);
+ if (fs_info->balance_ctl) {
+ /* this is either (2) or (3) */
+ if (!atomic_read(&fs_info->balance_running)) {
+ mutex_unlock(&fs_info->balance_mutex);
+ if (!mutex_trylock(&fs_info->volume_mutex))
+ goto again;
+ mutex_lock(&fs_info->balance_mutex);
+
+ if (fs_info->balance_ctl &&
+ !atomic_read(&fs_info->balance_running)) {
+ /* this is (3) */
+ need_unlock = false;
+ goto locked;
+ }
+
+ mutex_unlock(&fs_info->balance_mutex);
+ mutex_unlock(&fs_info->volume_mutex);
+ goto again;
+ } else {
+ /* this is (2) */
+ mutex_unlock(&fs_info->balance_mutex);
+ ret = -EINPROGRESS;
+ goto out;
+ }
+ } else {
+ /* this is (1) */
+ mutex_unlock(&fs_info->balance_mutex);
+ pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+locked:
+ BUG_ON(!atomic_read(&fs_info->mutually_exclusive_operation_running));
if (arg) {
bargs = memdup_user(arg, sizeof(*bargs));
if (IS_ERR(bargs)) {
ret = PTR_ERR(bargs);
- goto out;
+ goto out_unlock;
}
if (bargs->flags & BTRFS_BALANCE_RESUME) {
@@ -3474,13 +3528,10 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
bargs = NULL;
}
- if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
- 1)) {
- pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
+ if (fs_info->balance_ctl) {
ret = -EINPROGRESS;
goto out_bargs;
}
- need_to_clear_lock = 1;
bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
if (!bctl) {
@@ -3501,11 +3552,17 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
}
do_balance:
- ret = btrfs_balance(bctl, bargs);
/*
- * bctl is freed in __cancel_balance or in free_fs_info if
- * restriper was paused all the way until unmount
+ * Ownership of bctl and mutually_exclusive_operation_running
+ * goes to to btrfs_balance. bctl is freed in __cancel_balance,
+ * or, if restriper was paused all the way until unmount, in
+ * free_fs_info. mutually_exclusive_operation_running is
+ * cleared in __cancel_balance.
*/
+ need_unlock = false;
+
+ ret = btrfs_balance(bctl, bargs);
+
if (arg) {
if (copy_to_user(arg, bargs, sizeof(*bargs)))
ret = -EFAULT;
@@ -3513,12 +3570,12 @@ do_balance:
out_bargs:
kfree(bargs);
-out:
- if (need_to_clear_lock)
- atomic_set(&root->fs_info->mutually_exclusive_operation_running,
- 0);
+out_unlock:
mutex_unlock(&fs_info->balance_mutex);
mutex_unlock(&fs_info->volume_mutex);
+ if (need_unlock)
+ atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+out:
mnt_drop_write_file(file);
return ret;
}
@@ -3698,6 +3755,11 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
goto drop_write;
}
+ if (!sa->qgroupid) {
+ ret = -EINVAL;
+ goto out;
+ }
+
trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index f1073129704..e5ed5672960 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -836,9 +836,16 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
* if the disk i_size is already at the inode->i_size, or
* this ordered extent is inside the disk i_size, we're done
*/
- if (disk_i_size == i_size || offset <= disk_i_size) {
+ if (disk_i_size == i_size)
+ goto out;
+
+ /*
+ * We still need to update disk_i_size if outstanding_isize is greater
+ * than disk_i_size.
+ */
+ if (offset <= disk_i_size &&
+ (!ordered || ordered->outstanding_isize <= disk_i_size))
goto out;
- }
/*
* walk backward from this ordered extent to disk_i_size.
@@ -870,7 +877,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
break;
if (test->file_offset >= i_size)
break;
- if (test->file_offset >= disk_i_size) {
+ if (entry_end(test) > disk_i_size) {
/*
* we don't update disk_i_size now, so record this
* undealt i_size. Or we will not know the real
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index fe9d02c45f8..a5c85623432 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -379,6 +379,13 @@ next1:
ret = add_relation_rb(fs_info, found_key.objectid,
found_key.offset);
+ if (ret == -ENOENT) {
+ printk(KERN_WARNING
+ "btrfs: orphan qgroup relation 0x%llx->0x%llx\n",
+ (unsigned long long)found_key.objectid,
+ (unsigned long long)found_key.offset);
+ ret = 0; /* ignore the error */
+ }
if (ret)
goto out;
next2:
@@ -956,17 +963,28 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 qgroupid)
{
struct btrfs_root *quota_root;
+ struct btrfs_qgroup *qgroup;
int ret = 0;
quota_root = fs_info->quota_root;
if (!quota_root)
return -EINVAL;
+ /* check if there are no relations to this qgroup */
+ spin_lock(&fs_info->qgroup_lock);
+ qgroup = find_qgroup_rb(fs_info, qgroupid);
+ if (qgroup) {
+ if (!list_empty(&qgroup->groups) || !list_empty(&qgroup->members)) {
+ spin_unlock(&fs_info->qgroup_lock);
+ return -EBUSY;
+ }
+ }
+ spin_unlock(&fs_info->qgroup_lock);
+
ret = del_qgroup_item(trans, quota_root, qgroupid);
spin_lock(&fs_info->qgroup_lock);
del_qgroup_rb(quota_root->fs_info, qgroupid);
-
spin_unlock(&fs_info->qgroup_lock);
return ret;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 300e09ac365..17c306bf177 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3472,7 +3472,7 @@ out:
}
/*
- * hepler to find all tree blocks that reference a given data extent
+ * helper to find all tree blocks that reference a given data extent
*/
static noinline_for_stack
int add_data_references(struct reloc_control *rc,
@@ -3566,7 +3566,7 @@ int add_data_references(struct reloc_control *rc,
}
/*
- * hepler to find next unprocessed extent
+ * helper to find next unprocessed extent
*/
static noinline_for_stack
int find_next_extent(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index bdbb94f245c..67783e03d12 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -580,20 +580,29 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
int corrected = 0;
struct btrfs_key key;
struct inode *inode = NULL;
+ struct btrfs_fs_info *fs_info;
u64 end = offset + PAGE_SIZE - 1;
struct btrfs_root *local_root;
+ int srcu_index;
key.objectid = root;
key.type = BTRFS_ROOT_ITEM_KEY;
key.offset = (u64)-1;
- local_root = btrfs_read_fs_root_no_name(fixup->root->fs_info, &key);
- if (IS_ERR(local_root))
+
+ fs_info = fixup->root->fs_info;
+ srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
+
+ local_root = btrfs_read_fs_root_no_name(fs_info, &key);
+ if (IS_ERR(local_root)) {
+ srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
return PTR_ERR(local_root);
+ }
key.type = BTRFS_INODE_ITEM_KEY;
key.objectid = inum;
key.offset = 0;
- inode = btrfs_iget(fixup->root->fs_info->sb, &key, local_root, NULL);
+ inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
+ srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -606,7 +615,6 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
}
if (PageUptodate(page)) {
- struct btrfs_fs_info *fs_info;
if (PageDirty(page)) {
/*
* we need to write the data to the defect sector. the
@@ -3180,18 +3188,25 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
u64 physical_for_dev_replace;
u64 len;
struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info;
+ int srcu_index;
key.objectid = root;
key.type = BTRFS_ROOT_ITEM_KEY;
key.offset = (u64)-1;
+
+ srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
+
local_root = btrfs_read_fs_root_no_name(fs_info, &key);
- if (IS_ERR(local_root))
+ if (IS_ERR(local_root)) {
+ srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
return PTR_ERR(local_root);
+ }
key.type = BTRFS_INODE_ITEM_KEY;
key.objectid = inum;
key.offset = 0;
inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
+ srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
if (IS_ERR(inode))
return PTR_ERR(inode);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index f80df6b0464..f4ab7a9260e 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1814,8 +1814,10 @@ static int name_cache_insert(struct send_ctx *sctx,
(unsigned long)nce->ino);
if (!nce_head) {
nce_head = kmalloc(sizeof(*nce_head), GFP_NOFS);
- if (!nce_head)
+ if (!nce_head) {
+ kfree(nce);
return -ENOMEM;
+ }
INIT_LIST_HEAD(nce_head);
ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 99545df1b86..d8982e9601d 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -267,7 +267,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
function, line, errstr);
return;
}
- trans->transaction->aborted = errno;
+ ACCESS_ONCE(trans->transaction->aborted) = errno;
__btrfs_std_error(root->fs_info, function, line, errno, NULL);
}
/*
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 87fac9a21ea..4c0067c4f76 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -112,7 +112,6 @@ loop:
* to redo the trans_no_join checks above
*/
kmem_cache_free(btrfs_transaction_cachep, cur_trans);
- cur_trans = fs_info->running_transaction;
goto loop;
} else if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
spin_unlock(&fs_info->trans_lock);
@@ -333,12 +332,14 @@ start_transaction(struct btrfs_root *root, u64 num_items, int type,
&root->fs_info->trans_block_rsv,
num_bytes, flush);
if (ret)
- return ERR_PTR(ret);
+ goto reserve_fail;
}
again:
h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
- if (!h)
- return ERR_PTR(-ENOMEM);
+ if (!h) {
+ ret = -ENOMEM;
+ goto alloc_fail;
+ }
/*
* If we are JOIN_NOLOCK we're already committing a transaction and
@@ -365,11 +366,7 @@ again:
if (ret < 0) {
/* We must get the transaction if we are JOIN_NOLOCK. */
BUG_ON(type == TRANS_JOIN_NOLOCK);
-
- if (type < TRANS_JOIN_NOLOCK)
- sb_end_intwrite(root->fs_info->sb);
- kmem_cache_free(btrfs_trans_handle_cachep, h);
- return ERR_PTR(ret);
+ goto join_fail;
}
cur_trans = root->fs_info->running_transaction;
@@ -410,6 +407,19 @@ got_it:
if (!current->journal_info && type != TRANS_USERSPACE)
current->journal_info = h;
return h;
+
+join_fail:
+ if (type < TRANS_JOIN_NOLOCK)
+ sb_end_intwrite(root->fs_info->sb);
+ kmem_cache_free(btrfs_trans_handle_cachep, h);
+alloc_fail:
+ if (num_bytes)
+ btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv,
+ num_bytes);
+reserve_fail:
+ if (qgroup_reserved)
+ btrfs_qgroup_free(root, qgroup_reserved);
+ return ERR_PTR(ret);
}
struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
@@ -1468,7 +1478,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
goto cleanup_transaction;
}
- if (cur_trans->aborted) {
+ /* Stop the commit early if ->aborted is set */
+ if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
ret = cur_trans->aborted;
goto cleanup_transaction;
}
@@ -1574,6 +1585,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
wait_event(cur_trans->writer_wait,
atomic_read(&cur_trans->num_writers) == 1);
+ /* ->aborted might be set after the previous check, so check it */
+ if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
+ ret = cur_trans->aborted;
+ goto cleanup_transaction;
+ }
/*
* the reloc mutex makes sure that we stop
* the balancing code from coming in and moving
@@ -1657,6 +1673,17 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
goto cleanup_transaction;
}
+ /*
+ * The tasks which save the space cache and inode cache may also
+ * update ->aborted, check it.
+ */
+ if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
+ ret = cur_trans->aborted;
+ mutex_unlock(&root->fs_info->tree_log_mutex);
+ mutex_unlock(&root->fs_info->reloc_mutex);
+ goto cleanup_transaction;
+ }
+
btrfs_prepare_extent_commit(trans, root);
cur_trans = root->fs_info->running_transaction;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 83186c7e45d..9027bb1e746 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3357,6 +3357,11 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
if (skip_csum)
return 0;
+ if (em->compress_type) {
+ csum_offset = 0;
+ csum_len = block_len;
+ }
+
/* block start is already adjusted for the file extent offset. */
ret = btrfs_lookup_csums_range(log->fs_info->csum_root,
em->block_start + csum_offset,
@@ -3410,13 +3415,13 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
em = list_entry(extents.next, struct extent_map, list);
list_del_init(&em->list);
- clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
/*
* If we had an error we just need to delete everybody from our
* private list.
*/
if (ret) {
+ clear_em_logging(tree, em);
free_extent_map(em);
continue;
}
@@ -3424,8 +3429,9 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
write_unlock(&tree->lock);
ret = log_one_extent(trans, inode, root, em, path);
- free_extent_map(em);
write_lock(&tree->lock);
+ clear_em_logging(tree, em);
+ free_extent_map(em);
}
WARN_ON(!list_empty(&extents));
write_unlock(&tree->lock);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 5cce6aa7401..5cbb7f4b167 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1431,7 +1431,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
}
} else {
ret = btrfs_get_bdev_and_sb(device_path,
- FMODE_READ | FMODE_EXCL,
+ FMODE_WRITE | FMODE_EXCL,
root->fs_info->bdev_holder, 0,
&bdev, &bh);
if (ret)
@@ -1556,7 +1556,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
ret = 0;
/* Notify udev that device has changed */
- btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
+ if (bdev)
+ btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
error_brelse:
brelse(bh);
@@ -2614,7 +2615,14 @@ static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
cache = btrfs_lookup_block_group(fs_info, chunk_offset);
chunk_used = btrfs_block_group_used(&cache->item);
- user_thresh = div_factor_fine(cache->key.offset, bargs->usage);
+ if (bargs->usage == 0)
+ user_thresh = 0;
+ else if (bargs->usage > 100)
+ user_thresh = cache->key.offset;
+ else
+ user_thresh = div_factor_fine(cache->key.offset,
+ bargs->usage);
+
if (chunk_used < user_thresh)
ret = 0;
@@ -2959,6 +2967,8 @@ static void __cancel_balance(struct btrfs_fs_info *fs_info)
unset_balance_control(fs_info);
ret = del_balance_item(fs_info->tree_root);
BUG_ON(ret);
+
+ atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
}
void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
@@ -3138,8 +3148,10 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
out:
if (bctl->flags & BTRFS_BALANCE_RESUME)
__cancel_balance(fs_info);
- else
+ else {
kfree(bctl);
+ atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+ }
return ret;
}
@@ -3156,7 +3168,6 @@ static int balance_kthread(void *data)
ret = btrfs_balance(fs_info->balance_ctl, NULL);
}
- atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
mutex_unlock(&fs_info->balance_mutex);
mutex_unlock(&fs_info->volume_mutex);
@@ -3179,7 +3190,6 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
return 0;
}
- WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1));
tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
if (IS_ERR(tsk))
return PTR_ERR(tsk);
@@ -3233,6 +3243,8 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
btrfs_balance_sys(leaf, item, &disk_bargs);
btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
+ WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1));
+
mutex_lock(&fs_info->volume_mutex);
mutex_lock(&fs_info->balance_mutex);
@@ -3496,7 +3508,7 @@ struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
{ 1, 1, 2, 2, 2, 2 /* raid1 */ },
{ 1, 2, 1, 1, 1, 2 /* dup */ },
{ 1, 1, 0, 2, 1, 1 /* raid0 */ },
- { 1, 1, 0, 1, 1, 1 /* single */ },
+ { 1, 1, 1, 1, 1, 1 /* single */ },
};
static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
diff --git a/fs/buffer.c b/fs/buffer.c
index b8a8b4d64d8..8e18281b407 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2359,7 +2359,7 @@ int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
if (unlikely(ret < 0))
goto out_unlock;
set_page_dirty(page);
- wait_on_page_writeback(page);
+ wait_for_stable_page(page);
return 0;
out_unlock:
unlock_page(page);
@@ -3227,7 +3227,7 @@ static struct kmem_cache *bh_cachep __read_mostly;
* Once the number of bh's in the machine exceeds this level, we start
* stripping them in writeback.
*/
-static int max_buffer_heads;
+static unsigned long max_buffer_heads;
int buffer_heads_over_limit;
@@ -3343,7 +3343,7 @@ EXPORT_SYMBOL(bh_submit_read);
void __init buffer_init(void)
{
- int nrpages;
+ unsigned long nrpages;
bh_cachep = kmem_cache_create("buffer_head",
sizeof(struct buffer_head), 0,
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
index 9eb134ea6eb..49bc78243db 100644
--- a/fs/ceph/Kconfig
+++ b/fs/ceph/Kconfig
@@ -1,6 +1,6 @@
config CEPH_FS
- tristate "Ceph distributed file system (EXPERIMENTAL)"
- depends on INET && EXPERIMENTAL
+ tristate "Ceph distributed file system"
+ depends on INET
select CEPH_LIB
select LIBCRC32C
select CRYPTO_AES
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index a1d9bb30c1b..ae2be696eb5 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -930,7 +930,7 @@ static int send_cap_msg(struct ceph_mds_session *session,
u64 size, u64 max_size,
struct timespec *mtime, struct timespec *atime,
u64 time_warp_seq,
- uid_t uid, gid_t gid, umode_t mode,
+ kuid_t uid, kgid_t gid, umode_t mode,
u64 xattr_version,
struct ceph_buffer *xattrs_buf,
u64 follows)
@@ -974,8 +974,8 @@ static int send_cap_msg(struct ceph_mds_session *session,
ceph_encode_timespec(&fc->atime, atime);
fc->time_warp_seq = cpu_to_le32(time_warp_seq);
- fc->uid = cpu_to_le32(uid);
- fc->gid = cpu_to_le32(gid);
+ fc->uid = cpu_to_le32(from_kuid(&init_user_ns, uid));
+ fc->gid = cpu_to_le32(from_kgid(&init_user_ns, gid));
fc->mode = cpu_to_le32(mode);
fc->xattr_version = cpu_to_le64(xattr_version);
@@ -1081,8 +1081,8 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
struct timespec mtime, atime;
int wake = 0;
umode_t mode;
- uid_t uid;
- gid_t gid;
+ kuid_t uid;
+ kgid_t gid;
struct ceph_mds_session *session;
u64 xattr_version = 0;
struct ceph_buffer *xattr_blob = NULL;
@@ -2359,10 +2359,11 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
inode->i_mode = le32_to_cpu(grant->mode);
- inode->i_uid = le32_to_cpu(grant->uid);
- inode->i_gid = le32_to_cpu(grant->gid);
+ inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(grant->uid));
+ inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(grant->gid));
dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
- inode->i_uid, inode->i_gid);
+ from_kuid(&init_user_ns, inode->i_uid),
+ from_kgid(&init_user_ns, inode->i_gid));
}
if ((issued & CEPH_CAP_LINK_EXCL) == 0)
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 4bc086a7247..851814d951c 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -612,10 +612,11 @@ static int fill_inode(struct inode *inode,
if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
inode->i_mode = le32_to_cpu(info->mode);
- inode->i_uid = le32_to_cpu(info->uid);
- inode->i_gid = le32_to_cpu(info->gid);
+ inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(info->uid));
+ inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(info->gid));
dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
- inode->i_uid, inode->i_gid);
+ from_kuid(&init_user_ns, inode->i_uid),
+ from_kgid(&init_user_ns, inode->i_gid));
}
if ((issued & CEPH_CAP_LINK_EXCL) == 0)
@@ -1601,26 +1602,30 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
if (ia_valid & ATTR_UID) {
dout("setattr %p uid %d -> %d\n", inode,
- inode->i_uid, attr->ia_uid);
+ from_kuid(&init_user_ns, inode->i_uid),
+ from_kuid(&init_user_ns, attr->ia_uid));
if (issued & CEPH_CAP_AUTH_EXCL) {
inode->i_uid = attr->ia_uid;
dirtied |= CEPH_CAP_AUTH_EXCL;
} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
- attr->ia_uid != inode->i_uid) {
- req->r_args.setattr.uid = cpu_to_le32(attr->ia_uid);
+ !uid_eq(attr->ia_uid, inode->i_uid)) {
+ req->r_args.setattr.uid = cpu_to_le32(
+ from_kuid(&init_user_ns, attr->ia_uid));
mask |= CEPH_SETATTR_UID;
release |= CEPH_CAP_AUTH_SHARED;
}
}
if (ia_valid & ATTR_GID) {
dout("setattr %p gid %d -> %d\n", inode,
- inode->i_gid, attr->ia_gid);
+ from_kgid(&init_user_ns, inode->i_gid),
+ from_kgid(&init_user_ns, attr->ia_gid));
if (issued & CEPH_CAP_AUTH_EXCL) {
inode->i_gid = attr->ia_gid;
dirtied |= CEPH_CAP_AUTH_EXCL;
} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
- attr->ia_gid != inode->i_gid) {
- req->r_args.setattr.gid = cpu_to_le32(attr->ia_gid);
+ !gid_eq(attr->ia_gid, inode->i_gid)) {
+ req->r_args.setattr.gid = cpu_to_le32(
+ from_kgid(&init_user_ns, attr->ia_gid));
mask |= CEPH_SETATTR_GID;
release |= CEPH_CAP_AUTH_SHARED;
}
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 9165eb8309e..7a3dfe0a9a8 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1658,8 +1658,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
head->op = cpu_to_le32(req->r_op);
- head->caller_uid = cpu_to_le32(req->r_uid);
- head->caller_gid = cpu_to_le32(req->r_gid);
+ head->caller_uid = cpu_to_le32(from_kuid(&init_user_ns, req->r_uid));
+ head->caller_gid = cpu_to_le32(from_kgid(&init_user_ns, req->r_gid));
head->args = req->r_args;
ceph_encode_filepath(&p, end, ino1, path1);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index dd26846dd71..ff4188bf619 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -184,8 +184,8 @@ struct ceph_mds_request {
union ceph_mds_request_args r_args;
int r_fmode; /* file mode, if expecting cap */
- uid_t r_uid;
- gid_t r_gid;
+ kuid_t r_uid;
+ kgid_t r_gid;
/* for choosing which mds to send this request to */
int r_direct_mode;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 66ebe720e40..f053bbd1886 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -138,8 +138,8 @@ struct ceph_cap_snap {
struct ceph_snap_context *context;
umode_t mode;
- uid_t uid;
- gid_t gid;
+ kuid_t uid;
+ kgid_t gid;
struct ceph_buffer *xattr_blob;
u64 xattr_version;
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 21ff76c22a1..2906ee27640 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -155,14 +155,14 @@ config CIFS_DFS_UPCALL
points. If unsure, say N.
config CIFS_NFSD_EXPORT
- bool "Allow nfsd to export CIFS file system (EXPERIMENTAL)"
- depends on CIFS && EXPERIMENTAL && BROKEN
+ bool "Allow nfsd to export CIFS file system"
+ depends on CIFS && BROKEN
help
Allows NFS server to export a CIFS mounted share (nfsd over cifs)
config CIFS_SMB2
- bool "SMB2 network file system support (EXPERIMENTAL)"
- depends on CIFS && EXPERIMENTAL && INET
+ bool "SMB2 network file system support"
+ depends on CIFS && INET
select NLS
select KEYS
select FSCACHE
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index ce5cbd717bf..210fce2df30 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -226,6 +226,8 @@ compose_mount_options_out:
compose_mount_options_err:
kfree(mountdata);
mountdata = ERR_PTR(rc);
+ kfree(*devname);
+ *devname = NULL;
goto compose_mount_options_out;
}
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index c865bfdfe81..37e4a72a7d1 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -55,10 +55,10 @@ struct cifs_sb_info {
unsigned int wsize;
unsigned long actimeo; /* attribute cache timeout (jiffies) */
atomic_t active;
- uid_t mnt_uid;
- gid_t mnt_gid;
- uid_t mnt_backupuid;
- gid_t mnt_backupgid;
+ kuid_t mnt_uid;
+ kgid_t mnt_gid;
+ kuid_t mnt_backupuid;
+ kgid_t mnt_backupgid;
umode_t mnt_file_mode;
umode_t mnt_dir_mode;
unsigned int mnt_cifs_flags;
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 086f381d648..10e77476129 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -149,10 +149,12 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo)
goto out;
dp = description + strlen(description);
- sprintf(dp, ";uid=0x%x", sesInfo->linux_uid);
+ sprintf(dp, ";uid=0x%x",
+ from_kuid_munged(&init_user_ns, sesInfo->linux_uid));
dp = description + strlen(description);
- sprintf(dp, ";creduid=0x%x", sesInfo->cred_uid);
+ sprintf(dp, ";creduid=0x%x",
+ from_kuid_munged(&init_user_ns, sesInfo->cred_uid));
if (sesInfo->user_name) {
dp = description + strlen(description);
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 5cbd00e7406..f1e3f25fe00 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -266,8 +266,8 @@ sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid,
struct key *sidkey;
char *sidstr;
const struct cred *saved_cred;
- uid_t fuid = cifs_sb->mnt_uid;
- gid_t fgid = cifs_sb->mnt_gid;
+ kuid_t fuid = cifs_sb->mnt_uid;
+ kgid_t fgid = cifs_sb->mnt_gid;
/*
* If we have too many subauthorities, then something is really wrong.
@@ -297,6 +297,7 @@ sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid,
* probably a safe assumption but might be better to check based on
* sidtype.
*/
+ BUILD_BUG_ON(sizeof(uid_t) != sizeof(gid_t));
if (sidkey->datalen != sizeof(uid_t)) {
rc = -EIO;
cFYI(1, "%s: Downcall contained malformed key "
@@ -305,10 +306,21 @@ sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid,
goto out_key_put;
}
- if (sidtype == SIDOWNER)
- memcpy(&fuid, &sidkey->payload.value, sizeof(uid_t));
- else
- memcpy(&fgid, &sidkey->payload.value, sizeof(gid_t));
+ if (sidtype == SIDOWNER) {
+ kuid_t uid;
+ uid_t id;
+ memcpy(&id, &sidkey->payload.value, sizeof(uid_t));
+ uid = make_kuid(&init_user_ns, id);
+ if (uid_valid(uid))
+ fuid = uid;
+ } else {
+ kgid_t gid;
+ gid_t id;
+ memcpy(&id, &sidkey->payload.value, sizeof(gid_t));
+ gid = make_kgid(&init_user_ns, id);
+ if (gid_valid(gid))
+ fgid = gid;
+ }
out_key_put:
key_put(sidkey);
@@ -346,7 +358,8 @@ init_cifs_idmap(void)
if (!cred)
return -ENOMEM;
- keyring = keyring_alloc(".cifs_idmap", 0, 0, cred,
+ keyring = keyring_alloc(".cifs_idmap",
+ GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred,
(KEY_POS_ALL & ~KEY_POS_SETATTR) |
KEY_USR_VIEW | KEY_USR_READ,
KEY_ALLOC_NOT_IN_QUOTA, NULL);
@@ -774,7 +787,7 @@ static int parse_sec_desc(struct cifs_sb_info *cifs_sb,
/* Convert permission bits from mode to equivalent CIFS ACL */
static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd,
- __u32 secdesclen, __u64 nmode, uid_t uid, gid_t gid, int *aclflag)
+ __u32 secdesclen, __u64 nmode, kuid_t uid, kgid_t gid, int *aclflag)
{
int rc = 0;
__u32 dacloffset;
@@ -806,17 +819,19 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd,
*aclflag = CIFS_ACL_DACL;
} else {
memcpy(pnntsd, pntsd, secdesclen);
- if (uid != NO_CHANGE_32) { /* chown */
+ if (uid_valid(uid)) { /* chown */
+ uid_t id;
owner_sid_ptr = (struct cifs_sid *)((char *)pnntsd +
le32_to_cpu(pnntsd->osidoffset));
nowner_sid_ptr = kmalloc(sizeof(struct cifs_sid),
GFP_KERNEL);
if (!nowner_sid_ptr)
return -ENOMEM;
- rc = id_to_sid(uid, SIDOWNER, nowner_sid_ptr);
+ id = from_kuid(&init_user_ns, uid);
+ rc = id_to_sid(id, SIDOWNER, nowner_sid_ptr);
if (rc) {
cFYI(1, "%s: Mapping error %d for owner id %d",
- __func__, rc, uid);
+ __func__, rc, id);
kfree(nowner_sid_ptr);
return rc;
}
@@ -824,17 +839,19 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd,
kfree(nowner_sid_ptr);
*aclflag = CIFS_ACL_OWNER;
}
- if (gid != NO_CHANGE_32) { /* chgrp */
+ if (gid_valid(gid)) { /* chgrp */
+ gid_t id;
group_sid_ptr = (struct cifs_sid *)((char *)pnntsd +
le32_to_cpu(pnntsd->gsidoffset));
ngroup_sid_ptr = kmalloc(sizeof(struct cifs_sid),
GFP_KERNEL);
if (!ngroup_sid_ptr)
return -ENOMEM;
- rc = id_to_sid(gid, SIDGROUP, ngroup_sid_ptr);
+ id = from_kgid(&init_user_ns, gid);
+ rc = id_to_sid(id, SIDGROUP, ngroup_sid_ptr);
if (rc) {
cFYI(1, "%s: Mapping error %d for group id %d",
- __func__, rc, gid);
+ __func__, rc, id);
kfree(ngroup_sid_ptr);
return rc;
}
@@ -1002,7 +1019,7 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
/* Convert mode bits to an ACL so we can update the ACL on the server */
int
id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode,
- uid_t uid, gid_t gid)
+ kuid_t uid, kgid_t gid)
{
int rc = 0;
int aclflag = CIFS_ACL_DACL; /* default flag to set */
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 8b35365c70b..4bad7b16271 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -375,13 +375,15 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
(int)(srcaddr->sa_family));
}
- seq_printf(s, ",uid=%u", cifs_sb->mnt_uid);
+ seq_printf(s, ",uid=%u",
+ from_kuid_munged(&init_user_ns, cifs_sb->mnt_uid));
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)
seq_printf(s, ",forceuid");
else
seq_printf(s, ",noforceuid");
- seq_printf(s, ",gid=%u", cifs_sb->mnt_gid);
+ seq_printf(s, ",gid=%u",
+ from_kgid_munged(&init_user_ns, cifs_sb->mnt_gid));
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID)
seq_printf(s, ",forcegid");
else
@@ -436,9 +438,13 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM)
seq_printf(s, ",noperm");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPUID)
- seq_printf(s, ",backupuid=%u", cifs_sb->mnt_backupuid);
+ seq_printf(s, ",backupuid=%u",
+ from_kuid_munged(&init_user_ns,
+ cifs_sb->mnt_backupuid));
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPGID)
- seq_printf(s, ",backupgid=%u", cifs_sb->mnt_backupgid);
+ seq_printf(s, ",backupgid=%u",
+ from_kgid_munged(&init_user_ns,
+ cifs_sb->mnt_backupgid));
seq_printf(s, ",rsize=%u", cifs_sb->rsize);
seq_printf(s, ",wsize=%u", cifs_sb->wsize);
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index e6899cea1c3..4f07f6fbe49 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -400,11 +400,11 @@ struct smb_vol {
char *iocharset; /* local code page for mapping to and from Unicode */
char source_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* clnt nb name */
char target_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* srvr nb name */
- uid_t cred_uid;
- uid_t linux_uid;
- gid_t linux_gid;
- uid_t backupuid;
- gid_t backupgid;
+ kuid_t cred_uid;
+ kuid_t linux_uid;
+ kgid_t linux_gid;
+ kuid_t backupuid;
+ kgid_t backupgid;
umode_t file_mode;
umode_t dir_mode;
unsigned secFlg;
@@ -703,8 +703,8 @@ struct cifs_ses {
char *serverNOS; /* name of network operating system of server */
char *serverDomain; /* security realm of server */
__u64 Suid; /* remote smb uid */
- uid_t linux_uid; /* overriding owner of files on the mount */
- uid_t cred_uid; /* owner of credentials */
+ kuid_t linux_uid; /* overriding owner of files on the mount */
+ kuid_t cred_uid; /* owner of credentials */
unsigned int capabilities;
char serverName[SERVER_NAME_LEN_WITH_NULL * 2]; /* BB make bigger for
TCP names - will ipv6 and sctp addresses fit? */
@@ -838,7 +838,7 @@ struct cifs_tcon {
*/
struct tcon_link {
struct rb_node tl_rbnode;
- uid_t tl_uid;
+ kuid_t tl_uid;
unsigned long tl_flags;
#define TCON_LINK_MASTER 0
#define TCON_LINK_PENDING 1
@@ -931,7 +931,7 @@ struct cifsFileInfo {
struct list_head tlist; /* pointer to next fid owned by tcon */
struct list_head flist; /* next fid (file instance) for this inode */
struct cifs_fid_locks *llist; /* brlocks held by this fid */
- unsigned int uid; /* allows finding which FileInfo structure */
+ kuid_t uid; /* allows finding which FileInfo structure */
__u32 pid; /* process id who opened file */
struct cifs_fid fid; /* file id from remote */
/* BB add lock scope info here if needed */ ;
@@ -1245,8 +1245,8 @@ struct cifs_fattr {
u64 cf_eof;
u64 cf_bytes;
u64 cf_createtime;
- uid_t cf_uid;
- gid_t cf_gid;
+ kuid_t cf_uid;
+ kgid_t cf_gid;
umode_t cf_mode;
dev_t cf_rdev;
unsigned int cf_nlink;
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index b9d59a948a2..e996ff6b26d 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -277,7 +277,6 @@
#define CIFS_NO_HANDLE 0xFFFF
#define NO_CHANGE_64 0xFFFFFFFFFFFFFFFFULL
-#define NO_CHANGE_32 0xFFFFFFFFUL
/* IPC$ in ASCII */
#define CIFS_IPC_RESOURCE "\x49\x50\x43\x24"
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 1988c1baa22..f450f0683dd 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -46,7 +46,8 @@ extern void _free_xid(unsigned int);
({ \
unsigned int __xid = _get_xid(); \
cFYI(1, "CIFS VFS: in %s as Xid: %u with uid: %d", \
- __func__, __xid, current_fsuid()); \
+ __func__, __xid, \
+ from_kuid(&init_user_ns, current_fsuid())); \
__xid; \
})
@@ -161,7 +162,7 @@ extern int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb,
struct cifs_fattr *fattr, struct inode *inode,
const char *path, const __u16 *pfid);
extern int id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64,
- uid_t, gid_t);
+ kuid_t, kgid_t);
extern struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *, struct inode *,
const char *, u32 *);
extern int set_cifs_acl(struct cifs_ntsd *, __u32, struct inode *,
@@ -304,8 +305,8 @@ struct cifs_unix_set_info_args {
__u64 atime;
__u64 mtime;
__u64 mode;
- __u64 uid;
- __u64 gid;
+ kuid_t uid;
+ kgid_t gid;
dev_t device;
};
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 76d0d299885..00e12f2d626 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -5819,8 +5819,14 @@ static void
cifs_fill_unix_set_info(FILE_UNIX_BASIC_INFO *data_offset,
const struct cifs_unix_set_info_args *args)
{
+ u64 uid = NO_CHANGE_64, gid = NO_CHANGE_64;
u64 mode = args->mode;
+ if (uid_valid(args->uid))
+ uid = from_kuid(&init_user_ns, args->uid);
+ if (gid_valid(args->gid))
+ gid = from_kgid(&init_user_ns, args->gid);
+
/*
* Samba server ignores set of file size to zero due to bugs in some
* older clients, but we should be precise - we use SetFileSize to
@@ -5833,8 +5839,8 @@ cifs_fill_unix_set_info(FILE_UNIX_BASIC_INFO *data_offset,
data_offset->LastStatusChange = cpu_to_le64(args->ctime);
data_offset->LastAccessTime = cpu_to_le64(args->atime);
data_offset->LastModificationTime = cpu_to_le64(args->mtime);
- data_offset->Uid = cpu_to_le64(args->uid);
- data_offset->Gid = cpu_to_le64(args->gid);
+ data_offset->Uid = cpu_to_le64(uid);
+ data_offset->Gid = cpu_to_le64(gid);
/* better to leave device as zero when it is */
data_offset->DevMajor = cpu_to_le64(MAJOR(args->device));
data_offset->DevMinor = cpu_to_le64(MINOR(args->device));
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 17c3643e595..4474a57f30a 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -987,6 +987,41 @@ static int get_option_ul(substring_t args[], unsigned long *option)
return rc;
}
+static int get_option_uid(substring_t args[], kuid_t *result)
+{
+ unsigned long value;
+ kuid_t uid;
+ int rc;
+
+ rc = get_option_ul(args, &value);
+ if (rc)
+ return rc;
+
+ uid = make_kuid(current_user_ns(), value);
+ if (!uid_valid(uid))
+ return -EINVAL;
+
+ *result = uid;
+ return 0;
+}
+
+static int get_option_gid(substring_t args[], kgid_t *result)
+{
+ unsigned long value;
+ kgid_t gid;
+ int rc;
+
+ rc = get_option_ul(args, &value);
+ if (rc)
+ return rc;
+
+ gid = make_kgid(current_user_ns(), value);
+ if (!gid_valid(gid))
+ return -EINVAL;
+
+ *result = gid;
+ return 0;
+}
static int cifs_parse_security_flavors(char *value,
struct smb_vol *vol)
@@ -1424,47 +1459,42 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
/* Numeric Values */
case Opt_backupuid:
- if (get_option_ul(args, &option)) {
+ if (get_option_uid(args, &vol->backupuid)) {
cERROR(1, "%s: Invalid backupuid value",
__func__);
goto cifs_parse_mount_err;
}
- vol->backupuid = option;
vol->backupuid_specified = true;
break;
case Opt_backupgid:
- if (get_option_ul(args, &option)) {
+ if (get_option_gid(args, &vol->backupgid)) {
cERROR(1, "%s: Invalid backupgid value",
__func__);
goto cifs_parse_mount_err;
}
- vol->backupgid = option;
vol->backupgid_specified = true;
break;
case Opt_uid:
- if (get_option_ul(args, &option)) {
+ if (get_option_uid(args, &vol->linux_uid)) {
cERROR(1, "%s: Invalid uid value",
__func__);
goto cifs_parse_mount_err;
}
- vol->linux_uid = option;
uid_specified = true;
break;
case Opt_cruid:
- if (get_option_ul(args, &option)) {
+ if (get_option_uid(args, &vol->cred_uid)) {
cERROR(1, "%s: Invalid cruid value",
__func__);
goto cifs_parse_mount_err;
}
- vol->cred_uid = option;
break;
case Opt_gid:
- if (get_option_ul(args, &option)) {
+ if (get_option_gid(args, &vol->linux_gid)) {
cERROR(1, "%s: Invalid gid value",
__func__);
goto cifs_parse_mount_err;
}
- vol->linux_gid = option;
gid_specified = true;
break;
case Opt_file_mode:
@@ -1917,7 +1947,7 @@ srcip_matches(struct sockaddr *srcaddr, struct sockaddr *rhs)
}
case AF_INET6: {
struct sockaddr_in6 *saddr6 = (struct sockaddr_in6 *)srcaddr;
- struct sockaddr_in6 *vaddr6 = (struct sockaddr_in6 *)&rhs;
+ struct sockaddr_in6 *vaddr6 = (struct sockaddr_in6 *)rhs;
return ipv6_addr_equal(&saddr6->sin6_addr, &vaddr6->sin6_addr);
}
default:
@@ -2241,7 +2271,7 @@ static int match_session(struct cifs_ses *ses, struct smb_vol *vol)
{
switch (ses->server->secType) {
case Kerberos:
- if (vol->cred_uid != ses->cred_uid)
+ if (!uid_eq(vol->cred_uid, ses->cred_uid))
return 0;
break;
default:
@@ -2713,7 +2743,7 @@ compare_mount_options(struct super_block *sb, struct cifs_mnt_data *mnt_data)
if (new->rsize && new->rsize < old->rsize)
return 0;
- if (old->mnt_uid != new->mnt_uid || old->mnt_gid != new->mnt_gid)
+ if (!uid_eq(old->mnt_uid, new->mnt_uid) || !gid_eq(old->mnt_gid, new->mnt_gid))
return 0;
if (old->mnt_file_mode != new->mnt_file_mode ||
@@ -3919,7 +3949,7 @@ cifs_set_vol_auth(struct smb_vol *vol, struct cifs_ses *ses)
}
static struct cifs_tcon *
-cifs_construct_tcon(struct cifs_sb_info *cifs_sb, uid_t fsuid)
+cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid)
{
int rc;
struct cifs_tcon *master_tcon = cifs_sb_master_tcon(cifs_sb);
@@ -3989,7 +4019,7 @@ cifs_sb_tcon_pending_wait(void *unused)
/* find and return a tlink with given uid */
static struct tcon_link *
-tlink_rb_search(struct rb_root *root, uid_t uid)
+tlink_rb_search(struct rb_root *root, kuid_t uid)
{
struct rb_node *node = root->rb_node;
struct tcon_link *tlink;
@@ -3997,9 +4027,9 @@ tlink_rb_search(struct rb_root *root, uid_t uid)
while (node) {
tlink = rb_entry(node, struct tcon_link, tl_rbnode);
- if (tlink->tl_uid > uid)
+ if (uid_gt(tlink->tl_uid, uid))
node = node->rb_left;
- else if (tlink->tl_uid < uid)
+ else if (uid_lt(tlink->tl_uid, uid))
node = node->rb_right;
else
return tlink;
@@ -4018,7 +4048,7 @@ tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink)
tlink = rb_entry(*new, struct tcon_link, tl_rbnode);
parent = *new;
- if (tlink->tl_uid > new_tlink->tl_uid)
+ if (uid_gt(tlink->tl_uid, new_tlink->tl_uid))
new = &((*new)->rb_left);
else
new = &((*new)->rb_right);
@@ -4048,7 +4078,7 @@ struct tcon_link *
cifs_sb_tlink(struct cifs_sb_info *cifs_sb)
{
int ret;
- uid_t fsuid = current_fsuid();
+ kuid_t fsuid = current_fsuid();
struct tcon_link *tlink, *newtlink;
if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER))
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 8719bbe0dcc..1cd01621744 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -342,14 +342,14 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid,
*created |= FILE_CREATED;
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
- args.uid = (__u64) current_fsuid();
+ args.uid = current_fsuid();
if (inode->i_mode & S_ISGID)
- args.gid = (__u64) inode->i_gid;
+ args.gid = inode->i_gid;
else
- args.gid = (__u64) current_fsgid();
+ args.gid = current_fsgid();
} else {
- args.uid = NO_CHANGE_64;
- args.gid = NO_CHANGE_64;
+ args.uid = INVALID_UID; /* no change */
+ args.gid = INVALID_GID; /* no change */
}
CIFSSMBUnixSetFileInfo(xid, tcon, &args, fid->netfid,
current->tgid);
@@ -588,11 +588,11 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode,
.device = device_number,
};
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
- args.uid = (__u64) current_fsuid();
- args.gid = (__u64) current_fsgid();
+ args.uid = current_fsuid();
+ args.gid = current_fsgid();
} else {
- args.uid = NO_CHANGE_64;
- args.gid = NO_CHANGE_64;
+ args.uid = INVALID_UID; /* no change */
+ args.gid = INVALID_GID; /* no change */
}
rc = CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, &args,
cifs_sb->local_nls,
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 1a5c2911b04..c16d2a018ab 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -515,8 +515,8 @@ int cifs_open(struct inode *inode, struct file *file)
*/
struct cifs_unix_set_info_args args = {
.mode = inode->i_mode,
- .uid = NO_CHANGE_64,
- .gid = NO_CHANGE_64,
+ .uid = INVALID_UID, /* no change */
+ .gid = INVALID_GID, /* no change */
.ctime = NO_CHANGE_64,
.atime = NO_CHANGE_64,
.mtime = NO_CHANGE_64,
@@ -1693,7 +1693,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode,
are always at the end of the list but since the first entry might
have a close pending, we go through the whole list */
list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
- if (fsuid_only && open_file->uid != current_fsuid())
+ if (fsuid_only && !uid_eq(open_file->uid, current_fsuid()))
continue;
if (OPEN_FMODE(open_file->f_flags) & FMODE_READ) {
if (!open_file->invalidHandle) {
@@ -1746,7 +1746,7 @@ refind_writable:
list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
if (!any_available && open_file->pid != current->tgid)
continue;
- if (fsuid_only && open_file->uid != current_fsuid())
+ if (fsuid_only && !uid_eq(open_file->uid, current_fsuid()))
continue;
if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) {
if (!open_file->invalidHandle) {
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 1fc864b92cf..d2a833999bc 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -244,15 +244,25 @@ cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, FILE_UNIX_BASIC_INFO *info,
break;
}
- if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)
- fattr->cf_uid = cifs_sb->mnt_uid;
- else
- fattr->cf_uid = le64_to_cpu(info->Uid);
-
- if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID)
- fattr->cf_gid = cifs_sb->mnt_gid;
- else
- fattr->cf_gid = le64_to_cpu(info->Gid);
+ fattr->cf_uid = cifs_sb->mnt_uid;
+ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)) {
+ u64 id = le64_to_cpu(info->Uid);
+ if (id < ((uid_t)-1)) {
+ kuid_t uid = make_kuid(&init_user_ns, id);
+ if (uid_valid(uid))
+ fattr->cf_uid = uid;
+ }
+ }
+
+ fattr->cf_gid = cifs_sb->mnt_gid;
+ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID)) {
+ u64 id = le64_to_cpu(info->Gid);
+ if (id < ((gid_t)-1)) {
+ kgid_t gid = make_kgid(&init_user_ns, id);
+ if (gid_valid(gid))
+ fattr->cf_gid = gid;
+ }
+ }
fattr->cf_nlink = le64_to_cpu(info->Nlinks);
}
@@ -1245,14 +1255,14 @@ cifs_mkdir_qinfo(struct inode *parent, struct dentry *dentry, umode_t mode,
.device = 0,
};
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
- args.uid = (__u64)current_fsuid();
+ args.uid = current_fsuid();
if (parent->i_mode & S_ISGID)
- args.gid = (__u64)parent->i_gid;
+ args.gid = parent->i_gid;
else
- args.gid = (__u64)current_fsgid();
+ args.gid = current_fsgid();
} else {
- args.uid = NO_CHANGE_64;
- args.gid = NO_CHANGE_64;
+ args.uid = INVALID_UID; /* no change */
+ args.gid = INVALID_GID; /* no change */
}
CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args,
cifs_sb->local_nls,
@@ -2013,12 +2023,12 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
if (attrs->ia_valid & ATTR_UID)
args->uid = attrs->ia_uid;
else
- args->uid = NO_CHANGE_64;
+ args->uid = INVALID_UID; /* no change */
if (attrs->ia_valid & ATTR_GID)
args->gid = attrs->ia_gid;
else
- args->gid = NO_CHANGE_64;
+ args->gid = INVALID_GID; /* no change */
if (attrs->ia_valid & ATTR_ATIME)
args->atime = cifs_UnixTimeToNT(attrs->ia_atime);
@@ -2086,8 +2096,8 @@ static int
cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
{
unsigned int xid;
- uid_t uid = NO_CHANGE_32;
- gid_t gid = NO_CHANGE_32;
+ kuid_t uid = INVALID_UID;
+ kgid_t gid = INVALID_GID;
struct inode *inode = direntry->d_inode;
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct cifsInodeInfo *cifsInode = CIFS_I(inode);
@@ -2146,7 +2156,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
#ifdef CONFIG_CIFS_ACL
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
- if (uid != NO_CHANGE_32 || gid != NO_CHANGE_32) {
+ if (uid_valid(uid) || gid_valid(gid)) {
rc = id_mode_to_cifs_acl(inode, full_path, NO_CHANGE_64,
uid, gid);
if (rc) {
@@ -2170,7 +2180,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
#ifdef CONFIG_CIFS_ACL
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
rc = id_mode_to_cifs_acl(inode, full_path, mode,
- NO_CHANGE_32, NO_CHANGE_32);
+ INVALID_UID, INVALID_GID);
if (rc) {
cFYI(1, "%s: Setting ACL failed with error: %d",
__func__, rc);
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 51dc2fb6e85..9f6c4c45d21 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -76,7 +76,7 @@ symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash)
}
rc = crypto_shash_update(&sdescmd5->shash, link_str, link_len);
if (rc) {
- cERROR(1, "%s: Could not update iwth link_str", __func__);
+ cERROR(1, "%s: Could not update with link_str", __func__);
goto symlink_hash_err;
}
rc = crypto_shash_final(&sdescmd5->shash, md5_hash);
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 3a00c0d0cea..1b15bf839f3 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -569,7 +569,7 @@ bool
backup_cred(struct cifs_sb_info *cifs_sb)
{
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPUID) {
- if (cifs_sb->mnt_backupuid == current_fsuid())
+ if (uid_eq(cifs_sb->mnt_backupuid, current_fsuid()))
return true;
}
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPGID) {
diff --git a/fs/coda/cache.c b/fs/coda/cache.c
index 958ae0e0ff8..1da168c61d3 100644
--- a/fs/coda/cache.c
+++ b/fs/coda/cache.c
@@ -33,7 +33,7 @@ void coda_cache_enter(struct inode *inode, int mask)
spin_lock(&cii->c_lock);
cii->c_cached_epoch = atomic_read(&permission_epoch);
- if (cii->c_uid != current_fsuid()) {
+ if (!uid_eq(cii->c_uid, current_fsuid())) {
cii->c_uid = current_fsuid();
cii->c_cached_perm = mask;
} else
@@ -65,7 +65,7 @@ int coda_cache_check(struct inode *inode, int mask)
spin_lock(&cii->c_lock);
hit = (mask & cii->c_cached_perm) == mask &&
- cii->c_uid == current_fsuid() &&
+ uid_eq(cii->c_uid, current_fsuid()) &&
cii->c_cached_epoch == atomic_read(&permission_epoch);
spin_unlock(&cii->c_lock);
diff --git a/fs/coda/coda_fs_i.h b/fs/coda/coda_fs_i.h
index b24fdfd8a3f..c6407521321 100644
--- a/fs/coda/coda_fs_i.h
+++ b/fs/coda/coda_fs_i.h
@@ -25,7 +25,7 @@ struct coda_inode_info {
u_short c_flags; /* flags (see below) */
unsigned int c_mapcount; /* nr of times this inode is mapped */
unsigned int c_cached_epoch; /* epoch for cached permissions */
- vuid_t c_uid; /* fsuid for cached permissions */
+ kuid_t c_uid; /* fsuid for cached permissions */
unsigned int c_cached_perm; /* cached access permissions */
spinlock_t c_lock;
struct inode vfs_inode;
diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c
index 854ace71268..2849f41e72a 100644
--- a/fs/coda/coda_linux.c
+++ b/fs/coda/coda_linux.c
@@ -100,9 +100,9 @@ void coda_vattr_to_iattr(struct inode *inode, struct coda_vattr *attr)
if (attr->va_mode != (u_short) -1)
inode->i_mode = attr->va_mode | inode_type;
if (attr->va_uid != -1)
- inode->i_uid = (uid_t) attr->va_uid;
+ inode->i_uid = make_kuid(&init_user_ns, (uid_t) attr->va_uid);
if (attr->va_gid != -1)
- inode->i_gid = (gid_t) attr->va_gid;
+ inode->i_gid = make_kgid(&init_user_ns, (gid_t) attr->va_gid);
if (attr->va_nlink != -1)
set_nlink(inode, attr->va_nlink);
if (attr->va_size != -1)
@@ -171,10 +171,10 @@ void coda_iattr_to_vattr(struct iattr *iattr, struct coda_vattr *vattr)
vattr->va_mode = iattr->ia_mode;
}
if ( valid & ATTR_UID ) {
- vattr->va_uid = (vuid_t) iattr->ia_uid;
+ vattr->va_uid = (vuid_t) from_kuid(&init_user_ns, iattr->ia_uid);
}
if ( valid & ATTR_GID ) {
- vattr->va_gid = (vgid_t) iattr->ia_gid;
+ vattr->va_gid = (vgid_t) from_kgid(&init_user_ns, iattr->ia_gid);
}
if ( valid & ATTR_SIZE ) {
vattr->va_size = iattr->ia_size;
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 6df708c7b3e..dada9d0abed 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -20,6 +20,7 @@
#include <linux/file.h>
#include <linux/vfs.h>
#include <linux/slab.h>
+#include <linux/pid_namespace.h>
#include <asm/uaccess.h>
@@ -48,7 +49,7 @@ static struct inode *coda_alloc_inode(struct super_block *sb)
return NULL;
memset(&ei->c_fid, 0, sizeof(struct CodaFid));
ei->c_flags = 0;
- ei->c_uid = 0;
+ ei->c_uid = GLOBAL_ROOT_UID;
ei->c_cached_perm = 0;
spin_lock_init(&ei->c_lock);
return &ei->vfs_inode;
@@ -157,6 +158,9 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
int error;
int idx;
+ if (task_active_pid_ns(current) != &init_pid_ns)
+ return -EINVAL;
+
idx = get_device_index((struct coda_mount_data *) data);
/* Ignore errors in data, for backward compatibility */
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index 761d5b31b18..ebc2bae6c28 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -37,6 +37,7 @@
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/device.h>
+#include <linux/pid_namespace.h>
#include <asm/io.h>
#include <asm/poll.h>
#include <asm/uaccess.h>
@@ -266,6 +267,12 @@ static int coda_psdev_open(struct inode * inode, struct file * file)
struct venus_comm *vcp;
int idx, err;
+ if (task_active_pid_ns(current) != &init_pid_ns)
+ return -EINVAL;
+
+ if (current_user_ns() != &init_user_ns)
+ return -EINVAL;
+
idx = iminor(inode);
if (idx < 0 || idx >= MAX_CODADEVS)
return -ENODEV;
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index 0c68fd31fbf..3a731976dc5 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -50,9 +50,9 @@ static void *alloc_upcall(int opcode, int size)
return ERR_PTR(-ENOMEM);
inp->ih.opcode = opcode;
- inp->ih.pid = current->pid;
- inp->ih.pgid = task_pgrp_nr(current);
- inp->ih.uid = current_fsuid();
+ inp->ih.pid = task_pid_nr_ns(current, &init_pid_ns);
+ inp->ih.pgid = task_pgrp_nr_ns(current, &init_pid_ns);
+ inp->ih.uid = from_kuid(&init_user_ns, current_fsuid());
return (void*)inp;
}
@@ -157,7 +157,7 @@ int venus_lookup(struct super_block *sb, struct CodaFid *fid,
}
int venus_close(struct super_block *sb, struct CodaFid *fid, int flags,
- vuid_t uid)
+ kuid_t uid)
{
union inputArgs *inp;
union outputArgs *outp;
@@ -166,7 +166,7 @@ int venus_close(struct super_block *sb, struct CodaFid *fid, int flags,
insize = SIZE(release);
UPARG(CODA_CLOSE);
- inp->ih.uid = uid;
+ inp->ih.uid = from_kuid(&init_user_ns, uid);
inp->coda_close.VFid = *fid;
inp->coda_close.flags = flags;
diff --git a/fs/compat.c b/fs/compat.c
index 015e1e1f87c..fe40fde2911 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1278,8 +1278,7 @@ compat_sys_vmsplice(int fd, const struct compat_iovec __user *iov32,
* Exactly like fs/open.c:sys_open(), except that it doesn't set the
* O_LARGEFILE flag.
*/
-asmlinkage long
-compat_sys_open(const char __user *filename, int flags, umode_t mode)
+COMPAT_SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
{
return do_sys_open(AT_FDCWD, filename, flags, mode);
}
@@ -1288,8 +1287,7 @@ compat_sys_open(const char __user *filename, int flags, umode_t mode)
* Exactly like fs/open.c:sys_openat(), except that it doesn't set the
* O_LARGEFILE flag.
*/
-asmlinkage long
-compat_sys_openat(unsigned int dfd, const char __user *filename, int flags, umode_t mode)
+COMPAT_SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, umode_t, mode)
{
return do_sys_open(dfd, filename, flags, mode);
}
@@ -1739,55 +1737,13 @@ asmlinkage long compat_sys_signalfd(int ufd,
}
#endif /* CONFIG_SIGNALFD */
-#ifdef CONFIG_TIMERFD
-
-asmlinkage long compat_sys_timerfd_settime(int ufd, int flags,
- const struct compat_itimerspec __user *utmr,
- struct compat_itimerspec __user *otmr)
-{
- int error;
- struct itimerspec t;
- struct itimerspec __user *ut;
-
- if (get_compat_itimerspec(&t, utmr))
- return -EFAULT;
- ut = compat_alloc_user_space(2 * sizeof(struct itimerspec));
- if (copy_to_user(&ut[0], &t, sizeof(t)))
- return -EFAULT;
- error = sys_timerfd_settime(ufd, flags, &ut[0], &ut[1]);
- if (!error && otmr)
- error = (copy_from_user(&t, &ut[1], sizeof(struct itimerspec)) ||
- put_compat_itimerspec(otmr, &t)) ? -EFAULT: 0;
-
- return error;
-}
-
-asmlinkage long compat_sys_timerfd_gettime(int ufd,
- struct compat_itimerspec __user *otmr)
-{
- int error;
- struct itimerspec t;
- struct itimerspec __user *ut;
-
- ut = compat_alloc_user_space(sizeof(struct itimerspec));
- error = sys_timerfd_gettime(ufd, ut);
- if (!error)
- error = (copy_from_user(&t, ut, sizeof(struct itimerspec)) ||
- put_compat_itimerspec(otmr, &t)) ? -EFAULT: 0;
-
- return error;
-}
-
-#endif /* CONFIG_TIMERFD */
-
#ifdef CONFIG_FHANDLE
/*
* Exactly like fs/open.c:sys_open_by_handle_at(), except that it
* doesn't set the O_LARGEFILE flag.
*/
-asmlinkage long
-compat_sys_open_by_handle_at(int mountdirfd,
- struct file_handle __user *handle, int flags)
+COMPAT_SYSCALL_DEFINE3(open_by_handle_at, int, mountdirfd,
+ struct file_handle __user *, handle, int, flags)
{
return do_handle_open(mountdirfd, handle, flags);
}
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 90d222f11e3..7aabc6ad4e9 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -1037,10 +1037,11 @@ static int configfs_dump(struct configfs_dirent *sd, int level)
static int configfs_depend_prep(struct dentry *origin,
struct config_item *target)
{
- struct configfs_dirent *child_sd, *sd = origin->d_fsdata;
+ struct configfs_dirent *child_sd, *sd;
int ret = 0;
- BUG_ON(!origin || !sd);
+ BUG_ON(!origin || !origin->d_fsdata);
+ sd = origin->d_fsdata;
if (sd->s_element == target) /* Boo-yah */
goto out;
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index a5f12b7e228..0c4f80b447f 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -322,7 +322,6 @@ static struct dentry *__create_file(const char *name, umode_t mode,
if (!parent)
parent = debugfs_mount->mnt_root;
- dentry = NULL;
mutex_lock(&parent->d_inode->i_mutex);
dentry = lookup_one_len(name, parent, strlen(name));
if (!IS_ERR(dentry)) {
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 472e6befc54..073d30b9d1a 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -243,6 +243,13 @@ static int mknod_ptmx(struct super_block *sb)
struct dentry *root = sb->s_root;
struct pts_fs_info *fsi = DEVPTS_SB(sb);
struct pts_mount_opts *opts = &fsi->mount_opts;
+ kuid_t root_uid;
+ kgid_t root_gid;
+
+ root_uid = make_kuid(current_user_ns(), 0);
+ root_gid = make_kgid(current_user_ns(), 0);
+ if (!uid_valid(root_uid) || !gid_valid(root_gid))
+ return -EINVAL;
mutex_lock(&root->d_inode->i_mutex);
@@ -273,6 +280,8 @@ static int mknod_ptmx(struct super_block *sb)
mode = S_IFCHR|opts->ptmxmode;
init_special_inode(inode, mode, MKDEV(TTYAUX_MAJOR, 2));
+ inode->i_uid = root_uid;
+ inode->i_gid = root_gid;
d_add(dentry, inode);
@@ -438,6 +447,12 @@ static struct dentry *devpts_mount(struct file_system_type *fs_type,
if (error)
return ERR_PTR(error);
+ /* Require newinstance for all user namespace mounts to ensure
+ * the mount options are not changed.
+ */
+ if ((current_user_ns() != &init_user_ns) && !opts.newinstance)
+ return ERR_PTR(-EINVAL);
+
if (opts.newinstance)
s = sget(fs_type, NULL, set_anon_super, flags, NULL);
else
@@ -491,6 +506,9 @@ static struct file_system_type devpts_fs_type = {
.name = "devpts",
.mount = devpts_mount,
.kill_sb = devpts_kill_sb,
+#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
+ .fs_flags = FS_USERNS_MOUNT | FS_USERNS_DEV_MOUNT,
+#endif
};
/*
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 77c0f70f8fe..e7665c31f7b 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -96,10 +96,13 @@ do { \
}
+#define DLM_RTF_SHRINK 0x00000001
+
struct dlm_rsbtable {
struct rb_root keep;
struct rb_root toss;
spinlock_t lock;
+ uint32_t flags;
};
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index a579f30f237..f7501651762 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -1132,6 +1132,7 @@ static void toss_rsb(struct kref *kref)
rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[r->res_bucket].keep);
rsb_insert(r, &ls->ls_rsbtbl[r->res_bucket].toss);
r->res_toss_time = jiffies;
+ ls->ls_rsbtbl[r->res_bucket].flags |= DLM_RTF_SHRINK;
if (r->res_lvbptr) {
dlm_free_lvb(r->res_lvbptr);
r->res_lvbptr = NULL;
@@ -1659,11 +1660,18 @@ static void shrink_bucket(struct dlm_ls *ls, int b)
char *name;
int our_nodeid = dlm_our_nodeid();
int remote_count = 0;
+ int need_shrink = 0;
int i, len, rv;
memset(&ls->ls_remove_lens, 0, sizeof(int) * DLM_REMOVE_NAMES_MAX);
spin_lock(&ls->ls_rsbtbl[b].lock);
+
+ if (!(ls->ls_rsbtbl[b].flags & DLM_RTF_SHRINK)) {
+ spin_unlock(&ls->ls_rsbtbl[b].lock);
+ return;
+ }
+
for (n = rb_first(&ls->ls_rsbtbl[b].toss); n; n = next) {
next = rb_next(n);
r = rb_entry(n, struct dlm_rsb, res_hashnode);
@@ -1679,6 +1687,8 @@ static void shrink_bucket(struct dlm_ls *ls, int b)
continue;
}
+ need_shrink = 1;
+
if (!time_after_eq(jiffies, r->res_toss_time +
dlm_config.ci_toss_secs * HZ)) {
continue;
@@ -1710,6 +1720,11 @@ static void shrink_bucket(struct dlm_ls *ls, int b)
rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss);
dlm_free_rsb(r);
}
+
+ if (need_shrink)
+ ls->ls_rsbtbl[b].flags |= DLM_RTF_SHRINK;
+ else
+ ls->ls_rsbtbl[b].flags &= ~DLM_RTF_SHRINK;
spin_unlock(&ls->ls_rsbtbl[b].lock);
/*
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 7ff49852b0c..911649a47dd 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -503,11 +503,11 @@ static ssize_t device_write(struct file *file, const char __user *buf,
#endif
return -EINVAL;
-#ifdef CONFIG_COMPAT
- if (count > sizeof(struct dlm_write_request32) + DLM_RESNAME_MAXLEN)
-#else
+ /*
+ * can't compare against COMPAT/dlm_write_request32 because
+ * we don't yet know if is64bit is zero
+ */
if (count > sizeof(struct dlm_write_request) + DLM_RESNAME_MAXLEN)
-#endif
return -EINVAL;
kbuf = kzalloc(count + 1, GFP_NOFS);
diff --git a/fs/ecryptfs/Kconfig b/fs/ecryptfs/Kconfig
index cc16562654d..e15ef38c24f 100644
--- a/fs/ecryptfs/Kconfig
+++ b/fs/ecryptfs/Kconfig
@@ -1,6 +1,6 @@
config ECRYPT_FS
- tristate "eCrypt filesystem layer support (EXPERIMENTAL)"
- depends on EXPERIMENTAL && KEYS && CRYPTO && (ENCRYPTED_KEYS || ENCRYPTED_KEYS=n)
+ tristate "eCrypt filesystem layer support"
+ depends on KEYS && CRYPTO && (ENCRYPTED_KEYS || ENCRYPTED_KEYS=n)
select CRYPTO_ECB
select CRYPTO_CBC
select CRYPTO_MD5
diff --git a/fs/efs/Kconfig b/fs/efs/Kconfig
index 6ebfc1c207a..d020e3c30fe 100644
--- a/fs/efs/Kconfig
+++ b/fs/efs/Kconfig
@@ -1,6 +1,6 @@
config EFS_FS
- tristate "EFS file system support (read only) (EXPERIMENTAL)"
- depends on BLOCK && EXPERIMENTAL
+ tristate "EFS file system support (read only)"
+ depends on BLOCK
help
EFS is an older file system used for non-ISO9660 CD-ROMs and hard
disk partitions by SGI's IRIX operating system (IRIX 6.0 and newer
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index 2616d0ea5c5..9f9992b3792 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -159,15 +159,6 @@ read_block_bitmap(struct super_block *sb, unsigned int block_group)
return bh;
}
-static void release_blocks(struct super_block *sb, int count)
-{
- if (count) {
- struct ext2_sb_info *sbi = EXT2_SB(sb);
-
- percpu_counter_add(&sbi->s_freeblocks_counter, count);
- }
-}
-
static void group_adjust_blocks(struct super_block *sb, int group_no,
struct ext2_group_desc *desc, struct buffer_head *bh, int count)
{
@@ -568,8 +559,11 @@ do_more:
}
error_return:
brelse(bitmap_bh);
- release_blocks(sb, freed);
- dquot_free_block_nodirty(inode, freed);
+ if (freed) {
+ percpu_counter_add(&sbi->s_freeblocks_counter, freed);
+ dquot_free_block_nodirty(inode, freed);
+ mark_inode_dirty(inode);
+ }
}
/**
@@ -1239,10 +1233,6 @@ ext2_fsblk_t ext2_new_blocks(struct inode *inode, ext2_fsblk_t goal,
*errp = -ENOSPC;
sb = inode->i_sb;
- if (!sb) {
- printk("ext2_new_blocks: nonexistent device");
- return 0;
- }
/*
* Check quota for allocation of this block.
@@ -1416,9 +1406,11 @@ allocated:
*errp = 0;
brelse(bitmap_bh);
- dquot_free_block_nodirty(inode, *count-num);
- mark_inode_dirty(inode);
- *count = num;
+ if (num < *count) {
+ dquot_free_block_nodirty(inode, *count-num);
+ mark_inode_dirty(inode);
+ *count = num;
+ }
return ret_block;
io_error:
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 6363ac66faf..c3881e56662 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -495,6 +495,10 @@ static int ext2_alloc_branch(struct inode *inode,
* parent to disk.
*/
bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
+ if (unlikely(!bh)) {
+ err = -ENOMEM;
+ goto failed;
+ }
branch[n].bh = bh;
lock_buffer(bh);
memset(bh->b_data, 0, blocksize);
@@ -523,6 +527,14 @@ static int ext2_alloc_branch(struct inode *inode,
}
*blks = num;
return err;
+
+failed:
+ for (i = 1; i < n; i++)
+ bforget(branch[i].bh);
+ for (i = 0; i < indirect_blks; i++)
+ ext2_free_blocks(inode, new_blocks[i], 1);
+ ext2_free_blocks(inode, new_blocks[i], num);
+ return err;
}
/**
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index fa04d023177..7f68c811402 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1500,7 +1500,7 @@ static ssize_t ext2_quota_write(struct super_block *sb, int type,
bh = sb_bread(sb, tmp_bh.b_blocknr);
else
bh = sb_getblk(sb, tmp_bh.b_blocknr);
- if (!bh) {
+ if (unlikely(!bh)) {
err = -EIO;
goto out;
}
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index b6754dbbce3..2d7557db3ae 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -662,10 +662,10 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
ea_idebug(inode, "creating block %d", block);
new_bh = sb_getblk(sb, block);
- if (!new_bh) {
+ if (unlikely(!new_bh)) {
ext2_free_blocks(inode, block, 1);
mark_inode_dirty(inode);
- error = -EIO;
+ error = -ENOMEM;
goto cleanup;
}
lock_buffer(new_bh);
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index b176d425354..d512c4bc4ad 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -676,6 +676,10 @@ static int ext3_alloc_branch(handle_t *handle, struct inode *inode,
* parent to disk.
*/
bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
+ if (unlikely(!bh)) {
+ err = -ENOMEM;
+ goto failed;
+ }
branch[n].bh = bh;
lock_buffer(bh);
BUFFER_TRACE(bh, "call get_create_access");
@@ -717,7 +721,7 @@ failed:
BUFFER_TRACE(branch[i].bh, "call journal_forget");
ext3_journal_forget(handle, branch[i].bh);
}
- for (i = 0; i <indirect_blks; i++)
+ for (i = 0; i < indirect_blks; i++)
ext3_free_blocks(handle, inode, new_blocks[i], 1);
ext3_free_blocks(handle, inode, new_blocks[i], num);
@@ -1078,8 +1082,8 @@ struct buffer_head *ext3_getblk(handle_t *handle, struct inode *inode,
if (!err && buffer_mapped(&dummy)) {
struct buffer_head *bh;
bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
- if (!bh) {
- *errp = -EIO;
+ if (unlikely(!bh)) {
+ *errp = -ENOMEM;
goto err;
}
if (buffer_new(&dummy)) {
@@ -2729,12 +2733,12 @@ static int __ext3_get_inode_loc(struct inode *inode,
return -EIO;
bh = sb_getblk(inode->i_sb, block);
- if (!bh) {
+ if (unlikely(!bh)) {
ext3_error (inode->i_sb, "ext3_get_inode_loc",
"unable to read inode block - "
"inode=%lu, block="E3FSBLK,
inode->i_ino, block);
- return -EIO;
+ return -ENOMEM;
}
if (!buffer_uptodate(bh)) {
lock_buffer(bh);
@@ -2783,7 +2787,7 @@ static int __ext3_get_inode_loc(struct inode *inode,
bitmap_bh = sb_getblk(inode->i_sb,
le32_to_cpu(desc->bg_inode_bitmap));
- if (!bitmap_bh)
+ if (unlikely(!bitmap_bh))
goto make_io;
/*
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 61fa09eb250..692de13e359 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -36,7 +36,6 @@
#define NAMEI_RA_CHUNKS 2
#define NAMEI_RA_BLOCKS 4
#define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
-#define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b))
static struct buffer_head *ext3_append(handle_t *handle,
struct inode *inode,
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index 0f814f3450d..27105655502 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -116,8 +116,8 @@ static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
int err;
bh = sb_getblk(sb, blk);
- if (!bh)
- return ERR_PTR(-EIO);
+ if (unlikely(!bh))
+ return ERR_PTR(-ENOMEM);
if ((err = ext3_journal_get_write_access(handle, bh))) {
brelse(bh);
bh = ERR_PTR(err);
@@ -234,8 +234,8 @@ static int setup_new_group_blocks(struct super_block *sb,
goto exit_bh;
gdb = sb_getblk(sb, block);
- if (!gdb) {
- err = -EIO;
+ if (unlikely(!gdb)) {
+ err = -ENOMEM;
goto exit_bh;
}
if ((err = ext3_journal_get_write_access(handle, gdb))) {
@@ -722,8 +722,8 @@ static void update_backups(struct super_block *sb,
break;
bh = sb_getblk(sb, group * bpg + blk_off);
- if (!bh) {
- err = -EIO;
+ if (unlikely(!bh)) {
+ err = -ENOMEM;
break;
}
ext3_debug("update metadata backup %#04lx\n",
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 6e50223b329..5546ca225ff 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -916,21 +916,24 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
"Not enough memory for storing quotafile name");
return 0;
}
- if (sbi->s_qf_names[qtype] &&
- strcmp(sbi->s_qf_names[qtype], qname)) {
- ext3_msg(sb, KERN_ERR,
- "%s quota file already specified", QTYPE2NAME(qtype));
+ if (sbi->s_qf_names[qtype]) {
+ int same = !strcmp(sbi->s_qf_names[qtype], qname);
+
kfree(qname);
- return 0;
+ if (!same) {
+ ext3_msg(sb, KERN_ERR,
+ "%s quota file already specified",
+ QTYPE2NAME(qtype));
+ }
+ return same;
}
- sbi->s_qf_names[qtype] = qname;
- if (strchr(sbi->s_qf_names[qtype], '/')) {
+ if (strchr(qname, '/')) {
ext3_msg(sb, KERN_ERR,
"quotafile must be on filesystem root");
- kfree(sbi->s_qf_names[qtype]);
- sbi->s_qf_names[qtype] = NULL;
+ kfree(qname);
return 0;
}
+ sbi->s_qf_names[qtype] = qname;
set_opt(sbi->s_mount_opt, QUOTA);
return 1;
}
@@ -945,11 +948,10 @@ static int clear_qf_name(struct super_block *sb, int qtype) {
" when quota turned on");
return 0;
}
- /*
- * The space will be released later when all options are confirmed
- * to be correct
- */
- sbi->s_qf_names[qtype] = NULL;
+ if (sbi->s_qf_names[qtype]) {
+ kfree(sbi->s_qf_names[qtype]);
+ sbi->s_qf_names[qtype] = NULL;
+ }
return 1;
}
#endif
@@ -2065,6 +2067,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal":
test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered":
"writeback");
+ sb->s_flags |= MS_SNAP_STABLE;
return 0;
@@ -2605,7 +2608,18 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
#ifdef CONFIG_QUOTA
old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
for (i = 0; i < MAXQUOTAS; i++)
- old_opts.s_qf_names[i] = sbi->s_qf_names[i];
+ if (sbi->s_qf_names[i]) {
+ old_opts.s_qf_names[i] = kstrdup(sbi->s_qf_names[i],
+ GFP_KERNEL);
+ if (!old_opts.s_qf_names[i]) {
+ int j;
+
+ for (j = 0; j < i; j++)
+ kfree(old_opts.s_qf_names[j]);
+ return -ENOMEM;
+ }
+ } else
+ old_opts.s_qf_names[i] = NULL;
#endif
/*
@@ -2698,9 +2712,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
#ifdef CONFIG_QUOTA
/* Release old quota file names */
for (i = 0; i < MAXQUOTAS; i++)
- if (old_opts.s_qf_names[i] &&
- old_opts.s_qf_names[i] != sbi->s_qf_names[i])
- kfree(old_opts.s_qf_names[i]);
+ kfree(old_opts.s_qf_names[i]);
#endif
if (enable_quota)
dquot_resume(sb, -1);
@@ -2714,9 +2726,7 @@ restore_opts:
#ifdef CONFIG_QUOTA
sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
for (i = 0; i < MAXQUOTAS; i++) {
- if (sbi->s_qf_names[i] &&
- old_opts.s_qf_names[i] != sbi->s_qf_names[i])
- kfree(sbi->s_qf_names[i]);
+ kfree(sbi->s_qf_names[i]);
sbi->s_qf_names[i] = old_opts.s_qf_names[i];
}
#endif
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index d22ebb7a4f5..b1fc96383e0 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -813,10 +813,10 @@ inserted:
ea_idebug(inode, "creating block %d", block);
new_bh = sb_getblk(sb, block);
- if (!new_bh) {
+ if (unlikely(!new_bh)) {
getblk_failed:
ext3_free_blocks(handle, inode, block, 1);
- error = -EIO;
+ error = -ENOMEM;
goto cleanup;
}
lock_buffer(new_bh);
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index e6e0d988439..39a54a0e9fe 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -324,8 +324,8 @@ ext4_acl_chmod(struct inode *inode)
if (error)
return error;
retry:
- handle = ext4_journal_start(inode,
- EXT4_DATA_TRANS_BLOCKS(inode->i_sb));
+ handle = ext4_journal_start(inode, EXT4_HT_XATTR,
+ ext4_jbd2_credits_xattr(inode));
if (IS_ERR(handle)) {
error = PTR_ERR(handle);
ext4_std_error(inode->i_sb, error);
@@ -422,7 +422,8 @@ ext4_xattr_set_acl(struct dentry *dentry, const char *name, const void *value,
acl = NULL;
retry:
- handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb));
+ handle = ext4_journal_start(inode, EXT4_HT_XATTR,
+ ext4_jbd2_credits_xattr(inode));
if (IS_ERR(handle)) {
error = PTR_ERR(handle);
goto release_and_out;
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index cf1821784a1..2f2e0da1a6b 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -358,7 +358,7 @@ void ext4_validate_block_bitmap(struct super_block *sb,
}
/**
- * ext4_read_block_bitmap()
+ * ext4_read_block_bitmap_nowait()
* @sb: super block
* @block_group: given block group
*
@@ -457,6 +457,8 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
struct buffer_head *bh;
bh = ext4_read_block_bitmap_nowait(sb, block_group);
+ if (!bh)
+ return NULL;
if (ext4_wait_block_bitmap(sb, block_group, bh)) {
put_bh(bh);
return NULL;
@@ -482,11 +484,16 @@ static int ext4_has_free_clusters(struct ext4_sb_info *sbi,
free_clusters = percpu_counter_read_positive(fcc);
dirty_clusters = percpu_counter_read_positive(dcc);
- root_clusters = EXT4_B2C(sbi, ext4_r_blocks_count(sbi->s_es));
+
+ /*
+ * r_blocks_count should always be multiple of the cluster ratio so
+ * we are safe to do a plane bit shift only.
+ */
+ root_clusters = ext4_r_blocks_count(sbi->s_es) >> sbi->s_cluster_bits;
if (free_clusters - (nclusters + root_clusters + dirty_clusters) <
EXT4_FREECLUSTERS_WATERMARK) {
- free_clusters = EXT4_C2B(sbi, percpu_counter_sum_positive(fcc));
+ free_clusters = percpu_counter_sum_positive(fcc);
dirty_clusters = percpu_counter_sum_positive(dcc);
}
/* Check whether we have space after accounting for current
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index dc149d123de..6dda04f05ef 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -185,6 +185,7 @@ static int ext4_readdir(struct file *filp,
"at offset %llu",
(unsigned long long)filp->f_pos);
filp->f_pos += sb->s_blocksize - offset;
+ brelse(bh);
continue;
}
set_buffer_verified(bh);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 8462eb3c33a..6e16c186795 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -194,8 +194,7 @@ struct mpage_da_data {
*/
#define EXT4_IO_END_UNWRITTEN 0x0001
#define EXT4_IO_END_ERROR 0x0002
-#define EXT4_IO_END_QUEUED 0x0004
-#define EXT4_IO_END_DIRECT 0x0008
+#define EXT4_IO_END_DIRECT 0x0004
struct ext4_io_page {
struct page *p_page;
@@ -215,10 +214,8 @@ typedef struct ext4_io_end {
struct list_head list; /* per-file finished IO list */
struct inode *inode; /* file being written to */
unsigned int flag; /* unwritten or not */
- struct page *page; /* for writepage() path */
loff_t offset; /* offset in the file */
ssize_t size; /* size of the extent */
- struct work_struct work; /* data work queue */
struct kiocb *iocb; /* iocb struct for AIO */
int result; /* error value for AIO */
int num_io_pages; /* for writepages() */
@@ -582,6 +579,8 @@ enum {
#define EXT4_GET_BLOCKS_KEEP_SIZE 0x0080
/* Do not take i_data_sem locking in ext4_map_blocks */
#define EXT4_GET_BLOCKS_NO_LOCK 0x0100
+ /* Do not put hole in extent cache */
+#define EXT4_GET_BLOCKS_NO_PUT_HOLE 0x0200
/*
* Flags used by ext4_free_blocks
@@ -810,17 +809,6 @@ do { \
#endif /* defined(__KERNEL__) || defined(__linux__) */
-/*
- * storage for cached extent
- * If ec_len == 0, then the cache is invalid.
- * If ec_start == 0, then the cache represents a gap (null mapping)
- */
-struct ext4_ext_cache {
- ext4_fsblk_t ec_start;
- ext4_lblk_t ec_block;
- __u32 ec_len; /* must be 32bit to return holes */
-};
-
#include "extents_status.h"
/*
@@ -887,7 +875,6 @@ struct ext4_inode_info {
struct inode vfs_inode;
struct jbd2_inode *jinode;
- struct ext4_ext_cache i_cached_extent;
/*
* File creation time. Its function is same as that of
* struct timespec i_{a,c,m}time in the generic inode.
@@ -901,6 +888,8 @@ struct ext4_inode_info {
/* extents status tree */
struct ext4_es_tree i_es_tree;
rwlock_t i_es_lock;
+ struct list_head i_es_lru;
+ unsigned int i_es_lru_nr; /* protected by i_es_lock */
/* ialloc */
ext4_group_t i_last_alloc_group;
@@ -930,6 +919,7 @@ struct ext4_inode_info {
spinlock_t i_completed_io_lock;
atomic_t i_ioend_count; /* Number of outstanding io_end structs */
atomic_t i_unwritten; /* Nr. of inflight conversions pending */
+ struct work_struct i_unwritten_work; /* deferred extent conversion */
spinlock_t i_block_reservation_lock;
@@ -985,7 +975,6 @@ struct ext4_inode_info {
#define EXT4_MOUNT_DIOREAD_NOLOCK 0x400000 /* Enable support for dio read nolocking */
#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */
#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
-#define EXT4_MOUNT_MBLK_IO_SUBMIT 0x4000000 /* multi-block io submits */
#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */
#define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */
@@ -1316,6 +1305,11 @@ struct ext4_sb_info {
/* Precomputed FS UUID checksum for seeding other checksums */
__u32 s_csum_seed;
+
+ /* Reclaim extents from extent status tree */
+ struct shrinker s_es_shrinker;
+ struct list_head s_es_lru;
+ spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp;
};
static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -2007,9 +2001,20 @@ extern int ext4fs_dirhash(const char *name, int len, struct
dx_hash_info *hinfo);
/* ialloc.c */
-extern struct inode *ext4_new_inode(handle_t *, struct inode *, umode_t,
- const struct qstr *qstr, __u32 goal,
- uid_t *owner);
+extern struct inode *__ext4_new_inode(handle_t *, struct inode *, umode_t,
+ const struct qstr *qstr, __u32 goal,
+ uid_t *owner, int handle_type,
+ unsigned int line_no, int nblocks);
+
+#define ext4_new_inode(handle, dir, mode, qstr, goal, owner) \
+ __ext4_new_inode((handle), (dir), (mode), (qstr), (goal), (owner), \
+ 0, 0, 0)
+#define ext4_new_inode_start_handle(dir, mode, qstr, goal, owner, \
+ type, nblocks) \
+ __ext4_new_inode(NULL, (dir), (mode), (qstr), (goal), (owner), \
+ (type), __LINE__, (nblocks))
+
+
extern void ext4_free_inode(handle_t *, struct inode *);
extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
extern unsigned long ext4_count_free_inodes(struct super_block *);
@@ -2103,6 +2108,7 @@ extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock);
extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk);
extern void ext4_ind_truncate(struct inode *inode);
+extern int ext4_ind_punch_hole(struct file *file, loff_t offset, loff_t length);
/* ioctl.c */
extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
@@ -2151,6 +2157,8 @@ extern void *ext4_kvzalloc(size_t size, gfp_t flags);
extern void ext4_kvfree(void *ptr);
extern int ext4_alloc_flex_bg_array(struct super_block *sb,
ext4_group_t ngroup);
+extern const char *ext4_decode_error(struct super_block *sb, int errno,
+ char nbuf[16]);
extern __printf(4, 5)
void __ext4_error(struct super_block *, const char *, unsigned int,
const char *, ...);
@@ -2227,6 +2235,8 @@ extern int ext4_group_desc_csum_verify(struct super_block *sb, __u32 group,
struct ext4_group_desc *gdp);
extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group,
struct ext4_group_desc *gdp);
+extern int ext4_register_li_request(struct super_block *sb,
+ ext4_group_t first_not_zeroed);
static inline int ext4_has_group_desc_csum(struct super_block *sb)
{
@@ -2454,6 +2464,75 @@ extern const struct file_operations ext4_file_operations;
extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin);
extern void ext4_unwritten_wait(struct inode *inode);
+/* inline.c */
+extern int ext4_has_inline_data(struct inode *inode);
+extern int ext4_get_inline_size(struct inode *inode);
+extern int ext4_get_max_inline_size(struct inode *inode);
+extern int ext4_find_inline_data_nolock(struct inode *inode);
+extern void ext4_write_inline_data(struct inode *inode,
+ struct ext4_iloc *iloc,
+ void *buffer, loff_t pos,
+ unsigned int len);
+extern int ext4_prepare_inline_data(handle_t *handle, struct inode *inode,
+ unsigned int len);
+extern int ext4_init_inline_data(handle_t *handle, struct inode *inode,
+ unsigned int len);
+extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode);
+
+extern int ext4_readpage_inline(struct inode *inode, struct page *page);
+extern int ext4_try_to_write_inline_data(struct address_space *mapping,
+ struct inode *inode,
+ loff_t pos, unsigned len,
+ unsigned flags,
+ struct page **pagep);
+extern int ext4_write_inline_data_end(struct inode *inode,
+ loff_t pos, unsigned len,
+ unsigned copied,
+ struct page *page);
+extern struct buffer_head *
+ext4_journalled_write_inline_data(struct inode *inode,
+ unsigned len,
+ struct page *page);
+extern int ext4_da_write_inline_data_begin(struct address_space *mapping,
+ struct inode *inode,
+ loff_t pos, unsigned len,
+ unsigned flags,
+ struct page **pagep,
+ void **fsdata);
+extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos,
+ unsigned len, unsigned copied,
+ struct page *page);
+extern int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry,
+ struct inode *inode);
+extern int ext4_try_create_inline_dir(handle_t *handle,
+ struct inode *parent,
+ struct inode *inode);
+extern int ext4_read_inline_dir(struct file *filp,
+ void *dirent, filldir_t filldir,
+ int *has_inline_data);
+extern struct buffer_head *ext4_find_inline_entry(struct inode *dir,
+ const struct qstr *d_name,
+ struct ext4_dir_entry_2 **res_dir,
+ int *has_inline_data);
+extern int ext4_delete_inline_entry(handle_t *handle,
+ struct inode *dir,
+ struct ext4_dir_entry_2 *de_del,
+ struct buffer_head *bh,
+ int *has_inline_data);
+extern int empty_inline_dir(struct inode *dir, int *has_inline_data);
+extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode,
+ struct ext4_dir_entry_2 **parent_de,
+ int *retval);
+extern int ext4_inline_data_fiemap(struct inode *inode,
+ struct fiemap_extent_info *fieinfo,
+ int *has_inline);
+extern int ext4_try_to_evict_inline_data(handle_t *handle,
+ struct inode *inode,
+ int needed);
+extern void ext4_inline_data_truncate(struct inode *inode, int *has_inline);
+
+extern int ext4_convert_inline_data(struct inode *inode);
+
/* namei.c */
extern const struct inode_operations ext4_dir_inode_operations;
extern const struct inode_operations ext4_special_inode_operations;
@@ -2520,6 +2599,9 @@ extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
struct ext4_ext_path *);
extern void ext4_ext_drop_refs(struct ext4_ext_path *);
extern int ext4_ext_check_inode(struct inode *inode);
+extern int ext4_find_delalloc_range(struct inode *inode,
+ ext4_lblk_t lblk_start,
+ ext4_lblk_t lblk_end);
extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk);
extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len);
@@ -2537,6 +2619,7 @@ extern void ext4_exit_pageio(void);
extern void ext4_ioend_wait(struct inode *);
extern void ext4_free_io_end(ext4_io_end_t *io);
extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
+extern void ext4_end_io_work(struct work_struct *work);
extern void ext4_io_submit(struct ext4_io_submit *io);
extern int ext4_bio_write_page(struct ext4_io_submit *io,
struct page *page,
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 487fda12bc0..8643ff5bbeb 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -193,12 +193,6 @@ static inline unsigned short ext_depth(struct inode *inode)
return le16_to_cpu(ext_inode_hdr(inode)->eh_depth);
}
-static inline void
-ext4_ext_invalidate_cache(struct inode *inode)
-{
- EXT4_I(inode)->i_cached_extent.ec_len = 0;
-}
-
static inline void ext4_ext_mark_uninitialized(struct ext4_extent *ext)
{
/* We can not have an uninitialized extent of zero length! */
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index b4323ba846b..7058975e3a5 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -6,6 +6,108 @@
#include <trace/events/ext4.h>
+/* Just increment the non-pointer handle value */
+static handle_t *ext4_get_nojournal(void)
+{
+ handle_t *handle = current->journal_info;
+ unsigned long ref_cnt = (unsigned long)handle;
+
+ BUG_ON(ref_cnt >= EXT4_NOJOURNAL_MAX_REF_COUNT);
+
+ ref_cnt++;
+ handle = (handle_t *)ref_cnt;
+
+ current->journal_info = handle;
+ return handle;
+}
+
+
+/* Decrement the non-pointer handle value */
+static void ext4_put_nojournal(handle_t *handle)
+{
+ unsigned long ref_cnt = (unsigned long)handle;
+
+ BUG_ON(ref_cnt == 0);
+
+ ref_cnt--;
+ handle = (handle_t *)ref_cnt;
+
+ current->journal_info = handle;
+}
+
+/*
+ * Wrappers for jbd2_journal_start/end.
+ */
+handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
+ int type, int nblocks)
+{
+ journal_t *journal;
+
+ trace_ext4_journal_start(sb, nblocks, _RET_IP_);
+ if (sb->s_flags & MS_RDONLY)
+ return ERR_PTR(-EROFS);
+
+ WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE);
+ journal = EXT4_SB(sb)->s_journal;
+ if (!journal)
+ return ext4_get_nojournal();
+ /*
+ * Special case here: if the journal has aborted behind our
+ * backs (eg. EIO in the commit thread), then we still need to
+ * take the FS itself readonly cleanly.
+ */
+ if (is_journal_aborted(journal)) {
+ ext4_abort(sb, "Detected aborted journal");
+ return ERR_PTR(-EROFS);
+ }
+ return jbd2__journal_start(journal, nblocks, GFP_NOFS, type, line);
+}
+
+int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
+{
+ struct super_block *sb;
+ int err;
+ int rc;
+
+ if (!ext4_handle_valid(handle)) {
+ ext4_put_nojournal(handle);
+ return 0;
+ }
+ sb = handle->h_transaction->t_journal->j_private;
+ err = handle->h_err;
+ rc = jbd2_journal_stop(handle);
+
+ if (!err)
+ err = rc;
+ if (err)
+ __ext4_std_error(sb, where, line, err);
+ return err;
+}
+
+void ext4_journal_abort_handle(const char *caller, unsigned int line,
+ const char *err_fn, struct buffer_head *bh,
+ handle_t *handle, int err)
+{
+ char nbuf[16];
+ const char *errstr = ext4_decode_error(NULL, err, nbuf);
+
+ BUG_ON(!ext4_handle_valid(handle));
+
+ if (bh)
+ BUFFER_TRACE(bh, "abort");
+
+ if (!handle->h_err)
+ handle->h_err = err;
+
+ if (is_handle_aborted(handle))
+ return;
+
+ printk(KERN_ERR "EXT4-fs: %s:%d: aborting transaction: %s in %s\n",
+ caller, line, errstr, err_fn);
+
+ jbd2_journal_abort_handle(handle);
+}
+
int __ext4_journal_get_write_access(const char *where, unsigned int line,
handle_t *handle, struct buffer_head *bh)
{
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 7177f9b21cb..4c216b1bf20 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -59,12 +59,6 @@
#define EXT4_META_TRANS_BLOCKS(sb) (EXT4_XATTR_TRANS_BLOCKS + \
EXT4_MAXQUOTAS_TRANS_BLOCKS(sb))
-/* Delete operations potentially hit one directory's namespace plus an
- * entire inode, plus arbitrary amounts of bitmap/indirection data. Be
- * generous. We can grow the delete transaction later if necessary. */
-
-#define EXT4_DELETE_TRANS_BLOCKS(sb) (2 * EXT4_DATA_TRANS_BLOCKS(sb) + 64)
-
/* Define an arbitrary limit for the amount of data we will anticipate
* writing to any given transaction. For unbounded transactions such as
* write(2) and truncate(2) we can write more than this, but we always
@@ -110,6 +104,36 @@
#define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb))
#define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb))
+static inline int ext4_jbd2_credits_xattr(struct inode *inode)
+{
+ int credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb);
+
+ /*
+ * In case of inline data, we may push out the data to a block,
+ * so we need to reserve credits for this eventuality
+ */
+ if (ext4_has_inline_data(inode))
+ credits += ext4_writepage_trans_blocks(inode) + 1;
+ return credits;
+}
+
+
+/*
+ * Ext4 handle operation types -- for logging purposes
+ */
+#define EXT4_HT_MISC 0
+#define EXT4_HT_INODE 1
+#define EXT4_HT_WRITE_PAGE 2
+#define EXT4_HT_MAP_BLOCKS 3
+#define EXT4_HT_DIR 4
+#define EXT4_HT_TRUNCATE 5
+#define EXT4_HT_QUOTA 6
+#define EXT4_HT_RESIZE 7
+#define EXT4_HT_MIGRATE 8
+#define EXT4_HT_MOVE_EXTENTS 9
+#define EXT4_HT_XATTR 10
+#define EXT4_HT_MAX 11
+
/**
* struct ext4_journal_cb_entry - Base structure for callback information.
*
@@ -234,7 +258,8 @@ int __ext4_handle_dirty_super(const char *where, unsigned int line,
#define ext4_handle_dirty_super(handle, sb) \
__ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb))
-handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
+handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
+ int type, int nblocks);
int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle);
#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)
@@ -268,9 +293,17 @@ static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed)
return 1;
}
-static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks)
+#define ext4_journal_start_sb(sb, type, nblocks) \
+ __ext4_journal_start_sb((sb), __LINE__, (type), (nblocks))
+
+#define ext4_journal_start(inode, type, nblocks) \
+ __ext4_journal_start((inode), __LINE__, (type), (nblocks))
+
+static inline handle_t *__ext4_journal_start(struct inode *inode,
+ unsigned int line, int type,
+ int nblocks)
{
- return ext4_journal_start_sb(inode->i_sb, nblocks);
+ return __ext4_journal_start_sb(inode->i_sb, line, type, nblocks);
}
#define ext4_journal_stop(handle) \
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 7817ca7c2bb..28dd8eeea6a 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -112,7 +112,7 @@ static int ext4_split_extent_at(handle_t *handle,
int flags);
static int ext4_find_delayed_extent(struct inode *inode,
- struct ext4_ext_cache *newex);
+ struct extent_status *newes);
static int ext4_ext_truncate_extend_restart(handle_t *handle,
struct inode *inode,
@@ -714,7 +714,6 @@ int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
eh->eh_magic = EXT4_EXT_MAGIC;
eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
ext4_mark_inode_dirty(handle, inode);
- ext4_ext_invalidate_cache(inode);
return 0;
}
@@ -725,6 +724,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
struct ext4_extent_header *eh;
struct buffer_head *bh;
short int depth, i, ppos = 0, alloc = 0;
+ int ret;
eh = ext_inode_hdr(inode);
depth = ext_depth(inode);
@@ -752,12 +752,15 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
path[ppos].p_ext = NULL;
bh = sb_getblk(inode->i_sb, path[ppos].p_block);
- if (unlikely(!bh))
+ if (unlikely(!bh)) {
+ ret = -ENOMEM;
goto err;
+ }
if (!bh_uptodate_or_lock(bh)) {
trace_ext4_ext_load_extent(inode, block,
path[ppos].p_block);
- if (bh_submit_read(bh) < 0) {
+ ret = bh_submit_read(bh);
+ if (ret < 0) {
put_bh(bh);
goto err;
}
@@ -768,13 +771,15 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
put_bh(bh);
EXT4_ERROR_INODE(inode,
"ppos %d > depth %d", ppos, depth);
+ ret = -EIO;
goto err;
}
path[ppos].p_bh = bh;
path[ppos].p_hdr = eh;
i--;
- if (ext4_ext_check_block(inode, eh, i, bh))
+ ret = ext4_ext_check_block(inode, eh, i, bh);
+ if (ret < 0)
goto err;
}
@@ -796,7 +801,7 @@ err:
ext4_ext_drop_refs(path);
if (alloc)
kfree(path);
- return ERR_PTR(-EIO);
+ return ERR_PTR(ret);
}
/*
@@ -950,8 +955,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
goto cleanup;
}
bh = sb_getblk(inode->i_sb, newblock);
- if (!bh) {
- err = -EIO;
+ if (unlikely(!bh)) {
+ err = -ENOMEM;
goto cleanup;
}
lock_buffer(bh);
@@ -1023,8 +1028,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
oldblock = newblock;
newblock = ablocks[--a];
bh = sb_getblk(inode->i_sb, newblock);
- if (!bh) {
- err = -EIO;
+ if (unlikely(!bh)) {
+ err = -ENOMEM;
goto cleanup;
}
lock_buffer(bh);
@@ -1136,11 +1141,8 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
return err;
bh = sb_getblk(inode->i_sb, newblock);
- if (!bh) {
- err = -EIO;
- ext4_std_error(inode->i_sb, err);
- return err;
- }
+ if (unlikely(!bh))
+ return -ENOMEM;
lock_buffer(bh);
err = ext4_journal_get_create_access(handle, bh);
@@ -1960,7 +1962,6 @@ cleanup:
ext4_ext_drop_refs(npath);
kfree(npath);
}
- ext4_ext_invalidate_cache(inode);
return err;
}
@@ -1969,8 +1970,8 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
struct fiemap_extent_info *fieinfo)
{
struct ext4_ext_path *path = NULL;
- struct ext4_ext_cache newex;
struct ext4_extent *ex;
+ struct extent_status es;
ext4_lblk_t next, next_del, start = 0, end = 0;
ext4_lblk_t last = block + num;
int exists, depth = 0, err = 0;
@@ -2044,37 +2045,47 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
BUG_ON(end <= start);
if (!exists) {
- newex.ec_block = start;
- newex.ec_len = end - start;
- newex.ec_start = 0;
+ es.es_lblk = start;
+ es.es_len = end - start;
+ es.es_pblk = 0;
} else {
- newex.ec_block = le32_to_cpu(ex->ee_block);
- newex.ec_len = ext4_ext_get_actual_len(ex);
- newex.ec_start = ext4_ext_pblock(ex);
+ es.es_lblk = le32_to_cpu(ex->ee_block);
+ es.es_len = ext4_ext_get_actual_len(ex);
+ es.es_pblk = ext4_ext_pblock(ex);
if (ext4_ext_is_uninitialized(ex))
flags |= FIEMAP_EXTENT_UNWRITTEN;
}
/*
- * Find delayed extent and update newex accordingly. We call
- * it even in !exists case to find out whether newex is the
+ * Find delayed extent and update es accordingly. We call
+ * it even in !exists case to find out whether es is the
* last existing extent or not.
*/
- next_del = ext4_find_delayed_extent(inode, &newex);
+ next_del = ext4_find_delayed_extent(inode, &es);
if (!exists && next_del) {
exists = 1;
flags |= FIEMAP_EXTENT_DELALLOC;
}
up_read(&EXT4_I(inode)->i_data_sem);
- if (unlikely(newex.ec_len == 0)) {
- EXT4_ERROR_INODE(inode, "newex.ec_len == 0");
+ if (unlikely(es.es_len == 0)) {
+ EXT4_ERROR_INODE(inode, "es.es_len == 0");
err = -EIO;
break;
}
- /* This is possible iff next == next_del == EXT_MAX_BLOCKS */
- if (next == next_del) {
+ /*
+ * This is possible iff next == next_del == EXT_MAX_BLOCKS.
+ * we need to check next == EXT_MAX_BLOCKS because it is
+ * possible that an extent is with unwritten and delayed
+ * status due to when an extent is delayed allocated and
+ * is allocated by fallocate status tree will track both of
+ * them in a extent.
+ *
+ * So we could return a unwritten and delayed extent, and
+ * its block is equal to 'next'.
+ */
+ if (next == next_del && next == EXT_MAX_BLOCKS) {
flags |= FIEMAP_EXTENT_LAST;
if (unlikely(next_del != EXT_MAX_BLOCKS ||
next != EXT_MAX_BLOCKS)) {
@@ -2089,9 +2100,9 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
if (exists) {
err = fiemap_fill_next_extent(fieinfo,
- (__u64)newex.ec_block << blksize_bits,
- (__u64)newex.ec_start << blksize_bits,
- (__u64)newex.ec_len << blksize_bits,
+ (__u64)es.es_lblk << blksize_bits,
+ (__u64)es.es_pblk << blksize_bits,
+ (__u64)es.es_len << blksize_bits,
flags);
if (err < 0)
break;
@@ -2101,7 +2112,7 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
}
}
- block = newex.ec_block + newex.ec_len;
+ block = es.es_lblk + es.es_len;
}
if (path) {
@@ -2112,21 +2123,6 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
return err;
}
-static void
-ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block,
- __u32 len, ext4_fsblk_t start)
-{
- struct ext4_ext_cache *cex;
- BUG_ON(len == 0);
- spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
- trace_ext4_ext_put_in_cache(inode, block, len, start);
- cex = &EXT4_I(inode)->i_cached_extent;
- cex->ec_block = block;
- cex->ec_len = len;
- cex->ec_start = start;
- spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
-}
-
/*
* ext4_ext_put_gap_in_cache:
* calculate boundaries of the gap that the requested block fits into
@@ -2143,9 +2139,10 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
ex = path[depth].p_ext;
if (ex == NULL) {
- /* there is no extent yet, so gap is [0;-] */
- lblock = 0;
- len = EXT_MAX_BLOCKS;
+ /*
+ * there is no extent yet, so gap is [0;-] and we
+ * don't cache it
+ */
ext_debug("cache gap(whole file):");
} else if (block < le32_to_cpu(ex->ee_block)) {
lblock = block;
@@ -2154,6 +2151,9 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
block,
le32_to_cpu(ex->ee_block),
ext4_ext_get_actual_len(ex));
+ if (!ext4_find_delalloc_range(inode, lblock, lblock + len - 1))
+ ext4_es_insert_extent(inode, lblock, len, ~0,
+ EXTENT_STATUS_HOLE);
} else if (block >= le32_to_cpu(ex->ee_block)
+ ext4_ext_get_actual_len(ex)) {
ext4_lblk_t next;
@@ -2167,58 +2167,15 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
block);
BUG_ON(next == lblock);
len = next - lblock;
+ if (!ext4_find_delalloc_range(inode, lblock, lblock + len - 1))
+ ext4_es_insert_extent(inode, lblock, len, ~0,
+ EXTENT_STATUS_HOLE);
} else {
lblock = len = 0;
BUG();
}
ext_debug(" -> %u:%lu\n", lblock, len);
- ext4_ext_put_in_cache(inode, lblock, len, 0);
-}
-
-/*
- * ext4_ext_in_cache()
- * Checks to see if the given block is in the cache.
- * If it is, the cached extent is stored in the given
- * cache extent pointer.
- *
- * @inode: The files inode
- * @block: The block to look for in the cache
- * @ex: Pointer where the cached extent will be stored
- * if it contains block
- *
- * Return 0 if cache is invalid; 1 if the cache is valid
- */
-static int
-ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
- struct ext4_extent *ex)
-{
- struct ext4_ext_cache *cex;
- int ret = 0;
-
- /*
- * We borrow i_block_reservation_lock to protect i_cached_extent
- */
- spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
- cex = &EXT4_I(inode)->i_cached_extent;
-
- /* has cache valid data? */
- if (cex->ec_len == 0)
- goto errout;
-
- if (in_range(block, cex->ec_block, cex->ec_len)) {
- ex->ee_block = cpu_to_le32(cex->ec_block);
- ext4_ext_store_pblock(ex, cex->ec_start);
- ex->ee_len = cpu_to_le16(cex->ec_len);
- ext_debug("%u cached by %u:%u:%llu\n",
- block,
- cex->ec_block, cex->ec_len, cex->ec_start);
- ret = 1;
- }
-errout:
- trace_ext4_ext_in_cache(inode, block, ret);
- spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
- return ret;
}
/*
@@ -2653,13 +2610,11 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
ext_debug("truncate since %u to %u\n", start, end);
/* probably first extent we're gonna free will be last in block */
- handle = ext4_journal_start(inode, depth + 1);
+ handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, depth + 1);
if (IS_ERR(handle))
return PTR_ERR(handle);
again:
- ext4_ext_invalidate_cache(inode);
-
trace_ext4_ext_remove_space(inode, start, depth);
/*
@@ -3519,19 +3474,19 @@ out:
*
* Return 1 if there is a delalloc block in the range, otherwise 0.
*/
-static int ext4_find_delalloc_range(struct inode *inode,
- ext4_lblk_t lblk_start,
- ext4_lblk_t lblk_end)
+int ext4_find_delalloc_range(struct inode *inode,
+ ext4_lblk_t lblk_start,
+ ext4_lblk_t lblk_end)
{
struct extent_status es;
- es.start = lblk_start;
- ext4_es_find_extent(inode, &es);
- if (es.len == 0)
+ ext4_es_find_delayed_extent(inode, lblk_start, &es);
+ if (es.es_len == 0)
return 0; /* there is no delay extent in this tree */
- else if (es.start <= lblk_start && lblk_start < es.start + es.len)
+ else if (es.es_lblk <= lblk_start &&
+ lblk_start < es.es_lblk + es.es_len)
return 1;
- else if (lblk_start <= es.start && es.start <= lblk_end)
+ else if (lblk_start <= es.es_lblk && es.es_lblk <= lblk_end)
return 1;
else
return 0;
@@ -3656,6 +3611,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
ext4_set_io_unwritten_flag(inode, io);
else
ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
+ map->m_flags |= EXT4_MAP_UNWRITTEN;
if (ext4_should_dioread_nolock(inode))
map->m_flags |= EXT4_MAP_UNINIT;
goto out;
@@ -3677,8 +3633,10 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
* repeat fallocate creation request
* we already have an unwritten extent
*/
- if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)
+ if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) {
+ map->m_flags |= EXT4_MAP_UNWRITTEN;
goto map_out;
+ }
/* buffered READ or buffered write_begin() lookup */
if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
@@ -3898,35 +3856,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
map->m_lblk, map->m_len, inode->i_ino);
trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
- /* check in cache */
- if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) {
- if (!newex.ee_start_lo && !newex.ee_start_hi) {
- if ((sbi->s_cluster_ratio > 1) &&
- ext4_find_delalloc_cluster(inode, map->m_lblk))
- map->m_flags |= EXT4_MAP_FROM_CLUSTER;
-
- if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
- /*
- * block isn't allocated yet and
- * user doesn't want to allocate it
- */
- goto out2;
- }
- /* we should allocate requested block */
- } else {
- /* block is already allocated */
- if (sbi->s_cluster_ratio > 1)
- map->m_flags |= EXT4_MAP_FROM_CLUSTER;
- newblock = map->m_lblk
- - le32_to_cpu(newex.ee_block)
- + ext4_ext_pblock(&newex);
- /* number of remaining blocks in the extent */
- allocated = ext4_ext_get_actual_len(&newex) -
- (map->m_lblk - le32_to_cpu(newex.ee_block));
- goto out;
- }
- }
-
/* find extent for this block */
path = ext4_ext_find_extent(inode, map->m_lblk, NULL);
if (IS_ERR(path)) {
@@ -3973,15 +3902,9 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
ee_block, ee_len, newblock);
- /*
- * Do not put uninitialized extent
- * in the cache
- */
- if (!ext4_ext_is_uninitialized(ex)) {
- ext4_ext_put_in_cache(inode, ee_block,
- ee_len, ee_start);
+ if (!ext4_ext_is_uninitialized(ex))
goto out;
- }
+
allocated = ext4_ext_handle_uninitialized_extents(
handle, inode, map, path, flags,
allocated, newblock);
@@ -4002,7 +3925,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
* put just found gap into cache to speed up
* subsequent requests
*/
- ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
+ if ((flags & EXT4_GET_BLOCKS_NO_PUT_HOLE) == 0)
+ ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
goto out2;
}
@@ -4108,6 +4032,7 @@ got_allocated_blocks:
/* Mark uninitialized */
if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){
ext4_ext_mark_uninitialized(&newex);
+ map->m_flags |= EXT4_MAP_UNWRITTEN;
/*
* io_end structure was created for every IO write to an
* uninitialized extent. To avoid unnecessary conversion,
@@ -4241,10 +4166,9 @@ got_allocated_blocks:
* Cache the extent and update transaction to commit on fdatasync only
* when it is _not_ an uninitialized extent.
*/
- if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) {
- ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock);
+ if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0)
ext4_update_inode_fsync_trans(handle, inode, 1);
- } else
+ else
ext4_update_inode_fsync_trans(handle, inode, 0);
out:
if (allocated > map->m_len)
@@ -4284,7 +4208,7 @@ void ext4_ext_truncate(struct inode *inode)
* probably first extent we're gonna free will be last in block
*/
err = ext4_writepage_trans_blocks(inode);
- handle = ext4_journal_start(inode, err);
+ handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, err);
if (IS_ERR(handle))
return;
@@ -4303,7 +4227,6 @@ void ext4_ext_truncate(struct inode *inode)
goto out_stop;
down_write(&EXT4_I(inode)->i_data_sem);
- ext4_ext_invalidate_cache(inode);
ext4_discard_preallocations(inode);
@@ -4397,13 +4320,6 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
struct ext4_map_blocks map;
unsigned int credits, blkbits = inode->i_blkbits;
- /*
- * currently supporting (pre)allocate mode for extent-based
- * files _only_
- */
- if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
- return -EOPNOTSUPP;
-
/* Return error if mode is not supported */
if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
return -EOPNOTSUPP;
@@ -4415,6 +4331,13 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
if (ret)
return ret;
+ /*
+ * currently supporting (pre)allocate mode for extent-based
+ * files _only_
+ */
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+ return -EOPNOTSUPP;
+
trace_ext4_fallocate_enter(inode, offset, len, mode);
map.m_lblk = offset >> blkbits;
/*
@@ -4451,7 +4374,8 @@ retry:
while (ret >= 0 && ret < max_blocks) {
map.m_lblk = map.m_lblk + ret;
map.m_len = max_blocks = max_blocks - ret;
- handle = ext4_journal_start(inode, credits);
+ handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
+ credits);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
break;
@@ -4459,11 +4383,11 @@ retry:
ret = ext4_map_blocks(handle, inode, &map, flags);
if (ret <= 0) {
#ifdef EXT4FS_DEBUG
- WARN_ON(ret <= 0);
- printk(KERN_ERR "%s: ext4_ext_map_blocks "
- "returned error inode#%lu, block=%u, "
- "max_blocks=%u", __func__,
- inode->i_ino, map.m_lblk, max_blocks);
+ ext4_warning(inode->i_sb,
+ "inode #%lu: block %u: len %u: "
+ "ext4_ext_map_blocks returned %d",
+ inode->i_ino, map.m_lblk,
+ map.m_len, ret);
#endif
ext4_mark_inode_dirty(handle, inode);
ret2 = ext4_journal_stop(handle);
@@ -4529,21 +4453,19 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
while (ret >= 0 && ret < max_blocks) {
map.m_lblk += ret;
map.m_len = (max_blocks -= ret);
- handle = ext4_journal_start(inode, credits);
+ handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
break;
}
ret = ext4_map_blocks(handle, inode, &map,
EXT4_GET_BLOCKS_IO_CONVERT_EXT);
- if (ret <= 0) {
- WARN_ON(ret <= 0);
- ext4_msg(inode->i_sb, KERN_ERR,
- "%s:%d: inode #%lu: block %u: len %u: "
- "ext4_ext_map_blocks returned %d",
- __func__, __LINE__, inode->i_ino, map.m_lblk,
- map.m_len, ret);
- }
+ if (ret <= 0)
+ ext4_warning(inode->i_sb,
+ "inode #%lu: block %u: len %u: "
+ "ext4_ext_map_blocks returned %d",
+ inode->i_ino, map.m_lblk,
+ map.m_len, ret);
ext4_mark_inode_dirty(handle, inode);
ret2 = ext4_journal_stop(handle);
if (ret <= 0 || ret2 )
@@ -4553,42 +4475,48 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
}
/*
- * If newex is not existing extent (newex->ec_start equals zero) find
- * delayed extent at start of newex and update newex accordingly and
+ * If newes is not existing extent (newes->ec_pblk equals zero) find
+ * delayed extent at start of newes and update newes accordingly and
* return start of the next delayed extent.
*
- * If newex is existing extent (newex->ec_start is not equal zero)
+ * If newes is existing extent (newes->ec_pblk is not equal zero)
* return start of next delayed extent or EXT_MAX_BLOCKS if no delayed
- * extent found. Leave newex unmodified.
+ * extent found. Leave newes unmodified.
*/
static int ext4_find_delayed_extent(struct inode *inode,
- struct ext4_ext_cache *newex)
+ struct extent_status *newes)
{
struct extent_status es;
- ext4_lblk_t next_del;
+ ext4_lblk_t block, next_del;
- es.start = newex->ec_block;
- next_del = ext4_es_find_extent(inode, &es);
+ ext4_es_find_delayed_extent(inode, newes->es_lblk, &es);
- if (newex->ec_start == 0) {
+ if (newes->es_pblk == 0) {
/*
- * No extent in extent-tree contains block @newex->ec_start,
+ * No extent in extent-tree contains block @newes->es_pblk,
* then the block may stay in 1)a hole or 2)delayed-extent.
*/
- if (es.len == 0)
+ if (es.es_len == 0)
/* A hole found. */
return 0;
- if (es.start > newex->ec_block) {
+ if (es.es_lblk > newes->es_lblk) {
/* A hole found. */
- newex->ec_len = min(es.start - newex->ec_block,
- newex->ec_len);
+ newes->es_len = min(es.es_lblk - newes->es_lblk,
+ newes->es_len);
return 0;
}
- newex->ec_len = es.start + es.len - newex->ec_block;
+ newes->es_len = es.es_lblk + es.es_len - newes->es_lblk;
}
+ block = newes->es_lblk + newes->es_len;
+ ext4_es_find_delayed_extent(inode, block, &es);
+ if (es.es_len == 0)
+ next_del = EXT_MAX_BLOCKS;
+ else
+ next_del = es.es_lblk;
+
return next_del;
}
/* fiemap flags we can handle specified here */
@@ -4709,7 +4637,7 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
inode_dio_wait(inode);
credits = ext4_writepage_trans_blocks(inode);
- handle = ext4_journal_start(inode, credits);
+ handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
if (IS_ERR(handle)) {
err = PTR_ERR(handle);
goto out_dio;
@@ -4786,14 +4714,12 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
goto out;
down_write(&EXT4_I(inode)->i_data_sem);
- ext4_ext_invalidate_cache(inode);
ext4_discard_preallocations(inode);
err = ext4_es_remove_extent(inode, first_block,
stop_block - first_block);
err = ext4_ext_remove_space(inode, first_block, stop_block - 1);
- ext4_ext_invalidate_cache(inode);
ext4_discard_preallocations(inode);
if (IS_SYNC(inode))
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 564d981a2fc..f768f4a98a2 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -23,40 +23,53 @@
* (e.g. Reservation space warning), and provide extent-level locking.
* Delay extent tree is the first step to achieve this goal. It is
* original built by Yongqiang Yang. At that time it is called delay
- * extent tree, whose goal is only track delay extent in memory to
+ * extent tree, whose goal is only track delayed extents in memory to
* simplify the implementation of fiemap and bigalloc, and introduce
* lseek SEEK_DATA/SEEK_HOLE support. That is why it is still called
- * delay extent tree at the following comment. But for better
- * understand what it does, it has been rename to extent status tree.
+ * delay extent tree at the first commit. But for better understand
+ * what it does, it has been rename to extent status tree.
*
- * Currently the first step has been done. All delay extents are
- * tracked in the tree. It maintains the delay extent when a delay
- * allocation is issued, and the delay extent is written out or
+ * Step1:
+ * Currently the first step has been done. All delayed extents are
+ * tracked in the tree. It maintains the delayed extent when a delayed
+ * allocation is issued, and the delayed extent is written out or
* invalidated. Therefore the implementation of fiemap and bigalloc
* are simplified, and SEEK_DATA/SEEK_HOLE are introduced.
*
* The following comment describes the implemenmtation of extent
* status tree and future works.
+ *
+ * Step2:
+ * In this step all extent status are tracked by extent status tree.
+ * Thus, we can first try to lookup a block mapping in this tree before
+ * finding it in extent tree. Hence, single extent cache can be removed
+ * because extent status tree can do a better job. Extents in status
+ * tree are loaded on-demand. Therefore, the extent status tree may not
+ * contain all of the extents in a file. Meanwhile we define a shrinker
+ * to reclaim memory from extent status tree because fragmented extent
+ * tree will make status tree cost too much memory. written/unwritten/-
+ * hole extents in the tree will be reclaimed by this shrinker when we
+ * are under high memory pressure. Delayed extents will not be
+ * reclimed because fiemap, bigalloc, and seek_data/hole need it.
*/
/*
- * extents status tree implementation for ext4.
+ * Extent status tree implementation for ext4.
*
*
* ==========================================================================
- * Extents status encompass delayed extents and extent locks
+ * Extent status tree tracks all extent status.
*
- * 1. Why delayed extent implementation ?
+ * 1. Why we need to implement extent status tree?
*
- * Without delayed extent, ext4 identifies a delayed extent by looking
+ * Without extent status tree, ext4 identifies a delayed extent by looking
* up page cache, this has several deficiencies - complicated, buggy,
* and inefficient code.
*
- * FIEMAP, SEEK_HOLE/DATA, bigalloc, punch hole and writeout all need
- * to know if a block or a range of blocks are belonged to a delayed
- * extent.
+ * FIEMAP, SEEK_HOLE/DATA, bigalloc, and writeout all need to know if a
+ * block or a range of blocks are belonged to a delayed extent.
*
- * Let us have a look at how they do without delayed extents implementation.
+ * Let us have a look at how they do without extent status tree.
* -- FIEMAP
* FIEMAP looks up page cache to identify delayed allocations from holes.
*
@@ -68,47 +81,48 @@
* already under delayed allocation or not to determine whether
* quota reserving is needed for the cluster.
*
- * -- punch hole
- * punch hole looks up page cache to identify a delayed extent.
- *
* -- writeout
* Writeout looks up whole page cache to see if a buffer is
* mapped, If there are not very many delayed buffers, then it is
* time comsuming.
*
- * With delayed extents implementation, FIEMAP, SEEK_HOLE/DATA,
+ * With extent status tree implementation, FIEMAP, SEEK_HOLE/DATA,
* bigalloc and writeout can figure out if a block or a range of
* blocks is under delayed allocation(belonged to a delayed extent) or
- * not by searching the delayed extent tree.
+ * not by searching the extent tree.
*
*
* ==========================================================================
- * 2. ext4 delayed extents impelmentation
+ * 2. Ext4 extent status tree impelmentation
+ *
+ * -- extent
+ * A extent is a range of blocks which are contiguous logically and
+ * physically. Unlike extent in extent tree, this extent in ext4 is
+ * a in-memory struct, there is no corresponding on-disk data. There
+ * is no limit on length of extent, so an extent can contain as many
+ * blocks as they are contiguous logically and physically.
*
- * -- delayed extent
- * A delayed extent is a range of blocks which are contiguous
- * logically and under delayed allocation. Unlike extent in
- * ext4, delayed extent in ext4 is a in-memory struct, there is
- * no corresponding on-disk data. There is no limit on length of
- * delayed extent, so a delayed extent can contain as many blocks
- * as they are contiguous logically.
+ * -- extent status tree
+ * Every inode has an extent status tree and all allocation blocks
+ * are added to the tree with different status. The extent in the
+ * tree are ordered by logical block no.
*
- * -- delayed extent tree
- * Every inode has a delayed extent tree and all under delayed
- * allocation blocks are added to the tree as delayed extents.
- * Delayed extents in the tree are ordered by logical block no.
+ * -- operations on a extent status tree
+ * There are three important operations on a delayed extent tree: find
+ * next extent, adding a extent(a range of blocks) and removing a extent.
*
- * -- operations on a delayed extent tree
- * There are three operations on a delayed extent tree: find next
- * delayed extent, adding a space(a range of blocks) and removing
- * a space.
+ * -- race on a extent status tree
+ * Extent status tree is protected by inode->i_es_lock.
*
- * -- race on a delayed extent tree
- * Delayed extent tree is protected inode->i_es_lock.
+ * -- memory consumption
+ * Fragmented extent tree will make extent status tree cost too much
+ * memory. Hence, we will reclaim written/unwritten/hole extents from
+ * the tree under a heavy memory pressure.
*
*
* ==========================================================================
- * 3. performance analysis
+ * 3. Performance analysis
+ *
* -- overhead
* 1. There is a cache extent for write access, so if writes are
* not very random, adding space operaions are in O(1) time.
@@ -120,15 +134,21 @@
*
* ==========================================================================
* 4. TODO list
- * -- Track all extent status
*
- * -- Improve get block process
+ * -- Refactor delayed space reservation
*
* -- Extent-level locking
*/
static struct kmem_cache *ext4_es_cachep;
+static int __es_insert_extent(struct inode *inode, struct extent_status *newes);
+static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t end);
+static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
+ int nr_to_scan);
+static int ext4_es_reclaim_extents_count(struct super_block *sb);
+
int __init ext4_init_es(void)
{
ext4_es_cachep = KMEM_CACHE(extent_status, SLAB_RECLAIM_ACCOUNT);
@@ -161,7 +181,9 @@ static void ext4_es_print_tree(struct inode *inode)
while (node) {
struct extent_status *es;
es = rb_entry(node, struct extent_status, rb_node);
- printk(KERN_DEBUG " [%u/%u)", es->start, es->len);
+ printk(KERN_DEBUG " [%u/%u) %llu %llx",
+ es->es_lblk, es->es_len,
+ ext4_es_pblock(es), ext4_es_status(es));
node = rb_next(node);
}
printk(KERN_DEBUG "\n");
@@ -170,10 +192,10 @@ static void ext4_es_print_tree(struct inode *inode)
#define ext4_es_print_tree(inode)
#endif
-static inline ext4_lblk_t extent_status_end(struct extent_status *es)
+static inline ext4_lblk_t ext4_es_end(struct extent_status *es)
{
- BUG_ON(es->start + es->len < es->start);
- return es->start + es->len - 1;
+ BUG_ON(es->es_lblk + es->es_len < es->es_lblk);
+ return es->es_lblk + es->es_len - 1;
}
/*
@@ -181,25 +203,25 @@ static inline ext4_lblk_t extent_status_end(struct extent_status *es)
* it can't be found, try to find next extent.
*/
static struct extent_status *__es_tree_search(struct rb_root *root,
- ext4_lblk_t offset)
+ ext4_lblk_t lblk)
{
struct rb_node *node = root->rb_node;
struct extent_status *es = NULL;
while (node) {
es = rb_entry(node, struct extent_status, rb_node);
- if (offset < es->start)
+ if (lblk < es->es_lblk)
node = node->rb_left;
- else if (offset > extent_status_end(es))
+ else if (lblk > ext4_es_end(es))
node = node->rb_right;
else
return es;
}
- if (es && offset < es->start)
+ if (es && lblk < es->es_lblk)
return es;
- if (es && offset > extent_status_end(es)) {
+ if (es && lblk > ext4_es_end(es)) {
node = rb_next(&es->rb_node);
return node ? rb_entry(node, struct extent_status, rb_node) :
NULL;
@@ -209,79 +231,121 @@ static struct extent_status *__es_tree_search(struct rb_root *root,
}
/*
- * ext4_es_find_extent: find the 1st delayed extent covering @es->start
- * if it exists, otherwise, the next extent after @es->start.
+ * ext4_es_find_delayed_extent: find the 1st delayed extent covering @es->lblk
+ * if it exists, otherwise, the next extent after @es->lblk.
*
* @inode: the inode which owns delayed extents
+ * @lblk: the offset where we start to search
* @es: delayed extent that we found
- *
- * Returns the first block of the next extent after es, otherwise
- * EXT_MAX_BLOCKS if no delay extent is found.
- * Delayed extent is returned via @es.
*/
-ext4_lblk_t ext4_es_find_extent(struct inode *inode, struct extent_status *es)
+void ext4_es_find_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
+ struct extent_status *es)
{
struct ext4_es_tree *tree = NULL;
struct extent_status *es1 = NULL;
struct rb_node *node;
- ext4_lblk_t ret = EXT_MAX_BLOCKS;
- trace_ext4_es_find_extent_enter(inode, es->start);
+ BUG_ON(es == NULL);
+ trace_ext4_es_find_delayed_extent_enter(inode, lblk);
read_lock(&EXT4_I(inode)->i_es_lock);
tree = &EXT4_I(inode)->i_es_tree;
- /* find delay extent in cache firstly */
+ /* find extent in cache firstly */
+ es->es_lblk = es->es_len = es->es_pblk = 0;
if (tree->cache_es) {
es1 = tree->cache_es;
- if (in_range(es->start, es1->start, es1->len)) {
- es_debug("%u cached by [%u/%u)\n",
- es->start, es1->start, es1->len);
+ if (in_range(lblk, es1->es_lblk, es1->es_len)) {
+ es_debug("%u cached by [%u/%u) %llu %llx\n",
+ lblk, es1->es_lblk, es1->es_len,
+ ext4_es_pblock(es1), ext4_es_status(es1));
goto out;
}
}
- es->len = 0;
- es1 = __es_tree_search(&tree->root, es->start);
+ es1 = __es_tree_search(&tree->root, lblk);
out:
- if (es1) {
- tree->cache_es = es1;
- es->start = es1->start;
- es->len = es1->len;
- node = rb_next(&es1->rb_node);
- if (node) {
+ if (es1 && !ext4_es_is_delayed(es1)) {
+ while ((node = rb_next(&es1->rb_node)) != NULL) {
es1 = rb_entry(node, struct extent_status, rb_node);
- ret = es1->start;
+ if (ext4_es_is_delayed(es1))
+ break;
}
}
+ if (es1 && ext4_es_is_delayed(es1)) {
+ tree->cache_es = es1;
+ es->es_lblk = es1->es_lblk;
+ es->es_len = es1->es_len;
+ es->es_pblk = es1->es_pblk;
+ }
+
read_unlock(&EXT4_I(inode)->i_es_lock);
- trace_ext4_es_find_extent_exit(inode, es, ret);
- return ret;
+ ext4_es_lru_add(inode);
+ trace_ext4_es_find_delayed_extent_exit(inode, es);
}
static struct extent_status *
-ext4_es_alloc_extent(ext4_lblk_t start, ext4_lblk_t len)
+ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len,
+ ext4_fsblk_t pblk)
{
struct extent_status *es;
es = kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC);
if (es == NULL)
return NULL;
- es->start = start;
- es->len = len;
+ es->es_lblk = lblk;
+ es->es_len = len;
+ es->es_pblk = pblk;
+
+ /*
+ * We don't count delayed extent because we never try to reclaim them
+ */
+ if (!ext4_es_is_delayed(es))
+ EXT4_I(inode)->i_es_lru_nr++;
+
return es;
}
-static void ext4_es_free_extent(struct extent_status *es)
+static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
{
+ /* Decrease the lru counter when this es is not delayed */
+ if (!ext4_es_is_delayed(es)) {
+ BUG_ON(EXT4_I(inode)->i_es_lru_nr == 0);
+ EXT4_I(inode)->i_es_lru_nr--;
+ }
+
kmem_cache_free(ext4_es_cachep, es);
}
+/*
+ * Check whether or not two extents can be merged
+ * Condition:
+ * - logical block number is contiguous
+ * - physical block number is contiguous
+ * - status is equal
+ */
+static int ext4_es_can_be_merged(struct extent_status *es1,
+ struct extent_status *es2)
+{
+ if (es1->es_lblk + es1->es_len != es2->es_lblk)
+ return 0;
+
+ if (ext4_es_status(es1) != ext4_es_status(es2))
+ return 0;
+
+ if ((ext4_es_is_written(es1) || ext4_es_is_unwritten(es1)) &&
+ (ext4_es_pblock(es1) + es1->es_len != ext4_es_pblock(es2)))
+ return 0;
+
+ return 1;
+}
+
static struct extent_status *
-ext4_es_try_to_merge_left(struct ext4_es_tree *tree, struct extent_status *es)
+ext4_es_try_to_merge_left(struct inode *inode, struct extent_status *es)
{
+ struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
struct extent_status *es1;
struct rb_node *node;
@@ -290,10 +354,10 @@ ext4_es_try_to_merge_left(struct ext4_es_tree *tree, struct extent_status *es)
return es;
es1 = rb_entry(node, struct extent_status, rb_node);
- if (es->start == extent_status_end(es1) + 1) {
- es1->len += es->len;
+ if (ext4_es_can_be_merged(es1, es)) {
+ es1->es_len += es->es_len;
rb_erase(&es->rb_node, &tree->root);
- ext4_es_free_extent(es);
+ ext4_es_free_extent(inode, es);
es = es1;
}
@@ -301,8 +365,9 @@ ext4_es_try_to_merge_left(struct ext4_es_tree *tree, struct extent_status *es)
}
static struct extent_status *
-ext4_es_try_to_merge_right(struct ext4_es_tree *tree, struct extent_status *es)
+ext4_es_try_to_merge_right(struct inode *inode, struct extent_status *es)
{
+ struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
struct extent_status *es1;
struct rb_node *node;
@@ -311,69 +376,57 @@ ext4_es_try_to_merge_right(struct ext4_es_tree *tree, struct extent_status *es)
return es;
es1 = rb_entry(node, struct extent_status, rb_node);
- if (es1->start == extent_status_end(es) + 1) {
- es->len += es1->len;
+ if (ext4_es_can_be_merged(es, es1)) {
+ es->es_len += es1->es_len;
rb_erase(node, &tree->root);
- ext4_es_free_extent(es1);
+ ext4_es_free_extent(inode, es1);
}
return es;
}
-static int __es_insert_extent(struct ext4_es_tree *tree, ext4_lblk_t offset,
- ext4_lblk_t len)
+static int __es_insert_extent(struct inode *inode, struct extent_status *newes)
{
+ struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
struct rb_node **p = &tree->root.rb_node;
struct rb_node *parent = NULL;
struct extent_status *es;
- ext4_lblk_t end = offset + len - 1;
-
- BUG_ON(end < offset);
- es = tree->cache_es;
- if (es && offset == (extent_status_end(es) + 1)) {
- es_debug("cached by [%u/%u)\n", es->start, es->len);
- es->len += len;
- es = ext4_es_try_to_merge_right(tree, es);
- goto out;
- } else if (es && es->start == end + 1) {
- es_debug("cached by [%u/%u)\n", es->start, es->len);
- es->start = offset;
- es->len += len;
- es = ext4_es_try_to_merge_left(tree, es);
- goto out;
- } else if (es && es->start <= offset &&
- end <= extent_status_end(es)) {
- es_debug("cached by [%u/%u)\n", es->start, es->len);
- goto out;
- }
while (*p) {
parent = *p;
es = rb_entry(parent, struct extent_status, rb_node);
- if (offset < es->start) {
- if (es->start == end + 1) {
- es->start = offset;
- es->len += len;
- es = ext4_es_try_to_merge_left(tree, es);
+ if (newes->es_lblk < es->es_lblk) {
+ if (ext4_es_can_be_merged(newes, es)) {
+ /*
+ * Here we can modify es_lblk directly
+ * because it isn't overlapped.
+ */
+ es->es_lblk = newes->es_lblk;
+ es->es_len += newes->es_len;
+ if (ext4_es_is_written(es) ||
+ ext4_es_is_unwritten(es))
+ ext4_es_store_pblock(es,
+ newes->es_pblk);
+ es = ext4_es_try_to_merge_left(inode, es);
goto out;
}
p = &(*p)->rb_left;
- } else if (offset > extent_status_end(es)) {
- if (offset == extent_status_end(es) + 1) {
- es->len += len;
- es = ext4_es_try_to_merge_right(tree, es);
+ } else if (newes->es_lblk > ext4_es_end(es)) {
+ if (ext4_es_can_be_merged(es, newes)) {
+ es->es_len += newes->es_len;
+ es = ext4_es_try_to_merge_right(inode, es);
goto out;
}
p = &(*p)->rb_right;
} else {
- if (extent_status_end(es) <= end)
- es->len = offset - es->start + len;
- goto out;
+ BUG_ON(1);
+ return -EINVAL;
}
}
- es = ext4_es_alloc_extent(offset, len);
+ es = ext4_es_alloc_extent(inode, newes->es_lblk, newes->es_len,
+ newes->es_pblk);
if (!es)
return -ENOMEM;
rb_link_node(&es->rb_node, parent, p);
@@ -385,85 +438,166 @@ out:
}
/*
- * ext4_es_insert_extent() adds a space to a delayed extent tree.
- * Caller holds inode->i_es_lock.
+ * ext4_es_insert_extent() adds a space to a extent status tree.
*
* ext4_es_insert_extent is called by ext4_da_write_begin and
* ext4_es_remove_extent.
*
* Return 0 on success, error code on failure.
*/
-int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t offset,
- ext4_lblk_t len)
+int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len, ext4_fsblk_t pblk,
+ unsigned long long status)
{
- struct ext4_es_tree *tree;
+ struct extent_status newes;
+ ext4_lblk_t end = lblk + len - 1;
int err = 0;
- trace_ext4_es_insert_extent(inode, offset, len);
- es_debug("add [%u/%u) to extent status tree of inode %lu\n",
- offset, len, inode->i_ino);
+ es_debug("add [%u/%u) %llu %llx to extent status tree of inode %lu\n",
+ lblk, len, pblk, status, inode->i_ino);
+
+ if (!len)
+ return 0;
+
+ BUG_ON(end < lblk);
+
+ newes.es_lblk = lblk;
+ newes.es_len = len;
+ ext4_es_store_pblock(&newes, pblk);
+ ext4_es_store_status(&newes, status);
+ trace_ext4_es_insert_extent(inode, &newes);
write_lock(&EXT4_I(inode)->i_es_lock);
- tree = &EXT4_I(inode)->i_es_tree;
- err = __es_insert_extent(tree, offset, len);
+ err = __es_remove_extent(inode, lblk, end);
+ if (err != 0)
+ goto error;
+ err = __es_insert_extent(inode, &newes);
+
+error:
write_unlock(&EXT4_I(inode)->i_es_lock);
+ ext4_es_lru_add(inode);
ext4_es_print_tree(inode);
return err;
}
/*
- * ext4_es_remove_extent() removes a space from a delayed extent tree.
- * Caller holds inode->i_es_lock.
+ * ext4_es_lookup_extent() looks up an extent in extent status tree.
*
- * Return 0 on success, error code on failure.
+ * ext4_es_lookup_extent is called by ext4_map_blocks/ext4_da_map_blocks.
+ *
+ * Return: 1 on found, 0 on not
*/
-int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t offset,
- ext4_lblk_t len)
+int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
+ struct extent_status *es)
{
- struct rb_node *node;
struct ext4_es_tree *tree;
+ struct extent_status *es1 = NULL;
+ struct rb_node *node;
+ int found = 0;
+
+ trace_ext4_es_lookup_extent_enter(inode, lblk);
+ es_debug("lookup extent in block %u\n", lblk);
+
+ tree = &EXT4_I(inode)->i_es_tree;
+ read_lock(&EXT4_I(inode)->i_es_lock);
+
+ /* find extent in cache firstly */
+ es->es_lblk = es->es_len = es->es_pblk = 0;
+ if (tree->cache_es) {
+ es1 = tree->cache_es;
+ if (in_range(lblk, es1->es_lblk, es1->es_len)) {
+ es_debug("%u cached by [%u/%u)\n",
+ lblk, es1->es_lblk, es1->es_len);
+ found = 1;
+ goto out;
+ }
+ }
+
+ node = tree->root.rb_node;
+ while (node) {
+ es1 = rb_entry(node, struct extent_status, rb_node);
+ if (lblk < es1->es_lblk)
+ node = node->rb_left;
+ else if (lblk > ext4_es_end(es1))
+ node = node->rb_right;
+ else {
+ found = 1;
+ break;
+ }
+ }
+
+out:
+ if (found) {
+ BUG_ON(!es1);
+ es->es_lblk = es1->es_lblk;
+ es->es_len = es1->es_len;
+ es->es_pblk = es1->es_pblk;
+ }
+
+ read_unlock(&EXT4_I(inode)->i_es_lock);
+
+ ext4_es_lru_add(inode);
+ trace_ext4_es_lookup_extent_exit(inode, es, found);
+ return found;
+}
+
+static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t end)
+{
+ struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
+ struct rb_node *node;
struct extent_status *es;
struct extent_status orig_es;
- ext4_lblk_t len1, len2, end;
+ ext4_lblk_t len1, len2;
+ ext4_fsblk_t block;
int err = 0;
- trace_ext4_es_remove_extent(inode, offset, len);
- es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
- offset, len, inode->i_ino);
-
- end = offset + len - 1;
- BUG_ON(end < offset);
- write_lock(&EXT4_I(inode)->i_es_lock);
- tree = &EXT4_I(inode)->i_es_tree;
- es = __es_tree_search(&tree->root, offset);
+ es = __es_tree_search(&tree->root, lblk);
if (!es)
goto out;
- if (es->start > end)
+ if (es->es_lblk > end)
goto out;
/* Simply invalidate cache_es. */
tree->cache_es = NULL;
- orig_es.start = es->start;
- orig_es.len = es->len;
- len1 = offset > es->start ? offset - es->start : 0;
- len2 = extent_status_end(es) > end ?
- extent_status_end(es) - end : 0;
+ orig_es.es_lblk = es->es_lblk;
+ orig_es.es_len = es->es_len;
+ orig_es.es_pblk = es->es_pblk;
+
+ len1 = lblk > es->es_lblk ? lblk - es->es_lblk : 0;
+ len2 = ext4_es_end(es) > end ? ext4_es_end(es) - end : 0;
if (len1 > 0)
- es->len = len1;
+ es->es_len = len1;
if (len2 > 0) {
if (len1 > 0) {
- err = __es_insert_extent(tree, end + 1, len2);
+ struct extent_status newes;
+
+ newes.es_lblk = end + 1;
+ newes.es_len = len2;
+ if (ext4_es_is_written(&orig_es) ||
+ ext4_es_is_unwritten(&orig_es)) {
+ block = ext4_es_pblock(&orig_es) +
+ orig_es.es_len - len2;
+ ext4_es_store_pblock(&newes, block);
+ }
+ ext4_es_store_status(&newes, ext4_es_status(&orig_es));
+ err = __es_insert_extent(inode, &newes);
if (err) {
- es->start = orig_es.start;
- es->len = orig_es.len;
+ es->es_lblk = orig_es.es_lblk;
+ es->es_len = orig_es.es_len;
goto out;
}
} else {
- es->start = end + 1;
- es->len = len2;
+ es->es_lblk = end + 1;
+ es->es_len = len2;
+ if (ext4_es_is_written(es) ||
+ ext4_es_is_unwritten(es)) {
+ block = orig_es.es_pblk + orig_es.es_len - len2;
+ ext4_es_store_pblock(es, block);
+ }
}
goto out;
}
@@ -476,10 +610,10 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t offset,
es = NULL;
}
- while (es && extent_status_end(es) <= end) {
+ while (es && ext4_es_end(es) <= end) {
node = rb_next(&es->rb_node);
rb_erase(&es->rb_node, &tree->root);
- ext4_es_free_extent(es);
+ ext4_es_free_extent(inode, es);
if (!node) {
es = NULL;
break;
@@ -487,14 +621,183 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t offset,
es = rb_entry(node, struct extent_status, rb_node);
}
- if (es && es->start < end + 1) {
- len1 = extent_status_end(es) - end;
- es->start = end + 1;
- es->len = len1;
+ if (es && es->es_lblk < end + 1) {
+ ext4_lblk_t orig_len = es->es_len;
+
+ len1 = ext4_es_end(es) - end;
+ es->es_lblk = end + 1;
+ es->es_len = len1;
+ if (ext4_es_is_written(es) || ext4_es_is_unwritten(es)) {
+ block = es->es_pblk + orig_len - len1;
+ ext4_es_store_pblock(es, block);
+ }
}
out:
+ return err;
+}
+
+/*
+ * ext4_es_remove_extent() removes a space from a extent status tree.
+ *
+ * Return 0 on success, error code on failure.
+ */
+int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len)
+{
+ ext4_lblk_t end;
+ int err = 0;
+
+ trace_ext4_es_remove_extent(inode, lblk, len);
+ es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
+ lblk, len, inode->i_ino);
+
+ if (!len)
+ return err;
+
+ end = lblk + len - 1;
+ BUG_ON(end < lblk);
+
+ write_lock(&EXT4_I(inode)->i_es_lock);
+ err = __es_remove_extent(inode, lblk, end);
write_unlock(&EXT4_I(inode)->i_es_lock);
ext4_es_print_tree(inode);
return err;
}
+
+static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
+{
+ struct ext4_sb_info *sbi = container_of(shrink,
+ struct ext4_sb_info, s_es_shrinker);
+ struct ext4_inode_info *ei;
+ struct list_head *cur, *tmp, scanned;
+ int nr_to_scan = sc->nr_to_scan;
+ int ret, nr_shrunk = 0;
+
+ trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan);
+
+ if (!nr_to_scan)
+ return ext4_es_reclaim_extents_count(sbi->s_sb);
+
+ INIT_LIST_HEAD(&scanned);
+
+ spin_lock(&sbi->s_es_lru_lock);
+ list_for_each_safe(cur, tmp, &sbi->s_es_lru) {
+ list_move_tail(cur, &scanned);
+
+ ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
+
+ read_lock(&ei->i_es_lock);
+ if (ei->i_es_lru_nr == 0) {
+ read_unlock(&ei->i_es_lock);
+ continue;
+ }
+ read_unlock(&ei->i_es_lock);
+
+ write_lock(&ei->i_es_lock);
+ ret = __es_try_to_reclaim_extents(ei, nr_to_scan);
+ write_unlock(&ei->i_es_lock);
+
+ nr_shrunk += ret;
+ nr_to_scan -= ret;
+ if (nr_to_scan == 0)
+ break;
+ }
+ list_splice_tail(&scanned, &sbi->s_es_lru);
+ spin_unlock(&sbi->s_es_lru_lock);
+ trace_ext4_es_shrink_exit(sbi->s_sb, nr_shrunk);
+
+ return ext4_es_reclaim_extents_count(sbi->s_sb);
+}
+
+void ext4_es_register_shrinker(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi;
+
+ sbi = EXT4_SB(sb);
+ INIT_LIST_HEAD(&sbi->s_es_lru);
+ spin_lock_init(&sbi->s_es_lru_lock);
+ sbi->s_es_shrinker.shrink = ext4_es_shrink;
+ sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
+ register_shrinker(&sbi->s_es_shrinker);
+}
+
+void ext4_es_unregister_shrinker(struct super_block *sb)
+{
+ unregister_shrinker(&EXT4_SB(sb)->s_es_shrinker);
+}
+
+void ext4_es_lru_add(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+
+ spin_lock(&sbi->s_es_lru_lock);
+ if (list_empty(&ei->i_es_lru))
+ list_add_tail(&ei->i_es_lru, &sbi->s_es_lru);
+ else
+ list_move_tail(&ei->i_es_lru, &sbi->s_es_lru);
+ spin_unlock(&sbi->s_es_lru_lock);
+}
+
+void ext4_es_lru_del(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+
+ spin_lock(&sbi->s_es_lru_lock);
+ if (!list_empty(&ei->i_es_lru))
+ list_del_init(&ei->i_es_lru);
+ spin_unlock(&sbi->s_es_lru_lock);
+}
+
+static int ext4_es_reclaim_extents_count(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_inode_info *ei;
+ struct list_head *cur;
+ int nr_cached = 0;
+
+ spin_lock(&sbi->s_es_lru_lock);
+ list_for_each(cur, &sbi->s_es_lru) {
+ ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
+ read_lock(&ei->i_es_lock);
+ nr_cached += ei->i_es_lru_nr;
+ read_unlock(&ei->i_es_lock);
+ }
+ spin_unlock(&sbi->s_es_lru_lock);
+ trace_ext4_es_reclaim_extents_count(sb, nr_cached);
+ return nr_cached;
+}
+
+static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
+ int nr_to_scan)
+{
+ struct inode *inode = &ei->vfs_inode;
+ struct ext4_es_tree *tree = &ei->i_es_tree;
+ struct rb_node *node;
+ struct extent_status *es;
+ int nr_shrunk = 0;
+
+ if (ei->i_es_lru_nr == 0)
+ return 0;
+
+ node = rb_first(&tree->root);
+ while (node != NULL) {
+ es = rb_entry(node, struct extent_status, rb_node);
+ node = rb_next(&es->rb_node);
+ /*
+ * We can't reclaim delayed extent from status tree because
+ * fiemap, bigallic, and seek_data/hole need to use it.
+ */
+ if (!ext4_es_is_delayed(es)) {
+ rb_erase(&es->rb_node, &tree->root);
+ ext4_es_free_extent(inode, es);
+ nr_shrunk++;
+ if (--nr_to_scan == 0)
+ break;
+ }
+ }
+ tree->cache_es = NULL;
+ return nr_shrunk;
+}
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index 077f82db092..cf83e77b16c 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -20,10 +20,21 @@
#define es_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__)
#endif
+#define EXTENT_STATUS_WRITTEN 0x80000000 /* written extent */
+#define EXTENT_STATUS_UNWRITTEN 0x40000000 /* unwritten extent */
+#define EXTENT_STATUS_DELAYED 0x20000000 /* delayed extent */
+#define EXTENT_STATUS_HOLE 0x10000000 /* hole */
+
+#define EXTENT_STATUS_FLAGS (EXTENT_STATUS_WRITTEN | \
+ EXTENT_STATUS_UNWRITTEN | \
+ EXTENT_STATUS_DELAYED | \
+ EXTENT_STATUS_HOLE)
+
struct extent_status {
struct rb_node rb_node;
- ext4_lblk_t start; /* first block extent covers */
- ext4_lblk_t len; /* length of extent in block */
+ ext4_lblk_t es_lblk; /* first logical block extent covers */
+ ext4_lblk_t es_len; /* length of extent in block */
+ ext4_fsblk_t es_pblk; /* first physical block */
};
struct ext4_es_tree {
@@ -35,11 +46,69 @@ extern int __init ext4_init_es(void);
extern void ext4_exit_es(void);
extern void ext4_es_init_tree(struct ext4_es_tree *tree);
-extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t start,
+extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len, ext4_fsblk_t pblk,
+ unsigned long long status);
+extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len);
-extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t start,
- ext4_lblk_t len);
-extern ext4_lblk_t ext4_es_find_extent(struct inode *inode,
- struct extent_status *es);
+extern void ext4_es_find_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
+ struct extent_status *es);
+extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
+ struct extent_status *es);
+
+static inline int ext4_es_is_written(struct extent_status *es)
+{
+ return (es->es_pblk & EXTENT_STATUS_WRITTEN);
+}
+
+static inline int ext4_es_is_unwritten(struct extent_status *es)
+{
+ return (es->es_pblk & EXTENT_STATUS_UNWRITTEN);
+}
+
+static inline int ext4_es_is_delayed(struct extent_status *es)
+{
+ return (es->es_pblk & EXTENT_STATUS_DELAYED);
+}
+
+static inline int ext4_es_is_hole(struct extent_status *es)
+{
+ return (es->es_pblk & EXTENT_STATUS_HOLE);
+}
+
+static inline ext4_fsblk_t ext4_es_status(struct extent_status *es)
+{
+ return (es->es_pblk & EXTENT_STATUS_FLAGS);
+}
+
+static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es)
+{
+ return (es->es_pblk & ~EXTENT_STATUS_FLAGS);
+}
+
+static inline void ext4_es_store_pblock(struct extent_status *es,
+ ext4_fsblk_t pb)
+{
+ ext4_fsblk_t block;
+
+ block = (pb & ~EXTENT_STATUS_FLAGS) |
+ (es->es_pblk & EXTENT_STATUS_FLAGS);
+ es->es_pblk = block;
+}
+
+static inline void ext4_es_store_status(struct extent_status *es,
+ unsigned long long status)
+{
+ ext4_fsblk_t block;
+
+ block = (status & EXTENT_STATUS_FLAGS) |
+ (es->es_pblk & ~EXTENT_STATUS_FLAGS);
+ es->es_pblk = block;
+}
+
+extern void ext4_es_register_shrinker(struct super_block *sb);
+extern void ext4_es_unregister_shrinker(struct super_block *sb);
+extern void ext4_es_lru_add(struct inode *inode);
+extern void ext4_es_lru_del(struct inode *inode);
#endif /* _EXT4_EXTENTS_STATUS_H */
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index c00ea7945eb..64848b595b2 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -240,7 +240,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
handle_t *handle;
int err;
- handle = ext4_journal_start_sb(sb, 1);
+ handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
if (IS_ERR(handle))
return PTR_ERR(handle);
err = ext4_journal_get_write_access(handle, sbi->s_sbh);
@@ -464,10 +464,8 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
* If there is a delay extent at this offset,
* it will be as a data.
*/
- es.start = last;
- (void)ext4_es_find_extent(inode, &es);
- if (last >= es.start &&
- last < es.start + es.len) {
+ ext4_es_find_delayed_extent(inode, last, &es);
+ if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
if (last != start)
dataoff = last << blkbits;
break;
@@ -549,11 +547,9 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
* If there is a delay extent at this offset,
* we will skip this extent.
*/
- es.start = last;
- (void)ext4_es_find_extent(inode, &es);
- if (last >= es.start &&
- last < es.start + es.len) {
- last = es.start + es.len;
+ ext4_es_find_delayed_extent(inode, last, &es);
+ if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
+ last = es.es_lblk + es.es_len;
holeoff = last << blkbits;
continue;
}
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index fa8e4911d35..3d586f02883 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -155,11 +155,11 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
/* Check to see if the seed is all zero's */
if (hinfo->seed) {
for (i = 0; i < 4; i++) {
- if (hinfo->seed[i])
+ if (hinfo->seed[i]) {
+ memcpy(buf, hinfo->seed, sizeof(buf));
break;
+ }
}
- if (i < 4)
- memcpy(buf, hinfo->seed, sizeof(buf));
}
switch (hinfo->hash_version) {
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 3f32c801244..32fd2b9075d 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -634,8 +634,10 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
* For other inodes, search forward from the parent directory's block
* group to find a free inode.
*/
-struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, umode_t mode,
- const struct qstr *qstr, __u32 goal, uid_t *owner)
+struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
+ umode_t mode, const struct qstr *qstr,
+ __u32 goal, uid_t *owner, int handle_type,
+ unsigned int line_no, int nblocks)
{
struct super_block *sb;
struct buffer_head *inode_bitmap_bh = NULL;
@@ -725,6 +727,15 @@ repeat_in_this_group:
"inode=%lu", ino + 1);
continue;
}
+ if (!handle) {
+ BUG_ON(nblocks <= 0);
+ handle = __ext4_journal_start_sb(dir->i_sb, line_no,
+ handle_type, nblocks);
+ if (IS_ERR(handle)) {
+ err = PTR_ERR(handle);
+ goto fail;
+ }
+ }
BUFFER_TRACE(inode_bitmap_bh, "get_write_access");
err = ext4_journal_get_write_access(handle, inode_bitmap_bh);
if (err)
@@ -1017,17 +1028,17 @@ iget_failed:
inode = NULL;
bad_orphan:
ext4_warning(sb, "bad orphan inode %lu! e2fsck was run?", ino);
- printk(KERN_NOTICE "ext4_test_bit(bit=%d, block=%llu) = %d\n",
+ printk(KERN_WARNING "ext4_test_bit(bit=%d, block=%llu) = %d\n",
bit, (unsigned long long)bitmap_bh->b_blocknr,
ext4_test_bit(bit, bitmap_bh->b_data));
- printk(KERN_NOTICE "inode=%p\n", inode);
+ printk(KERN_WARNING "inode=%p\n", inode);
if (inode) {
- printk(KERN_NOTICE "is_bad_inode(inode)=%d\n",
+ printk(KERN_WARNING "is_bad_inode(inode)=%d\n",
is_bad_inode(inode));
- printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n",
+ printk(KERN_WARNING "NEXT_ORPHAN(inode)=%u\n",
NEXT_ORPHAN(inode));
- printk(KERN_NOTICE "max_ino=%lu\n", max_ino);
- printk(KERN_NOTICE "i_nlink=%u\n", inode->i_nlink);
+ printk(KERN_WARNING "max_ino=%lu\n", max_ino);
+ printk(KERN_WARNING "i_nlink=%u\n", inode->i_nlink);
/* Avoid freeing blocks if we got a bad deleted inode */
if (inode->i_nlink == 0)
inode->i_blocks = 0;
@@ -1137,7 +1148,7 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))
goto out;
- handle = ext4_journal_start_sb(sb, 1);
+ handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
goto out;
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 20862f96e8a..c541ab8b64d 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -146,6 +146,7 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
struct super_block *sb = inode->i_sb;
Indirect *p = chain;
struct buffer_head *bh;
+ int ret = -EIO;
*err = 0;
/* i_data is not going away, no lock needed */
@@ -154,8 +155,10 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
goto no_block;
while (--depth) {
bh = sb_getblk(sb, le32_to_cpu(p->key));
- if (unlikely(!bh))
+ if (unlikely(!bh)) {
+ ret = -ENOMEM;
goto failure;
+ }
if (!bh_uptodate_or_lock(bh)) {
if (bh_submit_read(bh) < 0) {
@@ -177,7 +180,7 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
return NULL;
failure:
- *err = -EIO;
+ *err = ret;
no_block:
return p;
}
@@ -355,9 +358,8 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
* for the first direct block
*/
new_blocks[index] = current_block;
- printk(KERN_INFO "%s returned more blocks than "
+ WARN(1, KERN_INFO "%s returned more blocks than "
"requested\n", __func__);
- WARN_ON(1);
break;
}
}
@@ -471,7 +473,7 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
*/
bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
if (unlikely(!bh)) {
- err = -EIO;
+ err = -ENOMEM;
goto failed;
}
@@ -789,7 +791,7 @@ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
if (final_size > inode->i_size) {
/* Credits for sb + inode write */
- handle = ext4_journal_start(inode, 2);
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
goto out;
@@ -849,7 +851,7 @@ locked:
int err;
/* Credits for sb + inode write */
- handle = ext4_journal_start(inode, 2);
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
if (IS_ERR(handle)) {
/* This is really bad luck. We've written the data
* but cannot extend i_size. Bail out and pretend
@@ -948,7 +950,8 @@ static handle_t *start_transaction(struct inode *inode)
{
handle_t *result;
- result = ext4_journal_start(inode, ext4_blocks_for_truncate(inode));
+ result = ext4_journal_start(inode, EXT4_HT_TRUNCATE,
+ ext4_blocks_for_truncate(inode));
if (!IS_ERR(result))
return result;
@@ -1515,3 +1518,243 @@ out_stop:
trace_ext4_truncate_exit(inode);
}
+static int free_hole_blocks(handle_t *handle, struct inode *inode,
+ struct buffer_head *parent_bh, __le32 *i_data,
+ int level, ext4_lblk_t first,
+ ext4_lblk_t count, int max)
+{
+ struct buffer_head *bh = NULL;
+ int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
+ int ret = 0;
+ int i, inc;
+ ext4_lblk_t offset;
+ __le32 blk;
+
+ inc = 1 << ((EXT4_BLOCK_SIZE_BITS(inode->i_sb) - 2) * level);
+ for (i = 0, offset = 0; i < max; i++, i_data++, offset += inc) {
+ if (offset >= count + first)
+ break;
+ if (*i_data == 0 || (offset + inc) <= first)
+ continue;
+ blk = *i_data;
+ if (level > 0) {
+ ext4_lblk_t first2;
+ bh = sb_bread(inode->i_sb, blk);
+ if (!bh) {
+ EXT4_ERROR_INODE_BLOCK(inode, blk,
+ "Read failure");
+ return -EIO;
+ }
+ first2 = (first > offset) ? first - offset : 0;
+ ret = free_hole_blocks(handle, inode, bh,
+ (__le32 *)bh->b_data, level - 1,
+ first2, count - offset,
+ inode->i_sb->s_blocksize >> 2);
+ if (ret) {
+ brelse(bh);
+ goto err;
+ }
+ }
+ if (level == 0 ||
+ (bh && all_zeroes((__le32 *)bh->b_data,
+ (__le32 *)bh->b_data + addr_per_block))) {
+ ext4_free_data(handle, inode, parent_bh, &blk, &blk+1);
+ *i_data = 0;
+ }
+ brelse(bh);
+ bh = NULL;
+ }
+
+err:
+ return ret;
+}
+
+static int ext4_free_hole_blocks(handle_t *handle, struct inode *inode,
+ ext4_lblk_t first, ext4_lblk_t stop)
+{
+ int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
+ int level, ret = 0;
+ int num = EXT4_NDIR_BLOCKS;
+ ext4_lblk_t count, max = EXT4_NDIR_BLOCKS;
+ __le32 *i_data = EXT4_I(inode)->i_data;
+
+ count = stop - first;
+ for (level = 0; level < 4; level++, max *= addr_per_block) {
+ if (first < max) {
+ ret = free_hole_blocks(handle, inode, NULL, i_data,
+ level, first, count, num);
+ if (ret)
+ goto err;
+ if (count > max - first)
+ count -= max - first;
+ else
+ break;
+ first = 0;
+ } else {
+ first -= max;
+ }
+ i_data += num;
+ if (level == 0) {
+ num = 1;
+ max = 1;
+ }
+ }
+
+err:
+ return ret;
+}
+
+int ext4_ind_punch_hole(struct file *file, loff_t offset, loff_t length)
+{
+ struct inode *inode = file->f_path.dentry->d_inode;
+ struct super_block *sb = inode->i_sb;
+ ext4_lblk_t first_block, stop_block;
+ struct address_space *mapping = inode->i_mapping;
+ handle_t *handle = NULL;
+ loff_t first_page, last_page, page_len;
+ loff_t first_page_offset, last_page_offset;
+ int err = 0;
+
+ /*
+ * Write out all dirty pages to avoid race conditions
+ * Then release them.
+ */
+ if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
+ err = filemap_write_and_wait_range(mapping,
+ offset, offset + length - 1);
+ if (err)
+ return err;
+ }
+
+ mutex_lock(&inode->i_mutex);
+ /* It's not possible punch hole on append only file */
+ if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
+ err = -EPERM;
+ goto out_mutex;
+ }
+ if (IS_SWAPFILE(inode)) {
+ err = -ETXTBSY;
+ goto out_mutex;
+ }
+
+ /* No need to punch hole beyond i_size */
+ if (offset >= inode->i_size)
+ goto out_mutex;
+
+ /*
+ * If the hole extents beyond i_size, set the hole
+ * to end after the page that contains i_size
+ */
+ if (offset + length > inode->i_size) {
+ length = inode->i_size +
+ PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) -
+ offset;
+ }
+
+ first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ last_page = (offset + length) >> PAGE_CACHE_SHIFT;
+
+ first_page_offset = first_page << PAGE_CACHE_SHIFT;
+ last_page_offset = last_page << PAGE_CACHE_SHIFT;
+
+ /* Now release the pages */
+ if (last_page_offset > first_page_offset) {
+ truncate_pagecache_range(inode, first_page_offset,
+ last_page_offset - 1);
+ }
+
+ /* Wait all existing dio works, newcomers will block on i_mutex */
+ inode_dio_wait(inode);
+
+ handle = start_transaction(inode);
+ if (IS_ERR(handle))
+ goto out_mutex;
+
+ /*
+ * Now we need to zero out the non-page-aligned data in the
+ * pages at the start and tail of the hole, and unmap the buffer
+ * heads for the block aligned regions of the page that were
+ * completely zerod.
+ */
+ if (first_page > last_page) {
+ /*
+ * If the file space being truncated is contained within a page
+ * just zero out and unmap the middle of that page
+ */
+ err = ext4_discard_partial_page_buffers(handle,
+ mapping, offset, length, 0);
+ if (err)
+ goto out;
+ } else {
+ /*
+ * Zero out and unmap the paritial page that contains
+ * the start of the hole
+ */
+ page_len = first_page_offset - offset;
+ if (page_len > 0) {
+ err = ext4_discard_partial_page_buffers(handle, mapping,
+ offset, page_len, 0);
+ if (err)
+ goto out;
+ }
+
+ /*
+ * Zero out and unmap the partial page that contains
+ * the end of the hole
+ */
+ page_len = offset + length - last_page_offset;
+ if (page_len > 0) {
+ err = ext4_discard_partial_page_buffers(handle, mapping,
+ last_page_offset, page_len, 0);
+ if (err)
+ goto out;
+ }
+ }
+
+ /*
+ * If i_size contained in the last page, we need to
+ * unmap and zero the paritial page after i_size
+ */
+ if (inode->i_size >> PAGE_CACHE_SHIFT == last_page &&
+ inode->i_size % PAGE_CACHE_SIZE != 0) {
+ page_len = PAGE_CACHE_SIZE -
+ (inode->i_size & (PAGE_CACHE_SIZE - 1));
+ if (page_len > 0) {
+ err = ext4_discard_partial_page_buffers(handle,
+ mapping, inode->i_size, page_len, 0);
+ if (err)
+ goto out;
+ }
+ }
+
+ first_block = (offset + sb->s_blocksize - 1) >>
+ EXT4_BLOCK_SIZE_BITS(sb);
+ stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
+
+ if (first_block >= stop_block)
+ goto out;
+
+ down_write(&EXT4_I(inode)->i_data_sem);
+ ext4_discard_preallocations(inode);
+
+ err = ext4_es_remove_extent(inode, first_block,
+ stop_block - first_block);
+ err = ext4_free_hole_blocks(handle, inode, first_block, stop_block);
+
+ ext4_discard_preallocations(inode);
+
+ if (IS_SYNC(inode))
+ ext4_handle_sync(handle);
+
+ up_write(&EXT4_I(inode)->i_data_sem);
+
+out:
+ inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+ ext4_mark_inode_dirty(handle, inode);
+ ext4_journal_stop(handle);
+
+out_mutex:
+ mutex_unlock(&inode->i_mutex);
+
+ return err;
+}
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 8106dca9545..c0fd1a123f7 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -545,7 +545,7 @@ static int ext4_convert_inline_data_to_extent(struct address_space *mapping,
return ret;
retry:
- handle = ext4_journal_start(inode, needed_blocks);
+ handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
handle = NULL;
@@ -657,7 +657,7 @@ int ext4_try_to_write_inline_data(struct address_space *mapping,
* The possible write could happen in the inode,
* so try to reserve the space in inode first.
*/
- handle = ext4_journal_start(inode, 1);
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
handle = NULL;
@@ -853,7 +853,7 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping,
if (ret)
return ret;
- handle = ext4_journal_start(inode, 1);
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
handle = NULL;
@@ -1188,7 +1188,7 @@ static int ext4_convert_inline_data_nolock(handle_t *handle,
data_bh = sb_getblk(inode->i_sb, map.m_pblk);
if (!data_bh) {
- error = -EIO;
+ error = -ENOMEM;
goto out_restore;
}
@@ -1770,7 +1770,7 @@ void ext4_inline_data_truncate(struct inode *inode, int *has_inline)
needed_blocks = ext4_writepage_trans_blocks(inode);
- handle = ext4_journal_start(inode, needed_blocks);
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, needed_blocks);
if (IS_ERR(handle))
return;
@@ -1862,7 +1862,7 @@ int ext4_convert_inline_data(struct inode *inode)
if (error)
return error;
- handle = ext4_journal_start(inode, needed_blocks);
+ handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
if (IS_ERR(handle)) {
error = PTR_ERR(handle);
goto out_free;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 521bd4ab8ab..9c4f4b1c97f 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -132,10 +132,6 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode,
}
static void ext4_invalidatepage(struct page *page, unsigned long offset);
-static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create);
-static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
-static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
static int __ext4_journalled_writepage(struct page *page, unsigned int len);
static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
@@ -238,7 +234,8 @@ void ext4_evict_inode(struct inode *inode)
* protection against it
*/
sb_start_intwrite(inode->i_sb);
- handle = ext4_journal_start(inode, ext4_blocks_for_truncate(inode)+3);
+ handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE,
+ ext4_blocks_for_truncate(inode)+3);
if (IS_ERR(handle)) {
ext4_std_error(inode->i_sb, PTR_ERR(handle));
/*
@@ -346,7 +343,7 @@ void ext4_da_update_reserve_space(struct inode *inode,
spin_lock(&ei->i_block_reservation_lock);
trace_ext4_da_update_reserve_space(inode, used, quota_claim);
if (unlikely(used > ei->i_reserved_data_blocks)) {
- ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d "
+ ext4_warning(inode->i_sb, "%s: ino %lu, used %d "
"with only %d reserved data blocks",
__func__, inode->i_ino, used,
ei->i_reserved_data_blocks);
@@ -355,10 +352,12 @@ void ext4_da_update_reserve_space(struct inode *inode,
}
if (unlikely(ei->i_allocated_meta_blocks > ei->i_reserved_meta_blocks)) {
- ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, allocated %d "
- "with only %d reserved metadata blocks\n", __func__,
- inode->i_ino, ei->i_allocated_meta_blocks,
- ei->i_reserved_meta_blocks);
+ ext4_warning(inode->i_sb, "ino %lu, allocated %d "
+ "with only %d reserved metadata blocks "
+ "(releasing %d blocks with reserved %d data blocks)",
+ inode->i_ino, ei->i_allocated_meta_blocks,
+ ei->i_reserved_meta_blocks, used,
+ ei->i_reserved_data_blocks);
WARN_ON(1);
ei->i_allocated_meta_blocks = ei->i_reserved_meta_blocks;
}
@@ -508,12 +507,33 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
int ext4_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags)
{
+ struct extent_status es;
int retval;
map->m_flags = 0;
ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u,"
"logical block %lu\n", inode->i_ino, flags, map->m_len,
(unsigned long) map->m_lblk);
+
+ /* Lookup extent status tree firstly */
+ if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
+ if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
+ map->m_pblk = ext4_es_pblock(&es) +
+ map->m_lblk - es.es_lblk;
+ map->m_flags |= ext4_es_is_written(&es) ?
+ EXT4_MAP_MAPPED : EXT4_MAP_UNWRITTEN;
+ retval = es.es_len - (map->m_lblk - es.es_lblk);
+ if (retval > map->m_len)
+ retval = map->m_len;
+ map->m_len = retval;
+ } else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) {
+ retval = 0;
+ } else {
+ BUG_ON(1);
+ }
+ goto found;
+ }
+
/*
* Try to see if we can get the block without requesting a new
* file system block.
@@ -527,20 +547,27 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
retval = ext4_ind_map_blocks(handle, inode, map, flags &
EXT4_GET_BLOCKS_KEEP_SIZE);
}
+ if (retval > 0) {
+ int ret;
+ unsigned long long status;
+
+ status = map->m_flags & EXT4_MAP_UNWRITTEN ?
+ EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+ if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
+ ext4_find_delalloc_range(inode, map->m_lblk,
+ map->m_lblk + map->m_len - 1))
+ status |= EXTENT_STATUS_DELAYED;
+ ret = ext4_es_insert_extent(inode, map->m_lblk,
+ map->m_len, map->m_pblk, status);
+ if (ret < 0)
+ retval = ret;
+ }
if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
up_read((&EXT4_I(inode)->i_data_sem));
+found:
if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
- int ret;
- if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
- /* delayed alloc may be allocated by fallocate and
- * coverted to initialized by directIO.
- * we need to handle delayed extent here.
- */
- down_write((&EXT4_I(inode)->i_data_sem));
- goto delayed_mapped;
- }
- ret = check_block_validity(inode, map);
+ int ret = check_block_validity(inode, map);
if (ret != 0)
return ret;
}
@@ -560,16 +587,10 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
return retval;
/*
- * When we call get_blocks without the create flag, the
- * BH_Unwritten flag could have gotten set if the blocks
- * requested were part of a uninitialized extent. We need to
- * clear this flag now that we are committed to convert all or
- * part of the uninitialized extent to be an initialized
- * extent. This is because we need to avoid the combination
- * of BH_Unwritten and BH_Mapped flags being simultaneously
- * set on the buffer_head.
+ * Here we clear m_flags because after allocating an new extent,
+ * it will be set again.
*/
- map->m_flags &= ~EXT4_MAP_UNWRITTEN;
+ map->m_flags &= ~EXT4_MAP_FLAGS;
/*
* New blocks allocate and/or writing to uninitialized extent
@@ -615,18 +636,23 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
ext4_da_update_reserve_space(inode, retval, 1);
}
- if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
+ if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
- if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
- int ret;
-delayed_mapped:
- /* delayed allocation blocks has been allocated */
- ret = ext4_es_remove_extent(inode, map->m_lblk,
- map->m_len);
- if (ret < 0)
- retval = ret;
- }
+ if (retval > 0) {
+ int ret;
+ unsigned long long status;
+
+ status = map->m_flags & EXT4_MAP_UNWRITTEN ?
+ EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+ if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
+ ext4_find_delalloc_range(inode, map->m_lblk,
+ map->m_lblk + map->m_len - 1))
+ status |= EXTENT_STATUS_DELAYED;
+ ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+ map->m_pblk, status);
+ if (ret < 0)
+ retval = ret;
}
up_write((&EXT4_I(inode)->i_data_sem));
@@ -660,7 +686,8 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
if (map.m_len > DIO_MAX_BLOCKS)
map.m_len = DIO_MAX_BLOCKS;
dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
- handle = ext4_journal_start(inode, dio_credits);
+ handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
+ dio_credits);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
return ret;
@@ -707,14 +734,16 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
/* ensure we send some value back into *errp */
*errp = 0;
+ if (create && err == 0)
+ err = -ENOSPC; /* should never happen */
if (err < 0)
*errp = err;
if (err <= 0)
return NULL;
bh = sb_getblk(inode->i_sb, map.m_pblk);
- if (!bh) {
- *errp = -EIO;
+ if (unlikely(!bh)) {
+ *errp = -ENOMEM;
return NULL;
}
if (map.m_flags & EXT4_MAP_NEW) {
@@ -808,11 +837,10 @@ int ext4_walk_page_buffers(handle_t *handle,
* and the commit_write(). So doing the jbd2_journal_start at the start of
* prepare_write() is the right place.
*
- * Also, this function can nest inside ext4_writepage() ->
- * block_write_full_page(). In that case, we *know* that ext4_writepage()
- * has generated enough buffer credits to do the whole page. So we won't
- * block on the journal in that case, which is good, because the caller may
- * be PF_MEMALLOC.
+ * Also, this function can nest inside ext4_writepage(). In that case, we
+ * *know* that ext4_writepage() has generated enough buffer credits to do the
+ * whole page. So we won't block on the journal in that case, which is good,
+ * because the caller may be PF_MEMALLOC.
*
* By accident, ext4 can be reentered when a transaction is open via
* quota file writes. If we were to commit the transaction while thus
@@ -878,32 +906,40 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
ret = ext4_try_to_write_inline_data(mapping, inode, pos, len,
flags, pagep);
if (ret < 0)
- goto out;
- if (ret == 1) {
- ret = 0;
- goto out;
- }
+ return ret;
+ if (ret == 1)
+ return 0;
}
-retry:
- handle = ext4_journal_start(inode, needed_blocks);
+ /*
+ * grab_cache_page_write_begin() can take a long time if the
+ * system is thrashing due to memory pressure, or if the page
+ * is being written back. So grab it first before we start
+ * the transaction handle. This also allows us to allocate
+ * the page (if needed) without using GFP_NOFS.
+ */
+retry_grab:
+ page = grab_cache_page_write_begin(mapping, index, flags);
+ if (!page)
+ return -ENOMEM;
+ unlock_page(page);
+
+retry_journal:
+ handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out;
+ page_cache_release(page);
+ return PTR_ERR(handle);
}
- /* We cannot recurse into the filesystem as the transaction is already
- * started */
- flags |= AOP_FLAG_NOFS;
-
- page = grab_cache_page_write_begin(mapping, index, flags);
- if (!page) {
+ lock_page(page);
+ if (page->mapping != mapping) {
+ /* The page got truncated from under us */
+ unlock_page(page);
+ page_cache_release(page);
ext4_journal_stop(handle);
- ret = -ENOMEM;
- goto out;
+ goto retry_grab;
}
-
- *pagep = page;
+ wait_on_page_writeback(page);
if (ext4_should_dioread_nolock(inode))
ret = __block_write_begin(page, pos, len, ext4_get_block_write);
@@ -918,7 +954,6 @@ retry:
if (ret) {
unlock_page(page);
- page_cache_release(page);
/*
* __block_write_begin may have instantiated a few blocks
* outside i_size. Trim these off again. Don't need
@@ -942,11 +977,14 @@ retry:
if (inode->i_nlink)
ext4_orphan_del(NULL, inode);
}
- }
- if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
- goto retry;
-out:
+ if (ret == -ENOSPC &&
+ ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry_journal;
+ page_cache_release(page);
+ return ret;
+ }
+ *pagep = page;
return ret;
}
@@ -1256,7 +1294,7 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
* function is called from invalidate page, it's
* harmless to return without any action.
*/
- ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: "
+ ext4_warning(inode->i_sb, "ext4_da_release_space: "
"ino %lu, to_free %d with only %d reserved "
"data blocks", inode->i_ino, to_free,
ei->i_reserved_data_blocks);
@@ -1357,7 +1395,6 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
loff_t size = i_size_read(inode);
unsigned int len, block_start;
struct buffer_head *bh, *page_bufs = NULL;
- int journal_data = ext4_should_journal_data(inode);
sector_t pblock = 0, cur_logical = 0;
struct ext4_io_submit io_submit;
@@ -1378,7 +1415,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
if (nr_pages == 0)
break;
for (i = 0; i < nr_pages; i++) {
- int commit_write = 0, skip_page = 0;
+ int skip_page = 0;
struct page *page = pvec.pages[i];
index = page->index;
@@ -1400,27 +1437,9 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
BUG_ON(!PageLocked(page));
BUG_ON(PageWriteback(page));
- /*
- * If the page does not have buffers (for
- * whatever reason), try to create them using
- * __block_write_begin. If this fails,
- * skip the page and move on.
- */
- if (!page_has_buffers(page)) {
- if (__block_write_begin(page, 0, len,
- noalloc_get_block_write)) {
- skip_page:
- unlock_page(page);
- continue;
- }
- commit_write = 1;
- }
-
bh = page_bufs = page_buffers(page);
block_start = 0;
do {
- if (!bh)
- goto skip_page;
if (map && (cur_logical >= map->m_lblk) &&
(cur_logical <= (map->m_lblk +
(map->m_len - 1)))) {
@@ -1448,33 +1467,14 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
pblock++;
} while (bh != page_bufs);
- if (skip_page)
- goto skip_page;
-
- if (commit_write)
- /* mark the buffer_heads as dirty & uptodate */
- block_commit_write(page, 0, len);
+ if (skip_page) {
+ unlock_page(page);
+ continue;
+ }
clear_page_dirty_for_io(page);
- /*
- * Delalloc doesn't support data journalling,
- * but eventually maybe we'll lift this
- * restriction.
- */
- if (unlikely(journal_data && PageChecked(page)))
- err = __ext4_journalled_writepage(page, len);
- else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT))
- err = ext4_bio_write_page(&io_submit, page,
- len, mpd->wbc);
- else if (buffer_uninit(page_bufs)) {
- ext4_set_bh_endio(page_bufs, inode);
- err = block_write_full_page_endio(page,
- noalloc_get_block_write,
- mpd->wbc, ext4_end_io_buffer_write);
- } else
- err = block_write_full_page(page,
- noalloc_get_block_write, mpd->wbc);
-
+ err = ext4_bio_write_page(&io_submit, page, len,
+ mpd->wbc);
if (!err)
mpd->pages_written++;
/*
@@ -1640,7 +1640,7 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
(unsigned long long) next,
mpd->b_size >> mpd->inode->i_blkbits, err);
ext4_msg(sb, KERN_CRIT,
- "This should not happen!! Data will be lost\n");
+ "This should not happen!! Data will be lost");
if (err == -ENOSPC)
ext4_print_free_blocks(mpd->inode);
}
@@ -1690,16 +1690,16 @@ submit_io:
*
* @mpd->lbh - extent of blocks
* @logical - logical number of the block in the file
- * @bh - bh of the block (used to access block's state)
+ * @b_state - b_state of the buffer head added
*
* the function is used to collect contig. blocks in same state
*/
-static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
- sector_t logical, size_t b_size,
+static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, sector_t logical,
unsigned long b_state)
{
sector_t next;
- int nrblocks = mpd->b_size >> mpd->inode->i_blkbits;
+ int blkbits = mpd->inode->i_blkbits;
+ int nrblocks = mpd->b_size >> blkbits;
/*
* XXX Don't go larger than mballoc is willing to allocate
@@ -1707,11 +1707,11 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
* mpage_da_submit_io() into this function and then call
* ext4_map_blocks() multiple times in a loop
*/
- if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize)
+ if (nrblocks >= (8*1024*1024 >> blkbits))
goto flush_it;
- /* check if thereserved journal credits might overflow */
- if (!(ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS))) {
+ /* check if the reserved journal credits might overflow */
+ if (!ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS)) {
if (nrblocks >= EXT4_MAX_TRANS_DATA) {
/*
* With non-extent format we are limited by the journal
@@ -1720,16 +1720,6 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
* nrblocks. So limit nrblocks.
*/
goto flush_it;
- } else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) >
- EXT4_MAX_TRANS_DATA) {
- /*
- * Adding the new buffer_head would make it cross the
- * allowed limit for which we have journal credit
- * reserved. So limit the new bh->b_size
- */
- b_size = (EXT4_MAX_TRANS_DATA - nrblocks) <<
- mpd->inode->i_blkbits;
- /* we will do mpage_da_submit_io in the next loop */
}
}
/*
@@ -1737,7 +1727,7 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
*/
if (mpd->b_size == 0) {
mpd->b_blocknr = logical;
- mpd->b_size = b_size;
+ mpd->b_size = 1 << blkbits;
mpd->b_state = b_state & BH_FLAGS;
return;
}
@@ -1747,7 +1737,7 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
* Can we merge the block to our big extent?
*/
if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) {
- mpd->b_size += b_size;
+ mpd->b_size += 1 << blkbits;
return;
}
@@ -1775,6 +1765,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
struct ext4_map_blocks *map,
struct buffer_head *bh)
{
+ struct extent_status es;
int retval;
sector_t invalid_block = ~((sector_t) 0xffff);
@@ -1785,6 +1776,42 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
ext_debug("ext4_da_map_blocks(): inode %lu, max_blocks %u,"
"logical block %lu\n", inode->i_ino, map->m_len,
(unsigned long) map->m_lblk);
+
+ /* Lookup extent status tree firstly */
+ if (ext4_es_lookup_extent(inode, iblock, &es)) {
+
+ if (ext4_es_is_hole(&es)) {
+ retval = 0;
+ down_read((&EXT4_I(inode)->i_data_sem));
+ goto add_delayed;
+ }
+
+ /*
+ * Delayed extent could be allocated by fallocate.
+ * So we need to check it.
+ */
+ if (ext4_es_is_delayed(&es) && !ext4_es_is_unwritten(&es)) {
+ map_bh(bh, inode->i_sb, invalid_block);
+ set_buffer_new(bh);
+ set_buffer_delay(bh);
+ return 0;
+ }
+
+ map->m_pblk = ext4_es_pblock(&es) + iblock - es.es_lblk;
+ retval = es.es_len - (iblock - es.es_lblk);
+ if (retval > map->m_len)
+ retval = map->m_len;
+ map->m_len = retval;
+ if (ext4_es_is_written(&es))
+ map->m_flags |= EXT4_MAP_MAPPED;
+ else if (ext4_es_is_unwritten(&es))
+ map->m_flags |= EXT4_MAP_UNWRITTEN;
+ else
+ BUG_ON(1);
+
+ return retval;
+ }
+
/*
* Try to see if we can get the block without requesting a new
* file system block.
@@ -1803,11 +1830,15 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
map->m_flags |= EXT4_MAP_FROM_CLUSTER;
retval = 0;
} else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
- retval = ext4_ext_map_blocks(NULL, inode, map, 0);
+ retval = ext4_ext_map_blocks(NULL, inode, map,
+ EXT4_GET_BLOCKS_NO_PUT_HOLE);
else
- retval = ext4_ind_map_blocks(NULL, inode, map, 0);
+ retval = ext4_ind_map_blocks(NULL, inode, map,
+ EXT4_GET_BLOCKS_NO_PUT_HOLE);
+add_delayed:
if (retval == 0) {
+ int ret;
/*
* XXX: __block_prepare_write() unmaps passed block,
* is it OK?
@@ -1815,15 +1846,20 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
/* If the block was allocated from previously allocated cluster,
* then we dont need to reserve it again. */
if (!(map->m_flags & EXT4_MAP_FROM_CLUSTER)) {
- retval = ext4_da_reserve_space(inode, iblock);
- if (retval)
+ ret = ext4_da_reserve_space(inode, iblock);
+ if (ret) {
/* not enough space to reserve */
+ retval = ret;
goto out_unlock;
+ }
}
- retval = ext4_es_insert_extent(inode, map->m_lblk, map->m_len);
- if (retval)
+ ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+ ~0, EXTENT_STATUS_DELAYED);
+ if (ret) {
+ retval = ret;
goto out_unlock;
+ }
/* Clear EXT4_MAP_FROM_CLUSTER flag since its purpose is served
* and it should not appear on the bh->b_state.
@@ -1833,6 +1869,16 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
map_bh(bh, inode->i_sb, invalid_block);
set_buffer_new(bh);
set_buffer_delay(bh);
+ } else if (retval > 0) {
+ int ret;
+ unsigned long long status;
+
+ status = map->m_flags & EXT4_MAP_UNWRITTEN ?
+ EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+ ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+ map->m_pblk, status);
+ if (ret != 0)
+ retval = ret;
}
out_unlock:
@@ -1890,27 +1936,6 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
return 0;
}
-/*
- * This function is used as a standard get_block_t calback function
- * when there is no desire to allocate any blocks. It is used as a
- * callback function for block_write_begin() and block_write_full_page().
- * These functions should only try to map a single block at a time.
- *
- * Since this function doesn't do block allocations even if the caller
- * requests it by passing in create=1, it is critically important that
- * any caller checks to make sure that any buffer heads are returned
- * by this function are either all already mapped or marked for
- * delayed allocation before calling block_write_full_page(). Otherwise,
- * b_blocknr could be left unitialized, and the page write functions will
- * be taken by surprise.
- */
-static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create)
-{
- BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
- return _ext4_get_block(inode, iblock, bh_result, 0);
-}
-
static int bget_one(handle_t *handle, struct buffer_head *bh)
{
get_bh(bh);
@@ -1955,7 +1980,8 @@ static int __ext4_journalled_writepage(struct page *page,
* references to buffers so we are safe */
unlock_page(page);
- handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
+ handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
+ ext4_writepage_trans_blocks(inode));
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
goto out;
@@ -2035,11 +2061,12 @@ out:
static int ext4_writepage(struct page *page,
struct writeback_control *wbc)
{
- int ret = 0, commit_write = 0;
+ int ret = 0;
loff_t size;
unsigned int len;
struct buffer_head *page_bufs = NULL;
struct inode *inode = page->mapping->host;
+ struct ext4_io_submit io_submit;
trace_ext4_writepage(page);
size = i_size_read(inode);
@@ -2048,39 +2075,29 @@ static int ext4_writepage(struct page *page,
else
len = PAGE_CACHE_SIZE;
+ page_bufs = page_buffers(page);
/*
- * If the page does not have buffers (for whatever reason),
- * try to create them using __block_write_begin. If this
- * fails, redirty the page and move on.
+ * We cannot do block allocation or other extent handling in this
+ * function. If there are buffers needing that, we have to redirty
+ * the page. But we may reach here when we do a journal commit via
+ * journal_submit_inode_data_buffers() and in that case we must write
+ * allocated buffers to achieve data=ordered mode guarantees.
*/
- if (!page_has_buffers(page)) {
- if (__block_write_begin(page, 0, len,
- noalloc_get_block_write)) {
- redirty_page:
- redirty_page_for_writepage(wbc, page);
+ if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL,
+ ext4_bh_delay_or_unwritten)) {
+ redirty_page_for_writepage(wbc, page);
+ if (current->flags & PF_MEMALLOC) {
+ /*
+ * For memory cleaning there's no point in writing only
+ * some buffers. So just bail out. Warn if we came here
+ * from direct reclaim.
+ */
+ WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD))
+ == PF_MEMALLOC);
unlock_page(page);
return 0;
}
- commit_write = 1;
- }
- page_bufs = page_buffers(page);
- if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL,
- ext4_bh_delay_or_unwritten)) {
- /*
- * We don't want to do block allocation, so redirty
- * the page and return. We may reach here when we do
- * a journal commit via journal_submit_inode_data_buffers.
- * We can also reach here via shrink_page_list but it
- * should never be for direct reclaim so warn if that
- * happens
- */
- WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
- PF_MEMALLOC);
- goto redirty_page;
}
- if (commit_write)
- /* now mark the buffer_heads as dirty and uptodate */
- block_commit_write(page, 0, len);
if (PageChecked(page) && ext4_should_journal_data(inode))
/*
@@ -2089,14 +2106,9 @@ static int ext4_writepage(struct page *page,
*/
return __ext4_journalled_writepage(page, len);
- if (buffer_uninit(page_bufs)) {
- ext4_set_bh_endio(page_bufs, inode);
- ret = block_write_full_page_endio(page, noalloc_get_block_write,
- wbc, ext4_end_io_buffer_write);
- } else
- ret = block_write_full_page(page, noalloc_get_block_write,
- wbc);
-
+ memset(&io_submit, 0, sizeof(io_submit));
+ ret = ext4_bio_write_page(&io_submit, page, len, wbc);
+ ext4_io_submit(&io_submit);
return ret;
}
@@ -2228,51 +2240,38 @@ static int write_cache_pages_da(handle_t *handle,
logical = (sector_t) page->index <<
(PAGE_CACHE_SHIFT - inode->i_blkbits);
- if (!page_has_buffers(page)) {
- mpage_add_bh_to_extent(mpd, logical,
- PAGE_CACHE_SIZE,
- (1 << BH_Dirty) | (1 << BH_Uptodate));
- if (mpd->io_done)
- goto ret_extent_tail;
- } else {
+ /* Add all dirty buffers to mpd */
+ head = page_buffers(page);
+ bh = head;
+ do {
+ BUG_ON(buffer_locked(bh));
/*
- * Page with regular buffer heads,
- * just add all dirty ones
+ * We need to try to allocate unmapped blocks
+ * in the same page. Otherwise we won't make
+ * progress with the page in ext4_writepage
*/
- head = page_buffers(page);
- bh = head;
- do {
- BUG_ON(buffer_locked(bh));
+ if (ext4_bh_delay_or_unwritten(NULL, bh)) {
+ mpage_add_bh_to_extent(mpd, logical,
+ bh->b_state);
+ if (mpd->io_done)
+ goto ret_extent_tail;
+ } else if (buffer_dirty(bh) &&
+ buffer_mapped(bh)) {
/*
- * We need to try to allocate
- * unmapped blocks in the same page.
- * Otherwise we won't make progress
- * with the page in ext4_writepage
+ * mapped dirty buffer. We need to
+ * update the b_state because we look
+ * at b_state in mpage_da_map_blocks.
+ * We don't update b_size because if we
+ * find an unmapped buffer_head later
+ * we need to use the b_state flag of
+ * that buffer_head.
*/
- if (ext4_bh_delay_or_unwritten(NULL, bh)) {
- mpage_add_bh_to_extent(mpd, logical,
- bh->b_size,
- bh->b_state);
- if (mpd->io_done)
- goto ret_extent_tail;
- } else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
- /*
- * mapped dirty buffer. We need
- * to update the b_state
- * because we look at b_state
- * in mpage_da_map_blocks. We
- * don't update b_size because
- * if we find an unmapped
- * buffer_head later we need to
- * use the b_state flag of that
- * buffer_head.
- */
- if (mpd->b_size == 0)
- mpd->b_state = bh->b_state & BH_FLAGS;
- }
- logical++;
- } while ((bh = bh->b_this_page) != head);
- }
+ if (mpd->b_size == 0)
+ mpd->b_state =
+ bh->b_state & BH_FLAGS;
+ }
+ logical++;
+ } while ((bh = bh->b_this_page) != head);
if (nr_to_write > 0) {
nr_to_write--;
@@ -2413,7 +2412,8 @@ retry:
needed_blocks = ext4_da_writepages_trans_blocks(inode);
/* start a new transaction*/
- handle = ext4_journal_start(inode, needed_blocks);
+ handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
+ needed_blocks);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
@@ -2555,42 +2555,52 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
pos, len, flags,
pagep, fsdata);
if (ret < 0)
- goto out;
- if (ret == 1) {
- ret = 0;
- goto out;
- }
+ return ret;
+ if (ret == 1)
+ return 0;
}
-retry:
+ /*
+ * grab_cache_page_write_begin() can take a long time if the
+ * system is thrashing due to memory pressure, or if the page
+ * is being written back. So grab it first before we start
+ * the transaction handle. This also allows us to allocate
+ * the page (if needed) without using GFP_NOFS.
+ */
+retry_grab:
+ page = grab_cache_page_write_begin(mapping, index, flags);
+ if (!page)
+ return -ENOMEM;
+ unlock_page(page);
+
/*
* With delayed allocation, we don't log the i_disksize update
* if there is delayed block allocation. But we still need
* to journalling the i_disksize update if writes to the end
* of file which has an already mapped buffer.
*/
- handle = ext4_journal_start(inode, 1);
+retry_journal:
+ handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, 1);
if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out;
+ page_cache_release(page);
+ return PTR_ERR(handle);
}
- /* We cannot recurse into the filesystem as the transaction is already
- * started */
- flags |= AOP_FLAG_NOFS;
- page = grab_cache_page_write_begin(mapping, index, flags);
- if (!page) {
+ lock_page(page);
+ if (page->mapping != mapping) {
+ /* The page got truncated from under us */
+ unlock_page(page);
+ page_cache_release(page);
ext4_journal_stop(handle);
- ret = -ENOMEM;
- goto out;
+ goto retry_grab;
}
- *pagep = page;
+ /* In case writeback began while the page was unlocked */
+ wait_on_page_writeback(page);
ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep);
if (ret < 0) {
unlock_page(page);
ext4_journal_stop(handle);
- page_cache_release(page);
/*
* block_write_begin may have instantiated a few blocks
* outside i_size. Trim these off again. Don't need
@@ -2598,11 +2608,16 @@ retry:
*/
if (pos + len > inode->i_size)
ext4_truncate_failed_write(inode);
+
+ if (ret == -ENOSPC &&
+ ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry_journal;
+
+ page_cache_release(page);
+ return ret;
}
- if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
- goto retry;
-out:
+ *pagep = page;
return ret;
}
@@ -2858,36 +2873,10 @@ ext4_readpages(struct file *file, struct address_space *mapping,
return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
}
-static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset)
-{
- struct buffer_head *head, *bh;
- unsigned int curr_off = 0;
-
- if (!page_has_buffers(page))
- return;
- head = bh = page_buffers(page);
- do {
- if (offset <= curr_off && test_clear_buffer_uninit(bh)
- && bh->b_private) {
- ext4_free_io_end(bh->b_private);
- bh->b_private = NULL;
- bh->b_end_io = NULL;
- }
- curr_off = curr_off + bh->b_size;
- bh = bh->b_this_page;
- } while (bh != head);
-}
-
static void ext4_invalidatepage(struct page *page, unsigned long offset)
{
trace_ext4_invalidatepage(page, offset);
- /*
- * free any io_end structure allocated for buffers to be discarded
- */
- if (ext4_should_dioread_nolock(page->mapping->host))
- ext4_invalidatepage_free_endio(page, offset);
-
/* No journalling happens on data buffers when this function is used */
WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page)));
@@ -2977,9 +2966,9 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
ext4_free_io_end(io_end);
out:
+ inode_dio_done(inode);
if (is_async)
aio_complete(iocb, ret, 0);
- inode_dio_done(inode);
return;
}
@@ -2993,65 +2982,6 @@ out:
ext4_add_complete_io(io_end);
}
-static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
-{
- ext4_io_end_t *io_end = bh->b_private;
- struct inode *inode;
-
- if (!test_clear_buffer_uninit(bh) || !io_end)
- goto out;
-
- if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) {
- ext4_msg(io_end->inode->i_sb, KERN_INFO,
- "sb umounted, discard end_io request for inode %lu",
- io_end->inode->i_ino);
- ext4_free_io_end(io_end);
- goto out;
- }
-
- /*
- * It may be over-defensive here to check EXT4_IO_END_UNWRITTEN now,
- * but being more careful is always safe for the future change.
- */
- inode = io_end->inode;
- ext4_set_io_unwritten_flag(inode, io_end);
- ext4_add_complete_io(io_end);
-out:
- bh->b_private = NULL;
- bh->b_end_io = NULL;
- clear_buffer_uninit(bh);
- end_buffer_async_write(bh, uptodate);
-}
-
-static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode)
-{
- ext4_io_end_t *io_end;
- struct page *page = bh->b_page;
- loff_t offset = (sector_t)page->index << PAGE_CACHE_SHIFT;
- size_t size = bh->b_size;
-
-retry:
- io_end = ext4_init_io_end(inode, GFP_ATOMIC);
- if (!io_end) {
- pr_warn_ratelimited("%s: allocation fail\n", __func__);
- schedule();
- goto retry;
- }
- io_end->offset = offset;
- io_end->size = size;
- /*
- * We need to hold a reference to the page to make sure it
- * doesn't get evicted before ext4_end_io_work() has a chance
- * to convert the extent from written to unwritten.
- */
- io_end->page = page;
- get_page(io_end->page);
-
- bh->b_private = io_end;
- bh->b_end_io = ext4_end_io_buffer_write;
- return 0;
-}
-
/*
* For ext4 extent files, ext4 will do direct-io write to holes,
* preallocated extents, and those write extend the file, no need to
@@ -3557,16 +3487,16 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
if (!S_ISREG(inode->i_mode))
return -EOPNOTSUPP;
- if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
- /* TODO: Add support for non extent hole punching */
- return -EOPNOTSUPP;
- }
+ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ return ext4_ind_punch_hole(file, offset, length);
if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) {
/* TODO: Add support for bigalloc file systems */
return -EOPNOTSUPP;
}
+ trace_ext4_punch_hole(inode, offset, length);
+
return ext4_ext_punch_hole(file, offset, length);
}
@@ -3660,11 +3590,8 @@ static int __ext4_get_inode_loc(struct inode *inode,
iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
bh = sb_getblk(sb, block);
- if (!bh) {
- EXT4_ERROR_INODE_BLOCK(inode, block,
- "unable to read itable block");
- return -EIO;
- }
+ if (unlikely(!bh))
+ return -ENOMEM;
if (!buffer_uptodate(bh)) {
lock_buffer(bh);
@@ -3696,7 +3623,7 @@ static int __ext4_get_inode_loc(struct inode *inode,
/* Is the inode bitmap in cache? */
bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp));
- if (!bitmap_bh)
+ if (unlikely(!bitmap_bh))
goto make_io;
/*
@@ -4404,8 +4331,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
/* (user+group)*(old+new) structure, inode write (sb,
* inode block, ? - but truncate inode update has it) */
- handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+
- EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3);
+ handle = ext4_journal_start(inode, EXT4_HT_QUOTA,
+ (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb) +
+ EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)) + 3);
if (IS_ERR(handle)) {
error = PTR_ERR(handle);
goto err_out;
@@ -4440,7 +4368,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
(attr->ia_size < inode->i_size)) {
handle_t *handle;
- handle = ext4_journal_start(inode, 3);
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 3);
if (IS_ERR(handle)) {
error = PTR_ERR(handle);
goto err_out;
@@ -4460,7 +4388,8 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
attr->ia_size);
if (error) {
/* Do as much error cleanup as possible */
- handle = ext4_journal_start(inode, 3);
+ handle = ext4_journal_start(inode,
+ EXT4_HT_INODE, 3);
if (IS_ERR(handle)) {
ext4_orphan_del(NULL, inode);
goto err_out;
@@ -4801,7 +4730,7 @@ void ext4_dirty_inode(struct inode *inode, int flags)
{
handle_t *handle;
- handle = ext4_journal_start(inode, 2);
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
if (IS_ERR(handle))
goto out;
@@ -4902,7 +4831,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
/* Finally we can mark the inode as dirty. */
- handle = ext4_journal_start(inode, 1);
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -4968,7 +4897,7 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
0, len, NULL,
ext4_bh_unmapped)) {
/* Wait so that we don't change page under IO */
- wait_on_page_writeback(page);
+ wait_for_stable_page(page);
ret = VM_FAULT_LOCKED;
goto out;
}
@@ -4980,7 +4909,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
else
get_block = ext4_get_block;
retry_alloc:
- handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
+ handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
+ ext4_writepage_trans_blocks(inode));
if (IS_ERR(handle)) {
ret = VM_FAULT_SIGBUS;
goto out;
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index c2f8e060f63..721f4d33e14 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -104,7 +104,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
} else if (oldflags & EXT4_EOFBLOCKS_FL)
ext4_truncate(inode);
- handle = ext4_journal_start(inode, 1);
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
if (IS_ERR(handle)) {
err = PTR_ERR(handle);
goto flags_out;
@@ -173,7 +173,7 @@ flags_out:
}
mutex_lock(&inode->i_mutex);
- handle = ext4_journal_start(inode, 1);
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
if (IS_ERR(handle)) {
err = PTR_ERR(handle);
goto unlock_out;
@@ -313,6 +313,9 @@ mext_out:
if (err == 0)
err = err2;
mnt_drop_write_file(filp);
+ if (!err && ext4_has_group_desc_csum(sb) &&
+ test_opt(sb, INIT_INODE_TABLE))
+ err = ext4_register_li_request(sb, input.group);
group_add_out:
ext4_resize_end(sb);
return err;
@@ -358,6 +361,7 @@ group_add_out:
ext4_fsblk_t n_blocks_count;
struct super_block *sb = inode->i_sb;
int err = 0, err2 = 0;
+ ext4_group_t o_group = EXT4_SB(sb)->s_groups_count;
if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
@@ -388,6 +392,11 @@ group_add_out:
if (err == 0)
err = err2;
mnt_drop_write_file(filp);
+ if (!err && (o_group > EXT4_SB(sb)->s_groups_count) &&
+ ext4_has_group_desc_csum(sb) &&
+ test_opt(sb, INIT_INODE_TABLE))
+ err = ext4_register_li_request(sb, o_group);
+
resizefs_out:
ext4_resize_end(sb);
return err;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 1bf6fe785c4..6540ebe058e 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -23,11 +23,18 @@
#include "ext4_jbd2.h"
#include "mballoc.h"
-#include <linux/debugfs.h>
#include <linux/log2.h>
+#include <linux/module.h>
#include <linux/slab.h>
#include <trace/events/ext4.h>
+#ifdef CONFIG_EXT4_DEBUG
+ushort ext4_mballoc_debug __read_mostly;
+
+module_param_named(mballoc_debug, ext4_mballoc_debug, ushort, 0644);
+MODULE_PARM_DESC(mballoc_debug, "Debugging level for ext4's mballoc");
+#endif
+
/*
* MUSTDO:
* - test ext4_ext_search_left() and ext4_ext_search_right()
@@ -1884,15 +1891,19 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
case 0:
BUG_ON(ac->ac_2order == 0);
- if (grp->bb_largest_free_order < ac->ac_2order)
- return 0;
-
/* Avoid using the first bg of a flexgroup for data files */
if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
(flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
((group % flex_size) == 0))
return 0;
+ if ((ac->ac_2order > ac->ac_sb->s_blocksize_bits+1) ||
+ (free / fragments) >= ac->ac_g_ex.fe_len)
+ return 1;
+
+ if (grp->bb_largest_free_order < ac->ac_2order)
+ return 0;
+
return 1;
case 1:
if ((free / fragments) >= ac->ac_g_ex.fe_len)
@@ -2007,7 +2018,7 @@ repeat:
}
ac->ac_groups_scanned++;
- if (cr == 0)
+ if (cr == 0 && ac->ac_2order < sb->s_blocksize_bits+2)
ext4_mb_simple_scan_group(ac, &e4b);
else if (cr == 1 && sbi->s_stripe &&
!(ac->ac_g_ex.fe_len % sbi->s_stripe))
@@ -2656,40 +2667,6 @@ static void ext4_free_data_callback(struct super_block *sb,
mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
}
-#ifdef CONFIG_EXT4_DEBUG
-u8 mb_enable_debug __read_mostly;
-
-static struct dentry *debugfs_dir;
-static struct dentry *debugfs_debug;
-
-static void __init ext4_create_debugfs_entry(void)
-{
- debugfs_dir = debugfs_create_dir("ext4", NULL);
- if (debugfs_dir)
- debugfs_debug = debugfs_create_u8("mballoc-debug",
- S_IRUGO | S_IWUSR,
- debugfs_dir,
- &mb_enable_debug);
-}
-
-static void ext4_remove_debugfs_entry(void)
-{
- debugfs_remove(debugfs_debug);
- debugfs_remove(debugfs_dir);
-}
-
-#else
-
-static void __init ext4_create_debugfs_entry(void)
-{
-}
-
-static void ext4_remove_debugfs_entry(void)
-{
-}
-
-#endif
-
int __init ext4_init_mballoc(void)
{
ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space,
@@ -2711,7 +2688,6 @@ int __init ext4_init_mballoc(void)
kmem_cache_destroy(ext4_ac_cachep);
return -ENOMEM;
}
- ext4_create_debugfs_entry();
return 0;
}
@@ -2726,7 +2702,6 @@ void ext4_exit_mballoc(void)
kmem_cache_destroy(ext4_ac_cachep);
kmem_cache_destroy(ext4_free_data_cachep);
ext4_groupinfo_destroy_slabs();
- ext4_remove_debugfs_entry();
}
@@ -3872,7 +3847,7 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
struct super_block *sb = ac->ac_sb;
ext4_group_t ngroups, i;
- if (!mb_enable_debug ||
+ if (!ext4_mballoc_debug ||
(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED))
return;
@@ -4005,8 +3980,8 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
len = ar->len;
/* just a dirty hack to filter too big requests */
- if (len >= EXT4_CLUSTERS_PER_GROUP(sb) - 10)
- len = EXT4_CLUSTERS_PER_GROUP(sb) - 10;
+ if (len >= EXT4_CLUSTERS_PER_GROUP(sb))
+ len = EXT4_CLUSTERS_PER_GROUP(sb);
/* start searching from the goal */
goal = ar->goal;
@@ -4136,7 +4111,7 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
/* The max size of hash table is PREALLOC_TB_SIZE */
order = PREALLOC_TB_SIZE - 1;
/* Add the prealloc space to lg */
- rcu_read_lock();
+ spin_lock(&lg->lg_prealloc_lock);
list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order],
pa_inode_list) {
spin_lock(&tmp_pa->pa_lock);
@@ -4160,12 +4135,12 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
if (!added)
list_add_tail_rcu(&pa->pa_inode_list,
&lg->lg_prealloc_list[order]);
- rcu_read_unlock();
+ spin_unlock(&lg->lg_prealloc_lock);
/* Now trim the list to be not more than 8 elements */
if (lg_prealloc_count > 8) {
ext4_mb_discard_lg_preallocations(sb, lg,
- order, lg_prealloc_count);
+ order, lg_prealloc_count);
return;
}
return ;
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 3ccd889ba95..08481ee84cd 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -37,11 +37,11 @@
/*
*/
#ifdef CONFIG_EXT4_DEBUG
-extern u8 mb_enable_debug;
+extern ushort ext4_mballoc_debug;
#define mb_debug(n, fmt, a...) \
do { \
- if ((n) <= mb_enable_debug) { \
+ if ((n) <= ext4_mballoc_debug) { \
printk(KERN_DEBUG "(%s, %d): %s: ", \
__FILE__, __LINE__, __func__); \
printk(fmt, ## a); \
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index db8226d595f..480acf4a085 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -456,11 +456,14 @@ int ext4_ext_migrate(struct inode *inode)
*/
return retval;
- handle = ext4_journal_start(inode,
- EXT4_DATA_TRANS_BLOCKS(inode->i_sb) +
- EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
- EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)
- + 1);
+ /*
+ * Worst case we can touch the allocation bitmaps, a bgd
+ * block, and a block to link in the orphan list. We do need
+ * need to worry about credits for modifying the quota inode.
+ */
+ handle = ext4_journal_start(inode, EXT4_HT_MIGRATE,
+ 4 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb));
+
if (IS_ERR(handle)) {
retval = PTR_ERR(handle);
return retval;
@@ -507,7 +510,7 @@ int ext4_ext_migrate(struct inode *inode)
ext4_set_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
up_read((&EXT4_I(inode)->i_data_sem));
- handle = ext4_journal_start(inode, 1);
+ handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1);
if (IS_ERR(handle)) {
/*
* It is impossible to update on-disk structures without
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index fe7c63f4717..f9b551561d2 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -80,6 +80,8 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
* is not blocked in the elevator. */
if (!*bh)
*bh = sb_getblk(sb, mmp_block);
+ if (!*bh)
+ return -ENOMEM;
if (*bh) {
get_bh(*bh);
lock_buffer(*bh);
@@ -91,7 +93,7 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
*bh = NULL;
}
}
- if (!*bh) {
+ if (unlikely(!*bh)) {
ext4_warning(sb, "Error while reading MMP block %llu",
mmp_block);
return -EIO;
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 796f7ac0370..4e81d47aa8c 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -681,6 +681,8 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
depth = ext_depth(donor_inode);
dext = donor_path[depth].p_ext;
+ if (unlikely(!dext))
+ goto missing_donor_extent;
tmp_dext = *dext;
*err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
@@ -691,7 +693,8 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
/* Loop for the donor extents */
while (1) {
/* The extent for donor must be found. */
- if (!dext) {
+ if (unlikely(!dext)) {
+ missing_donor_extent:
EXT4_ERROR_INODE(donor_inode,
"The extent for donor must be found");
*err = -EIO;
@@ -761,9 +764,6 @@ out:
kfree(donor_path);
}
- ext4_ext_invalidate_cache(orig_inode);
- ext4_ext_invalidate_cache(donor_inode);
-
return replaced_count;
}
@@ -920,7 +920,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
again:
*err = 0;
jblocks = ext4_writepage_trans_blocks(orig_inode) * 2;
- handle = ext4_journal_start(orig_inode, jblocks);
+ handle = ext4_journal_start(orig_inode, EXT4_HT_MOVE_EXTENTS, jblocks);
if (IS_ERR(handle)) {
*err = PTR_ERR(handle);
return 0;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index bb97ad6905b..3825d6aa833 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -47,38 +47,111 @@
#define NAMEI_RA_CHUNKS 2
#define NAMEI_RA_BLOCKS 4
#define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
-#define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b))
static struct buffer_head *ext4_append(handle_t *handle,
struct inode *inode,
- ext4_lblk_t *block, int *err)
+ ext4_lblk_t *block)
{
struct buffer_head *bh;
+ int err = 0;
if (unlikely(EXT4_SB(inode->i_sb)->s_max_dir_size_kb &&
((inode->i_size >> 10) >=
- EXT4_SB(inode->i_sb)->s_max_dir_size_kb))) {
- *err = -ENOSPC;
- return NULL;
- }
+ EXT4_SB(inode->i_sb)->s_max_dir_size_kb)))
+ return ERR_PTR(-ENOSPC);
*block = inode->i_size >> inode->i_sb->s_blocksize_bits;
- bh = ext4_bread(handle, inode, *block, 1, err);
- if (bh) {
- inode->i_size += inode->i_sb->s_blocksize;
- EXT4_I(inode)->i_disksize = inode->i_size;
- *err = ext4_journal_get_write_access(handle, bh);
- if (*err) {
+ bh = ext4_bread(handle, inode, *block, 1, &err);
+ if (!bh)
+ return ERR_PTR(err);
+ inode->i_size += inode->i_sb->s_blocksize;
+ EXT4_I(inode)->i_disksize = inode->i_size;
+ err = ext4_journal_get_write_access(handle, bh);
+ if (err) {
+ brelse(bh);
+ ext4_std_error(inode->i_sb, err);
+ return ERR_PTR(err);
+ }
+ return bh;
+}
+
+static int ext4_dx_csum_verify(struct inode *inode,
+ struct ext4_dir_entry *dirent);
+
+typedef enum {
+ EITHER, INDEX, DIRENT
+} dirblock_type_t;
+
+#define ext4_read_dirblock(inode, block, type) \
+ __ext4_read_dirblock((inode), (block), (type), __LINE__)
+
+static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
+ ext4_lblk_t block,
+ dirblock_type_t type,
+ unsigned int line)
+{
+ struct buffer_head *bh;
+ struct ext4_dir_entry *dirent;
+ int err = 0, is_dx_block = 0;
+
+ bh = ext4_bread(NULL, inode, block, 0, &err);
+ if (!bh) {
+ if (err == 0) {
+ ext4_error_inode(inode, __func__, line, block,
+ "Directory hole found");
+ return ERR_PTR(-EIO);
+ }
+ __ext4_warning(inode->i_sb, __func__, line,
+ "error reading directory block "
+ "(ino %lu, block %lu)", inode->i_ino,
+ (unsigned long) block);
+ return ERR_PTR(err);
+ }
+ dirent = (struct ext4_dir_entry *) bh->b_data;
+ /* Determine whether or not we have an index block */
+ if (is_dx(inode)) {
+ if (block == 0)
+ is_dx_block = 1;
+ else if (ext4_rec_len_from_disk(dirent->rec_len,
+ inode->i_sb->s_blocksize) ==
+ inode->i_sb->s_blocksize)
+ is_dx_block = 1;
+ }
+ if (!is_dx_block && type == INDEX) {
+ ext4_error_inode(inode, __func__, line, block,
+ "directory leaf block found instead of index block");
+ return ERR_PTR(-EIO);
+ }
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) ||
+ buffer_verified(bh))
+ return bh;
+
+ /*
+ * An empty leaf block can get mistaken for a index block; for
+ * this reason, we can only check the index checksum when the
+ * caller is sure it should be an index block.
+ */
+ if (is_dx_block && type == INDEX) {
+ if (ext4_dx_csum_verify(inode, dirent))
+ set_buffer_verified(bh);
+ else {
+ ext4_error_inode(inode, __func__, line, block,
+ "Directory index failed checksum");
brelse(bh);
- bh = NULL;
+ return ERR_PTR(-EIO);
}
}
- if (!bh && !(*err)) {
- *err = -EIO;
- ext4_error(inode->i_sb,
- "Directory hole detected on inode %lu\n",
- inode->i_ino);
+ if (!is_dx_block) {
+ if (ext4_dirent_csum_verify(inode, dirent))
+ set_buffer_verified(bh);
+ else {
+ ext4_error_inode(inode, __func__, line, block,
+ "Directory block failed checksum");
+ brelse(bh);
+ return ERR_PTR(-EIO);
+ }
}
return bh;
}
@@ -604,9 +677,9 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
u32 hash;
frame->bh = NULL;
- if (!(bh = ext4_bread(NULL, dir, 0, 0, err))) {
- if (*err == 0)
- *err = ERR_BAD_DX_DIR;
+ bh = ext4_read_dirblock(dir, 0, INDEX);
+ if (IS_ERR(bh)) {
+ *err = PTR_ERR(bh);
goto fail;
}
root = (struct dx_root *) bh->b_data;
@@ -643,15 +716,6 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
goto fail;
}
- if (!buffer_verified(bh) &&
- !ext4_dx_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data)) {
- ext4_warning(dir->i_sb, "Root failed checksum");
- brelse(bh);
- *err = ERR_BAD_DX_DIR;
- goto fail;
- }
- set_buffer_verified(bh);
-
entries = (struct dx_entry *) (((char *)&root->info) +
root->info.info_length);
@@ -709,22 +773,12 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
frame->entries = entries;
frame->at = at;
if (!indirect--) return frame;
- if (!(bh = ext4_bread(NULL, dir, dx_get_block(at), 0, err))) {
- if (!(*err))
- *err = ERR_BAD_DX_DIR;
- goto fail2;
- }
- at = entries = ((struct dx_node *) bh->b_data)->entries;
-
- if (!buffer_verified(bh) &&
- !ext4_dx_csum_verify(dir,
- (struct ext4_dir_entry *)bh->b_data)) {
- ext4_warning(dir->i_sb, "Node failed checksum");
- brelse(bh);
- *err = ERR_BAD_DX_DIR;
+ bh = ext4_read_dirblock(dir, dx_get_block(at), INDEX);
+ if (IS_ERR(bh)) {
+ *err = PTR_ERR(bh);
goto fail2;
}
- set_buffer_verified(bh);
+ entries = ((struct dx_node *) bh->b_data)->entries;
if (dx_get_limit(entries) != dx_node_limit (dir)) {
ext4_warning(dir->i_sb,
@@ -783,7 +837,7 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
{
struct dx_frame *p;
struct buffer_head *bh;
- int err, num_frames = 0;
+ int num_frames = 0;
__u32 bhash;
p = frame;
@@ -822,25 +876,9 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
* block so no check is necessary
*/
while (num_frames--) {
- if (!(bh = ext4_bread(NULL, dir, dx_get_block(p->at),
- 0, &err))) {
- if (!err) {
- ext4_error(dir->i_sb,
- "Directory hole detected on inode %lu\n",
- dir->i_ino);
- return -EIO;
- }
- return err; /* Failure */
- }
-
- if (!buffer_verified(bh) &&
- !ext4_dx_csum_verify(dir,
- (struct ext4_dir_entry *)bh->b_data)) {
- ext4_warning(dir->i_sb, "Node failed checksum");
- return -EIO;
- }
- set_buffer_verified(bh);
-
+ bh = ext4_read_dirblock(dir, dx_get_block(p->at), INDEX);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
p++;
brelse(p->bh);
p->bh = bh;
@@ -866,20 +904,9 @@ static int htree_dirblock_to_tree(struct file *dir_file,
dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n",
(unsigned long)block));
- if (!(bh = ext4_bread(NULL, dir, block, 0, &err))) {
- if (!err) {
- err = -EIO;
- ext4_error(dir->i_sb,
- "Directory hole detected on inode %lu\n",
- dir->i_ino);
- }
- return err;
- }
-
- if (!buffer_verified(bh) &&
- !ext4_dirent_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data))
- return -EIO;
- set_buffer_verified(bh);
+ bh = ext4_read_dirblock(dir, block, DIRENT);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
de = (struct ext4_dir_entry_2 *) bh->b_data;
top = (struct ext4_dir_entry_2 *) ((char *) de +
@@ -1333,26 +1360,11 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q
return NULL;
do {
block = dx_get_block(frame->at);
- if (!(bh = ext4_bread(NULL, dir, block, 0, err))) {
- if (!(*err)) {
- *err = -EIO;
- ext4_error(dir->i_sb,
- "Directory hole detected on inode %lu\n",
- dir->i_ino);
- }
- goto errout;
- }
-
- if (!buffer_verified(bh) &&
- !ext4_dirent_csum_verify(dir,
- (struct ext4_dir_entry *)bh->b_data)) {
- EXT4_ERROR_INODE(dir, "checksumming directory "
- "block %lu", (unsigned long)block);
- brelse(bh);
- *err = -EIO;
+ bh = ext4_read_dirblock(dir, block, DIRENT);
+ if (IS_ERR(bh)) {
+ *err = PTR_ERR(bh);
goto errout;
}
- set_buffer_verified(bh);
retval = search_dirblock(bh, dir, d_name,
block << EXT4_BLOCK_SIZE_BITS(sb),
res_dir);
@@ -1536,11 +1548,12 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
csum_size = sizeof(struct ext4_dir_entry_tail);
- bh2 = ext4_append (handle, dir, &newblock, &err);
- if (!(bh2)) {
+ bh2 = ext4_append(handle, dir, &newblock);
+ if (IS_ERR(bh2)) {
brelse(*bh);
*bh = NULL;
- goto errout;
+ *error = PTR_ERR(bh2);
+ return NULL;
}
BUFFER_TRACE(*bh, "get_write_access");
@@ -1621,7 +1634,6 @@ journal_error:
brelse(bh2);
*bh = NULL;
ext4_std_error(dir->i_sb, err);
-errout:
*error = err;
return NULL;
}
@@ -1699,7 +1711,6 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
const char *name = dentry->d_name.name;
int namelen = dentry->d_name.len;
unsigned int blocksize = dir->i_sb->s_blocksize;
- unsigned short reclen;
int csum_size = 0;
int err;
@@ -1707,7 +1718,6 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
csum_size = sizeof(struct ext4_dir_entry_tail);
- reclen = EXT4_DIR_REC_LEN(namelen);
if (!de) {
err = ext4_find_dest_de(dir, inode,
bh, bh->b_data, blocksize - csum_size,
@@ -1798,10 +1808,10 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
len = ((char *) root) + (blocksize - csum_size) - (char *) de;
/* Allocate new block for the 0th block's dirents */
- bh2 = ext4_append(handle, dir, &block, &retval);
- if (!(bh2)) {
+ bh2 = ext4_append(handle, dir, &block);
+ if (IS_ERR(bh2)) {
brelse(bh);
- return retval;
+ return PTR_ERR(bh2);
}
ext4_set_inode_flag(dir, EXT4_INODE_INDEX);
data1 = bh2->b_data;
@@ -1918,20 +1928,10 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
}
blocks = dir->i_size >> sb->s_blocksize_bits;
for (block = 0; block < blocks; block++) {
- if (!(bh = ext4_bread(handle, dir, block, 0, &retval))) {
- if (!retval) {
- retval = -EIO;
- ext4_error(inode->i_sb,
- "Directory hole detected on inode %lu\n",
- inode->i_ino);
- }
- return retval;
- }
- if (!buffer_verified(bh) &&
- !ext4_dirent_csum_verify(dir,
- (struct ext4_dir_entry *)bh->b_data))
- return -EIO;
- set_buffer_verified(bh);
+ bh = ext4_read_dirblock(dir, block, DIRENT);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
+
retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
if (retval != -ENOSPC) {
brelse(bh);
@@ -1943,9 +1943,9 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
return make_indexed_dir(handle, dentry, inode, bh);
brelse(bh);
}
- bh = ext4_append(handle, dir, &block, &retval);
- if (!bh)
- return retval;
+ bh = ext4_append(handle, dir, &block);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
de = (struct ext4_dir_entry_2 *) bh->b_data;
de->inode = 0;
de->rec_len = ext4_rec_len_to_disk(blocksize - csum_size, blocksize);
@@ -1982,22 +1982,13 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
return err;
entries = frame->entries;
at = frame->at;
-
- if (!(bh = ext4_bread(handle, dir, dx_get_block(frame->at), 0, &err))) {
- if (!err) {
- err = -EIO;
- ext4_error(dir->i_sb,
- "Directory hole detected on inode %lu\n",
- dir->i_ino);
- }
+ bh = ext4_read_dirblock(dir, dx_get_block(frame->at), DIRENT);
+ if (IS_ERR(bh)) {
+ err = PTR_ERR(bh);
+ bh = NULL;
goto cleanup;
}
- if (!buffer_verified(bh) &&
- !ext4_dirent_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data))
- goto journal_error;
- set_buffer_verified(bh);
-
BUFFER_TRACE(bh, "get_write_access");
err = ext4_journal_get_write_access(handle, bh);
if (err)
@@ -2025,9 +2016,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
err = -ENOSPC;
goto cleanup;
}
- bh2 = ext4_append (handle, dir, &newblock, &err);
- if (!(bh2))
+ bh2 = ext4_append(handle, dir, &newblock);
+ if (IS_ERR(bh2)) {
+ err = PTR_ERR(bh2);
goto cleanup;
+ }
node2 = (struct dx_node *)(bh2->b_data);
entries2 = node2->entries;
memset(&node2->fake, 0, sizeof(struct fake_dirent));
@@ -2106,8 +2099,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
journal_error:
ext4_std_error(dir->i_sb, err);
cleanup:
- if (bh)
- brelse(bh);
+ brelse(bh);
dx_release(frames);
return err;
}
@@ -2254,29 +2246,28 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode,
{
handle_t *handle;
struct inode *inode;
- int err, retries = 0;
+ int err, credits, retries = 0;
dquot_initialize(dir);
+ credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
+ EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
retry:
- handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
- EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
- EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
- if (IS_ERR(handle))
- return PTR_ERR(handle);
-
- if (IS_DIRSYNC(dir))
- ext4_handle_sync(handle);
-
- inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0, NULL);
+ inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0,
+ NULL, EXT4_HT_DIR, credits);
+ handle = ext4_journal_current_handle();
err = PTR_ERR(inode);
if (!IS_ERR(inode)) {
inode->i_op = &ext4_file_inode_operations;
inode->i_fop = &ext4_file_operations;
ext4_set_aops(inode);
err = ext4_add_nondir(handle, dentry, inode);
+ if (!err && IS_DIRSYNC(dir))
+ ext4_handle_sync(handle);
}
- ext4_journal_stop(handle);
+ if (handle)
+ ext4_journal_stop(handle);
if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
goto retry;
return err;
@@ -2287,31 +2278,30 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry,
{
handle_t *handle;
struct inode *inode;
- int err, retries = 0;
+ int err, credits, retries = 0;
if (!new_valid_dev(rdev))
return -EINVAL;
dquot_initialize(dir);
+ credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
+ EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
retry:
- handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
- EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
- EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
- if (IS_ERR(handle))
- return PTR_ERR(handle);
-
- if (IS_DIRSYNC(dir))
- ext4_handle_sync(handle);
-
- inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0, NULL);
+ inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0,
+ NULL, EXT4_HT_DIR, credits);
+ handle = ext4_journal_current_handle();
err = PTR_ERR(inode);
if (!IS_ERR(inode)) {
init_special_inode(inode, inode->i_mode, rdev);
inode->i_op = &ext4_special_inode_operations;
err = ext4_add_nondir(handle, dentry, inode);
+ if (!err && IS_DIRSYNC(dir))
+ ext4_handle_sync(handle);
}
- ext4_journal_stop(handle);
+ if (handle)
+ ext4_journal_stop(handle);
if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
goto retry;
return err;
@@ -2351,6 +2341,7 @@ static int ext4_init_new_dir(handle_t *handle, struct inode *dir,
struct buffer_head *dir_block = NULL;
struct ext4_dir_entry_2 *de;
struct ext4_dir_entry_tail *t;
+ ext4_lblk_t block = 0;
unsigned int blocksize = dir->i_sb->s_blocksize;
int csum_size = 0;
int err;
@@ -2367,16 +2358,10 @@ static int ext4_init_new_dir(handle_t *handle, struct inode *dir,
goto out;
}
- inode->i_size = EXT4_I(inode)->i_disksize = blocksize;
- if (!(dir_block = ext4_bread(handle, inode, 0, 1, &err))) {
- if (!err) {
- err = -EIO;
- ext4_error(inode->i_sb,
- "Directory hole detected on inode %lu\n",
- inode->i_ino);
- }
- goto out;
- }
+ inode->i_size = 0;
+ dir_block = ext4_append(handle, inode, &block);
+ if (IS_ERR(dir_block))
+ return PTR_ERR(dir_block);
BUFFER_TRACE(dir_block, "get_write_access");
err = ext4_journal_get_write_access(handle, dir_block);
if (err)
@@ -2403,25 +2388,21 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
handle_t *handle;
struct inode *inode;
- int err, retries = 0;
+ int err, credits, retries = 0;
if (EXT4_DIR_LINK_MAX(dir))
return -EMLINK;
dquot_initialize(dir);
+ credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
+ EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
retry:
- handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
- EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
- EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
- if (IS_ERR(handle))
- return PTR_ERR(handle);
-
- if (IS_DIRSYNC(dir))
- ext4_handle_sync(handle);
-
- inode = ext4_new_inode(handle, dir, S_IFDIR | mode,
- &dentry->d_name, 0, NULL);
+ inode = ext4_new_inode_start_handle(dir, S_IFDIR | mode,
+ &dentry->d_name,
+ 0, NULL, EXT4_HT_DIR, credits);
+ handle = ext4_journal_current_handle();
err = PTR_ERR(inode);
if (IS_ERR(inode))
goto out_stop;
@@ -2449,8 +2430,12 @@ out_clear_inode:
goto out_clear_inode;
unlock_new_inode(inode);
d_instantiate(dentry, inode);
+ if (IS_DIRSYNC(dir))
+ ext4_handle_sync(handle);
+
out_stop:
- ext4_journal_stop(handle);
+ if (handle)
+ ext4_journal_stop(handle);
if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
goto retry;
return err;
@@ -2476,25 +2461,14 @@ static int empty_dir(struct inode *inode)
}
sb = inode->i_sb;
- if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) ||
- !(bh = ext4_bread(NULL, inode, 0, 0, &err))) {
- if (err)
- EXT4_ERROR_INODE(inode,
- "error %d reading directory lblock 0", err);
- else
- ext4_warning(inode->i_sb,
- "bad directory (dir #%lu) - no data block",
- inode->i_ino);
+ if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2)) {
+ EXT4_ERROR_INODE(inode, "invalid size");
return 1;
}
- if (!buffer_verified(bh) &&
- !ext4_dirent_csum_verify(inode,
- (struct ext4_dir_entry *)bh->b_data)) {
- EXT4_ERROR_INODE(inode, "checksum error reading directory "
- "lblock 0");
- return -EIO;
- }
- set_buffer_verified(bh);
+ bh = ext4_read_dirblock(inode, 0, EITHER);
+ if (IS_ERR(bh))
+ return 1;
+
de = (struct ext4_dir_entry_2 *) bh->b_data;
de1 = ext4_next_entry(de, sb->s_blocksize);
if (le32_to_cpu(de->inode) != inode->i_ino ||
@@ -2517,28 +2491,9 @@ static int empty_dir(struct inode *inode)
err = 0;
brelse(bh);
lblock = offset >> EXT4_BLOCK_SIZE_BITS(sb);
- bh = ext4_bread(NULL, inode, lblock, 0, &err);
- if (!bh) {
- if (err)
- EXT4_ERROR_INODE(inode,
- "error %d reading directory "
- "lblock %u", err, lblock);
- else
- ext4_warning(inode->i_sb,
- "bad directory (dir #%lu) - no data block",
- inode->i_ino);
-
- offset += sb->s_blocksize;
- continue;
- }
- if (!buffer_verified(bh) &&
- !ext4_dirent_csum_verify(inode,
- (struct ext4_dir_entry *)bh->b_data)) {
- EXT4_ERROR_INODE(inode, "checksum error "
- "reading directory lblock 0");
- return -EIO;
- }
- set_buffer_verified(bh);
+ bh = ext4_read_dirblock(inode, lblock, EITHER);
+ if (IS_ERR(bh))
+ return 1;
de = (struct ext4_dir_entry_2 *) bh->b_data;
}
if (ext4_check_dir_entry(inode, NULL, de, bh,
@@ -2717,25 +2672,18 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
struct inode *inode;
struct buffer_head *bh;
struct ext4_dir_entry_2 *de;
- handle_t *handle;
+ handle_t *handle = NULL;
/* Initialize quotas before so that eventual writes go in
* separate transaction */
dquot_initialize(dir);
dquot_initialize(dentry->d_inode);
- handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb));
- if (IS_ERR(handle))
- return PTR_ERR(handle);
-
retval = -ENOENT;
bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
if (!bh)
goto end_rmdir;
- if (IS_DIRSYNC(dir))
- ext4_handle_sync(handle);
-
inode = dentry->d_inode;
retval = -EIO;
@@ -2746,6 +2694,17 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
if (!empty_dir(inode))
goto end_rmdir;
+ handle = ext4_journal_start(dir, EXT4_HT_DIR,
+ EXT4_DATA_TRANS_BLOCKS(dir->i_sb));
+ if (IS_ERR(handle)) {
+ retval = PTR_ERR(handle);
+ handle = NULL;
+ goto end_rmdir;
+ }
+
+ if (IS_DIRSYNC(dir))
+ ext4_handle_sync(handle);
+
retval = ext4_delete_entry(handle, dir, de, bh);
if (retval)
goto end_rmdir;
@@ -2767,8 +2726,9 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
ext4_mark_inode_dirty(handle, dir);
end_rmdir:
- ext4_journal_stop(handle);
brelse(bh);
+ if (handle)
+ ext4_journal_stop(handle);
return retval;
}
@@ -2778,7 +2738,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
struct inode *inode;
struct buffer_head *bh;
struct ext4_dir_entry_2 *de;
- handle_t *handle;
+ handle_t *handle = NULL;
trace_ext4_unlink_enter(dir, dentry);
/* Initialize quotas before so that eventual writes go
@@ -2786,13 +2746,6 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
dquot_initialize(dir);
dquot_initialize(dentry->d_inode);
- handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb));
- if (IS_ERR(handle))
- return PTR_ERR(handle);
-
- if (IS_DIRSYNC(dir))
- ext4_handle_sync(handle);
-
retval = -ENOENT;
bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
if (!bh)
@@ -2804,6 +2757,17 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
if (le32_to_cpu(de->inode) != inode->i_ino)
goto end_unlink;
+ handle = ext4_journal_start(dir, EXT4_HT_DIR,
+ EXT4_DATA_TRANS_BLOCKS(dir->i_sb));
+ if (IS_ERR(handle)) {
+ retval = PTR_ERR(handle);
+ handle = NULL;
+ goto end_unlink;
+ }
+
+ if (IS_DIRSYNC(dir))
+ ext4_handle_sync(handle);
+
if (!inode->i_nlink) {
ext4_warning(inode->i_sb,
"Deleting nonexistent file (%lu), %d",
@@ -2824,8 +2788,9 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
retval = 0;
end_unlink:
- ext4_journal_stop(handle);
brelse(bh);
+ if (handle)
+ ext4_journal_stop(handle);
trace_ext4_unlink_exit(dentry, retval);
return retval;
}
@@ -2865,15 +2830,10 @@ static int ext4_symlink(struct inode *dir,
EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
}
retry:
- handle = ext4_journal_start(dir, credits);
- if (IS_ERR(handle))
- return PTR_ERR(handle);
-
- if (IS_DIRSYNC(dir))
- ext4_handle_sync(handle);
-
- inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO,
- &dentry->d_name, 0, NULL);
+ inode = ext4_new_inode_start_handle(dir, S_IFLNK|S_IRWXUGO,
+ &dentry->d_name, 0, NULL,
+ EXT4_HT_DIR, credits);
+ handle = ext4_journal_current_handle();
err = PTR_ERR(inode);
if (IS_ERR(inode))
goto out_stop;
@@ -2903,7 +2863,7 @@ retry:
* Now inode is being linked into dir (EXT4_DATA_TRANS_BLOCKS
* + EXT4_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified
*/
- handle = ext4_journal_start(dir,
+ handle = ext4_journal_start(dir, EXT4_HT_DIR,
EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 1);
if (IS_ERR(handle)) {
@@ -2926,8 +2886,12 @@ retry:
}
EXT4_I(inode)->i_disksize = inode->i_size;
err = ext4_add_nondir(handle, dentry, inode);
+ if (!err && IS_DIRSYNC(dir))
+ ext4_handle_sync(handle);
+
out_stop:
- ext4_journal_stop(handle);
+ if (handle)
+ ext4_journal_stop(handle);
if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
goto retry;
return err;
@@ -2950,8 +2914,9 @@ static int ext4_link(struct dentry *old_dentry,
dquot_initialize(dir);
retry:
- handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
- EXT4_INDEX_EXTRA_TRANS_BLOCKS);
+ handle = ext4_journal_start(dir, EXT4_HT_DIR,
+ (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS));
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -2991,13 +2956,9 @@ static struct buffer_head *ext4_get_first_dir_block(handle_t *handle,
struct buffer_head *bh;
if (!ext4_has_inline_data(inode)) {
- if (!(bh = ext4_bread(handle, inode, 0, 0, retval))) {
- if (!*retval) {
- *retval = -EIO;
- ext4_error(inode->i_sb,
- "Directory hole detected on inode %lu\n",
- inode->i_ino);
- }
+ bh = ext4_read_dirblock(inode, 0, EITHER);
+ if (IS_ERR(bh)) {
+ *retval = PTR_ERR(bh);
return NULL;
}
*parent_de = ext4_next_entry(
@@ -3034,9 +2995,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
* in separate transaction */
if (new_dentry->d_inode)
dquot_initialize(new_dentry->d_inode);
- handle = ext4_journal_start(old_dir, 2 *
- EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) +
- EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2);
+ handle = ext4_journal_start(old_dir, EXT4_HT_DIR,
+ (2 * EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) +
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2));
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -3076,11 +3037,6 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
&inlined);
if (!dir_bh)
goto end_rename;
- if (!inlined && !buffer_verified(dir_bh) &&
- !ext4_dirent_csum_verify(old_inode,
- (struct ext4_dir_entry *)dir_bh->b_data))
- goto end_rename;
- set_buffer_verified(dir_bh);
if (le32_to_cpu(parent_de->inode) != old_dir->i_ino)
goto end_rename;
retval = -EMLINK;
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 0016fbca2a4..809b31003ec 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -23,6 +23,7 @@
#include <linux/workqueue.h>
#include <linux/kernel.h>
#include <linux/slab.h>
+#include <linux/mm.h>
#include "ext4_jbd2.h"
#include "xattr.h"
@@ -73,8 +74,6 @@ void ext4_free_io_end(ext4_io_end_t *io)
BUG_ON(!list_empty(&io->list));
BUG_ON(io->flag & EXT4_IO_END_UNWRITTEN);
- if (io->page)
- put_page(io->page);
for (i = 0; i < io->num_io_pages; i++)
put_io_page(io->pages[i]);
io->num_io_pages = 0;
@@ -103,14 +102,13 @@ static int ext4_end_io(ext4_io_end_t *io)
"(inode %lu, offset %llu, size %zd, error %d)",
inode->i_ino, offset, size, ret);
}
- if (io->iocb)
- aio_complete(io->iocb, io->result, 0);
-
- if (io->flag & EXT4_IO_END_DIRECT)
- inode_dio_done(inode);
/* Wake up anyone waiting on unwritten extent conversion */
if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
wake_up_all(ext4_ioend_wq(inode));
+ if (io->flag & EXT4_IO_END_DIRECT)
+ inode_dio_done(inode);
+ if (io->iocb)
+ aio_complete(io->iocb, io->result, 0);
return ret;
}
@@ -119,7 +117,6 @@ static void dump_completed_IO(struct inode *inode)
#ifdef EXT4FS_DEBUG
struct list_head *cur, *before, *after;
ext4_io_end_t *io, *io0, *io1;
- unsigned long flags;
if (list_empty(&EXT4_I(inode)->i_completed_io_list)) {
ext4_debug("inode %lu completed_io list is empty\n",
@@ -152,26 +149,20 @@ void ext4_add_complete_io(ext4_io_end_t *io_end)
wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
spin_lock_irqsave(&ei->i_completed_io_lock, flags);
- if (list_empty(&ei->i_completed_io_list)) {
- io_end->flag |= EXT4_IO_END_QUEUED;
- queue_work(wq, &io_end->work);
- }
+ if (list_empty(&ei->i_completed_io_list))
+ queue_work(wq, &ei->i_unwritten_work);
list_add_tail(&io_end->list, &ei->i_completed_io_list);
spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
}
-static int ext4_do_flush_completed_IO(struct inode *inode,
- ext4_io_end_t *work_io)
+static int ext4_do_flush_completed_IO(struct inode *inode)
{
ext4_io_end_t *io;
- struct list_head unwritten, complete, to_free;
+ struct list_head unwritten;
unsigned long flags;
struct ext4_inode_info *ei = EXT4_I(inode);
int err, ret = 0;
- INIT_LIST_HEAD(&complete);
- INIT_LIST_HEAD(&to_free);
-
spin_lock_irqsave(&ei->i_completed_io_lock, flags);
dump_completed_IO(inode);
list_replace_init(&ei->i_completed_io_list, &unwritten);
@@ -185,32 +176,7 @@ static int ext4_do_flush_completed_IO(struct inode *inode,
err = ext4_end_io(io);
if (unlikely(!ret && err))
ret = err;
-
- list_add_tail(&io->list, &complete);
- }
- spin_lock_irqsave(&ei->i_completed_io_lock, flags);
- while (!list_empty(&complete)) {
- io = list_entry(complete.next, ext4_io_end_t, list);
io->flag &= ~EXT4_IO_END_UNWRITTEN;
- /* end_io context can not be destroyed now because it still
- * used by queued worker. Worker thread will destroy it later */
- if (io->flag & EXT4_IO_END_QUEUED)
- list_del_init(&io->list);
- else
- list_move(&io->list, &to_free);
- }
- /* If we are called from worker context, it is time to clear queued
- * flag, and destroy it's end_io if it was converted already */
- if (work_io) {
- work_io->flag &= ~EXT4_IO_END_QUEUED;
- if (!(work_io->flag & EXT4_IO_END_UNWRITTEN))
- list_add_tail(&work_io->list, &to_free);
- }
- spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
-
- while (!list_empty(&to_free)) {
- io = list_entry(to_free.next, ext4_io_end_t, list);
- list_del_init(&io->list);
ext4_free_io_end(io);
}
return ret;
@@ -219,10 +185,11 @@ static int ext4_do_flush_completed_IO(struct inode *inode,
/*
* work on completed aio dio IO, to convert unwritten extents to extents
*/
-static void ext4_end_io_work(struct work_struct *work)
+void ext4_end_io_work(struct work_struct *work)
{
- ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
- ext4_do_flush_completed_IO(io->inode, io);
+ struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info,
+ i_unwritten_work);
+ ext4_do_flush_completed_IO(&ei->vfs_inode);
}
int ext4_flush_unwritten_io(struct inode *inode)
@@ -230,7 +197,7 @@ int ext4_flush_unwritten_io(struct inode *inode)
int ret;
WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex) &&
!(inode->i_state & I_FREEING));
- ret = ext4_do_flush_completed_IO(inode, NULL);
+ ret = ext4_do_flush_completed_IO(inode);
ext4_unwritten_wait(inode);
return ret;
}
@@ -241,7 +208,6 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
if (io) {
atomic_inc(&EXT4_I(inode)->i_ioend_count);
io->inode = inode;
- INIT_WORK(&io->work, ext4_end_io_work);
INIT_LIST_HEAD(&io->list);
}
return io;
@@ -382,14 +348,6 @@ static int io_submit_add_bh(struct ext4_io_submit *io,
unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
}
- if (!buffer_mapped(bh) || buffer_delay(bh)) {
- if (!buffer_mapped(bh))
- clear_buffer_dirty(bh);
- if (io->io_bio)
- ext4_io_submit(io);
- return 0;
- }
-
if (io->io_bio && bh->b_blocknr != io->io_next_block) {
submit_and_retry:
ext4_io_submit(io);
@@ -436,7 +394,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
io_page = kmem_cache_alloc(io_page_cachep, GFP_NOFS);
if (!io_page) {
- set_page_dirty(page);
+ redirty_page_for_writepage(wbc, page);
unlock_page(page);
return -ENOMEM;
}
@@ -468,7 +426,15 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
set_buffer_uptodate(bh);
continue;
}
- clear_buffer_dirty(bh);
+ if (!buffer_dirty(bh) || buffer_delay(bh) ||
+ !buffer_mapped(bh) || buffer_unwritten(bh)) {
+ /* A hole? We can safely clear the dirty bit */
+ if (!buffer_mapped(bh))
+ clear_buffer_dirty(bh);
+ if (io->io_bio)
+ ext4_io_submit(io);
+ continue;
+ }
ret = io_submit_add_bh(io, io_page, inode, wbc, bh);
if (ret) {
/*
@@ -476,9 +442,10 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
* we can do but mark the page as dirty, and
* better luck next time.
*/
- set_page_dirty(page);
+ redirty_page_for_writepage(wbc, page);
break;
}
+ clear_buffer_dirty(bh);
}
unlock_page(page);
/*
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index d99387b89ed..c7f4d758466 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -333,8 +333,8 @@ static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
int err;
bh = sb_getblk(sb, blk);
- if (!bh)
- return ERR_PTR(-EIO);
+ if (unlikely(!bh))
+ return ERR_PTR(-ENOMEM);
if ((err = ext4_journal_get_write_access(handle, bh))) {
brelse(bh);
bh = ERR_PTR(err);
@@ -410,8 +410,8 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle,
return err;
bh = sb_getblk(sb, flex_gd->groups[group].block_bitmap);
- if (!bh)
- return -EIO;
+ if (unlikely(!bh))
+ return -ENOMEM;
err = ext4_journal_get_write_access(handle, bh);
if (err)
@@ -466,7 +466,7 @@ static int setup_new_flex_group_blocks(struct super_block *sb,
meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG);
/* This transaction may be extended/restarted along the way */
- handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA);
+ handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, EXT4_MAX_TRANS_DATA);
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -500,8 +500,8 @@ static int setup_new_flex_group_blocks(struct super_block *sb,
goto out;
gdb = sb_getblk(sb, block);
- if (!gdb) {
- err = -EIO;
+ if (unlikely(!gdb)) {
+ err = -ENOMEM;
goto out;
}
@@ -1031,7 +1031,7 @@ static void update_backups(struct super_block *sb, int blk_off, char *data,
handle_t *handle;
int err = 0, err2;
- handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA);
+ handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, EXT4_MAX_TRANS_DATA);
if (IS_ERR(handle)) {
group = 1;
err = PTR_ERR(handle);
@@ -1064,8 +1064,8 @@ static void update_backups(struct super_block *sb, int blk_off, char *data,
ext4_bg_has_super(sb, group));
bh = sb_getblk(sb, backup_block);
- if (!bh) {
- err = -EIO;
+ if (unlikely(!bh)) {
+ err = -ENOMEM;
break;
}
ext4_debug("update metadata backup %llu(+%llu)\n",
@@ -1168,7 +1168,7 @@ static int ext4_add_new_descs(handle_t *handle, struct super_block *sb,
static struct buffer_head *ext4_get_bitmap(struct super_block *sb, __u64 block)
{
struct buffer_head *bh = sb_getblk(sb, block);
- if (!bh)
+ if (unlikely(!bh))
return NULL;
if (!bh_uptodate_or_lock(bh)) {
if (bh_submit_read(bh) < 0) {
@@ -1412,7 +1412,7 @@ static int ext4_flex_group_add(struct super_block *sb,
* modify each of the reserved GDT dindirect blocks.
*/
credit = flex_gd->count * 4 + reserved_gdb;
- handle = ext4_journal_start_sb(sb, credit);
+ handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, credit);
if (IS_ERR(handle)) {
err = PTR_ERR(handle);
goto exit;
@@ -1506,10 +1506,12 @@ static int ext4_setup_next_flex_gd(struct super_block *sb,
group_data[i].blocks_count = blocks_per_group;
overhead = ext4_group_overhead_blocks(sb, group + i);
group_data[i].free_blocks_count = blocks_per_group - overhead;
- if (ext4_has_group_desc_csum(sb))
+ if (ext4_has_group_desc_csum(sb)) {
flex_gd->bg_flags[i] = EXT4_BG_BLOCK_UNINIT |
EXT4_BG_INODE_UNINIT;
- else
+ if (!test_opt(sb, INIT_INODE_TABLE))
+ flex_gd->bg_flags[i] |= EXT4_BG_INODE_ZEROED;
+ } else
flex_gd->bg_flags[i] = EXT4_BG_INODE_ZEROED;
}
@@ -1594,7 +1596,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
err = ext4_alloc_flex_bg_array(sb, input->group + 1);
if (err)
- return err;
+ goto out;
err = ext4_mb_alloc_groupinfo(sb, input->group + 1);
if (err)
@@ -1622,7 +1624,7 @@ static int ext4_group_extend_no_check(struct super_block *sb,
/* We will update the superblock, one block bitmap, and
* one group descriptor via ext4_group_add_blocks().
*/
- handle = ext4_journal_start_sb(sb, 3);
+ handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, 3);
if (IS_ERR(handle)) {
err = PTR_ERR(handle);
ext4_warning(sb, "error %d on journal start", err);
@@ -1786,7 +1788,7 @@ static int ext4_convert_meta_bg(struct super_block *sb, struct inode *inode)
credits += 3; /* block bitmap, bg descriptor, resize inode */
}
- handle = ext4_journal_start_sb(sb, credits);
+ handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, credits);
if (IS_ERR(handle))
return PTR_ERR(handle);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 4df78dd3f52..620cf5615ba 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -69,8 +69,6 @@ static void ext4_mark_recovery_complete(struct super_block *sb,
static void ext4_clear_journal_err(struct super_block *sb,
struct ext4_super_block *es);
static int ext4_sync_fs(struct super_block *sb, int wait);
-static const char *ext4_decode_error(struct super_block *sb, int errno,
- char nbuf[16]);
static int ext4_remount(struct super_block *sb, int *flags, char *data);
static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
static int ext4_unfreeze(struct super_block *sb);
@@ -296,107 +294,6 @@ void ext4_itable_unused_set(struct super_block *sb,
}
-/* Just increment the non-pointer handle value */
-static handle_t *ext4_get_nojournal(void)
-{
- handle_t *handle = current->journal_info;
- unsigned long ref_cnt = (unsigned long)handle;
-
- BUG_ON(ref_cnt >= EXT4_NOJOURNAL_MAX_REF_COUNT);
-
- ref_cnt++;
- handle = (handle_t *)ref_cnt;
-
- current->journal_info = handle;
- return handle;
-}
-
-
-/* Decrement the non-pointer handle value */
-static void ext4_put_nojournal(handle_t *handle)
-{
- unsigned long ref_cnt = (unsigned long)handle;
-
- BUG_ON(ref_cnt == 0);
-
- ref_cnt--;
- handle = (handle_t *)ref_cnt;
-
- current->journal_info = handle;
-}
-
-/*
- * Wrappers for jbd2_journal_start/end.
- */
-handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
-{
- journal_t *journal;
-
- trace_ext4_journal_start(sb, nblocks, _RET_IP_);
- if (sb->s_flags & MS_RDONLY)
- return ERR_PTR(-EROFS);
-
- WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE);
- journal = EXT4_SB(sb)->s_journal;
- if (!journal)
- return ext4_get_nojournal();
- /*
- * Special case here: if the journal has aborted behind our
- * backs (eg. EIO in the commit thread), then we still need to
- * take the FS itself readonly cleanly.
- */
- if (is_journal_aborted(journal)) {
- ext4_abort(sb, "Detected aborted journal");
- return ERR_PTR(-EROFS);
- }
- return jbd2_journal_start(journal, nblocks);
-}
-
-int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
-{
- struct super_block *sb;
- int err;
- int rc;
-
- if (!ext4_handle_valid(handle)) {
- ext4_put_nojournal(handle);
- return 0;
- }
- sb = handle->h_transaction->t_journal->j_private;
- err = handle->h_err;
- rc = jbd2_journal_stop(handle);
-
- if (!err)
- err = rc;
- if (err)
- __ext4_std_error(sb, where, line, err);
- return err;
-}
-
-void ext4_journal_abort_handle(const char *caller, unsigned int line,
- const char *err_fn, struct buffer_head *bh,
- handle_t *handle, int err)
-{
- char nbuf[16];
- const char *errstr = ext4_decode_error(NULL, err, nbuf);
-
- BUG_ON(!ext4_handle_valid(handle));
-
- if (bh)
- BUFFER_TRACE(bh, "abort");
-
- if (!handle->h_err)
- handle->h_err = err;
-
- if (is_handle_aborted(handle))
- return;
-
- printk(KERN_ERR "EXT4-fs: %s:%d: aborting transaction: %s in %s\n",
- caller, line, errstr, err_fn);
-
- jbd2_journal_abort_handle(handle);
-}
-
static void __save_error_info(struct super_block *sb, const char *func,
unsigned int line)
{
@@ -582,8 +479,8 @@ void ext4_error_file(struct file *file, const char *function,
ext4_handle_error(inode->i_sb);
}
-static const char *ext4_decode_error(struct super_block *sb, int errno,
- char nbuf[16])
+const char *ext4_decode_error(struct super_block *sb, int errno,
+ char nbuf[16])
{
char *errstr = NULL;
@@ -858,6 +755,7 @@ static void ext4_put_super(struct super_block *sb)
ext4_abort(sb, "Couldn't clean up the journal");
}
+ ext4_es_unregister_shrinker(sb);
del_timer(&sbi->s_err_report);
ext4_release_system_zone(sb);
ext4_mb_release(sb);
@@ -939,11 +837,12 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
return NULL;
ei->vfs_inode.i_version = 1;
- memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
INIT_LIST_HEAD(&ei->i_prealloc_list);
spin_lock_init(&ei->i_prealloc_lock);
ext4_es_init_tree(&ei->i_es_tree);
rwlock_init(&ei->i_es_lock);
+ INIT_LIST_HEAD(&ei->i_es_lru);
+ ei->i_es_lru_nr = 0;
ei->i_reserved_data_blocks = 0;
ei->i_reserved_meta_blocks = 0;
ei->i_allocated_meta_blocks = 0;
@@ -960,6 +859,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
ei->i_datasync_tid = 0;
atomic_set(&ei->i_ioend_count, 0);
atomic_set(&ei->i_unwritten, 0);
+ INIT_WORK(&ei->i_unwritten_work, ext4_end_io_work);
return &ei->vfs_inode;
}
@@ -1031,6 +931,7 @@ void ext4_clear_inode(struct inode *inode)
dquot_drop(inode);
ext4_discard_preallocations(inode);
ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
+ ext4_es_lru_del(inode);
if (EXT4_I(inode)->jinode) {
jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode),
EXT4_I(inode)->jinode);
@@ -1280,8 +1181,8 @@ static const match_table_t tokens = {
{Opt_stripe, "stripe=%u"},
{Opt_delalloc, "delalloc"},
{Opt_nodelalloc, "nodelalloc"},
- {Opt_mblk_io_submit, "mblk_io_submit"},
- {Opt_nomblk_io_submit, "nomblk_io_submit"},
+ {Opt_removed, "mblk_io_submit"},
+ {Opt_removed, "nomblk_io_submit"},
{Opt_block_validity, "block_validity"},
{Opt_noblock_validity, "noblock_validity"},
{Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
@@ -1337,6 +1238,7 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
char *qname;
+ int ret = -1;
if (sb_any_quota_loaded(sb) &&
!sbi->s_qf_names[qtype]) {
@@ -1351,23 +1253,26 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
"Not enough memory for storing quotafile name");
return -1;
}
- if (sbi->s_qf_names[qtype] &&
- strcmp(sbi->s_qf_names[qtype], qname)) {
- ext4_msg(sb, KERN_ERR,
- "%s quota file already specified", QTYPE2NAME(qtype));
- kfree(qname);
- return -1;
+ if (sbi->s_qf_names[qtype]) {
+ if (strcmp(sbi->s_qf_names[qtype], qname) == 0)
+ ret = 1;
+ else
+ ext4_msg(sb, KERN_ERR,
+ "%s quota file already specified",
+ QTYPE2NAME(qtype));
+ goto errout;
}
- sbi->s_qf_names[qtype] = qname;
- if (strchr(sbi->s_qf_names[qtype], '/')) {
+ if (strchr(qname, '/')) {
ext4_msg(sb, KERN_ERR,
"quotafile must be on filesystem root");
- kfree(sbi->s_qf_names[qtype]);
- sbi->s_qf_names[qtype] = NULL;
- return -1;
+ goto errout;
}
+ sbi->s_qf_names[qtype] = qname;
set_opt(sb, QUOTA);
return 1;
+errout:
+ kfree(qname);
+ return ret;
}
static int clear_qf_name(struct super_block *sb, int qtype)
@@ -1381,10 +1286,7 @@ static int clear_qf_name(struct super_block *sb, int qtype)
" when quota turned on");
return -1;
}
- /*
- * The space will be released later when all options are confirmed
- * to be correct
- */
+ kfree(sbi->s_qf_names[qtype]);
sbi->s_qf_names[qtype] = NULL;
return 1;
}
@@ -1404,6 +1306,9 @@ static int clear_qf_name(struct super_block *sb, int qtype)
#define MOPT_QFMT MOPT_NOSUPPORT
#endif
#define MOPT_DATAJ 0x0080
+#define MOPT_NO_EXT2 0x0100
+#define MOPT_NO_EXT3 0x0200
+#define MOPT_EXT4_ONLY (MOPT_NO_EXT2 | MOPT_NO_EXT3)
static const struct mount_opts {
int token;
@@ -1414,25 +1319,31 @@ static const struct mount_opts {
{Opt_bsd_df, EXT4_MOUNT_MINIX_DF, MOPT_CLEAR},
{Opt_grpid, EXT4_MOUNT_GRPID, MOPT_SET},
{Opt_nogrpid, EXT4_MOUNT_GRPID, MOPT_CLEAR},
- {Opt_mblk_io_submit, EXT4_MOUNT_MBLK_IO_SUBMIT, MOPT_SET},
- {Opt_nomblk_io_submit, EXT4_MOUNT_MBLK_IO_SUBMIT, MOPT_CLEAR},
{Opt_block_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_SET},
{Opt_noblock_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_CLEAR},
- {Opt_dioread_nolock, EXT4_MOUNT_DIOREAD_NOLOCK, MOPT_SET},
- {Opt_dioread_lock, EXT4_MOUNT_DIOREAD_NOLOCK, MOPT_CLEAR},
+ {Opt_dioread_nolock, EXT4_MOUNT_DIOREAD_NOLOCK,
+ MOPT_EXT4_ONLY | MOPT_SET},
+ {Opt_dioread_lock, EXT4_MOUNT_DIOREAD_NOLOCK,
+ MOPT_EXT4_ONLY | MOPT_CLEAR},
{Opt_discard, EXT4_MOUNT_DISCARD, MOPT_SET},
{Opt_nodiscard, EXT4_MOUNT_DISCARD, MOPT_CLEAR},
- {Opt_delalloc, EXT4_MOUNT_DELALLOC, MOPT_SET | MOPT_EXPLICIT},
- {Opt_nodelalloc, EXT4_MOUNT_DELALLOC, MOPT_CLEAR | MOPT_EXPLICIT},
- {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, MOPT_SET},
+ {Opt_delalloc, EXT4_MOUNT_DELALLOC,
+ MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
+ {Opt_nodelalloc, EXT4_MOUNT_DELALLOC,
+ MOPT_EXT4_ONLY | MOPT_CLEAR | MOPT_EXPLICIT},
+ {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
+ MOPT_EXT4_ONLY | MOPT_SET},
{Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT |
- EXT4_MOUNT_JOURNAL_CHECKSUM), MOPT_SET},
- {Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_SET},
+ EXT4_MOUNT_JOURNAL_CHECKSUM),
+ MOPT_EXT4_ONLY | MOPT_SET},
+ {Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_NO_EXT2 | MOPT_SET},
{Opt_err_panic, EXT4_MOUNT_ERRORS_PANIC, MOPT_SET | MOPT_CLEAR_ERR},
{Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR},
{Opt_err_cont, EXT4_MOUNT_ERRORS_CONT, MOPT_SET | MOPT_CLEAR_ERR},
- {Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT, MOPT_SET},
- {Opt_data_err_ignore, EXT4_MOUNT_DATA_ERR_ABORT, MOPT_CLEAR},
+ {Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT,
+ MOPT_NO_EXT2 | MOPT_SET},
+ {Opt_data_err_ignore, EXT4_MOUNT_DATA_ERR_ABORT,
+ MOPT_NO_EXT2 | MOPT_CLEAR},
{Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET},
{Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR},
{Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET},
@@ -1444,9 +1355,14 @@ static const struct mount_opts {
{Opt_inode_readahead_blks, 0, MOPT_GTE0},
{Opt_init_itable, 0, MOPT_GTE0},
{Opt_stripe, 0, MOPT_GTE0},
- {Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_DATAJ},
- {Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_DATAJ},
- {Opt_data_writeback, EXT4_MOUNT_WRITEBACK_DATA, MOPT_DATAJ},
+ {Opt_resuid, 0, MOPT_GTE0},
+ {Opt_resgid, 0, MOPT_GTE0},
+ {Opt_journal_dev, 0, MOPT_GTE0},
+ {Opt_journal_ioprio, 0, MOPT_GTE0},
+ {Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_NO_EXT2 | MOPT_DATAJ},
+ {Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_NO_EXT2 | MOPT_DATAJ},
+ {Opt_data_writeback, EXT4_MOUNT_WRITEBACK_DATA,
+ MOPT_NO_EXT2 | MOPT_DATAJ},
{Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET},
{Opt_nouser_xattr, EXT4_MOUNT_XATTR_USER, MOPT_CLEAR},
#ifdef CONFIG_EXT4_FS_POSIX_ACL
@@ -1496,8 +1412,6 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
else if (token == Opt_offgrpjquota)
return clear_qf_name(sb, GRPQUOTA);
#endif
- if (args->from && match_int(args, &arg))
- return -1;
switch (token) {
case Opt_noacl:
case Opt_nouser_xattr:
@@ -1506,138 +1420,149 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
case Opt_sb:
return 1; /* handled by get_sb_block() */
case Opt_removed:
- ext4_msg(sb, KERN_WARNING,
- "Ignoring removed %s option", opt);
+ ext4_msg(sb, KERN_WARNING, "Ignoring removed %s option", opt);
+ return 1;
+ case Opt_abort:
+ sbi->s_mount_flags |= EXT4_MF_FS_ABORTED;
+ return 1;
+ case Opt_i_version:
+ sb->s_flags |= MS_I_VERSION;
return 1;
- case Opt_resuid:
+ }
+
+ for (m = ext4_mount_opts; m->token != Opt_err; m++)
+ if (token == m->token)
+ break;
+
+ if (m->token == Opt_err) {
+ ext4_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" "
+ "or missing value", opt);
+ return -1;
+ }
+
+ if ((m->flags & MOPT_NO_EXT2) && IS_EXT2_SB(sb)) {
+ ext4_msg(sb, KERN_ERR,
+ "Mount option \"%s\" incompatible with ext2", opt);
+ return -1;
+ }
+ if ((m->flags & MOPT_NO_EXT3) && IS_EXT3_SB(sb)) {
+ ext4_msg(sb, KERN_ERR,
+ "Mount option \"%s\" incompatible with ext3", opt);
+ return -1;
+ }
+
+ if (args->from && match_int(args, &arg))
+ return -1;
+ if (args->from && (m->flags & MOPT_GTE0) && (arg < 0))
+ return -1;
+ if (m->flags & MOPT_EXPLICIT)
+ set_opt2(sb, EXPLICIT_DELALLOC);
+ if (m->flags & MOPT_CLEAR_ERR)
+ clear_opt(sb, ERRORS_MASK);
+ if (token == Opt_noquota && sb_any_quota_loaded(sb)) {
+ ext4_msg(sb, KERN_ERR, "Cannot change quota "
+ "options when quota turned on");
+ return -1;
+ }
+
+ if (m->flags & MOPT_NOSUPPORT) {
+ ext4_msg(sb, KERN_ERR, "%s option not supported", opt);
+ } else if (token == Opt_commit) {
+ if (arg == 0)
+ arg = JBD2_DEFAULT_MAX_COMMIT_AGE;
+ sbi->s_commit_interval = HZ * arg;
+ } else if (token == Opt_max_batch_time) {
+ if (arg == 0)
+ arg = EXT4_DEF_MAX_BATCH_TIME;
+ sbi->s_max_batch_time = arg;
+ } else if (token == Opt_min_batch_time) {
+ sbi->s_min_batch_time = arg;
+ } else if (token == Opt_inode_readahead_blks) {
+ if (arg && (arg > (1 << 30) || !is_power_of_2(arg))) {
+ ext4_msg(sb, KERN_ERR,
+ "EXT4-fs: inode_readahead_blks must be "
+ "0 or a power of 2 smaller than 2^31");
+ return -1;
+ }
+ sbi->s_inode_readahead_blks = arg;
+ } else if (token == Opt_init_itable) {
+ set_opt(sb, INIT_INODE_TABLE);
+ if (!args->from)
+ arg = EXT4_DEF_LI_WAIT_MULT;
+ sbi->s_li_wait_mult = arg;
+ } else if (token == Opt_max_dir_size_kb) {
+ sbi->s_max_dir_size_kb = arg;
+ } else if (token == Opt_stripe) {
+ sbi->s_stripe = arg;
+ } else if (token == Opt_resuid) {
uid = make_kuid(current_user_ns(), arg);
if (!uid_valid(uid)) {
ext4_msg(sb, KERN_ERR, "Invalid uid value %d", arg);
return -1;
}
sbi->s_resuid = uid;
- return 1;
- case Opt_resgid:
+ } else if (token == Opt_resgid) {
gid = make_kgid(current_user_ns(), arg);
if (!gid_valid(gid)) {
ext4_msg(sb, KERN_ERR, "Invalid gid value %d", arg);
return -1;
}
sbi->s_resgid = gid;
- return 1;
- case Opt_abort:
- sbi->s_mount_flags |= EXT4_MF_FS_ABORTED;
- return 1;
- case Opt_i_version:
- sb->s_flags |= MS_I_VERSION;
- return 1;
- case Opt_journal_dev:
+ } else if (token == Opt_journal_dev) {
if (is_remount) {
ext4_msg(sb, KERN_ERR,
"Cannot specify journal on remount");
return -1;
}
*journal_devnum = arg;
- return 1;
- case Opt_journal_ioprio:
- if (arg < 0 || arg > 7)
- return -1;
- *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg);
- return 1;
- }
-
- for (m = ext4_mount_opts; m->token != Opt_err; m++) {
- if (token != m->token)
- continue;
- if (args->from && (m->flags & MOPT_GTE0) && (arg < 0))
- return -1;
- if (m->flags & MOPT_EXPLICIT)
- set_opt2(sb, EXPLICIT_DELALLOC);
- if (m->flags & MOPT_CLEAR_ERR)
- clear_opt(sb, ERRORS_MASK);
- if (token == Opt_noquota && sb_any_quota_loaded(sb)) {
- ext4_msg(sb, KERN_ERR, "Cannot change quota "
- "options when quota turned on");
+ } else if (token == Opt_journal_ioprio) {
+ if (arg > 7) {
+ ext4_msg(sb, KERN_ERR, "Invalid journal IO priority"
+ " (must be 0-7)");
return -1;
}
-
- if (m->flags & MOPT_NOSUPPORT) {
- ext4_msg(sb, KERN_ERR, "%s option not supported", opt);
- } else if (token == Opt_commit) {
- if (arg == 0)
- arg = JBD2_DEFAULT_MAX_COMMIT_AGE;
- sbi->s_commit_interval = HZ * arg;
- } else if (token == Opt_max_batch_time) {
- if (arg == 0)
- arg = EXT4_DEF_MAX_BATCH_TIME;
- sbi->s_max_batch_time = arg;
- } else if (token == Opt_min_batch_time) {
- sbi->s_min_batch_time = arg;
- } else if (token == Opt_inode_readahead_blks) {
- if (arg > (1 << 30))
- return -1;
- if (arg && !is_power_of_2(arg)) {
+ *journal_ioprio =
+ IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg);
+ } else if (m->flags & MOPT_DATAJ) {
+ if (is_remount) {
+ if (!sbi->s_journal)
+ ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option");
+ else if (test_opt(sb, DATA_FLAGS) != m->mount_opt) {
ext4_msg(sb, KERN_ERR,
- "EXT4-fs: inode_readahead_blks"
- " must be a power of 2");
- return -1;
- }
- sbi->s_inode_readahead_blks = arg;
- } else if (token == Opt_init_itable) {
- set_opt(sb, INIT_INODE_TABLE);
- if (!args->from)
- arg = EXT4_DEF_LI_WAIT_MULT;
- sbi->s_li_wait_mult = arg;
- } else if (token == Opt_max_dir_size_kb) {
- sbi->s_max_dir_size_kb = arg;
- } else if (token == Opt_stripe) {
- sbi->s_stripe = arg;
- } else if (m->flags & MOPT_DATAJ) {
- if (is_remount) {
- if (!sbi->s_journal)
- ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option");
- else if (test_opt(sb, DATA_FLAGS) !=
- m->mount_opt) {
- ext4_msg(sb, KERN_ERR,
"Cannot change data mode on remount");
- return -1;
- }
- } else {
- clear_opt(sb, DATA_FLAGS);
- sbi->s_mount_opt |= m->mount_opt;
- }
-#ifdef CONFIG_QUOTA
- } else if (m->flags & MOPT_QFMT) {
- if (sb_any_quota_loaded(sb) &&
- sbi->s_jquota_fmt != m->mount_opt) {
- ext4_msg(sb, KERN_ERR, "Cannot "
- "change journaled quota options "
- "when quota turned on");
return -1;
}
- sbi->s_jquota_fmt = m->mount_opt;
-#endif
} else {
- if (!args->from)
- arg = 1;
- if (m->flags & MOPT_CLEAR)
- arg = !arg;
- else if (unlikely(!(m->flags & MOPT_SET))) {
- ext4_msg(sb, KERN_WARNING,
- "buggy handling of option %s", opt);
- WARN_ON(1);
- return -1;
- }
- if (arg != 0)
- sbi->s_mount_opt |= m->mount_opt;
- else
- sbi->s_mount_opt &= ~m->mount_opt;
+ clear_opt(sb, DATA_FLAGS);
+ sbi->s_mount_opt |= m->mount_opt;
}
- return 1;
+#ifdef CONFIG_QUOTA
+ } else if (m->flags & MOPT_QFMT) {
+ if (sb_any_quota_loaded(sb) &&
+ sbi->s_jquota_fmt != m->mount_opt) {
+ ext4_msg(sb, KERN_ERR, "Cannot change journaled "
+ "quota options when quota turned on");
+ return -1;
+ }
+ sbi->s_jquota_fmt = m->mount_opt;
+#endif
+ } else {
+ if (!args->from)
+ arg = 1;
+ if (m->flags & MOPT_CLEAR)
+ arg = !arg;
+ else if (unlikely(!(m->flags & MOPT_SET))) {
+ ext4_msg(sb, KERN_WARNING,
+ "buggy handling of option %s", opt);
+ WARN_ON(1);
+ return -1;
+ }
+ if (arg != 0)
+ sbi->s_mount_opt |= m->mount_opt;
+ else
+ sbi->s_mount_opt &= ~m->mount_opt;
}
- ext4_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" "
- "or missing value", opt);
- return -1;
+ return 1;
}
static int parse_options(char *options, struct super_block *sb,
@@ -2776,7 +2701,7 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
break;
}
- if (group == ngroups)
+ if (group >= ngroups)
ret = 1;
if (!ret) {
@@ -3016,33 +2941,34 @@ static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
return elr;
}
-static int ext4_register_li_request(struct super_block *sb,
- ext4_group_t first_not_zeroed)
+int ext4_register_li_request(struct super_block *sb,
+ ext4_group_t first_not_zeroed)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct ext4_li_request *elr;
+ struct ext4_li_request *elr = NULL;
ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
int ret = 0;
+ mutex_lock(&ext4_li_mtx);
if (sbi->s_li_request != NULL) {
/*
* Reset timeout so it can be computed again, because
* s_li_wait_mult might have changed.
*/
sbi->s_li_request->lr_timeout = 0;
- return 0;
+ goto out;
}
if (first_not_zeroed == ngroups ||
(sb->s_flags & MS_RDONLY) ||
!test_opt(sb, INIT_INODE_TABLE))
- return 0;
+ goto out;
elr = ext4_li_request_new(sb, first_not_zeroed);
- if (!elr)
- return -ENOMEM;
-
- mutex_lock(&ext4_li_mtx);
+ if (!elr) {
+ ret = -ENOMEM;
+ goto out;
+ }
if (NULL == ext4_li_info) {
ret = ext4_li_info_new();
@@ -3379,7 +3305,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
#ifdef CONFIG_EXT4_FS_POSIX_ACL
set_opt(sb, POSIX_ACL);
#endif
- set_opt(sb, MBLK_IO_SUBMIT);
if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA)
set_opt(sb, JOURNAL_DATA);
else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED)
@@ -3772,6 +3697,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_max_writeback_mb_bump = 128;
sbi->s_extent_max_zeroout_kb = 32;
+ /* Register extent status tree shrinker */
+ ext4_es_register_shrinker(sb);
+
/*
* set up enough so that it can read an inode
*/
@@ -4008,7 +3936,7 @@ no_journal:
!(sb->s_flags & MS_RDONLY)) {
err = ext4_enable_quotas(sb);
if (err)
- goto failed_mount7;
+ goto failed_mount8;
}
#endif /* CONFIG_QUOTA */
@@ -4035,6 +3963,10 @@ cantfind_ext4:
ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem");
goto failed_mount;
+#ifdef CONFIG_QUOTA
+failed_mount8:
+ kobject_del(&sbi->s_kobj);
+#endif
failed_mount7:
ext4_unregister_li_request(sb);
failed_mount6:
@@ -4476,16 +4408,12 @@ static void ext4_clear_journal_err(struct super_block *sb,
int ext4_force_commit(struct super_block *sb)
{
journal_t *journal;
- int ret = 0;
if (sb->s_flags & MS_RDONLY)
return 0;
journal = EXT4_SB(sb)->s_journal;
- if (journal)
- ret = ext4_journal_force_commit(journal);
-
- return ret;
+ return ext4_journal_force_commit(journal);
}
static int ext4_sync_fs(struct super_block *sb, int wait)
@@ -4588,7 +4516,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
int err = 0;
#ifdef CONFIG_QUOTA
- int i;
+ int i, j;
#endif
char *orig_data = kstrdup(data, GFP_KERNEL);
@@ -4604,7 +4532,16 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
#ifdef CONFIG_QUOTA
old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
for (i = 0; i < MAXQUOTAS; i++)
- old_opts.s_qf_names[i] = sbi->s_qf_names[i];
+ if (sbi->s_qf_names[i]) {
+ old_opts.s_qf_names[i] = kstrdup(sbi->s_qf_names[i],
+ GFP_KERNEL);
+ if (!old_opts.s_qf_names[i]) {
+ for (j = 0; j < i; j++)
+ kfree(old_opts.s_qf_names[j]);
+ return -ENOMEM;
+ }
+ } else
+ old_opts.s_qf_names[i] = NULL;
#endif
if (sbi->s_journal && sbi->s_journal->j_task->io_context)
journal_ioprio = sbi->s_journal->j_task->io_context->ioprio;
@@ -4737,9 +4674,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
#ifdef CONFIG_QUOTA
/* Release old quota file names */
for (i = 0; i < MAXQUOTAS; i++)
- if (old_opts.s_qf_names[i] &&
- old_opts.s_qf_names[i] != sbi->s_qf_names[i])
- kfree(old_opts.s_qf_names[i]);
+ kfree(old_opts.s_qf_names[i]);
if (enable_quota) {
if (sb_any_quota_suspended(sb))
dquot_resume(sb, -1);
@@ -4768,9 +4703,7 @@ restore_opts:
#ifdef CONFIG_QUOTA
sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
for (i = 0; i < MAXQUOTAS; i++) {
- if (sbi->s_qf_names[i] &&
- old_opts.s_qf_names[i] != sbi->s_qf_names[i])
- kfree(sbi->s_qf_names[i]);
+ kfree(sbi->s_qf_names[i]);
sbi->s_qf_names[i] = old_opts.s_qf_names[i];
}
#endif
@@ -4835,7 +4768,7 @@ static int ext4_write_dquot(struct dquot *dquot)
struct inode *inode;
inode = dquot_to_inode(dquot);
- handle = ext4_journal_start(inode,
+ handle = ext4_journal_start(inode, EXT4_HT_QUOTA,
EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -4851,7 +4784,7 @@ static int ext4_acquire_dquot(struct dquot *dquot)
int ret, err;
handle_t *handle;
- handle = ext4_journal_start(dquot_to_inode(dquot),
+ handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA,
EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb));
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -4867,7 +4800,7 @@ static int ext4_release_dquot(struct dquot *dquot)
int ret, err;
handle_t *handle;
- handle = ext4_journal_start(dquot_to_inode(dquot),
+ handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA,
EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb));
if (IS_ERR(handle)) {
/* Release dquot anyway to avoid endless cycle in dqput() */
@@ -4899,7 +4832,7 @@ static int ext4_write_info(struct super_block *sb, int type)
handle_t *handle;
/* Data block + inode block */
- handle = ext4_journal_start(sb->s_root->d_inode, 2);
+ handle = ext4_journal_start(sb->s_root->d_inode, EXT4_HT_QUOTA, 2);
if (IS_ERR(handle))
return PTR_ERR(handle);
ret = dquot_commit_info(sb, type);
@@ -5005,9 +4938,9 @@ static int ext4_enable_quotas(struct super_block *sb)
DQUOT_USAGE_ENABLED);
if (err) {
ext4_warning(sb,
- "Failed to enable quota (type=%d) "
- "tracking. Please run e2fsck to fix.",
- type);
+ "Failed to enable quota tracking "
+ "(type=%d, err=%d). Please run "
+ "e2fsck to fix.", type, err);
return err;
}
}
@@ -5045,7 +4978,7 @@ static int ext4_quota_off(struct super_block *sb, int type)
/* Update modification times of quota files when userspace can
* start looking at them */
- handle = ext4_journal_start(inode, 1);
+ handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1);
if (IS_ERR(handle))
goto out;
inode->i_mtime = inode->i_ctime = CURRENT_TIME;
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 3a91ebc2b66..3a120b27724 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -549,7 +549,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
error = ext4_handle_dirty_xattr_block(handle, inode, bh);
if (IS_SYNC(inode))
ext4_handle_sync(handle);
- dquot_free_block(inode, 1);
+ dquot_free_block(inode, EXT4_C2B(EXT4_SB(inode->i_sb), 1));
ea_bdebug(bh, "refcount now=%d; releasing",
le32_to_cpu(BHDR(bh)->h_refcount));
}
@@ -832,7 +832,8 @@ inserted:
else {
/* The old block is released after updating
the inode. */
- error = dquot_alloc_block(inode, 1);
+ error = dquot_alloc_block(inode,
+ EXT4_C2B(EXT4_SB(sb), 1));
if (error)
goto cleanup;
error = ext4_journal_get_write_access(handle,
@@ -886,17 +887,18 @@ inserted:
(unsigned long long)block);
new_bh = sb_getblk(sb, block);
- if (!new_bh) {
+ if (unlikely(!new_bh)) {
+ error = -ENOMEM;
getblk_failed:
ext4_free_blocks(handle, inode, NULL, block, 1,
EXT4_FREE_BLOCKS_METADATA);
- error = -EIO;
goto cleanup;
}
lock_buffer(new_bh);
error = ext4_journal_get_create_access(handle, new_bh);
if (error) {
unlock_buffer(new_bh);
+ error = -EIO;
goto getblk_failed;
}
memcpy(new_bh->b_data, s->base, new_bh->b_size);
@@ -928,7 +930,7 @@ cleanup:
return error;
cleanup_dquot:
- dquot_free_block(inode, 1);
+ dquot_free_block(inode, EXT4_C2B(EXT4_SB(sb), 1));
goto cleanup;
bad_block:
@@ -1164,17 +1166,10 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name,
{
handle_t *handle;
int error, retries = 0;
- int credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb);
+ int credits = ext4_jbd2_credits_xattr(inode);
retry:
- /*
- * In case of inline data, we may push out the data to a block,
- * So reserve the journal space first.
- */
- if (ext4_has_inline_data(inode))
- credits += ext4_writepage_trans_blocks(inode) + 1;
-
- handle = ext4_journal_start(inode, credits);
+ handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits);
if (IS_ERR(handle)) {
error = PTR_ERR(handle);
} else {
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 69eda787a96..aa25deb5c6c 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -125,74 +125,6 @@ extern int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
struct ext4_xattr_info *i,
struct ext4_xattr_ibody_find *is);
-extern int ext4_has_inline_data(struct inode *inode);
-extern int ext4_get_inline_size(struct inode *inode);
-extern int ext4_get_max_inline_size(struct inode *inode);
-extern int ext4_find_inline_data_nolock(struct inode *inode);
-extern void ext4_write_inline_data(struct inode *inode,
- struct ext4_iloc *iloc,
- void *buffer, loff_t pos,
- unsigned int len);
-extern int ext4_prepare_inline_data(handle_t *handle, struct inode *inode,
- unsigned int len);
-extern int ext4_init_inline_data(handle_t *handle, struct inode *inode,
- unsigned int len);
-extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode);
-
-extern int ext4_readpage_inline(struct inode *inode, struct page *page);
-extern int ext4_try_to_write_inline_data(struct address_space *mapping,
- struct inode *inode,
- loff_t pos, unsigned len,
- unsigned flags,
- struct page **pagep);
-extern int ext4_write_inline_data_end(struct inode *inode,
- loff_t pos, unsigned len,
- unsigned copied,
- struct page *page);
-extern struct buffer_head *
-ext4_journalled_write_inline_data(struct inode *inode,
- unsigned len,
- struct page *page);
-extern int ext4_da_write_inline_data_begin(struct address_space *mapping,
- struct inode *inode,
- loff_t pos, unsigned len,
- unsigned flags,
- struct page **pagep,
- void **fsdata);
-extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos,
- unsigned len, unsigned copied,
- struct page *page);
-extern int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry,
- struct inode *inode);
-extern int ext4_try_create_inline_dir(handle_t *handle,
- struct inode *parent,
- struct inode *inode);
-extern int ext4_read_inline_dir(struct file *filp,
- void *dirent, filldir_t filldir,
- int *has_inline_data);
-extern struct buffer_head *ext4_find_inline_entry(struct inode *dir,
- const struct qstr *d_name,
- struct ext4_dir_entry_2 **res_dir,
- int *has_inline_data);
-extern int ext4_delete_inline_entry(handle_t *handle,
- struct inode *dir,
- struct ext4_dir_entry_2 *de_del,
- struct buffer_head *bh,
- int *has_inline_data);
-extern int empty_inline_dir(struct inode *dir, int *has_inline_data);
-extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode,
- struct ext4_dir_entry_2 **parent_de,
- int *retval);
-extern int ext4_inline_data_fiemap(struct inode *inode,
- struct fiemap_extent_info *fieinfo,
- int *has_inline);
-extern int ext4_try_to_evict_inline_data(handle_t *handle,
- struct inode *inode,
- int needed);
-extern void ext4_inline_data_truncate(struct inode *inode, int *has_inline);
-
-extern int ext4_convert_inline_data(struct inode *inode);
-
#ifdef CONFIG_EXT4_FS_SECURITY
extern int ext4_init_security(handle_t *handle, struct inode *inode,
struct inode *dir, const struct qstr *qstr);
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index e95b94945d5..137af4255da 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -191,15 +191,14 @@ struct posix_acl *f2fs_get_acl(struct inode *inode, int type)
retval = f2fs_getxattr(inode, name_index, "", value, retval);
}
- if (retval < 0) {
- if (retval == -ENODATA)
- acl = NULL;
- else
- acl = ERR_PTR(retval);
- } else {
+ if (retval > 0)
acl = f2fs_acl_from_disk(value, retval);
- }
+ else if (retval == -ENODATA)
+ acl = NULL;
+ else
+ acl = ERR_PTR(retval);
kfree(value);
+
if (!IS_ERR(acl))
set_cached_acl(inode, type, acl);
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 6ef36c37e2b..2b6fc131e2c 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -72,22 +72,22 @@ static int f2fs_write_meta_page(struct page *page,
{
struct inode *inode = page->mapping->host;
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
- int err;
- wait_on_page_writeback(page);
-
- err = write_meta_page(sbi, page, wbc);
- if (err) {
+ /* Should not write any meta pages, if any IO error was occurred */
+ if (wbc->for_reclaim ||
+ is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG)) {
+ dec_page_count(sbi, F2FS_DIRTY_META);
wbc->pages_skipped++;
set_page_dirty(page);
+ return AOP_WRITEPAGE_ACTIVATE;
}
- dec_page_count(sbi, F2FS_DIRTY_META);
+ wait_on_page_writeback(page);
- /* In this case, we should not unlock this page */
- if (err != AOP_WRITEPAGE_ACTIVATE)
- unlock_page(page);
- return err;
+ write_meta_page(sbi, page);
+ dec_page_count(sbi, F2FS_DIRTY_META);
+ unlock_page(page);
+ return 0;
}
static int f2fs_write_meta_pages(struct address_space *mapping,
@@ -138,7 +138,10 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
BUG_ON(page->mapping != mapping);
BUG_ON(!PageDirty(page));
clear_page_dirty_for_io(page);
- f2fs_write_meta_page(page, &wbc);
+ if (f2fs_write_meta_page(page, &wbc)) {
+ unlock_page(page);
+ break;
+ }
if (nwritten++ >= nr_to_write)
break;
}
@@ -161,7 +164,6 @@ static int f2fs_set_meta_page_dirty(struct page *page)
if (!PageDirty(page)) {
__set_page_dirty_nobuffers(page);
inc_page_count(sbi, F2FS_DIRTY_META);
- F2FS_SET_SB_DIRT(sbi);
return 1;
}
return 0;
@@ -214,22 +216,13 @@ retry:
goto retry;
}
new->ino = ino;
- INIT_LIST_HEAD(&new->list);
/* add new_oentry into list which is sorted by inode number */
- if (orphan) {
- struct orphan_inode_entry *prev;
-
- /* get previous entry */
- prev = list_entry(orphan->list.prev, typeof(*prev), list);
- if (&prev->list != head)
- /* insert new orphan inode entry */
- list_add(&new->list, &prev->list);
- else
- list_add(&new->list, head);
- } else {
+ if (orphan)
+ list_add(&new->list, this->prev);
+ else
list_add_tail(&new->list, head);
- }
+
sbi->n_orphans++;
out:
mutex_unlock(&sbi->orphan_inode_mutex);
@@ -546,7 +539,7 @@ retry:
/*
* Freeze all the FS-operations for checkpoint.
*/
-void block_operations(struct f2fs_sb_info *sbi)
+static void block_operations(struct f2fs_sb_info *sbi)
{
int t;
struct writeback_control wbc = {
@@ -718,27 +711,24 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
sbi->alloc_valid_block_count = 0;
/* Here, we only have one bio having CP pack */
- if (is_set_ckpt_flags(ckpt, CP_ERROR_FLAG))
- sbi->sb->s_flags |= MS_RDONLY;
- else
- sync_meta_pages(sbi, META_FLUSH, LONG_MAX);
+ sync_meta_pages(sbi, META_FLUSH, LONG_MAX);
- clear_prefree_segments(sbi);
- F2FS_RESET_SB_DIRT(sbi);
+ if (!is_set_ckpt_flags(ckpt, CP_ERROR_FLAG)) {
+ clear_prefree_segments(sbi);
+ F2FS_RESET_SB_DIRT(sbi);
+ }
}
/*
* We guarantee that this checkpoint procedure should not fail.
*/
-void write_checkpoint(struct f2fs_sb_info *sbi, bool blocked, bool is_umount)
+void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
{
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
unsigned long long ckpt_ver;
- if (!blocked) {
- mutex_lock(&sbi->cp_mutex);
- block_operations(sbi);
- }
+ mutex_lock(&sbi->cp_mutex);
+ block_operations(sbi);
f2fs_submit_bio(sbi, DATA, true);
f2fs_submit_bio(sbi, NODE, true);
@@ -772,7 +762,7 @@ void init_orphan_info(struct f2fs_sb_info *sbi)
sbi->n_orphans = 0;
}
-int create_checkpoint_caches(void)
+int __init create_checkpoint_caches(void)
{
orphan_entry_slab = f2fs_kmem_cache_create("f2fs_orphan_entry",
sizeof(struct orphan_inode_entry), NULL);
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 3aa5ce7cab8..7bd22a20112 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -547,6 +547,15 @@ redirty_out:
#define MAX_DESIRED_PAGES_WP 4096
+static int __f2fs_writepage(struct page *page, struct writeback_control *wbc,
+ void *data)
+{
+ struct address_space *mapping = data;
+ int ret = mapping->a_ops->writepage(page, wbc);
+ mapping_set_error(mapping, ret);
+ return ret;
+}
+
static int f2fs_write_data_pages(struct address_space *mapping,
struct writeback_control *wbc)
{
@@ -563,7 +572,7 @@ static int f2fs_write_data_pages(struct address_space *mapping,
if (!S_ISDIR(inode->i_mode))
mutex_lock(&sbi->writepages);
- ret = generic_writepages(mapping, wbc);
+ ret = write_cache_pages(mapping, wbc, __f2fs_writepage, mapping);
if (!S_ISDIR(inode->i_mode))
mutex_unlock(&sbi->writepages);
f2fs_submit_bio(sbi, DATA, (wbc->sync_mode == WB_SYNC_ALL));
@@ -689,6 +698,11 @@ static int f2fs_set_data_page_dirty(struct page *page)
return 0;
}
+static sector_t f2fs_bmap(struct address_space *mapping, sector_t block)
+{
+ return generic_block_bmap(mapping, block, get_data_block_ro);
+}
+
const struct address_space_operations f2fs_dblock_aops = {
.readpage = f2fs_read_data_page,
.readpages = f2fs_read_data_pages,
@@ -700,4 +714,5 @@ const struct address_space_operations f2fs_dblock_aops = {
.invalidatepage = f2fs_invalidate_data_page,
.releasepage = f2fs_release_data_page,
.direct_IO = f2fs_direct_IO,
+ .bmap = f2fs_bmap,
};
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index 0e0380a588a..025b9e2f935 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -26,6 +26,7 @@
static LIST_HEAD(f2fs_stat_list);
static struct dentry *debugfs_root;
+static DEFINE_MUTEX(f2fs_stat_mutex);
static void update_general_status(struct f2fs_sb_info *sbi)
{
@@ -180,18 +181,16 @@ static int stat_show(struct seq_file *s, void *v)
int i = 0;
int j;
+ mutex_lock(&f2fs_stat_mutex);
list_for_each_entry_safe(si, next, &f2fs_stat_list, stat_list) {
+ char devname[BDEVNAME_SIZE];
- mutex_lock(&si->stat_lock);
- if (!si->sbi) {
- mutex_unlock(&si->stat_lock);
- continue;
- }
update_general_status(si->sbi);
- seq_printf(s, "\n=====[ partition info. #%d ]=====\n", i++);
- seq_printf(s, "[SB: 1] [CP: 2] [NAT: %d] [SIT: %d] ",
- si->nat_area_segs, si->sit_area_segs);
+ seq_printf(s, "\n=====[ partition info(%s). #%d ]=====\n",
+ bdevname(si->sbi->sb->s_bdev, devname), i++);
+ seq_printf(s, "[SB: 1] [CP: 2] [SIT: %d] [NAT: %d] ",
+ si->sit_area_segs, si->nat_area_segs);
seq_printf(s, "[SSA: %d] [MAIN: %d",
si->ssa_area_segs, si->main_area_segs);
seq_printf(s, "(OverProv:%d Resv:%d)]\n\n",
@@ -286,8 +285,8 @@ static int stat_show(struct seq_file *s, void *v)
seq_printf(s, "\nMemory: %u KB = static: %u + cached: %u\n",
(si->base_mem + si->cache_mem) >> 10,
si->base_mem >> 10, si->cache_mem >> 10);
- mutex_unlock(&si->stat_lock);
}
+ mutex_unlock(&f2fs_stat_mutex);
return 0;
}
@@ -303,7 +302,7 @@ static const struct file_operations stat_fops = {
.release = single_release,
};
-static int init_stats(struct f2fs_sb_info *sbi)
+int f2fs_build_stats(struct f2fs_sb_info *sbi)
{
struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
struct f2fs_stat_info *si;
@@ -313,9 +312,6 @@ static int init_stats(struct f2fs_sb_info *sbi)
return -ENOMEM;
si = sbi->stat_info;
- mutex_init(&si->stat_lock);
- list_add_tail(&si->stat_list, &f2fs_stat_list);
-
si->all_area_segs = le32_to_cpu(raw_super->segment_count);
si->sit_area_segs = le32_to_cpu(raw_super->segment_count_sit);
si->nat_area_segs = le32_to_cpu(raw_super->segment_count_nat);
@@ -325,21 +321,11 @@ static int init_stats(struct f2fs_sb_info *sbi)
si->main_area_zones = si->main_area_sections /
le32_to_cpu(raw_super->secs_per_zone);
si->sbi = sbi;
- return 0;
-}
-int f2fs_build_stats(struct f2fs_sb_info *sbi)
-{
- int retval;
-
- retval = init_stats(sbi);
- if (retval)
- return retval;
-
- if (!debugfs_root)
- debugfs_root = debugfs_create_dir("f2fs", NULL);
+ mutex_lock(&f2fs_stat_mutex);
+ list_add_tail(&si->stat_list, &f2fs_stat_list);
+ mutex_unlock(&f2fs_stat_mutex);
- debugfs_create_file("status", S_IRUGO, debugfs_root, NULL, &stat_fops);
return 0;
}
@@ -347,14 +333,22 @@ void f2fs_destroy_stats(struct f2fs_sb_info *sbi)
{
struct f2fs_stat_info *si = sbi->stat_info;
+ mutex_lock(&f2fs_stat_mutex);
list_del(&si->stat_list);
- mutex_lock(&si->stat_lock);
- si->sbi = NULL;
- mutex_unlock(&si->stat_lock);
+ mutex_unlock(&f2fs_stat_mutex);
+
kfree(sbi->stat_info);
}
-void destroy_root_stats(void)
+void __init f2fs_create_root_stats(void)
+{
+ debugfs_root = debugfs_create_dir("f2fs", NULL);
+ if (debugfs_root)
+ debugfs_create_file("status", S_IRUGO, debugfs_root,
+ NULL, &stat_fops);
+}
+
+void f2fs_destroy_root_stats(void)
{
debugfs_remove_recursive(debugfs_root);
debugfs_root = NULL;
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index fda0bcc0907..a1f38443ece 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -265,7 +265,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
mutex_unlock_op(sbi, DENTRY_OPS);
}
-void init_dent_inode(struct dentry *dentry, struct page *ipage)
+void init_dent_inode(const struct qstr *name, struct page *ipage)
{
struct f2fs_node *rn;
@@ -274,20 +274,19 @@ void init_dent_inode(struct dentry *dentry, struct page *ipage)
wait_on_page_writeback(ipage);
- /* copy dentry info. to this inode page */
+ /* copy name info. to this inode page */
rn = (struct f2fs_node *)page_address(ipage);
- rn->i.i_namelen = cpu_to_le32(dentry->d_name.len);
- memcpy(rn->i.i_name, dentry->d_name.name, dentry->d_name.len);
+ rn->i.i_namelen = cpu_to_le32(name->len);
+ memcpy(rn->i.i_name, name->name, name->len);
set_page_dirty(ipage);
}
-static int init_inode_metadata(struct inode *inode, struct dentry *dentry)
+static int init_inode_metadata(struct inode *inode,
+ struct inode *dir, const struct qstr *name)
{
- struct inode *dir = dentry->d_parent->d_inode;
-
if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) {
int err;
- err = new_inode_page(inode, dentry);
+ err = new_inode_page(inode, name);
if (err)
return err;
@@ -310,7 +309,7 @@ static int init_inode_metadata(struct inode *inode, struct dentry *dentry)
if (IS_ERR(ipage))
return PTR_ERR(ipage);
set_cold_node(inode, ipage);
- init_dent_inode(dentry, ipage);
+ init_dent_inode(name, ipage);
f2fs_put_page(ipage, 1);
}
if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) {
@@ -371,7 +370,7 @@ next:
goto next;
}
-int f2fs_add_link(struct dentry *dentry, struct inode *inode)
+int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct inode *inode)
{
unsigned int bit_pos;
unsigned int level;
@@ -380,17 +379,15 @@ int f2fs_add_link(struct dentry *dentry, struct inode *inode)
f2fs_hash_t dentry_hash;
struct f2fs_dir_entry *de;
unsigned int nbucket, nblock;
- struct inode *dir = dentry->d_parent->d_inode;
struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
- const char *name = dentry->d_name.name;
- size_t namelen = dentry->d_name.len;
+ size_t namelen = name->len;
struct page *dentry_page = NULL;
struct f2fs_dentry_block *dentry_blk = NULL;
int slots = GET_DENTRY_SLOTS(namelen);
int err = 0;
int i;
- dentry_hash = f2fs_dentry_hash(name, dentry->d_name.len);
+ dentry_hash = f2fs_dentry_hash(name->name, name->len);
level = 0;
current_depth = F2FS_I(dir)->i_current_depth;
if (F2FS_I(dir)->chash == dentry_hash) {
@@ -433,7 +430,7 @@ start:
++level;
goto start;
add_dentry:
- err = init_inode_metadata(inode, dentry);
+ err = init_inode_metadata(inode, dir, name);
if (err)
goto fail;
@@ -442,7 +439,7 @@ add_dentry:
de = &dentry_blk->dentry[bit_pos];
de->hash_code = dentry_hash;
de->name_len = cpu_to_le16(namelen);
- memcpy(dentry_blk->filename[bit_pos], name, namelen);
+ memcpy(dentry_blk->filename[bit_pos], name->name, name->len);
de->ino = cpu_to_le32(inode->i_ino);
set_de_type(de, inode);
for (i = 0; i < slots; i++)
@@ -503,7 +500,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
}
if (inode) {
- inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+ inode->i_ctime = CURRENT_TIME;
drop_nlink(inode);
if (S_ISDIR(inode->i_mode)) {
drop_nlink(inode);
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 13c6dfbb718..cc2213afdcc 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -104,6 +104,20 @@ static inline int update_sits_in_cursum(struct f2fs_summary_block *rs, int i)
}
/*
+ * ioctl commands
+ */
+#define F2FS_IOC_GETFLAGS FS_IOC_GETFLAGS
+#define F2FS_IOC_SETFLAGS FS_IOC_SETFLAGS
+
+#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
+/*
+ * ioctl commands in 32 bit emulation
+ */
+#define F2FS_IOC32_GETFLAGS FS_IOC32_GETFLAGS
+#define F2FS_IOC32_SETFLAGS FS_IOC32_SETFLAGS
+#endif
+
+/*
* For INODE and NODE manager
*/
#define XATTR_NODE_OFFSET (-1) /*
@@ -141,7 +155,7 @@ struct f2fs_inode_info {
/* Use below internally in f2fs*/
unsigned long flags; /* use to pass per-file flags */
- unsigned long long data_version;/* lastes version of data for fsync */
+ unsigned long long data_version;/* latest version of data for fsync */
atomic_t dirty_dents; /* # of dirty dentry pages */
f2fs_hash_t chash; /* hash value of given file name */
unsigned int clevel; /* maximum level of given file name */
@@ -211,11 +225,11 @@ struct dnode_of_data {
static inline void set_new_dnode(struct dnode_of_data *dn, struct inode *inode,
struct page *ipage, struct page *npage, nid_t nid)
{
+ memset(dn, 0, sizeof(*dn));
dn->inode = inode;
dn->inode_page = ipage;
dn->node_page = npage;
dn->nid = nid;
- dn->inode_page_locked = 0;
}
/*
@@ -573,6 +587,14 @@ static inline int get_pages(struct f2fs_sb_info *sbi, int count_type)
return atomic_read(&sbi->nr_pages[count_type]);
}
+static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type)
+{
+ unsigned int pages_per_sec = sbi->segs_per_sec *
+ (1 << sbi->log_blocks_per_seg);
+ return ((get_pages(sbi, block_type) + pages_per_sec - 1)
+ >> sbi->log_blocks_per_seg) / sbi->segs_per_sec;
+}
+
static inline block_t valid_user_blocks(struct f2fs_sb_info *sbi)
{
block_t ret;
@@ -842,12 +864,12 @@ void f2fs_truncate(struct inode *);
int f2fs_setattr(struct dentry *, struct iattr *);
int truncate_hole(struct inode *, pgoff_t, pgoff_t);
long f2fs_ioctl(struct file *, unsigned int, unsigned long);
+long f2fs_compat_ioctl(struct file *, unsigned int, unsigned long);
/*
* inode.c
*/
void f2fs_set_inode_flags(struct inode *);
-struct inode *f2fs_iget_nowait(struct super_block *, unsigned long);
struct inode *f2fs_iget(struct super_block *, unsigned long);
void update_inode(struct inode *, struct page *);
int f2fs_write_inode(struct inode *, struct writeback_control *);
@@ -867,16 +889,24 @@ struct f2fs_dir_entry *f2fs_parent_dir(struct inode *, struct page **);
ino_t f2fs_inode_by_name(struct inode *, struct qstr *);
void f2fs_set_link(struct inode *, struct f2fs_dir_entry *,
struct page *, struct inode *);
-void init_dent_inode(struct dentry *, struct page *);
-int f2fs_add_link(struct dentry *, struct inode *);
+void init_dent_inode(const struct qstr *, struct page *);
+int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *);
void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *);
int f2fs_make_empty(struct inode *, struct inode *);
bool f2fs_empty_dir(struct inode *);
+static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode)
+{
+ return __f2fs_add_link(dentry->d_parent->d_inode, &dentry->d_name,
+ inode);
+}
+
/*
* super.c
*/
int f2fs_sync_fs(struct super_block *, int);
+extern __printf(3, 4)
+void f2fs_msg(struct super_block *, const char *, const char *, ...);
/*
* hash.c
@@ -894,7 +924,7 @@ void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *);
int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int);
int truncate_inode_blocks(struct inode *, pgoff_t);
int remove_inode_page(struct inode *);
-int new_inode_page(struct inode *, struct dentry *);
+int new_inode_page(struct inode *, const struct qstr *);
struct page *new_node_page(struct dnode_of_data *, unsigned int);
void ra_node_page(struct f2fs_sb_info *, nid_t);
struct page *get_node_page(struct f2fs_sb_info *, pgoff_t);
@@ -912,7 +942,7 @@ int restore_node_summary(struct f2fs_sb_info *, unsigned int,
void flush_nat_entries(struct f2fs_sb_info *);
int build_node_manager(struct f2fs_sb_info *);
void destroy_node_manager(struct f2fs_sb_info *);
-int create_node_manager_caches(void);
+int __init create_node_manager_caches(void);
void destroy_node_manager_caches(void);
/*
@@ -927,8 +957,7 @@ void allocate_new_segments(struct f2fs_sb_info *);
struct page *get_sum_page(struct f2fs_sb_info *, unsigned int);
struct bio *f2fs_bio_alloc(struct block_device *, int);
void f2fs_submit_bio(struct f2fs_sb_info *, enum page_type, bool sync);
-int write_meta_page(struct f2fs_sb_info *, struct page *,
- struct writeback_control *);
+void write_meta_page(struct f2fs_sb_info *, struct page *);
void write_node_page(struct f2fs_sb_info *, struct page *, unsigned int,
block_t, block_t *);
void write_data_page(struct inode *, struct page *, struct dnode_of_data*,
@@ -961,10 +990,9 @@ int get_valid_checkpoint(struct f2fs_sb_info *);
void set_dirty_dir_page(struct inode *, struct page *);
void remove_dirty_dir_inode(struct inode *);
void sync_dirty_dir_inodes(struct f2fs_sb_info *);
-void block_operations(struct f2fs_sb_info *);
-void write_checkpoint(struct f2fs_sb_info *, bool, bool);
+void write_checkpoint(struct f2fs_sb_info *, bool);
void init_orphan_info(struct f2fs_sb_info *);
-int create_checkpoint_caches(void);
+int __init create_checkpoint_caches(void);
void destroy_checkpoint_caches(void);
/*
@@ -984,9 +1012,9 @@ int do_write_data_page(struct page *);
int start_gc_thread(struct f2fs_sb_info *);
void stop_gc_thread(struct f2fs_sb_info *);
block_t start_bidx_of_node(unsigned int);
-int f2fs_gc(struct f2fs_sb_info *, int);
+int f2fs_gc(struct f2fs_sb_info *);
void build_gc_manager(struct f2fs_sb_info *);
-int create_gc_caches(void);
+int __init create_gc_caches(void);
void destroy_gc_caches(void);
/*
@@ -1058,7 +1086,8 @@ struct f2fs_stat_info {
int f2fs_build_stats(struct f2fs_sb_info *);
void f2fs_destroy_stats(struct f2fs_sb_info *);
-void destroy_root_stats(void);
+void __init f2fs_create_root_stats(void);
+void f2fs_destroy_root_stats(void);
#else
#define stat_inc_call_count(si)
#define stat_inc_seg_count(si, type)
@@ -1068,7 +1097,8 @@ void destroy_root_stats(void);
static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; }
static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { }
-static inline void destroy_root_stats(void) { }
+static inline void __init f2fs_create_root_stats(void) { }
+static inline void f2fs_destroy_root_stats(void) { }
#endif
extern const struct file_operations f2fs_dir_operations;
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 7f9ea9271eb..b7a053d4c6d 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -15,6 +15,7 @@
#include <linux/writeback.h>
#include <linux/falloc.h>
#include <linux/types.h>
+#include <linux/compat.h>
#include <linux/uaccess.h>
#include <linux/mount.h>
@@ -96,8 +97,9 @@ out:
}
static const struct vm_operations_struct f2fs_file_vm_ops = {
- .fault = filemap_fault,
- .page_mkwrite = f2fs_vm_page_mkwrite,
+ .fault = filemap_fault,
+ .page_mkwrite = f2fs_vm_page_mkwrite,
+ .remap_pages = generic_file_remap_pages,
};
static int need_to_sync_dir(struct f2fs_sb_info *sbi, struct inode *inode)
@@ -137,6 +139,9 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
if (ret)
return ret;
+ /* guarantee free sections for fsync */
+ f2fs_balance_fs(sbi);
+
mutex_lock(&inode->i_mutex);
if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
@@ -153,11 +158,11 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1)
need_cp = true;
- if (is_inode_flag_set(F2FS_I(inode), FI_NEED_CP))
+ else if (is_inode_flag_set(F2FS_I(inode), FI_NEED_CP))
need_cp = true;
- if (!space_for_roll_forward(sbi))
+ else if (!space_for_roll_forward(sbi))
need_cp = true;
- if (need_to_sync_dir(sbi, inode))
+ else if (need_to_sync_dir(sbi, inode))
need_cp = true;
if (need_cp) {
@@ -294,8 +299,6 @@ void f2fs_truncate(struct inode *inode)
inode->i_mtime = inode->i_ctime = CURRENT_TIME;
mark_inode_dirty(inode);
}
-
- f2fs_balance_fs(F2FS_SB(inode->i_sb));
}
static int f2fs_getattr(struct vfsmount *mnt,
@@ -352,6 +355,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
attr->ia_size != i_size_read(inode)) {
truncate_setsize(inode, attr->ia_size);
f2fs_truncate(inode);
+ f2fs_balance_fs(F2FS_SB(inode->i_sb));
}
__setattr_copy(inode, attr);
@@ -383,12 +387,17 @@ const struct inode_operations f2fs_file_inode_operations = {
static void fill_zero(struct inode *inode, pgoff_t index,
loff_t start, loff_t len)
{
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
struct page *page;
if (!len)
return;
+ f2fs_balance_fs(sbi);
+
+ mutex_lock_op(sbi, DATA_NEW);
page = get_new_data_page(inode, index, false);
+ mutex_unlock_op(sbi, DATA_NEW);
if (!IS_ERR(page)) {
wait_on_page_writeback(page);
@@ -407,6 +416,8 @@ int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end)
struct dnode_of_data dn;
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ f2fs_balance_fs(sbi);
+
mutex_lock_op(sbi, DATA_TRUNC);
set_new_dnode(&dn, inode, NULL, NULL, 0);
err = get_dnode_of_data(&dn, index, RDONLY_NODE);
@@ -534,7 +545,6 @@ static long f2fs_fallocate(struct file *file, int mode,
loff_t offset, loff_t len)
{
struct inode *inode = file->f_path.dentry->d_inode;
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
long ret;
if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
@@ -545,7 +555,10 @@ static long f2fs_fallocate(struct file *file, int mode,
else
ret = expand_inode_data(inode, offset, len, mode);
- f2fs_balance_fs(sbi);
+ if (!ret) {
+ inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ mark_inode_dirty(inode);
+ }
return ret;
}
@@ -622,6 +635,23 @@ out:
}
}
+#ifdef CONFIG_COMPAT
+long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ switch (cmd) {
+ case F2FS_IOC32_GETFLAGS:
+ cmd = F2FS_IOC_GETFLAGS;
+ break;
+ case F2FS_IOC32_SETFLAGS:
+ cmd = F2FS_IOC_SETFLAGS;
+ break;
+ default:
+ return -ENOIOCTLCMD;
+ }
+ return f2fs_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
+}
+#endif
+
const struct file_operations f2fs_file_operations = {
.llseek = generic_file_llseek,
.read = do_sync_read,
@@ -633,6 +663,9 @@ const struct file_operations f2fs_file_operations = {
.fsync = f2fs_sync_file,
.fallocate = f2fs_fallocate,
.unlocked_ioctl = f2fs_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = f2fs_compat_ioctl,
+#endif
.splice_read = generic_file_splice_read,
.splice_write = generic_file_splice_write,
};
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index b0ec721e984..94b8a0c4845 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -44,10 +44,10 @@ static int gc_thread_func(void *data)
if (kthread_should_stop())
break;
- f2fs_balance_fs(sbi);
-
- if (!test_opt(sbi, BG_GC))
+ if (sbi->sb->s_writers.frozen >= SB_FREEZE_WRITE) {
+ wait_ms = GC_THREAD_MAX_SLEEP_TIME;
continue;
+ }
/*
* [GC triggering condition]
@@ -78,7 +78,8 @@ static int gc_thread_func(void *data)
sbi->bg_gc++;
- if (f2fs_gc(sbi, 1) == GC_NONE)
+ /* if return value is not zero, no victim was selected */
+ if (f2fs_gc(sbi))
wait_ms = GC_THREAD_NOGC_SLEEP_TIME;
else if (wait_ms == GC_THREAD_NOGC_SLEEP_TIME)
wait_ms = GC_THREAD_MAX_SLEEP_TIME;
@@ -90,7 +91,10 @@ static int gc_thread_func(void *data)
int start_gc_thread(struct f2fs_sb_info *sbi)
{
struct f2fs_gc_kthread *gc_th;
+ dev_t dev = sbi->sb->s_bdev->bd_dev;
+ if (!test_opt(sbi, BG_GC))
+ return 0;
gc_th = kmalloc(sizeof(struct f2fs_gc_kthread), GFP_KERNEL);
if (!gc_th)
return -ENOMEM;
@@ -98,9 +102,10 @@ int start_gc_thread(struct f2fs_sb_info *sbi)
sbi->gc_thread = gc_th;
init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head);
sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi,
- GC_THREAD_NAME);
+ "f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev));
if (IS_ERR(gc_th->f2fs_gc_task)) {
kfree(gc_th);
+ sbi->gc_thread = NULL;
return -ENOMEM;
}
return 0;
@@ -141,6 +146,9 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type,
static unsigned int get_max_cost(struct f2fs_sb_info *sbi,
struct victim_sel_policy *p)
{
+ /* SSR allocates in a segment unit */
+ if (p->alloc_mode == SSR)
+ return 1 << sbi->log_blocks_per_seg;
if (p->gc_mode == GC_GREEDY)
return (1 << sbi->log_blocks_per_seg) * p->ofs_unit;
else if (p->gc_mode == GC_CB)
@@ -356,7 +364,7 @@ static int check_valid_map(struct f2fs_sb_info *sbi,
sentry = get_seg_entry(sbi, segno);
ret = f2fs_test_bit(offset, sentry->cur_valid_map);
mutex_unlock(&sit_i->sentry_lock);
- return ret ? GC_OK : GC_NEXT;
+ return ret;
}
/*
@@ -364,7 +372,7 @@ static int check_valid_map(struct f2fs_sb_info *sbi,
* On validity, copy that node with cold status, otherwise (invalid node)
* ignore that.
*/
-static int gc_node_segment(struct f2fs_sb_info *sbi,
+static void gc_node_segment(struct f2fs_sb_info *sbi,
struct f2fs_summary *sum, unsigned int segno, int gc_type)
{
bool initial = true;
@@ -376,21 +384,12 @@ next_step:
for (off = 0; off < sbi->blocks_per_seg; off++, entry++) {
nid_t nid = le32_to_cpu(entry->nid);
struct page *node_page;
- int err;
- /*
- * It makes sure that free segments are able to write
- * all the dirty node pages before CP after this CP.
- * So let's check the space of dirty node pages.
- */
- if (should_do_checkpoint(sbi)) {
- mutex_lock(&sbi->cp_mutex);
- block_operations(sbi);
- return GC_BLOCKED;
- }
+ /* stop BG_GC if there is not enough free sections. */
+ if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0))
+ return;
- err = check_valid_map(sbi, segno, off);
- if (err == GC_NEXT)
+ if (check_valid_map(sbi, segno, off) == 0)
continue;
if (initial) {
@@ -420,11 +419,14 @@ next_step:
};
sync_node_pages(sbi, 0, &wbc);
}
- return GC_DONE;
}
/*
- * Calculate start block index that this node page contains
+ * Calculate start block index indicating the given node offset.
+ * Be careful, caller should give this node offset only indicating direct node
+ * blocks. If any node offsets, which point the other types of node blocks such
+ * as indirect or double indirect node blocks, are given, it must be a caller's
+ * bug.
*/
block_t start_bidx_of_node(unsigned int node_ofs)
{
@@ -459,13 +461,13 @@ static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
node_page = get_node_page(sbi, nid);
if (IS_ERR(node_page))
- return GC_NEXT;
+ return 0;
get_node_info(sbi, nid, dni);
if (sum->version != dni->version) {
f2fs_put_page(node_page, 1);
- return GC_NEXT;
+ return 0;
}
*nofs = ofs_of_node(node_page);
@@ -473,8 +475,8 @@ static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
f2fs_put_page(node_page, 1);
if (source_blkaddr != blkaddr)
- return GC_NEXT;
- return GC_OK;
+ return 0;
+ return 1;
}
static void move_data_page(struct inode *inode, struct page *page, int gc_type)
@@ -515,13 +517,13 @@ out:
* If the parent node is not valid or the data block address is different,
* the victim data block is ignored.
*/
-static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
+static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
struct list_head *ilist, unsigned int segno, int gc_type)
{
struct super_block *sb = sbi->sb;
struct f2fs_summary *entry;
block_t start_addr;
- int err, off;
+ int off;
int phase = 0;
start_addr = START_BLOCK(sbi, segno);
@@ -535,20 +537,11 @@ next_step:
unsigned int ofs_in_node, nofs;
block_t start_bidx;
- /*
- * It makes sure that free segments are able to write
- * all the dirty node pages before CP after this CP.
- * So let's check the space of dirty node pages.
- */
- if (should_do_checkpoint(sbi)) {
- mutex_lock(&sbi->cp_mutex);
- block_operations(sbi);
- err = GC_BLOCKED;
- goto stop;
- }
+ /* stop BG_GC if there is not enough free sections. */
+ if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0))
+ return;
- err = check_valid_map(sbi, segno, off);
- if (err == GC_NEXT)
+ if (check_valid_map(sbi, segno, off) == 0)
continue;
if (phase == 0) {
@@ -557,8 +550,7 @@ next_step:
}
/* Get an inode by ino with checking validity */
- err = check_dnode(sbi, entry, &dni, start_addr + off, &nofs);
- if (err == GC_NEXT)
+ if (check_dnode(sbi, entry, &dni, start_addr + off, &nofs) == 0)
continue;
if (phase == 1) {
@@ -570,7 +562,7 @@ next_step:
ofs_in_node = le16_to_cpu(entry->ofs_in_node);
if (phase == 2) {
- inode = f2fs_iget_nowait(sb, dni.ino);
+ inode = f2fs_iget(sb, dni.ino);
if (IS_ERR(inode))
continue;
@@ -598,11 +590,9 @@ next_iput:
}
if (++phase < 4)
goto next_step;
- err = GC_DONE;
-stop:
+
if (gc_type == FG_GC)
f2fs_submit_bio(sbi, DATA, true);
- return err;
}
static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim,
@@ -616,17 +606,16 @@ static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim,
return ret;
}
-static int do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno,
+static void do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno,
struct list_head *ilist, int gc_type)
{
struct page *sum_page;
struct f2fs_summary_block *sum;
- int ret = GC_DONE;
/* read segment summary of victim */
sum_page = get_sum_page(sbi, segno);
if (IS_ERR(sum_page))
- return GC_ERROR;
+ return;
/*
* CP needs to lock sum_page. In this time, we don't need
@@ -638,76 +627,55 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno,
switch (GET_SUM_TYPE((&sum->footer))) {
case SUM_TYPE_NODE:
- ret = gc_node_segment(sbi, sum->entries, segno, gc_type);
+ gc_node_segment(sbi, sum->entries, segno, gc_type);
break;
case SUM_TYPE_DATA:
- ret = gc_data_segment(sbi, sum->entries, ilist, segno, gc_type);
+ gc_data_segment(sbi, sum->entries, ilist, segno, gc_type);
break;
}
stat_inc_seg_count(sbi, GET_SUM_TYPE((&sum->footer)));
stat_inc_call_count(sbi->stat_info);
f2fs_put_page(sum_page, 0);
- return ret;
}
-int f2fs_gc(struct f2fs_sb_info *sbi, int nGC)
+int f2fs_gc(struct f2fs_sb_info *sbi)
{
- unsigned int segno;
- int old_free_secs, cur_free_secs;
- int gc_status, nfree;
struct list_head ilist;
+ unsigned int segno, i;
int gc_type = BG_GC;
+ int nfree = 0;
+ int ret = -1;
INIT_LIST_HEAD(&ilist);
gc_more:
- nfree = 0;
- gc_status = GC_NONE;
+ if (!(sbi->sb->s_flags & MS_ACTIVE))
+ goto stop;
- if (has_not_enough_free_secs(sbi))
- old_free_secs = reserved_sections(sbi);
- else
- old_free_secs = free_sections(sbi);
+ if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree))
+ gc_type = FG_GC;
- while (sbi->sb->s_flags & MS_ACTIVE) {
- int i;
- if (has_not_enough_free_secs(sbi))
- gc_type = FG_GC;
+ if (!__get_victim(sbi, &segno, gc_type, NO_CHECK_TYPE))
+ goto stop;
+ ret = 0;
- cur_free_secs = free_sections(sbi) + nfree;
+ for (i = 0; i < sbi->segs_per_sec; i++)
+ do_garbage_collect(sbi, segno + i, &ilist, gc_type);
- /* We got free space successfully. */
- if (nGC < cur_free_secs - old_free_secs)
- break;
+ if (gc_type == FG_GC &&
+ get_valid_blocks(sbi, segno, sbi->segs_per_sec) == 0)
+ nfree++;
- if (!__get_victim(sbi, &segno, gc_type, NO_CHECK_TYPE))
- break;
+ if (has_not_enough_free_secs(sbi, nfree))
+ goto gc_more;
- for (i = 0; i < sbi->segs_per_sec; i++) {
- /*
- * do_garbage_collect will give us three gc_status:
- * GC_ERROR, GC_DONE, and GC_BLOCKED.
- * If GC is finished uncleanly, we have to return
- * the victim to dirty segment list.
- */
- gc_status = do_garbage_collect(sbi, segno + i,
- &ilist, gc_type);
- if (gc_status != GC_DONE)
- goto stop;
- nfree++;
- }
- }
+ if (gc_type == FG_GC)
+ write_checkpoint(sbi, false);
stop:
- if (has_not_enough_free_secs(sbi) || gc_status == GC_BLOCKED) {
- write_checkpoint(sbi, (gc_status == GC_BLOCKED), false);
- if (nfree)
- goto gc_more;
- }
mutex_unlock(&sbi->gc_mutex);
put_gc_inode(&ilist);
- BUG_ON(!list_empty(&ilist));
- return gc_status;
+ return ret;
}
void build_gc_manager(struct f2fs_sb_info *sbi)
@@ -715,7 +683,7 @@ void build_gc_manager(struct f2fs_sb_info *sbi)
DIRTY_I(sbi)->v_ops = &default_v_ops;
}
-int create_gc_caches(void)
+int __init create_gc_caches(void)
{
winode_slab = f2fs_kmem_cache_create("f2fs_gc_inodes",
sizeof(struct inode_entry), NULL);
diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h
index b026d9354cc..30b2db003ac 100644
--- a/fs/f2fs/gc.h
+++ b/fs/f2fs/gc.h
@@ -8,7 +8,6 @@
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
-#define GC_THREAD_NAME "f2fs_gc_task"
#define GC_THREAD_MIN_WB_PAGES 1 /*
* a threshold to determine
* whether IO subsystem is idle
@@ -23,15 +22,6 @@
/* Search max. number of dirty segments to select a victim segment */
#define MAX_VICTIM_SEARCH 20
-enum {
- GC_NONE = 0,
- GC_ERROR,
- GC_OK,
- GC_NEXT,
- GC_BLOCKED,
- GC_DONE,
-};
-
struct f2fs_gc_kthread {
struct task_struct *f2fs_gc_task;
wait_queue_head_t gc_wait_queue_head;
@@ -104,14 +94,3 @@ static inline int is_idle(struct f2fs_sb_info *sbi)
struct request_list *rl = &q->root_rl;
return !(rl->count[BLK_RW_SYNC]) && !(rl->count[BLK_RW_ASYNC]);
}
-
-static inline bool should_do_checkpoint(struct f2fs_sb_info *sbi)
-{
- unsigned int pages_per_sec = sbi->segs_per_sec *
- (1 << sbi->log_blocks_per_seg);
- int node_secs = ((get_pages(sbi, F2FS_DIRTY_NODES) + pages_per_sec - 1)
- >> sbi->log_blocks_per_seg) / sbi->segs_per_sec;
- int dent_secs = ((get_pages(sbi, F2FS_DIRTY_DENTS) + pages_per_sec - 1)
- >> sbi->log_blocks_per_seg) / sbi->segs_per_sec;
- return free_sections(sbi) <= (node_secs + 2 * dent_secs + 2);
-}
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index bf20b4d0321..ddae412d30c 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -16,11 +16,6 @@
#include "f2fs.h"
#include "node.h"
-struct f2fs_iget_args {
- u64 ino;
- int on_free;
-};
-
void f2fs_set_inode_flags(struct inode *inode)
{
unsigned int flags = F2FS_I(inode)->i_flags;
@@ -40,34 +35,6 @@ void f2fs_set_inode_flags(struct inode *inode)
inode->i_flags |= S_DIRSYNC;
}
-static int f2fs_iget_test(struct inode *inode, void *data)
-{
- struct f2fs_iget_args *args = data;
-
- if (inode->i_ino != args->ino)
- return 0;
- if (inode->i_state & (I_FREEING | I_WILL_FREE)) {
- args->on_free = 1;
- return 0;
- }
- return 1;
-}
-
-struct inode *f2fs_iget_nowait(struct super_block *sb, unsigned long ino)
-{
- struct f2fs_iget_args args = {
- .ino = ino,
- .on_free = 0
- };
- struct inode *inode = ilookup5(sb, ino, f2fs_iget_test, &args);
-
- if (inode)
- return inode;
- if (!args.on_free)
- return f2fs_iget(sb, ino);
- return ERR_PTR(-ENOENT);
-}
-
static int do_read_inode(struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
@@ -100,6 +67,10 @@ static int do_read_inode(struct inode *inode)
inode->i_ctime.tv_nsec = le32_to_cpu(ri->i_ctime_nsec);
inode->i_mtime.tv_nsec = le32_to_cpu(ri->i_mtime_nsec);
inode->i_generation = le32_to_cpu(ri->i_generation);
+ if (ri->i_addr[0])
+ inode->i_rdev = old_decode_dev(le32_to_cpu(ri->i_addr[0]));
+ else
+ inode->i_rdev = new_decode_dev(le32_to_cpu(ri->i_addr[1]));
fi->i_current_depth = le32_to_cpu(ri->i_current_depth);
fi->i_xattr_nid = le32_to_cpu(ri->i_xattr_nid);
@@ -203,6 +174,20 @@ void update_inode(struct inode *inode, struct page *node_page)
ri->i_flags = cpu_to_le32(F2FS_I(inode)->i_flags);
ri->i_pino = cpu_to_le32(F2FS_I(inode)->i_pino);
ri->i_generation = cpu_to_le32(inode->i_generation);
+
+ if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
+ if (old_valid_dev(inode->i_rdev)) {
+ ri->i_addr[0] =
+ cpu_to_le32(old_encode_dev(inode->i_rdev));
+ ri->i_addr[1] = 0;
+ } else {
+ ri->i_addr[0] = 0;
+ ri->i_addr[1] =
+ cpu_to_le32(new_encode_dev(inode->i_rdev));
+ ri->i_addr[2] = 0;
+ }
+ }
+
set_cold_node(inode, node_page);
set_page_dirty(node_page);
}
@@ -217,6 +202,9 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
inode->i_ino == F2FS_META_INO(sbi))
return 0;
+ if (wbc)
+ f2fs_balance_fs(sbi);
+
node_page = get_node_page(sbi, inode->i_ino);
if (IS_ERR(node_page))
return PTR_ERR(node_page);
@@ -257,6 +245,7 @@ void f2fs_evict_inode(struct inode *inode)
if (inode->i_nlink || is_bad_inode(inode))
goto no_delete;
+ sb_start_intwrite(inode->i_sb);
set_inode_flag(F2FS_I(inode), FI_NO_ALLOC);
i_size_write(inode, 0);
@@ -264,6 +253,7 @@ void f2fs_evict_inode(struct inode *inode)
f2fs_truncate(inode);
remove_inode_page(inode);
+ sb_end_intwrite(inode->i_sb);
no_delete:
clear_inode(inode);
}
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 5066bfd256c..e275218904e 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -104,7 +104,7 @@ static void ra_nat_pages(struct f2fs_sb_info *sbi, int nid)
f2fs_put_page(page, 1);
continue;
}
- page_cache_release(page);
+ f2fs_put_page(page, 0);
}
}
@@ -660,7 +660,7 @@ int truncate_inode_blocks(struct inode *inode, pgoff_t from)
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
int err = 0, cont = 1;
int level, offset[4], noffset[4];
- unsigned int nofs;
+ unsigned int nofs = 0;
struct f2fs_node *rn;
struct dnode_of_data dn;
struct page *page;
@@ -780,7 +780,7 @@ int remove_inode_page(struct inode *inode)
return 0;
}
-int new_inode_page(struct inode *inode, struct dentry *dentry)
+int new_inode_page(struct inode *inode, const struct qstr *name)
{
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
struct page *page;
@@ -790,7 +790,7 @@ int new_inode_page(struct inode *inode, struct dentry *dentry)
set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino);
mutex_lock_op(sbi, NODE_NEW);
page = new_node_page(&dn, 0);
- init_dent_inode(dentry, page);
+ init_dent_inode(name, page);
mutex_unlock_op(sbi, NODE_NEW);
if (IS_ERR(page))
return PTR_ERR(page);
@@ -874,15 +874,11 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid)
return;
if (read_node_page(apage, READA))
- goto unlock_out;
+ unlock_page(apage);
- page_cache_release(apage);
- return;
-
-unlock_out:
- unlock_page(apage);
release_out:
- page_cache_release(apage);
+ f2fs_put_page(apage, 0);
+ return;
}
struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid)
@@ -1124,6 +1120,12 @@ static int f2fs_write_node_page(struct page *page,
return 0;
}
+/*
+ * It is very important to gather dirty pages and write at once, so that we can
+ * submit a big bio without interfering other data writes.
+ * Be default, 512 pages (2MB), a segment size, is quite reasonable.
+ */
+#define COLLECT_DIRTY_NODES 512
static int f2fs_write_node_pages(struct address_space *mapping,
struct writeback_control *wbc)
{
@@ -1131,17 +1133,16 @@ static int f2fs_write_node_pages(struct address_space *mapping,
struct block_device *bdev = sbi->sb->s_bdev;
long nr_to_write = wbc->nr_to_write;
- if (wbc->for_kupdate)
- return 0;
-
- if (get_pages(sbi, F2FS_DIRTY_NODES) == 0)
- return 0;
-
+ /* First check balancing cached NAT entries */
if (try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK)) {
- write_checkpoint(sbi, false, false);
+ write_checkpoint(sbi, false);
return 0;
}
+ /* collect a number of dirty node pages and write together */
+ if (get_pages(sbi, F2FS_DIRTY_NODES) < COLLECT_DIRTY_NODES)
+ return 0;
+
/* if mounting is failed, skip writing node pages */
wbc->nr_to_write = bio_get_nr_vecs(bdev);
sync_node_pages(sbi, 0, wbc);
@@ -1732,7 +1733,7 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
kfree(nm_i);
}
-int create_node_manager_caches(void)
+int __init create_node_manager_caches(void)
{
nat_entry_slab = f2fs_kmem_cache_create("nat_entry",
sizeof(struct nat_entry), NULL);
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index b571fee677d..b235215ac13 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -42,7 +42,7 @@ static int recover_dentry(struct page *ipage, struct inode *inode)
{
struct f2fs_node *raw_node = (struct f2fs_node *)kmap(ipage);
struct f2fs_inode *raw_inode = &(raw_node->i);
- struct dentry dent, parent;
+ struct qstr name;
struct f2fs_dir_entry *de;
struct page *page;
struct inode *dir;
@@ -57,17 +57,15 @@ static int recover_dentry(struct page *ipage, struct inode *inode)
goto out;
}
- parent.d_inode = dir;
- dent.d_parent = &parent;
- dent.d_name.len = le32_to_cpu(raw_inode->i_namelen);
- dent.d_name.name = raw_inode->i_name;
+ name.len = le32_to_cpu(raw_inode->i_namelen);
+ name.name = raw_inode->i_name;
- de = f2fs_find_entry(dir, &dent.d_name, &page);
+ de = f2fs_find_entry(dir, &name, &page);
if (de) {
kunmap(page);
f2fs_put_page(page, 0);
} else {
- f2fs_add_link(&dent, inode);
+ err = __f2fs_add_link(dir, &name, inode);
}
iput(dir);
out:
@@ -151,7 +149,6 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
goto out;
}
- INIT_LIST_HEAD(&entry->list);
list_add_tail(&entry->list, head);
entry->blkaddr = blkaddr;
}
@@ -174,10 +171,9 @@ out:
static void destroy_fsync_dnodes(struct f2fs_sb_info *sbi,
struct list_head *head)
{
- struct list_head *this;
- struct fsync_inode_entry *entry;
- list_for_each(this, head) {
- entry = list_entry(this, struct fsync_inode_entry, list);
+ struct fsync_inode_entry *entry, *tmp;
+
+ list_for_each_entry_safe(entry, tmp, head, list) {
iput(entry->inode);
list_del(&entry->list);
kmem_cache_free(fsync_entry_slab, entry);
@@ -228,7 +224,7 @@ static void check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
f2fs_put_page(node_page, 1);
/* Deallocate previous index in the node page */
- inode = f2fs_iget_nowait(sbi->sb, ino);
+ inode = f2fs_iget(sbi->sb, ino);
if (IS_ERR(inode))
return;
@@ -375,5 +371,5 @@ void recover_fsync_data(struct f2fs_sb_info *sbi)
out:
destroy_fsync_dnodes(sbi, &inode_list);
kmem_cache_destroy(fsync_entry_slab);
- write_checkpoint(sbi, false, false);
+ write_checkpoint(sbi, false);
}
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index de6240922b0..777f17e496e 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -29,9 +29,9 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi)
* We should do GC or end up with checkpoint, if there are so many dirty
* dir/node pages without enough free segments.
*/
- if (has_not_enough_free_secs(sbi)) {
+ if (has_not_enough_free_secs(sbi, 0)) {
mutex_lock(&sbi->gc_mutex);
- f2fs_gc(sbi, 1);
+ f2fs_gc(sbi);
}
}
@@ -308,7 +308,7 @@ static unsigned int check_prefree_segments(struct f2fs_sb_info *sbi,
* If there is not enough reserved sections,
* we should not reuse prefree segments.
*/
- if (has_not_enough_free_secs(sbi))
+ if (has_not_enough_free_secs(sbi, 0))
return NULL_SEGNO;
/*
@@ -536,6 +536,23 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type, bool reuse)
}
}
+static int get_ssr_segment(struct f2fs_sb_info *sbi, int type)
+{
+ struct curseg_info *curseg = CURSEG_I(sbi, type);
+ const struct victim_selection *v_ops = DIRTY_I(sbi)->v_ops;
+
+ if (IS_NODESEG(type) || !has_not_enough_free_secs(sbi, 0))
+ return v_ops->get_victim(sbi,
+ &(curseg)->next_segno, BG_GC, type, SSR);
+
+ /* For data segments, let's do SSR more intensively */
+ for (; type >= CURSEG_HOT_DATA; type--)
+ if (v_ops->get_victim(sbi, &(curseg)->next_segno,
+ BG_GC, type, SSR))
+ return 1;
+ return 0;
+}
+
/*
* flush out current segment and replace it with new segment
* This function should be returned with success, otherwise BUG
@@ -600,6 +617,7 @@ static void f2fs_end_io_write(struct bio *bio, int err)
if (page->mapping)
set_bit(AS_EIO, &page->mapping->flags);
set_ckpt_flags(p->sbi->ckpt, CP_ERROR_FLAG);
+ p->sbi->sb->s_flags |= MS_RDONLY;
}
end_page_writeback(page);
dec_page_count(p->sbi, F2FS_WRITEBACK);
@@ -815,15 +833,10 @@ static void do_write_page(struct f2fs_sb_info *sbi, struct page *page,
mutex_unlock(&curseg->curseg_mutex);
}
-int write_meta_page(struct f2fs_sb_info *sbi, struct page *page,
- struct writeback_control *wbc)
+void write_meta_page(struct f2fs_sb_info *sbi, struct page *page)
{
- if (wbc->for_reclaim)
- return AOP_WRITEPAGE_ACTIVATE;
-
set_page_writeback(page);
submit_write_page(sbi, page, page->index, META);
- return 0;
}
void write_node_page(struct f2fs_sb_info *sbi, struct page *page,
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 66a288a52fd..552dadbb232 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -450,29 +450,16 @@ static inline bool need_SSR(struct f2fs_sb_info *sbi)
return (free_sections(sbi) < overprovision_sections(sbi));
}
-static inline int get_ssr_segment(struct f2fs_sb_info *sbi, int type)
+static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed)
{
- struct curseg_info *curseg = CURSEG_I(sbi, type);
- return DIRTY_I(sbi)->v_ops->get_victim(sbi,
- &(curseg)->next_segno, BG_GC, type, SSR);
-}
-
-static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi)
-{
- unsigned int pages_per_sec = (1 << sbi->log_blocks_per_seg) *
- sbi->segs_per_sec;
- int node_secs = ((get_pages(sbi, F2FS_DIRTY_NODES) + pages_per_sec - 1)
- >> sbi->log_blocks_per_seg) / sbi->segs_per_sec;
- int dent_secs = ((get_pages(sbi, F2FS_DIRTY_DENTS) + pages_per_sec - 1)
- >> sbi->log_blocks_per_seg) / sbi->segs_per_sec;
+ int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES);
+ int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS);
if (sbi->por_doing)
return false;
- if (free_sections(sbi) <= (node_secs + 2 * dent_secs +
- reserved_sections(sbi)))
- return true;
- return false;
+ return ((free_sections(sbi) + freed) <= (node_secs + 2 * dent_secs +
+ reserved_sections(sbi)));
}
static inline int utilization(struct f2fs_sb_info *sbi)
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 08a94c814bd..8c117649a03 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -53,6 +53,18 @@ static match_table_t f2fs_tokens = {
{Opt_err, NULL},
};
+void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ printk("%sF2FS-fs (%s): %pV\n", level, sb->s_id, &vaf);
+ va_end(args);
+}
+
static void init_once(void *foo)
{
struct f2fs_inode_info *fi = (struct f2fs_inode_info *) foo;
@@ -100,7 +112,7 @@ static void f2fs_put_super(struct super_block *sb)
f2fs_destroy_stats(sbi);
stop_gc_thread(sbi);
- write_checkpoint(sbi, false, true);
+ write_checkpoint(sbi, true);
iput(sbi->node_inode);
iput(sbi->meta_inode);
@@ -124,11 +136,29 @@ int f2fs_sync_fs(struct super_block *sb, int sync)
return 0;
if (sync)
- write_checkpoint(sbi, false, false);
+ write_checkpoint(sbi, false);
+ else
+ f2fs_balance_fs(sbi);
return 0;
}
+static int f2fs_freeze(struct super_block *sb)
+{
+ int err;
+
+ if (sb->s_flags & MS_RDONLY)
+ return 0;
+
+ err = f2fs_sync_fs(sb, 1);
+ return err;
+}
+
+static int f2fs_unfreeze(struct super_block *sb)
+{
+ return 0;
+}
+
static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct super_block *sb = dentry->d_sb;
@@ -184,7 +214,7 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
seq_puts(seq, ",noacl");
#endif
if (test_opt(sbi, DISABLE_EXT_IDENTIFY))
- seq_puts(seq, ",disable_ext_indentify");
+ seq_puts(seq, ",disable_ext_identify");
seq_printf(seq, ",active_logs=%u", sbi->active_logs);
@@ -199,6 +229,8 @@ static struct super_operations f2fs_sops = {
.evict_inode = f2fs_evict_inode,
.put_super = f2fs_put_super,
.sync_fs = f2fs_sync_fs,
+ .freeze_fs = f2fs_freeze,
+ .unfreeze_fs = f2fs_unfreeze,
.statfs = f2fs_statfs,
};
@@ -247,7 +279,8 @@ static const struct export_operations f2fs_export_ops = {
.get_parent = f2fs_get_parent,
};
-static int parse_options(struct f2fs_sb_info *sbi, char *options)
+static int parse_options(struct super_block *sb, struct f2fs_sb_info *sbi,
+ char *options)
{
substring_t args[MAX_OPT_ARGS];
char *p;
@@ -286,7 +319,8 @@ static int parse_options(struct f2fs_sb_info *sbi, char *options)
break;
#else
case Opt_nouser_xattr:
- pr_info("nouser_xattr options not supported\n");
+ f2fs_msg(sb, KERN_INFO,
+ "nouser_xattr options not supported");
break;
#endif
#ifdef CONFIG_F2FS_FS_POSIX_ACL
@@ -295,7 +329,7 @@ static int parse_options(struct f2fs_sb_info *sbi, char *options)
break;
#else
case Opt_noacl:
- pr_info("noacl options not supported\n");
+ f2fs_msg(sb, KERN_INFO, "noacl options not supported");
break;
#endif
case Opt_active_logs:
@@ -309,8 +343,9 @@ static int parse_options(struct f2fs_sb_info *sbi, char *options)
set_opt(sbi, DISABLE_EXT_IDENTIFY);
break;
default:
- pr_err("Unrecognized mount option \"%s\" or missing value\n",
- p);
+ f2fs_msg(sb, KERN_ERR,
+ "Unrecognized mount option \"%s\" or missing value",
+ p);
return -EINVAL;
}
}
@@ -337,30 +372,53 @@ static loff_t max_file_size(unsigned bits)
return result;
}
-static int sanity_check_raw_super(struct f2fs_super_block *raw_super)
+static int sanity_check_raw_super(struct super_block *sb,
+ struct f2fs_super_block *raw_super)
{
unsigned int blocksize;
- if (F2FS_SUPER_MAGIC != le32_to_cpu(raw_super->magic))
+ if (F2FS_SUPER_MAGIC != le32_to_cpu(raw_super->magic)) {
+ f2fs_msg(sb, KERN_INFO,
+ "Magic Mismatch, valid(0x%x) - read(0x%x)",
+ F2FS_SUPER_MAGIC, le32_to_cpu(raw_super->magic));
return 1;
+ }
+
+ /* Currently, support only 4KB page cache size */
+ if (F2FS_BLKSIZE != PAGE_CACHE_SIZE) {
+ f2fs_msg(sb, KERN_INFO,
+ "Invalid page_cache_size (%lu), supports only 4KB\n",
+ PAGE_CACHE_SIZE);
+ return 1;
+ }
/* Currently, support only 4KB block size */
blocksize = 1 << le32_to_cpu(raw_super->log_blocksize);
- if (blocksize != PAGE_CACHE_SIZE)
+ if (blocksize != F2FS_BLKSIZE) {
+ f2fs_msg(sb, KERN_INFO,
+ "Invalid blocksize (%u), supports only 4KB\n",
+ blocksize);
return 1;
+ }
+
if (le32_to_cpu(raw_super->log_sectorsize) !=
- F2FS_LOG_SECTOR_SIZE)
+ F2FS_LOG_SECTOR_SIZE) {
+ f2fs_msg(sb, KERN_INFO, "Invalid log sectorsize");
return 1;
+ }
if (le32_to_cpu(raw_super->log_sectors_per_block) !=
- F2FS_LOG_SECTORS_PER_BLOCK)
+ F2FS_LOG_SECTORS_PER_BLOCK) {
+ f2fs_msg(sb, KERN_INFO, "Invalid log sectors per block");
return 1;
+ }
return 0;
}
-static int sanity_check_ckpt(struct f2fs_super_block *raw_super,
- struct f2fs_checkpoint *ckpt)
+static int sanity_check_ckpt(struct f2fs_sb_info *sbi)
{
unsigned int total, fsmeta;
+ struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
+ struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
total = le32_to_cpu(raw_super->segment_count);
fsmeta = le32_to_cpu(raw_super->segment_count_ckpt);
@@ -371,6 +429,11 @@ static int sanity_check_ckpt(struct f2fs_super_block *raw_super,
if (fsmeta >= total)
return 1;
+
+ if (is_set_ckpt_flags(ckpt, CP_ERROR_FLAG)) {
+ f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck");
+ return 1;
+ }
return 0;
}
@@ -399,6 +462,32 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
atomic_set(&sbi->nr_pages[i], 0);
}
+static int validate_superblock(struct super_block *sb,
+ struct f2fs_super_block **raw_super,
+ struct buffer_head **raw_super_buf, sector_t block)
+{
+ const char *super = (block == 0 ? "first" : "second");
+
+ /* read f2fs raw super block */
+ *raw_super_buf = sb_bread(sb, block);
+ if (!*raw_super_buf) {
+ f2fs_msg(sb, KERN_ERR, "unable to read %s superblock",
+ super);
+ return 1;
+ }
+
+ *raw_super = (struct f2fs_super_block *)
+ ((char *)(*raw_super_buf)->b_data + F2FS_SUPER_OFFSET);
+
+ /* sanity checking of raw super */
+ if (!sanity_check_raw_super(sb, *raw_super))
+ return 0;
+
+ f2fs_msg(sb, KERN_ERR, "Can't find a valid F2FS filesystem "
+ "in %s superblock", super);
+ return 1;
+}
+
static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
{
struct f2fs_sb_info *sbi;
@@ -413,19 +502,17 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
if (!sbi)
return -ENOMEM;
- /* set a temporary block size */
- if (!sb_set_blocksize(sb, F2FS_BLKSIZE))
- goto free_sbi;
-
- /* read f2fs raw super block */
- raw_super_buf = sb_bread(sb, 0);
- if (!raw_super_buf) {
- err = -EIO;
+ /* set a block size */
+ if (!sb_set_blocksize(sb, F2FS_BLKSIZE)) {
+ f2fs_msg(sb, KERN_ERR, "unable to set blocksize");
goto free_sbi;
}
- raw_super = (struct f2fs_super_block *)
- ((char *)raw_super_buf->b_data + F2FS_SUPER_OFFSET);
+ if (validate_superblock(sb, &raw_super, &raw_super_buf, 0)) {
+ brelse(raw_super_buf);
+ if (validate_superblock(sb, &raw_super, &raw_super_buf, 1))
+ goto free_sb_buf;
+ }
/* init some FS parameters */
sbi->active_logs = NR_CURSEG_TYPE;
@@ -438,11 +525,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
set_opt(sbi, POSIX_ACL);
#endif
/* parse mount options */
- if (parse_options(sbi, (char *)data))
- goto free_sb_buf;
-
- /* sanity checking of raw super */
- if (sanity_check_raw_super(raw_super))
+ if (parse_options(sb, sbi, (char *)data))
goto free_sb_buf;
sb->s_maxbytes = max_file_size(le32_to_cpu(raw_super->log_blocksize));
@@ -477,18 +560,23 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
/* get an inode for meta space */
sbi->meta_inode = f2fs_iget(sb, F2FS_META_INO(sbi));
if (IS_ERR(sbi->meta_inode)) {
+ f2fs_msg(sb, KERN_ERR, "Failed to read F2FS meta data inode");
err = PTR_ERR(sbi->meta_inode);
goto free_sb_buf;
}
err = get_valid_checkpoint(sbi);
- if (err)
+ if (err) {
+ f2fs_msg(sb, KERN_ERR, "Failed to get valid F2FS checkpoint");
goto free_meta_inode;
+ }
/* sanity checking of checkpoint */
err = -EINVAL;
- if (sanity_check_ckpt(raw_super, sbi->ckpt))
+ if (sanity_check_ckpt(sbi)) {
+ f2fs_msg(sb, KERN_ERR, "Invalid F2FS checkpoint");
goto free_cp;
+ }
sbi->total_valid_node_count =
le32_to_cpu(sbi->ckpt->valid_node_count);
@@ -502,25 +590,28 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
INIT_LIST_HEAD(&sbi->dir_inode_list);
spin_lock_init(&sbi->dir_inode_lock);
- /* init super block */
- if (!sb_set_blocksize(sb, sbi->blocksize))
- goto free_cp;
-
init_orphan_info(sbi);
/* setup f2fs internal modules */
err = build_segment_manager(sbi);
- if (err)
+ if (err) {
+ f2fs_msg(sb, KERN_ERR,
+ "Failed to initialize F2FS segment manager");
goto free_sm;
+ }
err = build_node_manager(sbi);
- if (err)
+ if (err) {
+ f2fs_msg(sb, KERN_ERR,
+ "Failed to initialize F2FS node manager");
goto free_nm;
+ }
build_gc_manager(sbi);
/* get an inode for node space */
sbi->node_inode = f2fs_iget(sb, F2FS_NODE_INO(sbi));
if (IS_ERR(sbi->node_inode)) {
+ f2fs_msg(sb, KERN_ERR, "Failed to read node inode");
err = PTR_ERR(sbi->node_inode);
goto free_nm;
}
@@ -533,6 +624,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
/* read root inode and dentry */
root = f2fs_iget(sb, F2FS_ROOT_INO(sbi));
if (IS_ERR(root)) {
+ f2fs_msg(sb, KERN_ERR, "Failed to read root inode");
err = PTR_ERR(root);
goto free_node_inode;
}
@@ -596,7 +688,7 @@ static struct file_system_type f2fs_fs_type = {
.fs_flags = FS_REQUIRES_DEV,
};
-static int init_inodecache(void)
+static int __init init_inodecache(void)
{
f2fs_inode_cachep = f2fs_kmem_cache_create("f2fs_inode_cache",
sizeof(struct f2fs_inode_info), NULL);
@@ -631,14 +723,17 @@ static int __init init_f2fs_fs(void)
err = create_checkpoint_caches();
if (err)
goto fail;
- return register_filesystem(&f2fs_fs_type);
+ err = register_filesystem(&f2fs_fs_type);
+ if (err)
+ goto fail;
+ f2fs_create_root_stats();
fail:
return err;
}
static void __exit exit_f2fs_fs(void)
{
- destroy_root_stats();
+ f2fs_destroy_root_stats();
unregister_filesystem(&f2fs_fs_type);
destroy_checkpoint_caches();
destroy_gc_caches();
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 940136a3d3a..8038c049650 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -318,6 +318,8 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
if (name_len > 255 || value_len > MAX_VALUE_LEN)
return -ERANGE;
+ f2fs_balance_fs(sbi);
+
mutex_lock_op(sbi, NODE_NEW);
if (!fi->i_xattr_nid) {
/* Allocate new attribute block */
diff --git a/fs/file.c b/fs/file.c
index 2b3570b7cae..3906d9577a1 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -516,7 +516,7 @@ struct files_struct init_files = {
.close_on_exec = init_files.close_on_exec_init,
.open_fds = init_files.open_fds_init,
},
- .file_lock = __SPIN_LOCK_UNLOCKED(init_task.file_lock),
+ .file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock),
};
/*
diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig
index 0cf160a94ed..1b2f6c2c3aa 100644
--- a/fs/fuse/Kconfig
+++ b/fs/fuse/Kconfig
@@ -4,12 +4,24 @@ config FUSE_FS
With FUSE it is possible to implement a fully functional filesystem
in a userspace program.
- There's also companion library: libfuse. This library along with
- utilities is available from the FUSE homepage:
+ There's also a companion library: libfuse2. This library is available
+ from the FUSE homepage:
<http://fuse.sourceforge.net/>
+ although chances are your distribution already has that library
+ installed if you've installed the "fuse" package itself.
See <file:Documentation/filesystems/fuse.txt> for more information.
See <file:Documentation/Changes> for needed library/utility version.
If you want to develop a userspace FS, or if you want to use
a filesystem based on FUSE, answer Y or M.
+
+config CUSE
+ tristate "Character device in Userspace support"
+ depends on FUSE_FS
+ help
+ This FUSE extension allows character devices to be
+ implemented in userspace.
+
+ If you want to develop or use a userspace character device
+ based on CUSE, answer Y or M.
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index ee8d5504229..6f96a8def14 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -45,7 +45,6 @@
#include <linux/miscdevice.h>
#include <linux/mutex.h>
#include <linux/slab.h>
-#include <linux/spinlock.h>
#include <linux/stat.h>
#include <linux/module.h>
@@ -63,7 +62,7 @@ struct cuse_conn {
bool unrestricted_ioctl;
};
-static DEFINE_SPINLOCK(cuse_lock); /* protects cuse_conntbl */
+static DEFINE_MUTEX(cuse_lock); /* protects registration */
static struct list_head cuse_conntbl[CUSE_CONNTBL_LEN];
static struct class *cuse_class;
@@ -92,19 +91,22 @@ static ssize_t cuse_read(struct file *file, char __user *buf, size_t count,
loff_t *ppos)
{
loff_t pos = 0;
+ struct iovec iov = { .iov_base = buf, .iov_len = count };
- return fuse_direct_io(file, buf, count, &pos, 0);
+ return fuse_direct_io(file, &iov, 1, count, &pos, 0);
}
static ssize_t cuse_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
loff_t pos = 0;
+ struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count };
+
/*
* No locking or generic_write_checks(), the server is
* responsible for locking and sanity checks.
*/
- return fuse_direct_io(file, buf, count, &pos, 1);
+ return fuse_direct_io(file, &iov, 1, count, &pos, 1);
}
static int cuse_open(struct inode *inode, struct file *file)
@@ -114,14 +116,14 @@ static int cuse_open(struct inode *inode, struct file *file)
int rc;
/* look up and get the connection */
- spin_lock(&cuse_lock);
+ mutex_lock(&cuse_lock);
list_for_each_entry(pos, cuse_conntbl_head(devt), list)
if (pos->dev->devt == devt) {
fuse_conn_get(&pos->fc);
cc = pos;
break;
}
- spin_unlock(&cuse_lock);
+ mutex_unlock(&cuse_lock);
/* dead? */
if (!cc)
@@ -267,7 +269,7 @@ static int cuse_parse_one(char **pp, char *end, char **keyp, char **valp)
static int cuse_parse_devinfo(char *p, size_t len, struct cuse_devinfo *devinfo)
{
char *end = p + len;
- char *key, *val;
+ char *uninitialized_var(key), *uninitialized_var(val);
int rc;
while (true) {
@@ -305,14 +307,14 @@ static void cuse_gendev_release(struct device *dev)
*/
static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
{
- struct cuse_conn *cc = fc_to_cc(fc);
+ struct cuse_conn *cc = fc_to_cc(fc), *pos;
struct cuse_init_out *arg = req->out.args[0].value;
struct page *page = req->pages[0];
struct cuse_devinfo devinfo = { };
struct device *dev;
struct cdev *cdev;
dev_t devt;
- int rc;
+ int rc, i;
if (req->out.h.error ||
arg->major != FUSE_KERNEL_VERSION || arg->minor < 11) {
@@ -356,15 +358,24 @@ static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
dev_set_drvdata(dev, cc);
dev_set_name(dev, "%s", devinfo.name);
+ mutex_lock(&cuse_lock);
+
+ /* make sure the device-name is unique */
+ for (i = 0; i < CUSE_CONNTBL_LEN; ++i) {
+ list_for_each_entry(pos, &cuse_conntbl[i], list)
+ if (!strcmp(dev_name(pos->dev), dev_name(dev)))
+ goto err_unlock;
+ }
+
rc = device_add(dev);
if (rc)
- goto err_device;
+ goto err_unlock;
/* register cdev */
rc = -ENOMEM;
cdev = cdev_alloc();
if (!cdev)
- goto err_device;
+ goto err_unlock;
cdev->owner = THIS_MODULE;
cdev->ops = &cuse_frontend_fops;
@@ -377,9 +388,8 @@ static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
cc->cdev = cdev;
/* make the device available */
- spin_lock(&cuse_lock);
list_add(&cc->list, cuse_conntbl_head(devt));
- spin_unlock(&cuse_lock);
+ mutex_unlock(&cuse_lock);
/* announce device availability */
dev_set_uevent_suppress(dev, 0);
@@ -391,7 +401,8 @@ out:
err_cdev:
cdev_del(cdev);
-err_device:
+err_unlock:
+ mutex_unlock(&cuse_lock);
put_device(dev);
err_region:
unregister_chrdev_region(devt, 1);
@@ -411,7 +422,7 @@ static int cuse_send_init(struct cuse_conn *cc)
BUILD_BUG_ON(CUSE_INIT_INFO_MAX > PAGE_SIZE);
- req = fuse_get_req(fc);
+ req = fuse_get_req(fc, 1);
if (IS_ERR(req)) {
rc = PTR_ERR(req);
goto err;
@@ -441,6 +452,7 @@ static int cuse_send_init(struct cuse_conn *cc)
req->out.argvar = 1;
req->out.argpages = 1;
req->pages[0] = page;
+ req->page_descs[0].length = req->out.args[1].size;
req->num_pages = 1;
req->end = cuse_process_init_reply;
fuse_request_send_background(fc, req);
@@ -520,9 +532,9 @@ static int cuse_channel_release(struct inode *inode, struct file *file)
int rc;
/* remove from the conntbl, no more access from this point on */
- spin_lock(&cuse_lock);
+ mutex_lock(&cuse_lock);
list_del_init(&cc->list);
- spin_unlock(&cuse_lock);
+ mutex_unlock(&cuse_lock);
/* remove device */
if (cc->dev)
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index c16335315e5..e9bdec0b16d 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -34,34 +34,67 @@ static struct fuse_conn *fuse_get_conn(struct file *file)
return file->private_data;
}
-static void fuse_request_init(struct fuse_req *req)
+static void fuse_request_init(struct fuse_req *req, struct page **pages,
+ struct fuse_page_desc *page_descs,
+ unsigned npages)
{
memset(req, 0, sizeof(*req));
+ memset(pages, 0, sizeof(*pages) * npages);
+ memset(page_descs, 0, sizeof(*page_descs) * npages);
INIT_LIST_HEAD(&req->list);
INIT_LIST_HEAD(&req->intr_entry);
init_waitqueue_head(&req->waitq);
atomic_set(&req->count, 1);
+ req->pages = pages;
+ req->page_descs = page_descs;
+ req->max_pages = npages;
}
-struct fuse_req *fuse_request_alloc(void)
+static struct fuse_req *__fuse_request_alloc(unsigned npages, gfp_t flags)
{
- struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, GFP_KERNEL);
- if (req)
- fuse_request_init(req);
+ struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, flags);
+ if (req) {
+ struct page **pages;
+ struct fuse_page_desc *page_descs;
+
+ if (npages <= FUSE_REQ_INLINE_PAGES) {
+ pages = req->inline_pages;
+ page_descs = req->inline_page_descs;
+ } else {
+ pages = kmalloc(sizeof(struct page *) * npages, flags);
+ page_descs = kmalloc(sizeof(struct fuse_page_desc) *
+ npages, flags);
+ }
+
+ if (!pages || !page_descs) {
+ kfree(pages);
+ kfree(page_descs);
+ kmem_cache_free(fuse_req_cachep, req);
+ return NULL;
+ }
+
+ fuse_request_init(req, pages, page_descs, npages);
+ }
return req;
}
+
+struct fuse_req *fuse_request_alloc(unsigned npages)
+{
+ return __fuse_request_alloc(npages, GFP_KERNEL);
+}
EXPORT_SYMBOL_GPL(fuse_request_alloc);
-struct fuse_req *fuse_request_alloc_nofs(void)
+struct fuse_req *fuse_request_alloc_nofs(unsigned npages)
{
- struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, GFP_NOFS);
- if (req)
- fuse_request_init(req);
- return req;
+ return __fuse_request_alloc(npages, GFP_NOFS);
}
void fuse_request_free(struct fuse_req *req)
{
+ if (req->pages != req->inline_pages) {
+ kfree(req->pages);
+ kfree(req->page_descs);
+ }
kmem_cache_free(fuse_req_cachep, req);
}
@@ -97,7 +130,7 @@ static void fuse_req_init_context(struct fuse_req *req)
req->in.h.pid = current->pid;
}
-struct fuse_req *fuse_get_req(struct fuse_conn *fc)
+struct fuse_req *fuse_get_req(struct fuse_conn *fc, unsigned npages)
{
struct fuse_req *req;
sigset_t oldset;
@@ -116,7 +149,7 @@ struct fuse_req *fuse_get_req(struct fuse_conn *fc)
if (!fc->connected)
goto out;
- req = fuse_request_alloc();
+ req = fuse_request_alloc(npages);
err = -ENOMEM;
if (!req)
goto out;
@@ -165,7 +198,7 @@ static void put_reserved_req(struct fuse_conn *fc, struct fuse_req *req)
struct fuse_file *ff = file->private_data;
spin_lock(&fc->lock);
- fuse_request_init(req);
+ fuse_request_init(req, req->pages, req->page_descs, req->max_pages);
BUG_ON(ff->reserved_req);
ff->reserved_req = req;
wake_up_all(&fc->reserved_req_waitq);
@@ -186,13 +219,14 @@ static void put_reserved_req(struct fuse_conn *fc, struct fuse_req *req)
* filesystem should not have it's own file open. If deadlock is
* intentional, it can still be broken by "aborting" the filesystem.
*/
-struct fuse_req *fuse_get_req_nofail(struct fuse_conn *fc, struct file *file)
+struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc,
+ struct file *file)
{
struct fuse_req *req;
atomic_inc(&fc->num_waiting);
wait_event(fc->blocked_waitq, !fc->blocked);
- req = fuse_request_alloc();
+ req = fuse_request_alloc(0);
if (!req)
req = get_reserved_req(fc, file);
@@ -406,9 +440,8 @@ __acquires(fc->lock)
}
}
-void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
+static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
{
- req->isreply = 1;
spin_lock(&fc->lock);
if (!fc->connected)
req->out.h.error = -ENOTCONN;
@@ -425,6 +458,12 @@ void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
}
spin_unlock(&fc->lock);
}
+
+void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
+{
+ req->isreply = 1;
+ __fuse_request_send(fc, req);
+}
EXPORT_SYMBOL_GPL(fuse_request_send);
static void fuse_request_send_nowait_locked(struct fuse_conn *fc,
@@ -491,6 +530,27 @@ void fuse_request_send_background_locked(struct fuse_conn *fc,
fuse_request_send_nowait_locked(fc, req);
}
+void fuse_force_forget(struct file *file, u64 nodeid)
+{
+ struct inode *inode = file->f_path.dentry->d_inode;
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_req *req;
+ struct fuse_forget_in inarg;
+
+ memset(&inarg, 0, sizeof(inarg));
+ inarg.nlookup = 1;
+ req = fuse_get_req_nofail_nopages(fc, file);
+ req->in.h.opcode = FUSE_FORGET;
+ req->in.h.nodeid = nodeid;
+ req->in.numargs = 1;
+ req->in.args[0].size = sizeof(inarg);
+ req->in.args[0].value = &inarg;
+ req->isreply = 0;
+ __fuse_request_send(fc, req);
+ /* ignore errors */
+ fuse_put_request(fc, req);
+}
+
/*
* Lock the request. Up to the next unlock_request() there mustn't be
* anything that could cause a page-fault. If the request was already
@@ -692,8 +752,6 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
struct page *oldpage = *pagep;
struct page *newpage;
struct pipe_buffer *buf = cs->pipebufs;
- struct address_space *mapping;
- pgoff_t index;
unlock_request(cs->fc, cs->req);
fuse_copy_finish(cs);
@@ -724,9 +782,6 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
if (fuse_check_page(newpage) != 0)
goto out_fallback_unlock;
- mapping = oldpage->mapping;
- index = oldpage->index;
-
/*
* This is a new and locked page, it shouldn't be mapped or
* have any special flags on it
@@ -855,11 +910,11 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes,
{
unsigned i;
struct fuse_req *req = cs->req;
- unsigned offset = req->page_offset;
- unsigned count = min(nbytes, (unsigned) PAGE_SIZE - offset);
for (i = 0; i < req->num_pages && (nbytes || zeroing); i++) {
int err;
+ unsigned offset = req->page_descs[i].offset;
+ unsigned count = min(nbytes, req->page_descs[i].length);
err = fuse_copy_page(cs, &req->pages[i], offset, count,
zeroing);
@@ -867,8 +922,6 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes,
return err;
nbytes -= count;
- count = min(nbytes, (unsigned) PAGE_SIZE);
- offset = 0;
}
return 0;
}
@@ -1541,29 +1594,34 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
unsigned int num;
unsigned int offset;
size_t total_len = 0;
+ int num_pages;
+
+ offset = outarg->offset & ~PAGE_CACHE_MASK;
+ file_size = i_size_read(inode);
+
+ num = outarg->size;
+ if (outarg->offset > file_size)
+ num = 0;
+ else if (outarg->offset + num > file_size)
+ num = file_size - outarg->offset;
- req = fuse_get_req(fc);
+ num_pages = (num + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ num_pages = min(num_pages, FUSE_MAX_PAGES_PER_REQ);
+
+ req = fuse_get_req(fc, num_pages);
if (IS_ERR(req))
return PTR_ERR(req);
- offset = outarg->offset & ~PAGE_CACHE_MASK;
-
req->in.h.opcode = FUSE_NOTIFY_REPLY;
req->in.h.nodeid = outarg->nodeid;
req->in.numargs = 2;
req->in.argpages = 1;
- req->page_offset = offset;
+ req->page_descs[0].offset = offset;
req->end = fuse_retrieve_end;
index = outarg->offset >> PAGE_CACHE_SHIFT;
- file_size = i_size_read(inode);
- num = outarg->size;
- if (outarg->offset > file_size)
- num = 0;
- else if (outarg->offset + num > file_size)
- num = file_size - outarg->offset;
- while (num && req->num_pages < FUSE_MAX_PAGES_PER_REQ) {
+ while (num && req->num_pages < num_pages) {
struct page *page;
unsigned int this_num;
@@ -1573,6 +1631,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset);
req->pages[req->num_pages] = page;
+ req->page_descs[req->num_pages].length = this_num;
req->num_pages++;
offset = 0;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 80ba3950c40..ff15522481d 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -14,6 +14,29 @@
#include <linux/namei.h>
#include <linux/slab.h>
+static bool fuse_use_readdirplus(struct inode *dir, struct file *filp)
+{
+ struct fuse_conn *fc = get_fuse_conn(dir);
+ struct fuse_inode *fi = get_fuse_inode(dir);
+
+ if (!fc->do_readdirplus)
+ return false;
+ if (!fc->readdirplus_auto)
+ return true;
+ if (test_and_clear_bit(FUSE_I_ADVISE_RDPLUS, &fi->state))
+ return true;
+ if (filp->f_pos == 0)
+ return true;
+ return false;
+}
+
+static void fuse_advise_use_readdirplus(struct inode *dir)
+{
+ struct fuse_inode *fi = get_fuse_inode(dir);
+
+ set_bit(FUSE_I_ADVISE_RDPLUS, &fi->state);
+}
+
#if BITS_PER_LONG >= 64
static inline void fuse_dentry_settime(struct dentry *entry, u64 time)
{
@@ -178,7 +201,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
return -ECHILD;
fc = get_fuse_conn(inode);
- req = fuse_get_req(fc);
+ req = fuse_get_req_nopages(fc);
if (IS_ERR(req))
return 0;
@@ -219,6 +242,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
attr_version);
fuse_change_entry_timeout(entry, &outarg);
}
+ fuse_advise_use_readdirplus(inode);
return 1;
}
@@ -271,7 +295,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
if (name->len > FUSE_NAME_MAX)
goto out;
- req = fuse_get_req(fc);
+ req = fuse_get_req_nopages(fc);
err = PTR_ERR(req);
if (IS_ERR(req))
goto out;
@@ -355,6 +379,7 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
else
fuse_invalidate_entry_cache(entry);
+ fuse_advise_use_readdirplus(dir);
return newent;
out_iput:
@@ -391,7 +416,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
if (!forget)
goto out_err;
- req = fuse_get_req(fc);
+ req = fuse_get_req_nopages(fc);
err = PTR_ERR(req);
if (IS_ERR(req))
goto out_put_forget_req;
@@ -592,7 +617,7 @@ static int fuse_mknod(struct inode *dir, struct dentry *entry, umode_t mode,
{
struct fuse_mknod_in inarg;
struct fuse_conn *fc = get_fuse_conn(dir);
- struct fuse_req *req = fuse_get_req(fc);
+ struct fuse_req *req = fuse_get_req_nopages(fc);
if (IS_ERR(req))
return PTR_ERR(req);
@@ -623,7 +648,7 @@ static int fuse_mkdir(struct inode *dir, struct dentry *entry, umode_t mode)
{
struct fuse_mkdir_in inarg;
struct fuse_conn *fc = get_fuse_conn(dir);
- struct fuse_req *req = fuse_get_req(fc);
+ struct fuse_req *req = fuse_get_req_nopages(fc);
if (IS_ERR(req))
return PTR_ERR(req);
@@ -647,7 +672,7 @@ static int fuse_symlink(struct inode *dir, struct dentry *entry,
{
struct fuse_conn *fc = get_fuse_conn(dir);
unsigned len = strlen(link) + 1;
- struct fuse_req *req = fuse_get_req(fc);
+ struct fuse_req *req = fuse_get_req_nopages(fc);
if (IS_ERR(req))
return PTR_ERR(req);
@@ -664,7 +689,7 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)
{
int err;
struct fuse_conn *fc = get_fuse_conn(dir);
- struct fuse_req *req = fuse_get_req(fc);
+ struct fuse_req *req = fuse_get_req_nopages(fc);
if (IS_ERR(req))
return PTR_ERR(req);
@@ -682,7 +707,14 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)
spin_lock(&fc->lock);
fi->attr_version = ++fc->attr_version;
- drop_nlink(inode);
+ /*
+ * If i_nlink == 0 then unlink doesn't make sense, yet this can
+ * happen if userspace filesystem is careless. It would be
+ * difficult to enforce correct nlink usage so just ignore this
+ * condition here
+ */
+ if (inode->i_nlink > 0)
+ drop_nlink(inode);
spin_unlock(&fc->lock);
fuse_invalidate_attr(inode);
fuse_invalidate_attr(dir);
@@ -696,7 +728,7 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)
{
int err;
struct fuse_conn *fc = get_fuse_conn(dir);
- struct fuse_req *req = fuse_get_req(fc);
+ struct fuse_req *req = fuse_get_req_nopages(fc);
if (IS_ERR(req))
return PTR_ERR(req);
@@ -723,7 +755,7 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent,
int err;
struct fuse_rename_in inarg;
struct fuse_conn *fc = get_fuse_conn(olddir);
- struct fuse_req *req = fuse_get_req(fc);
+ struct fuse_req *req = fuse_get_req_nopages(fc);
if (IS_ERR(req))
return PTR_ERR(req);
@@ -776,7 +808,7 @@ static int fuse_link(struct dentry *entry, struct inode *newdir,
struct fuse_link_in inarg;
struct inode *inode = entry->d_inode;
struct fuse_conn *fc = get_fuse_conn(inode);
- struct fuse_req *req = fuse_get_req(fc);
+ struct fuse_req *req = fuse_get_req_nopages(fc);
if (IS_ERR(req))
return PTR_ERR(req);
@@ -848,7 +880,7 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat,
struct fuse_req *req;
u64 attr_version;
- req = fuse_get_req(fc);
+ req = fuse_get_req_nopages(fc);
if (IS_ERR(req))
return PTR_ERR(req);
@@ -985,7 +1017,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
/*
* Calling into a user-controlled filesystem gives the filesystem
- * daemon ptrace-like capabilities over the requester process. This
+ * daemon ptrace-like capabilities over the current process. This
* means, that the filesystem daemon is able to record the exact
* filesystem operations performed, and can also control the behavior
* of the requester process in otherwise impossible ways. For example
@@ -996,27 +1028,23 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
* for which the owner of the mount has ptrace privilege. This
* excludes processes started by other users, suid or sgid processes.
*/
-int fuse_allow_task(struct fuse_conn *fc, struct task_struct *task)
+int fuse_allow_current_process(struct fuse_conn *fc)
{
const struct cred *cred;
- int ret;
if (fc->flags & FUSE_ALLOW_OTHER)
return 1;
- rcu_read_lock();
- ret = 0;
- cred = __task_cred(task);
+ cred = current_cred();
if (uid_eq(cred->euid, fc->user_id) &&
uid_eq(cred->suid, fc->user_id) &&
uid_eq(cred->uid, fc->user_id) &&
gid_eq(cred->egid, fc->group_id) &&
gid_eq(cred->sgid, fc->group_id) &&
gid_eq(cred->gid, fc->group_id))
- ret = 1;
- rcu_read_unlock();
+ return 1;
- return ret;
+ return 0;
}
static int fuse_access(struct inode *inode, int mask)
@@ -1029,7 +1057,7 @@ static int fuse_access(struct inode *inode, int mask)
if (fc->no_access)
return 0;
- req = fuse_get_req(fc);
+ req = fuse_get_req_nopages(fc);
if (IS_ERR(req))
return PTR_ERR(req);
@@ -1077,7 +1105,7 @@ static int fuse_permission(struct inode *inode, int mask)
bool refreshed = false;
int err = 0;
- if (!fuse_allow_task(fc, current))
+ if (!fuse_allow_current_process(fc))
return -EACCES;
/*
@@ -1155,19 +1183,157 @@ static int parse_dirfile(char *buf, size_t nbytes, struct file *file,
return 0;
}
-static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
+static int fuse_direntplus_link(struct file *file,
+ struct fuse_direntplus *direntplus,
+ u64 attr_version)
{
int err;
+ struct fuse_entry_out *o = &direntplus->entry_out;
+ struct fuse_dirent *dirent = &direntplus->dirent;
+ struct dentry *parent = file->f_path.dentry;
+ struct qstr name = QSTR_INIT(dirent->name, dirent->namelen);
+ struct dentry *dentry;
+ struct dentry *alias;
+ struct inode *dir = parent->d_inode;
+ struct fuse_conn *fc;
+ struct inode *inode;
+
+ if (!o->nodeid) {
+ /*
+ * Unlike in the case of fuse_lookup, zero nodeid does not mean
+ * ENOENT. Instead, it only means the userspace filesystem did
+ * not want to return attributes/handle for this entry.
+ *
+ * So do nothing.
+ */
+ return 0;
+ }
+
+ if (name.name[0] == '.') {
+ /*
+ * We could potentially refresh the attributes of the directory
+ * and its parent?
+ */
+ if (name.len == 1)
+ return 0;
+ if (name.name[1] == '.' && name.len == 2)
+ return 0;
+ }
+ fc = get_fuse_conn(dir);
+
+ name.hash = full_name_hash(name.name, name.len);
+ dentry = d_lookup(parent, &name);
+ if (dentry && dentry->d_inode) {
+ inode = dentry->d_inode;
+ if (get_node_id(inode) == o->nodeid) {
+ struct fuse_inode *fi;
+ fi = get_fuse_inode(inode);
+ spin_lock(&fc->lock);
+ fi->nlookup++;
+ spin_unlock(&fc->lock);
+
+ /*
+ * The other branch to 'found' comes via fuse_iget()
+ * which bumps nlookup inside
+ */
+ goto found;
+ }
+ err = d_invalidate(dentry);
+ if (err)
+ goto out;
+ dput(dentry);
+ dentry = NULL;
+ }
+
+ dentry = d_alloc(parent, &name);
+ err = -ENOMEM;
+ if (!dentry)
+ goto out;
+
+ inode = fuse_iget(dir->i_sb, o->nodeid, o->generation,
+ &o->attr, entry_attr_timeout(o), attr_version);
+ if (!inode)
+ goto out;
+
+ alias = d_materialise_unique(dentry, inode);
+ err = PTR_ERR(alias);
+ if (IS_ERR(alias))
+ goto out;
+ if (alias) {
+ dput(dentry);
+ dentry = alias;
+ }
+
+found:
+ fuse_change_attributes(inode, &o->attr, entry_attr_timeout(o),
+ attr_version);
+
+ fuse_change_entry_timeout(dentry, o);
+
+ err = 0;
+out:
+ if (dentry)
+ dput(dentry);
+ return err;
+}
+
+static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file,
+ void *dstbuf, filldir_t filldir, u64 attr_version)
+{
+ struct fuse_direntplus *direntplus;
+ struct fuse_dirent *dirent;
+ size_t reclen;
+ int over = 0;
+ int ret;
+
+ while (nbytes >= FUSE_NAME_OFFSET_DIRENTPLUS) {
+ direntplus = (struct fuse_direntplus *) buf;
+ dirent = &direntplus->dirent;
+ reclen = FUSE_DIRENTPLUS_SIZE(direntplus);
+
+ if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX)
+ return -EIO;
+ if (reclen > nbytes)
+ break;
+
+ if (!over) {
+ /* We fill entries into dstbuf only as much as
+ it can hold. But we still continue iterating
+ over remaining entries to link them. If not,
+ we need to send a FORGET for each of those
+ which we did not link.
+ */
+ over = filldir(dstbuf, dirent->name, dirent->namelen,
+ file->f_pos, dirent->ino,
+ dirent->type);
+ file->f_pos = dirent->off;
+ }
+
+ buf += reclen;
+ nbytes -= reclen;
+
+ ret = fuse_direntplus_link(file, direntplus, attr_version);
+ if (ret)
+ fuse_force_forget(file, direntplus->entry_out.nodeid);
+ }
+
+ return 0;
+}
+
+static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
+{
+ int plus, err;
size_t nbytes;
struct page *page;
struct inode *inode = file_inode(file);
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_req *req;
+ u64 attr_version = 0;
if (is_bad_inode(inode))
return -EIO;
- req = fuse_get_req(fc);
+ req = fuse_get_req(fc, 1);
if (IS_ERR(req))
return PTR_ERR(req);
@@ -1176,17 +1342,34 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
fuse_put_request(fc, req);
return -ENOMEM;
}
+
+ plus = fuse_use_readdirplus(inode, file);
req->out.argpages = 1;
req->num_pages = 1;
req->pages[0] = page;
- fuse_read_fill(req, file, file->f_pos, PAGE_SIZE, FUSE_READDIR);
+ req->page_descs[0].length = PAGE_SIZE;
+ if (plus) {
+ attr_version = fuse_get_attr_version(fc);
+ fuse_read_fill(req, file, file->f_pos, PAGE_SIZE,
+ FUSE_READDIRPLUS);
+ } else {
+ fuse_read_fill(req, file, file->f_pos, PAGE_SIZE,
+ FUSE_READDIR);
+ }
fuse_request_send(fc, req);
nbytes = req->out.args[0].size;
err = req->out.h.error;
fuse_put_request(fc, req);
- if (!err)
- err = parse_dirfile(page_address(page), nbytes, file, dstbuf,
- filldir);
+ if (!err) {
+ if (plus) {
+ err = parse_dirplusfile(page_address(page), nbytes,
+ file, dstbuf, filldir,
+ attr_version);
+ } else {
+ err = parse_dirfile(page_address(page), nbytes, file,
+ dstbuf, filldir);
+ }
+ }
__free_page(page);
fuse_invalidate_attr(inode); /* atime changed */
@@ -1197,7 +1380,7 @@ static char *read_link(struct dentry *dentry)
{
struct inode *inode = dentry->d_inode;
struct fuse_conn *fc = get_fuse_conn(inode);
- struct fuse_req *req = fuse_get_req(fc);
+ struct fuse_req *req = fuse_get_req_nopages(fc);
char *link;
if (IS_ERR(req))
@@ -1391,7 +1574,7 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr,
loff_t oldsize;
int err;
- if (!fuse_allow_task(fc, current))
+ if (!fuse_allow_current_process(fc))
return -EACCES;
if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS))
@@ -1410,7 +1593,7 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr,
if (attr->ia_valid & ATTR_SIZE)
is_truncate = true;
- req = fuse_get_req(fc);
+ req = fuse_get_req_nopages(fc);
if (IS_ERR(req))
return PTR_ERR(req);
@@ -1500,7 +1683,7 @@ static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry,
struct inode *inode = entry->d_inode;
struct fuse_conn *fc = get_fuse_conn(inode);
- if (!fuse_allow_task(fc, current))
+ if (!fuse_allow_current_process(fc))
return -EACCES;
return fuse_update_attributes(inode, stat, NULL, NULL);
@@ -1518,7 +1701,7 @@ static int fuse_setxattr(struct dentry *entry, const char *name,
if (fc->no_setxattr)
return -EOPNOTSUPP;
- req = fuse_get_req(fc);
+ req = fuse_get_req_nopages(fc);
if (IS_ERR(req))
return PTR_ERR(req);
@@ -1557,7 +1740,7 @@ static ssize_t fuse_getxattr(struct dentry *entry, const char *name,
if (fc->no_getxattr)
return -EOPNOTSUPP;
- req = fuse_get_req(fc);
+ req = fuse_get_req_nopages(fc);
if (IS_ERR(req))
return PTR_ERR(req);
@@ -1603,13 +1786,13 @@ static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size)
struct fuse_getxattr_out outarg;
ssize_t ret;
- if (!fuse_allow_task(fc, current))
+ if (!fuse_allow_current_process(fc))
return -EACCES;
if (fc->no_listxattr)
return -EOPNOTSUPP;
- req = fuse_get_req(fc);
+ req = fuse_get_req_nopages(fc);
if (IS_ERR(req))
return PTR_ERR(req);
@@ -1654,7 +1837,7 @@ static int fuse_removexattr(struct dentry *entry, const char *name)
if (fc->no_removexattr)
return -EOPNOTSUPP;
- req = fuse_get_req(fc);
+ req = fuse_get_req_nopages(fc);
if (IS_ERR(req))
return PTR_ERR(req);
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index e21d4d8f87e..c8071768b95 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -25,7 +25,7 @@ static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
struct fuse_req *req;
int err;
- req = fuse_get_req(fc);
+ req = fuse_get_req_nopages(fc);
if (IS_ERR(req))
return PTR_ERR(req);
@@ -57,7 +57,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
return NULL;
ff->fc = fc;
- ff->reserved_req = fuse_request_alloc();
+ ff->reserved_req = fuse_request_alloc(0);
if (unlikely(!ff->reserved_req)) {
kfree(ff);
return NULL;
@@ -368,7 +368,7 @@ static int fuse_flush(struct file *file, fl_owner_t id)
if (fc->no_flush)
return 0;
- req = fuse_get_req_nofail(fc, file);
+ req = fuse_get_req_nofail_nopages(fc, file);
memset(&inarg, 0, sizeof(inarg));
inarg.fh = ff->fh;
inarg.lock_owner = fuse_lock_owner_id(fc, id);
@@ -436,7 +436,7 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
fuse_sync_writes(inode);
- req = fuse_get_req(fc);
+ req = fuse_get_req_nopages(fc);
if (IS_ERR(req)) {
err = PTR_ERR(req);
goto out;
@@ -544,7 +544,7 @@ static int fuse_readpage(struct file *file, struct page *page)
*/
fuse_wait_on_page_writeback(inode, page->index);
- req = fuse_get_req(fc);
+ req = fuse_get_req(fc, 1);
err = PTR_ERR(req);
if (IS_ERR(req))
goto out;
@@ -555,6 +555,7 @@ static int fuse_readpage(struct file *file, struct page *page)
req->out.argpages = 1;
req->num_pages = 1;
req->pages[0] = page;
+ req->page_descs[0].length = count;
num_read = fuse_send_read(req, file, pos, count, NULL);
err = req->out.h.error;
fuse_put_request(fc, req);
@@ -641,6 +642,7 @@ struct fuse_fill_data {
struct fuse_req *req;
struct file *file;
struct inode *inode;
+ unsigned nr_pages;
};
static int fuse_readpages_fill(void *_data, struct page *page)
@@ -656,16 +658,26 @@ static int fuse_readpages_fill(void *_data, struct page *page)
(req->num_pages == FUSE_MAX_PAGES_PER_REQ ||
(req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_read ||
req->pages[req->num_pages - 1]->index + 1 != page->index)) {
+ int nr_alloc = min_t(unsigned, data->nr_pages,
+ FUSE_MAX_PAGES_PER_REQ);
fuse_send_readpages(req, data->file);
- data->req = req = fuse_get_req(fc);
+ data->req = req = fuse_get_req(fc, nr_alloc);
if (IS_ERR(req)) {
unlock_page(page);
return PTR_ERR(req);
}
}
+
+ if (WARN_ON(req->num_pages >= req->max_pages)) {
+ fuse_put_request(fc, req);
+ return -EIO;
+ }
+
page_cache_get(page);
req->pages[req->num_pages] = page;
+ req->page_descs[req->num_pages].length = PAGE_SIZE;
req->num_pages++;
+ data->nr_pages--;
return 0;
}
@@ -676,6 +688,7 @@ static int fuse_readpages(struct file *file, struct address_space *mapping,
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_fill_data data;
int err;
+ int nr_alloc = min_t(unsigned, nr_pages, FUSE_MAX_PAGES_PER_REQ);
err = -EIO;
if (is_bad_inode(inode))
@@ -683,7 +696,8 @@ static int fuse_readpages(struct file *file, struct address_space *mapping,
data.file = file;
data.inode = inode;
- data.req = fuse_get_req(fc);
+ data.req = fuse_get_req(fc, nr_alloc);
+ data.nr_pages = nr_pages;
err = PTR_ERR(data.req);
if (IS_ERR(data.req))
goto out;
@@ -786,7 +800,7 @@ static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file,
res = fuse_send_write(req, file, pos, count, NULL);
- offset = req->page_offset;
+ offset = req->page_descs[0].offset;
count = res;
for (i = 0; i < req->num_pages; i++) {
struct page *page = req->pages[i];
@@ -817,7 +831,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
int err;
req->in.argpages = 1;
- req->page_offset = offset;
+ req->page_descs[0].offset = offset;
do {
size_t tmp;
@@ -857,6 +871,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
err = 0;
req->pages[req->num_pages] = page;
+ req->page_descs[req->num_pages].length = tmp;
req->num_pages++;
iov_iter_advance(ii, tmp);
@@ -869,11 +884,19 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
if (!fc->big_writes)
break;
} while (iov_iter_count(ii) && count < fc->max_write &&
- req->num_pages < FUSE_MAX_PAGES_PER_REQ && offset == 0);
+ req->num_pages < req->max_pages && offset == 0);
return count > 0 ? count : err;
}
+static inline unsigned fuse_wr_pages(loff_t pos, size_t len)
+{
+ return min_t(unsigned,
+ ((pos + len - 1) >> PAGE_CACHE_SHIFT) -
+ (pos >> PAGE_CACHE_SHIFT) + 1,
+ FUSE_MAX_PAGES_PER_REQ);
+}
+
static ssize_t fuse_perform_write(struct file *file,
struct address_space *mapping,
struct iov_iter *ii, loff_t pos)
@@ -889,8 +912,9 @@ static ssize_t fuse_perform_write(struct file *file,
do {
struct fuse_req *req;
ssize_t count;
+ unsigned nr_pages = fuse_wr_pages(pos, iov_iter_count(ii));
- req = fuse_get_req(fc);
+ req = fuse_get_req(fc, nr_pages);
if (IS_ERR(req)) {
err = PTR_ERR(req);
break;
@@ -1023,47 +1047,110 @@ static void fuse_release_user_pages(struct fuse_req *req, int write)
}
}
-static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf,
+static inline void fuse_page_descs_length_init(struct fuse_req *req,
+ unsigned index, unsigned nr_pages)
+{
+ int i;
+
+ for (i = index; i < index + nr_pages; i++)
+ req->page_descs[i].length = PAGE_SIZE -
+ req->page_descs[i].offset;
+}
+
+static inline unsigned long fuse_get_user_addr(const struct iov_iter *ii)
+{
+ return (unsigned long)ii->iov->iov_base + ii->iov_offset;
+}
+
+static inline size_t fuse_get_frag_size(const struct iov_iter *ii,
+ size_t max_size)
+{
+ return min(iov_iter_single_seg_count(ii), max_size);
+}
+
+static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii,
size_t *nbytesp, int write)
{
- size_t nbytes = *nbytesp;
- unsigned long user_addr = (unsigned long) buf;
- unsigned offset = user_addr & ~PAGE_MASK;
- int npages;
+ size_t nbytes = 0; /* # bytes already packed in req */
/* Special case for kernel I/O: can copy directly into the buffer */
if (segment_eq(get_fs(), KERNEL_DS)) {
+ unsigned long user_addr = fuse_get_user_addr(ii);
+ size_t frag_size = fuse_get_frag_size(ii, *nbytesp);
+
if (write)
req->in.args[1].value = (void *) user_addr;
else
req->out.args[0].value = (void *) user_addr;
+ iov_iter_advance(ii, frag_size);
+ *nbytesp = frag_size;
return 0;
}
- nbytes = min_t(size_t, nbytes, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT);
- npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
- npages = clamp(npages, 1, FUSE_MAX_PAGES_PER_REQ);
- npages = get_user_pages_fast(user_addr, npages, !write, req->pages);
- if (npages < 0)
- return npages;
+ while (nbytes < *nbytesp && req->num_pages < req->max_pages) {
+ unsigned npages;
+ unsigned long user_addr = fuse_get_user_addr(ii);
+ unsigned offset = user_addr & ~PAGE_MASK;
+ size_t frag_size = fuse_get_frag_size(ii, *nbytesp - nbytes);
+ int ret;
+
+ unsigned n = req->max_pages - req->num_pages;
+ frag_size = min_t(size_t, frag_size, n << PAGE_SHIFT);
+
+ npages = (frag_size + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ npages = clamp(npages, 1U, n);
+
+ ret = get_user_pages_fast(user_addr, npages, !write,
+ &req->pages[req->num_pages]);
+ if (ret < 0)
+ return ret;
- req->num_pages = npages;
- req->page_offset = offset;
+ npages = ret;
+ frag_size = min_t(size_t, frag_size,
+ (npages << PAGE_SHIFT) - offset);
+ iov_iter_advance(ii, frag_size);
+
+ req->page_descs[req->num_pages].offset = offset;
+ fuse_page_descs_length_init(req, req->num_pages, npages);
+
+ req->num_pages += npages;
+ req->page_descs[req->num_pages - 1].length -=
+ (npages << PAGE_SHIFT) - offset - frag_size;
+
+ nbytes += frag_size;
+ }
if (write)
req->in.argpages = 1;
else
req->out.argpages = 1;
- nbytes = (req->num_pages << PAGE_SHIFT) - req->page_offset;
- *nbytesp = min(*nbytesp, nbytes);
+ *nbytesp = nbytes;
return 0;
}
-ssize_t fuse_direct_io(struct file *file, const char __user *buf,
- size_t count, loff_t *ppos, int write)
+static inline int fuse_iter_npages(const struct iov_iter *ii_p)
+{
+ struct iov_iter ii = *ii_p;
+ int npages = 0;
+
+ while (iov_iter_count(&ii) && npages < FUSE_MAX_PAGES_PER_REQ) {
+ unsigned long user_addr = fuse_get_user_addr(&ii);
+ unsigned offset = user_addr & ~PAGE_MASK;
+ size_t frag_size = iov_iter_single_seg_count(&ii);
+
+ npages += (frag_size + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ iov_iter_advance(&ii, frag_size);
+ }
+
+ return min(npages, FUSE_MAX_PAGES_PER_REQ);
+}
+
+ssize_t fuse_direct_io(struct file *file, const struct iovec *iov,
+ unsigned long nr_segs, size_t count, loff_t *ppos,
+ int write)
{
struct fuse_file *ff = file->private_data;
struct fuse_conn *fc = ff->fc;
@@ -1071,8 +1158,11 @@ ssize_t fuse_direct_io(struct file *file, const char __user *buf,
loff_t pos = *ppos;
ssize_t res = 0;
struct fuse_req *req;
+ struct iov_iter ii;
+
+ iov_iter_init(&ii, iov, nr_segs, count, 0);
- req = fuse_get_req(fc);
+ req = fuse_get_req(fc, fuse_iter_npages(&ii));
if (IS_ERR(req))
return PTR_ERR(req);
@@ -1080,7 +1170,7 @@ ssize_t fuse_direct_io(struct file *file, const char __user *buf,
size_t nres;
fl_owner_t owner = current->files;
size_t nbytes = min(count, nmax);
- int err = fuse_get_user_pages(req, buf, &nbytes, write);
+ int err = fuse_get_user_pages(req, &ii, &nbytes, write);
if (err) {
res = err;
break;
@@ -1103,12 +1193,11 @@ ssize_t fuse_direct_io(struct file *file, const char __user *buf,
count -= nres;
res += nres;
pos += nres;
- buf += nres;
if (nres != nbytes)
break;
if (count) {
fuse_put_request(fc, req);
- req = fuse_get_req(fc);
+ req = fuse_get_req(fc, fuse_iter_npages(&ii));
if (IS_ERR(req))
break;
}
@@ -1122,8 +1211,8 @@ ssize_t fuse_direct_io(struct file *file, const char __user *buf,
}
EXPORT_SYMBOL_GPL(fuse_direct_io);
-static ssize_t fuse_direct_read(struct file *file, char __user *buf,
- size_t count, loff_t *ppos)
+static ssize_t __fuse_direct_read(struct file *file, const struct iovec *iov,
+ unsigned long nr_segs, loff_t *ppos)
{
ssize_t res;
struct inode *inode = file->f_path.dentry->d_inode;
@@ -1131,22 +1220,31 @@ static ssize_t fuse_direct_read(struct file *file, char __user *buf,
if (is_bad_inode(inode))
return -EIO;
- res = fuse_direct_io(file, buf, count, ppos, 0);
+ res = fuse_direct_io(file, iov, nr_segs, iov_length(iov, nr_segs),
+ ppos, 0);
fuse_invalidate_attr(inode);
return res;
}
-static ssize_t __fuse_direct_write(struct file *file, const char __user *buf,
- size_t count, loff_t *ppos)
+static ssize_t fuse_direct_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct iovec iov = { .iov_base = buf, .iov_len = count };
+ return __fuse_direct_read(file, &iov, 1, ppos);
+}
+
+static ssize_t __fuse_direct_write(struct file *file, const struct iovec *iov,
+ unsigned long nr_segs, loff_t *ppos)
{
struct inode *inode = file->f_path.dentry->d_inode;
+ size_t count = iov_length(iov, nr_segs);
ssize_t res;
res = generic_write_checks(file, ppos, &count, 0);
if (!res) {
- res = fuse_direct_io(file, buf, count, ppos, 1);
+ res = fuse_direct_io(file, iov, nr_segs, count, ppos, 1);
if (res > 0)
fuse_write_update_size(inode, *ppos);
}
@@ -1159,6 +1257,7 @@ static ssize_t __fuse_direct_write(struct file *file, const char __user *buf,
static ssize_t fuse_direct_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
+ struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count };
struct inode *inode = file->f_path.dentry->d_inode;
ssize_t res;
@@ -1167,7 +1266,7 @@ static ssize_t fuse_direct_write(struct file *file, const char __user *buf,
/* Don't allow parallel writes to the same file */
mutex_lock(&inode->i_mutex);
- res = __fuse_direct_write(file, buf, count, ppos);
+ res = __fuse_direct_write(file, &iov, 1, ppos);
mutex_unlock(&inode->i_mutex);
return res;
@@ -1272,7 +1371,7 @@ static int fuse_writepage_locked(struct page *page)
set_page_writeback(page);
- req = fuse_request_alloc_nofs();
+ req = fuse_request_alloc_nofs(1);
if (!req)
goto err;
@@ -1293,7 +1392,8 @@ static int fuse_writepage_locked(struct page *page)
req->in.argpages = 1;
req->num_pages = 1;
req->pages[0] = tmp_page;
- req->page_offset = 0;
+ req->page_descs[0].offset = 0;
+ req->page_descs[0].length = PAGE_SIZE;
req->end = fuse_writepage_end;
req->inode = inode;
@@ -1471,7 +1571,7 @@ static int fuse_getlk(struct file *file, struct file_lock *fl)
struct fuse_lk_out outarg;
int err;
- req = fuse_get_req(fc);
+ req = fuse_get_req_nopages(fc);
if (IS_ERR(req))
return PTR_ERR(req);
@@ -1506,7 +1606,7 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
if (fl->fl_flags & FL_CLOSE)
return 0;
- req = fuse_get_req(fc);
+ req = fuse_get_req_nopages(fc);
if (IS_ERR(req))
return PTR_ERR(req);
@@ -1575,7 +1675,7 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block)
if (!inode->i_sb->s_bdev || fc->no_bmap)
return 0;
- req = fuse_get_req(fc);
+ req = fuse_get_req_nopages(fc);
if (IS_ERR(req))
return 0;
@@ -1873,7 +1973,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
num_pages++;
}
- req = fuse_get_req(fc);
+ req = fuse_get_req(fc, num_pages);
if (IS_ERR(req)) {
err = PTR_ERR(req);
req = NULL;
@@ -1881,6 +1981,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
}
memcpy(req->pages, pages, sizeof(req->pages[0]) * num_pages);
req->num_pages = num_pages;
+ fuse_page_descs_length_init(req, 0, req->num_pages);
/* okay, let's send it to the client */
req->in.h.opcode = FUSE_IOCTL;
@@ -1981,7 +2082,7 @@ long fuse_ioctl_common(struct file *file, unsigned int cmd,
struct inode *inode = file->f_dentry->d_inode;
struct fuse_conn *fc = get_fuse_conn(inode);
- if (!fuse_allow_task(fc, current))
+ if (!fuse_allow_current_process(fc))
return -EACCES;
if (is_bad_inode(inode))
@@ -2066,6 +2167,7 @@ unsigned fuse_file_poll(struct file *file, poll_table *wait)
return DEFAULT_POLLMASK;
poll_wait(file, &ff->poll_wait, wait);
+ inarg.events = (__u32)poll_requested_events(wait);
/*
* Ask for notification iff there's someone waiting for it.
@@ -2076,7 +2178,7 @@ unsigned fuse_file_poll(struct file *file, poll_table *wait)
fuse_register_polled_file(fc, ff);
}
- req = fuse_get_req(fc);
+ req = fuse_get_req_nopages(fc);
if (IS_ERR(req))
return POLLERR;
@@ -2126,41 +2228,6 @@ int fuse_notify_poll_wakeup(struct fuse_conn *fc,
return 0;
}
-static ssize_t fuse_loop_dio(struct file *filp, const struct iovec *iov,
- unsigned long nr_segs, loff_t *ppos, int rw)
-{
- const struct iovec *vector = iov;
- ssize_t ret = 0;
-
- while (nr_segs > 0) {
- void __user *base;
- size_t len;
- ssize_t nr;
-
- base = vector->iov_base;
- len = vector->iov_len;
- vector++;
- nr_segs--;
-
- if (rw == WRITE)
- nr = __fuse_direct_write(filp, base, len, ppos);
- else
- nr = fuse_direct_read(filp, base, len, ppos);
-
- if (nr < 0) {
- if (!ret)
- ret = nr;
- break;
- }
- ret += nr;
- if (nr != len)
- break;
- }
-
- return ret;
-}
-
-
static ssize_t
fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
loff_t offset, unsigned long nr_segs)
@@ -2172,13 +2239,16 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
file = iocb->ki_filp;
pos = offset;
- ret = fuse_loop_dio(file, iov, nr_segs, &pos, rw);
+ if (rw == WRITE)
+ ret = __fuse_direct_write(file, iov, nr_segs, &pos);
+ else
+ ret = __fuse_direct_read(file, iov, nr_segs, &pos);
return ret;
}
-long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
- loff_t length)
+static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
+ loff_t length)
{
struct fuse_file *ff = file->private_data;
struct fuse_conn *fc = ff->fc;
@@ -2194,7 +2264,7 @@ long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
if (fc->no_fallocate)
return -EOPNOTSUPP;
- req = fuse_get_req(fc);
+ req = fuse_get_req_nopages(fc);
if (IS_ERR(req))
return PTR_ERR(req);
@@ -2213,7 +2283,6 @@ long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
return err;
}
-EXPORT_SYMBOL_GPL(fuse_file_fallocate);
static const struct file_operations fuse_file_operations = {
.llseek = fuse_file_llseek,
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index e105a53fc72..6aeba864f07 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -44,6 +44,9 @@
doing the mount will be allowed to access the filesystem */
#define FUSE_ALLOW_OTHER (1 << 1)
+/** Number of page pointers embedded in fuse_req */
+#define FUSE_REQ_INLINE_PAGES 1
+
/** List of active connections */
extern struct list_head fuse_conn_list;
@@ -103,6 +106,15 @@ struct fuse_inode {
/** List of writepage requestst (pending or sent) */
struct list_head writepages;
+
+ /** Miscellaneous bits describing inode state */
+ unsigned long state;
+};
+
+/** FUSE inode state bits */
+enum {
+ /** Advise readdirplus */
+ FUSE_I_ADVISE_RDPLUS,
};
struct fuse_conn;
@@ -200,6 +212,12 @@ struct fuse_out {
struct fuse_arg args[3];
};
+/** FUSE page descriptor */
+struct fuse_page_desc {
+ unsigned int length;
+ unsigned int offset;
+};
+
/** The request state */
enum fuse_req_state {
FUSE_REQ_INIT = 0,
@@ -291,14 +309,23 @@ struct fuse_req {
} misc;
/** page vector */
- struct page *pages[FUSE_MAX_PAGES_PER_REQ];
+ struct page **pages;
+
+ /** page-descriptor vector */
+ struct fuse_page_desc *page_descs;
+
+ /** size of the 'pages' array */
+ unsigned max_pages;
+
+ /** inline page vector */
+ struct page *inline_pages[FUSE_REQ_INLINE_PAGES];
+
+ /** inline page-descriptor vector */
+ struct fuse_page_desc inline_page_descs[FUSE_REQ_INLINE_PAGES];
/** number of pages in vector */
unsigned num_pages;
- /** offset of data on first page */
- unsigned page_offset;
-
/** File used in the request (or NULL) */
struct fuse_file *ff;
@@ -487,6 +514,12 @@ struct fuse_conn {
/** Use enhanced/automatic page cache invalidation. */
unsigned auto_inval_data:1;
+ /** Does the filesystem support readdirplus? */
+ unsigned do_readdirplus:1;
+
+ /** Does the filesystem want adaptive readdirplus? */
+ unsigned readdirplus_auto:1;
+
/** The number of requests waiting for completion */
atomic_t num_waiting;
@@ -578,6 +611,9 @@ void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget,
struct fuse_forget_link *fuse_alloc_forget(void);
+/* Used by READDIRPLUS */
+void fuse_force_forget(struct file *file, u64 nodeid);
+
/**
* Initialize READ or READDIR request
*/
@@ -658,9 +694,9 @@ void fuse_ctl_cleanup(void);
/**
* Allocate a request
*/
-struct fuse_req *fuse_request_alloc(void);
+struct fuse_req *fuse_request_alloc(unsigned npages);
-struct fuse_req *fuse_request_alloc_nofs(void);
+struct fuse_req *fuse_request_alloc_nofs(unsigned npages);
/**
* Free a request
@@ -668,14 +704,25 @@ struct fuse_req *fuse_request_alloc_nofs(void);
void fuse_request_free(struct fuse_req *req);
/**
- * Get a request, may fail with -ENOMEM
+ * Get a request, may fail with -ENOMEM,
+ * caller should specify # elements in req->pages[] explicitly
*/
-struct fuse_req *fuse_get_req(struct fuse_conn *fc);
+struct fuse_req *fuse_get_req(struct fuse_conn *fc, unsigned npages);
+
+/**
+ * Get a request, may fail with -ENOMEM,
+ * useful for callers who doesn't use req->pages[]
+ */
+static inline struct fuse_req *fuse_get_req_nopages(struct fuse_conn *fc)
+{
+ return fuse_get_req(fc, 0);
+}
/**
* Gets a requests for a file operation, always succeeds
*/
-struct fuse_req *fuse_get_req_nofail(struct fuse_conn *fc, struct file *file);
+struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc,
+ struct file *file);
/**
* Decrement reference count of a request. If count goes to zero free
@@ -739,9 +786,9 @@ void fuse_ctl_remove_conn(struct fuse_conn *fc);
int fuse_valid_type(int m);
/**
- * Is task allowed to perform filesystem operation?
+ * Is current process allowed to perform filesystem operation?
*/
-int fuse_allow_task(struct fuse_conn *fc, struct task_struct *task);
+int fuse_allow_current_process(struct fuse_conn *fc);
u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id);
@@ -776,8 +823,9 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
bool isdir);
-ssize_t fuse_direct_io(struct file *file, const char __user *buf,
- size_t count, loff_t *ppos, int write);
+ssize_t fuse_direct_io(struct file *file, const struct iovec *iov,
+ unsigned long nr_segs, size_t count, loff_t *ppos,
+ int write);
long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
unsigned int flags);
long fuse_ioctl_common(struct file *file, unsigned int cmd,
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index b730fda9aa4..df00993ed10 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -92,6 +92,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
fi->attr_version = 0;
fi->writectr = 0;
fi->orig_ino = 0;
+ fi->state = 0;
INIT_LIST_HEAD(&fi->write_files);
INIT_LIST_HEAD(&fi->queued_writes);
INIT_LIST_HEAD(&fi->writepages);
@@ -408,12 +409,12 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf)
struct fuse_statfs_out outarg;
int err;
- if (!fuse_allow_task(fc, current)) {
+ if (!fuse_allow_current_process(fc)) {
buf->f_type = FUSE_SUPER_MAGIC;
return 0;
}
- req = fuse_get_req(fc);
+ req = fuse_get_req_nopages(fc);
if (IS_ERR(req))
return PTR_ERR(req);
@@ -863,6 +864,10 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
fc->dont_mask = 1;
if (arg->flags & FUSE_AUTO_INVAL_DATA)
fc->auto_inval_data = 1;
+ if (arg->flags & FUSE_DO_READDIRPLUS)
+ fc->do_readdirplus = 1;
+ if (arg->flags & FUSE_READDIRPLUS_AUTO)
+ fc->readdirplus_auto = 1;
} else {
ra_pages = fc->max_read / PAGE_CACHE_SIZE;
fc->no_lock = 1;
@@ -889,7 +894,8 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC |
FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK |
FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ |
- FUSE_FLOCK_LOCKS | FUSE_IOCTL_DIR | FUSE_AUTO_INVAL_DATA;
+ FUSE_FLOCK_LOCKS | FUSE_IOCTL_DIR | FUSE_AUTO_INVAL_DATA |
+ FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO;
req->in.h.opcode = FUSE_INIT;
req->in.numargs = 1;
req->in.args[0].size = sizeof(*arg);
@@ -1034,12 +1040,12 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
/* only now - we want root dentry with NULL ->d_op */
sb->s_d_op = &fuse_dentry_operations;
- init_req = fuse_request_alloc();
+ init_req = fuse_request_alloc(0);
if (!init_req)
goto err_put_root;
if (is_bdev) {
- fc->destroy_req = fuse_request_alloc();
+ fc->destroy_req = fuse_request_alloc(0);
if (!fc->destroy_req)
goto err_free_init_req;
}
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index f850020ad90..f69ac0af549 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -237,7 +237,7 @@ static int gfs2_xattr_system_set(struct dentry *dentry, const char *name,
return -EINVAL;
if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
return value ? -EACCES : 0;
- if ((current_fsuid() != inode->i_uid) && !capable(CAP_FOWNER))
+ if (!uid_eq(current_fsuid(), inode->i_uid) && !capable(CAP_FOWNER))
return -EPERM;
if (S_ISLNK(inode->i_mode))
return -EOPNOTSUPP;
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 30de4f2a2ea..24f414f0ce6 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -51,7 +51,7 @@ static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
continue;
if (gfs2_is_jdata(ip))
set_buffer_uptodate(bh);
- gfs2_trans_add_bh(ip->i_gl, bh, 0);
+ gfs2_trans_add_data(ip->i_gl, bh);
}
}
@@ -230,16 +230,14 @@ out_ignore:
}
/**
- * gfs2_writeback_writepages - Write a bunch of dirty pages back to disk
+ * gfs2_writepages - Write a bunch of dirty pages back to disk
* @mapping: The mapping to write
* @wbc: Write-back control
*
- * For the data=writeback case we can already ignore buffer heads
- * and write whole extents at once. This is a big reduction in the
- * number of I/O requests we send and the bmap calls we make in this case.
+ * Used for both ordered and writeback modes.
*/
-static int gfs2_writeback_writepages(struct address_space *mapping,
- struct writeback_control *wbc)
+static int gfs2_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
{
return mpage_writepages(mapping, wbc, gfs2_get_block_noalloc);
}
@@ -852,7 +850,7 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
goto failed;
}
- gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_trans_add_meta(ip->i_gl, dibh);
if (gfs2_is_stuffed(ip))
return gfs2_stuffed_write_end(inode, dibh, pos, len, copied, page);
@@ -1102,7 +1100,7 @@ cannot_release:
static const struct address_space_operations gfs2_writeback_aops = {
.writepage = gfs2_writeback_writepage,
- .writepages = gfs2_writeback_writepages,
+ .writepages = gfs2_writepages,
.readpage = gfs2_readpage,
.readpages = gfs2_readpages,
.write_begin = gfs2_write_begin,
@@ -1118,6 +1116,7 @@ static const struct address_space_operations gfs2_writeback_aops = {
static const struct address_space_operations gfs2_ordered_aops = {
.writepage = gfs2_ordered_writepage,
+ .writepages = gfs2_writepages,
.readpage = gfs2_readpage,
.readpages = gfs2_readpages,
.write_begin = gfs2_write_begin,
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index a68e91bcef3..5e83657f046 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -22,6 +22,7 @@
#include "meta_io.h"
#include "quota.h"
#include "rgrp.h"
+#include "log.h"
#include "super.h"
#include "trans.h"
#include "dir.h"
@@ -93,7 +94,7 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
if (!gfs2_is_jdata(ip))
mark_buffer_dirty(bh);
if (!gfs2_is_writeback(ip))
- gfs2_trans_add_bh(ip->i_gl, bh, 0);
+ gfs2_trans_add_data(ip->i_gl, bh);
if (release) {
unlock_page(page);
@@ -153,7 +154,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
/* Set up the pointer to the new block */
- gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_trans_add_meta(ip->i_gl, dibh);
di = (struct gfs2_dinode *)dibh->b_data;
gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
@@ -405,7 +406,7 @@ static inline __be64 *gfs2_indirect_init(struct metapath *mp,
BUG_ON(i < 1);
BUG_ON(mp->mp_bh[i] != NULL);
mp->mp_bh[i] = gfs2_meta_new(gl, bn);
- gfs2_trans_add_bh(gl, mp->mp_bh[i], 1);
+ gfs2_trans_add_meta(gl, mp->mp_bh[i]);
gfs2_metatype_set(mp->mp_bh[i], GFS2_METATYPE_IN, GFS2_FORMAT_IN);
gfs2_buffer_clear_tail(mp->mp_bh[i], sizeof(struct gfs2_meta_header));
ptr += offset;
@@ -468,7 +469,7 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
BUG_ON(sheight < 1);
BUG_ON(dibh == NULL);
- gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_trans_add_meta(ip->i_gl, dibh);
if (height == sheight) {
struct buffer_head *bh;
@@ -544,7 +545,7 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
/* Branching from existing tree */
case ALLOC_GROW_DEPTH:
if (i > 1 && i < height)
- gfs2_trans_add_bh(ip->i_gl, mp->mp_bh[i-1], 1);
+ gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[i-1]);
for (; i < height && n > 0; i++, n--)
gfs2_indirect_init(mp, ip->i_gl, i,
mp->mp_list[i-1], bn++);
@@ -556,7 +557,7 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
case ALLOC_DATA:
BUG_ON(n > dblks);
BUG_ON(mp->mp_bh[end_of_metadata] == NULL);
- gfs2_trans_add_bh(ip->i_gl, mp->mp_bh[end_of_metadata], 1);
+ gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[end_of_metadata]);
dblks = n;
ptr = metapointer(end_of_metadata, mp);
dblock = bn;
@@ -796,8 +797,8 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
down_write(&ip->i_rw_mutex);
- gfs2_trans_add_bh(ip->i_gl, dibh, 1);
- gfs2_trans_add_bh(ip->i_gl, bh, 1);
+ gfs2_trans_add_meta(ip->i_gl, dibh);
+ gfs2_trans_add_meta(ip->i_gl, bh);
bstart = 0;
blen = 0;
@@ -981,7 +982,7 @@ static int gfs2_block_truncate_page(struct address_space *mapping, loff_t from)
}
if (!gfs2_is_writeback(ip))
- gfs2_trans_add_bh(ip->i_gl, bh, 0);
+ gfs2_trans_add_data(ip->i_gl, bh);
zero_user(page, offset, length);
mark_buffer_dirty(bh);
@@ -1046,7 +1047,7 @@ static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize)
if (error)
goto out;
- gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_trans_add_meta(ip->i_gl, dibh);
if (gfs2_is_stuffed(ip)) {
gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + newsize);
@@ -1098,7 +1099,7 @@ static int trunc_dealloc(struct gfs2_inode *ip, u64 size)
if (error)
return error;
- error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+ error = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
if (error)
return error;
@@ -1137,11 +1138,12 @@ static int trunc_end(struct gfs2_inode *ip)
ip->i_height = 0;
ip->i_goal = ip->i_no_addr;
gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
+ gfs2_ordered_del_inode(ip);
}
ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG;
- gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_trans_add_meta(ip->i_gl, dibh);
gfs2_dinode_out(ip, dibh->b_data);
brelse(dibh);
@@ -1246,7 +1248,7 @@ static int do_grow(struct inode *inode, u64 size)
i_size_write(inode, size);
ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
- gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_trans_add_meta(ip->i_gl, dibh);
gfs2_dinode_out(ip, dibh->b_data);
brelse(dibh);
@@ -1286,6 +1288,10 @@ int gfs2_setattr_size(struct inode *inode, u64 newsize)
inode_dio_wait(inode);
+ ret = gfs2_rs_alloc(GFS2_I(inode));
+ if (ret)
+ return ret;
+
oldsize = inode->i_size;
if (newsize >= oldsize)
return do_grow(inode, newsize);
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 9a35670fdc3..c3e82bd2317 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -93,7 +93,7 @@ int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block,
struct buffer_head *bh;
bh = gfs2_meta_new(ip->i_gl, block);
- gfs2_trans_add_bh(ip->i_gl, bh, 1);
+ gfs2_trans_add_meta(ip->i_gl, bh);
gfs2_metatype_set(bh, GFS2_METATYPE_JD, GFS2_FORMAT_JD);
gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header));
*bhp = bh;
@@ -127,7 +127,7 @@ static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf,
if (error)
return error;
- gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_trans_add_meta(ip->i_gl, dibh);
memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size);
if (ip->i_inode.i_size < offset + size)
i_size_write(&ip->i_inode, offset + size);
@@ -209,7 +209,7 @@ static int gfs2_dir_write_data(struct gfs2_inode *ip, const char *buf,
if (error)
goto fail;
- gfs2_trans_add_bh(ip->i_gl, bh, 1);
+ gfs2_trans_add_meta(ip->i_gl, bh);
memcpy(bh->b_data + o, buf, amount);
brelse(bh);
@@ -231,7 +231,7 @@ out:
i_size_write(&ip->i_inode, offset + copied);
ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
- gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_trans_add_meta(ip->i_gl, dibh);
gfs2_dinode_out(ip, dibh->b_data);
brelse(dibh);
@@ -647,7 +647,7 @@ static void dirent_del(struct gfs2_inode *dip, struct buffer_head *bh,
return;
}
- gfs2_trans_add_bh(dip->i_gl, bh, 1);
+ gfs2_trans_add_meta(dip->i_gl, bh);
/* If there is no prev entry, this is the first entry in the block.
The de_rec_len is already as big as it needs to be. Just zero
@@ -690,7 +690,7 @@ static struct gfs2_dirent *gfs2_init_dirent(struct inode *inode,
offset = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len));
totlen = be16_to_cpu(dent->de_rec_len);
BUG_ON(offset + name->len > totlen);
- gfs2_trans_add_bh(ip->i_gl, bh, 1);
+ gfs2_trans_add_meta(ip->i_gl, bh);
ndent = (struct gfs2_dirent *)((char *)dent + offset);
dent->de_rec_len = cpu_to_be16(offset);
gfs2_qstr2dirent(name, totlen - offset, ndent);
@@ -831,7 +831,7 @@ static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh,
return NULL;
gfs2_trans_add_unrevoke(GFS2_SB(inode), bn, 1);
- gfs2_trans_add_bh(ip->i_gl, bh, 1);
+ gfs2_trans_add_meta(ip->i_gl, bh);
gfs2_metatype_set(bh, GFS2_METATYPE_LF, GFS2_FORMAT_LF);
leaf = (struct gfs2_leaf *)bh->b_data;
leaf->lf_depth = cpu_to_be16(depth);
@@ -916,7 +916,7 @@ static int dir_make_exhash(struct inode *inode)
/* We're done with the new leaf block, now setup the new
hash table. */
- gfs2_trans_add_bh(dip->i_gl, dibh, 1);
+ gfs2_trans_add_meta(dip->i_gl, dibh);
gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
lp = (__be64 *)(dibh->b_data + sizeof(struct gfs2_dinode));
@@ -976,7 +976,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
return 1; /* can't split */
}
- gfs2_trans_add_bh(dip->i_gl, obh, 1);
+ gfs2_trans_add_meta(dip->i_gl, obh);
nleaf = new_leaf(inode, &nbh, be16_to_cpu(oleaf->lf_depth) + 1);
if (!nleaf) {
@@ -1069,7 +1069,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
error = gfs2_meta_inode_buffer(dip, &dibh);
if (!gfs2_assert_withdraw(GFS2_SB(&dip->i_inode), !error)) {
- gfs2_trans_add_bh(dip->i_gl, dibh, 1);
+ gfs2_trans_add_meta(dip->i_gl, dibh);
gfs2_add_inode_blocks(&dip->i_inode, 1);
gfs2_dinode_out(dip, dibh->b_data);
brelse(dibh);
@@ -1622,7 +1622,7 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name)
return error;
} while(1);
- gfs2_trans_add_bh(ip->i_gl, obh, 1);
+ gfs2_trans_add_meta(ip->i_gl, obh);
leaf = new_leaf(inode, &bh, be16_to_cpu(oleaf->lf_depth));
if (!leaf) {
@@ -1636,7 +1636,7 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name)
error = gfs2_meta_inode_buffer(ip, &bh);
if (error)
return error;
- gfs2_trans_add_bh(ip->i_gl, bh, 1);
+ gfs2_trans_add_meta(ip->i_gl, bh);
gfs2_add_inode_blocks(&ip->i_inode, 1);
gfs2_dinode_out(ip, bh->b_data);
brelse(bh);
@@ -1795,7 +1795,7 @@ int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
if (IS_ERR(dent))
return PTR_ERR(dent);
- gfs2_trans_add_bh(dip->i_gl, bh, 1);
+ gfs2_trans_add_meta(dip->i_gl, bh);
gfs2_inum_out(nip, dent);
dent->de_type = cpu_to_be16(new_type);
@@ -1804,7 +1804,7 @@ int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
error = gfs2_meta_inode_buffer(dip, &bh);
if (error)
return error;
- gfs2_trans_add_bh(dip->i_gl, bh, 1);
+ gfs2_trans_add_meta(dip->i_gl, bh);
}
dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME;
@@ -1849,7 +1849,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
if (!ht)
return -ENOMEM;
- error = gfs2_quota_hold(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+ error = gfs2_quota_hold(dip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
if (error)
goto out;
@@ -1917,7 +1917,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
if (error)
goto out_end_trans;
- gfs2_trans_add_bh(dip->i_gl, dibh, 1);
+ gfs2_trans_add_meta(dip->i_gl, dibh);
/* On the last dealloc, make this a regular file in case we crash.
(We don't want to free these blocks a second time.) */
if (last_dealloc)
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 44543df9f40..019f45e4509 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -276,7 +276,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
error = gfs2_meta_inode_buffer(ip, &bh);
if (error)
goto out_trans_end;
- gfs2_trans_add_bh(ip->i_gl, bh, 1);
+ gfs2_trans_add_meta(ip->i_gl, bh);
ip->i_diskflags = new_flags;
gfs2_dinode_out(ip, bh->b_data);
brelse(bh);
@@ -483,7 +483,7 @@ out:
gfs2_holder_uninit(&gh);
if (ret == 0) {
set_page_dirty(page);
- wait_on_page_writeback(page);
+ wait_for_stable_page(page);
}
sb_end_pagefault(inode->i_sb);
return block_page_mkwrite_return(ret);
@@ -708,7 +708,7 @@ static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
if (unlikely(error))
return error;
- gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_trans_add_meta(ip->i_gl, dibh);
if (gfs2_is_stuffed(ip)) {
error = gfs2_unstuff_dinode(ip, NULL);
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 992c5c0cb50..cf351554673 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -30,6 +30,7 @@
#include <linux/rculist_bl.h>
#include <linux/bit_spinlock.h>
#include <linux/percpu.h>
+#include <linux/list_sort.h>
#include "gfs2.h"
#include "incore.h"
@@ -1376,56 +1377,105 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
gfs2_glock_put(gl);
}
+static int glock_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+ struct gfs2_glock *gla, *glb;
-static int gfs2_shrink_glock_memory(struct shrinker *shrink,
- struct shrink_control *sc)
+ gla = list_entry(a, struct gfs2_glock, gl_lru);
+ glb = list_entry(b, struct gfs2_glock, gl_lru);
+
+ if (gla->gl_name.ln_number > glb->gl_name.ln_number)
+ return 1;
+ if (gla->gl_name.ln_number < glb->gl_name.ln_number)
+ return -1;
+
+ return 0;
+}
+
+/**
+ * gfs2_dispose_glock_lru - Demote a list of glocks
+ * @list: The list to dispose of
+ *
+ * Disposing of glocks may involve disk accesses, so that here we sort
+ * the glocks by number (i.e. disk location of the inodes) so that if
+ * there are any such accesses, they'll be sent in order (mostly).
+ *
+ * Must be called under the lru_lock, but may drop and retake this
+ * lock. While the lru_lock is dropped, entries may vanish from the
+ * list, but no new entries will appear on the list (since it is
+ * private)
+ */
+
+static void gfs2_dispose_glock_lru(struct list_head *list)
+__releases(&lru_lock)
+__acquires(&lru_lock)
{
struct gfs2_glock *gl;
- int may_demote;
- int nr_skipped = 0;
- int nr = sc->nr_to_scan;
- gfp_t gfp_mask = sc->gfp_mask;
- LIST_HEAD(skipped);
- if (nr == 0)
- goto out;
+ list_sort(NULL, list, glock_cmp);
- if (!(gfp_mask & __GFP_FS))
- return -1;
+ while(!list_empty(list)) {
+ gl = list_entry(list->next, struct gfs2_glock, gl_lru);
+ list_del_init(&gl->gl_lru);
+ clear_bit(GLF_LRU, &gl->gl_flags);
+ gfs2_glock_hold(gl);
+ spin_unlock(&lru_lock);
+ spin_lock(&gl->gl_spin);
+ if (demote_ok(gl))
+ handle_callback(gl, LM_ST_UNLOCKED, 0);
+ WARN_ON(!test_and_clear_bit(GLF_LOCK, &gl->gl_flags));
+ smp_mb__after_clear_bit();
+ if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
+ gfs2_glock_put_nolock(gl);
+ spin_unlock(&gl->gl_spin);
+ spin_lock(&lru_lock);
+ }
+}
+
+/**
+ * gfs2_scan_glock_lru - Scan the LRU looking for locks to demote
+ * @nr: The number of entries to scan
+ *
+ * This function selects the entries on the LRU which are able to
+ * be demoted, and then kicks off the process by calling
+ * gfs2_dispose_glock_lru() above.
+ */
+
+static void gfs2_scan_glock_lru(int nr)
+{
+ struct gfs2_glock *gl;
+ LIST_HEAD(skipped);
+ LIST_HEAD(dispose);
spin_lock(&lru_lock);
while(nr && !list_empty(&lru_list)) {
gl = list_entry(lru_list.next, struct gfs2_glock, gl_lru);
- list_del_init(&gl->gl_lru);
- clear_bit(GLF_LRU, &gl->gl_flags);
- atomic_dec(&lru_count);
/* Test for being demotable */
if (!test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
- gfs2_glock_hold(gl);
- spin_unlock(&lru_lock);
- spin_lock(&gl->gl_spin);
- may_demote = demote_ok(gl);
- if (may_demote) {
- handle_callback(gl, LM_ST_UNLOCKED, 0);
- nr--;
- }
- clear_bit(GLF_LOCK, &gl->gl_flags);
- smp_mb__after_clear_bit();
- if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
- gfs2_glock_put_nolock(gl);
- spin_unlock(&gl->gl_spin);
- spin_lock(&lru_lock);
+ list_move(&gl->gl_lru, &dispose);
+ atomic_dec(&lru_count);
+ nr--;
continue;
}
- nr_skipped++;
- list_add(&gl->gl_lru, &skipped);
- set_bit(GLF_LRU, &gl->gl_flags);
+
+ list_move(&gl->gl_lru, &skipped);
}
list_splice(&skipped, &lru_list);
- atomic_add(nr_skipped, &lru_count);
+ if (!list_empty(&dispose))
+ gfs2_dispose_glock_lru(&dispose);
spin_unlock(&lru_lock);
-out:
+}
+
+static int gfs2_shrink_glock_memory(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ if (sc->nr_to_scan) {
+ if (!(sc->gfp_mask & __GFP_FS))
+ return -1;
+ gfs2_scan_glock_lru(sc->nr_to_scan);
+ }
+
return (atomic_read(&lru_count) / 100) * sysctl_vfs_cache_pressure;
}
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 78d4184ffc7..444b6503ebc 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -322,8 +322,8 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
break;
};
- ip->i_inode.i_uid = be32_to_cpu(str->di_uid);
- ip->i_inode.i_gid = be32_to_cpu(str->di_gid);
+ i_uid_write(&ip->i_inode, be32_to_cpu(str->di_uid));
+ i_gid_write(&ip->i_inode, be32_to_cpu(str->di_gid));
gfs2_set_nlink(&ip->i_inode, be32_to_cpu(str->di_nlink));
i_size_write(&ip->i_inode, be64_to_cpu(str->di_size));
gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks));
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index c373a24fedd..156e42ec84e 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -52,7 +52,6 @@ struct gfs2_log_header_host {
*/
struct gfs2_log_operations {
- void (*lo_add) (struct gfs2_sbd *sdp, struct gfs2_bufdata *bd);
void (*lo_before_commit) (struct gfs2_sbd *sdp);
void (*lo_after_commit) (struct gfs2_sbd *sdp, struct gfs2_ail *ai);
void (*lo_before_scan) (struct gfs2_jdesc *jd,
@@ -341,6 +340,7 @@ enum {
GIF_QD_LOCKED = 1,
GIF_ALLOC_FAILED = 2,
GIF_SW_PAGED = 3,
+ GIF_ORDERED = 4,
};
struct gfs2_inode {
@@ -357,6 +357,7 @@ struct gfs2_inode {
struct gfs2_rgrpd *i_rgd;
u64 i_goal; /* goal block for allocations */
struct rw_semaphore i_rw_mutex;
+ struct list_head i_ordered;
struct list_head i_trunc_list;
__be64 *i_hash_cache;
u32 i_entries;
@@ -391,7 +392,6 @@ struct gfs2_revoke_replay {
};
enum {
- QDF_USER = 0,
QDF_CHANGE = 1,
QDF_LOCKED = 2,
QDF_REFRESH = 3,
@@ -403,7 +403,7 @@ struct gfs2_quota_data {
atomic_t qd_count;
- u32 qd_id;
+ struct kqid qd_id;
unsigned long qd_flags; /* QDF_... */
s64 qd_change;
@@ -641,6 +641,7 @@ struct gfs2_sbd {
wait_queue_head_t sd_glock_wait;
atomic_t sd_glock_disposal;
struct completion sd_locking_init;
+ struct completion sd_wdack;
struct delayed_work sd_control_work;
/* Inode Stuff */
@@ -723,6 +724,7 @@ struct gfs2_sbd {
struct list_head sd_log_le_revoke;
struct list_head sd_log_le_databuf;
struct list_head sd_log_le_ordered;
+ spinlock_t sd_ordered_lock;
atomic_t sd_log_thresh1;
atomic_t sd_log_thresh2;
@@ -758,10 +760,7 @@ struct gfs2_sbd {
unsigned int sd_replayed_blocks;
/* For quiescing the filesystem */
-
struct gfs2_holder sd_freeze_gh;
- struct mutex sd_freeze_lock;
- unsigned int sd_freeze_count;
char sd_fsname[GFS2_FSNAME_LEN];
char sd_table_name[GFS2_FSNAME_LEN];
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 2b6f5698ef1..cc00bd1d1f8 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -368,10 +368,11 @@ static void munge_mode_uid_gid(const struct gfs2_inode *dip,
struct inode *inode)
{
if (GFS2_SB(&dip->i_inode)->sd_args.ar_suiddir &&
- (dip->i_inode.i_mode & S_ISUID) && dip->i_inode.i_uid) {
+ (dip->i_inode.i_mode & S_ISUID) &&
+ !uid_eq(dip->i_inode.i_uid, GLOBAL_ROOT_UID)) {
if (S_ISDIR(inode->i_mode))
inode->i_mode |= S_ISUID;
- else if (dip->i_inode.i_uid != current_fsuid())
+ else if (!uid_eq(dip->i_inode.i_uid, current_fsuid()))
inode->i_mode &= ~07111;
inode->i_uid = dip->i_inode.i_uid;
} else
@@ -447,7 +448,7 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_inode *ip,
struct timespec tv = CURRENT_TIME;
dibh = gfs2_meta_new(ip->i_gl, ip->i_no_addr);
- gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_trans_add_meta(ip->i_gl, dibh);
gfs2_metatype_set(dibh, GFS2_METATYPE_DI, GFS2_FORMAT_DI);
gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
di = (struct gfs2_dinode *)dibh->b_data;
@@ -455,8 +456,8 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_inode *ip,
di->di_num.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino);
di->di_num.no_addr = cpu_to_be64(ip->i_no_addr);
di->di_mode = cpu_to_be32(ip->i_inode.i_mode);
- di->di_uid = cpu_to_be32(ip->i_inode.i_uid);
- di->di_gid = cpu_to_be32(ip->i_inode.i_gid);
+ di->di_uid = cpu_to_be32(i_uid_read(&ip->i_inode));
+ di->di_gid = cpu_to_be32(i_gid_read(&ip->i_inode));
di->di_nlink = 0;
di->di_size = cpu_to_be64(ip->i_inode.i_size);
di->di_blocks = cpu_to_be64(1);
@@ -548,7 +549,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
if (error)
return error;
- error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+ error = gfs2_quota_lock(dip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
if (error)
goto fail;
@@ -584,7 +585,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
if (error)
goto fail_end_trans;
set_nlink(&ip->i_inode, S_ISDIR(ip->i_inode.i_mode) ? 2 : 1);
- gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_trans_add_meta(ip->i_gl, dibh);
gfs2_dinode_out(ip, dibh->b_data);
brelse(dibh);
return 0;
@@ -931,7 +932,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
if (error)
goto out_brelse;
- gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_trans_add_meta(ip->i_gl, dibh);
inc_nlink(&ip->i_inode);
ip->i_inode.i_ctime = CURRENT_TIME;
ihold(inode);
@@ -978,8 +979,8 @@ static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
return -EPERM;
if ((dip->i_inode.i_mode & S_ISVTX) &&
- dip->i_inode.i_uid != current_fsuid() &&
- ip->i_inode.i_uid != current_fsuid() && !capable(CAP_FOWNER))
+ !uid_eq(dip->i_inode.i_uid, current_fsuid()) &&
+ !uid_eq(ip->i_inode.i_uid, current_fsuid()) && !capable(CAP_FOWNER))
return -EPERM;
if (IS_APPEND(&dip->i_inode))
@@ -1412,7 +1413,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
if (error)
goto out_end_trans;
ip->i_inode.i_ctime = CURRENT_TIME;
- gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_trans_add_meta(ip->i_gl, dibh);
gfs2_dinode_out(ip, dibh->b_data);
brelse(dibh);
}
@@ -1580,7 +1581,8 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
{
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
- u32 ouid, ogid, nuid, ngid;
+ kuid_t ouid, nuid;
+ kgid_t ogid, ngid;
int error;
ouid = inode->i_uid;
@@ -1588,16 +1590,17 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
nuid = attr->ia_uid;
ngid = attr->ia_gid;
- if (!(attr->ia_valid & ATTR_UID) || ouid == nuid)
- ouid = nuid = NO_QUOTA_CHANGE;
- if (!(attr->ia_valid & ATTR_GID) || ogid == ngid)
- ogid = ngid = NO_QUOTA_CHANGE;
+ if (!(attr->ia_valid & ATTR_UID) || uid_eq(ouid, nuid))
+ ouid = nuid = NO_UID_QUOTA_CHANGE;
+ if (!(attr->ia_valid & ATTR_GID) || gid_eq(ogid, ngid))
+ ogid = ngid = NO_GID_QUOTA_CHANGE;
error = gfs2_quota_lock(ip, nuid, ngid);
if (error)
return error;
- if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) {
+ if (!uid_eq(ouid, NO_UID_QUOTA_CHANGE) ||
+ !gid_eq(ogid, NO_GID_QUOTA_CHANGE)) {
error = gfs2_quota_check(ip, nuid, ngid);
if (error)
goto out_gunlock_q;
@@ -1611,7 +1614,8 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
if (error)
goto out_end_trans;
- if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) {
+ if (!uid_eq(ouid, NO_UID_QUOTA_CHANGE) ||
+ !gid_eq(ogid, NO_GID_QUOTA_CHANGE)) {
u64 blocks = gfs2_get_inode_blocks(&ip->i_inode);
gfs2_quota_change(ip, -blocks, ouid, ogid);
gfs2_quota_change(ip, blocks, nuid, ngid);
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index b906ed17a83..9802de0f85e 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -281,6 +281,7 @@ static void gdlm_put_lock(struct gfs2_glock *gl)
{
struct gfs2_sbd *sdp = gl->gl_sbd;
struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+ int lvb_needs_unlock = 0;
int error;
if (gl->gl_lksb.sb_lkid == 0) {
@@ -294,8 +295,12 @@ static void gdlm_put_lock(struct gfs2_glock *gl)
gfs2_update_request_times(gl);
/* don't want to skip dlm_unlock writing the lvb when lock is ex */
+
+ if (gl->gl_lksb.sb_lvbptr && (gl->gl_state == LM_ST_EXCLUSIVE))
+ lvb_needs_unlock = 1;
+
if (test_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags) &&
- gl->gl_lksb.sb_lvbptr && (gl->gl_state != LM_ST_EXCLUSIVE)) {
+ !lvb_needs_unlock) {
gfs2_glock_free(gl);
return;
}
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index f4beeb9c81c..9a2ca8be764 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -482,70 +482,66 @@ static void log_flush_wait(struct gfs2_sbd *sdp)
}
}
-static int bd_cmp(void *priv, struct list_head *a, struct list_head *b)
+static int ip_cmp(void *priv, struct list_head *a, struct list_head *b)
{
- struct gfs2_bufdata *bda, *bdb;
+ struct gfs2_inode *ipa, *ipb;
- bda = list_entry(a, struct gfs2_bufdata, bd_list);
- bdb = list_entry(b, struct gfs2_bufdata, bd_list);
+ ipa = list_entry(a, struct gfs2_inode, i_ordered);
+ ipb = list_entry(b, struct gfs2_inode, i_ordered);
- if (bda->bd_bh->b_blocknr < bdb->bd_bh->b_blocknr)
+ if (ipa->i_no_addr < ipb->i_no_addr)
return -1;
- if (bda->bd_bh->b_blocknr > bdb->bd_bh->b_blocknr)
+ if (ipa->i_no_addr > ipb->i_no_addr)
return 1;
return 0;
}
static void gfs2_ordered_write(struct gfs2_sbd *sdp)
{
- struct gfs2_bufdata *bd;
- struct buffer_head *bh;
+ struct gfs2_inode *ip;
LIST_HEAD(written);
- gfs2_log_lock(sdp);
- list_sort(NULL, &sdp->sd_log_le_ordered, &bd_cmp);
+ spin_lock(&sdp->sd_ordered_lock);
+ list_sort(NULL, &sdp->sd_log_le_ordered, &ip_cmp);
while (!list_empty(&sdp->sd_log_le_ordered)) {
- bd = list_entry(sdp->sd_log_le_ordered.next, struct gfs2_bufdata, bd_list);
- list_move(&bd->bd_list, &written);
- bh = bd->bd_bh;
- if (!buffer_dirty(bh))
+ ip = list_entry(sdp->sd_log_le_ordered.next, struct gfs2_inode, i_ordered);
+ list_move(&ip->i_ordered, &written);
+ if (ip->i_inode.i_mapping->nrpages == 0)
continue;
- get_bh(bh);
- gfs2_log_unlock(sdp);
- lock_buffer(bh);
- if (buffer_mapped(bh) && test_clear_buffer_dirty(bh)) {
- bh->b_end_io = end_buffer_write_sync;
- submit_bh(WRITE_SYNC, bh);
- } else {
- unlock_buffer(bh);
- brelse(bh);
- }
- gfs2_log_lock(sdp);
+ spin_unlock(&sdp->sd_ordered_lock);
+ filemap_fdatawrite(ip->i_inode.i_mapping);
+ spin_lock(&sdp->sd_ordered_lock);
}
list_splice(&written, &sdp->sd_log_le_ordered);
- gfs2_log_unlock(sdp);
+ spin_unlock(&sdp->sd_ordered_lock);
}
static void gfs2_ordered_wait(struct gfs2_sbd *sdp)
{
- struct gfs2_bufdata *bd;
- struct buffer_head *bh;
+ struct gfs2_inode *ip;
- gfs2_log_lock(sdp);
+ spin_lock(&sdp->sd_ordered_lock);
while (!list_empty(&sdp->sd_log_le_ordered)) {
- bd = list_entry(sdp->sd_log_le_ordered.prev, struct gfs2_bufdata, bd_list);
- bh = bd->bd_bh;
- if (buffer_locked(bh)) {
- get_bh(bh);
- gfs2_log_unlock(sdp);
- wait_on_buffer(bh);
- brelse(bh);
- gfs2_log_lock(sdp);
+ ip = list_entry(sdp->sd_log_le_ordered.next, struct gfs2_inode, i_ordered);
+ list_del(&ip->i_ordered);
+ WARN_ON(!test_and_clear_bit(GIF_ORDERED, &ip->i_flags));
+ if (ip->i_inode.i_mapping->nrpages == 0)
continue;
- }
- list_del_init(&bd->bd_list);
+ spin_unlock(&sdp->sd_ordered_lock);
+ filemap_fdatawait(ip->i_inode.i_mapping);
+ spin_lock(&sdp->sd_ordered_lock);
}
- gfs2_log_unlock(sdp);
+ spin_unlock(&sdp->sd_ordered_lock);
+}
+
+void gfs2_ordered_del_inode(struct gfs2_inode *ip)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+
+ spin_lock(&sdp->sd_ordered_lock);
+ if (test_and_clear_bit(GIF_ORDERED, &ip->i_flags))
+ list_del(&ip->i_ordered);
+ spin_unlock(&sdp->sd_ordered_lock);
}
/**
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index 3fd5215ea25..3566f35915e 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -48,6 +48,18 @@ static inline void gfs2_log_pointers_init(struct gfs2_sbd *sdp,
sdp->sd_log_head = sdp->sd_log_tail = value;
}
+static inline void gfs2_ordered_add_inode(struct gfs2_inode *ip)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+
+ if (!test_bit(GIF_ORDERED, &ip->i_flags)) {
+ spin_lock(&sdp->sd_ordered_lock);
+ if (!test_and_set_bit(GIF_ORDERED, &ip->i_flags))
+ list_add(&ip->i_ordered, &sdp->sd_log_le_ordered);
+ spin_unlock(&sdp->sd_ordered_lock);
+ }
+}
+extern void gfs2_ordered_del_inode(struct gfs2_inode *ip);
extern unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
unsigned int ssize);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 9ceccb1595a..a5055977a21 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -37,7 +37,7 @@
*
* The log lock must be held when calling this function
*/
-static void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh)
+void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh)
{
struct gfs2_bufdata *bd;
@@ -388,32 +388,6 @@ static struct page *gfs2_get_log_desc(struct gfs2_sbd *sdp, u32 ld_type,
return page;
}
-static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
-{
- struct gfs2_meta_header *mh;
- struct gfs2_trans *tr;
-
- tr = current->journal_info;
- tr->tr_touched = 1;
- if (!list_empty(&bd->bd_list))
- return;
- set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags);
- set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags);
- mh = (struct gfs2_meta_header *)bd->bd_bh->b_data;
- if (unlikely(mh->mh_magic != cpu_to_be32(GFS2_MAGIC))) {
- printk(KERN_ERR
- "Attempting to add uninitialised block to journal (inplace block=%lld)\n",
- (unsigned long long)bd->bd_bh->b_blocknr);
- BUG();
- }
- gfs2_pin(sdp, bd->bd_bh);
- mh->__pad0 = cpu_to_be64(0);
- mh->mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid);
- sdp->sd_log_num_buf++;
- list_add(&bd->bd_list, &sdp->sd_log_le_buf);
- tr->tr_num_buf_new++;
-}
-
static void gfs2_check_magic(struct buffer_head *bh)
{
void *kaddr;
@@ -600,20 +574,6 @@ static void buf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks);
}
-static void revoke_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
-{
- struct gfs2_glock *gl = bd->bd_gl;
- struct gfs2_trans *tr;
-
- tr = current->journal_info;
- tr->tr_touched = 1;
- tr->tr_num_revoke++;
- sdp->sd_log_num_revoke++;
- atomic_inc(&gl->gl_revokes);
- set_bit(GLF_LFLUSH, &gl->gl_flags);
- list_add(&bd->bd_list, &sdp->sd_log_le_revoke);
-}
-
static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
{
struct gfs2_meta_header *mh;
@@ -749,44 +709,6 @@ static void revoke_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
}
/**
- * databuf_lo_add - Add a databuf to the transaction.
- *
- * This is used in two distinct cases:
- * i) In ordered write mode
- * We put the data buffer on a list so that we can ensure that its
- * synced to disk at the right time
- * ii) In journaled data mode
- * We need to journal the data block in the same way as metadata in
- * the functions above. The difference is that here we have a tag
- * which is two __be64's being the block number (as per meta data)
- * and a flag which says whether the data block needs escaping or
- * not. This means we need a new log entry for each 251 or so data
- * blocks, which isn't an enormous overhead but twice as much as
- * for normal metadata blocks.
- */
-static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
-{
- struct gfs2_trans *tr = current->journal_info;
- struct address_space *mapping = bd->bd_bh->b_page->mapping;
- struct gfs2_inode *ip = GFS2_I(mapping->host);
-
- if (tr)
- tr->tr_touched = 1;
- if (!list_empty(&bd->bd_list))
- return;
- set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags);
- set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags);
- if (gfs2_is_jdata(ip)) {
- gfs2_pin(sdp, bd->bd_bh);
- tr->tr_num_databuf_new++;
- sdp->sd_log_num_databuf++;
- list_add_tail(&bd->bd_list, &sdp->sd_log_le_databuf);
- } else {
- list_add_tail(&bd->bd_list, &sdp->sd_log_le_ordered);
- }
-}
-
-/**
* databuf_lo_before_commit - Scan the data buffers, writing as we go
*
*/
@@ -885,7 +807,6 @@ static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
const struct gfs2_log_operations gfs2_buf_lops = {
- .lo_add = buf_lo_add,
.lo_before_commit = buf_lo_before_commit,
.lo_after_commit = buf_lo_after_commit,
.lo_before_scan = buf_lo_before_scan,
@@ -895,7 +816,6 @@ const struct gfs2_log_operations gfs2_buf_lops = {
};
const struct gfs2_log_operations gfs2_revoke_lops = {
- .lo_add = revoke_lo_add,
.lo_before_commit = revoke_lo_before_commit,
.lo_after_commit = revoke_lo_after_commit,
.lo_before_scan = revoke_lo_before_scan,
@@ -909,7 +829,6 @@ const struct gfs2_log_operations gfs2_rg_lops = {
};
const struct gfs2_log_operations gfs2_databuf_lops = {
- .lo_add = databuf_lo_add,
.lo_before_commit = databuf_lo_before_commit,
.lo_after_commit = databuf_lo_after_commit,
.lo_scan_elements = databuf_lo_scan_elements,
diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h
index 954a330585f..ba77b7da832 100644
--- a/fs/gfs2/lops.h
+++ b/fs/gfs2/lops.h
@@ -29,6 +29,7 @@ extern const struct gfs2_log_operations gfs2_databuf_lops;
extern const struct gfs2_log_operations *gfs2_log_ops[];
extern void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page);
extern void gfs2_log_flush_bio(struct gfs2_sbd *sdp, int rw);
+extern void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh);
static inline unsigned int buf_limit(struct gfs2_sbd *sdp)
{
@@ -46,19 +47,6 @@ static inline unsigned int databuf_limit(struct gfs2_sbd *sdp)
return limit;
}
-static inline void lops_init_le(struct gfs2_bufdata *bd,
- const struct gfs2_log_operations *lops)
-{
- INIT_LIST_HEAD(&bd->bd_list);
- bd->bd_ops = lops;
-}
-
-static inline void lops_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
-{
- if (bd->bd_ops->lo_add)
- bd->bd_ops->lo_add(sdp, bd);
-}
-
static inline void lops_before_commit(struct gfs2_sbd *sdp)
{
int x;
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 22255d96b27..b059bbb5059 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -271,41 +271,6 @@ int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh)
return 0;
}
-/**
- * gfs2_attach_bufdata - attach a struct gfs2_bufdata structure to a buffer
- * @gl: the glock the buffer belongs to
- * @bh: The buffer to be attached to
- * @meta: Flag to indicate whether its metadata or not
- */
-
-void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh,
- int meta)
-{
- struct gfs2_bufdata *bd;
-
- if (meta)
- lock_page(bh->b_page);
-
- if (bh->b_private) {
- if (meta)
- unlock_page(bh->b_page);
- return;
- }
-
- bd = kmem_cache_zalloc(gfs2_bufdata_cachep, GFP_NOFS | __GFP_NOFAIL);
- bd->bd_bh = bh;
- bd->bd_gl = gl;
-
- if (meta)
- lops_init_le(bd, &gfs2_buf_lops);
- else
- lops_init_le(bd, &gfs2_databuf_lops);
- bh->b_private = bd;
-
- if (meta)
- unlock_page(bh->b_page);
-}
-
void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int meta)
{
struct address_space *mapping = bh->b_page->mapping;
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index c30973b07a7..0d4c843b6f8 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -56,9 +56,6 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno,
int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh);
struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create);
-void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh,
- int meta);
-
void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr,
int meta);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 0e3554edb8f..1b612be4b87 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -81,6 +81,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
init_waitqueue_head(&sdp->sd_glock_wait);
atomic_set(&sdp->sd_glock_disposal, 0);
init_completion(&sdp->sd_locking_init);
+ init_completion(&sdp->sd_wdack);
spin_lock_init(&sdp->sd_statfs_spin);
spin_lock_init(&sdp->sd_rindex_spin);
@@ -102,6 +103,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
INIT_LIST_HEAD(&sdp->sd_log_le_revoke);
INIT_LIST_HEAD(&sdp->sd_log_le_databuf);
INIT_LIST_HEAD(&sdp->sd_log_le_ordered);
+ spin_lock_init(&sdp->sd_ordered_lock);
init_waitqueue_head(&sdp->sd_log_waitq);
init_waitqueue_head(&sdp->sd_logd_waitq);
@@ -115,8 +117,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
INIT_LIST_HEAD(&sdp->sd_revoke_list);
- mutex_init(&sdp->sd_freeze_lock);
-
return sdp;
}
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index ae55e248c3b..c7c840e916f 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -65,13 +65,10 @@
#include "inode.h"
#include "util.h"
-#define QUOTA_USER 1
-#define QUOTA_GROUP 0
-
struct gfs2_quota_change_host {
u64 qc_change;
u32 qc_flags; /* GFS2_QCF_... */
- u32 qc_id;
+ struct kqid qc_id;
};
static LIST_HEAD(qd_lru_list);
@@ -120,17 +117,24 @@ out:
return (atomic_read(&qd_lru_count) * sysctl_vfs_cache_pressure) / 100;
}
+static u64 qd2index(struct gfs2_quota_data *qd)
+{
+ struct kqid qid = qd->qd_id;
+ return (2 * (u64)from_kqid(&init_user_ns, qid)) +
+ (qid.type == USRQUOTA) ? 0 : 1;
+}
+
static u64 qd2offset(struct gfs2_quota_data *qd)
{
u64 offset;
- offset = 2 * (u64)qd->qd_id + !test_bit(QDF_USER, &qd->qd_flags);
+ offset = qd2index(qd);
offset *= sizeof(struct gfs2_quota);
return offset;
}
-static int qd_alloc(struct gfs2_sbd *sdp, int user, u32 id,
+static int qd_alloc(struct gfs2_sbd *sdp, struct kqid qid,
struct gfs2_quota_data **qdp)
{
struct gfs2_quota_data *qd;
@@ -141,13 +145,11 @@ static int qd_alloc(struct gfs2_sbd *sdp, int user, u32 id,
return -ENOMEM;
atomic_set(&qd->qd_count, 1);
- qd->qd_id = id;
- if (user)
- set_bit(QDF_USER, &qd->qd_flags);
+ qd->qd_id = qid;
qd->qd_slot = -1;
INIT_LIST_HEAD(&qd->qd_reclaim);
- error = gfs2_glock_get(sdp, 2 * (u64)id + !user,
+ error = gfs2_glock_get(sdp, qd2index(qd),
&gfs2_quota_glops, CREATE, &qd->qd_gl);
if (error)
goto fail;
@@ -161,7 +163,7 @@ fail:
return error;
}
-static int qd_get(struct gfs2_sbd *sdp, int user, u32 id,
+static int qd_get(struct gfs2_sbd *sdp, struct kqid qid,
struct gfs2_quota_data **qdp)
{
struct gfs2_quota_data *qd = NULL, *new_qd = NULL;
@@ -173,8 +175,7 @@ static int qd_get(struct gfs2_sbd *sdp, int user, u32 id,
found = 0;
spin_lock(&qd_lru_lock);
list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) {
- if (qd->qd_id == id &&
- !test_bit(QDF_USER, &qd->qd_flags) == !user) {
+ if (qid_eq(qd->qd_id, qid)) {
if (!atomic_read(&qd->qd_count) &&
!list_empty(&qd->qd_reclaim)) {
/* Remove it from reclaim list */
@@ -208,7 +209,7 @@ static int qd_get(struct gfs2_sbd *sdp, int user, u32 id,
return 0;
}
- error = qd_alloc(sdp, user, id, &new_qd);
+ error = qd_alloc(sdp, qid, &new_qd);
if (error)
return error;
}
@@ -458,12 +459,12 @@ static void qd_unlock(struct gfs2_quota_data *qd)
qd_put(qd);
}
-static int qdsb_get(struct gfs2_sbd *sdp, int user, u32 id,
+static int qdsb_get(struct gfs2_sbd *sdp, struct kqid qid,
struct gfs2_quota_data **qdp)
{
int error;
- error = qd_get(sdp, user, id, qdp);
+ error = qd_get(sdp, qid, qdp);
if (error)
return error;
@@ -491,7 +492,7 @@ static void qdsb_put(struct gfs2_quota_data *qd)
qd_put(qd);
}
-int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid)
+int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct gfs2_quota_data **qd;
@@ -512,28 +513,30 @@ int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid)
if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
return 0;
- error = qdsb_get(sdp, QUOTA_USER, ip->i_inode.i_uid, qd);
+ error = qdsb_get(sdp, make_kqid_uid(ip->i_inode.i_uid), qd);
if (error)
goto out;
ip->i_res->rs_qa_qd_num++;
qd++;
- error = qdsb_get(sdp, QUOTA_GROUP, ip->i_inode.i_gid, qd);
+ error = qdsb_get(sdp, make_kqid_gid(ip->i_inode.i_gid), qd);
if (error)
goto out;
ip->i_res->rs_qa_qd_num++;
qd++;
- if (uid != NO_QUOTA_CHANGE && uid != ip->i_inode.i_uid) {
- error = qdsb_get(sdp, QUOTA_USER, uid, qd);
+ if (!uid_eq(uid, NO_UID_QUOTA_CHANGE) &&
+ !uid_eq(uid, ip->i_inode.i_uid)) {
+ error = qdsb_get(sdp, make_kqid_uid(uid), qd);
if (error)
goto out;
ip->i_res->rs_qa_qd_num++;
qd++;
}
- if (gid != NO_QUOTA_CHANGE && gid != ip->i_inode.i_gid) {
- error = qdsb_get(sdp, QUOTA_GROUP, gid, qd);
+ if (!gid_eq(gid, NO_GID_QUOTA_CHANGE) &&
+ !gid_eq(gid, ip->i_inode.i_gid)) {
+ error = qdsb_get(sdp, make_kqid_gid(gid), qd);
if (error)
goto out;
ip->i_res->rs_qa_qd_num++;
@@ -567,18 +570,10 @@ static int sort_qd(const void *a, const void *b)
const struct gfs2_quota_data *qd_a = *(const struct gfs2_quota_data **)a;
const struct gfs2_quota_data *qd_b = *(const struct gfs2_quota_data **)b;
- if (!test_bit(QDF_USER, &qd_a->qd_flags) !=
- !test_bit(QDF_USER, &qd_b->qd_flags)) {
- if (test_bit(QDF_USER, &qd_a->qd_flags))
- return -1;
- else
- return 1;
- }
- if (qd_a->qd_id < qd_b->qd_id)
+ if (qid_lt(qd_a->qd_id, qd_b->qd_id))
return -1;
- if (qd_a->qd_id > qd_b->qd_id)
+ if (qid_lt(qd_b->qd_id, qd_a->qd_id))
return 1;
-
return 0;
}
@@ -590,14 +585,14 @@ static void do_qc(struct gfs2_quota_data *qd, s64 change)
s64 x;
mutex_lock(&sdp->sd_quota_mutex);
- gfs2_trans_add_bh(ip->i_gl, qd->qd_bh, 1);
+ gfs2_trans_add_meta(ip->i_gl, qd->qd_bh);
if (!test_bit(QDF_CHANGE, &qd->qd_flags)) {
qc->qc_change = 0;
qc->qc_flags = 0;
- if (test_bit(QDF_USER, &qd->qd_flags))
+ if (qd->qd_id.type == USRQUOTA)
qc->qc_flags = cpu_to_be32(GFS2_QCF_USER);
- qc->qc_id = cpu_to_be32(qd->qd_id);
+ qc->qc_id = cpu_to_be32(from_kqid(&init_user_ns, qd->qd_id));
}
x = be64_to_cpu(qc->qc_change) + change;
@@ -726,7 +721,7 @@ get_a_page:
goto unlock_out;
}
- gfs2_trans_add_bh(ip->i_gl, bh, 0);
+ gfs2_trans_add_meta(ip->i_gl, bh);
kaddr = kmap_atomic(page);
if (offset + sizeof(struct gfs2_quota) > PAGE_CACHE_SIZE)
@@ -925,7 +920,7 @@ fail:
return error;
}
-int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid)
+int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct gfs2_quota_data *qd;
@@ -1040,13 +1035,13 @@ static int print_message(struct gfs2_quota_data *qd, char *type)
printk(KERN_INFO "GFS2: fsid=%s: quota %s for %s %u\n",
sdp->sd_fsname, type,
- (test_bit(QDF_USER, &qd->qd_flags)) ? "user" : "group",
- qd->qd_id);
+ (qd->qd_id.type == USRQUOTA) ? "user" : "group",
+ from_kqid(&init_user_ns, qd->qd_id));
return 0;
}
-int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid)
+int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct gfs2_quota_data *qd;
@@ -1063,8 +1058,8 @@ int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid)
for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) {
qd = ip->i_res->rs_qa_qd[x];
- if (!((qd->qd_id == uid && test_bit(QDF_USER, &qd->qd_flags)) ||
- (qd->qd_id == gid && !test_bit(QDF_USER, &qd->qd_flags))))
+ if (!(qid_eq(qd->qd_id, make_kqid_uid(uid)) ||
+ qid_eq(qd->qd_id, make_kqid_gid(gid))))
continue;
value = (s64)be64_to_cpu(qd->qd_qb.qb_value);
@@ -1074,10 +1069,7 @@ int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid)
if (be64_to_cpu(qd->qd_qb.qb_limit) && (s64)be64_to_cpu(qd->qd_qb.qb_limit) < value) {
print_message(qd, "exceeded");
- quota_send_warning(make_kqid(&init_user_ns,
- test_bit(QDF_USER, &qd->qd_flags) ?
- USRQUOTA : GRPQUOTA,
- qd->qd_id),
+ quota_send_warning(qd->qd_id,
sdp->sd_vfs->s_dev, QUOTA_NL_BHARDWARN);
error = -EDQUOT;
@@ -1087,10 +1079,7 @@ int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid)
time_after_eq(jiffies, qd->qd_last_warn +
gfs2_tune_get(sdp,
gt_quota_warn_period) * HZ)) {
- quota_send_warning(make_kqid(&init_user_ns,
- test_bit(QDF_USER, &qd->qd_flags) ?
- USRQUOTA : GRPQUOTA,
- qd->qd_id),
+ quota_send_warning(qd->qd_id,
sdp->sd_vfs->s_dev, QUOTA_NL_BSOFTWARN);
error = print_message(qd, "warning");
qd->qd_last_warn = jiffies;
@@ -1101,7 +1090,7 @@ int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid)
}
void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
- u32 uid, u32 gid)
+ kuid_t uid, kgid_t gid)
{
struct gfs2_quota_data *qd;
unsigned int x;
@@ -1114,8 +1103,8 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) {
qd = ip->i_res->rs_qa_qd[x];
- if ((qd->qd_id == uid && test_bit(QDF_USER, &qd->qd_flags)) ||
- (qd->qd_id == gid && !test_bit(QDF_USER, &qd->qd_flags))) {
+ if (qid_eq(qd->qd_id, make_kqid_uid(uid)) ||
+ qid_eq(qd->qd_id, make_kqid_gid(gid))) {
do_qc(qd, change);
}
}
@@ -1170,13 +1159,13 @@ static int gfs2_quota_sync_timeo(struct super_block *sb, int type)
return gfs2_quota_sync(sb, type);
}
-int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id)
+int gfs2_quota_refresh(struct gfs2_sbd *sdp, struct kqid qid)
{
struct gfs2_quota_data *qd;
struct gfs2_holder q_gh;
int error;
- error = qd_get(sdp, user, id, &qd);
+ error = qd_get(sdp, qid, &qd);
if (error)
return error;
@@ -1194,7 +1183,9 @@ static void gfs2_quota_change_in(struct gfs2_quota_change_host *qc, const void *
qc->qc_change = be64_to_cpu(str->qc_change);
qc->qc_flags = be32_to_cpu(str->qc_flags);
- qc->qc_id = be32_to_cpu(str->qc_id);
+ qc->qc_id = make_kqid(&init_user_ns,
+ (qc->qc_flags & GFS2_QCF_USER)?USRQUOTA:GRPQUOTA,
+ be32_to_cpu(str->qc_id));
}
int gfs2_quota_init(struct gfs2_sbd *sdp)
@@ -1257,8 +1248,7 @@ int gfs2_quota_init(struct gfs2_sbd *sdp)
if (!qc.qc_change)
continue;
- error = qd_alloc(sdp, (qc.qc_flags & GFS2_QCF_USER),
- qc.qc_id, &qd);
+ error = qd_alloc(sdp, qc.qc_id, &qd);
if (error) {
brelse(bh);
goto fail;
@@ -1485,21 +1475,17 @@ static int gfs2_get_dqblk(struct super_block *sb, struct kqid qid,
struct gfs2_quota_data *qd;
struct gfs2_holder q_gh;
int error;
- int type;
memset(fdq, 0, sizeof(struct fs_disk_quota));
if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
return -ESRCH; /* Crazy XFS error code */
- if (qid.type == USRQUOTA)
- type = QUOTA_USER;
- else if (qid.type == GRPQUOTA)
- type = QUOTA_GROUP;
- else
+ if ((qid.type != USRQUOTA) &&
+ (qid.type != GRPQUOTA))
return -EINVAL;
- error = qd_get(sdp, type, from_kqid(&init_user_ns, qid), &qd);
+ error = qd_get(sdp, qid, &qd);
if (error)
return error;
error = do_glock(qd, FORCE, &q_gh);
@@ -1508,8 +1494,8 @@ static int gfs2_get_dqblk(struct super_block *sb, struct kqid qid,
qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lksb.sb_lvbptr;
fdq->d_version = FS_DQUOT_VERSION;
- fdq->d_flags = (type == QUOTA_USER) ? FS_USER_QUOTA : FS_GROUP_QUOTA;
- fdq->d_id = from_kqid(&init_user_ns, qid);
+ fdq->d_flags = (qid.type == USRQUOTA) ? FS_USER_QUOTA : FS_GROUP_QUOTA;
+ fdq->d_id = from_kqid_munged(current_user_ns(), qid);
fdq->d_blk_hardlimit = be64_to_cpu(qlvb->qb_limit) << sdp->sd_fsb2bb_shift;
fdq->d_blk_softlimit = be64_to_cpu(qlvb->qb_warn) << sdp->sd_fsb2bb_shift;
fdq->d_bcount = be64_to_cpu(qlvb->qb_value) << sdp->sd_fsb2bb_shift;
@@ -1535,32 +1521,18 @@ static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid,
int alloc_required;
loff_t offset;
int error;
- int type;
if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
return -ESRCH; /* Crazy XFS error code */
- switch(qid.type) {
- case USRQUOTA:
- type = QUOTA_USER;
- if (fdq->d_flags != FS_USER_QUOTA)
- return -EINVAL;
- break;
- case GRPQUOTA:
- type = QUOTA_GROUP;
- if (fdq->d_flags != FS_GROUP_QUOTA)
- return -EINVAL;
- break;
- default:
+ if ((qid.type != USRQUOTA) &&
+ (qid.type != GRPQUOTA))
return -EINVAL;
- }
if (fdq->d_fieldmask & ~GFS2_FIELDMASK)
return -EINVAL;
- if (fdq->d_id != from_kqid(&init_user_ns, qid))
- return -EINVAL;
- error = qd_get(sdp, type, from_kqid(&init_user_ns, qid), &qd);
+ error = qd_get(sdp, qid, &qd);
if (error)
return error;
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index f25d98b8790..4f5e6e44ed8 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -14,20 +14,21 @@ struct gfs2_inode;
struct gfs2_sbd;
struct shrink_control;
-#define NO_QUOTA_CHANGE ((u32)-1)
+#define NO_UID_QUOTA_CHANGE INVALID_UID
+#define NO_GID_QUOTA_CHANGE INVALID_GID
-extern int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid);
+extern int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid);
extern void gfs2_quota_unhold(struct gfs2_inode *ip);
-extern int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid);
+extern int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid);
extern void gfs2_quota_unlock(struct gfs2_inode *ip);
-extern int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid);
+extern int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid);
extern void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
- u32 uid, u32 gid);
+ kuid_t uid, kgid_t gid);
extern int gfs2_quota_sync(struct super_block *sb, int type);
-extern int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id);
+extern int gfs2_quota_refresh(struct gfs2_sbd *sdp, struct kqid qid);
extern int gfs2_quota_init(struct gfs2_sbd *sdp);
extern void gfs2_quota_cleanup(struct gfs2_sbd *sdp);
@@ -41,7 +42,7 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
int ret;
if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
return 0;
- ret = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+ ret = gfs2_quota_lock(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
if (ret)
return ret;
if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON)
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 04af1cf7ae3..d1f51fd73f8 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1323,7 +1323,7 @@ int gfs2_fitrim(struct file *filp, void __user *argp)
if (ret == 0) {
bh = rgd->rd_bits[0].bi_bh;
rgd->rd_flags |= GFS2_RGF_TRIMMED;
- gfs2_trans_add_bh(rgd->rd_gl, bh, 1);
+ gfs2_trans_add_meta(rgd->rd_gl, bh);
gfs2_rgrp_out(rgd, bh->b_data);
gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, bh->b_data);
gfs2_trans_end(sdp);
@@ -1968,14 +1968,14 @@ static void gfs2_alloc_extent(const struct gfs2_rbm *rbm, bool dinode,
*n = 1;
block = gfs2_rbm_to_block(rbm);
- gfs2_trans_add_bh(rbm->rgd->rd_gl, rbm->bi->bi_bh, 1);
+ gfs2_trans_add_meta(rbm->rgd->rd_gl, rbm->bi->bi_bh);
gfs2_setbit(rbm, true, dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
block++;
while (*n < elen) {
ret = gfs2_rbm_from_block(&pos, block);
if (ret || gfs2_testbit(&pos) != GFS2_BLKST_FREE)
break;
- gfs2_trans_add_bh(pos.rgd->rd_gl, pos.bi->bi_bh, 1);
+ gfs2_trans_add_meta(pos.rgd->rd_gl, pos.bi->bi_bh);
gfs2_setbit(&pos, true, GFS2_BLKST_USED);
(*n)++;
block++;
@@ -2014,7 +2014,7 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
rbm.bi->bi_bh->b_data + rbm.bi->bi_offset,
rbm.bi->bi_len);
}
- gfs2_trans_add_bh(rbm.rgd->rd_gl, rbm.bi->bi_bh, 1);
+ gfs2_trans_add_meta(rbm.rgd->rd_gl, rbm.bi->bi_bh);
gfs2_setbit(&rbm, false, new_state);
}
@@ -2157,7 +2157,7 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
if (error == 0) {
struct gfs2_dinode *di =
(struct gfs2_dinode *)dibh->b_data;
- gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_trans_add_meta(ip->i_gl, dibh);
di->di_goal_meta = di->di_goal_data =
cpu_to_be64(ip->i_goal);
brelse(dibh);
@@ -2176,7 +2176,7 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
*generation = rbm.rgd->rd_igeneration++;
}
- gfs2_trans_add_bh(rbm.rgd->rd_gl, rbm.rgd->rd_bits[0].bi_bh, 1);
+ gfs2_trans_add_meta(rbm.rgd->rd_gl, rbm.rgd->rd_bits[0].bi_bh);
gfs2_rgrp_out(rbm.rgd, rbm.rgd->rd_bits[0].bi_bh->b_data);
gfs2_rgrp_ondisk2lvb(rbm.rgd->rd_rgl, rbm.rgd->rd_bits[0].bi_bh->b_data);
@@ -2223,7 +2223,7 @@ void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta)
trace_gfs2_block_alloc(ip, rgd, bstart, blen, GFS2_BLKST_FREE);
rgd->rd_free += blen;
rgd->rd_flags &= ~GFS2_RGF_TRIMMED;
- gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
+ gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bits[0].bi_bh->b_data);
@@ -2260,7 +2260,7 @@ void gfs2_unlink_di(struct inode *inode)
if (!rgd)
return;
trace_gfs2_block_alloc(ip, rgd, blkno, 1, GFS2_BLKST_UNLINKED);
- gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
+ gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bits[0].bi_bh->b_data);
update_rgrp_lvb_unlinked(rgd, 1);
@@ -2281,7 +2281,7 @@ static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno)
rgd->rd_dinodes--;
rgd->rd_free++;
- gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
+ gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bits[0].bi_bh->b_data);
update_rgrp_lvb_unlinked(rgd, -1);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index d6488674d91..cab77b8ba84 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -500,7 +500,7 @@ void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
if (error)
return;
- gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1);
+ gfs2_trans_add_meta(l_ip->i_gl, l_bh);
spin_lock(&sdp->sd_statfs_spin);
l_sc->sc_total += total;
@@ -528,7 +528,7 @@ void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh,
struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
- gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1);
+ gfs2_trans_add_meta(l_ip->i_gl, l_bh);
spin_lock(&sdp->sd_statfs_spin);
m_sc->sc_total += l_sc->sc_total;
@@ -539,7 +539,7 @@ void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh,
0, sizeof(struct gfs2_statfs_change));
spin_unlock(&sdp->sd_statfs_spin);
- gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1);
+ gfs2_trans_add_meta(m_ip->i_gl, m_bh);
gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode));
}
@@ -663,54 +663,6 @@ out:
return error;
}
-/**
- * gfs2_freeze_fs - freezes the file system
- * @sdp: the file system
- *
- * This function flushes data and meta data for all machines by
- * acquiring the transaction log exclusively. All journals are
- * ensured to be in a clean state as well.
- *
- * Returns: errno
- */
-
-int gfs2_freeze_fs(struct gfs2_sbd *sdp)
-{
- int error = 0;
-
- mutex_lock(&sdp->sd_freeze_lock);
-
- if (!sdp->sd_freeze_count++) {
- error = gfs2_lock_fs_check_clean(sdp, &sdp->sd_freeze_gh);
- if (error)
- sdp->sd_freeze_count--;
- }
-
- mutex_unlock(&sdp->sd_freeze_lock);
-
- return error;
-}
-
-/**
- * gfs2_unfreeze_fs - unfreezes the file system
- * @sdp: the file system
- *
- * This function allows the file system to proceed by unlocking
- * the exclusively held transaction lock. Other GFS2 nodes are
- * now free to acquire the lock shared and go on with their lives.
- *
- */
-
-void gfs2_unfreeze_fs(struct gfs2_sbd *sdp)
-{
- mutex_lock(&sdp->sd_freeze_lock);
-
- if (sdp->sd_freeze_count && !--sdp->sd_freeze_count)
- gfs2_glock_dq_uninit(&sdp->sd_freeze_gh);
-
- mutex_unlock(&sdp->sd_freeze_lock);
-}
-
void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
{
struct gfs2_dinode *str = buf;
@@ -721,8 +673,8 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
str->di_num.no_addr = cpu_to_be64(ip->i_no_addr);
str->di_num.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino);
str->di_mode = cpu_to_be32(ip->i_inode.i_mode);
- str->di_uid = cpu_to_be32(ip->i_inode.i_uid);
- str->di_gid = cpu_to_be32(ip->i_inode.i_gid);
+ str->di_uid = cpu_to_be32(i_uid_read(&ip->i_inode));
+ str->di_gid = cpu_to_be32(i_gid_read(&ip->i_inode));
str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink);
str->di_size = cpu_to_be64(i_size_read(&ip->i_inode));
str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
@@ -824,7 +776,7 @@ static void gfs2_dirty_inode(struct inode *inode, int flags)
ret = gfs2_meta_inode_buffer(ip, &bh);
if (ret == 0) {
- gfs2_trans_add_bh(ip->i_gl, bh, 1);
+ gfs2_trans_add_meta(ip->i_gl, bh);
gfs2_dinode_out(ip, bh->b_data);
brelse(bh);
}
@@ -888,13 +840,6 @@ static void gfs2_put_super(struct super_block *sb)
int error;
struct gfs2_jdesc *jd;
- /* Unfreeze the filesystem, if we need to */
-
- mutex_lock(&sdp->sd_freeze_lock);
- if (sdp->sd_freeze_count)
- gfs2_glock_dq_uninit(&sdp->sd_freeze_gh);
- mutex_unlock(&sdp->sd_freeze_lock);
-
/* No more recovery requests */
set_bit(SDF_NORECOVERY, &sdp->sd_flags);
smp_mb();
@@ -985,7 +930,7 @@ static int gfs2_freeze(struct super_block *sb)
return -EINVAL;
for (;;) {
- error = gfs2_freeze_fs(sdp);
+ error = gfs2_lock_fs_check_clean(sdp, &sdp->sd_freeze_gh);
if (!error)
break;
@@ -1013,7 +958,9 @@ static int gfs2_freeze(struct super_block *sb)
static int gfs2_unfreeze(struct super_block *sb)
{
- gfs2_unfreeze_fs(sb->s_fs_info);
+ struct gfs2_sbd *sdp = sb->s_fs_info;
+
+ gfs2_glock_dq_uninit(&sdp->sd_freeze_gh);
return 0;
}
@@ -1429,7 +1376,7 @@ static int gfs2_dinode_dealloc(struct gfs2_inode *ip)
if (error)
return error;
- error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+ error = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
if (error)
return error;
@@ -1577,6 +1524,7 @@ out:
/* Case 3 starts here */
truncate_inode_pages(&inode->i_data, 0);
gfs2_rs_delete(ip);
+ gfs2_ordered_del_inode(ip);
clear_inode(inode);
gfs2_dir_hash_inval(ip);
ip->i_gl->gl_object = NULL;
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index a0464680af0..90e3322ffa1 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -46,9 +46,6 @@ extern void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh,
struct buffer_head *l_bh);
extern int gfs2_statfs_sync(struct super_block *sb, int type);
-extern int gfs2_freeze_fs(struct gfs2_sbd *sdp);
-extern void gfs2_unfreeze_fs(struct gfs2_sbd *sdp);
-
extern struct file_system_type gfs2_fs_type;
extern struct file_system_type gfs2meta_fs_type;
extern const struct export_operations gfs2_export_ops;
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 0acbe2ff1e5..aa5c4804496 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -91,19 +91,15 @@ static ssize_t uuid_show(struct gfs2_sbd *sdp, char *buf)
static ssize_t freeze_show(struct gfs2_sbd *sdp, char *buf)
{
- unsigned int count;
-
- mutex_lock(&sdp->sd_freeze_lock);
- count = sdp->sd_freeze_count;
- mutex_unlock(&sdp->sd_freeze_lock);
+ struct super_block *sb = sdp->sd_vfs;
+ int frozen = (sb->s_writers.frozen == SB_UNFROZEN) ? 0 : 1;
- return snprintf(buf, PAGE_SIZE, "%u\n", count);
+ return snprintf(buf, PAGE_SIZE, "%u\n", frozen);
}
static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
{
- ssize_t ret = len;
- int error = 0;
+ int error;
int n = simple_strtol(buf, NULL, 0);
if (!capable(CAP_SYS_ADMIN))
@@ -111,19 +107,21 @@ static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
switch (n) {
case 0:
- gfs2_unfreeze_fs(sdp);
+ error = thaw_super(sdp->sd_vfs);
break;
case 1:
- error = gfs2_freeze_fs(sdp);
+ error = freeze_super(sdp->sd_vfs);
break;
default:
- ret = -EINVAL;
+ return -EINVAL;
}
- if (error)
+ if (error) {
fs_warn(sdp, "freeze %d error %d", n, error);
+ return error;
+ }
- return ret;
+ return len;
}
static ssize_t withdraw_show(struct gfs2_sbd *sdp, char *buf)
@@ -175,6 +173,7 @@ static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf,
static ssize_t quota_refresh_user_store(struct gfs2_sbd *sdp, const char *buf,
size_t len)
{
+ struct kqid qid;
int error;
u32 id;
@@ -183,13 +182,18 @@ static ssize_t quota_refresh_user_store(struct gfs2_sbd *sdp, const char *buf,
id = simple_strtoul(buf, NULL, 0);
- error = gfs2_quota_refresh(sdp, 1, id);
+ qid = make_kqid(current_user_ns(), USRQUOTA, id);
+ if (!qid_valid(qid))
+ return -EINVAL;
+
+ error = gfs2_quota_refresh(sdp, qid);
return error ? error : len;
}
static ssize_t quota_refresh_group_store(struct gfs2_sbd *sdp, const char *buf,
size_t len)
{
+ struct kqid qid;
int error;
u32 id;
@@ -198,7 +202,11 @@ static ssize_t quota_refresh_group_store(struct gfs2_sbd *sdp, const char *buf,
id = simple_strtoul(buf, NULL, 0);
- error = gfs2_quota_refresh(sdp, 0, id);
+ qid = make_kqid(current_user_ns(), GRPQUOTA, id);
+ if (!qid_valid(qid))
+ return -EINVAL;
+
+ error = gfs2_quota_refresh(sdp, qid);
return error ? error : len;
}
@@ -332,6 +340,28 @@ static ssize_t block_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
return ret;
}
+static ssize_t wdack_show(struct gfs2_sbd *sdp, char *buf)
+{
+ int val = completion_done(&sdp->sd_wdack) ? 1 : 0;
+
+ return sprintf(buf, "%d\n", val);
+}
+
+static ssize_t wdack_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
+{
+ ssize_t ret = len;
+ int val;
+
+ val = simple_strtol(buf, NULL, 0);
+
+ if ((val == 1) &&
+ !strcmp(sdp->sd_lockstruct.ls_ops->lm_proto_name, "lock_dlm"))
+ complete(&sdp->sd_wdack);
+ else
+ ret = -EINVAL;
+ return ret;
+}
+
static ssize_t lkfirst_show(struct gfs2_sbd *sdp, char *buf)
{
struct lm_lockstruct *ls = &sdp->sd_lockstruct;
@@ -463,7 +493,7 @@ static struct gfs2_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store)
GDLM_ATTR(proto_name, 0444, proto_name_show, NULL);
GDLM_ATTR(block, 0644, block_show, block_store);
-GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store);
+GDLM_ATTR(withdraw, 0644, wdack_show, wdack_store);
GDLM_ATTR(jid, 0644, jid_show, jid_store);
GDLM_ATTR(first, 0644, lkfirst_show, lkfirst_store);
GDLM_ATTR(first_done, 0444, first_done_show, NULL);
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 413627072f3..88162fae27a 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -18,6 +18,7 @@
#include "gfs2.h"
#include "incore.h"
#include "glock.h"
+#include "inode.h"
#include "log.h"
#include "lops.h"
#include "meta_io.h"
@@ -142,44 +143,143 @@ void gfs2_trans_end(struct gfs2_sbd *sdp)
sb_end_intwrite(sdp->sd_vfs);
}
+static struct gfs2_bufdata *gfs2_alloc_bufdata(struct gfs2_glock *gl,
+ struct buffer_head *bh,
+ const struct gfs2_log_operations *lops)
+{
+ struct gfs2_bufdata *bd;
+
+ bd = kmem_cache_zalloc(gfs2_bufdata_cachep, GFP_NOFS | __GFP_NOFAIL);
+ bd->bd_bh = bh;
+ bd->bd_gl = gl;
+ bd->bd_ops = lops;
+ INIT_LIST_HEAD(&bd->bd_list);
+ bh->b_private = bd;
+ return bd;
+}
+
/**
- * gfs2_trans_add_bh - Add a to-be-modified buffer to the current transaction
- * @gl: the glock the buffer belongs to
+ * gfs2_trans_add_data - Add a databuf to the transaction.
+ * @gl: The inode glock associated with the buffer
* @bh: The buffer to add
- * @meta: True in the case of adding metadata
*
+ * This is used in two distinct cases:
+ * i) In ordered write mode
+ * We put the data buffer on a list so that we can ensure that its
+ * synced to disk at the right time
+ * ii) In journaled data mode
+ * We need to journal the data block in the same way as metadata in
+ * the functions above. The difference is that here we have a tag
+ * which is two __be64's being the block number (as per meta data)
+ * and a flag which says whether the data block needs escaping or
+ * not. This means we need a new log entry for each 251 or so data
+ * blocks, which isn't an enormous overhead but twice as much as
+ * for normal metadata blocks.
*/
+void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh)
+{
+ struct gfs2_trans *tr = current->journal_info;
+ struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct address_space *mapping = bh->b_page->mapping;
+ struct gfs2_inode *ip = GFS2_I(mapping->host);
+ struct gfs2_bufdata *bd;
-void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta)
+ if (!gfs2_is_jdata(ip)) {
+ gfs2_ordered_add_inode(ip);
+ return;
+ }
+
+ lock_buffer(bh);
+ gfs2_log_lock(sdp);
+ bd = bh->b_private;
+ if (bd == NULL) {
+ gfs2_log_unlock(sdp);
+ unlock_buffer(bh);
+ if (bh->b_private == NULL)
+ bd = gfs2_alloc_bufdata(gl, bh, &gfs2_databuf_lops);
+ lock_buffer(bh);
+ gfs2_log_lock(sdp);
+ }
+ gfs2_assert(sdp, bd->bd_gl == gl);
+ tr->tr_touched = 1;
+ if (list_empty(&bd->bd_list)) {
+ set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags);
+ set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags);
+ gfs2_pin(sdp, bd->bd_bh);
+ tr->tr_num_databuf_new++;
+ sdp->sd_log_num_databuf++;
+ list_add_tail(&bd->bd_list, &sdp->sd_log_le_databuf);
+ }
+ gfs2_log_unlock(sdp);
+ unlock_buffer(bh);
+}
+
+static void meta_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
{
+ struct gfs2_meta_header *mh;
+ struct gfs2_trans *tr;
+
+ tr = current->journal_info;
+ tr->tr_touched = 1;
+ if (!list_empty(&bd->bd_list))
+ return;
+ set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags);
+ set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags);
+ mh = (struct gfs2_meta_header *)bd->bd_bh->b_data;
+ if (unlikely(mh->mh_magic != cpu_to_be32(GFS2_MAGIC))) {
+ printk(KERN_ERR
+ "Attempting to add uninitialised block to journal (inplace block=%lld)\n",
+ (unsigned long long)bd->bd_bh->b_blocknr);
+ BUG();
+ }
+ gfs2_pin(sdp, bd->bd_bh);
+ mh->__pad0 = cpu_to_be64(0);
+ mh->mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid);
+ sdp->sd_log_num_buf++;
+ list_add(&bd->bd_list, &sdp->sd_log_le_buf);
+ tr->tr_num_buf_new++;
+}
+
+void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh)
+{
+
struct gfs2_sbd *sdp = gl->gl_sbd;
struct gfs2_bufdata *bd;
lock_buffer(bh);
gfs2_log_lock(sdp);
bd = bh->b_private;
- if (bd)
- gfs2_assert(sdp, bd->bd_gl == gl);
- else {
+ if (bd == NULL) {
gfs2_log_unlock(sdp);
unlock_buffer(bh);
- gfs2_attach_bufdata(gl, bh, meta);
- bd = bh->b_private;
+ lock_page(bh->b_page);
+ if (bh->b_private == NULL)
+ bd = gfs2_alloc_bufdata(gl, bh, &gfs2_buf_lops);
+ unlock_page(bh->b_page);
lock_buffer(bh);
gfs2_log_lock(sdp);
}
- lops_add(sdp, bd);
+ gfs2_assert(sdp, bd->bd_gl == gl);
+ meta_lo_add(sdp, bd);
gfs2_log_unlock(sdp);
unlock_buffer(bh);
}
void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
{
+ struct gfs2_glock *gl = bd->bd_gl;
+ struct gfs2_trans *tr = current->journal_info;
+
BUG_ON(!list_empty(&bd->bd_list));
BUG_ON(!list_empty(&bd->bd_ail_st_list));
BUG_ON(!list_empty(&bd->bd_ail_gl_list));
- lops_init_le(bd, &gfs2_revoke_lops);
- lops_add(sdp, bd);
+ bd->bd_ops = &gfs2_revoke_lops;
+ tr->tr_touched = 1;
+ tr->tr_num_revoke++;
+ sdp->sd_log_num_revoke++;
+ atomic_inc(&gl->gl_revokes);
+ set_bit(GLF_LFLUSH, &gl->gl_flags);
+ list_add(&bd->bd_list, &sdp->sd_log_le_revoke);
}
void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len)
diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h
index bf2ae9aeee7..1e6e7da25a1 100644
--- a/fs/gfs2/trans.h
+++ b/fs/gfs2/trans.h
@@ -39,7 +39,8 @@ extern int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
unsigned int revokes);
extern void gfs2_trans_end(struct gfs2_sbd *sdp);
-extern void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta);
+extern void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh);
+extern void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh);
extern void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd);
extern void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len);
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index f00d7c5744f..6402fb69d71 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -54,6 +54,9 @@ int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...)
kobject_uevent(&sdp->sd_kobj, KOBJ_OFFLINE);
+ if (!strcmp(sdp->sd_lockstruct.ls_ops->lm_proto_name, "lock_dlm"))
+ wait_for_completion(&sdp->sd_wdack);
+
if (lm->lm_unmount) {
fs_err(sdp, "telling LM to unmount\n");
lm->lm_unmount(sdp);
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 76c144b3c9b..ecd37f30ab9 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -270,7 +270,7 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
if (error)
goto out_gunlock;
- gfs2_trans_add_bh(ip->i_gl, bh, 1);
+ gfs2_trans_add_meta(ip->i_gl, bh);
dataptrs = GFS2_EA2DATAPTRS(ea);
for (x = 0; x < ea->ea_num_ptrs; x++, dataptrs++) {
@@ -309,7 +309,7 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
error = gfs2_meta_inode_buffer(ip, &dibh);
if (!error) {
ip->i_inode.i_ctime = CURRENT_TIME;
- gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_trans_add_meta(ip->i_gl, dibh);
gfs2_dinode_out(ip, dibh->b_data);
brelse(dibh);
}
@@ -331,7 +331,7 @@ static int ea_remove_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
if (error)
return error;
- error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+ error = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
if (error)
goto out_alloc;
@@ -509,7 +509,7 @@ static int gfs2_iter_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
}
if (din) {
- gfs2_trans_add_bh(ip->i_gl, bh[x], 1);
+ gfs2_trans_add_meta(ip->i_gl, bh[x]);
memcpy(pos, din, cp_size);
din += sdp->sd_jbsize;
}
@@ -629,7 +629,7 @@ static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp)
return error;
gfs2_trans_add_unrevoke(sdp, block, 1);
*bhp = gfs2_meta_new(ip->i_gl, block);
- gfs2_trans_add_bh(ip->i_gl, *bhp, 1);
+ gfs2_trans_add_meta(ip->i_gl, *bhp);
gfs2_metatype_set(*bhp, GFS2_METATYPE_EA, GFS2_FORMAT_EA);
gfs2_buffer_clear_tail(*bhp, sizeof(struct gfs2_meta_header));
@@ -691,7 +691,7 @@ static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
return error;
gfs2_trans_add_unrevoke(sdp, block, 1);
bh = gfs2_meta_new(ip->i_gl, block);
- gfs2_trans_add_bh(ip->i_gl, bh, 1);
+ gfs2_trans_add_meta(ip->i_gl, bh);
gfs2_metatype_set(bh, GFS2_METATYPE_ED, GFS2_FORMAT_ED);
gfs2_add_inode_blocks(&ip->i_inode, 1);
@@ -751,7 +751,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
error = gfs2_meta_inode_buffer(ip, &dibh);
if (!error) {
ip->i_inode.i_ctime = CURRENT_TIME;
- gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_trans_add_meta(ip->i_gl, dibh);
gfs2_dinode_out(ip, dibh->b_data);
brelse(dibh);
}
@@ -834,7 +834,7 @@ static void ea_set_remove_stuffed(struct gfs2_inode *ip,
struct gfs2_ea_header *prev = el->el_prev;
u32 len;
- gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1);
+ gfs2_trans_add_meta(ip->i_gl, el->el_bh);
if (!prev || !GFS2_EA_IS_STUFFED(ea)) {
ea->ea_type = GFS2_EATYPE_UNUSED;
@@ -872,7 +872,7 @@ static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh,
if (error)
return error;
- gfs2_trans_add_bh(ip->i_gl, bh, 1);
+ gfs2_trans_add_meta(ip->i_gl, bh);
if (es->ea_split)
ea = ea_split_ea(ea);
@@ -886,7 +886,7 @@ static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh,
if (error)
goto out;
ip->i_inode.i_ctime = CURRENT_TIME;
- gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_trans_add_meta(ip->i_gl, dibh);
gfs2_dinode_out(ip, dibh->b_data);
brelse(dibh);
out:
@@ -901,7 +901,7 @@ static int ea_set_simple_alloc(struct gfs2_inode *ip,
struct gfs2_ea_header *ea = es->es_ea;
int error;
- gfs2_trans_add_bh(ip->i_gl, es->es_bh, 1);
+ gfs2_trans_add_meta(ip->i_gl, es->es_bh);
if (es->ea_split)
ea = ea_split_ea(ea);
@@ -997,7 +997,7 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
goto out;
}
- gfs2_trans_add_bh(ip->i_gl, indbh, 1);
+ gfs2_trans_add_meta(ip->i_gl, indbh);
} else {
u64 blk;