Merge commit 'v2.6.37-rc1' into kbuild/kbuild

author: Michal Marek <mmarek@suse.cz> 2010-12-14 22:01:55 +0100
committer: Michal Marek <mmarek@suse.cz> 2010-12-14 22:01:55 +0100
commit: 8990c1bc4be46473ad19bf2fa612ca57286f3df4 (patch)
tree: 3cea60576903a1d26c67e6ec62891b524d390e95 /fs
parent: 2979076fbf17a0947d6eba367b0cac19c907c160 (diff)
parent: c8ddb2713c624f432fa5fe3c7ecffcdda46ea0d4 (diff)
897 files changed, 40895 insertions, 47855 deletions
diff --git a/fs/9p/Kconfig b/fs/9p/Kconfig
index 795233702a4..7e051147679 100644
--- a/fs/9p/Kconfig
+++ b/fs/9p/Kconfig
@@ -17,3 +17,16 @@ config 9P_FSCACHE
 	  Choose Y here to enable persistent, read-only local
 	  caching support for 9p clients using FS-Cache
 
+
+config 9P_FS_POSIX_ACL
+	bool "9P POSIX Access Control Lists"
+	depends on 9P_FS
+	select FS_POSIX_ACL
+	help
+	  POSIX Access Control Lists (ACLs) support permissions for users and
+	  groups beyond the owner/group/world scheme.
+
+	  To learn more about Access Control Lists, visit the POSIX ACLs for
+	  Linux website <http://acl.bestbits.at/>.
+
+	  If you don't know what Access Control Lists are, say N
diff --git a/fs/9p/Makefile b/fs/9p/Makefile
index 1a940ec7af6..f8ba37effd1 100644
--- a/fs/9p/Makefile
+++ b/fs/9p/Makefile
@@ -8,6 +8,9 @@ obj-$(CONFIG_9P_FS) := 9p.o
 	vfs_dir.o \
 	vfs_dentry.o \
 	v9fs.o \
-	fid.o
+	fid.o  \
+	xattr.o \
+	xattr_user.o
 
 9p-$(CONFIG_9P_FSCACHE) += cache.o
+9p-$(CONFIG_9P_FS_POSIX_ACL) += acl.o
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
new file mode 100644
index 00000000000..12d602351db
--- /dev/null
+++ b/fs/9p/acl.c
@@ -0,0 +1,392 @@
+/*
+ * Copyright IBM Corporation, 2010
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <net/9p/9p.h>
+#include <net/9p/client.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/posix_acl_xattr.h>
+#include "xattr.h"
+#include "acl.h"
+#include "v9fs_vfs.h"
+#include "v9fs.h"
+
+static struct posix_acl *__v9fs_get_acl(struct p9_fid *fid, char *name)
+{
+	ssize_t size;
+	void *value = NULL;
+	struct posix_acl *acl = NULL;;
+
+	size = v9fs_fid_xattr_get(fid, name, NULL, 0);
+	if (size > 0) {
+		value = kzalloc(size, GFP_NOFS);
+		if (!value)
+			return ERR_PTR(-ENOMEM);
+		size = v9fs_fid_xattr_get(fid, name, value, size);
+		if (size > 0) {
+			acl = posix_acl_from_xattr(value, size);
+			if (IS_ERR(acl))
+				goto err_out;
+		}
+	} else if (size == -ENODATA || size == 0 ||
+		   size == -ENOSYS || size == -EOPNOTSUPP) {
+		acl = NULL;
+	} else
+		acl = ERR_PTR(-EIO);
+
+err_out:
+	kfree(value);
+	return acl;
+}
+
+int v9fs_get_acl(struct inode *inode, struct p9_fid *fid)
+{
+	int retval = 0;
+	struct posix_acl *pacl, *dacl;
+	struct v9fs_session_info *v9ses;
+
+	v9ses = v9fs_inode2v9ses(inode);
+	if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) {
+		set_cached_acl(inode, ACL_TYPE_DEFAULT, NULL);
+		set_cached_acl(inode, ACL_TYPE_ACCESS, NULL);
+		return 0;
+	}
+	/* get the default/access acl values and cache them */
+	dacl = __v9fs_get_acl(fid, POSIX_ACL_XATTR_DEFAULT);
+	pacl = __v9fs_get_acl(fid, POSIX_ACL_XATTR_ACCESS);
+
+	if (!IS_ERR(dacl) && !IS_ERR(pacl)) {
+		set_cached_acl(inode, ACL_TYPE_DEFAULT, dacl);
+		set_cached_acl(inode, ACL_TYPE_ACCESS, pacl);
+		posix_acl_release(dacl);
+		posix_acl_release(pacl);
+	} else
+		retval = -EIO;
+
+	return retval;
+}
+
+static struct posix_acl *v9fs_get_cached_acl(struct inode *inode, int type)
+{
+	struct posix_acl *acl;
+	/*
+	 * 9p Always cache the acl value when
+	 * instantiating the inode (v9fs_inode_from_fid)
+	 */
+	acl = get_cached_acl(inode, type);
+	BUG_ON(acl == ACL_NOT_CACHED);
+	return acl;
+}
+
+int v9fs_check_acl(struct inode *inode, int mask)
+{
+	struct posix_acl *acl;
+	struct v9fs_session_info *v9ses;
+
+	v9ses = v9fs_inode2v9ses(inode);
+	if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) {
+		/*
+		 * On access = client mode get the acl
+		 * values from the server
+		 */
+		return 0;
+	}
+	acl = v9fs_get_cached_acl(inode, ACL_TYPE_ACCESS);
+
+	if (IS_ERR(acl))
+		return PTR_ERR(acl);
+	if (acl) {
+		int error = posix_acl_permission(inode, acl, mask);
+		posix_acl_release(acl);
+		return error;
+	}
+	return -EAGAIN;
+}
+
+static int v9fs_set_acl(struct dentry *dentry, int type, struct posix_acl *acl)
+{
+	int retval;
+	char *name;
+	size_t size;
+	void *buffer;
+	struct inode *inode = dentry->d_inode;
+
+	set_cached_acl(inode, type, acl);
+	/* Set a setxattr request to server */
+	size = posix_acl_xattr_size(acl->a_count);
+	buffer = kmalloc(size, GFP_KERNEL);
+	if (!buffer)
+		return -ENOMEM;
+	retval = posix_acl_to_xattr(acl, buffer, size);
+	if (retval < 0)
+		goto err_free_out;
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		name = POSIX_ACL_XATTR_ACCESS;
+		break;
+	case ACL_TYPE_DEFAULT:
+		name = POSIX_ACL_XATTR_DEFAULT;
+		break;
+	default:
+		BUG();
+	}
+	retval = v9fs_xattr_set(dentry, name, buffer, size, 0);
+err_free_out:
+	kfree(buffer);
+	return retval;
+}
+
+int v9fs_acl_chmod(struct dentry *dentry)
+{
+	int retval = 0;
+	struct posix_acl *acl, *clone;
+	struct inode *inode = dentry->d_inode;
+
+	if (S_ISLNK(inode->i_mode))
+		return -EOPNOTSUPP;
+	acl = v9fs_get_cached_acl(inode, ACL_TYPE_ACCESS);
+	if (acl) {
+		clone = posix_acl_clone(acl, GFP_KERNEL);
+		posix_acl_release(acl);
+		if (!clone)
+			return -ENOMEM;
+		retval = posix_acl_chmod_masq(clone, inode->i_mode);
+		if (!retval)
+			retval = v9fs_set_acl(dentry, ACL_TYPE_ACCESS, clone);
+		posix_acl_release(clone);
+	}
+	return retval;
+}
+
+int v9fs_set_create_acl(struct dentry *dentry,
+			struct posix_acl *dpacl, struct posix_acl *pacl)
+{
+	if (dpacl)
+		v9fs_set_acl(dentry, ACL_TYPE_DEFAULT, dpacl);
+	if (pacl)
+		v9fs_set_acl(dentry, ACL_TYPE_ACCESS, pacl);
+	posix_acl_release(dpacl);
+	posix_acl_release(pacl);
+	return 0;
+}
+
+int v9fs_acl_mode(struct inode *dir, mode_t *modep,
+		  struct posix_acl **dpacl, struct posix_acl **pacl)
+{
+	int retval = 0;
+	mode_t mode = *modep;
+	struct posix_acl *acl = NULL;
+
+	if (!S_ISLNK(mode)) {
+		acl = v9fs_get_cached_acl(dir, ACL_TYPE_DEFAULT);
+		if (IS_ERR(acl))
+			return PTR_ERR(acl);
+		if (!acl)
+			mode &= ~current_umask();
+	}
+	if (acl) {
+		struct posix_acl *clone;
+
+		if (S_ISDIR(mode))
+			*dpacl = acl;
+		clone = posix_acl_clone(acl, GFP_NOFS);
+		retval = -ENOMEM;
+		if (!clone)
+			goto cleanup;
+
+		retval = posix_acl_create_masq(clone, &mode);
+		if (retval < 0) {
+			posix_acl_release(clone);
+			goto cleanup;
+		}
+		if (retval > 0)
+			*pacl = clone;
+	}
+	*modep  = mode;
+	return 0;
+cleanup:
+	posix_acl_release(acl);
+	return retval;
+
+}
+
+static int v9fs_remote_get_acl(struct dentry *dentry, const char *name,
+			       void *buffer, size_t size, int type)
+{
+	char *full_name;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		full_name =  POSIX_ACL_XATTR_ACCESS;
+		break;
+	case ACL_TYPE_DEFAULT:
+		full_name = POSIX_ACL_XATTR_DEFAULT;
+		break;
+	default:
+		BUG();
+	}
+	return v9fs_xattr_get(dentry, full_name, buffer, size);
+}
+
+static int v9fs_xattr_get_acl(struct dentry *dentry, const char *name,
+			      void *buffer, size_t size, int type)
+{
+	struct v9fs_session_info *v9ses;
+	struct posix_acl *acl;
+	int error;
+
+	if (strcmp(name, "") != 0)
+		return -EINVAL;
+
+	v9ses = v9fs_inode2v9ses(dentry->d_inode);
+	/*
+	 * We allow set/get/list of acl when access=client is not specified
+	 */
+	if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT)
+		return v9fs_remote_get_acl(dentry, name, buffer, size, type);
+
+	acl = v9fs_get_cached_acl(dentry->d_inode, type);
+	if (IS_ERR(acl))
+		return PTR_ERR(acl);
+	if (acl == NULL)
+		return -ENODATA;
+	error = posix_acl_to_xattr(acl, buffer, size);
+	posix_acl_release(acl);
+
+	return error;
+}
+
+static int v9fs_remote_set_acl(struct dentry *dentry, const char *name,
+			      const void *value, size_t size,
+			      int flags, int type)
+{
+	char *full_name;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		full_name =  POSIX_ACL_XATTR_ACCESS;
+		break;
+	case ACL_TYPE_DEFAULT:
+		full_name = POSIX_ACL_XATTR_DEFAULT;
+		break;
+	default:
+		BUG();
+	}
+	return v9fs_xattr_set(dentry, full_name, value, size, flags);
+}
+
+
+static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name,
+			      const void *value, size_t size,
+			      int flags, int type)
+{
+	int retval;
+	struct posix_acl *acl;
+	struct v9fs_session_info *v9ses;
+	struct inode *inode = dentry->d_inode;
+
+	if (strcmp(name, "") != 0)
+		return -EINVAL;
+
+	v9ses = v9fs_inode2v9ses(dentry->d_inode);
+	/*
+	 * set the attribute on the remote. Without even looking at the
+	 * xattr value. We leave it to the server to validate
+	 */
+	if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT)
+		return v9fs_remote_set_acl(dentry, name,
+					   value, size, flags, type);
+
+	if (S_ISLNK(inode->i_mode))
+		return -EOPNOTSUPP;
+	if (!is_owner_or_cap(inode))
+		return -EPERM;
+	if (value) {
+		/* update the cached acl value */
+		acl = posix_acl_from_xattr(value, size);
+		if (IS_ERR(acl))
+			return PTR_ERR(acl);
+		else if (acl) {
+			retval = posix_acl_valid(acl);
+			if (retval)
+				goto err_out;
+		}
+	} else
+		acl = NULL;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		name = POSIX_ACL_XATTR_ACCESS;
+		if (acl) {
+			mode_t mode = inode->i_mode;
+			retval = posix_acl_equiv_mode(acl, &mode);
+			if (retval < 0)
+				goto err_out;
+			else {
+				struct iattr iattr;
+				if (retval == 0) {
+					/*
+					 * ACL can be represented
+					 * by the mode bits. So don't
+					 * update ACL.
+					 */
+					acl = NULL;
+					value = NULL;
+					size = 0;
+				}
+				/* Updte the mode bits */
+				iattr.ia_mode = ((mode & S_IALLUGO) |
+						 (inode->i_mode & ~S_IALLUGO));
+				iattr.ia_valid = ATTR_MODE;
+				/* FIXME should we update ctime ?
+				 * What is the following setxattr update the
+				 * mode ?
+				 */
+				v9fs_vfs_setattr_dotl(dentry, &iattr);
+			}
+		}
+		break;
+	case ACL_TYPE_DEFAULT:
+		name = POSIX_ACL_XATTR_DEFAULT;
+		if (!S_ISDIR(inode->i_mode)) {
+			retval = -EINVAL;
+			goto err_out;
+		}
+		break;
+	default:
+		BUG();
+	}
+	retval = v9fs_xattr_set(dentry, name, value, size, flags);
+	if (!retval)
+		set_cached_acl(inode, type, acl);
+err_out:
+	posix_acl_release(acl);
+	return retval;
+}
+
+const struct xattr_handler v9fs_xattr_acl_access_handler = {
+	.prefix	= POSIX_ACL_XATTR_ACCESS,
+	.flags	= ACL_TYPE_ACCESS,
+	.get	= v9fs_xattr_get_acl,
+	.set	= v9fs_xattr_set_acl,
+};
+
+const struct xattr_handler v9fs_xattr_acl_default_handler = {
+	.prefix	= POSIX_ACL_XATTR_DEFAULT,
+	.flags	= ACL_TYPE_DEFAULT,
+	.get	= v9fs_xattr_get_acl,
+	.set	= v9fs_xattr_set_acl,
+};
diff --git a/fs/9p/acl.h b/fs/9p/acl.h
new file mode 100644
index 00000000000..59e18c2e8c7
--- /dev/null
+++ b/fs/9p/acl.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright IBM Corporation, 2010
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+#ifndef FS_9P_ACL_H
+#define FS_9P_ACL_H
+
+#ifdef CONFIG_9P_FS_POSIX_ACL
+extern int v9fs_get_acl(struct inode *, struct p9_fid *);
+extern int v9fs_check_acl(struct inode *inode, int mask);
+extern int v9fs_acl_chmod(struct dentry *);
+extern int v9fs_set_create_acl(struct dentry *,
+			       struct posix_acl *, struct posix_acl *);
+extern int v9fs_acl_mode(struct inode *dir, mode_t *modep,
+			 struct posix_acl **dpacl, struct posix_acl **pacl);
+#else
+#define v9fs_check_acl NULL
+static inline int v9fs_get_acl(struct inode *inode, struct p9_fid *fid)
+{
+	return 0;
+}
+static inline int v9fs_acl_chmod(struct dentry *dentry)
+{
+	return 0;
+}
+static inline int v9fs_set_create_acl(struct dentry *dentry,
+				      struct posix_acl *dpacl,
+				      struct posix_acl *pacl)
+{
+	return 0;
+}
+static inline int v9fs_acl_mode(struct inode *dir, mode_t *modep,
+				struct posix_acl **dpacl,
+				struct posix_acl **pacl)
+{
+	return 0;
+}
+
+#endif
+#endif /* FS_9P_XATTR_H */
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index 7317b39b281..b00223c99d7 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -97,6 +97,34 @@ static struct p9_fid *v9fs_fid_find(struct dentry *dentry, u32 uid, int any)
 	return ret;
 }
 
+/*
+ * We need to hold v9ses->rename_sem as long as we hold references
+ * to returned path array. Array element contain pointers to
+ * dentry names.
+ */
+static int build_path_from_dentry(struct v9fs_session_info *v9ses,
+				  struct dentry *dentry, char ***names)
+{
+	int n = 0, i;
+	char **wnames;
+	struct dentry *ds;
+
+	for (ds = dentry; !IS_ROOT(ds); ds = ds->d_parent)
+		n++;
+
+	wnames = kmalloc(sizeof(char *) * n, GFP_KERNEL);
+	if (!wnames)
+		goto err_out;
+
+	for (ds = dentry, i = (n-1); i >= 0; i--, ds = ds->d_parent)
+		wnames[i] = (char  *)ds->d_name.name;
+
+	*names = wnames;
+	return n;
+err_out:
+	return -ENOMEM;
+}
+
 /**
  * v9fs_fid_lookup - lookup for a fid, try to walk if not found
  * @dentry: dentry to look for fid in
@@ -112,7 +140,7 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
 	int i, n, l, clone, any, access;
 	u32 uid;
 	struct p9_fid *fid, *old_fid = NULL;
-	struct dentry *d, *ds;
+	struct dentry *ds;
 	struct v9fs_session_info *v9ses;
 	char **wnames, *uname;
 
@@ -121,6 +149,7 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
 	switch (access) {
 	case V9FS_ACCESS_SINGLE:
 	case V9FS_ACCESS_USER:
+	case V9FS_ACCESS_CLIENT:
 		uid = current_fsuid();
 		any = 0;
 		break;
@@ -139,49 +168,62 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
 	fid = v9fs_fid_find(dentry, uid, any);
 	if (fid)
 		return fid;
-
+	/*
+	 * we don't have a matching fid. To do a TWALK we need
+	 * parent fid. We need to prevent rename when we want to
+	 * look at the parent.
+	 */
+	down_read(&v9ses->rename_sem);
 	ds = dentry->d_parent;
 	fid = v9fs_fid_find(ds, uid, any);
-	if (!fid) { /* walk from the root */
-		n = 0;
-		for (ds = dentry; !IS_ROOT(ds); ds = ds->d_parent)
-			n++;
+	if (fid) {
+		/* Found the parent fid do a lookup with that */
+		fid = p9_client_walk(fid, 1, (char **)&dentry->d_name.name, 1);
+		goto fid_out;
+	}
+	up_read(&v9ses->rename_sem);
 
-		fid = v9fs_fid_find(ds, uid, any);
-		if (!fid) { /* the user is not attached to the fs yet */
-			if (access == V9FS_ACCESS_SINGLE)
-				return ERR_PTR(-EPERM);
+	/* start from the root and try to do a lookup */
+	fid = v9fs_fid_find(dentry->d_sb->s_root, uid, any);
+	if (!fid) {
+		/* the user is not attached to the fs yet */
+		if (access == V9FS_ACCESS_SINGLE)
+			return ERR_PTR(-EPERM);
 
-			if (v9fs_proto_dotu(v9ses))
+		if (v9fs_proto_dotu(v9ses) || v9fs_proto_dotl(v9ses))
 				uname = NULL;
-			else
-				uname = v9ses->uname;
+		else
+			uname = v9ses->uname;
 
-			fid = p9_client_attach(v9ses->clnt, NULL, uname, uid,
-				v9ses->aname);
-
-			if (IS_ERR(fid))
-				return fid;
-
-			v9fs_fid_add(ds, fid);
-		}
-	} else /* walk from the parent */
-		n = 1;
+		fid = p9_client_attach(v9ses->clnt, NULL, uname, uid,
+				       v9ses->aname);
+		if (IS_ERR(fid))
+			return fid;
 
-	if (ds == dentry)
+		v9fs_fid_add(dentry->d_sb->s_root, fid);
+	}
+	/* If we are root ourself just return that */
+	if (dentry->d_sb->s_root == dentry)
 		return fid;
-
-	wnames = kmalloc(sizeof(char *) * n, GFP_KERNEL);
-	if (!wnames)
-		return ERR_PTR(-ENOMEM);
-
-	for (d = dentry, i = (n-1); i >= 0; i--, d = d->d_parent)
-		wnames[i] = (char *) d->d_name.name;
-
+	/*
+	 * Do a multipath walk with attached root.
+	 * When walking parent we need to make sure we
+	 * don't have a parallel rename happening
+	 */
+	down_read(&v9ses->rename_sem);
+	n  = build_path_from_dentry(v9ses, dentry, &wnames);
+	if (n < 0) {
+		fid = ERR_PTR(n);
+		goto err_out;
+	}
 	clone = 1;
 	i = 0;
 	while (i < n) {
 		l = min(n - i, P9_MAXWELEM);
+		/*
+		 * We need to hold rename lock when doing a multipath
+		 * walk to ensure none of the patch component change
+		 */
 		fid = p9_client_walk(fid, l, &wnames[i], clone);
 		if (IS_ERR(fid)) {
 			if (old_fid) {
@@ -193,15 +235,18 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
 				p9_client_clunk(old_fid);
 			}
 			kfree(wnames);
-			return fid;
+			goto err_out;
 		}
 		old_fid = fid;
 		i += l;
 		clone = 0;
 	}
-
 	kfree(wnames);
-	v9fs_fid_add(dentry, fid);
+fid_out:
+	if (!IS_ERR(fid))
+		v9fs_fid_add(dentry, fid);
+err_out:
+	up_read(&v9ses->rename_sem);
 	return fid;
 }
 
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index f8b86e92cd6..2f77cd33ba8 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -193,7 +193,17 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
 				v9ses->flags |= V9FS_ACCESS_USER;
 			else if (strcmp(s, "any") == 0)
 				v9ses->flags |= V9FS_ACCESS_ANY;
-			else {
+			else if (strcmp(s, "client") == 0) {
+#ifdef CONFIG_9P_FS_POSIX_ACL
+				v9ses->flags |= V9FS_ACCESS_CLIENT;
+#else
+				P9_DPRINTK(P9_DEBUG_ERROR,
+					"access=client option not supported\n");
+				kfree(s);
+				ret = -EINVAL;
+				goto free_and_return;
+#endif
+			} else {
 				v9ses->flags |= V9FS_ACCESS_SINGLE;
 				v9ses->uid = simple_strtoul(s, &e, 10);
 				if (*e != '\0')
@@ -237,6 +247,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 		__putname(v9ses->uname);
 		return ERR_PTR(-ENOMEM);
 	}
+	init_rwsem(&v9ses->rename_sem);
 
 	rc = bdi_setup_and_register(&v9ses->bdi, "9p", BDI_CAP_MAP_COPY);
 	if (rc) {
@@ -277,8 +288,18 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 
 	v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ;
 
+	if (!v9fs_proto_dotl(v9ses) &&
+	    ((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_CLIENT)) {
+		/*
+		 * We support ACCESS_CLIENT only for dotl.
+		 * Fall back to ACCESS_USER
+		 */
+		v9ses->flags &= ~V9FS_ACCESS_MASK;
+		v9ses->flags |= V9FS_ACCESS_USER;
+	}
+	/*FIXME !! */
 	/* for legacy mode, fall back to V9FS_ACCESS_ANY */
-	if (!v9fs_proto_dotu(v9ses) &&
+	if (!(v9fs_proto_dotu(v9ses) || v9fs_proto_dotl(v9ses)) &&
 		((v9ses->flags&V9FS_ACCESS_MASK) == V9FS_ACCESS_USER)) {
 
 		v9ses->flags &= ~V9FS_ACCESS_MASK;
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index bec4d0bcb45..cb6396855e2 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -33,13 +33,17 @@
  *
  * Session flags reflect options selected by users at mount time
  */
+#define	V9FS_ACCESS_ANY (V9FS_ACCESS_SINGLE | \
+			 V9FS_ACCESS_USER |   \
+			 V9FS_ACCESS_CLIENT)
+#define V9FS_ACCESS_MASK V9FS_ACCESS_ANY
+
 enum p9_session_flags {
 	V9FS_PROTO_2000U	= 0x01,
 	V9FS_PROTO_2000L	= 0x02,
 	V9FS_ACCESS_SINGLE	= 0x04,
 	V9FS_ACCESS_USER	= 0x08,
-	V9FS_ACCESS_ANY		= 0x0C,
-	V9FS_ACCESS_MASK	= 0x0C,
+	V9FS_ACCESS_CLIENT	= 0x10
 };
 
 /* possible values of ->cache */
@@ -104,6 +108,7 @@ struct v9fs_session_info {
 	struct p9_client *clnt;	/* 9p client */
 	struct list_head slist; /* list of sessions registered with v9fs */
 	struct backing_dev_info bdi;
+	struct rw_semaphore rename_sem;
 };
 
 struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *,
@@ -112,8 +117,6 @@ void v9fs_session_close(struct v9fs_session_info *v9ses);
 void v9fs_session_cancel(struct v9fs_session_info *v9ses);
 void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses);
 
-#define V9FS_MAGIC 0x01021997
-
 /* other default globals */
 #define V9FS_PORT	564
 #define V9FS_DEFUSER	"nobody"
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index 32ef4009d03..bab0eac873f 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -52,9 +52,10 @@ void v9fs_destroy_inode(struct inode *inode);
 #endif
 
 struct inode *v9fs_get_inode(struct super_block *sb, int mode);
-void v9fs_clear_inode(struct inode *inode);
+void v9fs_evict_inode(struct inode *inode);
 ino_t v9fs_qid2ino(struct p9_qid *qid);
 void v9fs_stat2inode(struct p9_wstat *, struct inode *, struct super_block *);
+void v9fs_stat2inode_dotl(struct p9_stat_dotl *, struct inode *);
 int v9fs_dir_release(struct inode *inode, struct file *filp);
 int v9fs_file_open(struct inode *inode, struct file *file);
 void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat);
@@ -63,3 +64,7 @@ int v9fs_uflags2omode(int uflags, int extended);
 
 ssize_t v9fs_file_readn(struct file *, char *, char __user *, u32, u64);
 void v9fs_blank_wstat(struct p9_wstat *wstat);
+int v9fs_vfs_setattr_dotl(struct dentry *, struct iattr *);
+int v9fs_file_fsync_dotl(struct file *filp, int datasync);
+
+#define P9_LOCK_TIMEOUT (30*HZ)
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 90e38449f4b..b7f2a8e3863 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -154,10 +154,40 @@ static int v9fs_launder_page(struct page *page)
 	return 0;
 }
 
+/**
+ * v9fs_direct_IO - 9P address space operation for direct I/O
+ * @rw: direction (read or write)
+ * @iocb: target I/O control block
+ * @iov: array of vectors that define I/O buffer
+ * @pos: offset in file to begin the operation
+ * @nr_segs: size of iovec array
+ *
+ * The presence of v9fs_direct_IO() in the address space ops vector
+ * allowes open() O_DIRECT flags which would have failed otherwise.
+ *
+ * In the non-cached mode, we shunt off direct read and write requests before
+ * the VFS gets them, so this method should never be called.
+ *
+ * Direct IO is not 'yet' supported in the cached mode. Hence when
+ * this routine is called through generic_file_aio_read(), the read/write fails
+ * with an error.
+ *
+ */
+ssize_t v9fs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
+		loff_t pos, unsigned long nr_segs)
+{
+	P9_DPRINTK(P9_DEBUG_VFS, "v9fs_direct_IO: v9fs_direct_IO (%s) "
+			"off/no(%lld/%lu) EINVAL\n",
+			iocb->ki_filp->f_path.dentry->d_name.name,
+			(long long) pos, nr_segs);
+
+	return -EINVAL;
+}
 const struct address_space_operations v9fs_addr_operations = {
       .readpage = v9fs_vfs_readpage,
       .readpages = v9fs_vfs_readpages,
       .releasepage = v9fs_release_page,
       .invalidatepage = v9fs_invalidate_page,
       .launder_page = v9fs_launder_page,
+      .direct_IO = v9fs_direct_IO,
 };
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 36d961f342a..b84ebe8cefe 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -87,29 +87,19 @@ static void p9stat_init(struct p9_wstat *stbuf)
 }
 
 /**
- * v9fs_dir_readdir - read a directory
+ * v9fs_alloc_rdir_buf - Allocate buffer used for read and readdir
  * @filp: opened file structure
- * @dirent: directory structure ???
- * @filldir: function to populate directory structure ???
+ * @buflen: Length in bytes of buffer to allocate
  *
  */
 
-static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int v9fs_alloc_rdir_buf(struct file *filp, int buflen)
 {
-	int over;
-	struct p9_wstat st;
-	int err = 0;
-	struct p9_fid *fid;
-	int buflen;
-	int reclen = 0;
 	struct p9_rdir *rdir;
+	struct p9_fid *fid;
+	int err = 0;
 
-	P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name);
 	fid = filp->private_data;
-
-	buflen = fid->clnt->msize - P9_IOHDRSZ;
-
-	/* allocate rdir on demand */
 	if (!fid->rdir) {
 		rdir = kmalloc(sizeof(struct p9_rdir) + buflen, GFP_KERNEL);
 
@@ -128,6 +118,36 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		spin_unlock(&filp->f_dentry->d_lock);
 		kfree(rdir);
 	}
+exit:
+	return err;
+}
+
+/**
+ * v9fs_dir_readdir - read a directory
+ * @filp: opened file structure
+ * @dirent: directory structure ???
+ * @filldir: function to populate directory structure ???
+ *
+ */
+
+static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+	int over;
+	struct p9_wstat st;
+	int err = 0;
+	struct p9_fid *fid;
+	int buflen;
+	int reclen = 0;
+	struct p9_rdir *rdir;
+
+	P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name);
+	fid = filp->private_data;
+
+	buflen = fid->clnt->msize - P9_IOHDRSZ;
+
+	err = v9fs_alloc_rdir_buf(filp, buflen);
+	if (err)
+		goto exit;
 	rdir = (struct p9_rdir *) fid->rdir;
 
 	err = mutex_lock_interruptible(&rdir->mutex);
@@ -176,6 +196,89 @@ exit:
 	return err;
 }
 
+/**
+ * v9fs_dir_readdir_dotl - read a directory
+ * @filp: opened file structure
+ * @dirent: buffer to fill dirent structures
+ * @filldir: function to populate dirent structures
+ *
+ */
+static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent,
+						filldir_t filldir)
+{
+	int over;
+	int err = 0;
+	struct p9_fid *fid;
+	int buflen;
+	struct p9_rdir *rdir;
+	struct p9_dirent curdirent;
+	u64 oldoffset = 0;
+
+	P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name);
+	fid = filp->private_data;
+
+	buflen = fid->clnt->msize - P9_READDIRHDRSZ;
+
+	err = v9fs_alloc_rdir_buf(filp, buflen);
+	if (err)
+		goto exit;
+	rdir = (struct p9_rdir *) fid->rdir;
+
+	err = mutex_lock_interruptible(&rdir->mutex);
+	if (err)
+		return err;
+
+	while (err == 0) {
+		if (rdir->tail == rdir->head) {
+			err = p9_client_readdir(fid, rdir->buf, buflen,
+								filp->f_pos);
+			if (err <= 0)
+				goto unlock_and_exit;
+
+			rdir->head = 0;
+			rdir->tail = err;
+		}
+
+		while (rdir->head < rdir->tail) {
+
+			err = p9dirent_read(rdir->buf + rdir->head,
+						rdir->tail - rdir->head,
+						&curdirent,
+						fid->clnt->proto_version);
+			if (err < 0) {
+				P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err);
+				err = -EIO;
+				goto unlock_and_exit;
+			}
+
+			/* d_off in dirent structure tracks the offset into
+			 * the next dirent in the dir. However, filldir()
+			 * expects offset into the current dirent. Hence
+			 * while calling filldir send the offset from the
+			 * previous dirent structure.
+			 */
+			over = filldir(dirent, curdirent.d_name,
+					strlen(curdirent.d_name),
+					oldoffset, v9fs_qid2ino(&curdirent.qid),
+					curdirent.d_type);
+			oldoffset = curdirent.d_off;
+
+			if (over) {
+				err = 0;
+				goto unlock_and_exit;
+			}
+
+			filp->f_pos = curdirent.d_off;
+			rdir->head += err;
+		}
+	}
+
+unlock_and_exit:
+	mutex_unlock(&rdir->mutex);
+exit:
+	return err;
+}
+
 
 /**
  * v9fs_dir_release - close a directory
@@ -190,9 +293,11 @@ int v9fs_dir_release(struct inode *inode, struct file *filp)
 
 	fid = filp->private_data;
 	P9_DPRINTK(P9_DEBUG_VFS,
-			"inode: %p filp: %p fid: %d\n", inode, filp, fid->fid);
+			"v9fs_dir_release: inode: %p filp: %p fid: %d\n",
+			inode, filp, fid ? fid->fid : -1);
 	filemap_write_and_wait(inode->i_mapping);
-	p9_client_clunk(fid);
+	if (fid)
+		p9_client_clunk(fid);
 	return 0;
 }
 
@@ -207,7 +312,8 @@ const struct file_operations v9fs_dir_operations = {
 const struct file_operations v9fs_dir_operations_dotl = {
 	.read = generic_read_dir,
 	.llseek = generic_file_llseek,
-	.readdir = v9fs_dir_readdir,
+	.readdir = v9fs_dir_readdir_dotl,
 	.open = v9fs_file_open,
 	.release = v9fs_dir_release,
+        .fsync = v9fs_file_fsync_dotl,
 };
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 2bedc6c94fc..240c3067439 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -33,6 +33,7 @@
 #include <linux/inet.h>
 #include <linux/list.h>
 #include <linux/pagemap.h>
+#include <linux/utsname.h>
 #include <asm/uaccess.h>
 #include <linux/idr.h>
 #include <net/9p/9p.h>
@@ -44,6 +45,7 @@
 #include "cache.h"
 
 static const struct file_operations v9fs_cached_file_operations;
+static const struct file_operations v9fs_cached_file_operations_dotl;
 
 /**
  * v9fs_file_open - open a file (or directory)
@@ -59,9 +61,13 @@ int v9fs_file_open(struct inode *inode, struct file *file)
 	struct p9_fid *fid;
 	int omode;
 
-	P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p \n", inode, file);
+	P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, file);
 	v9ses = v9fs_inode2v9ses(inode);
-	omode = v9fs_uflags2omode(file->f_flags, v9fs_proto_dotu(v9ses));
+	if (v9fs_proto_dotl(v9ses))
+		omode = file->f_flags;
+	else
+		omode = v9fs_uflags2omode(file->f_flags,
+					v9fs_proto_dotu(v9ses));
 	fid = file->private_data;
 	if (!fid) {
 		fid = v9fs_fid_clone(file->f_path.dentry);
@@ -73,11 +79,12 @@ int v9fs_file_open(struct inode *inode, struct file *file)
 			p9_client_clunk(fid);
 			return err;
 		}
-		if (omode & P9_OTRUNC) {
+		if (file->f_flags & O_TRUNC) {
 			i_size_write(inode, 0);
 			inode->i_blocks = 0;
 		}
-		if ((file->f_flags & O_APPEND) && (!v9fs_proto_dotu(v9ses)))
+		if ((file->f_flags & O_APPEND) &&
+			(!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses)))
 			generic_file_llseek(file, 0, SEEK_END);
 	}
 
@@ -87,6 +94,8 @@ int v9fs_file_open(struct inode *inode, struct file *file)
 		/* enable cached file options */
 		if(file->f_op == &v9fs_file_operations)
 			file->f_op = &v9fs_cached_file_operations;
+		else if (file->f_op == &v9fs_file_operations_dotl)
+			file->f_op = &v9fs_cached_file_operations_dotl;
 
 #ifdef CONFIG_9P_FSCACHE
 		v9fs_cache_inode_set_cookie(inode, file);
@@ -125,6 +134,206 @@ static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl)
 	return res;
 }
 
+static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
+{
+	struct p9_flock flock;
+	struct p9_fid *fid;
+	uint8_t status;
+	int res = 0;
+	unsigned char fl_type;
+
+	fid = filp->private_data;
+	BUG_ON(fid == NULL);
+
+	if ((fl->fl_flags & FL_POSIX) != FL_POSIX)
+		BUG();
+
+	res = posix_lock_file_wait(filp, fl);
+	if (res < 0)
+		goto out;
+
+	/* convert posix lock to p9 tlock args */
+	memset(&flock, 0, sizeof(flock));
+	flock.type = fl->fl_type;
+	flock.start = fl->fl_start;
+	if (fl->fl_end == OFFSET_MAX)
+		flock.length = 0;
+	else
+		flock.length = fl->fl_end - fl->fl_start + 1;
+	flock.proc_id = fl->fl_pid;
+	flock.client_id = utsname()->nodename;
+	if (IS_SETLKW(cmd))
+		flock.flags = P9_LOCK_FLAGS_BLOCK;
+
+	/*
+	 * if its a blocked request and we get P9_LOCK_BLOCKED as the status
+	 * for lock request, keep on trying
+	 */
+	for (;;) {
+		res = p9_client_lock_dotl(fid, &flock, &status);
+		if (res < 0)
+			break;
+
+		if (status != P9_LOCK_BLOCKED)
+			break;
+		if (status == P9_LOCK_BLOCKED && !IS_SETLKW(cmd))
+			break;
+		schedule_timeout_interruptible(P9_LOCK_TIMEOUT);
+	}
+
+	/* map 9p status to VFS status */
+	switch (status) {
+	case P9_LOCK_SUCCESS:
+		res = 0;
+		break;
+	case P9_LOCK_BLOCKED:
+		res = -EAGAIN;
+		break;
+	case P9_LOCK_ERROR:
+	case P9_LOCK_GRACE:
+		res = -ENOLCK;
+		break;
+	default:
+		BUG();
+	}
+
+	/*
+	 * incase server returned error for lock request, revert
+	 * it locally
+	 */
+	if (res < 0 && fl->fl_type != F_UNLCK) {
+		fl_type = fl->fl_type;
+		fl->fl_type = F_UNLCK;
+		res = posix_lock_file_wait(filp, fl);
+		fl->fl_type = fl_type;
+	}
+out:
+	return res;
+}
+
+static int v9fs_file_getlock(struct file *filp, struct file_lock *fl)
+{
+	struct p9_getlock glock;
+	struct p9_fid *fid;
+	int res = 0;
+
+	fid = filp->private_data;
+	BUG_ON(fid == NULL);
+
+	posix_test_lock(filp, fl);
+	/*
+	 * if we have a conflicting lock locally, no need to validate
+	 * with server
+	 */
+	if (fl->fl_type != F_UNLCK)
+		return res;
+
+	/* convert posix lock to p9 tgetlock args */
+	memset(&glock, 0, sizeof(glock));
+	glock.type = fl->fl_type;
+	glock.start = fl->fl_start;
+	if (fl->fl_end == OFFSET_MAX)
+		glock.length = 0;
+	else
+		glock.length = fl->fl_end - fl->fl_start + 1;
+	glock.proc_id = fl->fl_pid;
+	glock.client_id = utsname()->nodename;
+
+	res = p9_client_getlock_dotl(fid, &glock);
+	if (res < 0)
+		return res;
+	if (glock.type != F_UNLCK) {
+		fl->fl_type = glock.type;
+		fl->fl_start = glock.start;
+		if (glock.length == 0)
+			fl->fl_end = OFFSET_MAX;
+		else
+			fl->fl_end = glock.start + glock.length - 1;
+		fl->fl_pid = glock.proc_id;
+	} else
+		fl->fl_type = F_UNLCK;
+
+	return res;
+}
+
+/**
+ * v9fs_file_lock_dotl - lock a file (or directory)
+ * @filp: file to be locked
+ * @cmd: lock command
+ * @fl: file lock structure
+ *
+ */
+
+static int v9fs_file_lock_dotl(struct file *filp, int cmd, struct file_lock *fl)
+{
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	int ret = -ENOLCK;
+
+	P9_DPRINTK(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n", filp,
+				cmd, fl, filp->f_path.dentry->d_name.name);
+
+	/* No mandatory locks */
+	if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
+		goto out_err;
+
+	if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) {
+		filemap_write_and_wait(inode->i_mapping);
+		invalidate_mapping_pages(&inode->i_data, 0, -1);
+	}
+
+	if (IS_SETLK(cmd) || IS_SETLKW(cmd))
+		ret = v9fs_file_do_lock(filp, cmd, fl);
+	else if (IS_GETLK(cmd))
+		ret = v9fs_file_getlock(filp, fl);
+	else
+		ret = -EINVAL;
+out_err:
+	return ret;
+}
+
+/**
+ * v9fs_file_flock_dotl - lock a file
+ * @filp: file to be locked
+ * @cmd: lock command
+ * @fl: file lock structure
+ *
+ */
+
+static int v9fs_file_flock_dotl(struct file *filp, int cmd,
+	struct file_lock *fl)
+{
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	int ret = -ENOLCK;
+
+	P9_DPRINTK(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n", filp,
+				cmd, fl, filp->f_path.dentry->d_name.name);
+
+	/* No mandatory locks */
+	if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
+		goto out_err;
+
+	if (!(fl->fl_flags & FL_FLOCK))
+		goto out_err;
+
+	if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) {
+		filemap_write_and_wait(inode->i_mapping);
+		invalidate_mapping_pages(&inode->i_data, 0, -1);
+	}
+	/* Convert flock to posix lock */
+	fl->fl_owner = (fl_owner_t)filp;
+	fl->fl_start = 0;
+	fl->fl_end = OFFSET_MAX;
+	fl->fl_flags |= FL_POSIX;
+	fl->fl_flags ^= FL_FLOCK;
+
+	if (IS_SETLK(cmd) | IS_SETLKW(cmd))
+		ret = v9fs_file_do_lock(filp, cmd, fl);
+	else
+		ret = -EINVAL;
+out_err:
+	return ret;
+}
+
 /**
  * v9fs_file_readn - read from a file
  * @filp: file pointer to read
@@ -139,7 +348,7 @@ ssize_t
 v9fs_file_readn(struct file *filp, char *data, char __user *udata, u32 count,
 	       u64 offset)
 {
-	int n, total;
+	int n, total, size;
 	struct p9_fid *fid = filp->private_data;
 
 	P9_DPRINTK(P9_DEBUG_VFS, "fid %d offset %llu count %d\n", fid->fid,
@@ -147,6 +356,7 @@ v9fs_file_readn(struct file *filp, char *data, char __user *udata, u32 count,
 
 	n = 0;
 	total = 0;
+	size = fid->iounit ? fid->iounit : fid->clnt->msize - P9_IOHDRSZ;
 	do {
 		n = p9_client_read(fid, data, udata, offset, count);
 		if (n <= 0)
@@ -160,7 +370,7 @@ v9fs_file_readn(struct file *filp, char *data, char __user *udata, u32 count,
 		offset += n;
 		count -= n;
 		total += n;
-	} while (count > 0 && n == (fid->clnt->msize - P9_IOHDRSZ));
+	} while (count > 0 && n == size);
 
 	if (n < 0)
 		total = n;
@@ -183,11 +393,13 @@ v9fs_file_read(struct file *filp, char __user *udata, size_t count,
 {
 	int ret;
 	struct p9_fid *fid;
+	size_t size;
 
 	P9_DPRINTK(P9_DEBUG_VFS, "count %zu offset %lld\n", count, *offset);
 	fid = filp->private_data;
 
-	if (count > (fid->clnt->msize - P9_IOHDRSZ))
+	size = fid->iounit ? fid->iounit : fid->clnt->msize - P9_IOHDRSZ;
+	if (count > size)
 		ret = v9fs_file_readn(filp, NULL, udata, count, *offset);
 	else
 		ret = p9_client_read(fid, NULL, udata, *offset, count);
@@ -211,7 +423,9 @@ static ssize_t
 v9fs_file_write(struct file *filp, const char __user * data,
 		size_t count, loff_t * offset)
 {
-	int n, rsize, total = 0;
+	ssize_t retval;
+	size_t total = 0;
+	int n;
 	struct p9_fid *fid;
 	struct p9_client *clnt;
 	struct inode *inode = filp->f_path.dentry->d_inode;
@@ -224,16 +438,19 @@ v9fs_file_write(struct file *filp, const char __user * data,
 	fid = filp->private_data;
 	clnt = fid->clnt;
 
-	rsize = fid->iounit;
-	if (!rsize || rsize > clnt->msize-P9_IOHDRSZ)
-		rsize = clnt->msize - P9_IOHDRSZ;
+	retval = generic_write_checks(filp, &origin, &count, 0);
+	if (retval)
+		goto out;
 
-	do {
-		if (count < rsize)
-			rsize = count;
+	retval = -EINVAL;
+	if ((ssize_t) count < 0)
+		goto out;
+	retval = 0;
+	if (!count)
+		goto out;
 
-		n = p9_client_write(fid, NULL, data+total, origin+total,
-									rsize);
+	do {
+		n = p9_client_write(fid, NULL, data+total, origin+total, count);
 		if (n <= 0)
 			break;
 		count -= n;
@@ -252,9 +469,11 @@ v9fs_file_write(struct file *filp, const char __user * data,
 	}
 
 	if (n < 0)
-		return n;
-
-	return total;
+		retval = n;
+	else
+		retval = total;
+out:
+	return retval;
 }
 
 static int v9fs_file_fsync(struct file *filp, int datasync)
@@ -272,6 +491,20 @@ static int v9fs_file_fsync(struct file *filp, int datasync)
 	return retval;
 }
 
+int v9fs_file_fsync_dotl(struct file *filp, int datasync)
+{
+	struct p9_fid *fid;
+	int retval;
+
+	P9_DPRINTK(P9_DEBUG_VFS, "v9fs_file_fsync_dotl: filp %p datasync %x\n",
+			filp, datasync);
+
+	fid = filp->private_data;
+
+	retval = p9_client_fsync(fid, datasync);
+	return retval;
+}
+
 static const struct file_operations v9fs_cached_file_operations = {
 	.llseek = generic_file_llseek,
 	.read = do_sync_read,
@@ -284,6 +517,19 @@ static const struct file_operations v9fs_cached_file_operations = {
 	.fsync = v9fs_file_fsync,
 };
 
+static const struct file_operations v9fs_cached_file_operations_dotl = {
+	.llseek = generic_file_llseek,
+	.read = do_sync_read,
+	.aio_read = generic_file_aio_read,
+	.write = v9fs_file_write,
+	.open = v9fs_file_open,
+	.release = v9fs_dir_release,
+	.lock = v9fs_file_lock_dotl,
+	.flock = v9fs_file_flock_dotl,
+	.mmap = generic_file_readonly_mmap,
+	.fsync = v9fs_file_fsync_dotl,
+};
+
 const struct file_operations v9fs_file_operations = {
 	.llseek = generic_file_llseek,
 	.read = v9fs_file_read,
@@ -301,7 +547,8 @@ const struct file_operations v9fs_file_operations_dotl = {
 	.write = v9fs_file_write,
 	.open = v9fs_file_open,
 	.release = v9fs_dir_release,
-	.lock = v9fs_file_lock,
+	.lock = v9fs_file_lock_dotl,
+	.flock = v9fs_file_flock_dotl,
 	.mmap = generic_file_readonly_mmap,
-	.fsync = v9fs_file_fsync,
+	.fsync = v9fs_file_fsync_dotl,
 };
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 4331b3b5ee1..34bf71b5654 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -35,6 +35,8 @@
 #include <linux/idr.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
+#include <linux/xattr.h>
+#include <linux/posix_acl.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 
@@ -42,6 +44,8 @@
 #include "v9fs_vfs.h"
 #include "fid.h"
 #include "cache.h"
+#include "xattr.h"
+#include "acl.h"
 
 static const struct inode_operations v9fs_dir_inode_operations;
 static const struct inode_operations v9fs_dir_inode_operations_dotu;
@@ -51,6 +55,10 @@ static const struct inode_operations v9fs_file_inode_operations_dotl;
 static const struct inode_operations v9fs_symlink_inode_operations;
 static const struct inode_operations v9fs_symlink_inode_operations_dotl;
 
+static int
+v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
+		    dev_t rdev);
+
 /**
  * unixmode2p9mode - convert unix mode bits to plan 9
  * @v9ses: v9fs session information
@@ -236,6 +244,41 @@ void v9fs_destroy_inode(struct inode *inode)
 #endif
 
 /**
+ * v9fs_get_fsgid_for_create - Helper function to get the gid for creating a
+ * new file system object. This checks the S_ISGID to determine the owning
+ * group of the new file system object.
+ */
+
+static gid_t v9fs_get_fsgid_for_create(struct inode *dir_inode)
+{
+	BUG_ON(dir_inode == NULL);
+
+	if (dir_inode->i_mode & S_ISGID) {
+		/* set_gid bit is set.*/
+		return dir_inode->i_gid;
+	}
+	return current_fsgid();
+}
+
+/**
+ * v9fs_dentry_from_dir_inode - helper function to get the dentry from
+ * dir inode.
+ *
+ */
+
+static struct dentry *v9fs_dentry_from_dir_inode(struct inode *inode)
+{
+	struct dentry *dentry;
+
+	spin_lock(&dcache_lock);
+	/* Directory should have only one entry. */
+	BUG_ON(S_ISDIR(inode->i_mode) && !list_is_singular(&inode->i_dentry));
+	dentry = list_entry(inode->i_dentry.next, struct dentry, d_alias);
+	spin_unlock(&dcache_lock);
+	return dentry;
+}
+
+/**
  * v9fs_get_inode - helper function to setup an inode
  * @sb: superblock
  * @mode: mode to setup inode with
@@ -267,7 +310,13 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
 	case S_IFBLK:
 	case S_IFCHR:
 	case S_IFSOCK:
-		if (!v9fs_proto_dotu(v9ses)) {
+		if (v9fs_proto_dotl(v9ses)) {
+			inode->i_op = &v9fs_file_inode_operations_dotl;
+			inode->i_fop = &v9fs_file_operations_dotl;
+		} else if (v9fs_proto_dotu(v9ses)) {
+			inode->i_op = &v9fs_file_inode_operations;
+			inode->i_fop = &v9fs_file_operations;
+		} else {
 			P9_DPRINTK(P9_DEBUG_ERROR,
 				   "special files without extended mode\n");
 			err = -EINVAL;
@@ -387,8 +436,10 @@ error:
  * @inode: inode to release
  *
  */
-void v9fs_clear_inode(struct inode *inode)
+void v9fs_evict_inode(struct inode *inode)
 {
+	truncate_inode_pages(inode->i_mapping, 0);
+	end_writeback(inode);
 	filemap_fdatawrite(inode->i_mapping);
 
 #ifdef CONFIG_9P_FSCACHE
@@ -396,23 +447,14 @@ void v9fs_clear_inode(struct inode *inode)
 #endif
 }
 
-/**
- * v9fs_inode_from_fid - populate an inode by issuing a attribute request
- * @v9ses: session information
- * @fid: fid to issue attribute request for
- * @sb: superblock on which to create inode
- *
- */
-
 static struct inode *
-v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
+v9fs_inode(struct v9fs_session_info *v9ses, struct p9_fid *fid,
 	struct super_block *sb)
 {
 	int err, umode;
-	struct inode *ret;
+	struct inode *ret = NULL;
 	struct p9_wstat *st;
 
-	ret = NULL;
 	st = p9_client_stat(fid);
 	if (IS_ERR(st))
 		return ERR_CAST(st);
@@ -433,15 +475,67 @@ v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
 #endif
 	p9stat_free(st);
 	kfree(st);
-
 	return ret;
-
 error:
 	p9stat_free(st);
 	kfree(st);
 	return ERR_PTR(err);
 }
 
+static struct inode *
+v9fs_inode_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid,
+	struct super_block *sb)
+{
+	struct inode *ret = NULL;
+	int err;
+	struct p9_stat_dotl *st;
+
+	st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
+	if (IS_ERR(st))
+		return ERR_CAST(st);
+
+	ret = v9fs_get_inode(sb, st->st_mode);
+	if (IS_ERR(ret)) {
+		err = PTR_ERR(ret);
+		goto error;
+	}
+
+	v9fs_stat2inode_dotl(st, ret);
+	ret->i_ino = v9fs_qid2ino(&st->qid);
+#ifdef CONFIG_9P_FSCACHE
+	v9fs_vcookie_set_qid(ret, &st->qid);
+	v9fs_cache_inode_get_cookie(ret);
+#endif
+	err = v9fs_get_acl(ret, fid);
+	if (err) {
+		iput(ret);
+		goto error;
+	}
+	kfree(st);
+	return ret;
+error:
+	kfree(st);
+	return ERR_PTR(err);
+}
+
+/**
+ * v9fs_inode_from_fid - Helper routine to populate an inode by
+ * issuing a attribute request
+ * @v9ses: session information
+ * @fid: fid to issue attribute request for
+ * @sb: superblock on which to create inode
+ *
+ */
+static inline struct inode *
+v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
+			struct super_block *sb)
+{
+	if (v9fs_proto_dotl(v9ses))
+		return v9fs_inode_dotl(v9ses, fid, sb);
+	else
+		return v9fs_inode(v9ses, fid, sb);
+}
+
 /**
  * v9fs_remove - helper function to remove files and directories
  * @dir: directory inode that is being deleted
@@ -470,13 +564,6 @@ static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir)
 	return retval;
 }
 
-static int
-v9fs_open_created(struct inode *inode, struct file *file)
-{
-	return 0;
-}
-
-
 /**
  * v9fs_create - Create a file
  * @v9ses: session information
@@ -563,6 +650,144 @@ error:
 }
 
 /**
+ * v9fs_vfs_create_dotl - VFS hook to create files for 9P2000.L protocol.
+ * @dir: directory inode that is being created
+ * @dentry:  dentry that is being deleted
+ * @mode: create permissions
+ * @nd: path information
+ *
+ */
+
+static int
+v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
+		struct nameidata *nd)
+{
+	int err = 0;
+	char *name = NULL;
+	gid_t gid;
+	int flags;
+	mode_t mode;
+	struct v9fs_session_info *v9ses;
+	struct p9_fid *fid = NULL;
+	struct p9_fid *dfid, *ofid;
+	struct file *filp;
+	struct p9_qid qid;
+	struct inode *inode;
+	struct posix_acl *pacl = NULL, *dacl = NULL;
+
+	v9ses = v9fs_inode2v9ses(dir);
+	if (nd && nd->flags & LOOKUP_OPEN)
+		flags = nd->intent.open.flags - 1;
+	else {
+		/*
+		 * create call without LOOKUP_OPEN is due
+		 * to mknod of regular files. So use mknod
+		 * operation.
+		 */
+		return v9fs_vfs_mknod_dotl(dir, dentry, omode, 0);
+	}
+
+	name = (char *) dentry->d_name.name;
+	P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_create_dotl: name:%s flags:0x%x "
+			"mode:0x%x\n", name, flags, omode);
+
+	dfid = v9fs_fid_lookup(dentry->d_parent);
+	if (IS_ERR(dfid)) {
+		err = PTR_ERR(dfid);
+		P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
+		return err;
+	}
+
+	/* clone a fid to use for creation */
+	ofid = p9_client_walk(dfid, 0, NULL, 1);
+	if (IS_ERR(ofid)) {
+		err = PTR_ERR(ofid);
+		P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
+		return err;
+	}
+
+	gid = v9fs_get_fsgid_for_create(dir);
+
+	mode = omode;
+	/* Update mode based on ACL value */
+	err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
+	if (err) {
+		P9_DPRINTK(P9_DEBUG_VFS,
+			   "Failed to get acl values in creat %d\n", err);
+		goto error;
+	}
+	err = p9_client_create_dotl(ofid, name, flags, mode, gid, &qid);
+	if (err < 0) {
+		P9_DPRINTK(P9_DEBUG_VFS,
+				"p9_client_open_dotl failed in creat %d\n",
+				err);
+		goto error;
+	}
+	/* instantiate inode and assign the unopened fid to the dentry */
+	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE ||
+	    (nd && nd->flags & LOOKUP_OPEN)) {
+		fid = p9_client_walk(dfid, 1, &name, 1);
+		if (IS_ERR(fid)) {
+			err = PTR_ERR(fid);
+			P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
+				err);
+			fid = NULL;
+			goto error;
+		}
+
+		inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
+		if (IS_ERR(inode)) {
+			err = PTR_ERR(inode);
+			P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
+				err);
+			goto error;
+		}
+		dentry->d_op = &v9fs_cached_dentry_operations;
+		d_instantiate(dentry, inode);
+		err = v9fs_fid_add(dentry, fid);
+		if (err < 0)
+			goto error;
+		/* The fid would get clunked via a dput */
+		fid = NULL;
+	} else {
+		/*
+		 * Not in cached mode. No need to populate
+		 * inode with stat. We need to get an inode
+		 * so that we can set the acl with dentry
+		 */
+		inode = v9fs_get_inode(dir->i_sb, mode);
+		if (IS_ERR(inode)) {
+			err = PTR_ERR(inode);
+			goto error;
+		}
+		dentry->d_op = &v9fs_dentry_operations;
+		d_instantiate(dentry, inode);
+	}
+	/* Now set the ACL based on the default value */
+	v9fs_set_create_acl(dentry, dacl, pacl);
+
+	/* if we are opening a file, assign the open fid to the file */
+	if (nd && nd->flags & LOOKUP_OPEN) {
+		filp = lookup_instantiate_filp(nd, dentry, generic_file_open);
+		if (IS_ERR(filp)) {
+			p9_client_clunk(ofid);
+			return PTR_ERR(filp);
+		}
+		filp->private_data = ofid;
+	} else
+		p9_client_clunk(ofid);
+
+	return 0;
+
+error:
+	if (ofid)
+		p9_client_clunk(ofid);
+	if (fid)
+		p9_client_clunk(fid);
+	return err;
+}
+
+/**
  * v9fs_vfs_create - VFS hook to create files
  * @dir: directory inode that is being created
  * @dentry:  dentry that is being deleted
@@ -602,7 +827,7 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode,
 
 	/* if we are opening a file, assign the open fid to the file */
 	if (nd && nd->flags & LOOKUP_OPEN) {
-		filp = lookup_instantiate_filp(nd, dentry, v9fs_open_created);
+		filp = lookup_instantiate_filp(nd, dentry, generic_file_open);
 		if (IS_ERR(filp)) {
 			err = PTR_ERR(filp);
 			goto error;
@@ -652,6 +877,107 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	return err;
 }
 
+
+/**
+ * v9fs_vfs_mkdir_dotl - VFS mkdir hook to create a directory
+ * @dir:  inode that is being unlinked
+ * @dentry: dentry that is being unlinked
+ * @mode: mode for new directory
+ *
+ */
+
+static int v9fs_vfs_mkdir_dotl(struct inode *dir,
+			       struct dentry *dentry, int omode)
+{
+	int err;
+	struct v9fs_session_info *v9ses;
+	struct p9_fid *fid = NULL, *dfid = NULL;
+	gid_t gid;
+	char *name;
+	mode_t mode;
+	struct inode *inode;
+	struct p9_qid qid;
+	struct dentry *dir_dentry;
+	struct posix_acl *dacl = NULL, *pacl = NULL;
+
+	P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
+	err = 0;
+	v9ses = v9fs_inode2v9ses(dir);
+
+	omode |= S_IFDIR;
+	if (dir->i_mode & S_ISGID)
+		omode |= S_ISGID;
+
+	dir_dentry = v9fs_dentry_from_dir_inode(dir);
+	dfid = v9fs_fid_lookup(dir_dentry);
+	if (IS_ERR(dfid)) {
+		err = PTR_ERR(dfid);
+		P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
+		dfid = NULL;
+		goto error;
+	}
+
+	gid = v9fs_get_fsgid_for_create(dir);
+	mode = omode;
+	/* Update mode based on ACL value */
+	err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
+	if (err) {
+		P9_DPRINTK(P9_DEBUG_VFS,
+			   "Failed to get acl values in mkdir %d\n", err);
+		goto error;
+	}
+	name = (char *) dentry->d_name.name;
+	err = p9_client_mkdir_dotl(dfid, name, mode, gid, &qid);
+	if (err < 0)
+		goto error;
+
+	/* instantiate inode and assign the unopened fid to the dentry */
+	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
+		fid = p9_client_walk(dfid, 1, &name, 1);
+		if (IS_ERR(fid)) {
+			err = PTR_ERR(fid);
+			P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
+				err);
+			fid = NULL;
+			goto error;
+		}
+
+		inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
+		if (IS_ERR(inode)) {
+			err = PTR_ERR(inode);
+			P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
+				err);
+			goto error;
+		}
+		dentry->d_op = &v9fs_cached_dentry_operations;
+		d_instantiate(dentry, inode);
+		err = v9fs_fid_add(dentry, fid);
+		if (err < 0)
+			goto error;
+		fid = NULL;
+	} else {
+		/*
+		 * Not in cached mode. No need to populate
+		 * inode with stat. We need to get an inode
+		 * so that we can set the acl with dentry
+		 */
+		inode = v9fs_get_inode(dir->i_sb, mode);
+		if (IS_ERR(inode)) {
+			err = PTR_ERR(inode);
+			goto error;
+		}
+		dentry->d_op = &v9fs_dentry_operations;
+		d_instantiate(dentry, inode);
+	}
+	/* Now set the ACL based on the default value */
+	v9fs_set_create_acl(dentry, dacl, pacl);
+
+error:
+	if (fid)
+		p9_client_clunk(fid);
+	return err;
+}
+
 /**
  * v9fs_vfs_lookup - VFS lookup hook to "walk" to a new inode
  * @dir:  inode that is being walked from
@@ -678,6 +1004,7 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 
 	sb = dir->i_sb;
 	v9ses = v9fs_inode2v9ses(dir);
+	/* We can walk d_parent because we hold the dir->i_mutex */
 	dfid = v9fs_fid_lookup(dentry->d_parent);
 	if (IS_ERR(dfid))
 		return ERR_CAST(dfid);
@@ -703,7 +1030,7 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 
 	result = v9fs_fid_add(dentry, fid);
 	if (result < 0)
-		goto error;
+		goto error_iput;
 
 inst_out:
 	if (v9ses->cache)
@@ -714,6 +1041,8 @@ inst_out:
 	d_add(dentry, inode);
 	return NULL;
 
+error_iput:
+	iput(inode);
 error:
 	p9_client_clunk(fid);
 
@@ -785,27 +1114,33 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		goto clunk_olddir;
 	}
 
+	down_write(&v9ses->rename_sem);
 	if (v9fs_proto_dotl(v9ses)) {
 		retval = p9_client_rename(oldfid, newdirfid,
 					(char *) new_dentry->d_name.name);
 		if (retval != -ENOSYS)
 			goto clunk_newdir;
 	}
+	if (old_dentry->d_parent != new_dentry->d_parent) {
+		/*
+		 * 9P .u can only handle file rename in the same directory
+		 */
 
-	/* 9P can only handle file rename in the same directory */
-	if (memcmp(&olddirfid->qid, &newdirfid->qid, sizeof(newdirfid->qid))) {
 		P9_DPRINTK(P9_DEBUG_ERROR,
 				"old dir and new dir are different\n");
 		retval = -EXDEV;
 		goto clunk_newdir;
 	}
-
 	v9fs_blank_wstat(&wstat);
 	wstat.muid = v9ses->uname;
 	wstat.name = (char *) new_dentry->d_name.name;
 	retval = p9_client_wstat(oldfid, &wstat);
 
 clunk_newdir:
+	if (!retval)
+		/* successful rename */
+		d_move(old_dentry, new_dentry);
+	up_write(&v9ses->rename_sem);
 	p9_client_clunk(newdirfid);
 
 clunk_olddir:
@@ -849,6 +1184,43 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
 	v9fs_stat2inode(st, dentry->d_inode, dentry->d_inode->i_sb);
 		generic_fillattr(dentry->d_inode, stat);
 
+	p9stat_free(st);
+	kfree(st);
+	return 0;
+}
+
+static int
+v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry,
+		 struct kstat *stat)
+{
+	int err;
+	struct v9fs_session_info *v9ses;
+	struct p9_fid *fid;
+	struct p9_stat_dotl *st;
+
+	P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry);
+	err = -EPERM;
+	v9ses = v9fs_inode2v9ses(dentry->d_inode);
+	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
+		return simple_getattr(mnt, dentry, stat);
+
+	fid = v9fs_fid_lookup(dentry);
+	if (IS_ERR(fid))
+		return PTR_ERR(fid);
+
+	/* Ask for all the fields in stat structure. Server will return
+	 * whatever it supports
+	 */
+
+	st = p9_client_getattr_dotl(fid, P9_STATS_ALL);
+	if (IS_ERR(st))
+		return PTR_ERR(st);
+
+	v9fs_stat2inode_dotl(st, dentry->d_inode);
+	generic_fillattr(dentry->d_inode, stat);
+	/* Change block size to what the server returned */
+	stat->blksize = st->st_blksize;
+
 	kfree(st);
 	return 0;
 }
@@ -896,10 +1268,77 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
 	}
 
 	retval = p9_client_wstat(fid, &wstat);
-	if (retval >= 0)
-		retval = inode_setattr(dentry->d_inode, iattr);
+	if (retval < 0)
+		return retval;
+
+	if ((iattr->ia_valid & ATTR_SIZE) &&
+	    iattr->ia_size != i_size_read(dentry->d_inode)) {
+		retval = vmtruncate(dentry->d_inode, iattr->ia_size);
+		if (retval)
+			return retval;
+	}
 
-	return retval;
+	setattr_copy(dentry->d_inode, iattr);
+	mark_inode_dirty(dentry->d_inode);
+	return 0;
+}
+
+/**
+ * v9fs_vfs_setattr_dotl - set file metadata
+ * @dentry: file whose metadata to set
+ * @iattr: metadata assignment structure
+ *
+ */
+
+int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
+{
+	int retval;
+	struct v9fs_session_info *v9ses;
+	struct p9_fid *fid;
+	struct p9_iattr_dotl p9attr;
+
+	P9_DPRINTK(P9_DEBUG_VFS, "\n");
+
+	retval = inode_change_ok(dentry->d_inode, iattr);
+	if (retval)
+		return retval;
+
+	p9attr.valid = iattr->ia_valid;
+	p9attr.mode = iattr->ia_mode;
+	p9attr.uid = iattr->ia_uid;
+	p9attr.gid = iattr->ia_gid;
+	p9attr.size = iattr->ia_size;
+	p9attr.atime_sec = iattr->ia_atime.tv_sec;
+	p9attr.atime_nsec = iattr->ia_atime.tv_nsec;
+	p9attr.mtime_sec = iattr->ia_mtime.tv_sec;
+	p9attr.mtime_nsec = iattr->ia_mtime.tv_nsec;
+
+	retval = -EPERM;
+	v9ses = v9fs_inode2v9ses(dentry->d_inode);
+	fid = v9fs_fid_lookup(dentry);
+	if (IS_ERR(fid))
+		return PTR_ERR(fid);
+
+	retval = p9_client_setattr(fid, &p9attr);
+	if (retval < 0)
+		return retval;
+
+	if ((iattr->ia_valid & ATTR_SIZE) &&
+	    iattr->ia_size != i_size_read(dentry->d_inode)) {
+		retval = vmtruncate(dentry->d_inode, iattr->ia_size);
+		if (retval)
+			return retval;
+	}
+
+	setattr_copy(dentry->d_inode, iattr);
+	mark_inode_dirty(dentry->d_inode);
+	if (iattr->ia_valid & ATTR_MODE) {
+		/* We also want to update ACL when we update mode bits */
+		retval = v9fs_acl_chmod(dentry);
+		if (retval < 0)
+			return retval;
+	}
+	return 0;
 }
 
 /**
@@ -980,6 +1419,77 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
 }
 
 /**
+ * v9fs_stat2inode_dotl - populate an inode structure with stat info
+ * @stat: stat structure
+ * @inode: inode to populate
+ * @sb: superblock of filesystem
+ *
+ */
+
+void
+v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
+{
+
+	if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) {
+		inode->i_atime.tv_sec = stat->st_atime_sec;
+		inode->i_atime.tv_nsec = stat->st_atime_nsec;
+		inode->i_mtime.tv_sec = stat->st_mtime_sec;
+		inode->i_mtime.tv_nsec = stat->st_mtime_nsec;
+		inode->i_ctime.tv_sec = stat->st_ctime_sec;
+		inode->i_ctime.tv_nsec = stat->st_ctime_nsec;
+		inode->i_uid = stat->st_uid;
+		inode->i_gid = stat->st_gid;
+		inode->i_nlink = stat->st_nlink;
+		inode->i_mode = stat->st_mode;
+		inode->i_rdev = new_decode_dev(stat->st_rdev);
+
+		if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode)))
+			init_special_inode(inode, inode->i_mode, inode->i_rdev);
+
+		i_size_write(inode, stat->st_size);
+		inode->i_blocks = stat->st_blocks;
+	} else {
+		if (stat->st_result_mask & P9_STATS_ATIME) {
+			inode->i_atime.tv_sec = stat->st_atime_sec;
+			inode->i_atime.tv_nsec = stat->st_atime_nsec;
+		}
+		if (stat->st_result_mask & P9_STATS_MTIME) {
+			inode->i_mtime.tv_sec = stat->st_mtime_sec;
+			inode->i_mtime.tv_nsec = stat->st_mtime_nsec;
+		}
+		if (stat->st_result_mask & P9_STATS_CTIME) {
+			inode->i_ctime.tv_sec = stat->st_ctime_sec;
+			inode->i_ctime.tv_nsec = stat->st_ctime_nsec;
+		}
+		if (stat->st_result_mask & P9_STATS_UID)
+			inode->i_uid = stat->st_uid;
+		if (stat->st_result_mask & P9_STATS_GID)
+			inode->i_gid = stat->st_gid;
+		if (stat->st_result_mask & P9_STATS_NLINK)
+			inode->i_nlink = stat->st_nlink;
+		if (stat->st_result_mask & P9_STATS_MODE) {
+			inode->i_mode = stat->st_mode;
+			if ((S_ISBLK(inode->i_mode)) ||
+						(S_ISCHR(inode->i_mode)))
+				init_special_inode(inode, inode->i_mode,
+								inode->i_rdev);
+		}
+		if (stat->st_result_mask & P9_STATS_RDEV)
+			inode->i_rdev = new_decode_dev(stat->st_rdev);
+		if (stat->st_result_mask & P9_STATS_SIZE)
+			i_size_write(inode, stat->st_size);
+		if (stat->st_result_mask & P9_STATS_BLOCKS)
+			inode->i_blocks = stat->st_blocks;
+	}
+	if (stat->st_result_mask & P9_STATS_GEN)
+			inode->i_generation = stat->st_gen;
+
+	/* Currently we don't support P9_STATS_BTIME and P9_STATS_DATA_VERSION
+	 * because the inode structure does not have fields for them.
+	 */
+}
+
+/**
  * v9fs_qid2ino - convert qid into inode number
  * @qid: qid to hash
  *
@@ -1042,6 +1552,7 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
 
 	retval = strnlen(buffer, buflen);
 done:
+	p9stat_free(st);
 	kfree(st);
 	return retval;
 }
@@ -1128,6 +1639,94 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry,
 }
 
 /**
+ * v9fs_vfs_symlink_dotl - helper function to create symlinks
+ * @dir: directory inode containing symlink
+ * @dentry: dentry for symlink
+ * @symname: symlink data
+ *
+ * See Also: 9P2000.L RFC for more information
+ *
+ */
+
+static int
+v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
+		const char *symname)
+{
+	struct v9fs_session_info *v9ses;
+	struct p9_fid *dfid;
+	struct p9_fid *fid = NULL;
+	struct inode *inode;
+	struct p9_qid qid;
+	char *name;
+	int err;
+	gid_t gid;
+
+	name = (char *) dentry->d_name.name;
+	P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_symlink_dotl : %lu,%s,%s\n",
+			dir->i_ino, name, symname);
+	v9ses = v9fs_inode2v9ses(dir);
+
+	dfid = v9fs_fid_lookup(dentry->d_parent);
+	if (IS_ERR(dfid)) {
+		err = PTR_ERR(dfid);
+		P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
+		return err;
+	}
+
+	gid = v9fs_get_fsgid_for_create(dir);
+
+	/* Server doesn't alter fid on TSYMLINK. Hence no need to clone it. */
+	err = p9_client_symlink(dfid, name, (char *)symname, gid, &qid);
+
+	if (err < 0) {
+		P9_DPRINTK(P9_DEBUG_VFS, "p9_client_symlink failed %d\n", err);
+		goto error;
+	}
+
+	if (v9ses->cache) {
+		/* Now walk from the parent so we can get an unopened fid. */
+		fid = p9_client_walk(dfid, 1, &name, 1);
+		if (IS_ERR(fid)) {
+			err = PTR_ERR(fid);
+			P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
+					err);
+			fid = NULL;
+			goto error;
+		}
+
+		/* instantiate inode and assign the unopened fid to dentry */
+		inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
+		if (IS_ERR(inode)) {
+			err = PTR_ERR(inode);
+			P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
+					err);
+			goto error;
+		}
+		dentry->d_op = &v9fs_cached_dentry_operations;
+		d_instantiate(dentry, inode);
+		err = v9fs_fid_add(dentry, fid);
+		if (err < 0)
+			goto error;
+		fid = NULL;
+	} else {
+		/* Not in cached mode. No need to populate inode with stat */
+		inode = v9fs_get_inode(dir->i_sb, S_IFLNK);
+		if (IS_ERR(inode)) {
+			err = PTR_ERR(inode);
+			goto error;
+		}
+		dentry->d_op = &v9fs_dentry_operations;
+		d_instantiate(dentry, inode);
+	}
+
+error:
+	if (fid)
+		p9_client_clunk(fid);
+
+	return err;
+}
+
+/**
  * v9fs_vfs_symlink - helper function to create symlinks
  * @dir: directory inode containing symlink
  * @dentry: dentry for symlink
@@ -1186,6 +1785,77 @@ clunk_fid:
 }
 
 /**
+ * v9fs_vfs_link_dotl - create a hardlink for dotl
+ * @old_dentry: dentry for file to link to
+ * @dir: inode destination for new link
+ * @dentry: dentry for link
+ *
+ */
+
+static int
+v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
+		struct dentry *dentry)
+{
+	int err;
+	struct p9_fid *dfid, *oldfid;
+	char *name;
+	struct v9fs_session_info *v9ses;
+	struct dentry *dir_dentry;
+
+	P9_DPRINTK(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n",
+			dir->i_ino, old_dentry->d_name.name,
+			dentry->d_name.name);
+
+	v9ses = v9fs_inode2v9ses(dir);
+	dir_dentry = v9fs_dentry_from_dir_inode(dir);
+	dfid = v9fs_fid_lookup(dir_dentry);
+	if (IS_ERR(dfid))
+		return PTR_ERR(dfid);
+
+	oldfid = v9fs_fid_lookup(old_dentry);
+	if (IS_ERR(oldfid))
+		return PTR_ERR(oldfid);
+
+	name = (char *) dentry->d_name.name;
+
+	err = p9_client_link(dfid, oldfid, (char *)dentry->d_name.name);
+
+	if (err < 0) {
+		P9_DPRINTK(P9_DEBUG_VFS, "p9_client_link failed %d\n", err);
+		return err;
+	}
+
+	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
+		/* Get the latest stat info from server. */
+		struct p9_fid *fid;
+		struct p9_stat_dotl *st;
+
+		fid = v9fs_fid_lookup(old_dentry);
+		if (IS_ERR(fid))
+			return PTR_ERR(fid);
+
+		st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
+		if (IS_ERR(st))
+			return PTR_ERR(st);
+
+		v9fs_stat2inode_dotl(st, old_dentry->d_inode);
+
+		kfree(st);
+	} else {
+		/* Caching disabled. No need to get upto date stat info.
+		 * This dentry will be released immediately. So, just hold the
+		 * inode
+		 */
+		ihold(old_dentry->d_inode);
+	}
+
+	dentry->d_op = old_dentry->d_op;
+	d_instantiate(dentry, old_dentry->d_inode);
+
+	return err;
+}
+
+/**
  * v9fs_vfs_mknod - create a special file
  * @dir: inode destination for new link
  * @dentry: dentry for file
@@ -1230,6 +1900,160 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
 	return retval;
 }
 
+/**
+ * v9fs_vfs_mknod_dotl - create a special file
+ * @dir: inode destination for new link
+ * @dentry: dentry for file
+ * @mode: mode for creation
+ * @rdev: device associated with special file
+ *
+ */
+static int
+v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
+		dev_t rdev)
+{
+	int err;
+	char *name;
+	mode_t mode;
+	struct v9fs_session_info *v9ses;
+	struct p9_fid *fid = NULL, *dfid = NULL;
+	struct inode *inode;
+	gid_t gid;
+	struct p9_qid qid;
+	struct dentry *dir_dentry;
+	struct posix_acl *dacl = NULL, *pacl = NULL;
+
+	P9_DPRINTK(P9_DEBUG_VFS,
+		" %lu,%s mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino,
+		dentry->d_name.name, omode, MAJOR(rdev), MINOR(rdev));
+
+	if (!new_valid_dev(rdev))
+		return -EINVAL;
+
+	v9ses = v9fs_inode2v9ses(dir);
+	dir_dentry = v9fs_dentry_from_dir_inode(dir);
+	dfid = v9fs_fid_lookup(dir_dentry);
+	if (IS_ERR(dfid)) {
+		err = PTR_ERR(dfid);
+		P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
+		dfid = NULL;
+		goto error;
+	}
+
+	gid = v9fs_get_fsgid_for_create(dir);
+	mode = omode;
+	/* Update mode based on ACL value */
+	err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
+	if (err) {
+		P9_DPRINTK(P9_DEBUG_VFS,
+			   "Failed to get acl values in mknod %d\n", err);
+		goto error;
+	}
+	name = (char *) dentry->d_name.name;
+
+	err = p9_client_mknod_dotl(dfid, name, mode, rdev, gid, &qid);
+	if (err < 0)
+		goto error;
+
+	/* instantiate inode and assign the unopened fid to the dentry */
+	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
+		fid = p9_client_walk(dfid, 1, &name, 1);
+		if (IS_ERR(fid)) {
+			err = PTR_ERR(fid);
+			P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
+				err);
+			fid = NULL;
+			goto error;
+		}
+
+		inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
+		if (IS_ERR(inode)) {
+			err = PTR_ERR(inode);
+			P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
+				err);
+			goto error;
+		}
+		dentry->d_op = &v9fs_cached_dentry_operations;
+		d_instantiate(dentry, inode);
+		err = v9fs_fid_add(dentry, fid);
+		if (err < 0)
+			goto error;
+		fid = NULL;
+	} else {
+		/*
+		 * Not in cached mode. No need to populate inode with stat.
+		 * socket syscall returns a fd, so we need instantiate
+		 */
+		inode = v9fs_get_inode(dir->i_sb, mode);
+		if (IS_ERR(inode)) {
+			err = PTR_ERR(inode);
+			goto error;
+		}
+		dentry->d_op = &v9fs_dentry_operations;
+		d_instantiate(dentry, inode);
+	}
+	/* Now set the ACL based on the default value */
+	v9fs_set_create_acl(dentry, dacl, pacl);
+error:
+	if (fid)
+		p9_client_clunk(fid);
+	return err;
+}
+
+static int
+v9fs_vfs_readlink_dotl(struct dentry *dentry, char *buffer, int buflen)
+{
+	int retval;
+	struct p9_fid *fid;
+	char *target = NULL;
+
+	P9_DPRINTK(P9_DEBUG_VFS, " %s\n", dentry->d_name.name);
+	retval = -EPERM;
+	fid = v9fs_fid_lookup(dentry);
+	if (IS_ERR(fid))
+		return PTR_ERR(fid);
+
+	retval = p9_client_readlink(fid, &target);
+	if (retval < 0)
+		return retval;
+
+	strncpy(buffer, target, buflen);
+	P9_DPRINTK(P9_DEBUG_VFS, "%s -> %s\n", dentry->d_name.name, buffer);
+
+	retval = strnlen(buffer, buflen);
+	return retval;
+}
+
+/**
+ * v9fs_vfs_follow_link_dotl - follow a symlink path
+ * @dentry: dentry for symlink
+ * @nd: nameidata
+ *
+ */
+
+static void *
+v9fs_vfs_follow_link_dotl(struct dentry *dentry, struct nameidata *nd)
+{
+	int len = 0;
+	char *link = __getname();
+
+	P9_DPRINTK(P9_DEBUG_VFS, "%s n", dentry->d_name.name);
+
+	if (!link)
+		link = ERR_PTR(-ENOMEM);
+	else {
+		len = v9fs_vfs_readlink_dotl(dentry, link, PATH_MAX);
+		if (len < 0) {
+			__putname(link);
+			link = ERR_PTR(len);
+		} else
+			link[min(len, PATH_MAX-1)] = 0;
+	}
+	nd_set_link(nd, link);
+
+	return NULL;
+}
+
 static const struct inode_operations v9fs_dir_inode_operations_dotu = {
 	.create = v9fs_vfs_create,
 	.lookup = v9fs_vfs_lookup,
@@ -1245,17 +2069,22 @@ static const struct inode_operations v9fs_dir_inode_operations_dotu = {
 };
 
 static const struct inode_operations v9fs_dir_inode_operations_dotl = {
-	.create = v9fs_vfs_create,
+	.create = v9fs_vfs_create_dotl,
 	.lookup = v9fs_vfs_lookup,
-	.symlink = v9fs_vfs_symlink,
-	.link = v9fs_vfs_link,
+	.link = v9fs_vfs_link_dotl,
+	.symlink = v9fs_vfs_symlink_dotl,
 	.unlink = v9fs_vfs_unlink,
-	.mkdir = v9fs_vfs_mkdir,
+	.mkdir = v9fs_vfs_mkdir_dotl,
 	.rmdir = v9fs_vfs_rmdir,
-	.mknod = v9fs_vfs_mknod,
+	.mknod = v9fs_vfs_mknod_dotl,
 	.rename = v9fs_vfs_rename,
-	.getattr = v9fs_vfs_getattr,
-	.setattr = v9fs_vfs_setattr,
+	.getattr = v9fs_vfs_getattr_dotl,
+	.setattr = v9fs_vfs_setattr_dotl,
+	.setxattr = generic_setxattr,
+	.getxattr = generic_getxattr,
+	.removexattr = generic_removexattr,
+	.listxattr = v9fs_listxattr,
+	.check_acl = v9fs_check_acl,
 };
 
 static const struct inode_operations v9fs_dir_inode_operations = {
@@ -1276,8 +2105,13 @@ static const struct inode_operations v9fs_file_inode_operations = {
 };
 
 static const struct inode_operations v9fs_file_inode_operations_dotl = {
-	.getattr = v9fs_vfs_getattr,
-	.setattr = v9fs_vfs_setattr,
+	.getattr = v9fs_vfs_getattr_dotl,
+	.setattr = v9fs_vfs_setattr_dotl,
+	.setxattr = generic_setxattr,
+	.getxattr = generic_getxattr,
+	.removexattr = generic_removexattr,
+	.listxattr = v9fs_listxattr,
+	.check_acl = v9fs_check_acl,
 };
 
 static const struct inode_operations v9fs_symlink_inode_operations = {
@@ -1289,9 +2123,13 @@ static const struct inode_operations v9fs_symlink_inode_operations = {
 };
 
 static const struct inode_operations v9fs_symlink_inode_operations_dotl = {
-	.readlink = generic_readlink,
-	.follow_link = v9fs_vfs_follow_link,
+	.readlink = v9fs_vfs_readlink_dotl,
+	.follow_link = v9fs_vfs_follow_link_dotl,
 	.put_link = v9fs_vfs_put_link,
-	.getattr = v9fs_vfs_getattr,
-	.setattr = v9fs_vfs_setattr,
+	.getattr = v9fs_vfs_getattr_dotl,
+	.setattr = v9fs_vfs_setattr_dotl,
+	.setxattr = generic_setxattr,
+	.getxattr = generic_getxattr,
+	.removexattr = generic_removexattr,
+	.listxattr = v9fs_listxattr,
 };
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index be74d020436..c55c614500a 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -39,12 +39,15 @@
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/statfs.h>
+#include <linux/magic.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 
 #include "v9fs.h"
 #include "v9fs_vfs.h"
 #include "fid.h"
+#include "xattr.h"
+#include "acl.h"
 
 static const struct super_operations v9fs_super_ops, v9fs_super_ops_dotl;
 
@@ -65,7 +68,7 @@ static int v9fs_set_super(struct super_block *s, void *data)
  * v9fs_fill_super - populate superblock with info
  * @sb: superblock
  * @v9ses: session information
- * @flags: flags propagated from v9fs_get_sb()
+ * @flags: flags propagated from v9fs_mount()
  *
  */
 
@@ -77,37 +80,40 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
 	sb->s_blocksize_bits = fls(v9ses->maxdata - 1);
 	sb->s_blocksize = 1 << sb->s_blocksize_bits;
 	sb->s_magic = V9FS_MAGIC;
-	if (v9fs_proto_dotl(v9ses))
+	if (v9fs_proto_dotl(v9ses)) {
 		sb->s_op = &v9fs_super_ops_dotl;
-	else
+		sb->s_xattr = v9fs_xattr_handlers;
+	} else
 		sb->s_op = &v9fs_super_ops;
 	sb->s_bdi = &v9ses->bdi;
 
 	sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC |
 	    MS_NOATIME;
 
+#ifdef CONFIG_9P_FS_POSIX_ACL
+	if ((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_CLIENT)
+		sb->s_flags |= MS_POSIXACL;
+#endif
+
 	save_mount_options(sb, data);
 }
 
 /**
- * v9fs_get_sb - mount a superblock
+ * v9fs_mount - mount a superblock
  * @fs_type: file system type
  * @flags: mount flags
  * @dev_name: device name that was mounted
  * @data: mount options
- * @mnt: mountpoint record to be instantiated
  *
  */
 
-static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
-		       const char *dev_name, void *data,
-		       struct vfsmount *mnt)
+static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
+		       const char *dev_name, void *data)
 {
 	struct super_block *sb = NULL;
 	struct inode *inode = NULL;
 	struct dentry *root = NULL;
 	struct v9fs_session_info *v9ses = NULL;
-	struct p9_wstat *st = NULL;
 	int mode = S_IRWXUGO | S_ISVTX;
 	struct p9_fid *fid;
 	int retval = 0;
@@ -116,24 +122,22 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
 
 	v9ses = kzalloc(sizeof(struct v9fs_session_info), GFP_KERNEL);
 	if (!v9ses)
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
 
 	fid = v9fs_session_init(v9ses, dev_name, data);
 	if (IS_ERR(fid)) {
 		retval = PTR_ERR(fid);
+		/*
+		 * we need to call session_close to tear down some
+		 * of the data structure setup by session_init
+		 */
 		goto close_session;
 	}
 
-	st = p9_client_stat(fid);
-	if (IS_ERR(st)) {
-		retval = PTR_ERR(st);
-		goto clunk_fid;
-	}
-
 	sb = sget(fs_type, NULL, v9fs_set_super, v9ses);
 	if (IS_ERR(sb)) {
 		retval = PTR_ERR(sb);
-		goto free_stat;
+		goto clunk_fid;
 	}
 	v9fs_fill_super(sb, v9ses, flags, data);
 
@@ -149,37 +153,56 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
 		retval = -ENOMEM;
 		goto release_sb;
 	}
-
 	sb->s_root = root;
-	root->d_inode->i_ino = v9fs_qid2ino(&st->qid);
+	if (v9fs_proto_dotl(v9ses)) {
+		struct p9_stat_dotl *st = NULL;
+		st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
+		if (IS_ERR(st)) {
+			retval = PTR_ERR(st);
+			goto release_sb;
+		}
 
-	v9fs_stat2inode(st, root->d_inode, sb);
+		v9fs_stat2inode_dotl(st, root->d_inode);
+		kfree(st);
+	} else {
+		struct p9_wstat *st = NULL;
+		st = p9_client_stat(fid);
+		if (IS_ERR(st)) {
+			retval = PTR_ERR(st);
+			goto release_sb;
+		}
 
-	v9fs_fid_add(root, fid);
-	p9stat_free(st);
-	kfree(st);
+		root->d_inode->i_ino = v9fs_qid2ino(&st->qid);
+		v9fs_stat2inode(st, root->d_inode, sb);
 
-P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n");
-	simple_set_mnt(mnt, sb);
-	return 0;
+		p9stat_free(st);
+		kfree(st);
+	}
+	retval = v9fs_get_acl(inode, fid);
+	if (retval)
+		goto release_sb;
+	v9fs_fid_add(root, fid);
 
-free_stat:
-	p9stat_free(st);
-	kfree(st);
+	P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n");
+	return dget(sb->s_root);
 
 clunk_fid:
 	p9_client_clunk(fid);
-
 close_session:
 	v9fs_session_close(v9ses);
 	kfree(v9ses);
-	return retval;
+	return ERR_PTR(retval);
 
 release_sb:
-	p9stat_free(st);
-	kfree(st);
+	/*
+	 * we will do the session_close and root dentry release
+	 * in the below call. But we need to clunk fid, because we haven't
+	 * attached the fid to dentry so it won't get clunked
+	 * automatically.
+	 */
+	p9_client_clunk(fid);
 	deactivate_locked_super(sb);
-	return retval;
+	return ERR_PTR(retval);
 }
 
 /**
@@ -232,7 +255,7 @@ static int v9fs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	if (v9fs_proto_dotl(v9ses)) {
 		res = p9_client_statfs(fid, &rs);
 		if (res == 0) {
-			buf->f_type = rs.type;
+			buf->f_type = V9FS_MAGIC;
 			buf->f_bsize = rs.bsize;
 			buf->f_blocks = rs.blocks;
 			buf->f_bfree = rs.bfree;
@@ -257,7 +280,7 @@ static const struct super_operations v9fs_super_ops = {
 	.destroy_inode = v9fs_destroy_inode,
 #endif
 	.statfs = simple_statfs,
-	.clear_inode = v9fs_clear_inode,
+	.evict_inode = v9fs_evict_inode,
 	.show_options = generic_show_options,
 	.umount_begin = v9fs_umount_begin,
 };
@@ -268,14 +291,15 @@ static const struct super_operations v9fs_super_ops_dotl = {
 	.destroy_inode = v9fs_destroy_inode,
 #endif
 	.statfs = v9fs_statfs,
-	.clear_inode = v9fs_clear_inode,
+	.evict_inode = v9fs_evict_inode,
 	.show_options = generic_show_options,
 	.umount_begin = v9fs_umount_begin,
 };
 
 struct file_system_type v9fs_fs_type = {
 	.name = "9p",
-	.get_sb = v9fs_get_sb,
+	.mount = v9fs_mount,
 	.kill_sb = v9fs_kill_super,
 	.owner = THIS_MODULE,
+	.fs_flags = FS_RENAME_DOES_D_MOVE,
 };
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
new file mode 100644
index 00000000000..43ec7df8433
--- /dev/null
+++ b/fs/9p/xattr.c
@@ -0,0 +1,172 @@
+/*
+ * Copyright IBM Corporation, 2010
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <net/9p/9p.h>
+#include <net/9p/client.h>
+
+#include "fid.h"
+#include "xattr.h"
+
+ssize_t v9fs_fid_xattr_get(struct p9_fid *fid, const char *name,
+			   void *buffer, size_t buffer_size)
+{
+	ssize_t retval;
+	int msize, read_count;
+	u64 offset = 0, attr_size;
+	struct p9_fid *attr_fid;
+
+	attr_fid = p9_client_xattrwalk(fid, name, &attr_size);
+	if (IS_ERR(attr_fid)) {
+		retval = PTR_ERR(attr_fid);
+		P9_DPRINTK(P9_DEBUG_VFS,
+			"p9_client_attrwalk failed %zd\n", retval);
+		attr_fid = NULL;
+		goto error;
+	}
+	if (!buffer_size) {
+		/* request to get the attr_size */
+		retval = attr_size;
+		goto error;
+	}
+	if (attr_size > buffer_size) {
+		retval = -ERANGE;
+		goto error;
+	}
+	msize = attr_fid->clnt->msize;
+	while (attr_size) {
+		if (attr_size > (msize - P9_IOHDRSZ))
+			read_count = msize - P9_IOHDRSZ;
+		else
+			read_count = attr_size;
+		read_count = p9_client_read(attr_fid, ((char *)buffer)+offset,
+					NULL, offset, read_count);
+		if (read_count < 0) {
+			/* error in xattr read */
+			retval = read_count;
+			goto error;
+		}
+		offset += read_count;
+		attr_size -= read_count;
+	}
+	/* Total read xattr bytes */
+	retval = offset;
+error:
+	if (attr_fid)
+		p9_client_clunk(attr_fid);
+	return retval;
+
+}
+
+
+/*
+ * v9fs_xattr_get()
+ *
+ * Copy an extended attribute into the buffer
+ * provided, or compute the buffer size required.
+ * Buffer is NULL to compute the size of the buffer required.
+ *
+ * Returns a negative error number on failure, or the number of bytes
+ * used / required on success.
+ */
+ssize_t v9fs_xattr_get(struct dentry *dentry, const char *name,
+		       void *buffer, size_t buffer_size)
+{
+	struct p9_fid *fid;
+
+	P9_DPRINTK(P9_DEBUG_VFS, "%s: name = %s value_len = %zu\n",
+		__func__, name, buffer_size);
+	fid = v9fs_fid_lookup(dentry);
+	if (IS_ERR(fid))
+		return PTR_ERR(fid);
+
+	return v9fs_fid_xattr_get(fid, name, buffer, buffer_size);
+}
+
+/*
+ * v9fs_xattr_set()
+ *
+ * Create, replace or remove an extended attribute for this inode. Buffer
+ * is NULL to remove an existing extended attribute, and non-NULL to
+ * either replace an existing extended attribute, or create a new extended
+ * attribute. The flags XATTR_REPLACE and XATTR_CREATE
+ * specify that an extended attribute must exist and must not exist
+ * previous to the call, respectively.
+ *
+ * Returns 0, or a negative error number on failure.
+ */
+int v9fs_xattr_set(struct dentry *dentry, const char *name,
+		   const void *value, size_t value_len, int flags)
+{
+	u64 offset = 0;
+	int retval, msize, write_count;
+	struct p9_fid *fid = NULL;
+
+	P9_DPRINTK(P9_DEBUG_VFS, "%s: name = %s value_len = %zu flags = %d\n",
+		__func__, name, value_len, flags);
+
+	fid = v9fs_fid_clone(dentry);
+	if (IS_ERR(fid)) {
+		retval = PTR_ERR(fid);
+		fid = NULL;
+		goto error;
+	}
+	/*
+	 * On success fid points to xattr
+	 */
+	retval = p9_client_xattrcreate(fid, name, value_len, flags);
+	if (retval < 0) {
+		P9_DPRINTK(P9_DEBUG_VFS,
+			"p9_client_xattrcreate failed %d\n", retval);
+		goto error;
+	}
+	msize = fid->clnt->msize;;
+	while (value_len) {
+		if (value_len > (msize - P9_IOHDRSZ))
+			write_count = msize - P9_IOHDRSZ;
+		else
+			write_count = value_len;
+		write_count = p9_client_write(fid, ((char *)value)+offset,
+					NULL, offset, write_count);
+		if (write_count < 0) {
+			/* error in xattr write */
+			retval = write_count;
+			goto error;
+		}
+		offset += write_count;
+		value_len -= write_count;
+	}
+	/* Total read xattr bytes */
+	retval = offset;
+error:
+	if (fid)
+		retval = p9_client_clunk(fid);
+	return retval;
+}
+
+ssize_t v9fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
+{
+	return v9fs_xattr_get(dentry, NULL, buffer, buffer_size);
+}
+
+const struct xattr_handler *v9fs_xattr_handlers[] = {
+	&v9fs_xattr_user_handler,
+#ifdef CONFIG_9P_FS_POSIX_ACL
+	&v9fs_xattr_acl_access_handler,
+	&v9fs_xattr_acl_default_handler,
+#endif
+	NULL
+};
diff --git a/fs/9p/xattr.h b/fs/9p/xattr.h
new file mode 100644
index 00000000000..eaa837c53bd
--- /dev/null
+++ b/fs/9p/xattr.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright IBM Corporation, 2010
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+#ifndef FS_9P_XATTR_H
+#define FS_9P_XATTR_H
+
+#include <linux/xattr.h>
+#include <net/9p/9p.h>
+#include <net/9p/client.h>
+
+extern const struct xattr_handler *v9fs_xattr_handlers[];
+extern struct xattr_handler v9fs_xattr_user_handler;
+extern const struct xattr_handler v9fs_xattr_acl_access_handler;
+extern const struct xattr_handler v9fs_xattr_acl_default_handler;
+
+extern ssize_t v9fs_fid_xattr_get(struct p9_fid *, const char *,
+				  void *, size_t);
+extern ssize_t v9fs_xattr_get(struct dentry *, const char *,
+			      void *, size_t);
+extern int v9fs_xattr_set(struct dentry *, const char *,
+			  const void *, size_t, int);
+extern ssize_t v9fs_listxattr(struct dentry *, char *, size_t);
+#endif /* FS_9P_XATTR_H */
diff --git a/fs/9p/xattr_user.c b/fs/9p/xattr_user.c
new file mode 100644
index 00000000000..d0b701b7208
--- /dev/null
+++ b/fs/9p/xattr_user.c
@@ -0,0 +1,80 @@
+/*
+ * Copyright IBM Corporation, 2010
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include "xattr.h"
+
+static int v9fs_xattr_user_get(struct dentry *dentry, const char *name,
+			void *buffer, size_t size, int type)
+{
+	int retval;
+	char *full_name;
+	size_t name_len;
+	size_t prefix_len = XATTR_USER_PREFIX_LEN;
+
+	if (name == NULL)
+		return -EINVAL;
+
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+
+	name_len = strlen(name);
+	full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL);
+	if (!full_name)
+		return -ENOMEM;
+	memcpy(full_name, XATTR_USER_PREFIX, prefix_len);
+	memcpy(full_name+prefix_len, name, name_len);
+	full_name[prefix_len + name_len] = '\0';
+
+	retval = v9fs_xattr_get(dentry, full_name, buffer, size);
+	kfree(full_name);
+	return retval;
+}
+
+static int v9fs_xattr_user_set(struct dentry *dentry, const char *name,
+			const void *value, size_t size, int flags, int type)
+{
+	int retval;
+	char *full_name;
+	size_t name_len;
+	size_t prefix_len = XATTR_USER_PREFIX_LEN;
+
+	if (name == NULL)
+		return -EINVAL;
+
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+
+	name_len = strlen(name);
+	full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL);
+	if (!full_name)
+		return -ENOMEM;
+	memcpy(full_name, XATTR_USER_PREFIX, prefix_len);
+	memcpy(full_name + prefix_len, name, name_len);
+	full_name[prefix_len + name_len] = '\0';
+
+	retval = v9fs_xattr_set(dentry, full_name, value, size, flags);
+	kfree(full_name);
+	return retval;
+}
+
+struct xattr_handler v9fs_xattr_user_handler = {
+	.prefix	= XATTR_USER_PREFIX,
+	.get	= v9fs_xattr_user_get,
+	.set	= v9fs_xattr_user_set,
+};
diff --git a/fs/Kconfig b/fs/Kconfig
index 5f85b594761..771f457402d 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -47,6 +47,9 @@ source "fs/nilfs2/Kconfig"
 
 endif # BLOCK
 
+config EXPORTFS
+	tristate
+
 config FILE_LOCKING
 	bool "Enable POSIX file locking API" if EMBEDDED
 	default y
@@ -59,12 +62,11 @@ source "fs/notify/Kconfig"
 
 source "fs/quota/Kconfig"
 
-source "fs/autofs/Kconfig"
 source "fs/autofs4/Kconfig"
 source "fs/fuse/Kconfig"
 
 config CUSE
-	tristate "Character device in Userpace support"
+	tristate "Character device in Userspace support"
 	depends on FUSE_FS
 	help
 	  This FUSE extension allows character devices to be
@@ -221,9 +223,6 @@ config LOCKD_V4
 	depends on FILE_LOCKING
 	default y
 
-config EXPORTFS
-	tristate
-
 config NFS_ACL_SUPPORT
 	tristate
 	select FS_POSIX_ACL
@@ -234,7 +233,6 @@ config NFS_COMMON
 	default y
 
 source "net/sunrpc/Kconfig"
-source "fs/smbfs/Kconfig"
 source "fs/ceph/Kconfig"
 source "fs/cifs/Kconfig"
 source "fs/ncpfs/Kconfig"
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index bb4cc5b8abc..79e2ca7973b 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -42,7 +42,7 @@ config BINFMT_ELF_FDPIC
 
 config CORE_DUMP_DEFAULT_ELF_HEADERS
 	bool "Write ELF core dumps with partial segments"
-	default n
+	default y
 	depends on BINFMT_ELF && ELF_CORE
 	help
 	  ELF core dump files describe each memory mapping of the crashed
@@ -60,7 +60,7 @@ config CORE_DUMP_DEFAULT_ELF_HEADERS
 	  inherited.  See Documentation/filesystems/proc.txt for details.
 
 	  This config option changes the default setting of coredump_filter
-	  seen at boot time.  If unsure, say N.
+	  seen at boot time.  If unsure, say Y.
 
 config BINFMT_FLAT
 	bool "Kernel support for flat binaries"
diff --git a/fs/Makefile b/fs/Makefile
index e6ec1d309b1..a7f7cef0c0c 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -29,10 +29,7 @@ obj-$(CONFIG_EVENTFD)		+= eventfd.o
 obj-$(CONFIG_AIO)               += aio.o
 obj-$(CONFIG_FILE_LOCKING)      += locks.o
 obj-$(CONFIG_COMPAT)		+= compat.o compat_ioctl.o
-
-nfsd-$(CONFIG_NFSD)		:= nfsctl.o
-obj-y				+= $(nfsd-y) $(nfsd-m)
-
+obj-$(CONFIG_NFSD_DEPRECATED)	+= nfsctl.o
 obj-$(CONFIG_BINFMT_AOUT)	+= binfmt_aout.o
 obj-$(CONFIG_BINFMT_EM86)	+= binfmt_em86.o
 obj-$(CONFIG_BINFMT_MISC)	+= binfmt_misc.o
@@ -91,7 +88,6 @@ obj-$(CONFIG_NFSD)		+= nfsd/
 obj-$(CONFIG_LOCKD)		+= lockd/
 obj-$(CONFIG_NLS)		+= nls/
 obj-$(CONFIG_SYSV_FS)		+= sysv/
-obj-$(CONFIG_SMB_FS)		+= smbfs/
 obj-$(CONFIG_CIFS)		+= cifs/
 obj-$(CONFIG_NCP_FS)		+= ncpfs/
 obj-$(CONFIG_HPFS_FS)		+= hpfs/
@@ -104,7 +100,6 @@ obj-$(CONFIG_UBIFS_FS)		+= ubifs/
 obj-$(CONFIG_AFFS_FS)		+= affs/
 obj-$(CONFIG_ROMFS_FS)		+= romfs/
 obj-$(CONFIG_QNX4FS_FS)		+= qnx4/
-obj-$(CONFIG_AUTOFS_FS)		+= autofs/
 obj-$(CONFIG_AUTOFS4_FS)	+= autofs4/
 obj-$(CONFIG_ADFS_FS)		+= adfs/
 obj-$(CONFIG_FUSE_FS)		+= fuse/
diff --git a/fs/adfs/Kconfig b/fs/adfs/Kconfig
index e55182a7460..1dd5f34b3cf 100644
--- a/fs/adfs/Kconfig
+++ b/fs/adfs/Kconfig
@@ -1,6 +1,7 @@
 config ADFS_FS
 	tristate "ADFS file system support (EXPERIMENTAL)"
 	depends on BLOCK && EXPERIMENTAL
+	depends on BKL # need to fix
 	help
 	  The Acorn Disc Filing System is the standard file system of the
 	  RiscOS operating system which runs on Acorn's ARM-based Risc PC
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 6f850b06ab6..65794b8fe79 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -50,10 +50,19 @@ static int adfs_write_begin(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
 			struct page **pagep, void **fsdata)
 {
+	int ret;
+
 	*pagep = NULL;
-	return cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+	ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
 				adfs_get_block,
 				&ADFS_I(mapping->host)->mmu_private);
+	if (unlikely(ret)) {
+		loff_t isize = mapping->host->i_size;
+		if (pos + len > isize)
+			vmtruncate(mapping->host, isize);
+	}
+
+	return ret;
 }
 
 static sector_t _adfs_bmap(struct address_space *mapping, sector_t block)
@@ -324,10 +333,7 @@ adfs_notify_change(struct dentry *dentry, struct iattr *attr)
 
 	/* XXX: this is missing some actual on-disk truncation.. */
 	if (ia_valid & ATTR_SIZE)
-		error = simple_setsize(inode, attr->ia_size);
-
-	if (error)
-		goto out;
+		truncate_setsize(inode, attr->ia_size);
 
 	if (ia_valid & ATTR_MTIME) {
 		inode->i_mtime = attr->ia_mtime;
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 4a3af7075c1..959dbff2d42 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -352,11 +352,15 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
 	struct adfs_sb_info *asb;
 	struct inode *root;
 
+	lock_kernel();
+
 	sb->s_flags |= MS_NODIRATIME;
 
 	asb = kzalloc(sizeof(*asb), GFP_KERNEL);
-	if (!asb)
+	if (!asb) {
+		unlock_kernel();
 		return -ENOMEM;
+	}
 	sb->s_fs_info = asb;
 
 	/* set default options */
@@ -474,6 +478,7 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
 		goto error;
 	} else
 		sb->s_root->d_op = &adfs_dentry_operations;
+	unlock_kernel();
 	return 0;
 
 error_free_bh:
@@ -481,20 +486,20 @@ error_free_bh:
 error:
 	sb->s_fs_info = NULL;
 	kfree(asb);
+	unlock_kernel();
 	return -EINVAL;
 }
 
-static int adfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *adfs_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, adfs_fill_super,
-			   mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, adfs_fill_super);
 }
 
 static struct file_system_type adfs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "adfs",
-	.get_sb		= adfs_get_sb,
+	.mount		= adfs_mount,
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index f05b6155ccc..a8cbdeb3402 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -171,8 +171,7 @@ extern int	affs_rename(struct inode *old_dir, struct dentry *old_dentry,
 extern unsigned long		 affs_parent_ino(struct inode *dir);
 extern struct inode		*affs_new_inode(struct inode *dir);
 extern int			 affs_notify_change(struct dentry *dentry, struct iattr *attr);
-extern void			 affs_delete_inode(struct inode *inode);
-extern void			 affs_clear_inode(struct inode *inode);
+extern void			 affs_evict_inode(struct inode *inode);
 extern struct inode		*affs_iget(struct super_block *sb,
 					unsigned long ino);
 extern int			 affs_write_inode(struct inode *inode,
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 322710c3eed..0a90dcd46de 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -406,10 +406,19 @@ static int affs_write_begin(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
 			struct page **pagep, void **fsdata)
 {
+	int ret;
+
 	*pagep = NULL;
-	return cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+	ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
 				affs_get_block,
 				&AFFS_I(mapping->host)->mmu_private);
+	if (unlikely(ret)) {
+		loff_t isize = mapping->host->i_size;
+		if (pos + len > isize)
+			vmtruncate(mapping->host, isize);
+	}
+
+	return ret;
 }
 
 static sector_t _affs_bmap(struct address_space *mapping, sector_t block)
@@ -885,9 +894,9 @@ affs_truncate(struct inode *inode)
 		if (AFFS_SB(sb)->s_flags & SF_OFS) {
 			struct buffer_head *bh = affs_bread_ino(inode, last_blk, 0);
 			u32 tmp;
-			if (IS_ERR(ext_bh)) {
+			if (IS_ERR(bh)) {
 				affs_warning(sb, "truncate", "unexpected read error for last block %u (%d)",
-					     ext, PTR_ERR(ext_bh));
+					     ext, PTR_ERR(bh));
 				return;
 			}
 			tmp = be32_to_cpu(AFFS_DATA_HEAD(bh)->next);
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index f4b2a4ee4f9..5d828903ac6 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -235,31 +235,36 @@ affs_notify_change(struct dentry *dentry, struct iattr *attr)
 		goto out;
 	}
 
-	error = inode_setattr(inode, attr);
-	if (!error && (attr->ia_valid & ATTR_MODE))
+	if ((attr->ia_valid & ATTR_SIZE) &&
+	    attr->ia_size != i_size_read(inode)) {
+		error = vmtruncate(inode, attr->ia_size);
+		if (error)
+			return error;
+	}
+
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+
+	if (attr->ia_valid & ATTR_MODE)
 		mode_to_prot(inode);
 out:
 	return error;
 }
 
 void
-affs_delete_inode(struct inode *inode)
-{
-	pr_debug("AFFS: delete_inode(ino=%lu, nlink=%u)\n", inode->i_ino, inode->i_nlink);
-	truncate_inode_pages(&inode->i_data, 0);
-	inode->i_size = 0;
-	affs_truncate(inode);
-	clear_inode(inode);
-	affs_free_block(inode->i_sb, inode->i_ino);
-}
-
-void
-affs_clear_inode(struct inode *inode)
+affs_evict_inode(struct inode *inode)
 {
 	unsigned long cache_page;
+	pr_debug("AFFS: evict_inode(ino=%lu, nlink=%u)\n", inode->i_ino, inode->i_nlink);
+	truncate_inode_pages(&inode->i_data, 0);
 
-	pr_debug("AFFS: clear_inode(ino=%lu, nlink=%u)\n", inode->i_ino, inode->i_nlink);
+	if (!inode->i_nlink) {
+		inode->i_size = 0;
+		affs_truncate(inode);
+	}
 
+	invalidate_inode_buffers(inode);
+	end_writeback(inode);
 	affs_free_prealloc(inode);
 	cache_page = (unsigned long)AFFS_I(inode)->i_lc;
 	if (cache_page) {
@@ -271,6 +276,9 @@ affs_clear_inode(struct inode *inode)
 	affs_brelse(AFFS_I(inode)->i_ext_bh);
 	AFFS_I(inode)->i_ext_last = ~1;
 	AFFS_I(inode)->i_ext_bh = NULL;
+
+	if (!inode->i_nlink)
+		affs_free_block(inode->i_sb, inode->i_ino);
 }
 
 struct inode *
@@ -380,7 +388,7 @@ affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s3
 		affs_adjust_checksum(inode_bh, block - be32_to_cpu(chain));
 		mark_buffer_dirty_inode(inode_bh, inode);
 		inode->i_nlink = 2;
-		atomic_inc(&inode->i_count);
+		ihold(inode);
 	}
 	affs_fix_checksum(sb, bh);
 	mark_buffer_dirty_inode(bh, inode);
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 16a3e4765f6..0cf7f4384cb 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -16,7 +16,6 @@
 #include <linux/parser.h>
 #include <linux/magic.h>
 #include <linux/sched.h>
-#include <linux/smp_lock.h>
 #include <linux/slab.h>
 #include "affs.h"
 
@@ -26,7 +25,7 @@ static int affs_statfs(struct dentry *dentry, struct kstatfs *buf);
 static int affs_remount (struct super_block *sb, int *flags, char *data);
 
 static void
-affs_commit_super(struct super_block *sb, int clean)
+affs_commit_super(struct super_block *sb, int wait, int clean)
 {
 	struct affs_sb_info *sbi = AFFS_SB(sb);
 	struct buffer_head *bh = sbi->s_root_bh;
@@ -36,6 +35,8 @@ affs_commit_super(struct super_block *sb, int clean)
 	secs_to_datestamp(get_seconds(), &tail->disk_change);
 	affs_fix_checksum(sb, bh);
 	mark_buffer_dirty(bh);
+	if (wait)
+		sync_dirty_buffer(bh);
 }
 
 static void
@@ -44,44 +45,33 @@ affs_put_super(struct super_block *sb)
 	struct affs_sb_info *sbi = AFFS_SB(sb);
 	pr_debug("AFFS: put_super()\n");
 
-	lock_kernel();
-
-	if (!(sb->s_flags & MS_RDONLY))
-		affs_commit_super(sb, 1);
+	if (!(sb->s_flags & MS_RDONLY) && sb->s_dirt)
+		affs_commit_super(sb, 1, 1);
 
 	kfree(sbi->s_prefix);
 	affs_free_bitmap(sb);
 	affs_brelse(sbi->s_root_bh);
 	kfree(sbi);
 	sb->s_fs_info = NULL;
-
-	unlock_kernel();
 }
 
 static void
 affs_write_super(struct super_block *sb)
 {
-	int clean = 2;
-
 	lock_super(sb);
-	if (!(sb->s_flags & MS_RDONLY)) {
-		//	if (sbi->s_bitmap[i].bm_bh) {
-		//		if (buffer_dirty(sbi->s_bitmap[i].bm_bh)) {
-		//			clean = 0;
-		affs_commit_super(sb, clean);
-		sb->s_dirt = !clean;	/* redo until bitmap synced */
-	} else
-		sb->s_dirt = 0;
+	if (!(sb->s_flags & MS_RDONLY))
+		affs_commit_super(sb, 1, 2);
+	sb->s_dirt = 0;
 	unlock_super(sb);
 
-	pr_debug("AFFS: write_super() at %lu, clean=%d\n", get_seconds(), clean);
+	pr_debug("AFFS: write_super() at %lu, clean=2\n", get_seconds());
 }
 
 static int
 affs_sync_fs(struct super_block *sb, int wait)
 {
 	lock_super(sb);
-	affs_commit_super(sb, 2);
+	affs_commit_super(sb, wait, 2);
 	sb->s_dirt = 0;
 	unlock_super(sb);
 	return 0;
@@ -114,8 +104,8 @@ static void init_once(void *foo)
 {
 	struct affs_inode_info *ei = (struct affs_inode_info *) foo;
 
-	init_MUTEX(&ei->i_link_lock);
-	init_MUTEX(&ei->i_ext_lock);
+	sema_init(&ei->i_link_lock, 1);
+	sema_init(&ei->i_ext_lock, 1);
 	inode_init_once(&ei->vfs_inode);
 }
 
@@ -140,8 +130,7 @@ static const struct super_operations affs_sops = {
 	.alloc_inode	= affs_alloc_inode,
 	.destroy_inode	= affs_destroy_inode,
 	.write_inode	= affs_write_inode,
-	.delete_inode	= affs_delete_inode,
-	.clear_inode	= affs_clear_inode,
+	.evict_inode	= affs_evict_inode,
 	.put_super	= affs_put_super,
 	.write_super	= affs_write_super,
 	.sync_fs	= affs_sync_fs,
@@ -308,6 +297,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
 	sbi = kzalloc(sizeof(struct affs_sb_info), GFP_KERNEL);
 	if (!sbi)
 		return -ENOMEM;
+
 	sb->s_fs_info = sbi;
 	mutex_init(&sbi->s_bmlock);
 	spin_lock_init(&sbi->symlink_lock);
@@ -533,7 +523,7 @@ affs_remount(struct super_block *sb, int *flags, char *data)
 		kfree(new_opts);
 		return -EINVAL;
 	}
-	lock_kernel();
+
 	replace_mount_options(sb, new_opts);
 
 	sbi->s_flags = mount_flags;
@@ -549,19 +539,15 @@ affs_remount(struct super_block *sb, int *flags, char *data)
 	memcpy(sbi->s_volume, volume, 32);
 	spin_unlock(&sbi->symlink_lock);
 
-	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
-		unlock_kernel();
+	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
 		return 0;
-	}
+
 	if (*flags & MS_RDONLY) {
-		sb->s_dirt = 1;
-		while (sb->s_dirt)
-			affs_write_super(sb);
+		affs_write_super(sb);
 		affs_free_bitmap(sb);
 	} else
 		res = affs_init_bitmap(sb, flags);
 
-	unlock_kernel();
 	return res;
 }
 
@@ -587,17 +573,16 @@ affs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	return 0;
 }
 
-static int affs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *affs_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, affs_fill_super,
-			   mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, affs_fill_super);
 }
 
 static struct file_system_type affs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "affs",
-	.get_sb		= affs_get_sb,
+	.mount		= affs_mount,
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
diff --git a/fs/afs/Kconfig b/fs/afs/Kconfig
index 5c4e61d3c77..8f975f25b48 100644
--- a/fs/afs/Kconfig
+++ b/fs/afs/Kconfig
@@ -2,6 +2,7 @@ config AFS_FS
 	tristate "Andrew File System support (AFS) (EXPERIMENTAL)"
 	depends on INET && EXPERIMENTAL
 	select AF_RXRPC
+	select DNS_RESOLVER
 	help
 	  If you say Y here, you will get an experimental Andrew File System
 	  driver. It currently only supports unsecured read-only AFS access.
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index e19c13f059e..0d5eeadf612 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -13,6 +13,7 @@
 #include <linux/slab.h>
 #include <linux/key.h>
 #include <linux/ctype.h>
+#include <linux/dns_resolver.h>
 #include <linux/sched.h>
 #include <keys/rxrpc-type.h>
 #include "internal.h"
@@ -30,21 +31,24 @@ static struct afs_cell *afs_cell_root;
  * allocate a cell record and fill in its name, VL server address list and
  * allocate an anonymous key
  */
-static struct afs_cell *afs_cell_alloc(const char *name, char *vllist)
+static struct afs_cell *afs_cell_alloc(const char *name, unsigned namelen,
+				       char *vllist)
 {
 	struct afs_cell *cell;
 	struct key *key;
-	size_t namelen;
 	char keyname[4 + AFS_MAXCELLNAME + 1], *cp, *dp, *next;
+	char  *dvllist = NULL, *_vllist = NULL;
+	char  delimiter = ':';
 	int ret;
 
-	_enter("%s,%s", name, vllist);
+	_enter("%*.*s,%s", namelen, namelen, name ?: "", vllist);
 
 	BUG_ON(!name); /* TODO: want to look up "this cell" in the cache */
 
-	namelen = strlen(name);
-	if (namelen > AFS_MAXCELLNAME)
+	if (namelen > AFS_MAXCELLNAME) {
+		_leave(" = -ENAMETOOLONG");
 		return ERR_PTR(-ENAMETOOLONG);
+	}
 
 	/* allocate and initialise a cell record */
 	cell = kzalloc(sizeof(struct afs_cell) + namelen + 1, GFP_KERNEL);
@@ -64,15 +68,35 @@ static struct afs_cell *afs_cell_alloc(const char *name, char *vllist)
 	INIT_LIST_HEAD(&cell->vl_list);
 	spin_lock_init(&cell->vl_lock);
 
+	/* if the ip address is invalid, try dns query */
+	if (!vllist || strlen(vllist) < 7) {
+		ret = dns_query("afsdb", name, namelen, "ipv4", &dvllist, NULL);
+		if (ret < 0) {
+			if (ret == -ENODATA || ret == -EAGAIN || ret == -ENOKEY)
+				/* translate these errors into something
+				 * userspace might understand */
+				ret = -EDESTADDRREQ;
+			_leave(" = %d", ret);
+			return ERR_PTR(ret);
+		}
+		_vllist = dvllist;
+
+		/* change the delimiter for user-space reply */
+		delimiter = ',';
+
+	} else {
+		_vllist = vllist;
+	}
+
 	/* fill in the VL server list from the rest of the string */
 	do {
 		unsigned a, b, c, d;
 
-		next = strchr(vllist, ':');
+		next = strchr(_vllist, delimiter);
 		if (next)
 			*next++ = 0;
 
-		if (sscanf(vllist, "%u.%u.%u.%u", &a, &b, &c, &d) != 4)
+		if (sscanf(_vllist, "%u.%u.%u.%u", &a, &b, &c, &d) != 4)
 			goto bad_address;
 
 		if (a > 255 || b > 255 || c > 255 || d > 255)
@@ -81,7 +105,7 @@ static struct afs_cell *afs_cell_alloc(const char *name, char *vllist)
 		cell->vl_addrs[cell->vl_naddrs++].s_addr =
 			htonl((a << 24) | (b << 16) | (c << 8) | d);
 
-	} while (cell->vl_naddrs < AFS_CELL_MAX_ADDRS && (vllist = next));
+	} while (cell->vl_naddrs < AFS_CELL_MAX_ADDRS && (_vllist = next));
 
 	/* create a key to represent an anonymous user */
 	memcpy(keyname, "afs@", 4);
@@ -110,32 +134,36 @@ bad_address:
 	ret = -EINVAL;
 error:
 	key_put(cell->anonymous_key);
+	kfree(dvllist);
 	kfree(cell);
 	_leave(" = %d", ret);
 	return ERR_PTR(ret);
 }
 
 /*
- * create a cell record
- * - "name" is the name of the cell
- * - "vllist" is a colon separated list of IP addresses in "a.b.c.d" format
+ * afs_cell_crate() - create a cell record
+ * @name:	is the name of the cell.
+ * @namsesz:	is the strlen of the cell name.
+ * @vllist:	is a colon separated list of IP addresses in "a.b.c.d" format.
+ * @retref:	is T to return the cell reference when the cell exists.
  */
-struct afs_cell *afs_cell_create(const char *name, char *vllist)
+struct afs_cell *afs_cell_create(const char *name, unsigned namesz,
+				 char *vllist, bool retref)
 {
 	struct afs_cell *cell;
 	int ret;
 
-	_enter("%s,%s", name, vllist);
+	_enter("%*.*s,%s", namesz, namesz, name ?: "", vllist);
 
 	down_write(&afs_cells_sem);
 	read_lock(&afs_cells_lock);
 	list_for_each_entry(cell, &afs_cells, link) {
-		if (strcasecmp(cell->name, name) == 0)
+		if (strncasecmp(cell->name, name, namesz) == 0)
 			goto duplicate_name;
 	}
 	read_unlock(&afs_cells_lock);
 
-	cell = afs_cell_alloc(name, vllist);
+	cell = afs_cell_alloc(name, namesz, vllist);
 	if (IS_ERR(cell)) {
 		_leave(" = %ld", PTR_ERR(cell));
 		up_write(&afs_cells_sem);
@@ -175,8 +203,18 @@ error:
 	return ERR_PTR(ret);
 
 duplicate_name:
+	if (retref && !IS_ERR(cell))
+		afs_get_cell(cell);
+
 	read_unlock(&afs_cells_lock);
 	up_write(&afs_cells_sem);
+
+	if (retref) {
+		_leave(" = %p", cell);
+		return cell;
+	}
+
+	_leave(" = -EEXIST");
 	return ERR_PTR(-EEXIST);
 }
 
@@ -201,15 +239,13 @@ int afs_cell_init(char *rootcell)
 	}
 
 	cp = strchr(rootcell, ':');
-	if (!cp) {
-		printk(KERN_ERR "kAFS: no VL server IP addresses specified\n");
-		_leave(" = -EINVAL");
-		return -EINVAL;
-	}
+	if (!cp)
+		_debug("kAFS: no VL server IP addresses specified");
+	else
+		*cp++ = 0;
 
 	/* allocate a cell record for the root cell */
-	*cp++ = 0;
-	new_root = afs_cell_create(rootcell, cp);
+	new_root = afs_cell_create(rootcell, strlen(rootcell), cp, false);
 	if (IS_ERR(new_root)) {
 		_leave(" = %ld", PTR_ERR(new_root));
 		return PTR_ERR(new_root);
@@ -229,11 +265,12 @@ int afs_cell_init(char *rootcell)
 /*
  * lookup a cell record
  */
-struct afs_cell *afs_cell_lookup(const char *name, unsigned namesz)
+struct afs_cell *afs_cell_lookup(const char *name, unsigned namesz,
+				 bool dns_cell)
 {
 	struct afs_cell *cell;
 
-	_enter("\"%*.*s\",", namesz, namesz, name ? name : "");
+	_enter("\"%*.*s\",", namesz, namesz, name ?: "");
 
 	down_read(&afs_cells_sem);
 	read_lock(&afs_cells_lock);
@@ -247,6 +284,8 @@ struct afs_cell *afs_cell_lookup(const char *name, unsigned namesz)
 			}
 		}
 		cell = ERR_PTR(-ENOENT);
+		if (dns_cell)
+			goto create_cell;
 	found:
 		;
 	} else {
@@ -269,6 +308,15 @@ struct afs_cell *afs_cell_lookup(const char *name, unsigned namesz)
 	up_read(&afs_cells_sem);
 	_leave(" = %p", cell);
 	return cell;
+
+create_cell:
+	read_unlock(&afs_cells_lock);
+	up_read(&afs_cells_sem);
+
+	cell = afs_cell_create(name, namesz, NULL, true);
+
+	_leave(" = %p", cell);
+	return cell;
 }
 
 #if 0
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index b42d5cc1d6d..5439e1bc9a8 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -477,6 +477,40 @@ static int afs_do_lookup(struct inode *dir, struct dentry *dentry,
 }
 
 /*
+ * Try to auto mount the mountpoint with pseudo directory, if the autocell
+ * operation is setted.
+ */
+static struct inode *afs_try_auto_mntpt(
+	int ret, struct dentry *dentry, struct inode *dir, struct key *key,
+	struct afs_fid *fid)
+{
+	const char *devname = dentry->d_name.name;
+	struct afs_vnode *vnode = AFS_FS_I(dir);
+	struct inode *inode;
+
+	_enter("%d, %p{%s}, {%x:%u}, %p",
+	       ret, dentry, devname, vnode->fid.vid, vnode->fid.vnode, key);
+
+	if (ret != -ENOENT ||
+	    !test_bit(AFS_VNODE_AUTOCELL, &vnode->flags))
+		goto out;
+
+	inode = afs_iget_autocell(dir, devname, strlen(devname), key);
+	if (IS_ERR(inode)) {
+		ret = PTR_ERR(inode);
+		goto out;
+	}
+
+	*fid = AFS_FS_I(inode)->fid;
+	_leave("= %p", inode);
+	return inode;
+
+out:
+	_leave("= %d", ret);
+	return ERR_PTR(ret);
+}
+
+/*
  * look up an entry in a directory
  */
 static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
@@ -520,6 +554,13 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
 
 	ret = afs_do_lookup(dir, dentry, &fid, key);
 	if (ret < 0) {
+		inode = afs_try_auto_mntpt(ret, dentry, dir, key, &fid);
+		if (!IS_ERR(inode)) {
+			key_put(key);
+			goto success;
+		}
+
+		ret = PTR_ERR(inode);
 		key_put(key);
 		if (ret == -ENOENT) {
 			d_add(dentry, NULL);
@@ -539,6 +580,7 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
 		return ERR_CAST(inode);
 	}
 
+success:
 	dentry->d_op = &afs_fs_dentry_operations;
 
 	d_add(dentry, inode);
@@ -696,8 +738,9 @@ static int afs_d_delete(struct dentry *dentry)
 		goto zap;
 
 	if (dentry->d_inode &&
-	    test_bit(AFS_VNODE_DELETED, &AFS_FS_I(dentry->d_inode)->flags))
-			goto zap;
+	    (test_bit(AFS_VNODE_DELETED,   &AFS_FS_I(dentry->d_inode)->flags) ||
+	     test_bit(AFS_VNODE_PSEUDODIR, &AFS_FS_I(dentry->d_inode)->flags)))
+		goto zap;
 
 	_leave(" = 0 [keep]");
 	return 0;
@@ -1002,7 +1045,7 @@ static int afs_link(struct dentry *from, struct inode *dir,
 	if (ret < 0)
 		goto link_error;
 
-	atomic_inc(&vnode->vfs_inode.i_count);
+	ihold(&vnode->vfs_inode);
 	d_instantiate(dentry, &vnode->vfs_inode);
 	key_put(key);
 	_leave(" = 0");
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index 0931bc1325e..757d664575d 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -9,7 +9,6 @@
  * 2 of the License, or (at your option) any later version.
  */
 
-#include <linux/smp_lock.h>
 #include "internal.h"
 
 #define AFS_LOCK_GRANTED	0
@@ -274,7 +273,7 @@ static int afs_do_setlk(struct file *file, struct file_lock *fl)
 
 	type = (fl->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE;
 
-	lock_kernel();
+	lock_flocks();
 
 	/* make sure we've got a callback on this file and that our view of the
 	 * data version is up to date */
@@ -421,7 +420,7 @@ given_lock:
 	afs_vnode_fetch_status(vnode, NULL, key);
 
 error:
-	unlock_kernel();
+	unlock_flocks();
 	_leave(" = %d", ret);
 	return ret;
 
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index d00b312e311..0747339011c 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -19,6 +19,8 @@
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/sched.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
 #include "internal.h"
 
 struct afs_iget_data {
@@ -102,6 +104,16 @@ static int afs_iget5_test(struct inode *inode, void *opaque)
 }
 
 /*
+ * iget5() comparator for inode created by autocell operations
+ *
+ * These pseudo inodes don't match anything.
+ */
+static int afs_iget5_autocell_test(struct inode *inode, void *opaque)
+{
+	return 0;
+}
+
+/*
  * iget5() inode initialiser
  */
 static int afs_iget5_set(struct inode *inode, void *opaque)
@@ -118,6 +130,67 @@ static int afs_iget5_set(struct inode *inode, void *opaque)
 }
 
 /*
+ * inode retrieval for autocell
+ */
+struct inode *afs_iget_autocell(struct inode *dir, const char *dev_name,
+				int namesz, struct key *key)
+{
+	struct afs_iget_data data;
+	struct afs_super_info *as;
+	struct afs_vnode *vnode;
+	struct super_block *sb;
+	struct inode *inode;
+	static atomic_t afs_autocell_ino;
+
+	_enter("{%x:%u},%*.*s,",
+	       AFS_FS_I(dir)->fid.vid, AFS_FS_I(dir)->fid.vnode,
+	       namesz, namesz, dev_name ?: "");
+
+	sb = dir->i_sb;
+	as = sb->s_fs_info;
+	data.volume = as->volume;
+	data.fid.vid = as->volume->vid;
+	data.fid.unique = 0;
+	data.fid.vnode = 0;
+
+	inode = iget5_locked(sb, atomic_inc_return(&afs_autocell_ino),
+			     afs_iget5_autocell_test, afs_iget5_set,
+			     &data);
+	if (!inode) {
+		_leave(" = -ENOMEM");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	_debug("GOT INODE %p { ino=%lu, vl=%x, vn=%x, u=%x }",
+	       inode, inode->i_ino, data.fid.vid, data.fid.vnode,
+	       data.fid.unique);
+
+	vnode = AFS_FS_I(inode);
+
+	/* there shouldn't be an existing inode */
+	BUG_ON(!(inode->i_state & I_NEW));
+
+	inode->i_size		= 0;
+	inode->i_mode		= S_IFDIR | S_IRUGO | S_IXUGO;
+	inode->i_op		= &afs_autocell_inode_operations;
+	inode->i_nlink		= 2;
+	inode->i_uid		= 0;
+	inode->i_gid		= 0;
+	inode->i_ctime.tv_sec	= get_seconds();
+	inode->i_ctime.tv_nsec	= 0;
+	inode->i_atime		= inode->i_mtime = inode->i_ctime;
+	inode->i_blocks		= 0;
+	inode->i_version	= 0;
+	inode->i_generation	= 0;
+
+	set_bit(AFS_VNODE_PSEUDODIR, &vnode->flags);
+	inode->i_flags |= S_NOATIME;
+	unlock_new_inode(inode);
+	_leave(" = %p", inode);
+	return inode;
+}
+
+/*
  * inode retrieval
  */
 struct inode *afs_iget(struct super_block *sb, struct key *key,
@@ -314,9 +387,22 @@ int afs_getattr(struct vfsmount *mnt, struct dentry *dentry,
 }
 
 /*
+ * discard an AFS inode
+ */
+int afs_drop_inode(struct inode *inode)
+{
+	_enter("");
+
+	if (test_bit(AFS_VNODE_PSEUDODIR, &AFS_FS_I(inode)->flags))
+		return generic_delete_inode(inode);
+	else
+		return generic_drop_inode(inode);
+}
+
+/*
  * clear an AFS inode
  */
-void afs_clear_inode(struct inode *inode)
+void afs_evict_inode(struct inode *inode)
 {
 	struct afs_permits *permits;
 	struct afs_vnode *vnode;
@@ -335,6 +421,9 @@ void afs_clear_inode(struct inode *inode)
 
 	ASSERTCMP(inode->i_ino, ==, vnode->fid.vnode);
 
+	truncate_inode_pages(&inode->i_data, 0);
+	end_writeback(inode);
+
 	afs_give_up_callback(vnode);
 
 	if (vnode->server) {
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 5f679b77ce2..cca8eef736f 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -42,6 +42,7 @@ typedef enum {
 struct afs_mount_params {
 	bool			rwpath;		/* T if the parent should be considered R/W */
 	bool			force;		/* T to force cell type */
+	bool			autocell;	/* T if set auto mount operation */
 	afs_voltype_t		type;		/* type of volume requested */
 	int			volnamesz;	/* size of volume name */
 	const char		*volname;	/* name of volume to mount */
@@ -358,6 +359,8 @@ struct afs_vnode {
 #define AFS_VNODE_READLOCKED	7		/* set if vnode is read-locked on the server */
 #define AFS_VNODE_WRITELOCKED	8		/* set if vnode is write-locked on the server */
 #define AFS_VNODE_UNLOCKING	9		/* set if vnode is being unlocked on the server */
+#define AFS_VNODE_AUTOCELL	10		/* set if Vnode is an auto mount point */
+#define AFS_VNODE_PSEUDODIR	11		/* set if Vnode is a pseudo directory */
 
 	long			acl_order;	/* ACL check count (callback break count) */
 
@@ -468,8 +471,8 @@ extern struct list_head afs_proc_cells;
 
 #define afs_get_cell(C) do { atomic_inc(&(C)->usage); } while(0)
 extern int afs_cell_init(char *);
-extern struct afs_cell *afs_cell_create(const char *, char *);
-extern struct afs_cell *afs_cell_lookup(const char *, unsigned);
+extern struct afs_cell *afs_cell_create(const char *, unsigned, char *, bool);
+extern struct afs_cell *afs_cell_lookup(const char *, unsigned, bool);
 extern struct afs_cell *afs_grab_cell(struct afs_cell *);
 extern void afs_put_cell(struct afs_cell *);
 extern void afs_cell_purge(void);
@@ -558,6 +561,8 @@ extern int afs_fs_release_lock(struct afs_server *, struct key *,
 /*
  * inode.c
  */
+extern struct inode *afs_iget_autocell(struct inode *, const char *, int,
+				       struct key *);
 extern struct inode *afs_iget(struct super_block *, struct key *,
 			      struct afs_fid *, struct afs_file_status *,
 			      struct afs_callback *);
@@ -565,7 +570,8 @@ extern void afs_zap_data(struct afs_vnode *);
 extern int afs_validate(struct afs_vnode *, struct key *);
 extern int afs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
 extern int afs_setattr(struct dentry *, struct iattr *);
-extern void afs_clear_inode(struct inode *);
+extern void afs_evict_inode(struct inode *);
+extern int afs_drop_inode(struct inode *);
 
 /*
  * main.c
@@ -581,6 +587,7 @@ extern int afs_abort_to_error(u32);
  * mntpt.c
  */
 extern const struct inode_operations afs_mntpt_inode_operations;
+extern const struct inode_operations afs_autocell_inode_operations;
 extern const struct file_operations afs_mntpt_file_operations;
 
 extern int afs_mntpt_check_symlink(struct afs_vnode *, struct key *);
@@ -752,12 +759,6 @@ extern unsigned afs_debug;
 #define dbgprintk(FMT,...) \
 	printk("[%-6.6s] "FMT"\n", current->comm ,##__VA_ARGS__)
 
-/* make sure we maintain the format strings, even when debugging is disabled */
-static inline __attribute__((format(printf,1,2)))
-void _dbprintk(const char *fmt, ...)
-{
-}
-
 #define kenter(FMT,...)	dbgprintk("==> %s("FMT")",__func__ ,##__VA_ARGS__)
 #define kleave(FMT,...)	dbgprintk("<== %s()"FMT"",__func__ ,##__VA_ARGS__)
 #define kdebug(FMT,...)	dbgprintk("    "FMT ,##__VA_ARGS__)
@@ -792,9 +793,9 @@ do {							\
 } while (0)
 
 #else
-#define _enter(FMT,...)	_dbprintk("==> %s("FMT")",__func__ ,##__VA_ARGS__)
-#define _leave(FMT,...)	_dbprintk("<== %s()"FMT"",__func__ ,##__VA_ARGS__)
-#define _debug(FMT,...)	_dbprintk("    "FMT ,##__VA_ARGS__)
+#define _enter(FMT,...)	no_printk("==> %s("FMT")",__func__ ,##__VA_ARGS__)
+#define _leave(FMT,...)	no_printk("<== %s()"FMT"",__func__ ,##__VA_ARGS__)
+#define _debug(FMT,...)	no_printk("    "FMT ,##__VA_ARGS__)
 #endif
 
 /*
diff --git a/fs/afs/main.c b/fs/afs/main.c
index 66d54d348c5..cfd1cbe25b2 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -111,6 +111,8 @@ static int __init afs_init(void)
 
 	/* initialise the callback update process */
 	ret = afs_callback_update_init();
+	if (ret < 0)
+		goto error_callback_update_init;
 
 	/* create the RxRPC transport */
 	ret = afs_open_socket();
@@ -127,15 +129,16 @@ static int __init afs_init(void)
 error_fs:
 	afs_close_socket();
 error_open_socket:
+	afs_callback_update_kill();
+error_callback_update_init:
+	afs_vlocation_purge();
 error_vl_update_init:
+	afs_cell_purge();
 error_cell_init:
 #ifdef CONFIG_AFS_FSCACHE
 	fscache_unregister_netfs(&afs_cache_netfs);
 error_cache:
 #endif
-	afs_callback_update_kill();
-	afs_vlocation_purge();
-	afs_cell_purge();
 	afs_proc_cleanup();
 	rcu_barrier();
 	printk(KERN_ERR "kAFS: failed to register: %d\n", ret);
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index a9e23039ea3..6153417caf5 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -29,6 +29,7 @@ static void afs_mntpt_expiry_timed_out(struct work_struct *work);
 
 const struct file_operations afs_mntpt_file_operations = {
 	.open		= afs_mntpt_open,
+	.llseek		= noop_llseek,
 };
 
 const struct inode_operations afs_mntpt_inode_operations = {
@@ -38,6 +39,11 @@ const struct inode_operations afs_mntpt_inode_operations = {
 	.getattr	= afs_getattr,
 };
 
+const struct inode_operations afs_autocell_inode_operations = {
+	.follow_link	= afs_mntpt_follow_link,
+	.getattr	= afs_getattr,
+};
+
 static LIST_HEAD(afs_vfsmounts);
 static DECLARE_DELAYED_WORK(afs_mntpt_expiry_timer, afs_mntpt_expiry_timed_out);
 
@@ -136,20 +142,16 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
 {
 	struct afs_super_info *super;
 	struct vfsmount *mnt;
+	struct afs_vnode *vnode;
 	struct page *page;
-	size_t size;
-	char *buf, *devname, *options;
+	char *devname, *options;
+	bool rwpath = false;
 	int ret;
 
 	_enter("{%s}", mntpt->d_name.name);
 
 	BUG_ON(!mntpt->d_inode);
 
-	ret = -EINVAL;
-	size = mntpt->d_inode->i_size;
-	if (size > PAGE_SIZE - 1)
-		goto error_no_devname;
-
 	ret = -ENOMEM;
 	devname = (char *) get_zeroed_page(GFP_KERNEL);
 	if (!devname)
@@ -159,28 +161,59 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
 	if (!options)
 		goto error_no_options;
 
-	/* read the contents of the AFS special symlink */
-	page = read_mapping_page(mntpt->d_inode->i_mapping, 0, NULL);
-	if (IS_ERR(page)) {
-		ret = PTR_ERR(page);
-		goto error_no_page;
+	vnode = AFS_FS_I(mntpt->d_inode);
+	if (test_bit(AFS_VNODE_PSEUDODIR, &vnode->flags)) {
+		/* if the directory is a pseudo directory, use the d_name */
+		static const char afs_root_cell[] = ":root.cell.";
+		unsigned size = mntpt->d_name.len;
+
+		ret = -ENOENT;
+		if (size < 2 || size > AFS_MAXCELLNAME)
+			goto error_no_page;
+
+		if (mntpt->d_name.name[0] == '.') {
+			devname[0] = '#';
+			memcpy(devname + 1, mntpt->d_name.name, size - 1);
+			memcpy(devname + size, afs_root_cell,
+			       sizeof(afs_root_cell));
+			rwpath = true;
+		} else {
+			devname[0] = '%';
+			memcpy(devname + 1, mntpt->d_name.name, size);
+			memcpy(devname + size + 1, afs_root_cell,
+			       sizeof(afs_root_cell));
+		}
+	} else {
+		/* read the contents of the AFS special symlink */
+		loff_t size = i_size_read(mntpt->d_inode);
+		char *buf;
+
+		ret = -EINVAL;
+		if (size > PAGE_SIZE - 1)
+			goto error_no_page;
+
+		page = read_mapping_page(mntpt->d_inode->i_mapping, 0, NULL);
+		if (IS_ERR(page)) {
+			ret = PTR_ERR(page);
+			goto error_no_page;
+		}
+
+		ret = -EIO;
+		if (PageError(page))
+			goto error;
+
+		buf = kmap_atomic(page, KM_USER0);
+		memcpy(devname, buf, size);
+		kunmap_atomic(buf, KM_USER0);
+		page_cache_release(page);
+		page = NULL;
 	}
 
-	ret = -EIO;
-	if (PageError(page))
-		goto error;
-
-	buf = kmap_atomic(page, KM_USER0);
-	memcpy(devname, buf, size);
-	kunmap_atomic(buf, KM_USER0);
-	page_cache_release(page);
-	page = NULL;
-
 	/* work out what options we want */
 	super = AFS_FS_S(mntpt->d_sb);
 	memcpy(options, "cell=", 5);
 	strcpy(options + 5, super->volume->cell->name);
-	if (super->volume->type == AFSVL_RWVOL)
+	if (super->volume->type == AFSVL_RWVOL || rwpath)
 		strcat(options, ",rwpath");
 
 	/* try and do the mount */
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index 852739d262a..096b23f821a 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -294,7 +294,7 @@ static ssize_t afs_proc_cells_write(struct file *file, const char __user *buf,
 	if (strcmp(kbuf, "add") == 0) {
 		struct afs_cell *cell;
 
-		cell = afs_cell_create(name, args);
+		cell = afs_cell_create(name, strlen(name), args, false);
 		if (IS_ERR(cell)) {
 			ret = PTR_ERR(cell);
 			goto done;
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 67cf810e0fd..654d8fdbf01 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -100,6 +100,7 @@ int afs_open_socket(void)
 	ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx));
 	if (ret < 0) {
 		sock_release(socket);
+		destroy_workqueue(afs_async_calls);
 		_leave(" = %d [bind]", ret);
 		return ret;
 	}
diff --git a/fs/afs/super.c b/fs/afs/super.c
index e932e5a3a0c..27201cffece 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -16,9 +16,9 @@
 
 #include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/mount.h>
 #include <linux/init.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/parser.h>
@@ -29,9 +29,8 @@
 #define AFS_FS_MAGIC 0x6B414653 /* 'kAFS' */
 
 static void afs_i_init_once(void *foo);
-static int afs_get_sb(struct file_system_type *fs_type,
-		      int flags, const char *dev_name,
-		      void *data, struct vfsmount *mnt);
+static struct dentry *afs_mount(struct file_system_type *fs_type,
+		      int flags, const char *dev_name, void *data);
 static struct inode *afs_alloc_inode(struct super_block *sb);
 static void afs_put_super(struct super_block *sb);
 static void afs_destroy_inode(struct inode *inode);
@@ -40,7 +39,7 @@ static int afs_statfs(struct dentry *dentry, struct kstatfs *buf);
 struct file_system_type afs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "afs",
-	.get_sb		= afs_get_sb,
+	.mount		= afs_mount,
 	.kill_sb	= kill_anon_super,
 	.fs_flags	= 0,
 };
@@ -48,8 +47,9 @@ struct file_system_type afs_fs_type = {
 static const struct super_operations afs_super_ops = {
 	.statfs		= afs_statfs,
 	.alloc_inode	= afs_alloc_inode,
+	.drop_inode	= afs_drop_inode,
 	.destroy_inode	= afs_destroy_inode,
-	.clear_inode	= afs_clear_inode,
+	.evict_inode	= afs_evict_inode,
 	.put_super	= afs_put_super,
 	.show_options	= generic_show_options,
 };
@@ -62,12 +62,14 @@ enum {
 	afs_opt_cell,
 	afs_opt_rwpath,
 	afs_opt_vol,
+	afs_opt_autocell,
 };
 
 static const match_table_t afs_options_list = {
 	{ afs_opt_cell,		"cell=%s"	},
 	{ afs_opt_rwpath,	"rwpath"	},
 	{ afs_opt_vol,		"vol=%s"	},
+	{ afs_opt_autocell,	"autocell"	},
 	{ afs_no_opt,		NULL		},
 };
 
@@ -151,7 +153,8 @@ static int afs_parse_options(struct afs_mount_params *params,
 		switch (token) {
 		case afs_opt_cell:
 			cell = afs_cell_lookup(args[0].from,
-					       args[0].to - args[0].from);
+					       args[0].to - args[0].from,
+					       false);
 			if (IS_ERR(cell))
 				return PTR_ERR(cell);
 			afs_put_cell(params->cell);
@@ -166,6 +169,10 @@ static int afs_parse_options(struct afs_mount_params *params,
 			*devname = args[0].from;
 			break;
 
+		case afs_opt_autocell:
+			params->autocell = 1;
+			break;
+
 		default:
 			printk(KERN_ERR "kAFS:"
 			       " Unknown or invalid mount option: '%s'\n", p);
@@ -252,10 +259,10 @@ static int afs_parse_device_name(struct afs_mount_params *params,
 
 	/* lookup the cell record */
 	if (cellname || !params->cell) {
-		cell = afs_cell_lookup(cellname, cellnamesz);
+		cell = afs_cell_lookup(cellname, cellnamesz, true);
 		if (IS_ERR(cell)) {
-			printk(KERN_ERR "kAFS: unable to lookup cell '%s'\n",
-			       cellname ?: "");
+			printk(KERN_ERR "kAFS: unable to lookup cell '%*.*s'\n",
+			       cellnamesz, cellnamesz, cellname ?: "");
 			return PTR_ERR(cell);
 		}
 		afs_put_cell(params->cell);
@@ -321,6 +328,9 @@ static int afs_fill_super(struct super_block *sb, void *data)
 	if (IS_ERR(inode))
 		goto error_inode;
 
+	if (params->autocell)
+		set_bit(AFS_VNODE_AUTOCELL, &AFS_FS_I(inode)->flags);
+
 	ret = -ENOMEM;
 	root = d_alloc_root(inode);
 	if (!root)
@@ -348,11 +358,8 @@ error:
 /*
  * get an AFS superblock
  */
-static int afs_get_sb(struct file_system_type *fs_type,
-		      int flags,
-		      const char *dev_name,
-		      void *options,
-		      struct vfsmount *mnt)
+static struct dentry *afs_mount(struct file_system_type *fs_type,
+		      int flags, const char *dev_name, void *options)
 {
 	struct afs_mount_params params;
 	struct super_block *sb;
@@ -416,12 +423,11 @@ static int afs_get_sb(struct file_system_type *fs_type,
 		ASSERTCMP(sb->s_flags, &, MS_ACTIVE);
 	}
 
-	simple_set_mnt(mnt, sb);
 	afs_put_volume(params.volume);
 	afs_put_cell(params.cell);
 	kfree(new_opts);
 	_leave(" = 0 [%p]", sb);
-	return 0;
+	return dget(sb->s_root);
 
 error:
 	afs_put_volume(params.volume);
@@ -429,7 +435,7 @@ error:
 	key_put(params.key);
 	kfree(new_opts);
 	_leave(" = %d", ret);
-	return ret;
+	return ERR_PTR(ret);
 }
 
 /*
@@ -441,12 +447,8 @@ static void afs_put_super(struct super_block *sb)
 
 	_enter("");
 
-	lock_kernel();
-
 	afs_put_volume(as->volume);
 
-	unlock_kernel();
-
 	_leave("");
 }
 
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 722743b152d..15690bb1d3b 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -438,7 +438,6 @@ no_more:
  */
 int afs_writepage(struct page *page, struct writeback_control *wbc)
 {
-	struct backing_dev_info *bdi = page->mapping->backing_dev_info;
 	struct afs_writeback *wb;
 	int ret;
 
@@ -455,8 +454,6 @@ int afs_writepage(struct page *page, struct writeback_control *wbc)
 	}
 
 	wbc->nr_to_write -= ret;
-	if (wbc->nonblocking && bdi_write_congested(bdi))
-		wbc->encountered_congestion = 1;
 
 	_leave(" = 0");
 	return 0;
@@ -469,7 +466,6 @@ static int afs_writepages_region(struct address_space *mapping,
 				 struct writeback_control *wbc,
 				 pgoff_t index, pgoff_t end, pgoff_t *_next)
 {
-	struct backing_dev_info *bdi = mapping->backing_dev_info;
 	struct afs_writeback *wb;
 	struct page *page;
 	int ret, n;
@@ -529,11 +525,6 @@ static int afs_writepages_region(struct address_space *mapping,
 
 		wbc->nr_to_write -= ret;
 
-		if (wbc->nonblocking && bdi_write_congested(bdi)) {
-			wbc->encountered_congestion = 1;
-			break;
-		}
-
 		cond_resched();
 	} while (index < end && wbc->nr_to_write > 0);
 
@@ -548,24 +539,16 @@ static int afs_writepages_region(struct address_space *mapping,
 int afs_writepages(struct address_space *mapping,
 		   struct writeback_control *wbc)
 {
-	struct backing_dev_info *bdi = mapping->backing_dev_info;
 	pgoff_t start, end, next;
 	int ret;
 
 	_enter("");
 
-	if (wbc->nonblocking && bdi_write_congested(bdi)) {
-		wbc->encountered_congestion = 1;
-		_leave(" = 0 [congest]");
-		return 0;
-	}
-
 	if (wbc->range_cyclic) {
 		start = mapping->writeback_index;
 		end = -1;
 		ret = afs_writepages_region(mapping, wbc, start, end, &next);
-		if (start > 0 && wbc->nr_to_write > 0 && ret == 0 &&
-		    !(wbc->nonblocking && wbc->encountered_congestion))
+		if (start > 0 && wbc->nr_to_write > 0 && ret == 0)
 			ret = afs_writepages_region(mapping, wbc, 0, start,
 						    &next);
 		mapping->writeback_index = next;
diff --git a/fs/aio.c b/fs/aio.c
index 1ccf25cef1f..8c8f6c5b6d7 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -712,8 +712,16 @@ static ssize_t aio_run_iocb(struct kiocb *iocb)
 	 */
 	ret = retry(iocb);
 
-	if (ret != -EIOCBRETRY && ret != -EIOCBQUEUED)
+	if (ret != -EIOCBRETRY && ret != -EIOCBQUEUED) {
+		/*
+		 * There's no easy way to restart the syscall since other AIO's
+		 * may be already running. Just fail this IO with EINTR.
+		 */
+		if (unlikely(ret == -ERESTARTSYS || ret == -ERESTARTNOINTR ||
+			     ret == -ERESTARTNOHAND || ret == -ERESTART_RESTARTBLOCK))
+			ret = -EINTR;
 		aio_complete(iocb, ret, 0);
+	}
 out:
 	spin_lock_irq(&ctx->ctx_lock);
 
@@ -1277,7 +1285,7 @@ out:
 /* sys_io_destroy:
  *	Destroy the aio_context specified.  May cancel any outstanding 
  *	AIOs and block on completion.  Will fail with -ENOSYS if not
- *	implemented.  May fail with -EFAULT if the context pointed to
+ *	implemented.  May fail with -EINVAL if the context pointed to
  *	is invalid.
  */
 SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
@@ -1535,7 +1543,19 @@ static void aio_batch_add(struct address_space *mapping,
 	}
 
 	abe = mempool_alloc(abe_pool, GFP_KERNEL);
-	BUG_ON(!igrab(mapping->host));
+
+	/*
+	 * we should be using igrab here, but
+	 * we don't want to hammer on the global
+	 * inode spinlock just to take an extra
+	 * reference on a file that we must already
+	 * have a reference to.
+	 *
+	 * When we're called, we always have a reference
+	 * on the file, so we must always have a reference
+	 * on the inode, so ihold() is safe here.
+	 */
+	ihold(mapping->host);
 	abe->mapping = mapping;
 	hlist_add_head(&abe->list, &batch_hash[bucket]);
 	return;
@@ -1659,6 +1679,9 @@ long do_io_submit(aio_context_t ctx_id, long nr,
 	if (unlikely(nr < 0))
 		return -EINVAL;
 
+	if (unlikely(nr > LONG_MAX/sizeof(*iocbpp)))
+		nr = LONG_MAX/sizeof(*iocbpp);
+
 	if (unlikely(!access_ok(VERIFY_READ, iocbpp, (nr*sizeof(*iocbpp)))))
 		return -EFAULT;
 
@@ -1795,15 +1818,16 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
 
 /* io_getevents:
  *	Attempts to read at least min_nr events and up to nr events from
- *	the completion queue for the aio_context specified by ctx_id.  May
- *	fail with -EINVAL if ctx_id is invalid, if min_nr is out of range,
- *	if nr is out of range, if when is out of range.  May fail with
- *	-EFAULT if any of the memory specified to is invalid.  May return
- *	0 or < min_nr if no events are available and the timeout specified
- *	by when	has elapsed, where when == NULL specifies an infinite
- *	timeout.  Note that the timeout pointed to by when is relative and
- *	will be updated if not NULL and the operation blocks.  Will fail
- *	with -ENOSYS if not implemented.
+ *	the completion queue for the aio_context specified by ctx_id. If
+ *	it succeeds, the number of read events is returned. May fail with
+ *	-EINVAL if ctx_id is invalid, if min_nr is out of range, if nr is
+ *	out of range, if timeout is out of range.  May fail with -EFAULT
+ *	if any of the memory specified is invalid.  May return 0 or
+ *	< min_nr if the timeout specified by timeout has elapsed
+ *	before sufficient events are available, where timeout == NULL
+ *	specifies an infinite timeout. Note that the timeout pointed to by
+ *	timeout is relative and will be updated if not NULL and the
+ *	operation blocks. Will fail with -ENOSYS if not implemented.
  */
 SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
 		long, min_nr,
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index e4b75d6eda8..57ce55b2564 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -26,12 +26,10 @@ static struct vfsmount *anon_inode_mnt __read_mostly;
 static struct inode *anon_inode_inode;
 static const struct file_operations anon_inode_fops;
 
-static int anon_inodefs_get_sb(struct file_system_type *fs_type, int flags,
-			       const char *dev_name, void *data,
-			       struct vfsmount *mnt)
+static struct dentry *anon_inodefs_mount(struct file_system_type *fs_type,
+				int flags, const char *dev_name, void *data)
 {
-	return get_sb_pseudo(fs_type, "anon_inode:", NULL, ANON_INODE_FS_MAGIC,
-			     mnt);
+	return mount_pseudo(fs_type, "anon_inode:", NULL, ANON_INODE_FS_MAGIC);
 }
 
 /*
@@ -45,7 +43,7 @@ static char *anon_inodefs_dname(struct dentry *dentry, char *buffer, int buflen)
 
 static struct file_system_type anon_inode_fs_type = {
 	.name		= "anon_inodefs",
-	.get_sb		= anon_inodefs_get_sb,
+	.mount		= anon_inodefs_mount,
 	.kill_sb	= kill_anon_super,
 };
 static const struct dentry_operations anon_inodefs_dentry_operations = {
@@ -111,10 +109,9 @@ struct file *anon_inode_getfile(const char *name,
 	path.mnt = mntget(anon_inode_mnt);
 	/*
 	 * We know the anon_inode inode count is always greater than zero,
-	 * so we can avoid doing an igrab() and we can use an open-coded
-	 * atomic_inc().
+	 * so ihold() is safe.
 	 */
-	atomic_inc(&anon_inode_inode->i_count);
+	ihold(anon_inode_inode);
 
 	path.dentry->d_op = &anon_inodefs_dentry_operations;
 	d_instantiate(path.dentry, anon_inode_inode);
@@ -194,6 +191,7 @@ static struct inode *anon_inode_mkinode(void)
 	if (!inode)
 		return ERR_PTR(-ENOMEM);
 
+	inode->i_ino = get_next_ino();
 	inode->i_fop = &anon_inode_fops;
 
 	inode->i_mapping->a_ops = &anon_aops;
diff --git a/fs/attr.c b/fs/attr.c
index b4fa3b0aa59..7ca41811afa 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -14,35 +14,53 @@
 #include <linux/fcntl.h>
 #include <linux/security.h>
 
-/* Taken over from the old code... */
-
-/* POSIX UID/GID verification for setting inode attributes. */
+/**
+ * inode_change_ok - check if attribute changes to an inode are allowed
+ * @inode:	inode to check
+ * @attr:	attributes to change
+ *
+ * Check if we are allowed to change the attributes contained in @attr
+ * in the given inode.  This includes the normal unix access permission
+ * checks, as well as checks for rlimits and others.
+ *
+ * Should be called as the first thing in ->setattr implementations,
+ * possibly after taking additional locks.
+ */
 int inode_change_ok(const struct inode *inode, struct iattr *attr)
 {
-	int retval = -EPERM;
 	unsigned int ia_valid = attr->ia_valid;
 
+	/*
+	 * First check size constraints.  These can't be overriden using
+	 * ATTR_FORCE.
+	 */
+	if (ia_valid & ATTR_SIZE) {
+		int error = inode_newsize_ok(inode, attr->ia_size);
+		if (error)
+			return error;
+	}
+
 	/* If force is set do it anyway. */
 	if (ia_valid & ATTR_FORCE)
-		goto fine;
+		return 0;
 
 	/* Make sure a caller can chown. */
 	if ((ia_valid & ATTR_UID) &&
 	    (current_fsuid() != inode->i_uid ||
 	     attr->ia_uid != inode->i_uid) && !capable(CAP_CHOWN))
-		goto error;
+		return -EPERM;
 
 	/* Make sure caller can chgrp. */
 	if ((ia_valid & ATTR_GID) &&
 	    (current_fsuid() != inode->i_uid ||
 	    (!in_group_p(attr->ia_gid) && attr->ia_gid != inode->i_gid)) &&
 	    !capable(CAP_CHOWN))
-		goto error;
+		return -EPERM;
 
 	/* Make sure a caller can chmod. */
 	if (ia_valid & ATTR_MODE) {
 		if (!is_owner_or_cap(inode))
-			goto error;
+			return -EPERM;
 		/* Also check the setgid bit! */
 		if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid :
 				inode->i_gid) && !capable(CAP_FSETID))
@@ -52,12 +70,10 @@ int inode_change_ok(const struct inode *inode, struct iattr *attr)
 	/* Check for setting the inode time. */
 	if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)) {
 		if (!is_owner_or_cap(inode))
-			goto error;
+			return -EPERM;
 	}
-fine:
-	retval = 0;
-error:
-	return retval;
+
+	return 0;
 }
 EXPORT_SYMBOL(inode_change_ok);
 
@@ -105,21 +121,21 @@ out_big:
 EXPORT_SYMBOL(inode_newsize_ok);
 
 /**
- * generic_setattr - copy simple metadata updates into the generic inode
+ * setattr_copy - copy simple metadata updates into the generic inode
  * @inode:	the inode to be updated
  * @attr:	the new attributes
  *
- * generic_setattr must be called with i_mutex held.
+ * setattr_copy must be called with i_mutex held.
  *
- * generic_setattr updates the inode's metadata with that specified
+ * setattr_copy updates the inode's metadata with that specified
  * in attr. Noticably missing is inode size update, which is more complex
- * as it requires pagecache updates. See simple_setsize.
+ * as it requires pagecache updates.
  *
  * The inode is not marked as dirty after this operation. The rationale is
  * that for "simple" filesystems, the struct inode is the inode storage.
  * The caller is free to mark the inode dirty afterwards if needed.
  */
-void generic_setattr(struct inode *inode, const struct iattr *attr)
+void setattr_copy(struct inode *inode, const struct iattr *attr)
 {
 	unsigned int ia_valid = attr->ia_valid;
 
@@ -144,32 +160,7 @@ void generic_setattr(struct inode *inode, const struct iattr *attr)
 		inode->i_mode = mode;
 	}
 }
-EXPORT_SYMBOL(generic_setattr);
-
-/*
- * note this function is deprecated, the new truncate sequence should be
- * used instead -- see eg. simple_setsize, generic_setattr.
- */
-int inode_setattr(struct inode *inode, const struct iattr *attr)
-{
-	unsigned int ia_valid = attr->ia_valid;
-
-	if (ia_valid & ATTR_SIZE &&
-	    attr->ia_size != i_size_read(inode)) {
-		int error;
-
-		error = vmtruncate(inode, attr->ia_size);
-		if (error)
-			return error;
-	}
-
-	generic_setattr(inode, attr);
-
-	mark_inode_dirty(inode);
-
-	return 0;
-}
-EXPORT_SYMBOL(inode_setattr);
+EXPORT_SYMBOL(setattr_copy);
 
 int notify_change(struct dentry * dentry, struct iattr * attr)
 {
@@ -237,13 +228,10 @@ int notify_change(struct dentry * dentry, struct iattr * attr)
 	if (ia_valid & ATTR_SIZE)
 		down_write(&dentry->d_inode->i_alloc_sem);
 
-	if (inode->i_op && inode->i_op->setattr) {
+	if (inode->i_op->setattr)
 		error = inode->i_op->setattr(dentry, attr);
-	} else {
-		error = inode_change_ok(inode, attr);
-		if (!error)
-			error = inode_setattr(inode, attr);
-	}
+	else
+		error = simple_setattr(dentry, attr);
 
 	if (ia_valid & ATTR_SIZE)
 		up_write(&dentry->d_inode->i_alloc_sem);
diff --git a/fs/autofs/Kconfig b/fs/autofs/Kconfig
deleted file mode 100644
index 5f3bea90911..00000000000
--- a/fs/autofs/Kconfig
+++ /dev/null
@@ -1,21 +0,0 @@
-config AUTOFS_FS
-	tristate "Kernel automounter support"
-	help
-	  The automounter is a tool to automatically mount remote file systems
-	  on demand. This implementation is partially kernel-based to reduce
-	  overhead in the already-mounted case; this is unlike the BSD
-	  automounter (amd), which is a pure user space daemon.
-
-	  To use the automounter you need the user-space tools from the autofs
-	  package; you can find the location in <file:Documentation/Changes>.
-	  You also want to answer Y to "NFS file system support", below.
-
-	  If you want to use the newer version of the automounter with more
-	  features, say N here and say Y to "Kernel automounter v4 support",
-	  below.
-
-	  To compile this support as a module, choose M here: the module will be
-	  called autofs.
-
-	  If you are not a part of a fairly large, distributed network, you
-	  probably do not need an automounter, and can say N here.
diff --git a/fs/autofs/Makefile b/fs/autofs/Makefile
deleted file mode 100644
index 453a60f46d0..00000000000
--- a/fs/autofs/Makefile
+++ /dev/null
@@ -1,7 +0,0 @@
-#
-# Makefile for the linux autofs-filesystem routines.
-#
-
-obj-$(CONFIG_AUTOFS_FS) += autofs.o
-
-autofs-objs := dirhash.o init.o inode.o root.o symlink.o waitq.o
diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h
deleted file mode 100644
index 901a3e67ec4..00000000000
--- a/fs/autofs/autofs_i.h
+++ /dev/null
@@ -1,165 +0,0 @@
-/* -*- linux-c -*- ------------------------------------------------------- *
- *   
- * linux/fs/autofs/autofs_i.h
- *
- *   Copyright 1997-1998 Transmeta Corporation - All Rights Reserved
- *
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
- *
- * ----------------------------------------------------------------------- */
-
-/* Internal header file for autofs */
-
-#include <linux/auto_fs.h>
-
-/* This is the range of ioctl() numbers we claim as ours */
-#define AUTOFS_IOC_FIRST     AUTOFS_IOC_READY
-#define AUTOFS_IOC_COUNT     32
-
-#include <linux/kernel.h>
-#include <linux/slab.h>
-#include <linux/time.h>
-#include <linux/string.h>
-#include <linux/wait.h>
-#include <linux/dcache.h>
-#include <linux/namei.h>
-#include <linux/mount.h>
-#include <linux/sched.h>
-
-#include <asm/current.h>
-#include <asm/uaccess.h>
-
-#ifdef DEBUG
-#define DPRINTK(D) (printk D)
-#else
-#define DPRINTK(D) ((void)0)
-#endif
-
-/*
- * If the daemon returns a negative response (AUTOFS_IOC_FAIL) then the
- * kernel will keep the negative response cached for up to the time given
- * here, although the time can be shorter if the kernel throws the dcache
- * entry away.  This probably should be settable from user space.
- */
-#define AUTOFS_NEGATIVE_TIMEOUT (60*HZ)	/* 1 minute */
-
-/* Structures associated with the root directory hash table */
-
-#define AUTOFS_HASH_SIZE 67
-
-struct autofs_dir_ent {
-	int hash;
-	char *name;
-	int len;
-	ino_t ino;
-	struct dentry *dentry;
-	/* Linked list of entries */
-	struct autofs_dir_ent *next;
-	struct autofs_dir_ent **back;
-	/* The following entries are for the expiry system */
-	unsigned long last_usage;
-	struct list_head exp;
-};
-
-struct autofs_dirhash {
-	struct autofs_dir_ent *h[AUTOFS_HASH_SIZE];
-	struct list_head expiry_head;
-};
-
-struct autofs_wait_queue {
-	wait_queue_head_t queue;
-	struct autofs_wait_queue *next;
-	autofs_wqt_t wait_queue_token;
-	/* We use the following to see what we are waiting for */
-	int hash;
-	int len;
-	char *name;
-	/* This is for status reporting upon return */
-	int status;
-	int wait_ctr;
-};
-
-struct autofs_symlink {
-	char *data;
-	int len;
-	time_t mtime;
-};
-
-#define AUTOFS_MAX_SYMLINKS 256
-
-#define AUTOFS_ROOT_INO      1
-#define AUTOFS_FIRST_SYMLINK 2
-#define AUTOFS_FIRST_DIR_INO (AUTOFS_FIRST_SYMLINK+AUTOFS_MAX_SYMLINKS)
-
-#define AUTOFS_SYMLINK_BITMAP_LEN \
-	((AUTOFS_MAX_SYMLINKS+((sizeof(long)*1)-1))/(sizeof(long)*8))
-
-#define AUTOFS_SBI_MAGIC 0x6d4a556d
-
-struct autofs_sb_info {
-	u32 magic;
-	struct file *pipe;
-	struct pid *oz_pgrp;
-	int catatonic;
-	struct super_block *sb;
-	unsigned long exp_timeout;
-	ino_t next_dir_ino;
-	struct autofs_wait_queue *queues; /* Wait queue pointer */
-	struct autofs_dirhash dirhash; /* Root directory hash */
-	struct autofs_symlink symlink[AUTOFS_MAX_SYMLINKS];
-	unsigned long symlink_bitmap[AUTOFS_SYMLINK_BITMAP_LEN];
-};
-
-static inline struct autofs_sb_info *autofs_sbi(struct super_block *sb)
-{
-	return (struct autofs_sb_info *)(sb->s_fs_info);
-}
-
-/* autofs_oz_mode(): do we see the man behind the curtain?  (The
-   processes which do manipulations for us in user space sees the raw
-   filesystem without "magic".) */
-
-static inline int autofs_oz_mode(struct autofs_sb_info *sbi) {
-	return sbi->catatonic || task_pgrp(current) == sbi->oz_pgrp;
-}
-
-/* Hash operations */
-
-void autofs_initialize_hash(struct autofs_dirhash *);
-struct autofs_dir_ent *autofs_hash_lookup(const struct autofs_dirhash *,struct qstr *);
-void autofs_hash_insert(struct autofs_dirhash *,struct autofs_dir_ent *);
-void autofs_hash_delete(struct autofs_dir_ent *);
-struct autofs_dir_ent *autofs_hash_enum(const struct autofs_dirhash *,off_t *,struct autofs_dir_ent *);
-void autofs_hash_dputall(struct autofs_dirhash *);
-void autofs_hash_nuke(struct autofs_sb_info *);
-
-/* Expiration-handling functions */
-
-void autofs_update_usage(struct autofs_dirhash *,struct autofs_dir_ent *);
-struct autofs_dir_ent *autofs_expire(struct super_block *,struct autofs_sb_info *, struct vfsmount *mnt);
-
-/* Operations structures */
-
-extern const struct inode_operations autofs_root_inode_operations;
-extern const struct inode_operations autofs_symlink_inode_operations;
-extern const struct file_operations autofs_root_operations;
-
-/* Initializing function */
-
-int autofs_fill_super(struct super_block *, void *, int);
-void autofs_kill_sb(struct super_block *sb);
-struct inode *autofs_iget(struct super_block *, unsigned long);
-
-/* Queue management functions */
-
-int autofs_wait(struct autofs_sb_info *,struct qstr *);
-int autofs_wait_release(struct autofs_sb_info *,autofs_wqt_t,int);
-void autofs_catatonic_mode(struct autofs_sb_info *);
-
-#ifdef DEBUG
-void autofs_say(const char *name, int len);
-#else
-#define autofs_say(n,l) ((void)0)
-#endif
diff --git a/fs/autofs/dirhash.c b/fs/autofs/dirhash.c
deleted file mode 100644
index e947915109e..00000000000
--- a/fs/autofs/dirhash.c
+++ /dev/null
@@ -1,250 +0,0 @@
-/* -*- linux-c -*- --------------------------------------------------------- *
- *
- * linux/fs/autofs/dirhash.c
- *
- *  Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
- *
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
-
-#include "autofs_i.h"
-
-/* Functions for maintenance of expiry queue */
-
-static void autofs_init_usage(struct autofs_dirhash *dh,
-			      struct autofs_dir_ent *ent)
-{
-	list_add_tail(&ent->exp, &dh->expiry_head);
-	ent->last_usage = jiffies;
-}
-
-static void autofs_delete_usage(struct autofs_dir_ent *ent)
-{
-	list_del(&ent->exp);
-}
-
-void autofs_update_usage(struct autofs_dirhash *dh,
-			 struct autofs_dir_ent *ent)
-{
-	autofs_delete_usage(ent);   /* Unlink from current position */
-	autofs_init_usage(dh,ent);  /* Relink at queue tail */
-}
-
-struct autofs_dir_ent *autofs_expire(struct super_block *sb,
-				     struct autofs_sb_info *sbi,
-				     struct vfsmount *mnt)
-{
-	struct autofs_dirhash *dh = &sbi->dirhash;
-	struct autofs_dir_ent *ent;
-	unsigned long timeout = sbi->exp_timeout;
-
-	while (1) {
-		struct path path;
-		int umount_ok;
-
-		if ( list_empty(&dh->expiry_head) || sbi->catatonic )
-			return NULL;	/* No entries */
-		/* We keep the list sorted by last_usage and want old stuff */
-		ent = list_entry(dh->expiry_head.next, struct autofs_dir_ent, exp);
-		if (jiffies - ent->last_usage < timeout)
-			break;
-		/* Move to end of list in case expiry isn't desirable */
-		autofs_update_usage(dh, ent);
-
-		/* Check to see that entry is expirable */
-		if ( ent->ino < AUTOFS_FIRST_DIR_INO )
-			return ent; /* Symlinks are always expirable */
-
-		/* Get the dentry for the autofs subdirectory */
-		path.dentry = ent->dentry;
-
-		if (!path.dentry) {
-			/* Should only happen in catatonic mode */
-			printk("autofs: dentry == NULL but inode range is directory, entry %s\n", ent->name);
-			autofs_delete_usage(ent);
-			continue;
-		}
-
-		if (!path.dentry->d_inode) {
-			dput(path.dentry);
-			printk("autofs: negative dentry on expiry queue: %s\n",
-			       ent->name);
-			autofs_delete_usage(ent);
-			continue;
-		}
-
-		/* Make sure entry is mounted and unused; note that dentry will
-		   point to the mounted-on-top root. */
-		if (!S_ISDIR(path.dentry->d_inode->i_mode) ||
-		    !d_mountpoint(path.dentry)) {
-			DPRINTK(("autofs: not expirable (not a mounted directory): %s\n", ent->name));
-			continue;
-		}
-		path.mnt = mnt;
-		path_get(&path);
-		if (!follow_down(&path)) {
-			path_put(&path);
-			DPRINTK(("autofs: not expirable (not a mounted directory): %s\n", ent->name));
-			continue;
-		}
-		while (d_mountpoint(path.dentry) && follow_down(&path))
-			;
-		umount_ok = may_umount(path.mnt);
-		path_put(&path);
-
-		if (umount_ok) {
-			DPRINTK(("autofs: signaling expire on %s\n", ent->name));
-			return ent; /* Expirable! */
-		}
-		DPRINTK(("autofs: didn't expire due to may_umount: %s\n", ent->name));
-	}
-	return NULL;		/* No expirable entries */
-}
-
-void autofs_initialize_hash(struct autofs_dirhash *dh) {
-	memset(&dh->h, 0, AUTOFS_HASH_SIZE*sizeof(struct autofs_dir_ent *));
-	INIT_LIST_HEAD(&dh->expiry_head);
-}
-
-struct autofs_dir_ent *autofs_hash_lookup(const struct autofs_dirhash *dh, struct qstr *name)
-{
-	struct autofs_dir_ent *dhn;
-
-	DPRINTK(("autofs_hash_lookup: hash = 0x%08x, name = ", name->hash));
-	autofs_say(name->name,name->len);
-
-	for ( dhn = dh->h[(unsigned) name->hash % AUTOFS_HASH_SIZE] ; dhn ; dhn = dhn->next ) {
-		if ( name->hash == dhn->hash &&
-		     name->len == dhn->len &&
-		     !memcmp(name->name, dhn->name, name->len) )
-			break;
-	}
-
-	return dhn;
-}
-
-void autofs_hash_insert(struct autofs_dirhash *dh, struct autofs_dir_ent *ent)
-{
-	struct autofs_dir_ent **dhnp;
-
-	DPRINTK(("autofs_hash_insert: hash = 0x%08x, name = ", ent->hash));
-	autofs_say(ent->name,ent->len);
-
-	autofs_init_usage(dh,ent);
-	if (ent->dentry)
-		dget(ent->dentry);
-
-	dhnp = &dh->h[(unsigned) ent->hash % AUTOFS_HASH_SIZE];
-	ent->next = *dhnp;
-	ent->back = dhnp;
-	*dhnp = ent;
-	if ( ent->next )
-		ent->next->back = &(ent->next);
-}
-
-void autofs_hash_delete(struct autofs_dir_ent *ent)
-{
-	*(ent->back) = ent->next;
-	if ( ent->next )
-		ent->next->back = ent->back;
-
-	autofs_delete_usage(ent);
-
-	if ( ent->dentry )
-		dput(ent->dentry);
-	kfree(ent->name);
-	kfree(ent);
-}
-
-/*
- * Used by readdir().  We must validate "ptr", so we can't simply make it
- * a pointer.  Values below 0xffff are reserved; calling with any value
- * <= 0x10000 will return the first entry found.
- *
- * "last" can be NULL or the value returned by the last search *if* we
- * want the next sequential entry.
- */
-struct autofs_dir_ent *autofs_hash_enum(const struct autofs_dirhash *dh,
-					off_t *ptr, struct autofs_dir_ent *last)
-{
-	int bucket, ecount, i;
-	struct autofs_dir_ent *ent;
-
-	bucket = (*ptr >> 16) - 1;
-	ecount = *ptr & 0xffff;
-
-	if ( bucket < 0 ) {
-		bucket = ecount = 0;
-	} 
-
-	DPRINTK(("autofs_hash_enum: bucket %d, entry %d\n", bucket, ecount));
-
-	ent = last ? last->next : NULL;
-
-	if ( ent ) {
-		ecount++;
-	} else {
-		while  ( bucket < AUTOFS_HASH_SIZE ) {
-			ent = dh->h[bucket];
-			for ( i = ecount ; ent && i ; i-- )
-				ent = ent->next;
-			
-			if (ent) {
-				ecount++; /* Point to *next* entry */
-				break;
-			}
-			
-			bucket++; ecount = 0;
-		}
-	}
-
-#ifdef DEBUG
-	if ( !ent )
-		printk("autofs_hash_enum: nothing found\n");
-	else {
-		printk("autofs_hash_enum: found hash %08x, name", ent->hash);
-		autofs_say(ent->name,ent->len);
-	}
-#endif
-
-	*ptr = ((bucket+1) << 16) + ecount;
-	return ent;
-}
-
-/* Iterate over all the ents, and remove all dentry pointers.  Used on
-   entering catatonic mode, in order to make the filesystem unmountable. */
-void autofs_hash_dputall(struct autofs_dirhash *dh)
-{
-	int i;
-	struct autofs_dir_ent *ent;
-
-	for ( i = 0 ; i < AUTOFS_HASH_SIZE ; i++ ) {
-		for ( ent = dh->h[i] ; ent ; ent = ent->next ) {
-			if ( ent->dentry ) {
-				dput(ent->dentry);
-				ent->dentry = NULL;
-			}
-		}
-	}
-}
-
-/* Delete everything.  This is used on filesystem destruction, so we
-   make no attempt to keep the pointers valid */
-void autofs_hash_nuke(struct autofs_sb_info *sbi)
-{
-	int i;
-	struct autofs_dir_ent *ent, *nent;
-
-	for ( i = 0 ; i < AUTOFS_HASH_SIZE ; i++ ) {
-		for ( ent = sbi->dirhash.h[i] ; ent ; ent = nent ) {
-			nent = ent->next;
-			if ( ent->dentry )
-				dput(ent->dentry);
-			kfree(ent->name);
-			kfree(ent);
-		}
-	}
-}
diff --git a/fs/autofs/init.c b/fs/autofs/init.c
deleted file mode 100644
index cea5219b4f3..00000000000
--- a/fs/autofs/init.c
+++ /dev/null
@@ -1,52 +0,0 @@
-/* -*- linux-c -*- --------------------------------------------------------- *
- *
- * linux/fs/autofs/init.c
- *
- *  Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
- *
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
-
-#include <linux/module.h>
-#include <linux/init.h>
-#include "autofs_i.h"
-
-static int autofs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
-{
-	return get_sb_nodev(fs_type, flags, data, autofs_fill_super, mnt);
-}
-
-static struct file_system_type autofs_fs_type = {
-	.owner		= THIS_MODULE,
-	.name		= "autofs",
-	.get_sb		= autofs_get_sb,
-	.kill_sb	= autofs_kill_sb,
-};
-
-static int __init init_autofs_fs(void)
-{
-	return register_filesystem(&autofs_fs_type);
-}
-
-static void __exit exit_autofs_fs(void)
-{
-	unregister_filesystem(&autofs_fs_type);
-}
-
-module_init(init_autofs_fs);
-module_exit(exit_autofs_fs);
-
-#ifdef DEBUG
-void autofs_say(const char *name, int len)
-{
-	printk("(%d: ", len);
-	while ( len-- )
-		printk("%c", *name++);
-	printk(")\n");
-}
-#endif
-MODULE_LICENSE("GPL");
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
deleted file mode 100644
index e1734f2d6e2..00000000000
--- a/fs/autofs/inode.c
+++ /dev/null
@@ -1,288 +0,0 @@
-/* -*- linux-c -*- --------------------------------------------------------- *
- *
- * linux/fs/autofs/inode.c
- *
- *  Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
- *
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
-
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/slab.h>
-#include <linux/file.h>
-#include <linux/parser.h>
-#include <linux/bitops.h>
-#include <linux/magic.h>
-#include "autofs_i.h"
-#include <linux/module.h>
-
-void autofs_kill_sb(struct super_block *sb)
-{
-	struct autofs_sb_info *sbi = autofs_sbi(sb);
-	unsigned int n;
-
-	/*
-	 * In the event of a failure in get_sb_nodev the superblock
-	 * info is not present so nothing else has been setup, so
-	 * just call kill_anon_super when we are called from
-	 * deactivate_super.
-	 */
-	if (!sbi)
-		goto out_kill_sb;
-
-	if (!sbi->catatonic)
-		autofs_catatonic_mode(sbi); /* Free wait queues, close pipe */
-
-	put_pid(sbi->oz_pgrp);
-
-	autofs_hash_nuke(sbi);
-	for (n = 0; n < AUTOFS_MAX_SYMLINKS; n++) {
-		if (test_bit(n, sbi->symlink_bitmap))
-			kfree(sbi->symlink[n].data);
-	}
-
-	kfree(sb->s_fs_info);
-
-out_kill_sb:
-	DPRINTK(("autofs: shutting down\n"));
-	kill_anon_super(sb);
-}
-
-static const struct super_operations autofs_sops = {
-	.statfs		= simple_statfs,
-	.show_options	= generic_show_options,
-};
-
-enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto};
-
-static const match_table_t autofs_tokens = {
-	{Opt_fd, "fd=%u"},
-	{Opt_uid, "uid=%u"},
-	{Opt_gid, "gid=%u"},
-	{Opt_pgrp, "pgrp=%u"},
-	{Opt_minproto, "minproto=%u"},
-	{Opt_maxproto, "maxproto=%u"},
-	{Opt_err, NULL}
-};
-
-static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid,
-		pid_t *pgrp, int *minproto, int *maxproto)
-{
-	char *p;
-	substring_t args[MAX_OPT_ARGS];
-	int option;
-
-	*uid = current_uid();
-	*gid = current_gid();
-	*pgrp = task_pgrp_nr(current);
-
-	*minproto = *maxproto = AUTOFS_PROTO_VERSION;
-
-	*pipefd = -1;
-
-	if (!options)
-		return 1;
-
-	while ((p = strsep(&options, ",")) != NULL) {
-		int token;
-		if (!*p)
-			continue;
-
-		token = match_token(p, autofs_tokens, args);
-		switch (token) {
-		case Opt_fd:
-			if (match_int(&args[0], &option))
-				return 1;
-			*pipefd = option;
-			break;
-		case Opt_uid:
-			if (match_int(&args[0], &option))
-				return 1;
-			*uid = option;
-			break;
-		case Opt_gid:
-			if (match_int(&args[0], &option))
-				return 1;
-			*gid = option;
-			break;
-		case Opt_pgrp:
-			if (match_int(&args[0], &option))
-				return 1;
-			*pgrp = option;
-			break;
-		case Opt_minproto:
-			if (match_int(&args[0], &option))
-				return 1;
-			*minproto = option;
-			break;
-		case Opt_maxproto:
-			if (match_int(&args[0], &option))
-				return 1;
-			*maxproto = option;
-			break;
-		default:
-			return 1;
-		}
-	}
-	return (*pipefd < 0);
-}
-
-int autofs_fill_super(struct super_block *s, void *data, int silent)
-{
-	struct inode * root_inode;
-	struct dentry * root;
-	struct file * pipe;
-	int pipefd;
-	struct autofs_sb_info *sbi;
-	int minproto, maxproto;
-	pid_t pgid;
-
-	save_mount_options(s, data);
-
-	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
-	if (!sbi)
-		goto fail_unlock;
-	DPRINTK(("autofs: starting up, sbi = %p\n",sbi));
-
-	s->s_fs_info = sbi;
-	sbi->magic = AUTOFS_SBI_MAGIC;
-	sbi->pipe = NULL;
-	sbi->catatonic = 1;
-	sbi->exp_timeout = 0;
-	autofs_initialize_hash(&sbi->dirhash);
-	sbi->queues = NULL;
-	memset(sbi->symlink_bitmap, 0, sizeof(long)*AUTOFS_SYMLINK_BITMAP_LEN);
-	sbi->next_dir_ino = AUTOFS_FIRST_DIR_INO;
-	s->s_blocksize = 1024;
-	s->s_blocksize_bits = 10;
-	s->s_magic = AUTOFS_SUPER_MAGIC;
-	s->s_op = &autofs_sops;
-	s->s_time_gran = 1;
-	sbi->sb = s;
-
-	root_inode = autofs_iget(s, AUTOFS_ROOT_INO);
-	if (IS_ERR(root_inode))
-		goto fail_free;
-	root = d_alloc_root(root_inode);
-	pipe = NULL;
-
-	if (!root)
-		goto fail_iput;
-
-	/* Can this call block?  - WTF cares? s is locked. */
-	if (parse_options(data, &pipefd, &root_inode->i_uid,
-				&root_inode->i_gid, &pgid, &minproto,
-				&maxproto)) {
-		printk("autofs: called with bogus options\n");
-		goto fail_dput;
-	}
-
-	/* Couldn't this be tested earlier? */
-	if (minproto > AUTOFS_PROTO_VERSION ||
-	     maxproto < AUTOFS_PROTO_VERSION) {
-		printk("autofs: kernel does not match daemon version\n");
-		goto fail_dput;
-	}
-
-	DPRINTK(("autofs: pipe fd = %d, pgrp = %u\n", pipefd, pgid));
-	sbi->oz_pgrp = find_get_pid(pgid);
-
-	if (!sbi->oz_pgrp) {
-		printk("autofs: could not find process group %d\n", pgid);
-		goto fail_dput;
-	}
-
-	pipe = fget(pipefd);
-	
-	if (!pipe) {
-		printk("autofs: could not open pipe file descriptor\n");
-		goto fail_put_pid;
-	}
-
-	if (!pipe->f_op || !pipe->f_op->write)
-		goto fail_fput;
-	sbi->pipe = pipe;
-	sbi->catatonic = 0;
-
-	/*
-	 * Success! Install the root dentry now to indicate completion.
-	 */
-	s->s_root = root;
-	return 0;
-
-fail_fput:
-	printk("autofs: pipe file descriptor does not contain proper ops\n");
-	fput(pipe);
-fail_put_pid:
-	put_pid(sbi->oz_pgrp);
-fail_dput:
-	dput(root);
-	goto fail_free;
-fail_iput:
-	printk("autofs: get root dentry failed\n");
-	iput(root_inode);
-fail_free:
-	kfree(sbi);
-	s->s_fs_info = NULL;
-fail_unlock:
-	return -EINVAL;
-}
-
-struct inode *autofs_iget(struct super_block *sb, unsigned long ino)
-{
-	unsigned int n;
-	struct autofs_sb_info *sbi = autofs_sbi(sb);
-	struct inode *inode;
-
-	inode = iget_locked(sb, ino);
-	if (!inode)
-		return ERR_PTR(-ENOMEM);
-	if (!(inode->i_state & I_NEW))
-		return inode;
-
-	/* Initialize to the default case (stub directory) */
-
-	inode->i_op = &simple_dir_inode_operations;
-	inode->i_fop = &simple_dir_operations;
-	inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
-	inode->i_nlink = 2;
-	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
-
-	if (ino == AUTOFS_ROOT_INO) {
-		inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR;
-		inode->i_op = &autofs_root_inode_operations;
-		inode->i_fop = &autofs_root_operations;
-		goto done;
-	} 
-	
-	inode->i_uid = inode->i_sb->s_root->d_inode->i_uid;
-	inode->i_gid = inode->i_sb->s_root->d_inode->i_gid;
-	
-	if (ino >= AUTOFS_FIRST_SYMLINK && ino < AUTOFS_FIRST_DIR_INO) {
-		/* Symlink inode - should be in symlink list */
-		struct autofs_symlink *sl;
-
-		n = ino - AUTOFS_FIRST_SYMLINK;
-		if (n >= AUTOFS_MAX_SYMLINKS || !test_bit(n,sbi->symlink_bitmap)) {
-			printk("autofs: Looking for bad symlink inode %u\n", (unsigned int) ino);
-			goto done;
-		}
-		
-		inode->i_op = &autofs_symlink_inode_operations;
-		sl = &sbi->symlink[n];
-		inode->i_private = sl;
-		inode->i_mode = S_IFLNK | S_IRWXUGO;
-		inode->i_mtime.tv_sec = inode->i_ctime.tv_sec = sl->mtime;
-		inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = 0;
-		inode->i_size = sl->len;
-		inode->i_nlink = 1;
-	}
-
-done:
-	unlock_new_inode(inode);
-	return inode;
-}
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
deleted file mode 100644
index 9a0520b5066..00000000000
--- a/fs/autofs/root.c
+++ /dev/null
@@ -1,582 +0,0 @@
-/* -*- linux-c -*- --------------------------------------------------------- *
- *
- * linux/fs/autofs/root.c
- *
- *  Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
- *
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
-
-#include <linux/capability.h>
-#include <linux/errno.h>
-#include <linux/stat.h>
-#include <linux/slab.h>
-#include <linux/param.h>
-#include <linux/time.h>
-#include <linux/smp_lock.h>
-#include "autofs_i.h"
-
-static int autofs_root_readdir(struct file *,void *,filldir_t);
-static struct dentry *autofs_root_lookup(struct inode *,struct dentry *, struct nameidata *);
-static int autofs_root_symlink(struct inode *,struct dentry *,const char *);
-static int autofs_root_unlink(struct inode *,struct dentry *);
-static int autofs_root_rmdir(struct inode *,struct dentry *);
-static int autofs_root_mkdir(struct inode *,struct dentry *,int);
-static int autofs_root_ioctl(struct inode *, struct file *,unsigned int,unsigned long);
-
-const struct file_operations autofs_root_operations = {
-	.llseek		= generic_file_llseek,
-	.read		= generic_read_dir,
-	.readdir	= autofs_root_readdir,
-	.ioctl		= autofs_root_ioctl,
-};
-
-const struct inode_operations autofs_root_inode_operations = {
-        .lookup		= autofs_root_lookup,
-        .unlink		= autofs_root_unlink,
-        .symlink	= autofs_root_symlink,
-        .mkdir		= autofs_root_mkdir,
-        .rmdir		= autofs_root_rmdir,
-};
-
-static int autofs_root_readdir(struct file *filp, void *dirent, filldir_t filldir)
-{
-	struct autofs_dir_ent *ent = NULL;
-	struct autofs_dirhash *dirhash;
-	struct autofs_sb_info *sbi;
-	struct inode * inode = filp->f_path.dentry->d_inode;
-	off_t onr, nr;
-
-	lock_kernel();
-
-	sbi = autofs_sbi(inode->i_sb);
-	dirhash = &sbi->dirhash;
-	nr = filp->f_pos;
-
-	switch(nr)
-	{
-	case 0:
-		if (filldir(dirent, ".", 1, nr, inode->i_ino, DT_DIR) < 0)
-			goto out;
-		filp->f_pos = ++nr;
-		/* fall through */
-	case 1:
-		if (filldir(dirent, "..", 2, nr, inode->i_ino, DT_DIR) < 0)
-			goto out;
-		filp->f_pos = ++nr;
-		/* fall through */
-	default:
-		while (onr = nr, ent = autofs_hash_enum(dirhash,&nr,ent)) {
-			if (!ent->dentry || d_mountpoint(ent->dentry)) {
-				if (filldir(dirent,ent->name,ent->len,onr,ent->ino,DT_UNKNOWN) < 0)
-					goto out;
-				filp->f_pos = nr;
-			}
-		}
-		break;
-	}
-
-out:
-	unlock_kernel();
-	return 0;
-}
-
-static int try_to_fill_dentry(struct dentry *dentry, struct super_block *sb, struct autofs_sb_info *sbi)
-{
-	struct inode * inode;
-	struct autofs_dir_ent *ent;
-	int status = 0;
-
-	if (!(ent = autofs_hash_lookup(&sbi->dirhash, &dentry->d_name))) {
-		do {
-			if (status && dentry->d_inode) {
-				if (status != -ENOENT)
-					printk("autofs warning: lookup failure on positive dentry, status = %d, name = %s\n", status, dentry->d_name.name);
-				return 0; /* Try to get the kernel to invalidate this dentry */
-			}
-
-			/* Turn this into a real negative dentry? */
-			if (status == -ENOENT) {
-				dentry->d_time = jiffies + AUTOFS_NEGATIVE_TIMEOUT;
-				dentry->d_flags &= ~DCACHE_AUTOFS_PENDING;
-				return 1;
-			} else if (status) {
-				/* Return a negative dentry, but leave it "pending" */
-				return 1;
-			}
-			status = autofs_wait(sbi, &dentry->d_name);
-		} while (!(ent = autofs_hash_lookup(&sbi->dirhash, &dentry->d_name)));
-	}
-
-	/* Abuse this field as a pointer to the directory entry, used to
-	   find the expire list pointers */
-	dentry->d_time = (unsigned long) ent;
-	
-	if (!dentry->d_inode) {
-		inode = autofs_iget(sb, ent->ino);
-		if (IS_ERR(inode)) {
-			/* Failed, but leave pending for next time */
-			return 1;
-		}
-		dentry->d_inode = inode;
-	}
-
-	/* If this is a directory that isn't a mount point, bitch at the
-	   daemon and fix it in user space */
-	if (S_ISDIR(dentry->d_inode->i_mode) && !d_mountpoint(dentry)) {
-		return !autofs_wait(sbi, &dentry->d_name);
-	}
-
-	/* We don't update the usages for the autofs daemon itself, this
-	   is necessary for recursive autofs mounts */
-	if (!autofs_oz_mode(sbi)) {
-		autofs_update_usage(&sbi->dirhash,ent);
-	}
-
-	dentry->d_flags &= ~DCACHE_AUTOFS_PENDING;
-	return 1;
-}
-
-
-/*
- * Revalidate is called on every cache lookup.  Some of those
- * cache lookups may actually happen while the dentry is not
- * yet completely filled in, and revalidate has to delay such
- * lookups..
- */
-static int autofs_revalidate(struct dentry * dentry, struct nameidata *nd)
-{
-	struct inode * dir;
-	struct autofs_sb_info *sbi;
-	struct autofs_dir_ent *ent;
-	int res;
-
-	lock_kernel();
-	dir = dentry->d_parent->d_inode;
-	sbi = autofs_sbi(dir->i_sb);
-
-	/* Pending dentry */
-	if (dentry->d_flags & DCACHE_AUTOFS_PENDING) {
-		if (autofs_oz_mode(sbi))
-			res = 1;
-		else
-			res = try_to_fill_dentry(dentry, dir->i_sb, sbi);
-		unlock_kernel();
-		return res;
-	}
-
-	/* Negative dentry.. invalidate if "old" */
-	if (!dentry->d_inode) {
-		unlock_kernel();
-		return (dentry->d_time - jiffies <= AUTOFS_NEGATIVE_TIMEOUT);
-	}
-		
-	/* Check for a non-mountpoint directory */
-	if (S_ISDIR(dentry->d_inode->i_mode) && !d_mountpoint(dentry)) {
-		if (autofs_oz_mode(sbi))
-			res = 1;
-		else
-			res = try_to_fill_dentry(dentry, dir->i_sb, sbi);
-		unlock_kernel();
-		return res;
-	}
-
-	/* Update the usage list */
-	if (!autofs_oz_mode(sbi)) {
-		ent = (struct autofs_dir_ent *) dentry->d_time;
-		if (ent)
-			autofs_update_usage(&sbi->dirhash,ent);
-	}
-	unlock_kernel();
-	return 1;
-}
-
-static const struct dentry_operations autofs_dentry_operations = {
-	.d_revalidate	= autofs_revalidate,
-};
-
-static struct dentry *autofs_root_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
-{
-	struct autofs_sb_info *sbi;
-	int oz_mode;
-
-	DPRINTK(("autofs_root_lookup: name = "));
-	lock_kernel();
-	autofs_say(dentry->d_name.name,dentry->d_name.len);
-
-	if (dentry->d_name.len > NAME_MAX) {
-		unlock_kernel();
-		return ERR_PTR(-ENAMETOOLONG);/* File name too long to exist */
-	}
-
-	sbi = autofs_sbi(dir->i_sb);
-
-	oz_mode = autofs_oz_mode(sbi);
-	DPRINTK(("autofs_lookup: pid = %u, pgrp = %u, catatonic = %d, "
-				"oz_mode = %d\n", task_pid_nr(current),
-				task_pgrp_nr(current), sbi->catatonic,
-				oz_mode));
-
-	/*
-	 * Mark the dentry incomplete, but add it. This is needed so
-	 * that the VFS layer knows about the dentry, and we can count
-	 * on catching any lookups through the revalidate.
-	 *
-	 * Let all the hard work be done by the revalidate function that
-	 * needs to be able to do this anyway..
-	 *
-	 * We need to do this before we release the directory semaphore.
-	 */
-	dentry->d_op = &autofs_dentry_operations;
-	dentry->d_flags |= DCACHE_AUTOFS_PENDING;
-	d_add(dentry, NULL);
-
-	mutex_unlock(&dir->i_mutex);
-	autofs_revalidate(dentry, nd);
-	mutex_lock(&dir->i_mutex);
-
-	/*
-	 * If we are still pending, check if we had to handle
-	 * a signal. If so we can force a restart..
-	 */
-	if (dentry->d_flags & DCACHE_AUTOFS_PENDING) {
-		/* See if we were interrupted */
-		if (signal_pending(current)) {
-			sigset_t *sigset = &current->pending.signal;
-			if (sigismember (sigset, SIGKILL) ||
-			    sigismember (sigset, SIGQUIT) ||
-			    sigismember (sigset, SIGINT)) {
-				unlock_kernel();
-				return ERR_PTR(-ERESTARTNOINTR);
-			}
-		}
-	}
-	unlock_kernel();
-
-	/*
-	 * If this dentry is unhashed, then we shouldn't honour this
-	 * lookup even if the dentry is positive.  Returning ENOENT here
-	 * doesn't do the right thing for all system calls, but it should
-	 * be OK for the operations we permit from an autofs.
-	 */
-	if (dentry->d_inode && d_unhashed(dentry))
-		return ERR_PTR(-ENOENT);
-
-	return NULL;
-}
-
-static int autofs_root_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
-{
-	struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb);
-	struct autofs_dirhash *dh = &sbi->dirhash;
-	struct autofs_dir_ent *ent;
-	unsigned int n;
-	int slsize;
-	struct autofs_symlink *sl;
-	struct inode *inode;
-
-	DPRINTK(("autofs_root_symlink: %s <- ", symname));
-	autofs_say(dentry->d_name.name,dentry->d_name.len);
-
-	lock_kernel();
-	if (!autofs_oz_mode(sbi)) {
-		unlock_kernel();
-		return -EACCES;
-	}
-
-	if (autofs_hash_lookup(dh, &dentry->d_name)) {
-		unlock_kernel();
-		return -EEXIST;
-	}
-
-	n = find_first_zero_bit(sbi->symlink_bitmap,AUTOFS_MAX_SYMLINKS);
-	if (n >= AUTOFS_MAX_SYMLINKS) {
-		unlock_kernel();
-		return -ENOSPC;
-	}
-
-	set_bit(n,sbi->symlink_bitmap);
-	sl = &sbi->symlink[n];
-	sl->len = strlen(symname);
-	sl->data = kmalloc(slsize = sl->len+1, GFP_KERNEL);
-	if (!sl->data) {
-		clear_bit(n,sbi->symlink_bitmap);
-		unlock_kernel();
-		return -ENOSPC;
-	}
-
-	ent = kmalloc(sizeof(struct autofs_dir_ent), GFP_KERNEL);
-	if (!ent) {
-		kfree(sl->data);
-		clear_bit(n,sbi->symlink_bitmap);
-		unlock_kernel();
-		return -ENOSPC;
-	}
-
-	ent->name = kmalloc(dentry->d_name.len+1, GFP_KERNEL);
-	if (!ent->name) {
-		kfree(sl->data);
-		kfree(ent);
-		clear_bit(n,sbi->symlink_bitmap);
-		unlock_kernel();
-		return -ENOSPC;
-	}
-
-	memcpy(sl->data,symname,slsize);
-	sl->mtime = get_seconds();
-
-	ent->ino = AUTOFS_FIRST_SYMLINK + n;
-	ent->hash = dentry->d_name.hash;
-	memcpy(ent->name, dentry->d_name.name, 1+(ent->len = dentry->d_name.len));
-	ent->dentry = NULL;	/* We don't keep the dentry for symlinks */
-
-	autofs_hash_insert(dh,ent);
-
-	inode = autofs_iget(dir->i_sb, ent->ino);
-	if (IS_ERR(inode))
-		return PTR_ERR(inode);
-
-	d_instantiate(dentry, inode);
-	unlock_kernel();
-	return 0;
-}
-
-/*
- * NOTE!
- *
- * Normal filesystems would do a "d_delete()" to tell the VFS dcache
- * that the file no longer exists. However, doing that means that the
- * VFS layer can turn the dentry into a negative dentry, which we
- * obviously do not want (we're dropping the entry not because it
- * doesn't exist, but because it has timed out).
- *
- * Also see autofs_root_rmdir()..
- */
-static int autofs_root_unlink(struct inode *dir, struct dentry *dentry)
-{
-	struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb);
-	struct autofs_dirhash *dh = &sbi->dirhash;
-	struct autofs_dir_ent *ent;
-	unsigned int n;
-
-	/* This allows root to remove symlinks */
-	lock_kernel();
-	if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) {
-		unlock_kernel();
-		return -EACCES;
-	}
-
-	ent = autofs_hash_lookup(dh, &dentry->d_name);
-	if (!ent) {
-		unlock_kernel();
-		return -ENOENT;
-	}
-
-	n = ent->ino - AUTOFS_FIRST_SYMLINK;
-	if (n >= AUTOFS_MAX_SYMLINKS) {
-		unlock_kernel();
-		return -EISDIR;	/* It's a directory, dummy */
-	}
-	if (!test_bit(n,sbi->symlink_bitmap)) {
-		unlock_kernel();
-		return -EINVAL;	/* Nonexistent symlink?  Shouldn't happen */
-	}
-	
-	dentry->d_time = (unsigned long)(struct autofs_dirhash *)NULL;
-	autofs_hash_delete(ent);
-	clear_bit(n,sbi->symlink_bitmap);
-	kfree(sbi->symlink[n].data);
-	d_drop(dentry);
-	
-	unlock_kernel();
-	return 0;
-}
-
-static int autofs_root_rmdir(struct inode *dir, struct dentry *dentry)
-{
-	struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb);
-	struct autofs_dirhash *dh = &sbi->dirhash;
-	struct autofs_dir_ent *ent;
-
-	lock_kernel();
-	if (!autofs_oz_mode(sbi)) {
-		unlock_kernel();
-		return -EACCES;
-	}
-
-	ent = autofs_hash_lookup(dh, &dentry->d_name);
-	if (!ent) {
-		unlock_kernel();
-		return -ENOENT;
-	}
-
-	if ((unsigned int)ent->ino < AUTOFS_FIRST_DIR_INO) {
-		unlock_kernel();
-		return -ENOTDIR; /* Not a directory */
-	}
-
-	if (ent->dentry != dentry) {
-		printk("autofs_rmdir: odentry != dentry for entry %s\n", dentry->d_name.name);
-	}
-
-	dentry->d_time = (unsigned long)(struct autofs_dir_ent *)NULL;
-	autofs_hash_delete(ent);
-	drop_nlink(dir);
-	d_drop(dentry);
-	unlock_kernel();
-
-	return 0;
-}
-
-static int autofs_root_mkdir(struct inode *dir, struct dentry *dentry, int mode)
-{
-	struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb);
-	struct autofs_dirhash *dh = &sbi->dirhash;
-	struct autofs_dir_ent *ent;
-	struct inode *inode;
-	ino_t ino;
-
-	lock_kernel();
-	if (!autofs_oz_mode(sbi)) {
-		unlock_kernel();
-		return -EACCES;
-	}
-
-	ent = autofs_hash_lookup(dh, &dentry->d_name);
-	if (ent) {
-		unlock_kernel();
-		return -EEXIST;
-	}
-
-	if (sbi->next_dir_ino < AUTOFS_FIRST_DIR_INO) {
-		printk("autofs: Out of inode numbers -- what the heck did you do??\n");
-		unlock_kernel();
-		return -ENOSPC;
-	}
-	ino = sbi->next_dir_ino++;
-
-	ent = kmalloc(sizeof(struct autofs_dir_ent), GFP_KERNEL);
-	if (!ent) {
-		unlock_kernel();
-		return -ENOSPC;
-	}
-
-	ent->name = kmalloc(dentry->d_name.len+1, GFP_KERNEL);
-	if (!ent->name) {
-		kfree(ent);
-		unlock_kernel();
-		return -ENOSPC;
-	}
-
-	ent->hash = dentry->d_name.hash;
-	memcpy(ent->name, dentry->d_name.name, 1+(ent->len = dentry->d_name.len));
-	ent->ino = ino;
-	ent->dentry = dentry;
-	autofs_hash_insert(dh,ent);
-
-	inc_nlink(dir);
-
-	inode = autofs_iget(dir->i_sb, ino);
-	if (IS_ERR(inode)) {
-		drop_nlink(dir);
-		return PTR_ERR(inode);
-	}
-
-	d_instantiate(dentry, inode);
-	unlock_kernel();
-
-	return 0;
-}
-
-/* Get/set timeout ioctl() operation */
-static inline int autofs_get_set_timeout(struct autofs_sb_info *sbi,
-					 unsigned long __user *p)
-{
-	unsigned long ntimeout;
-
-	if (get_user(ntimeout, p) ||
-	    put_user(sbi->exp_timeout / HZ, p))
-		return -EFAULT;
-
-	if (ntimeout > ULONG_MAX/HZ)
-		sbi->exp_timeout = 0;
-	else
-		sbi->exp_timeout = ntimeout * HZ;
-
-	return 0;
-}
-
-/* Return protocol version */
-static inline int autofs_get_protover(int __user *p)
-{
-	return put_user(AUTOFS_PROTO_VERSION, p);
-}
-
-/* Perform an expiry operation */
-static inline int autofs_expire_run(struct super_block *sb,
-				    struct autofs_sb_info *sbi,
-				    struct vfsmount *mnt,
-				    struct autofs_packet_expire __user *pkt_p)
-{
-	struct autofs_dir_ent *ent;
-	struct autofs_packet_expire pkt;
-
-	memset(&pkt,0,sizeof pkt);
-
-	pkt.hdr.proto_version = AUTOFS_PROTO_VERSION;
-	pkt.hdr.type = autofs_ptype_expire;
-
-	if (!sbi->exp_timeout || !(ent = autofs_expire(sb,sbi,mnt)))
-		return -EAGAIN;
-
-	pkt.len = ent->len;
-	memcpy(pkt.name, ent->name, pkt.len);
-	pkt.name[pkt.len] = '\0';
-
-	if (copy_to_user(pkt_p, &pkt, sizeof(struct autofs_packet_expire)))
-		return -EFAULT;
-
-	return 0;
-}
-
-/*
- * ioctl()'s on the root directory is the chief method for the daemon to
- * generate kernel reactions
- */
-static int autofs_root_ioctl(struct inode *inode, struct file *filp,
-			     unsigned int cmd, unsigned long arg)
-{
-	struct autofs_sb_info *sbi = autofs_sbi(inode->i_sb);
-	void __user *argp = (void __user *)arg;
-
-	DPRINTK(("autofs_ioctl: cmd = 0x%08x, arg = 0x%08lx, sbi = %p, pgrp = %u\n",cmd,arg,sbi,task_pgrp_nr(current)));
-
-	if (_IOC_TYPE(cmd) != _IOC_TYPE(AUTOFS_IOC_FIRST) ||
-	     _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT)
-		return -ENOTTY;
-	
-	if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
-		return -EPERM;
-	
-	switch(cmd) {
-	case AUTOFS_IOC_READY:	/* Wait queue: go ahead and retry */
-		return autofs_wait_release(sbi,(autofs_wqt_t)arg,0);
-	case AUTOFS_IOC_FAIL:	/* Wait queue: fail with ENOENT */
-		return autofs_wait_release(sbi,(autofs_wqt_t)arg,-ENOENT);
-	case AUTOFS_IOC_CATATONIC: /* Enter catatonic mode (daemon shutdown) */
-		autofs_catatonic_mode(sbi);
-		return 0;
-	case AUTOFS_IOC_PROTOVER: /* Get protocol version */
-		return autofs_get_protover(argp);
-	case AUTOFS_IOC_SETTIMEOUT:
-		return autofs_get_set_timeout(sbi, argp);
-	case AUTOFS_IOC_EXPIRE:
-		return autofs_expire_run(inode->i_sb, sbi, filp->f_path.mnt,
-					 argp);
-	default:
-		return -ENOSYS;
-	}
-}
diff --git a/fs/autofs/symlink.c b/fs/autofs/symlink.c
deleted file mode 100644
index 7ce9cb2c9ce..00000000000
--- a/fs/autofs/symlink.c
+++ /dev/null
@@ -1,26 +0,0 @@
-/* -*- linux-c -*- --------------------------------------------------------- *
- *
- * linux/fs/autofs/symlink.c
- *
- *  Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
- *
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
-
-#include "autofs_i.h"
-
-/* Nothing to release.. */
-static void *autofs_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
-	char *s=((struct autofs_symlink *)dentry->d_inode->i_private)->data;
-	nd_set_link(nd, s);
-	return NULL;
-}
-
-const struct inode_operations autofs_symlink_inode_operations = {
-	.readlink	= generic_readlink,
-	.follow_link	= autofs_follow_link
-};
diff --git a/fs/autofs/waitq.c b/fs/autofs/waitq.c
deleted file mode 100644
index be46805972f..00000000000
--- a/fs/autofs/waitq.c
+++ /dev/null
@@ -1,205 +0,0 @@
-/* -*- linux-c -*- --------------------------------------------------------- *
- *
- * linux/fs/autofs/waitq.c
- *
- *  Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
- *
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
-
-#include <linux/slab.h>
-#include <linux/time.h>
-#include <linux/signal.h>
-#include <linux/file.h>
-#include "autofs_i.h"
-
-/* We make this a static variable rather than a part of the superblock; it
-   is better if we don't reassign numbers easily even across filesystems */
-static autofs_wqt_t autofs_next_wait_queue = 1;
-
-/* These are the signals we allow interrupting a pending mount */
-#define SHUTDOWN_SIGS	(sigmask(SIGKILL) | sigmask(SIGINT) | sigmask(SIGQUIT))
-
-void autofs_catatonic_mode(struct autofs_sb_info *sbi)
-{
-	struct autofs_wait_queue *wq, *nwq;
-
-	DPRINTK(("autofs: entering catatonic mode\n"));
-
-	sbi->catatonic = 1;
-	wq = sbi->queues;
-	sbi->queues = NULL;	/* Erase all wait queues */
-	while ( wq ) {
-		nwq = wq->next;
-		wq->status = -ENOENT; /* Magic is gone - report failure */
-		kfree(wq->name);
-		wq->name = NULL;
-		wake_up(&wq->queue);
-		wq = nwq;
-	}
-	fput(sbi->pipe);	/* Close the pipe */
-	sbi->pipe = NULL;
-	autofs_hash_dputall(&sbi->dirhash); /* Remove all dentry pointers */
-}
-
-static int autofs_write(struct file *file, const void *addr, int bytes)
-{
-	unsigned long sigpipe, flags;
-	mm_segment_t fs;
-	const char *data = (const char *)addr;
-	ssize_t wr = 0;
-
-	/** WARNING: this is not safe for writing more than PIPE_BUF bytes! **/
-
-	sigpipe = sigismember(&current->pending.signal, SIGPIPE);
-
-	/* Save pointer to user space and point back to kernel space */
-	fs = get_fs();
-	set_fs(KERNEL_DS);
-
-	while (bytes &&
-	       (wr = file->f_op->write(file,data,bytes,&file->f_pos)) > 0) {
-		data += wr;
-		bytes -= wr;
-	}
-
-	set_fs(fs);
-
-	/* Keep the currently executing process from receiving a
-	   SIGPIPE unless it was already supposed to get one */
-	if (wr == -EPIPE && !sigpipe) {
-		spin_lock_irqsave(&current->sighand->siglock, flags);
-		sigdelset(&current->pending.signal, SIGPIPE);
-		recalc_sigpending();
-		spin_unlock_irqrestore(&current->sighand->siglock, flags);
-	}
-
-	return (bytes > 0);
-}
-	
-static void autofs_notify_daemon(struct autofs_sb_info *sbi, struct autofs_wait_queue *wq)
-{
-	struct autofs_packet_missing pkt;
-
-	DPRINTK(("autofs_wait: wait id = 0x%08lx, name = ", wq->wait_queue_token));
-	autofs_say(wq->name,wq->len);
-
-	memset(&pkt,0,sizeof pkt); /* For security reasons */
-
-	pkt.hdr.proto_version = AUTOFS_PROTO_VERSION;
-	pkt.hdr.type = autofs_ptype_missing;
-	pkt.wait_queue_token = wq->wait_queue_token;
-	pkt.len = wq->len;
-        memcpy(pkt.name, wq->name, pkt.len);
-	pkt.name[pkt.len] = '\0';
-
-	if ( autofs_write(sbi->pipe,&pkt,sizeof(struct autofs_packet_missing)) )
-		autofs_catatonic_mode(sbi);
-}
-
-int autofs_wait(struct autofs_sb_info *sbi, struct qstr *name)
-{
-	struct autofs_wait_queue *wq;
-	int status;
-
-	/* In catatonic mode, we don't wait for nobody */
-	if ( sbi->catatonic )
-		return -ENOENT;
-	
-	/* We shouldn't be able to get here, but just in case */
-	if ( name->len > NAME_MAX )
-		return -ENOENT;
-
-	for ( wq = sbi->queues ; wq ; wq = wq->next ) {
-		if ( wq->hash == name->hash &&
-		     wq->len == name->len &&
-		     wq->name && !memcmp(wq->name,name->name,name->len) )
-			break;
-	}
-	
-	if ( !wq ) {
-		/* Create a new wait queue */
-		wq = kmalloc(sizeof(struct autofs_wait_queue),GFP_KERNEL);
-		if ( !wq )
-			return -ENOMEM;
-
-		wq->name = kmalloc(name->len,GFP_KERNEL);
-		if ( !wq->name ) {
-			kfree(wq);
-			return -ENOMEM;
-		}
-		wq->wait_queue_token = autofs_next_wait_queue++;
-		init_waitqueue_head(&wq->queue);
-		wq->hash = name->hash;
-		wq->len = name->len;
-		wq->status = -EINTR; /* Status return if interrupted */
-		memcpy(wq->name, name->name, name->len);
-		wq->next = sbi->queues;
-		sbi->queues = wq;
-
-		/* autofs_notify_daemon() may block */
-		wq->wait_ctr = 2;
-		autofs_notify_daemon(sbi,wq);
-	} else
-		wq->wait_ctr++;
-
-	/* wq->name is NULL if and only if the lock is already released */
-
-	if ( sbi->catatonic ) {
-		/* We might have slept, so check again for catatonic mode */
-		wq->status = -ENOENT;
-		kfree(wq->name);
-		wq->name = NULL;
-	}
-
-	if ( wq->name ) {
-		/* Block all but "shutdown" signals while waiting */
-		sigset_t sigmask;
-
-		siginitsetinv(&sigmask, SHUTDOWN_SIGS);
-		sigprocmask(SIG_BLOCK, &sigmask, &sigmask);
-
-		interruptible_sleep_on(&wq->queue);
-
-		sigprocmask(SIG_SETMASK, &sigmask, NULL);
-	} else {
-		DPRINTK(("autofs_wait: skipped sleeping\n"));
-	}
-
-	status = wq->status;
-
-	if ( ! --wq->wait_ctr )	/* Are we the last process to need status? */
-		kfree(wq);
-
-	return status;
-}
-
-
-int autofs_wait_release(struct autofs_sb_info *sbi, autofs_wqt_t wait_queue_token, int status)
-{
-	struct autofs_wait_queue *wq, **wql;
-
-	for (wql = &sbi->queues; (wq = *wql) != NULL; wql = &wq->next) {
-		if ( wq->wait_queue_token == wait_queue_token )
-			break;
-	}
-	if ( !wq )
-		return -EINVAL;
-
-	*wql = wq->next;	/* Unlink from chain */
-	kfree(wq->name);
-	wq->name = NULL;	/* Do not wait on this queue */
-
-	wq->status = status;
-
-	if ( ! --wq->wait_ctr )	/* Is anyone still waiting for this guy? */
-		kfree(wq);
-	else
-		wake_up(&wq->queue);
-
-	return 0;
-}
-
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index ba4a38b9c22..eff9a419469 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -724,6 +724,7 @@ static const struct file_operations _dev_ioctl_fops = {
 	.unlocked_ioctl	 = autofs_dev_ioctl,
 	.compat_ioctl = autofs_dev_ioctl_compat,
 	.owner	 = THIS_MODULE,
+	.llseek = noop_llseek,
 };
 
 static struct miscdevice _autofs_dev_ioctl_misc = {
diff --git a/fs/autofs4/init.c b/fs/autofs4/init.c
index 9722e4bd895..c038727b405 100644
--- a/fs/autofs4/init.c
+++ b/fs/autofs4/init.c
@@ -14,16 +14,16 @@
 #include <linux/init.h>
 #include "autofs_i.h"
 
-static int autofs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *autofs_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_nodev(fs_type, flags, data, autofs4_fill_super, mnt);
+	return mount_nodev(fs_type, flags, data, autofs4_fill_super);
 }
 
 static struct file_system_type autofs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "autofs",
-	.get_sb		= autofs_get_sb,
+	.mount		= autofs_mount,
 	.kill_sb	= autofs4_kill_sb,
 };
 
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 821b2b955da..ac87e49fa70 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -398,6 +398,7 @@ struct inode *autofs4_get_inode(struct super_block *sb,
 		inode->i_gid = sb->s_root->d_inode->i_gid;
 	}
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	inode->i_ino = get_next_ino();
 
 	if (S_ISDIR(inf->mode)) {
 		inode->i_nlink = 2;
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index db4117ed780..d5c1401f003 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -18,7 +18,9 @@
 #include <linux/slab.h>
 #include <linux/param.h>
 #include <linux/time.h>
-#include <linux/smp_lock.h>
+#include <linux/compat.h>
+#include <linux/mutex.h>
+
 #include "autofs_i.h"
 
 static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *);
@@ -26,6 +28,9 @@ static int autofs4_dir_unlink(struct inode *,struct dentry *);
 static int autofs4_dir_rmdir(struct inode *,struct dentry *);
 static int autofs4_dir_mkdir(struct inode *,struct dentry *,int);
 static long autofs4_root_ioctl(struct file *,unsigned int,unsigned long);
+#ifdef CONFIG_COMPAT
+static long autofs4_root_compat_ioctl(struct file *,unsigned int,unsigned long);
+#endif
 static int autofs4_dir_open(struct inode *inode, struct file *file);
 static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *);
 static void *autofs4_follow_link(struct dentry *, struct nameidata *);
@@ -40,6 +45,9 @@ const struct file_operations autofs4_root_operations = {
 	.readdir	= dcache_readdir,
 	.llseek		= dcache_dir_lseek,
 	.unlocked_ioctl	= autofs4_root_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= autofs4_root_compat_ioctl,
+#endif
 };
 
 const struct file_operations autofs4_dir_operations = {
@@ -198,8 +206,7 @@ static int try_to_fill_dentry(struct dentry *dentry, int flags)
 	}
 
 	/* Initialize expiry counter after successful mount */
-	if (ino)
-		ino->last_used = jiffies;
+	ino->last_used = jiffies;
 
 	spin_lock(&sbi->fs_lock);
 	ino->flags &= ~AUTOFS_INF_PENDING;
@@ -840,6 +847,26 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 }
 
 /* Get/set timeout ioctl() operation */
+#ifdef CONFIG_COMPAT
+static inline int autofs4_compat_get_set_timeout(struct autofs_sb_info *sbi,
+					 compat_ulong_t __user *p)
+{
+	int rv;
+	unsigned long ntimeout;
+
+	if ((rv = get_user(ntimeout, p)) ||
+	     (rv = put_user(sbi->exp_timeout/HZ, p)))
+		return rv;
+
+	if (ntimeout > UINT_MAX/HZ)
+		sbi->exp_timeout = 0;
+	else
+		sbi->exp_timeout = ntimeout * HZ;
+
+	return 0;
+}
+#endif
+
 static inline int autofs4_get_set_timeout(struct autofs_sb_info *sbi,
 					 unsigned long __user *p)
 {
@@ -933,6 +960,10 @@ static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp,
 		return autofs4_get_protosubver(sbi, p);
 	case AUTOFS_IOC_SETTIMEOUT:
 		return autofs4_get_set_timeout(sbi, p);
+#ifdef CONFIG_COMPAT
+	case AUTOFS_IOC_SETTIMEOUT32:
+		return autofs4_compat_get_set_timeout(sbi, p);
+#endif
 
 	case AUTOFS_IOC_ASKUMOUNT:
 		return autofs4_ask_umount(filp->f_path.mnt, p);
@@ -949,15 +980,36 @@ static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp,
 	}
 }
 
+static DEFINE_MUTEX(autofs4_ioctl_mutex);
+
 static long autofs4_root_ioctl(struct file *filp,
 			       unsigned int cmd, unsigned long arg)
 {
 	long ret;
 	struct inode *inode = filp->f_dentry->d_inode;
 
-	lock_kernel();
+	mutex_lock(&autofs4_ioctl_mutex);
 	ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
-	unlock_kernel();
+	mutex_unlock(&autofs4_ioctl_mutex);
+
+	return ret;
+}
+
+#ifdef CONFIG_COMPAT
+static long autofs4_root_compat_ioctl(struct file *filp,
+			     unsigned int cmd, unsigned long arg)
+{
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	int ret;
+
+	mutex_lock(&autofs4_ioctl_mutex);
+	if (cmd == AUTOFS_IOC_READY || cmd == AUTOFS_IOC_FAIL)
+		ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
+	else
+		ret = autofs4_root_ioctl_unlocked(inode, filp, cmd,
+			(unsigned long)compat_ptr(arg));
+	mutex_unlock(&autofs4_ioctl_mutex);
 
 	return ret;
 }
+#endif
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 52e59bf4aa5..f024d8aadde 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -55,12 +55,6 @@ static unsigned int bad_file_poll(struct file *filp, poll_table *wait)
 	return POLLERR;
 }
 
-static int bad_file_ioctl (struct inode *inode, struct file *filp,
-			unsigned int cmd, unsigned long arg)
-{
-	return -EIO;
-}
-
 static long bad_file_unlocked_ioctl(struct file *file, unsigned cmd,
 			unsigned long arg)
 {
@@ -159,7 +153,6 @@ static const struct file_operations bad_file_ops =
 	.aio_write	= bad_file_aio_write,
 	.readdir	= bad_file_readdir,
 	.poll		= bad_file_poll,
-	.ioctl		= bad_file_ioctl,
 	.unlocked_ioctl	= bad_file_unlocked_ioctl,
 	.compat_ioctl	= bad_file_compat_ioctl,
 	.mmap		= bad_file_mmap,
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 34ddda888e6..aa4e7c7ae3c 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -436,7 +436,7 @@ befs_init_inodecache(void)
 					      init_once);
 	if (befs_inode_cachep == NULL) {
 		printk(KERN_ERR "befs_init_inodecache: "
-		       "Couldn't initalize inode slabcache\n");
+		       "Couldn't initialize inode slabcache\n");
 		return -ENOMEM;
 	}
 
@@ -913,18 +913,17 @@ befs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	return 0;
 }
 
-static int
-befs_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name,
-	    void *data, struct vfsmount *mnt)
+static struct dentry *
+befs_mount(struct file_system_type *fs_type, int flags, const char *dev_name,
+	    void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, befs_fill_super,
-			   mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, befs_fill_super);
 }
 
 static struct file_system_type befs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "befs",
-	.get_sb		= befs_get_sb,
+	.mount		= befs_mount,
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,	
 };
diff --git a/fs/bfs/bfs.h b/fs/bfs/bfs.h
index 7109e451abf..f7f87e233dd 100644
--- a/fs/bfs/bfs.h
+++ b/fs/bfs/bfs.h
@@ -17,7 +17,6 @@ struct bfs_sb_info {
 	unsigned long si_lf_eblk;
 	unsigned long si_lasti;
 	unsigned long *si_imap;
-	struct buffer_head *si_sbh;		/* buffer header w/superblock */
 	struct mutex bfs_lock;
 };
 
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index d967e052b77..685ecff3ab3 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -176,7 +176,7 @@ static int bfs_link(struct dentry *old, struct inode *dir,
 	inc_nlink(inode);
 	inode->i_ctime = CURRENT_TIME_SEC;
 	mark_inode_dirty(inode);
-	atomic_inc(&inode->i_count);
+	ihold(inode);
 	d_instantiate(new, inode);
 	mutex_unlock(&info->bfs_lock);
 	return 0;
diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index 88b9a3ff44e..eb67edd0f8e 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -70,7 +70,6 @@ static int bfs_get_block(struct inode *inode, sector_t block,
 	struct super_block *sb = inode->i_sb;
 	struct bfs_sb_info *info = BFS_SB(sb);
 	struct bfs_inode_info *bi = BFS_I(inode);
-	struct buffer_head *sbh = info->si_sbh;
 
 	phys = bi->i_sblock + block;
 	if (!create) {
@@ -112,7 +111,6 @@ static int bfs_get_block(struct inode *inode, sector_t block,
 		info->si_freeb -= phys - bi->i_eblock;
 		info->si_lf_eblk = bi->i_eblock = phys;
 		mark_inode_dirty(inode);
-		mark_buffer_dirty(sbh);
 		err = 0;
 		goto out;
 	}
@@ -147,7 +145,6 @@ static int bfs_get_block(struct inode *inode, sector_t block,
 	 */
 	info->si_freeb -= bi->i_eblock - bi->i_sblock + 1 - inode->i_blocks;
 	mark_inode_dirty(inode);
-	mark_buffer_dirty(sbh);
 	map_bh(bh_result, sb, phys);
 out:
 	mutex_unlock(&info->bfs_lock);
@@ -168,9 +165,17 @@ static int bfs_write_begin(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
 			struct page **pagep, void **fsdata)
 {
-	*pagep = NULL;
-	return block_write_begin(file, mapping, pos, len, flags,
-					pagep, fsdata, bfs_get_block);
+	int ret;
+
+	ret = block_write_begin(mapping, pos, len, flags, pagep,
+				bfs_get_block);
+	if (unlikely(ret)) {
+		loff_t isize = mapping->host->i_size;
+		if (pos + len > isize)
+			vmtruncate(mapping->host, isize);
+	}
+
+	return ret;
 }
 
 static sector_t bfs_bmap(struct address_space *mapping, sector_t block)
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index f22a7d3dc36..76db6d7d49b 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -12,7 +12,6 @@
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/fs.h>
-#include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include <linux/vfs.h>
 #include <linux/writeback.h>
@@ -31,7 +30,6 @@ MODULE_LICENSE("GPL");
 #define dprintf(x...)
 #endif
 
-static void bfs_write_super(struct super_block *s);
 void dump_imap(const char *prefix, struct super_block *s);
 
 struct inode *bfs_iget(struct super_block *sb, unsigned long ino)
@@ -99,6 +97,24 @@ error:
 	return ERR_PTR(-EIO);
 }
 
+static struct bfs_inode *find_inode(struct super_block *sb, u16 ino, struct buffer_head **p)
+{
+	if ((ino < BFS_ROOT_INO) || (ino > BFS_SB(sb)->si_lasti)) {
+		printf("Bad inode number %s:%08x\n", sb->s_id, ino);
+		return ERR_PTR(-EIO);
+	}
+
+	ino -= BFS_ROOT_INO;
+
+	*p = sb_bread(sb, 1 + ino / BFS_INODES_PER_BLOCK);
+	if (!*p) {
+		printf("Unable to read inode %s:%08x\n", sb->s_id, ino);
+		return ERR_PTR(-EIO);
+	}
+
+	return (struct bfs_inode *)(*p)->b_data +  ino % BFS_INODES_PER_BLOCK;
+}
+
 static int bfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	struct bfs_sb_info *info = BFS_SB(inode->i_sb);
@@ -106,28 +122,15 @@ static int bfs_write_inode(struct inode *inode, struct writeback_control *wbc)
         unsigned long i_sblock;
 	struct bfs_inode *di;
 	struct buffer_head *bh;
-	int block, off;
 	int err = 0;
 
         dprintf("ino=%08x\n", ino);
 
-	if ((ino < BFS_ROOT_INO) || (ino > BFS_SB(inode->i_sb)->si_lasti)) {
-		printf("Bad inode number %s:%08x\n", inode->i_sb->s_id, ino);
-		return -EIO;
-	}
+	di = find_inode(inode->i_sb, ino, &bh);
+	if (IS_ERR(di))
+		return PTR_ERR(di);
 
 	mutex_lock(&info->bfs_lock);
-	block = (ino - BFS_ROOT_INO) / BFS_INODES_PER_BLOCK + 1;
-	bh = sb_bread(inode->i_sb, block);
-	if (!bh) {
-		printf("Unable to read inode %s:%08x\n",
-				inode->i_sb->s_id, ino);
-		mutex_unlock(&info->bfs_lock);
-		return -EIO;
-	}
-
-	off = (ino - BFS_ROOT_INO) % BFS_INODES_PER_BLOCK;
-	di = (struct bfs_inode *)bh->b_data + off;
 
 	if (ino == BFS_ROOT_INO)
 		di->i_vtype = cpu_to_le32(BFS_VDIR);
@@ -158,12 +161,11 @@ static int bfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 	return err;
 }
 
-static void bfs_delete_inode(struct inode *inode)
+static void bfs_evict_inode(struct inode *inode)
 {
 	unsigned long ino = inode->i_ino;
 	struct bfs_inode *di;
 	struct buffer_head *bh;
-	int block, off;
 	struct super_block *s = inode->i_sb;
 	struct bfs_sb_info *info = BFS_SB(s);
 	struct bfs_inode_info *bi = BFS_I(inode);
@@ -171,28 +173,19 @@ static void bfs_delete_inode(struct inode *inode)
 	dprintf("ino=%08lx\n", ino);
 
 	truncate_inode_pages(&inode->i_data, 0);
+	invalidate_inode_buffers(inode);
+	end_writeback(inode);
 
-	if ((ino < BFS_ROOT_INO) || (ino > info->si_lasti)) {
-		printf("invalid ino=%08lx\n", ino);
+	if (inode->i_nlink)
 		return;
-	}
-	
-	inode->i_size = 0;
-	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
-	mutex_lock(&info->bfs_lock);
-	mark_inode_dirty(inode);
 
-	block = (ino - BFS_ROOT_INO) / BFS_INODES_PER_BLOCK + 1;
-	bh = sb_bread(s, block);
-	if (!bh) {
-		printf("Unable to read inode %s:%08lx\n",
-					inode->i_sb->s_id, ino);
-		mutex_unlock(&info->bfs_lock);
+	di = find_inode(s, inode->i_ino, &bh);
+	if (IS_ERR(di))
 		return;
-	}
-	off = (ino - BFS_ROOT_INO) % BFS_INODES_PER_BLOCK;
-	di = (struct bfs_inode *)bh->b_data + off;
-	memset((void *)di, 0, sizeof(struct bfs_inode));
+
+	mutex_lock(&info->bfs_lock);
+	/* clear on-disk inode */
+	memset(di, 0, sizeof(struct bfs_inode));
 	mark_buffer_dirty(bh);
 	brelse(bh);
 
@@ -209,32 +202,9 @@ static void bfs_delete_inode(struct inode *inode)
 	 * "last block of the last file" even if there is no
 	 * real file there, saves us 1 gap.
 	 */
-	if (info->si_lf_eblk == bi->i_eblock) {
+	if (info->si_lf_eblk == bi->i_eblock)
 		info->si_lf_eblk = bi->i_sblock - 1;
-		mark_buffer_dirty(info->si_sbh);
-	}
-	mutex_unlock(&info->bfs_lock);
-	clear_inode(inode);
-}
-
-static int bfs_sync_fs(struct super_block *sb, int wait)
-{
-	struct bfs_sb_info *info = BFS_SB(sb);
-
-	mutex_lock(&info->bfs_lock);
-	mark_buffer_dirty(info->si_sbh);
-	sb->s_dirt = 0;
 	mutex_unlock(&info->bfs_lock);
-
-	return 0;
-}
-
-static void bfs_write_super(struct super_block *sb)
-{
-	if (!(sb->s_flags & MS_RDONLY))
-		bfs_sync_fs(sb, 1);
-	else
-		sb->s_dirt = 0;
 }
 
 static void bfs_put_super(struct super_block *s)
@@ -244,18 +214,10 @@ static void bfs_put_super(struct super_block *s)
 	if (!info)
 		return;
 
-	lock_kernel();
-
-	if (s->s_dirt)
-		bfs_write_super(s);
-
-	brelse(info->si_sbh);
 	mutex_destroy(&info->bfs_lock);
 	kfree(info->si_imap);
 	kfree(info);
 	s->s_fs_info = NULL;
-
-	unlock_kernel();
 }
 
 static int bfs_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -319,10 +281,8 @@ static const struct super_operations bfs_sops = {
 	.alloc_inode	= bfs_alloc_inode,
 	.destroy_inode	= bfs_destroy_inode,
 	.write_inode	= bfs_write_inode,
-	.delete_inode	= bfs_delete_inode,
+	.evict_inode	= bfs_evict_inode,
 	.put_super	= bfs_put_super,
-	.write_super	= bfs_write_super,
-	.sync_fs	= bfs_sync_fs,
 	.statfs		= bfs_statfs,
 };
 
@@ -349,7 +309,7 @@ void dump_imap(const char *prefix, struct super_block *s)
 
 static int bfs_fill_super(struct super_block *s, void *data, int silent)
 {
-	struct buffer_head *bh;
+	struct buffer_head *bh, *sbh;
 	struct bfs_super_block *bfs_sb;
 	struct inode *inode;
 	unsigned i, imap_len;
@@ -365,10 +325,10 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
 
 	sb_set_blocksize(s, BFS_BSIZE);
 
-	info->si_sbh = sb_bread(s, 0);
-	if (!info->si_sbh)
+	sbh = sb_bread(s, 0);
+	if (!sbh)
 		goto out;
-	bfs_sb = (struct bfs_super_block *)info->si_sbh->b_data;
+	bfs_sb = (struct bfs_super_block *)sbh->b_data;
 	if (le32_to_cpu(bfs_sb->s_magic) != BFS_MAGIC) {
 		if (!silent)
 			printf("No BFS filesystem on %s (magic=%08x)\n", 
@@ -472,10 +432,7 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
 			info->si_lf_eblk = eblock;
 	}
 	brelse(bh);
-	if (!(s->s_flags & MS_RDONLY)) {
-		mark_buffer_dirty(info->si_sbh);
-		s->s_dirt = 1;
-	} 
+	brelse(sbh);
 	dump_imap("read_super", s);
 	return 0;
 
@@ -485,7 +442,7 @@ out3:
 out2:
 	kfree(info->si_imap);
 out1:
-	brelse(info->si_sbh);
+	brelse(sbh);
 out:
 	mutex_destroy(&info->bfs_lock);
 	kfree(info);
@@ -493,16 +450,16 @@ out:
 	return ret;
 }
 
-static int bfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *bfs_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, bfs_fill_super, mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, bfs_fill_super);
 }
 
 static struct file_system_type bfs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "bfs",
-	.get_sb		= bfs_get_sb,
+	.mount		= bfs_mount,
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index f96eff04e11..a6395bdb26a 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -134,10 +134,6 @@ static int aout_core_dump(struct coredump_params *cprm)
 		if (!dump_write(file, dump_start, dump_size))
 			goto end_coredump;
 	}
-/* Finally dump the task struct.  Not be used by gdb, but could be useful */
-	set_fs(KERNEL_DS);
-	if (!dump_write(file, current, sizeof(*current)))
-		goto end_coredump;
 end_coredump:
 	set_fs(fs);
 	return has_dumped;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 535e763ab1a..6884e198e0c 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -800,7 +800,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 			 * default mmap base, as well as whatever program they
 			 * might try to exec.  This is because the brk will
 			 * follow the loader, and is not movable.  */
-#ifdef CONFIG_X86
+#if defined(CONFIG_X86) || defined(CONFIG_ARM)
 			load_bias = 0;
 #else
 			load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr);
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index c4e83537ead..1befe2ec818 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -108,7 +108,7 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 	Node *fmt;
 	struct file * interp_file = NULL;
 	char iname[BINPRM_BUF_SIZE];
-	char *iname_addr = iname;
+	const char *iname_addr = iname;
 	int retval;
 	int fd_binary = -1;
 
@@ -495,6 +495,7 @@ static struct inode *bm_get_inode(struct super_block *sb, int mode)
 	struct inode * inode = new_inode(sb);
 
 	if (inode) {
+		inode->i_ino = get_next_ino();
 		inode->i_mode = mode;
 		inode->i_atime = inode->i_mtime = inode->i_ctime =
 			current_fs_time(inode->i_sb);
@@ -502,8 +503,9 @@ static struct inode *bm_get_inode(struct super_block *sb, int mode)
 	return inode;
 }
 
-static void bm_clear_inode(struct inode *inode)
+static void bm_evict_inode(struct inode *inode)
 {
+	end_writeback(inode);
 	kfree(inode->i_private);
 }
 
@@ -575,6 +577,7 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer,
 static const struct file_operations bm_entry_operations = {
 	.read		= bm_entry_read,
 	.write		= bm_entry_write,
+	.llseek		= default_llseek,
 };
 
 /* /register */
@@ -642,6 +645,7 @@ out:
 
 static const struct file_operations bm_register_operations = {
 	.write		= bm_register_write,
+	.llseek		= noop_llseek,
 };
 
 /* /status */
@@ -679,13 +683,14 @@ static ssize_t bm_status_write(struct file * file, const char __user * buffer,
 static const struct file_operations bm_status_operations = {
 	.read		= bm_status_read,
 	.write		= bm_status_write,
+	.llseek		= default_llseek,
 };
 
 /* Superblock handling */
 
 static const struct super_operations s_ops = {
 	.statfs		= simple_statfs,
-	.clear_inode	= bm_clear_inode,
+	.evict_inode	= bm_evict_inode,
 };
 
 static int bm_fill_super(struct super_block * sb, void * data, int silent)
@@ -701,10 +706,10 @@ static int bm_fill_super(struct super_block * sb, void * data, int silent)
 	return err;
 }
 
-static int bm_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *bm_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_single(fs_type, flags, data, bm_fill_super, mnt);
+	return mount_single(fs_type, flags, data, bm_fill_super);
 }
 
 static struct linux_binfmt misc_format = {
@@ -715,7 +720,7 @@ static struct linux_binfmt misc_format = {
 static struct file_system_type bm_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "binfmt_misc",
-	.get_sb		= bm_get_sb,
+	.mount		= bm_mount,
 	.kill_sb	= kill_litter_super,
 };
 
@@ -723,7 +728,7 @@ static int __init init_misc_binfmt(void)
 {
 	int err = register_filesystem(&bm_fs_type);
 	if (!err) {
-		err = register_binfmt(&misc_format);
+		err = insert_binfmt(&misc_format);
 		if (err)
 			unregister_filesystem(&bm_fs_type);
 	}
diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c
index aca9d55afb2..396a9884591 100644
--- a/fs/binfmt_script.c
+++ b/fs/binfmt_script.c
@@ -16,7 +16,8 @@
 
 static int load_script(struct linux_binprm *bprm,struct pt_regs *regs)
 {
-	char *cp, *i_name, *i_arg;
+	const char *i_arg, *i_name;
+	char *cp;
 	struct file *file;
 	char interp[BINPRM_BUF_SIZE];
 	int retval;
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 612a5c38d3c..4d0ff5ee27b 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -413,10 +413,10 @@ int bio_integrity_prep(struct bio *bio)
 
 	/* Allocate kernel buffer for protection data */
 	len = sectors * blk_integrity_tuple_size(bi);
-	buf = kmalloc(len, GFP_NOIO | __GFP_NOFAIL | q->bounce_gfp);
+	buf = kmalloc(len, GFP_NOIO | q->bounce_gfp);
 	if (unlikely(buf == NULL)) {
 		printk(KERN_ERR "could not allocate integrity buffer\n");
-		return -EIO;
+		return -ENOMEM;
 	}
 
 	end = (((unsigned long) buf) + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
diff --git a/fs/bio.c b/fs/bio.c
index e7bf6ca64dc..8abb2dfb2e7 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -843,7 +843,8 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
 	if (!bio)
 		goto out_bmd;
 
-	bio->bi_rw |= (!write_to_vm << BIO_RW);
+	if (!write_to_vm)
+		bio->bi_rw |= REQ_WRITE;
 
 	ret = 0;
 
@@ -1024,7 +1025,7 @@ static struct bio *__bio_map_user_iov(struct request_queue *q,
 	 * set data direction, and check if mapped pages need bouncing
 	 */
 	if (!write_to_vm)
-		bio->bi_rw |= (1 << BIO_RW);
+		bio->bi_rw |= REQ_WRITE;
 
 	bio->bi_bdev = bdev;
 	bio->bi_flags |= (1 << BIO_USER_MAPPED);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 99d6af81174..06e8ff12b97 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -48,6 +48,21 @@ inline struct block_device *I_BDEV(struct inode *inode)
 
 EXPORT_SYMBOL(I_BDEV);
 
+/*
+ * move the inode from it's current bdi to the a new bdi. if the inode is dirty
+ * we need to move it onto the dirty list of @dst so that the inode is always
+ * on the right list.
+ */
+static void bdev_inode_switch_bdi(struct inode *inode,
+			struct backing_dev_info *dst)
+{
+	spin_lock(&inode_lock);
+	inode->i_data.backing_dev_info = dst;
+	if (inode->i_state & I_DIRTY)
+		list_move(&inode->i_wb_list, &dst->wb.b_dirty);
+	spin_unlock(&inode_lock);
+}
+
 static sector_t max_block(struct block_device *bdev)
 {
 	sector_t retval = ~((sector_t)0);
@@ -172,9 +187,8 @@ blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
 
-	return blockdev_direct_IO_no_locking_newtrunc(rw, iocb, inode,
-				I_BDEV(inode), iov, offset, nr_segs,
-				blkdev_get_blocks, NULL);
+	return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iov, offset,
+				    nr_segs, blkdev_get_blocks, NULL, NULL, 0);
 }
 
 int __sync_blockdev(struct block_device *bdev, int wait)
@@ -309,9 +323,8 @@ static int blkdev_write_begin(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
 			struct page **pagep, void **fsdata)
 {
-	*pagep = NULL;
-	return block_write_begin_newtrunc(file, mapping, pos, len, flags,
-				pagep, fsdata, blkdev_get_block);
+	return block_write_begin(mapping, pos, len, flags, pagep,
+				 blkdev_get_block);
 }
 
 static int blkdev_write_end(struct file *file, struct address_space *mapping,
@@ -372,7 +385,7 @@ int blkdev_fsync(struct file *filp, int datasync)
 	 */
 	mutex_unlock(&bd_inode->i_mutex);
 
-	error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL, BLKDEV_IFL_WAIT);
+	error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL);
 	if (error == -EOPNOTSUPP)
 		error = 0;
 
@@ -428,10 +441,13 @@ static inline void __bd_forget(struct inode *inode)
 	inode->i_mapping = &inode->i_data;
 }
 
-static void bdev_clear_inode(struct inode *inode)
+static void bdev_evict_inode(struct inode *inode)
 {
 	struct block_device *bdev = &BDEV_I(inode)->bdev;
 	struct list_head *p;
+	truncate_inode_pages(&inode->i_data, 0);
+	invalidate_inode_buffers(inode); /* is it needed here? */
+	end_writeback(inode);
 	spin_lock(&bdev_lock);
 	while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) {
 		__bd_forget(list_entry(p, struct inode, i_devices));
@@ -445,18 +461,18 @@ static const struct super_operations bdev_sops = {
 	.alloc_inode = bdev_alloc_inode,
 	.destroy_inode = bdev_destroy_inode,
 	.drop_inode = generic_delete_inode,
-	.clear_inode = bdev_clear_inode,
+	.evict_inode = bdev_evict_inode,
 };
 
-static int bd_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *bd_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_pseudo(fs_type, "bdev:", &bdev_sops, 0x62646576, mnt);
+	return mount_pseudo(fs_type, "bdev:", &bdev_sops, 0x62646576);
 }
 
 static struct file_system_type bd_type = {
 	.name		= "bdev",
-	.get_sb		= bd_get_sb,
+	.mount		= bd_mount,
 	.kill_sb	= kill_anon_super,
 };
 
@@ -549,7 +565,7 @@ EXPORT_SYMBOL(bdget);
  */
 struct block_device *bdgrab(struct block_device *bdev)
 {
-	atomic_inc(&bdev->bd_inode->i_count);
+	ihold(bdev->bd_inode);
 	return bdev;
 }
 
@@ -579,7 +595,7 @@ static struct block_device *bd_acquire(struct inode *inode)
 	spin_lock(&bdev_lock);
 	bdev = inode->i_bdev;
 	if (bdev) {
-		atomic_inc(&bdev->bd_inode->i_count);
+		ihold(bdev->bd_inode);
 		spin_unlock(&bdev_lock);
 		return bdev;
 	}
@@ -590,12 +606,12 @@ static struct block_device *bd_acquire(struct inode *inode)
 		spin_lock(&bdev_lock);
 		if (!inode->i_bdev) {
 			/*
-			 * We take an additional bd_inode->i_count for inode,
+			 * We take an additional reference to bd_inode,
 			 * and it's released in clear_inode() of inode.
 			 * So, we can access it via ->i_mapping always
 			 * without igrab().
 			 */
-			atomic_inc(&bdev->bd_inode->i_count);
+			ihold(bdev->bd_inode);
 			inode->i_bdev = bdev;
 			inode->i_mapping = bdev->bd_inode->i_mapping;
 			list_add(&inode->i_devices, &bdev->bd_inodes);
@@ -681,8 +697,8 @@ retry:
 	if (!bd_may_claim(bdev, whole, holder))
 		return -EBUSY;
 
-	/* if someone else is claiming, wait for it to finish */
-	if (whole->bd_claiming && whole->bd_claiming != holder) {
+	/* if claiming is already in progress, wait for it to finish */
+	if (whole->bd_claiming) {
 		wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0);
 		DEFINE_WAIT(wait);
 
@@ -1339,19 +1355,20 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 	/*
 	 * hooks: /n/, see "layering violations".
 	 */
-	ret = devcgroup_inode_permission(bdev->bd_inode, perm);
-	if (ret != 0) {
-		bdput(bdev);
-		return ret;
+	if (!for_part) {
+		ret = devcgroup_inode_permission(bdev->bd_inode, perm);
+		if (ret != 0) {
+			bdput(bdev);
+			return ret;
+		}
 	}
 
-	lock_kernel();
  restart:
 
 	ret = -ENXIO;
 	disk = get_gendisk(bdev->bd_dev, &partno);
 	if (!disk)
-		goto out_unlock_kernel;
+		goto out;
 
 	mutex_lock_nested(&bdev->bd_mutex, for_part);
 	if (!bdev->bd_openers) {
@@ -1388,7 +1405,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 				bdi = blk_get_backing_dev_info(bdev);
 				if (bdi == NULL)
 					bdi = &default_backing_dev_info;
-				bdev->bd_inode->i_data.backing_dev_info = bdi;
+				bdev_inode_switch_bdi(bdev->bd_inode, bdi);
 			}
 			if (bdev->bd_invalidated)
 				rescan_partitions(disk, bdev);
@@ -1403,8 +1420,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 			if (ret)
 				goto out_clear;
 			bdev->bd_contains = whole;
-			bdev->bd_inode->i_data.backing_dev_info =
-			   whole->bd_inode->i_data.backing_dev_info;
+			bdev_inode_switch_bdi(bdev->bd_inode,
+				whole->bd_inode->i_data.backing_dev_info);
 			bdev->bd_part = disk_get_part(disk, partno);
 			if (!(disk->flags & GENHD_FL_UP) ||
 			    !bdev->bd_part || !bdev->bd_part->nr_sects) {
@@ -1431,22 +1448,19 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 	if (for_part)
 		bdev->bd_part_count++;
 	mutex_unlock(&bdev->bd_mutex);
-	unlock_kernel();
 	return 0;
 
  out_clear:
 	disk_put_part(bdev->bd_part);
 	bdev->bd_disk = NULL;
 	bdev->bd_part = NULL;
-	bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info;
+	bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info);
 	if (bdev != bdev->bd_contains)
 		__blkdev_put(bdev->bd_contains, mode, 1);
 	bdev->bd_contains = NULL;
  out_unlock_bdev:
 	mutex_unlock(&bdev->bd_mutex);
- out_unlock_kernel:
-	unlock_kernel();
-
+ out:
 	if (disk)
 		module_put(disk->fops->owner);
 	put_disk(disk);
@@ -1515,7 +1529,6 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
 	struct block_device *victim = NULL;
 
 	mutex_lock_nested(&bdev->bd_mutex, for_part);
-	lock_kernel();
 	if (for_part)
 		bdev->bd_part_count--;
 
@@ -1535,12 +1548,12 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
 		disk_put_part(bdev->bd_part);
 		bdev->bd_part = NULL;
 		bdev->bd_disk = NULL;
-		bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info;
+		bdev_inode_switch_bdi(bdev->bd_inode,
+					&default_backing_dev_info);
 		if (bdev != bdev->bd_contains)
 			victim = bdev->bd_contains;
 		bdev->bd_contains = NULL;
 	}
-	unlock_kernel();
 	mutex_unlock(&bdev->bd_mutex);
 	bdput(bdev);
 	if (victim)
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 396039b3a8a..7845d1f7d1d 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -163,7 +163,6 @@ fail:
  */
 static void end_compressed_bio_read(struct bio *bio, int err)
 {
-	struct extent_io_tree *tree;
 	struct compressed_bio *cb = bio->bi_private;
 	struct inode *inode;
 	struct page *page;
@@ -187,7 +186,6 @@ static void end_compressed_bio_read(struct bio *bio, int err)
 	/* ok, we're the last bio for this extent, lets start
 	 * the decompression.
 	 */
-	tree = &BTRFS_I(inode)->io_tree;
 	ret = btrfs_zlib_decompress_biovec(cb->compressed_pages,
 					cb->start,
 					cb->orig_bio->bi_io_vec,
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index c3df14ce2cc..9ac17159925 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -200,7 +200,6 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 		      struct extent_buffer **cow_ret, u64 new_root_objectid)
 {
 	struct extent_buffer *cow;
-	u32 nritems;
 	int ret = 0;
 	int level;
 	struct btrfs_disk_key disk_key;
@@ -210,7 +209,6 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 	WARN_ON(root->ref_cows && trans->transid != root->last_trans);
 
 	level = btrfs_header_level(buf);
-	nritems = btrfs_header_nritems(buf);
 	if (level == 0)
 		btrfs_item_key(buf, &disk_key, 0);
 	else
@@ -1008,7 +1006,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 	int wret;
 	int pslot;
 	int orig_slot = path->slots[level];
-	int err_on_enospc = 0;
 	u64 orig_ptr;
 
 	if (level == 0)
@@ -1071,8 +1068,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 	    BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
 		return 0;
 
-	if (btrfs_header_nritems(mid) < 2)
-		err_on_enospc = 1;
+	btrfs_header_nritems(mid);
 
 	left = read_node_slot(root, parent, pslot - 1);
 	if (left) {
@@ -1103,8 +1099,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		wret = push_node_left(trans, root, left, mid, 1);
 		if (wret < 0)
 			ret = wret;
-		if (btrfs_header_nritems(mid) < 2)
-			err_on_enospc = 1;
+		btrfs_header_nritems(mid);
 	}
 
 	/*
@@ -1224,14 +1219,12 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
 	int wret;
 	int pslot;
 	int orig_slot = path->slots[level];
-	u64 orig_ptr;
 
 	if (level == 0)
 		return 1;
 
 	mid = path->nodes[level];
 	WARN_ON(btrfs_header_generation(mid) != trans->transid);
-	orig_ptr = btrfs_node_blockptr(mid, orig_slot);
 
 	if (level < BTRFS_MAX_LEVEL - 1)
 		parent = path->nodes[level + 1];
@@ -1577,13 +1570,33 @@ read_block_for_search(struct btrfs_trans_handle *trans,
 	blocksize = btrfs_level_size(root, level - 1);
 
 	tmp = btrfs_find_tree_block(root, blocknr, blocksize);
-	if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
-		/*
-		 * we found an up to date block without sleeping, return
-		 * right away
-		 */
-		*eb_ret = tmp;
-		return 0;
+	if (tmp) {
+		if (btrfs_buffer_uptodate(tmp, 0)) {
+			if (btrfs_buffer_uptodate(tmp, gen)) {
+				/*
+				 * we found an up to date block without
+				 * sleeping, return
+				 * right away
+				 */
+				*eb_ret = tmp;
+				return 0;
+			}
+			/* the pages were up to date, but we failed
+			 * the generation number check.  Do a full
+			 * read for the generation number that is correct.
+			 * We must do this without dropping locks so
+			 * we can trust our generation number
+			 */
+			free_extent_buffer(tmp);
+			tmp = read_tree_block(root, blocknr, blocksize, gen);
+			if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
+				*eb_ret = tmp;
+				return 0;
+			}
+			free_extent_buffer(tmp);
+			btrfs_release_path(NULL, p);
+			return -EIO;
+		}
 	}
 
 	/*
@@ -1596,8 +1609,7 @@ read_block_for_search(struct btrfs_trans_handle *trans,
 	btrfs_unlock_up_safe(p, level + 1);
 	btrfs_set_path_blocking(p);
 
-	if (tmp)
-		free_extent_buffer(tmp);
+	free_extent_buffer(tmp);
 	if (p->reada)
 		reada_for_search(root, p, level, slot, key->objectid);
 
@@ -2548,7 +2560,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_disk_key disk_key;
 	struct extent_buffer *right = path->nodes[0];
-	int slot;
 	int i;
 	int push_space = 0;
 	int push_items = 0;
@@ -2560,8 +2571,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
 	u32 this_item_size;
 	u32 old_left_item_size;
 
-	slot = path->slots[1];
-
 	if (empty)
 		nr = min(right_nritems, max_slot);
 	else
@@ -3330,7 +3339,6 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans,
 {
 	int ret = 0;
 	int slot;
-	int slot_orig;
 	struct extent_buffer *leaf;
 	struct btrfs_item *item;
 	u32 nritems;
@@ -3340,7 +3348,6 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans,
 	unsigned int size_diff;
 	int i;
 
-	slot_orig = path->slots[0];
 	leaf = path->nodes[0];
 	slot = path->slots[0];
 
@@ -3445,7 +3452,6 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans,
 {
 	int ret = 0;
 	int slot;
-	int slot_orig;
 	struct extent_buffer *leaf;
 	struct btrfs_item *item;
 	u32 nritems;
@@ -3454,7 +3460,6 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans,
 	unsigned int old_size;
 	int i;
 
-	slot_orig = path->slots[0];
 	leaf = path->nodes[0];
 
 	nritems = btrfs_header_nritems(leaf);
@@ -3787,7 +3792,6 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
 			    struct btrfs_key *cpu_key, u32 *data_size,
 			    int nr)
 {
-	struct extent_buffer *leaf;
 	int ret = 0;
 	int slot;
 	int i;
@@ -3804,7 +3808,6 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
 	if (ret < 0)
 		goto out;
 
-	leaf = path->nodes[0];
 	slot = path->slots[0];
 	BUG_ON(slot < 0);
 
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 29c20092847..8db9234f6b4 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -99,6 +99,9 @@ struct btrfs_ordered_sum;
  */
 #define BTRFS_EXTENT_CSUM_OBJECTID -10ULL
 
+/* For storing free space cache */
+#define BTRFS_FREE_SPACE_OBJECTID -11ULL
+
 /* dummy objectid represents multiple objectids */
 #define BTRFS_MULTIPLE_OBJECTIDS -255ULL
 
@@ -265,6 +268,22 @@ struct btrfs_chunk {
 	/* additional stripes go here */
 } __attribute__ ((__packed__));
 
+#define BTRFS_FREE_SPACE_EXTENT	1
+#define BTRFS_FREE_SPACE_BITMAP	2
+
+struct btrfs_free_space_entry {
+	__le64 offset;
+	__le64 bytes;
+	u8 type;
+} __attribute__ ((__packed__));
+
+struct btrfs_free_space_header {
+	struct btrfs_disk_key location;
+	__le64 generation;
+	__le64 num_entries;
+	__le64 num_bitmaps;
+} __attribute__ ((__packed__));
+
 static inline unsigned long btrfs_chunk_item_size(int num_stripes)
 {
 	BUG_ON(num_stripes == 0);
@@ -365,8 +384,10 @@ struct btrfs_super_block {
 
 	char label[BTRFS_LABEL_SIZE];
 
+	__le64 cache_generation;
+
 	/* future expansion */
-	__le64 reserved[32];
+	__le64 reserved[31];
 	u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
 } __attribute__ ((__packed__));
 
@@ -375,13 +396,15 @@ struct btrfs_super_block {
  * ones specified below then we will fail to mount
  */
 #define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF	(1ULL << 0)
-#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL	(2ULL << 0)
+#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL	(1ULL << 1)
+#define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS	(1ULL << 2)
 
 #define BTRFS_FEATURE_COMPAT_SUPP		0ULL
 #define BTRFS_FEATURE_COMPAT_RO_SUPP		0ULL
-#define BTRFS_FEATURE_INCOMPAT_SUPP		\
-	(BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF |	\
-	 BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)
+#define BTRFS_FEATURE_INCOMPAT_SUPP			\
+	(BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF |		\
+	 BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL |	\
+	 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
 
 /*
  * A leaf is full of items. offset and size tell us where to find
@@ -675,7 +698,8 @@ struct btrfs_block_group_item {
 struct btrfs_space_info {
 	u64 flags;
 
-	u64 total_bytes;	/* total bytes in the space */
+	u64 total_bytes;	/* total bytes in the space,
+				   this doesn't take mirrors into account */
 	u64 bytes_used;		/* total bytes used,
 				   this does't take mirrors into account */
 	u64 bytes_pinned;	/* total bytes pinned, will be freed when the
@@ -687,6 +711,8 @@ struct btrfs_space_info {
 	u64 bytes_may_use;	/* number of bytes that may be used for
 				   delalloc/allocations */
 	u64 disk_used;		/* total bytes used on disk */
+	u64 disk_total;		/* total bytes on disk, takes mirrors into
+				   account */
 
 	int full;		/* indicates that we cannot allocate any more
 				   chunks for this space */
@@ -750,6 +776,14 @@ enum btrfs_caching_type {
 	BTRFS_CACHE_FINISHED	= 2,
 };
 
+enum btrfs_disk_cache_state {
+	BTRFS_DC_WRITTEN	= 0,
+	BTRFS_DC_ERROR		= 1,
+	BTRFS_DC_CLEAR		= 2,
+	BTRFS_DC_SETUP		= 3,
+	BTRFS_DC_NEED_WRITE	= 4,
+};
+
 struct btrfs_caching_control {
 	struct list_head list;
 	struct mutex mutex;
@@ -763,6 +797,7 @@ struct btrfs_block_group_cache {
 	struct btrfs_key key;
 	struct btrfs_block_group_item item;
 	struct btrfs_fs_info *fs_info;
+	struct inode *inode;
 	spinlock_t lock;
 	u64 pinned;
 	u64 reserved;
@@ -773,8 +808,11 @@ struct btrfs_block_group_cache {
 	int extents_thresh;
 	int free_extents;
 	int total_bitmaps;
-	int ro;
-	int dirty;
+	int ro:1;
+	int dirty:1;
+	int iref:1;
+
+	int disk_cache_state;
 
 	/* cache tracking stuff */
 	int cached;
@@ -863,6 +901,7 @@ struct btrfs_fs_info {
 	struct btrfs_transaction *running_transaction;
 	wait_queue_head_t transaction_throttle;
 	wait_queue_head_t transaction_wait;
+	wait_queue_head_t transaction_blocked_wait;
 	wait_queue_head_t async_submit_wait;
 
 	struct btrfs_super_block super_copy;
@@ -949,6 +988,7 @@ struct btrfs_fs_info {
 	struct btrfs_workers endio_meta_workers;
 	struct btrfs_workers endio_meta_write_workers;
 	struct btrfs_workers endio_write_workers;
+	struct btrfs_workers endio_freespace_worker;
 	struct btrfs_workers submit_workers;
 	/*
 	 * fixup workers take dirty pages that didn't properly go through
@@ -1192,6 +1232,9 @@ struct btrfs_root {
 #define BTRFS_MOUNT_NOSSD		(1 << 9)
 #define BTRFS_MOUNT_DISCARD		(1 << 10)
 #define BTRFS_MOUNT_FORCE_COMPRESS      (1 << 11)
+#define BTRFS_MOUNT_SPACE_CACHE		(1 << 12)
+#define BTRFS_MOUNT_CLEAR_CACHE		(1 << 13)
+#define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14)
 
 #define btrfs_clear_opt(o, opt)		((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)		((o) |= BTRFS_MOUNT_##opt)
@@ -1665,6 +1708,27 @@ static inline void btrfs_set_dir_item_key(struct extent_buffer *eb,
 	write_eb_member(eb, item, struct btrfs_dir_item, location, key);
 }
 
+BTRFS_SETGET_FUNCS(free_space_entries, struct btrfs_free_space_header,
+		   num_entries, 64);
+BTRFS_SETGET_FUNCS(free_space_bitmaps, struct btrfs_free_space_header,
+		   num_bitmaps, 64);
+BTRFS_SETGET_FUNCS(free_space_generation, struct btrfs_free_space_header,
+		   generation, 64);
+
+static inline void btrfs_free_space_key(struct extent_buffer *eb,
+					struct btrfs_free_space_header *h,
+					struct btrfs_disk_key *key)
+{
+	read_eb_member(eb, h, struct btrfs_free_space_header, location, key);
+}
+
+static inline void btrfs_set_free_space_key(struct extent_buffer *eb,
+					    struct btrfs_free_space_header *h,
+					    struct btrfs_disk_key *key)
+{
+	write_eb_member(eb, h, struct btrfs_free_space_header, location, key);
+}
+
 /* struct btrfs_disk_key */
 BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key,
 			 objectid, 64);
@@ -1876,6 +1940,8 @@ BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block,
 			 incompat_flags, 64);
 BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block,
 			 csum_type, 16);
+BTRFS_SETGET_STACK_FUNCS(super_cache_generation, struct btrfs_super_block,
+			 cache_generation, 64);
 
 static inline int btrfs_super_csum_size(struct btrfs_super_block *s)
 {
@@ -1988,6 +2054,12 @@ static inline struct dentry *fdentry(struct file *file)
 	return file->f_path.dentry;
 }
 
+static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info)
+{
+	return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) &&
+		(space_info->flags & BTRFS_BLOCK_GROUP_DATA));
+}
+
 /* extent-tree.c */
 void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
@@ -2079,7 +2151,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
 void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
 int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root,
-				int num_items, int *retries);
+				int num_items);
 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root);
 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
@@ -2100,7 +2172,7 @@ void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
 int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root,
 			struct btrfs_block_rsv *block_rsv,
-			u64 num_bytes, int *retries);
+			u64 num_bytes);
 int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root,
 			  struct btrfs_block_rsv *block_rsv,
@@ -2115,6 +2187,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
 			     struct btrfs_block_group_cache *cache);
 int btrfs_set_block_group_rw(struct btrfs_root *root,
 			     struct btrfs_block_group_cache *cache);
+void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
 /* ctree.c */
 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
 		     int level, int *slot);
@@ -2373,7 +2446,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 			       u32 min_type);
 
 int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
-int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput);
+int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput,
+				   int sync);
 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
 			      struct extent_state **cached_state);
 int btrfs_writepages(struct address_space *mapping,
@@ -2389,13 +2463,13 @@ unsigned long btrfs_force_ra(struct address_space *mapping,
 			      pgoff_t offset, pgoff_t last_index);
 int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 int btrfs_readpage(struct file *file, struct page *page);
-void btrfs_delete_inode(struct inode *inode);
+void btrfs_evict_inode(struct inode *inode);
 void btrfs_put_inode(struct inode *inode);
 int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
 void btrfs_dirty_inode(struct inode *inode);
 struct inode *btrfs_alloc_inode(struct super_block *sb);
 void btrfs_destroy_inode(struct inode *inode);
-void btrfs_drop_inode(struct inode *inode);
+int btrfs_drop_inode(struct inode *inode);
 int btrfs_init_cachep(void);
 void btrfs_destroy_cachep(void);
 long btrfs_ioctl_trans_end(struct file *file);
@@ -2426,6 +2500,10 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root);
 int btrfs_prealloc_file_range(struct inode *inode, int mode,
 			      u64 start, u64 num_bytes, u64 min_size,
 			      loff_t actual_len, u64 *alloc_hint);
+int btrfs_prealloc_file_range_trans(struct inode *inode,
+				    struct btrfs_trans_handle *trans, int mode,
+				    u64 start, u64 num_bytes, u64 min_size,
+				    loff_t actual_len, u64 *alloc_hint);
 extern const struct dentry_operations btrfs_dentry_operations;
 
 /* ioctl.c */
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index e9103b3baa4..f0cad5ae5be 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -427,5 +427,5 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
 		ret = btrfs_truncate_item(trans, root, path,
 					  item_len - sub_item_len, 1);
 	}
-	return 0;
+	return ret;
 }
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 34f7c375567..fb827d0d718 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -338,7 +338,6 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
 	struct extent_io_tree *tree;
 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
 	u64 found_start;
-	int found_level;
 	unsigned long len;
 	struct extent_buffer *eb;
 	int ret;
@@ -369,8 +368,6 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
 		WARN_ON(1);
 		goto err;
 	}
-	found_level = btrfs_header_level(eb);
-
 	csum_tree_block(root, eb, 0);
 err:
 	free_extent_buffer(eb);
@@ -480,10 +477,13 @@ static void end_workqueue_bio(struct bio *bio, int err)
 	end_io_wq->work.func = end_workqueue_fn;
 	end_io_wq->work.flags = 0;
 
-	if (bio->bi_rw & (1 << BIO_RW)) {
-		if (end_io_wq->metadata)
+	if (bio->bi_rw & REQ_WRITE) {
+		if (end_io_wq->metadata == 1)
 			btrfs_queue_worker(&fs_info->endio_meta_write_workers,
 					   &end_io_wq->work);
+		else if (end_io_wq->metadata == 2)
+			btrfs_queue_worker(&fs_info->endio_freespace_worker,
+					   &end_io_wq->work);
 		else
 			btrfs_queue_worker(&fs_info->endio_write_workers,
 					   &end_io_wq->work);
@@ -497,6 +497,13 @@ static void end_workqueue_bio(struct bio *bio, int err)
 	}
 }
 
+/*
+ * For the metadata arg you want
+ *
+ * 0 - if data
+ * 1 - if normal metadta
+ * 2 - if writing to the free space cache area
+ */
 int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
 			int metadata)
 {
@@ -533,11 +540,9 @@ int btrfs_congested_async(struct btrfs_fs_info *info, int iodone)
 
 static void run_one_async_start(struct btrfs_work *work)
 {
-	struct btrfs_fs_info *fs_info;
 	struct async_submit_bio *async;
 
 	async = container_of(work, struct  async_submit_bio, work);
-	fs_info = BTRFS_I(async->inode)->root->fs_info;
 	async->submit_bio_start(async->inode, async->rw, async->bio,
 			       async->mirror_num, async->bio_flags,
 			       async->bio_offset);
@@ -604,7 +609,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 
 	atomic_inc(&fs_info->nr_async_submits);
 
-	if (rw & (1 << BIO_RW_SYNCIO))
+	if (rw & REQ_SYNC)
 		btrfs_set_work_high_prio(&async->work);
 
 	btrfs_queue_worker(&fs_info->workers, &async->work);
@@ -668,7 +673,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 					  bio, 1);
 	BUG_ON(ret);
 
-	if (!(rw & (1 << BIO_RW))) {
+	if (!(rw & REQ_WRITE)) {
 		/*
 		 * called for a read, do the setup so that checksum validation
 		 * can happen in the async kernel threads
@@ -850,12 +855,8 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
 				      u32 blocksize, u64 parent_transid)
 {
 	struct extent_buffer *buf = NULL;
-	struct inode *btree_inode = root->fs_info->btree_inode;
-	struct extent_io_tree *io_tree;
 	int ret;
 
-	io_tree = &BTRFS_I(btree_inode)->io_tree;
-
 	buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
 	if (!buf)
 		return NULL;
@@ -1377,7 +1378,6 @@ static int bio_ready_for_csum(struct bio *bio)
 	u64 start = 0;
 	struct page *page;
 	struct extent_io_tree *io_tree = NULL;
-	struct btrfs_fs_info *info = NULL;
 	struct bio_vec *bvec;
 	int i;
 	int ret;
@@ -1396,7 +1396,6 @@ static int bio_ready_for_csum(struct bio *bio)
 		buf_len = page->private >> 2;
 		start = page_offset(page) + bvec->bv_offset;
 		io_tree = &BTRFS_I(page->mapping->host)->io_tree;
-		info = BTRFS_I(page->mapping->host)->root->fs_info;
 	}
 	/* are we fully contained in this bio? */
 	if (buf_len <= length)
@@ -1427,7 +1426,7 @@ static void end_workqueue_fn(struct btrfs_work *work)
 	 * ram and up to date before trying to verify things.  For
 	 * blocksize <= pagesize, it is basically a noop
 	 */
-	if (!(bio->bi_rw & (1 << BIO_RW)) && end_io_wq->metadata &&
+	if (!(bio->bi_rw & REQ_WRITE) && end_io_wq->metadata &&
 	    !bio_ready_for_csum(bio)) {
 		btrfs_queue_worker(&fs_info->endio_meta_workers,
 				   &end_io_wq->work);
@@ -1680,12 +1679,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 	init_waitqueue_head(&fs_info->transaction_throttle);
 	init_waitqueue_head(&fs_info->transaction_wait);
+	init_waitqueue_head(&fs_info->transaction_blocked_wait);
 	init_waitqueue_head(&fs_info->async_submit_wait);
 
 	__setup_root(4096, 4096, 4096, 4096, tree_root,
 		     fs_info, BTRFS_ROOT_TREE_OBJECTID);
 
-
 	bh = btrfs_read_dev_super(fs_devices->latest_bdev);
 	if (!bh)
 		goto fail_iput;
@@ -1775,6 +1774,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
 			   fs_info->thread_pool_size,
 			   &fs_info->generic_worker);
+	btrfs_init_workers(&fs_info->endio_freespace_worker, "freespace-write",
+			   1, &fs_info->generic_worker);
 
 	/*
 	 * endios are largely parallel and should have a very
@@ -1795,6 +1796,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	btrfs_start_workers(&fs_info->endio_meta_workers, 1);
 	btrfs_start_workers(&fs_info->endio_meta_write_workers, 1);
 	btrfs_start_workers(&fs_info->endio_write_workers, 1);
+	btrfs_start_workers(&fs_info->endio_freespace_worker, 1);
 
 	fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
 	fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -1993,6 +1995,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	if (!(sb->s_flags & MS_RDONLY)) {
 		down_read(&fs_info->cleanup_work_sem);
 		btrfs_orphan_cleanup(fs_info->fs_root);
+		btrfs_orphan_cleanup(fs_info->tree_root);
 		up_read(&fs_info->cleanup_work_sem);
 	}
 
@@ -2035,6 +2038,7 @@ fail_sb_buffer:
 	btrfs_stop_workers(&fs_info->endio_meta_workers);
 	btrfs_stop_workers(&fs_info->endio_meta_write_workers);
 	btrfs_stop_workers(&fs_info->endio_write_workers);
+	btrfs_stop_workers(&fs_info->endio_freespace_worker);
 	btrfs_stop_workers(&fs_info->submit_workers);
 fail_iput:
 	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
@@ -2063,7 +2067,7 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 	if (uptodate) {
 		set_buffer_uptodate(bh);
 	} else {
-		if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
+		if (printk_ratelimit()) {
 			printk(KERN_WARNING "lost page write due to "
 					"I/O error on %s\n",
 				       bdevname(bh->b_bdev, b));
@@ -2200,21 +2204,10 @@ static int write_dev_supers(struct btrfs_device *device,
 			bh->b_end_io = btrfs_end_buffer_write_sync;
 		}
 
-		if (i == last_barrier && do_barriers && device->barriers) {
-			ret = submit_bh(WRITE_BARRIER, bh);
-			if (ret == -EOPNOTSUPP) {
-				printk("btrfs: disabling barriers on dev %s\n",
-				       device->name);
-				set_buffer_uptodate(bh);
-				device->barriers = 0;
-				/* one reference for submit_bh */
-				get_bh(bh);
-				lock_buffer(bh);
-				ret = submit_bh(WRITE_SYNC, bh);
-			}
-		} else {
+		if (i == last_barrier && do_barriers)
+			ret = submit_bh(WRITE_FLUSH_FUA, bh);
+		else
 			ret = submit_bh(WRITE_SYNC, bh);
-		}
 
 		if (ret)
 			errors++;
@@ -2421,6 +2414,7 @@ int close_ctree(struct btrfs_root *root)
 	fs_info->closing = 1;
 	smp_mb();
 
+	btrfs_put_block_group_cache(fs_info);
 	if (!(fs_info->sb->s_flags & MS_RDONLY)) {
 		ret =  btrfs_commit_super(root);
 		if (ret)
@@ -2467,6 +2461,7 @@ int close_ctree(struct btrfs_root *root)
 	btrfs_stop_workers(&fs_info->endio_meta_workers);
 	btrfs_stop_workers(&fs_info->endio_meta_write_workers);
 	btrfs_stop_workers(&fs_info->endio_write_workers);
+	btrfs_stop_workers(&fs_info->endio_freespace_worker);
 	btrfs_stop_workers(&fs_info->submit_workers);
 
 	btrfs_close_devices(fs_info->fs_devices);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 32d094002a5..0c097f3aec4 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -242,6 +242,12 @@ get_caching_control(struct btrfs_block_group_cache *cache)
 		return NULL;
 	}
 
+	/* We're loading it the fast way, so we don't have a caching_ctl. */
+	if (!cache->caching_ctl) {
+		spin_unlock(&cache->lock);
+		return NULL;
+	}
+
 	ctl = cache->caching_ctl;
 	atomic_inc(&ctl->count);
 	spin_unlock(&cache->lock);
@@ -421,7 +427,9 @@ err:
 	return 0;
 }
 
-static int cache_block_group(struct btrfs_block_group_cache *cache)
+static int cache_block_group(struct btrfs_block_group_cache *cache,
+			     struct btrfs_trans_handle *trans,
+			     int load_cache_only)
 {
 	struct btrfs_fs_info *fs_info = cache->fs_info;
 	struct btrfs_caching_control *caching_ctl;
@@ -432,6 +440,36 @@ static int cache_block_group(struct btrfs_block_group_cache *cache)
 	if (cache->cached != BTRFS_CACHE_NO)
 		return 0;
 
+	/*
+	 * We can't do the read from on-disk cache during a commit since we need
+	 * to have the normal tree locking.
+	 */
+	if (!trans->transaction->in_commit) {
+		spin_lock(&cache->lock);
+		if (cache->cached != BTRFS_CACHE_NO) {
+			spin_unlock(&cache->lock);
+			return 0;
+		}
+		cache->cached = BTRFS_CACHE_STARTED;
+		spin_unlock(&cache->lock);
+
+		ret = load_free_space_cache(fs_info, cache);
+
+		spin_lock(&cache->lock);
+		if (ret == 1) {
+			cache->cached = BTRFS_CACHE_FINISHED;
+			cache->last_byte_to_unpin = (u64)-1;
+		} else {
+			cache->cached = BTRFS_CACHE_NO;
+		}
+		spin_unlock(&cache->lock);
+		if (ret == 1)
+			return 0;
+	}
+
+	if (load_cache_only)
+		return 0;
+
 	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_KERNEL);
 	BUG_ON(!caching_ctl);
 
@@ -509,7 +547,7 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(found, head, list) {
-		if (found->flags == flags) {
+		if (found->flags & flags) {
 			rcu_read_unlock();
 			return found;
 		}
@@ -542,6 +580,15 @@ static u64 div_factor(u64 num, int factor)
 	return num;
 }
 
+static u64 div_factor_fine(u64 num, int factor)
+{
+	if (factor == 100)
+		return num;
+	num *= factor;
+	do_div(num, 100);
+	return num;
+}
+
 u64 btrfs_find_block_group(struct btrfs_root *root,
 			   u64 search_start, u64 search_hint, int owner)
 {
@@ -1695,8 +1742,7 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
 static void btrfs_issue_discard(struct block_device *bdev,
 				u64 start, u64 len)
 {
-	blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL,
-			BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
+	blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, 0);
 }
 
 static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
@@ -2688,6 +2734,109 @@ next_block_group(struct btrfs_root *root,
 	return cache;
 }
 
+static int cache_save_setup(struct btrfs_block_group_cache *block_group,
+			    struct btrfs_trans_handle *trans,
+			    struct btrfs_path *path)
+{
+	struct btrfs_root *root = block_group->fs_info->tree_root;
+	struct inode *inode = NULL;
+	u64 alloc_hint = 0;
+	int num_pages = 0;
+	int retries = 0;
+	int ret = 0;
+
+	/*
+	 * If this block group is smaller than 100 megs don't bother caching the
+	 * block group.
+	 */
+	if (block_group->key.offset < (100 * 1024 * 1024)) {
+		spin_lock(&block_group->lock);
+		block_group->disk_cache_state = BTRFS_DC_WRITTEN;
+		spin_unlock(&block_group->lock);
+		return 0;
+	}
+
+again:
+	inode = lookup_free_space_inode(root, block_group, path);
+	if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
+		ret = PTR_ERR(inode);
+		btrfs_release_path(root, path);
+		goto out;
+	}
+
+	if (IS_ERR(inode)) {
+		BUG_ON(retries);
+		retries++;
+
+		if (block_group->ro)
+			goto out_free;
+
+		ret = create_free_space_inode(root, trans, block_group, path);
+		if (ret)
+			goto out_free;
+		goto again;
+	}
+
+	/*
+	 * We want to set the generation to 0, that way if anything goes wrong
+	 * from here on out we know not to trust this cache when we load up next
+	 * time.
+	 */
+	BTRFS_I(inode)->generation = 0;
+	ret = btrfs_update_inode(trans, root, inode);
+	WARN_ON(ret);
+
+	if (i_size_read(inode) > 0) {
+		ret = btrfs_truncate_free_space_cache(root, trans, path,
+						      inode);
+		if (ret)
+			goto out_put;
+	}
+
+	spin_lock(&block_group->lock);
+	if (block_group->cached != BTRFS_CACHE_FINISHED) {
+		spin_unlock(&block_group->lock);
+		goto out_put;
+	}
+	spin_unlock(&block_group->lock);
+
+	num_pages = (int)div64_u64(block_group->key.offset, 1024 * 1024 * 1024);
+	if (!num_pages)
+		num_pages = 1;
+
+	/*
+	 * Just to make absolutely sure we have enough space, we're going to
+	 * preallocate 12 pages worth of space for each block group.  In
+	 * practice we ought to use at most 8, but we need extra space so we can
+	 * add our header and have a terminator between the extents and the
+	 * bitmaps.
+	 */
+	num_pages *= 16;
+	num_pages *= PAGE_CACHE_SIZE;
+
+	ret = btrfs_check_data_free_space(inode, num_pages);
+	if (ret)
+		goto out_put;
+
+	ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
+					      num_pages, num_pages,
+					      &alloc_hint);
+	btrfs_free_reserved_data_space(inode, num_pages);
+out_put:
+	iput(inode);
+out_free:
+	btrfs_release_path(root, path);
+out:
+	spin_lock(&block_group->lock);
+	if (ret)
+		block_group->disk_cache_state = BTRFS_DC_ERROR;
+	else
+		block_group->disk_cache_state = BTRFS_DC_SETUP;
+	spin_unlock(&block_group->lock);
+
+	return ret;
+}
+
 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root)
 {
@@ -2700,6 +2849,25 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 	if (!path)
 		return -ENOMEM;
 
+again:
+	while (1) {
+		cache = btrfs_lookup_first_block_group(root->fs_info, last);
+		while (cache) {
+			if (cache->disk_cache_state == BTRFS_DC_CLEAR)
+				break;
+			cache = next_block_group(root, cache);
+		}
+		if (!cache) {
+			if (last == 0)
+				break;
+			last = 0;
+			continue;
+		}
+		err = cache_save_setup(cache, trans, path);
+		last = cache->key.objectid + cache->key.offset;
+		btrfs_put_block_group(cache);
+	}
+
 	while (1) {
 		if (last == 0) {
 			err = btrfs_run_delayed_refs(trans, root,
@@ -2709,6 +2877,11 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 
 		cache = btrfs_lookup_first_block_group(root->fs_info, last);
 		while (cache) {
+			if (cache->disk_cache_state == BTRFS_DC_CLEAR) {
+				btrfs_put_block_group(cache);
+				goto again;
+			}
+
 			if (cache->dirty)
 				break;
 			cache = next_block_group(root, cache);
@@ -2720,6 +2893,8 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 			continue;
 		}
 
+		if (cache->disk_cache_state == BTRFS_DC_SETUP)
+			cache->disk_cache_state = BTRFS_DC_NEED_WRITE;
 		cache->dirty = 0;
 		last = cache->key.objectid + cache->key.offset;
 
@@ -2728,6 +2903,52 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 		btrfs_put_block_group(cache);
 	}
 
+	while (1) {
+		/*
+		 * I don't think this is needed since we're just marking our
+		 * preallocated extent as written, but just in case it can't
+		 * hurt.
+		 */
+		if (last == 0) {
+			err = btrfs_run_delayed_refs(trans, root,
+						     (unsigned long)-1);
+			BUG_ON(err);
+		}
+
+		cache = btrfs_lookup_first_block_group(root->fs_info, last);
+		while (cache) {
+			/*
+			 * Really this shouldn't happen, but it could if we
+			 * couldn't write the entire preallocated extent and
+			 * splitting the extent resulted in a new block.
+			 */
+			if (cache->dirty) {
+				btrfs_put_block_group(cache);
+				goto again;
+			}
+			if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
+				break;
+			cache = next_block_group(root, cache);
+		}
+		if (!cache) {
+			if (last == 0)
+				break;
+			last = 0;
+			continue;
+		}
+
+		btrfs_write_out_cache(root, trans, cache, path);
+
+		/*
+		 * If we didn't have an error then the cache state is still
+		 * NEED_WRITE, so we can set it to WRITTEN.
+		 */
+		if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
+			cache->disk_cache_state = BTRFS_DC_WRITTEN;
+		last = cache->key.objectid + cache->key.offset;
+		btrfs_put_block_group(cache);
+	}
+
 	btrfs_free_path(path);
 	return 0;
 }
@@ -2763,6 +2984,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 	if (found) {
 		spin_lock(&found->lock);
 		found->total_bytes += total_bytes;
+		found->disk_total += total_bytes * factor;
 		found->bytes_used += bytes_used;
 		found->disk_used += bytes_used * factor;
 		found->full = 0;
@@ -2782,6 +3004,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 				BTRFS_BLOCK_GROUP_SYSTEM |
 				BTRFS_BLOCK_GROUP_METADATA);
 	found->total_bytes = total_bytes;
+	found->disk_total = total_bytes * factor;
 	found->bytes_used = bytes_used;
 	found->disk_used = bytes_used * factor;
 	found->bytes_pinned = 0;
@@ -2883,11 +3106,16 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
 	struct btrfs_space_info *data_sinfo;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	u64 used;
-	int ret = 0, committed = 0;
+	int ret = 0, committed = 0, alloc_chunk = 1;
 
 	/* make sure bytes are sectorsize aligned */
 	bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
 
+	if (root == root->fs_info->tree_root) {
+		alloc_chunk = 0;
+		committed = 1;
+	}
+
 	data_sinfo = BTRFS_I(inode)->space_info;
 	if (!data_sinfo)
 		goto alloc;
@@ -2906,7 +3134,7 @@ again:
 		 * if we don't have enough free bytes in this space then we need
 		 * to alloc a new chunk.
 		 */
-		if (!data_sinfo->full) {
+		if (!data_sinfo->full && alloc_chunk) {
 			u64 alloc_target;
 
 			data_sinfo->force_alloc = 1;
@@ -2998,10 +3226,11 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)
 	rcu_read_unlock();
 }
 
-static int should_alloc_chunk(struct btrfs_space_info *sinfo,
-			      u64 alloc_bytes)
+static int should_alloc_chunk(struct btrfs_root *root,
+			      struct btrfs_space_info *sinfo, u64 alloc_bytes)
 {
 	u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
+	u64 thresh;
 
 	if (sinfo->bytes_used + sinfo->bytes_reserved +
 	    alloc_bytes + 256 * 1024 * 1024 < num_bytes)
@@ -3011,6 +3240,12 @@ static int should_alloc_chunk(struct btrfs_space_info *sinfo,
 	    alloc_bytes < div_factor(num_bytes, 8))
 		return 0;
 
+	thresh = btrfs_super_total_bytes(&root->fs_info->super_copy);
+	thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5));
+
+	if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 3))
+		return 0;
+
 	return 1;
 }
 
@@ -3042,13 +3277,21 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 		goto out;
 	}
 
-	if (!force && !should_alloc_chunk(space_info, alloc_bytes)) {
+	if (!force && !should_alloc_chunk(extent_root, space_info,
+					  alloc_bytes)) {
 		spin_unlock(&space_info->lock);
 		goto out;
 	}
 	spin_unlock(&space_info->lock);
 
 	/*
+	 * If we have mixed data/metadata chunks we want to make sure we keep
+	 * allocating mixed chunks instead of individual chunks.
+	 */
+	if (btrfs_mixed_space_info(space_info))
+		flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
+
+	/*
 	 * if we're doing a data chunk, go ahead and make sure that
 	 * we keep a reasonable number of metadata chunks allocated in the
 	 * FS as well.
@@ -3073,55 +3316,25 @@ out:
 	return ret;
 }
 
-static int maybe_allocate_chunk(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root,
-				struct btrfs_space_info *sinfo, u64 num_bytes)
-{
-	int ret;
-	int end_trans = 0;
-
-	if (sinfo->full)
-		return 0;
-
-	spin_lock(&sinfo->lock);
-	ret = should_alloc_chunk(sinfo, num_bytes + 2 * 1024 * 1024);
-	spin_unlock(&sinfo->lock);
-	if (!ret)
-		return 0;
-
-	if (!trans) {
-		trans = btrfs_join_transaction(root, 1);
-		BUG_ON(IS_ERR(trans));
-		end_trans = 1;
-	}
-
-	ret = do_chunk_alloc(trans, root->fs_info->extent_root,
-			     num_bytes + 2 * 1024 * 1024,
-			     get_alloc_profile(root, sinfo->flags), 0);
-
-	if (end_trans)
-		btrfs_end_transaction(trans, root);
-
-	return ret == 1 ? 1 : 0;
-}
-
 /*
  * shrink metadata reservation for delalloc
  */
 static int shrink_delalloc(struct btrfs_trans_handle *trans,
-			   struct btrfs_root *root, u64 to_reclaim)
+			   struct btrfs_root *root, u64 to_reclaim, int sync)
 {
 	struct btrfs_block_rsv *block_rsv;
+	struct btrfs_space_info *space_info;
 	u64 reserved;
 	u64 max_reclaim;
 	u64 reclaimed = 0;
 	int pause = 1;
-	int ret;
+	int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
 
 	block_rsv = &root->fs_info->delalloc_block_rsv;
-	spin_lock(&block_rsv->lock);
-	reserved = block_rsv->reserved;
-	spin_unlock(&block_rsv->lock);
+	space_info = block_rsv->space_info;
+
+	smp_mb();
+	reserved = space_info->bytes_reserved;
 
 	if (reserved == 0)
 		return 0;
@@ -3129,104 +3342,169 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
 	max_reclaim = min(reserved, to_reclaim);
 
 	while (1) {
-		ret = btrfs_start_one_delalloc_inode(root, trans ? 1 : 0);
-		if (!ret) {
-			__set_current_state(TASK_INTERRUPTIBLE);
-			schedule_timeout(pause);
-			pause <<= 1;
-			if (pause > HZ / 10)
-				pause = HZ / 10;
-		} else {
-			pause = 1;
-		}
+		/* have the flusher threads jump in and do some IO */
+		smp_mb();
+		nr_pages = min_t(unsigned long, nr_pages,
+		       root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT);
+		writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages);
 
-		spin_lock(&block_rsv->lock);
-		if (reserved > block_rsv->reserved)
-			reclaimed = reserved - block_rsv->reserved;
-		reserved = block_rsv->reserved;
-		spin_unlock(&block_rsv->lock);
+		spin_lock(&space_info->lock);
+		if (reserved > space_info->bytes_reserved)
+			reclaimed += reserved - space_info->bytes_reserved;
+		reserved = space_info->bytes_reserved;
+		spin_unlock(&space_info->lock);
 
 		if (reserved == 0 || reclaimed >= max_reclaim)
 			break;
 
 		if (trans && trans->transaction->blocked)
 			return -EAGAIN;
+
+		__set_current_state(TASK_INTERRUPTIBLE);
+		schedule_timeout(pause);
+		pause <<= 1;
+		if (pause > HZ / 10)
+			pause = HZ / 10;
+
 	}
 	return reclaimed >= to_reclaim;
 }
 
-static int should_retry_reserve(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root,
-				struct btrfs_block_rsv *block_rsv,
-				u64 num_bytes, int *retries)
+/*
+ * Retries tells us how many times we've called reserve_metadata_bytes.  The
+ * idea is if this is the first call (retries == 0) then we will add to our
+ * reserved count if we can't make the allocation in order to hold our place
+ * while we go and try and free up space.  That way for retries > 1 we don't try
+ * and add space, we just check to see if the amount of unused space is >= the
+ * total space, meaning that our reservation is valid.
+ *
+ * However if we don't intend to retry this reservation, pass -1 as retries so
+ * that it short circuits this logic.
+ */
+static int reserve_metadata_bytes(struct btrfs_trans_handle *trans,
+				  struct btrfs_root *root,
+				  struct btrfs_block_rsv *block_rsv,
+				  u64 orig_bytes, int flush)
 {
 	struct btrfs_space_info *space_info = block_rsv->space_info;
-	int ret;
+	u64 unused;
+	u64 num_bytes = orig_bytes;
+	int retries = 0;
+	int ret = 0;
+	bool reserved = false;
+	bool committed = false;
 
-	if ((*retries) > 2)
-		return -ENOSPC;
+again:
+	ret = -ENOSPC;
+	if (reserved)
+		num_bytes = 0;
 
-	ret = maybe_allocate_chunk(trans, root, space_info, num_bytes);
-	if (ret)
-		return 1;
+	spin_lock(&space_info->lock);
+	unused = space_info->bytes_used + space_info->bytes_reserved +
+		 space_info->bytes_pinned + space_info->bytes_readonly +
+		 space_info->bytes_may_use;
 
-	if (trans && trans->transaction->in_commit)
-		return -ENOSPC;
+	/*
+	 * The idea here is that we've not already over-reserved the block group
+	 * then we can go ahead and save our reservation first and then start
+	 * flushing if we need to.  Otherwise if we've already overcommitted
+	 * lets start flushing stuff first and then come back and try to make
+	 * our reservation.
+	 */
+	if (unused <= space_info->total_bytes) {
+		unused -= space_info->total_bytes;
+		if (unused >= num_bytes) {
+			if (!reserved)
+				space_info->bytes_reserved += orig_bytes;
+			ret = 0;
+		} else {
+			/*
+			 * Ok set num_bytes to orig_bytes since we aren't
+			 * overocmmitted, this way we only try and reclaim what
+			 * we need.
+			 */
+			num_bytes = orig_bytes;
+		}
+	} else {
+		/*
+		 * Ok we're over committed, set num_bytes to the overcommitted
+		 * amount plus the amount of bytes that we need for this
+		 * reservation.
+		 */
+		num_bytes = unused - space_info->total_bytes +
+			(orig_bytes * (retries + 1));
+	}
 
-	ret = shrink_delalloc(trans, root, num_bytes);
-	if (ret)
-		return ret;
+	/*
+	 * Couldn't make our reservation, save our place so while we're trying
+	 * to reclaim space we can actually use it instead of somebody else
+	 * stealing it from us.
+	 */
+	if (ret && !reserved) {
+		space_info->bytes_reserved += orig_bytes;
+		reserved = true;
+	}
 
-	spin_lock(&space_info->lock);
-	if (space_info->bytes_pinned < num_bytes)
-		ret = 1;
 	spin_unlock(&space_info->lock);
-	if (ret)
-		return -ENOSPC;
-
-	(*retries)++;
 
-	if (trans)
-		return -EAGAIN;
+	if (!ret)
+		return 0;
 
-	trans = btrfs_join_transaction(root, 1);
-	BUG_ON(IS_ERR(trans));
-	ret = btrfs_commit_transaction(trans, root);
-	BUG_ON(ret);
+	if (!flush)
+		goto out;
 
-	return 1;
-}
+	/*
+	 * We do synchronous shrinking since we don't actually unreserve
+	 * metadata until after the IO is completed.
+	 */
+	ret = shrink_delalloc(trans, root, num_bytes, 1);
+	if (ret > 0)
+		return 0;
+	else if (ret < 0)
+		goto out;
 
-static int reserve_metadata_bytes(struct btrfs_block_rsv *block_rsv,
-				  u64 num_bytes)
-{
-	struct btrfs_space_info *space_info = block_rsv->space_info;
-	u64 unused;
-	int ret = -ENOSPC;
+	/*
+	 * So if we were overcommitted it's possible that somebody else flushed
+	 * out enough space and we simply didn't have enough space to reclaim,
+	 * so go back around and try again.
+	 */
+	if (retries < 2) {
+		retries++;
+		goto again;
+	}
 
 	spin_lock(&space_info->lock);
-	unused = space_info->bytes_used + space_info->bytes_reserved +
-		 space_info->bytes_pinned + space_info->bytes_readonly;
+	/*
+	 * Not enough space to be reclaimed, don't bother committing the
+	 * transaction.
+	 */
+	if (space_info->bytes_pinned < orig_bytes)
+		ret = -ENOSPC;
+	spin_unlock(&space_info->lock);
+	if (ret)
+		goto out;
 
-	if (unused < space_info->total_bytes)
-		unused = space_info->total_bytes - unused;
-	else
-		unused = 0;
+	ret = -EAGAIN;
+	if (trans || committed)
+		goto out;
 
-	if (unused >= num_bytes) {
-		if (block_rsv->priority >= 10) {
-			space_info->bytes_reserved += num_bytes;
-			ret = 0;
-		} else {
-			if ((unused + block_rsv->reserved) *
-			    block_rsv->priority >=
-			    (num_bytes + block_rsv->reserved) * 10) {
-				space_info->bytes_reserved += num_bytes;
-				ret = 0;
-			}
-		}
+	ret = -ENOSPC;
+	trans = btrfs_join_transaction(root, 1);
+	if (IS_ERR(trans))
+		goto out;
+	ret = btrfs_commit_transaction(trans, root);
+	if (!ret) {
+		trans = NULL;
+		committed = true;
+		goto again;
+	}
+
+out:
+	if (reserved) {
+		spin_lock(&space_info->lock);
+		space_info->bytes_reserved -= orig_bytes;
+		spin_unlock(&space_info->lock);
 	}
-	spin_unlock(&space_info->lock);
 
 	return ret;
 }
@@ -3328,18 +3606,14 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
 {
 	struct btrfs_block_rsv *block_rsv;
 	struct btrfs_fs_info *fs_info = root->fs_info;
-	u64 alloc_target;
 
 	block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
 	if (!block_rsv)
 		return NULL;
 
 	btrfs_init_block_rsv(block_rsv);
-
-	alloc_target = btrfs_get_alloc_profile(root, 0);
 	block_rsv->space_info = __find_space_info(fs_info,
 						  BTRFS_BLOCK_GROUP_METADATA);
-
 	return block_rsv;
 }
 
@@ -3370,23 +3644,19 @@ void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
 int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root,
 			struct btrfs_block_rsv *block_rsv,
-			u64 num_bytes, int *retries)
+			u64 num_bytes)
 {
 	int ret;
 
 	if (num_bytes == 0)
 		return 0;
-again:
-	ret = reserve_metadata_bytes(block_rsv, num_bytes);
+
+	ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, 1);
 	if (!ret) {
 		block_rsv_add_bytes(block_rsv, num_bytes, 1);
 		return 0;
 	}
 
-	ret = should_retry_reserve(trans, root, block_rsv, num_bytes, retries);
-	if (ret > 0)
-		goto again;
-
 	return ret;
 }
 
@@ -3421,7 +3691,8 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
 		return 0;
 
 	if (block_rsv->refill_used) {
-		ret = reserve_metadata_bytes(block_rsv, num_bytes);
+		ret = reserve_metadata_bytes(trans, root, block_rsv,
+					     num_bytes, 0);
 		if (!ret) {
 			block_rsv_add_bytes(block_rsv, num_bytes, 0);
 			return 0;
@@ -3500,6 +3771,8 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
 
 	sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
 	spin_lock(&sinfo->lock);
+	if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA)
+		data_used = 0;
 	meta_used = sinfo->bytes_used;
 	spin_unlock(&sinfo->lock);
 
@@ -3527,7 +3800,8 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
 	block_rsv->size = num_bytes;
 
 	num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
-		    sinfo->bytes_reserved + sinfo->bytes_readonly;
+		    sinfo->bytes_reserved + sinfo->bytes_readonly +
+		    sinfo->bytes_may_use;
 
 	if (sinfo->total_bytes > num_bytes) {
 		num_bytes = sinfo->total_bytes - num_bytes;
@@ -3598,7 +3872,7 @@ static u64 calc_trans_metadata_size(struct btrfs_root *root, int num_items)
 
 int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
 				 struct btrfs_root *root,
-				 int num_items, int *retries)
+				 int num_items)
 {
 	u64 num_bytes;
 	int ret;
@@ -3608,7 +3882,7 @@ int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
 
 	num_bytes = calc_trans_metadata_size(root, num_items);
 	ret = btrfs_block_rsv_add(trans, root, &root->fs_info->trans_block_rsv,
-				  num_bytes, retries);
+				  num_bytes);
 	if (!ret) {
 		trans->bytes_reserved += num_bytes;
 		trans->block_rsv = &root->fs_info->trans_block_rsv;
@@ -3682,14 +3956,13 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 	struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
 	u64 to_reserve;
 	int nr_extents;
-	int retries = 0;
 	int ret;
 
 	if (btrfs_transaction_in_commit(root->fs_info))
 		schedule_timeout(1);
 
 	num_bytes = ALIGN(num_bytes, root->sectorsize);
-again:
+
 	spin_lock(&BTRFS_I(inode)->accounting_lock);
 	nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1;
 	if (nr_extents > BTRFS_I(inode)->reserved_extents) {
@@ -3699,18 +3972,14 @@ again:
 		nr_extents = 0;
 		to_reserve = 0;
 	}
+	spin_unlock(&BTRFS_I(inode)->accounting_lock);
 
 	to_reserve += calc_csum_metadata_size(inode, num_bytes);
-	ret = reserve_metadata_bytes(block_rsv, to_reserve);
-	if (ret) {
-		spin_unlock(&BTRFS_I(inode)->accounting_lock);
-		ret = should_retry_reserve(NULL, root, block_rsv, to_reserve,
-					   &retries);
-		if (ret > 0)
-			goto again;
+	ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1);
+	if (ret)
 		return ret;
-	}
 
+	spin_lock(&BTRFS_I(inode)->accounting_lock);
 	BTRFS_I(inode)->reserved_extents += nr_extents;
 	atomic_inc(&BTRFS_I(inode)->outstanding_extents);
 	spin_unlock(&BTRFS_I(inode)->accounting_lock);
@@ -3718,7 +3987,7 @@ again:
 	block_rsv_add_bytes(block_rsv, to_reserve, 1);
 
 	if (block_rsv->size > 512 * 1024 * 1024)
-		shrink_delalloc(NULL, root, to_reserve);
+		shrink_delalloc(NULL, root, to_reserve, 0);
 
 	return 0;
 }
@@ -3777,12 +4046,12 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root,
 			      u64 bytenr, u64 num_bytes, int alloc)
 {
-	struct btrfs_block_group_cache *cache;
+	struct btrfs_block_group_cache *cache = NULL;
 	struct btrfs_fs_info *info = root->fs_info;
-	int factor;
 	u64 total = num_bytes;
 	u64 old_val;
 	u64 byte_in_group;
+	int factor;
 
 	/* block accounting for super block */
 	spin_lock(&info->delalloc_lock);
@@ -3804,11 +4073,25 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 			factor = 2;
 		else
 			factor = 1;
+		/*
+		 * If this block group has free space cache written out, we
+		 * need to make sure to load it if we are removing space.  This
+		 * is because we need the unpinning stage to actually add the
+		 * space back to the block group, otherwise we will leak space.
+		 */
+		if (!alloc && cache->cached == BTRFS_CACHE_NO)
+			cache_block_group(cache, trans, 1);
+
 		byte_in_group = bytenr - cache->key.objectid;
 		WARN_ON(byte_in_group > cache->key.offset);
 
 		spin_lock(&cache->space_info->lock);
 		spin_lock(&cache->lock);
+
+		if (btrfs_super_cache_generation(&info->super_copy) != 0 &&
+		    cache->disk_cache_state < BTRFS_DC_CLEAR)
+			cache->disk_cache_state = BTRFS_DC_CLEAR;
+
 		cache->dirty = 1;
 		old_val = btrfs_block_group_used(&cache->item);
 		num_bytes = min(total, cache->key.offset - byte_in_group);
@@ -4555,6 +4838,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
 	bool found_uncached_bg = false;
 	bool failed_cluster_refill = false;
 	bool failed_alloc = false;
+	bool use_cluster = true;
 	u64 ideal_cache_percent = 0;
 	u64 ideal_cache_offset = 0;
 
@@ -4569,16 +4853,24 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
 		return -ENOSPC;
 	}
 
+	/*
+	 * If the space info is for both data and metadata it means we have a
+	 * small filesystem and we can't use the clustering stuff.
+	 */
+	if (btrfs_mixed_space_info(space_info))
+		use_cluster = false;
+
 	if (orig_root->ref_cows || empty_size)
 		allowed_chunk_alloc = 1;
 
-	if (data & BTRFS_BLOCK_GROUP_METADATA) {
+	if (data & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
 		last_ptr = &root->fs_info->meta_alloc_cluster;
 		if (!btrfs_test_opt(root, SSD))
 			empty_cluster = 64 * 1024;
 	}
 
-	if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) {
+	if ((data & BTRFS_BLOCK_GROUP_DATA) && use_cluster &&
+	    btrfs_test_opt(root, SSD)) {
 		last_ptr = &root->fs_info->data_alloc_cluster;
 	}
 
@@ -4642,6 +4934,10 @@ have_block_group:
 		if (unlikely(block_group->cached == BTRFS_CACHE_NO)) {
 			u64 free_percent;
 
+			ret = cache_block_group(block_group, trans, 1);
+			if (block_group->cached == BTRFS_CACHE_FINISHED)
+				goto have_block_group;
+
 			free_percent = btrfs_block_group_used(&block_group->item);
 			free_percent *= 100;
 			free_percent = div64_u64(free_percent,
@@ -4662,7 +4958,7 @@ have_block_group:
 			if (loop > LOOP_CACHING_NOWAIT ||
 			    (loop > LOOP_FIND_IDEAL &&
 			     atomic_read(&space_info->caching_threads) < 2)) {
-				ret = cache_block_group(block_group);
+				ret = cache_block_group(block_group, trans, 0);
 				BUG_ON(ret);
 			}
 			found_uncached_bg = true;
@@ -5219,7 +5515,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
 	u64 num_bytes = ins->offset;
 
 	block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
-	cache_block_group(block_group);
+	cache_block_group(block_group, trans, 0);
 	caching_ctl = get_caching_control(block_group);
 
 	if (!caching_ctl) {
@@ -5309,7 +5605,8 @@ use_block_rsv(struct btrfs_trans_handle *trans,
 	block_rsv = get_block_rsv(trans, root);
 
 	if (block_rsv->size == 0) {
-		ret = reserve_metadata_bytes(block_rsv, blocksize);
+		ret = reserve_metadata_bytes(trans, root, block_rsv,
+					     blocksize, 0);
 		if (ret)
 			return ERR_PTR(ret);
 		return block_rsv;
@@ -5319,11 +5616,6 @@ use_block_rsv(struct btrfs_trans_handle *trans,
 	if (!ret)
 		return block_rsv;
 
-	WARN_ON(1);
-	printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n",
-		block_rsv->size, block_rsv->reserved,
-		block_rsv->freed[0], block_rsv->freed[1]);
-
 	return ERR_PTR(-ENOSPC);
 }
 
@@ -5422,7 +5714,6 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
 	u64 generation;
 	u64 refs;
 	u64 flags;
-	u64 last = 0;
 	u32 nritems;
 	u32 blocksize;
 	struct btrfs_key key;
@@ -5490,7 +5781,6 @@ reada:
 					   generation);
 		if (ret)
 			break;
-		last = bytenr + blocksize;
 		nread++;
 	}
 	wc->reada_slot = slot;
@@ -7814,6 +8104,40 @@ out:
 	return ret;
 }
 
+void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
+{
+	struct btrfs_block_group_cache *block_group;
+	u64 last = 0;
+
+	while (1) {
+		struct inode *inode;
+
+		block_group = btrfs_lookup_first_block_group(info, last);
+		while (block_group) {
+			spin_lock(&block_group->lock);
+			if (block_group->iref)
+				break;
+			spin_unlock(&block_group->lock);
+			block_group = next_block_group(info->tree_root,
+						       block_group);
+		}
+		if (!block_group) {
+			if (last == 0)
+				break;
+			last = 0;
+			continue;
+		}
+
+		inode = block_group->inode;
+		block_group->iref = 0;
+		block_group->inode = NULL;
+		spin_unlock(&block_group->lock);
+		iput(inode);
+		last = block_group->key.objectid + block_group->key.offset;
+		btrfs_put_block_group(block_group);
+	}
+}
+
 int btrfs_free_block_groups(struct btrfs_fs_info *info)
 {
 	struct btrfs_block_group_cache *block_group;
@@ -7897,6 +8221,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 	struct btrfs_key key;
 	struct btrfs_key found_key;
 	struct extent_buffer *leaf;
+	int need_clear = 0;
+	u64 cache_gen;
 
 	root = info->extent_root;
 	key.objectid = 0;
@@ -7906,6 +8232,15 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 	if (!path)
 		return -ENOMEM;
 
+	cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy);
+	if (cache_gen != 0 &&
+	    btrfs_super_generation(&root->fs_info->super_copy) != cache_gen)
+		need_clear = 1;
+	if (btrfs_test_opt(root, CLEAR_CACHE))
+		need_clear = 1;
+	if (!btrfs_test_opt(root, SPACE_CACHE) && cache_gen)
+		printk(KERN_INFO "btrfs: disk space caching is enabled\n");
+
 	while (1) {
 		ret = find_first_block_group(root, path, &key);
 		if (ret > 0)
@@ -7928,6 +8263,9 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 		INIT_LIST_HEAD(&cache->list);
 		INIT_LIST_HEAD(&cache->cluster_list);
 
+		if (need_clear)
+			cache->disk_cache_state = BTRFS_DC_CLEAR;
+
 		/*
 		 * we only want to have 32k of ram per block group for keeping
 		 * track of free space, and if we pass 1/2 of that we want to
@@ -8032,6 +8370,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 	cache->key.offset = size;
 	cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
 	cache->sectorsize = root->sectorsize;
+	cache->fs_info = root->fs_info;
 
 	/*
 	 * we only want to have 32k of ram per block group for keeping track
@@ -8088,8 +8427,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 	struct btrfs_path *path;
 	struct btrfs_block_group_cache *block_group;
 	struct btrfs_free_cluster *cluster;
+	struct btrfs_root *tree_root = root->fs_info->tree_root;
 	struct btrfs_key key;
+	struct inode *inode;
 	int ret;
+	int factor;
 
 	root = root->fs_info->extent_root;
 
@@ -8098,6 +8440,12 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 	BUG_ON(!block_group->ro);
 
 	memcpy(&key, &block_group->key, sizeof(key));
+	if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
+				  BTRFS_BLOCK_GROUP_RAID1 |
+				  BTRFS_BLOCK_GROUP_RAID10))
+		factor = 2;
+	else
+		factor = 1;
 
 	/* make sure this block group isn't part of an allocation cluster */
 	cluster = &root->fs_info->data_alloc_cluster;
@@ -8117,6 +8465,40 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 
+	inode = lookup_free_space_inode(root, block_group, path);
+	if (!IS_ERR(inode)) {
+		btrfs_orphan_add(trans, inode);
+		clear_nlink(inode);
+		/* One for the block groups ref */
+		spin_lock(&block_group->lock);
+		if (block_group->iref) {
+			block_group->iref = 0;
+			block_group->inode = NULL;
+			spin_unlock(&block_group->lock);
+			iput(inode);
+		} else {
+			spin_unlock(&block_group->lock);
+		}
+		/* One for our lookup ref */
+		iput(inode);
+	}
+
+	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
+	key.offset = block_group->key.objectid;
+	key.type = 0;
+
+	ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
+	if (ret < 0)
+		goto out;
+	if (ret > 0)
+		btrfs_release_path(tree_root, path);
+	if (ret == 0) {
+		ret = btrfs_del_item(trans, tree_root, path);
+		if (ret)
+			goto out;
+		btrfs_release_path(tree_root, path);
+	}
+
 	spin_lock(&root->fs_info->block_group_cache_lock);
 	rb_erase(&block_group->cache_node,
 		 &root->fs_info->block_group_cache_tree);
@@ -8138,8 +8520,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 	spin_lock(&block_group->space_info->lock);
 	block_group->space_info->total_bytes -= block_group->key.offset;
 	block_group->space_info->bytes_readonly -= block_group->key.offset;
+	block_group->space_info->disk_total -= block_group->key.offset * factor;
 	spin_unlock(&block_group->space_info->lock);
 
+	memcpy(&key, &block_group->key, sizeof(key));
+
 	btrfs_clear_space_info_full(root->fs_info);
 
 	btrfs_put_block_group(block_group);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index d74e6af9b53..eac10e3260a 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -104,7 +104,7 @@ void extent_io_tree_init(struct extent_io_tree *tree,
 			  struct address_space *mapping, gfp_t mask)
 {
 	tree->state = RB_ROOT;
-	tree->buffer = RB_ROOT;
+	INIT_RADIX_TREE(&tree->buffer, GFP_ATOMIC);
 	tree->ops = NULL;
 	tree->dirty_bytes = 0;
 	spin_lock_init(&tree->lock);
@@ -235,50 +235,6 @@ static inline struct rb_node *tree_search(struct extent_io_tree *tree,
 	return ret;
 }
 
-static struct extent_buffer *buffer_tree_insert(struct extent_io_tree *tree,
-					  u64 offset, struct rb_node *node)
-{
-	struct rb_root *root = &tree->buffer;
-	struct rb_node **p = &root->rb_node;
-	struct rb_node *parent = NULL;
-	struct extent_buffer *eb;
-
-	while (*p) {
-		parent = *p;
-		eb = rb_entry(parent, struct extent_buffer, rb_node);
-
-		if (offset < eb->start)
-			p = &(*p)->rb_left;
-		else if (offset > eb->start)
-			p = &(*p)->rb_right;
-		else
-			return eb;
-	}
-
-	rb_link_node(node, parent, p);
-	rb_insert_color(node, root);
-	return NULL;
-}
-
-static struct extent_buffer *buffer_search(struct extent_io_tree *tree,
-					   u64 offset)
-{
-	struct rb_root *root = &tree->buffer;
-	struct rb_node *n = root->rb_node;
-	struct extent_buffer *eb;
-
-	while (n) {
-		eb = rb_entry(n, struct extent_buffer, rb_node);
-		if (offset < eb->start)
-			n = n->rb_left;
-		else if (offset > eb->start)
-			n = n->rb_right;
-		else
-			return eb;
-	}
-	return NULL;
-}
-
 static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
 		     struct extent_state *other)
 {
@@ -1901,10 +1857,8 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
 	struct page *page = bvec->bv_page;
 	struct extent_io_tree *tree = bio->bi_private;
 	u64 start;
-	u64 end;
 
 	start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
-	end = start + bvec->bv_len - 1;
 
 	bio->bi_private = NULL;
 
@@ -2204,7 +2158,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	u64 last_byte = i_size_read(inode);
 	u64 block_start;
 	u64 iosize;
-	u64 unlock_start;
 	sector_t sector;
 	struct extent_state *cached_state = NULL;
 	struct extent_map *em;
@@ -2329,7 +2282,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 		if (tree->ops && tree->ops->writepage_end_io_hook)
 			tree->ops->writepage_end_io_hook(page, start,
 							 page_end, NULL, 1);
-		unlock_start = page_end + 1;
 		goto done;
 	}
 
@@ -2340,7 +2292,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 			if (tree->ops && tree->ops->writepage_end_io_hook)
 				tree->ops->writepage_end_io_hook(page, cur,
 							 page_end, NULL, 1);
-			unlock_start = page_end + 1;
 			break;
 		}
 		em = epd->get_extent(inode, page, pg_offset, cur,
@@ -2387,7 +2338,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 
 			cur += iosize;
 			pg_offset += iosize;
-			unlock_start = cur;
 			continue;
 		}
 		/* leave this out until we have a page_mkwrite call */
@@ -2473,7 +2423,6 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
 	pgoff_t index;
 	pgoff_t end;		/* Inclusive */
 	int scanned = 0;
-	int range_whole = 0;
 
 	pagevec_init(&pvec, 0);
 	if (wbc->range_cyclic) {
@@ -2482,8 +2431,6 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
 	} else {
 		index = wbc->range_start >> PAGE_CACHE_SHIFT;
 		end = wbc->range_end >> PAGE_CACHE_SHIFT;
-		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
-			range_whole = 1;
 		scanned = 1;
 	}
 retry:
@@ -2823,6 +2770,8 @@ int extent_prepare_write(struct extent_io_tree *tree,
 					 NULL, 1,
 					 end_bio_extent_preparewrite, 0,
 					 0, 0);
+			if (ret && !err)
+				err = ret;
 			iocount++;
 			block_start = block_start + iosize;
 		} else {
@@ -3104,6 +3053,39 @@ static void __free_extent_buffer(struct extent_buffer *eb)
 	kmem_cache_free(extent_buffer_cache, eb);
 }
 
+/*
+ * Helper for releasing extent buffer page.
+ */
+static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
+						unsigned long start_idx)
+{
+	unsigned long index;
+	struct page *page;
+
+	if (!eb->first_page)
+		return;
+
+	index = num_extent_pages(eb->start, eb->len);
+	if (start_idx >= index)
+		return;
+
+	do {
+		index--;
+		page = extent_buffer_page(eb, index);
+		if (page)
+			page_cache_release(page);
+	} while (index != start_idx);
+}
+
+/*
+ * Helper for releasing the extent buffer.
+ */
+static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
+{
+	btrfs_release_extent_buffer_page(eb, 0);
+	__free_extent_buffer(eb);
+}
+
 struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
 					  u64 start, unsigned long len,
 					  struct page *page0,
@@ -3117,16 +3099,16 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
 	struct page *p;
 	struct address_space *mapping = tree->mapping;
 	int uptodate = 1;
+	int ret;
 
-	spin_lock(&tree->buffer_lock);
-	eb = buffer_search(tree, start);
-	if (eb) {
-		atomic_inc(&eb->refs);
-		spin_unlock(&tree->buffer_lock);
+	rcu_read_lock();
+	eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
+	if (eb && atomic_inc_not_zero(&eb->refs)) {
+		rcu_read_unlock();
 		mark_page_accessed(eb->first_page);
 		return eb;
 	}
-	spin_unlock(&tree->buffer_lock);
+	rcu_read_unlock();
 
 	eb = __alloc_extent_buffer(tree, start, len, mask);
 	if (!eb)
@@ -3165,26 +3147,31 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
 	if (uptodate)
 		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
 
+	ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
+	if (ret)
+		goto free_eb;
+
 	spin_lock(&tree->buffer_lock);
-	exists = buffer_tree_insert(tree, start, &eb->rb_node);
-	if (exists) {
+	ret = radix_tree_insert(&tree->buffer, start >> PAGE_CACHE_SHIFT, eb);
+	if (ret == -EEXIST) {
+		exists = radix_tree_lookup(&tree->buffer,
+						start >> PAGE_CACHE_SHIFT);
 		/* add one reference for the caller */
 		atomic_inc(&exists->refs);
 		spin_unlock(&tree->buffer_lock);
+		radix_tree_preload_end();
 		goto free_eb;
 	}
 	/* add one reference for the tree */
 	atomic_inc(&eb->refs);
 	spin_unlock(&tree->buffer_lock);
+	radix_tree_preload_end();
 	return eb;
 
 free_eb:
 	if (!atomic_dec_and_test(&eb->refs))
 		return exists;
-	for (index = 1; index < i; index++)
-		page_cache_release(extent_buffer_page(eb, index));
-	page_cache_release(extent_buffer_page(eb, 0));
-	__free_extent_buffer(eb);
+	btrfs_release_extent_buffer(eb);
 	return exists;
 }
 
@@ -3194,16 +3181,16 @@ struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
 {
 	struct extent_buffer *eb;
 
-	spin_lock(&tree->buffer_lock);
-	eb = buffer_search(tree, start);
-	if (eb)
-		atomic_inc(&eb->refs);
-	spin_unlock(&tree->buffer_lock);
-
-	if (eb)
+	rcu_read_lock();
+	eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
+	if (eb && atomic_inc_not_zero(&eb->refs)) {
+		rcu_read_unlock();
 		mark_page_accessed(eb->first_page);
+		return eb;
+	}
+	rcu_read_unlock();
 
-	return eb;
+	return NULL;
 }
 
 void free_extent_buffer(struct extent_buffer *eb)
@@ -3833,34 +3820,45 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
 	}
 }
 
+static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
+{
+	struct extent_buffer *eb =
+			container_of(head, struct extent_buffer, rcu_head);
+
+	btrfs_release_extent_buffer(eb);
+}
+
 int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
 {
 	u64 start = page_offset(page);
 	struct extent_buffer *eb;
 	int ret = 1;
-	unsigned long i;
-	unsigned long num_pages;
 
 	spin_lock(&tree->buffer_lock);
-	eb = buffer_search(tree, start);
+	eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
 	if (!eb)
 		goto out;
 
-	if (atomic_read(&eb->refs) > 1) {
+	if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
 		ret = 0;
 		goto out;
 	}
-	if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
+
+	/*
+	 * set @eb->refs to 0 if it is already 1, and then release the @eb.
+	 * Or go back.
+	 */
+	if (atomic_cmpxchg(&eb->refs, 1, 0) != 1) {
 		ret = 0;
 		goto out;
 	}
-	/* at this point we can safely release the extent buffer */
-	num_pages = num_extent_pages(eb->start, eb->len);
-	for (i = 0; i < num_pages; i++)
-		page_cache_release(extent_buffer_page(eb, i));
-	rb_erase(&eb->rb_node, &tree->buffer);
-	__free_extent_buffer(eb);
+
+	radix_tree_delete(&tree->buffer, start >> PAGE_CACHE_SHIFT);
 out:
 	spin_unlock(&tree->buffer_lock);
+
+	/* at this point we can safely release the extent buffer */
+	if (atomic_read(&eb->refs) == 0)
+		call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
 	return ret;
 }
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 5691c7b590d..1c6d4f342ef 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -85,7 +85,7 @@ struct extent_io_ops {
 
 struct extent_io_tree {
 	struct rb_root state;
-	struct rb_root buffer;
+	struct radix_tree_root buffer;
 	struct address_space *mapping;
 	u64 dirty_bytes;
 	spinlock_t lock;
@@ -123,7 +123,7 @@ struct extent_buffer {
 	unsigned long bflags;
 	atomic_t refs;
 	struct list_head leak_list;
-	struct rb_node rb_node;
+	struct rcu_head rcu_head;
 
 	/* the spinlock is used to protect most operations */
 	spinlock_t lock;
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 454ca52d645..23cb8da3ff6 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -335,7 +335,7 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
 		goto out;
 	}
 	if (IS_ERR(rb_node)) {
-		em = ERR_PTR(PTR_ERR(rb_node));
+		em = ERR_CAST(rb_node);
 		goto out;
 	}
 	em = rb_entry(rb_node, struct extent_map, rb_node);
@@ -384,7 +384,7 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
 		goto out;
 	}
 	if (IS_ERR(rb_node)) {
-		em = ERR_PTR(PTR_ERR(rb_node));
+		em = ERR_CAST(rb_node);
 		goto out;
 	}
 	em = rb_entry(rb_node, struct extent_map, rb_node);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index f488fac04d9..22ee0dc2e6b 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -23,10 +23,761 @@
 #include "ctree.h"
 #include "free-space-cache.h"
 #include "transaction.h"
+#include "disk-io.h"
 
 #define BITS_PER_BITMAP		(PAGE_CACHE_SIZE * 8)
 #define MAX_CACHE_BYTES_PER_GIG	(32 * 1024)
 
+static void recalculate_thresholds(struct btrfs_block_group_cache
+				   *block_group);
+static int link_free_space(struct btrfs_block_group_cache *block_group,
+			   struct btrfs_free_space *info);
+
+struct inode *lookup_free_space_inode(struct btrfs_root *root,
+				      struct btrfs_block_group_cache
+				      *block_group, struct btrfs_path *path)
+{
+	struct btrfs_key key;
+	struct btrfs_key location;
+	struct btrfs_disk_key disk_key;
+	struct btrfs_free_space_header *header;
+	struct extent_buffer *leaf;
+	struct inode *inode = NULL;
+	int ret;
+
+	spin_lock(&block_group->lock);
+	if (block_group->inode)
+		inode = igrab(block_group->inode);
+	spin_unlock(&block_group->lock);
+	if (inode)
+		return inode;
+
+	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
+	key.offset = block_group->key.objectid;
+	key.type = 0;
+
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		return ERR_PTR(ret);
+	if (ret > 0) {
+		btrfs_release_path(root, path);
+		return ERR_PTR(-ENOENT);
+	}
+
+	leaf = path->nodes[0];
+	header = btrfs_item_ptr(leaf, path->slots[0],
+				struct btrfs_free_space_header);
+	btrfs_free_space_key(leaf, header, &disk_key);
+	btrfs_disk_key_to_cpu(&location, &disk_key);
+	btrfs_release_path(root, path);
+
+	inode = btrfs_iget(root->fs_info->sb, &location, root, NULL);
+	if (!inode)
+		return ERR_PTR(-ENOENT);
+	if (IS_ERR(inode))
+		return inode;
+	if (is_bad_inode(inode)) {
+		iput(inode);
+		return ERR_PTR(-ENOENT);
+	}
+
+	spin_lock(&block_group->lock);
+	if (!root->fs_info->closing) {
+		block_group->inode = igrab(inode);
+		block_group->iref = 1;
+	}
+	spin_unlock(&block_group->lock);
+
+	return inode;
+}
+
+int create_free_space_inode(struct btrfs_root *root,
+			    struct btrfs_trans_handle *trans,
+			    struct btrfs_block_group_cache *block_group,
+			    struct btrfs_path *path)
+{
+	struct btrfs_key key;
+	struct btrfs_disk_key disk_key;
+	struct btrfs_free_space_header *header;
+	struct btrfs_inode_item *inode_item;
+	struct extent_buffer *leaf;
+	u64 objectid;
+	int ret;
+
+	ret = btrfs_find_free_objectid(trans, root, 0, &objectid);
+	if (ret < 0)
+		return ret;
+
+	ret = btrfs_insert_empty_inode(trans, root, path, objectid);
+	if (ret)
+		return ret;
+
+	leaf = path->nodes[0];
+	inode_item = btrfs_item_ptr(leaf, path->slots[0],
+				    struct btrfs_inode_item);
+	btrfs_item_key(leaf, &disk_key, path->slots[0]);
+	memset_extent_buffer(leaf, 0, (unsigned long)inode_item,
+			     sizeof(*inode_item));
+	btrfs_set_inode_generation(leaf, inode_item, trans->transid);
+	btrfs_set_inode_size(leaf, inode_item, 0);
+	btrfs_set_inode_nbytes(leaf, inode_item, 0);
+	btrfs_set_inode_uid(leaf, inode_item, 0);
+	btrfs_set_inode_gid(leaf, inode_item, 0);
+	btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
+	btrfs_set_inode_flags(leaf, inode_item, BTRFS_INODE_NOCOMPRESS |
+			      BTRFS_INODE_PREALLOC | BTRFS_INODE_NODATASUM);
+	btrfs_set_inode_nlink(leaf, inode_item, 1);
+	btrfs_set_inode_transid(leaf, inode_item, trans->transid);
+	btrfs_set_inode_block_group(leaf, inode_item,
+				    block_group->key.objectid);
+	btrfs_mark_buffer_dirty(leaf);
+	btrfs_release_path(root, path);
+
+	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
+	key.offset = block_group->key.objectid;
+	key.type = 0;
+
+	ret = btrfs_insert_empty_item(trans, root, path, &key,
+				      sizeof(struct btrfs_free_space_header));
+	if (ret < 0) {
+		btrfs_release_path(root, path);
+		return ret;
+	}
+	leaf = path->nodes[0];
+	header = btrfs_item_ptr(leaf, path->slots[0],
+				struct btrfs_free_space_header);
+	memset_extent_buffer(leaf, 0, (unsigned long)header, sizeof(*header));
+	btrfs_set_free_space_key(leaf, header, &disk_key);
+	btrfs_mark_buffer_dirty(leaf);
+	btrfs_release_path(root, path);
+
+	return 0;
+}
+
+int btrfs_truncate_free_space_cache(struct btrfs_root *root,
+				    struct btrfs_trans_handle *trans,
+				    struct btrfs_path *path,
+				    struct inode *inode)
+{
+	loff_t oldsize;
+	int ret = 0;
+
+	trans->block_rsv = root->orphan_block_rsv;
+	ret = btrfs_block_rsv_check(trans, root,
+				    root->orphan_block_rsv,
+				    0, 5);
+	if (ret)
+		return ret;
+
+	oldsize = i_size_read(inode);
+	btrfs_i_size_write(inode, 0);
+	truncate_pagecache(inode, oldsize, 0);
+
+	/*
+	 * We don't need an orphan item because truncating the free space cache
+	 * will never be split across transactions.
+	 */
+	ret = btrfs_truncate_inode_items(trans, root, inode,
+					 0, BTRFS_EXTENT_DATA_KEY);
+	if (ret) {
+		WARN_ON(1);
+		return ret;
+	}
+
+	return btrfs_update_inode(trans, root, inode);
+}
+
+static int readahead_cache(struct inode *inode)
+{
+	struct file_ra_state *ra;
+	unsigned long last_index;
+
+	ra = kzalloc(sizeof(*ra), GFP_NOFS);
+	if (!ra)
+		return -ENOMEM;
+
+	file_ra_state_init(ra, inode->i_mapping);
+	last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
+
+	page_cache_sync_readahead(inode->i_mapping, ra, NULL, 0, last_index);
+
+	kfree(ra);
+
+	return 0;
+}
+
+int load_free_space_cache(struct btrfs_fs_info *fs_info,
+			  struct btrfs_block_group_cache *block_group)
+{
+	struct btrfs_root *root = fs_info->tree_root;
+	struct inode *inode;
+	struct btrfs_free_space_header *header;
+	struct extent_buffer *leaf;
+	struct page *page;
+	struct btrfs_path *path;
+	u32 *checksums = NULL, *crc;
+	char *disk_crcs = NULL;
+	struct btrfs_key key;
+	struct list_head bitmaps;
+	u64 num_entries;
+	u64 num_bitmaps;
+	u64 generation;
+	u32 cur_crc = ~(u32)0;
+	pgoff_t index = 0;
+	unsigned long first_page_offset;
+	int num_checksums;
+	int ret = 0;
+
+	/*
+	 * If we're unmounting then just return, since this does a search on the
+	 * normal root and not the commit root and we could deadlock.
+	 */
+	smp_mb();
+	if (fs_info->closing)
+		return 0;
+
+	/*
+	 * If this block group has been marked to be cleared for one reason or
+	 * another then we can't trust the on disk cache, so just return.
+	 */
+	spin_lock(&block_group->lock);
+	if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
+		spin_unlock(&block_group->lock);
+		return 0;
+	}
+	spin_unlock(&block_group->lock);
+
+	INIT_LIST_HEAD(&bitmaps);
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return 0;
+
+	inode = lookup_free_space_inode(root, block_group, path);
+	if (IS_ERR(inode)) {
+		btrfs_free_path(path);
+		return 0;
+	}
+
+	/* Nothing in the space cache, goodbye */
+	if (!i_size_read(inode)) {
+		btrfs_free_path(path);
+		goto out;
+	}
+
+	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
+	key.offset = block_group->key.objectid;
+	key.type = 0;
+
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret) {
+		btrfs_free_path(path);
+		goto out;
+	}
+
+	leaf = path->nodes[0];
+	header = btrfs_item_ptr(leaf, path->slots[0],
+				struct btrfs_free_space_header);
+	num_entries = btrfs_free_space_entries(leaf, header);
+	num_bitmaps = btrfs_free_space_bitmaps(leaf, header);
+	generation = btrfs_free_space_generation(leaf, header);
+	btrfs_free_path(path);
+
+	if (BTRFS_I(inode)->generation != generation) {
+		printk(KERN_ERR "btrfs: free space inode generation (%llu) did"
+		       " not match free space cache generation (%llu) for "
+		       "block group %llu\n",
+		       (unsigned long long)BTRFS_I(inode)->generation,
+		       (unsigned long long)generation,
+		       (unsigned long long)block_group->key.objectid);
+		goto out;
+	}
+
+	if (!num_entries)
+		goto out;
+
+	/* Setup everything for doing checksumming */
+	num_checksums = i_size_read(inode) / PAGE_CACHE_SIZE;
+	checksums = crc = kzalloc(sizeof(u32) * num_checksums, GFP_NOFS);
+	if (!checksums)
+		goto out;
+	first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64);
+	disk_crcs = kzalloc(first_page_offset, GFP_NOFS);
+	if (!disk_crcs)
+		goto out;
+
+	ret = readahead_cache(inode);
+	if (ret) {
+		ret = 0;
+		goto out;
+	}
+
+	while (1) {
+		struct btrfs_free_space_entry *entry;
+		struct btrfs_free_space *e;
+		void *addr;
+		unsigned long offset = 0;
+		unsigned long start_offset = 0;
+		int need_loop = 0;
+
+		if (!num_entries && !num_bitmaps)
+			break;
+
+		if (index == 0) {
+			start_offset = first_page_offset;
+			offset = start_offset;
+		}
+
+		page = grab_cache_page(inode->i_mapping, index);
+		if (!page) {
+			ret = 0;
+			goto free_cache;
+		}
+
+		if (!PageUptodate(page)) {
+			btrfs_readpage(NULL, page);
+			lock_page(page);
+			if (!PageUptodate(page)) {
+				unlock_page(page);
+				page_cache_release(page);
+				printk(KERN_ERR "btrfs: error reading free "
+				       "space cache: %llu\n",
+				       (unsigned long long)
+				       block_group->key.objectid);
+				goto free_cache;
+			}
+		}
+		addr = kmap(page);
+
+		if (index == 0) {
+			u64 *gen;
+
+			memcpy(disk_crcs, addr, first_page_offset);
+			gen = addr + (sizeof(u32) * num_checksums);
+			if (*gen != BTRFS_I(inode)->generation) {
+				printk(KERN_ERR "btrfs: space cache generation"
+				       " (%llu) does not match inode (%llu) "
+				       "for block group %llu\n",
+				       (unsigned long long)*gen,
+				       (unsigned long long)
+				       BTRFS_I(inode)->generation,
+				       (unsigned long long)
+				       block_group->key.objectid);
+				kunmap(page);
+				unlock_page(page);
+				page_cache_release(page);
+				goto free_cache;
+			}
+			crc = (u32 *)disk_crcs;
+		}
+		entry = addr + start_offset;
+
+		/* First lets check our crc before we do anything fun */
+		cur_crc = ~(u32)0;
+		cur_crc = btrfs_csum_data(root, addr + start_offset, cur_crc,
+					  PAGE_CACHE_SIZE - start_offset);
+		btrfs_csum_final(cur_crc, (char *)&cur_crc);
+		if (cur_crc != *crc) {
+			printk(KERN_ERR "btrfs: crc mismatch for page %lu in "
+			       "block group %llu\n", index,
+			       (unsigned long long)block_group->key.objectid);
+			kunmap(page);
+			unlock_page(page);
+			page_cache_release(page);
+			goto free_cache;
+		}
+		crc++;
+
+		while (1) {
+			if (!num_entries)
+				break;
+
+			need_loop = 1;
+			e = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
+			if (!e) {
+				kunmap(page);
+				unlock_page(page);
+				page_cache_release(page);
+				goto free_cache;
+			}
+
+			e->offset = le64_to_cpu(entry->offset);
+			e->bytes = le64_to_cpu(entry->bytes);
+			if (!e->bytes) {
+				kunmap(page);
+				kfree(e);
+				unlock_page(page);
+				page_cache_release(page);
+				goto free_cache;
+			}
+
+			if (entry->type == BTRFS_FREE_SPACE_EXTENT) {
+				spin_lock(&block_group->tree_lock);
+				ret = link_free_space(block_group, e);
+				spin_unlock(&block_group->tree_lock);
+				BUG_ON(ret);
+			} else {
+				e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
+				if (!e->bitmap) {
+					kunmap(page);
+					kfree(e);
+					unlock_page(page);
+					page_cache_release(page);
+					goto free_cache;
+				}
+				spin_lock(&block_group->tree_lock);
+				ret = link_free_space(block_group, e);
+				block_group->total_bitmaps++;
+				recalculate_thresholds(block_group);
+				spin_unlock(&block_group->tree_lock);
+				list_add_tail(&e->list, &bitmaps);
+			}
+
+			num_entries--;
+			offset += sizeof(struct btrfs_free_space_entry);
+			if (offset + sizeof(struct btrfs_free_space_entry) >=
+			    PAGE_CACHE_SIZE)
+				break;
+			entry++;
+		}
+
+		/*
+		 * We read an entry out of this page, we need to move on to the
+		 * next page.
+		 */
+		if (need_loop) {
+			kunmap(page);
+			goto next;
+		}
+
+		/*
+		 * We add the bitmaps at the end of the entries in order that
+		 * the bitmap entries are added to the cache.
+		 */
+		e = list_entry(bitmaps.next, struct btrfs_free_space, list);
+		list_del_init(&e->list);
+		memcpy(e->bitmap, addr, PAGE_CACHE_SIZE);
+		kunmap(page);
+		num_bitmaps--;
+next:
+		unlock_page(page);
+		page_cache_release(page);
+		index++;
+	}
+
+	ret = 1;
+out:
+	kfree(checksums);
+	kfree(disk_crcs);
+	iput(inode);
+	return ret;
+
+free_cache:
+	/* This cache is bogus, make sure it gets cleared */
+	spin_lock(&block_group->lock);
+	block_group->disk_cache_state = BTRFS_DC_CLEAR;
+	spin_unlock(&block_group->lock);
+	btrfs_remove_free_space_cache(block_group);
+	goto out;
+}
+
+int btrfs_write_out_cache(struct btrfs_root *root,
+			  struct btrfs_trans_handle *trans,
+			  struct btrfs_block_group_cache *block_group,
+			  struct btrfs_path *path)
+{
+	struct btrfs_free_space_header *header;
+	struct extent_buffer *leaf;
+	struct inode *inode;
+	struct rb_node *node;
+	struct list_head *pos, *n;
+	struct page *page;
+	struct extent_state *cached_state = NULL;
+	struct list_head bitmap_list;
+	struct btrfs_key key;
+	u64 bytes = 0;
+	u32 *crc, *checksums;
+	pgoff_t index = 0, last_index = 0;
+	unsigned long first_page_offset;
+	int num_checksums;
+	int entries = 0;
+	int bitmaps = 0;
+	int ret = 0;
+
+	root = root->fs_info->tree_root;
+
+	INIT_LIST_HEAD(&bitmap_list);
+
+	spin_lock(&block_group->lock);
+	if (block_group->disk_cache_state < BTRFS_DC_SETUP) {
+		spin_unlock(&block_group->lock);
+		return 0;
+	}
+	spin_unlock(&block_group->lock);
+
+	inode = lookup_free_space_inode(root, block_group, path);
+	if (IS_ERR(inode))
+		return 0;
+
+	if (!i_size_read(inode)) {
+		iput(inode);
+		return 0;
+	}
+
+	last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
+	filemap_write_and_wait(inode->i_mapping);
+	btrfs_wait_ordered_range(inode, inode->i_size &
+				 ~(root->sectorsize - 1), (u64)-1);
+
+	/* We need a checksum per page. */
+	num_checksums = i_size_read(inode) / PAGE_CACHE_SIZE;
+	crc = checksums  = kzalloc(sizeof(u32) * num_checksums, GFP_NOFS);
+	if (!crc) {
+		iput(inode);
+		return 0;
+	}
+
+	/* Since the first page has all of our checksums and our generation we
+	 * need to calculate the offset into the page that we can start writing
+	 * our entries.
+	 */
+	first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64);
+
+	node = rb_first(&block_group->free_space_offset);
+	if (!node)
+		goto out_free;
+
+	/*
+	 * Lock all pages first so we can lock the extent safely.
+	 *
+	 * NOTE: Because we hold the ref the entire time we're going to write to
+	 * the page find_get_page should never fail, so we don't do a check
+	 * after find_get_page at this point.  Just putting this here so people
+	 * know and don't freak out.
+	 */
+	while (index <= last_index) {
+		page = grab_cache_page(inode->i_mapping, index);
+		if (!page) {
+			pgoff_t i = 0;
+
+			while (i < index) {
+				page = find_get_page(inode->i_mapping, i);
+				unlock_page(page);
+				page_cache_release(page);
+				page_cache_release(page);
+				i++;
+			}
+			goto out_free;
+		}
+		index++;
+	}
+
+	index = 0;
+	lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
+			 0, &cached_state, GFP_NOFS);
+
+	/* Write out the extent entries */
+	do {
+		struct btrfs_free_space_entry *entry;
+		void *addr;
+		unsigned long offset = 0;
+		unsigned long start_offset = 0;
+
+		if (index == 0) {
+			start_offset = first_page_offset;
+			offset = start_offset;
+		}
+
+		page = find_get_page(inode->i_mapping, index);
+
+		addr = kmap(page);
+		entry = addr + start_offset;
+
+		memset(addr, 0, PAGE_CACHE_SIZE);
+		while (1) {
+			struct btrfs_free_space *e;
+
+			e = rb_entry(node, struct btrfs_free_space, offset_index);
+			entries++;
+
+			entry->offset = cpu_to_le64(e->offset);
+			entry->bytes = cpu_to_le64(e->bytes);
+			if (e->bitmap) {
+				entry->type = BTRFS_FREE_SPACE_BITMAP;
+				list_add_tail(&e->list, &bitmap_list);
+				bitmaps++;
+			} else {
+				entry->type = BTRFS_FREE_SPACE_EXTENT;
+			}
+			node = rb_next(node);
+			if (!node)
+				break;
+			offset += sizeof(struct btrfs_free_space_entry);
+			if (offset + sizeof(struct btrfs_free_space_entry) >=
+			    PAGE_CACHE_SIZE)
+				break;
+			entry++;
+		}
+		*crc = ~(u32)0;
+		*crc = btrfs_csum_data(root, addr + start_offset, *crc,
+				       PAGE_CACHE_SIZE - start_offset);
+		kunmap(page);
+
+		btrfs_csum_final(*crc, (char *)crc);
+		crc++;
+
+		bytes += PAGE_CACHE_SIZE;
+
+		ClearPageChecked(page);
+		set_page_extent_mapped(page);
+		SetPageUptodate(page);
+		set_page_dirty(page);
+
+		/*
+		 * We need to release our reference we got for grab_cache_page,
+		 * except for the first page which will hold our checksums, we
+		 * do that below.
+		 */
+		if (index != 0) {
+			unlock_page(page);
+			page_cache_release(page);
+		}
+
+		page_cache_release(page);
+
+		index++;
+	} while (node);
+
+	/* Write out the bitmaps */
+	list_for_each_safe(pos, n, &bitmap_list) {
+		void *addr;
+		struct btrfs_free_space *entry =
+			list_entry(pos, struct btrfs_free_space, list);
+
+		page = find_get_page(inode->i_mapping, index);
+
+		addr = kmap(page);
+		memcpy(addr, entry->bitmap, PAGE_CACHE_SIZE);
+		*crc = ~(u32)0;
+		*crc = btrfs_csum_data(root, addr, *crc, PAGE_CACHE_SIZE);
+		kunmap(page);
+		btrfs_csum_final(*crc, (char *)crc);
+		crc++;
+		bytes += PAGE_CACHE_SIZE;
+
+		ClearPageChecked(page);
+		set_page_extent_mapped(page);
+		SetPageUptodate(page);
+		set_page_dirty(page);
+		unlock_page(page);
+		page_cache_release(page);
+		page_cache_release(page);
+		list_del_init(&entry->list);
+		index++;
+	}
+
+	/* Zero out the rest of the pages just to make sure */
+	while (index <= last_index) {
+		void *addr;
+
+		page = find_get_page(inode->i_mapping, index);
+
+		addr = kmap(page);
+		memset(addr, 0, PAGE_CACHE_SIZE);
+		kunmap(page);
+		ClearPageChecked(page);
+		set_page_extent_mapped(page);
+		SetPageUptodate(page);
+		set_page_dirty(page);
+		unlock_page(page);
+		page_cache_release(page);
+		page_cache_release(page);
+		bytes += PAGE_CACHE_SIZE;
+		index++;
+	}
+
+	btrfs_set_extent_delalloc(inode, 0, bytes - 1, &cached_state);
+
+	/* Write the checksums and trans id to the first page */
+	{
+		void *addr;
+		u64 *gen;
+
+		page = find_get_page(inode->i_mapping, 0);
+
+		addr = kmap(page);
+		memcpy(addr, checksums, sizeof(u32) * num_checksums);
+		gen = addr + (sizeof(u32) * num_checksums);
+		*gen = trans->transid;
+		kunmap(page);
+		ClearPageChecked(page);
+		set_page_extent_mapped(page);
+		SetPageUptodate(page);
+		set_page_dirty(page);
+		unlock_page(page);
+		page_cache_release(page);
+		page_cache_release(page);
+	}
+	BTRFS_I(inode)->generation = trans->transid;
+
+	unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
+			     i_size_read(inode) - 1, &cached_state, GFP_NOFS);
+
+	filemap_write_and_wait(inode->i_mapping);
+
+	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
+	key.offset = block_group->key.objectid;
+	key.type = 0;
+
+	ret = btrfs_search_slot(trans, root, &key, path, 1, 1);
+	if (ret < 0) {
+		ret = 0;
+		clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1,
+				 EXTENT_DIRTY | EXTENT_DELALLOC |
+				 EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS);
+		goto out_free;
+	}
+	leaf = path->nodes[0];
+	if (ret > 0) {
+		struct btrfs_key found_key;
+		BUG_ON(!path->slots[0]);
+		path->slots[0]--;
+		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+		if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
+		    found_key.offset != block_group->key.objectid) {
+			ret = 0;
+			clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1,
+					 EXTENT_DIRTY | EXTENT_DELALLOC |
+					 EXTENT_DO_ACCOUNTING, 0, 0, NULL,
+					 GFP_NOFS);
+			btrfs_release_path(root, path);
+			goto out_free;
+		}
+	}
+	header = btrfs_item_ptr(leaf, path->slots[0],
+				struct btrfs_free_space_header);
+	btrfs_set_free_space_entries(leaf, header, entries);
+	btrfs_set_free_space_bitmaps(leaf, header, bitmaps);
+	btrfs_set_free_space_generation(leaf, header, trans->transid);
+	btrfs_mark_buffer_dirty(leaf);
+	btrfs_release_path(root, path);
+
+	ret = 1;
+
+out_free:
+	if (ret == 0) {
+		invalidate_inode_pages2_range(inode->i_mapping, 0, index);
+		spin_lock(&block_group->lock);
+		block_group->disk_cache_state = BTRFS_DC_ERROR;
+		spin_unlock(&block_group->lock);
+		BTRFS_I(inode)->generation = 0;
+	}
+	kfree(checksums);
+	btrfs_update_inode(trans, root, inode);
+	iput(inode);
+	return ret;
+}
+
 static inline unsigned long offset_to_bit(u64 bitmap_start, u64 sectorsize,
 					  u64 offset)
 {
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 890a8e79011..e49ca5c321b 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -27,6 +27,24 @@ struct btrfs_free_space {
 	struct list_head list;
 };
 
+struct inode *lookup_free_space_inode(struct btrfs_root *root,
+				      struct btrfs_block_group_cache
+				      *block_group, struct btrfs_path *path);
+int create_free_space_inode(struct btrfs_root *root,
+			    struct btrfs_trans_handle *trans,
+			    struct btrfs_block_group_cache *block_group,
+			    struct btrfs_path *path);
+
+int btrfs_truncate_free_space_cache(struct btrfs_root *root,
+				    struct btrfs_trans_handle *trans,
+				    struct btrfs_path *path,
+				    struct inode *inode);
+int load_free_space_cache(struct btrfs_fs_info *fs_info,
+			  struct btrfs_block_group_cache *block_group);
+int btrfs_write_out_cache(struct btrfs_root *root,
+			  struct btrfs_trans_handle *trans,
+			  struct btrfs_block_group_cache *block_group,
+			  struct btrfs_path *path);
 int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
 			 u64 bytenr, u64 size);
 int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1bff92ad474..558cac2dfa5 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -319,8 +319,6 @@ static noinline int compress_file_range(struct inode *inode,
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_trans_handle *trans;
 	u64 num_bytes;
-	u64 orig_start;
-	u64 disk_num_bytes;
 	u64 blocksize = root->sectorsize;
 	u64 actual_end;
 	u64 isize = i_size_read(inode);
@@ -335,8 +333,6 @@ static noinline int compress_file_range(struct inode *inode,
 	int i;
 	int will_compress;
 
-	orig_start = start;
-
 	actual_end = min_t(u64, isize, end + 1);
 again:
 	will_compress = 0;
@@ -371,7 +367,6 @@ again:
 	total_compressed = min(total_compressed, max_uncompressed);
 	num_bytes = (end - start + blocksize) & ~(blocksize - 1);
 	num_bytes = max(blocksize,  num_bytes);
-	disk_num_bytes = num_bytes;
 	total_in = 0;
 	ret = 0;
 
@@ -467,7 +462,6 @@ again:
 		if (total_compressed >= total_in) {
 			will_compress = 0;
 		} else {
-			disk_num_bytes = total_compressed;
 			num_bytes = total_in;
 		}
 	}
@@ -757,20 +751,17 @@ static noinline int cow_file_range(struct inode *inode,
 	u64 disk_num_bytes;
 	u64 cur_alloc_size;
 	u64 blocksize = root->sectorsize;
-	u64 actual_end;
-	u64 isize = i_size_read(inode);
 	struct btrfs_key ins;
 	struct extent_map *em;
 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 	int ret = 0;
 
+	BUG_ON(root == root->fs_info->tree_root);
 	trans = btrfs_join_transaction(root, 1);
 	BUG_ON(!trans);
 	btrfs_set_trans_block_group(trans, inode);
 	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
 
-	actual_end = min_t(u64, isize, end + 1);
-
 	num_bytes = (end - start + blocksize) & ~(blocksize - 1);
 	num_bytes = max(blocksize,  num_bytes);
 	disk_num_bytes = num_bytes;
@@ -1035,10 +1026,16 @@ static noinline int run_delalloc_nocow(struct inode *inode,
 	int type;
 	int nocow;
 	int check_prev = 1;
+	bool nolock = false;
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
-	trans = btrfs_join_transaction(root, 1);
+	if (root == root->fs_info->tree_root) {
+		nolock = true;
+		trans = btrfs_join_transaction_nolock(root, 1);
+	} else {
+		trans = btrfs_join_transaction(root, 1);
+	}
 	BUG_ON(!trans);
 
 	cow_start = (u64)-1;
@@ -1211,8 +1208,13 @@ out_check:
 		BUG_ON(ret);
 	}
 
-	ret = btrfs_end_transaction(trans, root);
-	BUG_ON(ret);
+	if (nolock) {
+		ret = btrfs_end_transaction_nolock(trans, root);
+		BUG_ON(ret);
+	} else {
+		ret = btrfs_end_transaction(trans, root);
+		BUG_ON(ret);
+	}
 	btrfs_free_path(path);
 	return 0;
 }
@@ -1289,6 +1291,8 @@ static int btrfs_set_bit_hook(struct inode *inode,
 	if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
 		struct btrfs_root *root = BTRFS_I(inode)->root;
 		u64 len = state->end + 1 - state->start;
+		int do_list = (root->root_key.objectid !=
+			       BTRFS_ROOT_TREE_OBJECTID);
 
 		if (*bits & EXTENT_FIRST_DELALLOC)
 			*bits &= ~EXTENT_FIRST_DELALLOC;
@@ -1298,7 +1302,7 @@ static int btrfs_set_bit_hook(struct inode *inode,
 		spin_lock(&root->fs_info->delalloc_lock);
 		BTRFS_I(inode)->delalloc_bytes += len;
 		root->fs_info->delalloc_bytes += len;
-		if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
+		if (do_list && list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
 			list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
 				      &root->fs_info->delalloc_inodes);
 		}
@@ -1321,6 +1325,8 @@ static int btrfs_clear_bit_hook(struct inode *inode,
 	if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
 		struct btrfs_root *root = BTRFS_I(inode)->root;
 		u64 len = state->end + 1 - state->start;
+		int do_list = (root->root_key.objectid !=
+			       BTRFS_ROOT_TREE_OBJECTID);
 
 		if (*bits & EXTENT_FIRST_DELALLOC)
 			*bits &= ~EXTENT_FIRST_DELALLOC;
@@ -1330,14 +1336,15 @@ static int btrfs_clear_bit_hook(struct inode *inode,
 		if (*bits & EXTENT_DO_ACCOUNTING)
 			btrfs_delalloc_release_metadata(inode, len);
 
-		if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID)
+		if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
+		    && do_list)
 			btrfs_free_reserved_data_space(inode, len);
 
 		spin_lock(&root->fs_info->delalloc_lock);
 		root->fs_info->delalloc_bytes -= len;
 		BTRFS_I(inode)->delalloc_bytes -= len;
 
-		if (BTRFS_I(inode)->delalloc_bytes == 0 &&
+		if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
 		    !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
 			list_del_init(&BTRFS_I(inode)->delalloc_inodes);
 		}
@@ -1372,7 +1379,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 
 	if (map_length < length + size)
 		return 1;
-	return 0;
+	return ret;
 }
 
 /*
@@ -1426,10 +1433,13 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 
 	skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
 
-	ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+	if (root == root->fs_info->tree_root)
+		ret = btrfs_bio_wq_end_io(root->fs_info, bio, 2);
+	else
+		ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
 	BUG_ON(ret);
 
-	if (!(rw & (1 << BIO_RW))) {
+	if (!(rw & REQ_WRITE)) {
 		if (bio_flags & EXTENT_BIO_COMPRESSED) {
 			return btrfs_submit_compressed_read(inode, bio,
 						    mirror_num, bio_flags);
@@ -1662,6 +1672,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 	struct extent_state *cached_state = NULL;
 	int compressed = 0;
 	int ret;
+	bool nolock = false;
 
 	ret = btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
 					     end - start + 1);
@@ -1669,11 +1680,17 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 		return 0;
 	BUG_ON(!ordered_extent);
 
+	nolock = (root == root->fs_info->tree_root);
+
 	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
 		BUG_ON(!list_empty(&ordered_extent->list));
 		ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
 		if (!ret) {
-			trans = btrfs_join_transaction(root, 1);
+			if (nolock)
+				trans = btrfs_join_transaction_nolock(root, 1);
+			else
+				trans = btrfs_join_transaction(root, 1);
+			BUG_ON(!trans);
 			btrfs_set_trans_block_group(trans, inode);
 			trans->block_rsv = &root->fs_info->delalloc_block_rsv;
 			ret = btrfs_update_inode(trans, root, inode);
@@ -1686,7 +1703,10 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 			 ordered_extent->file_offset + ordered_extent->len - 1,
 			 0, &cached_state, GFP_NOFS);
 
-	trans = btrfs_join_transaction(root, 1);
+	if (nolock)
+		trans = btrfs_join_transaction_nolock(root, 1);
+	else
+		trans = btrfs_join_transaction(root, 1);
 	btrfs_set_trans_block_group(trans, inode);
 	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
 
@@ -1700,6 +1720,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 						ordered_extent->len);
 		BUG_ON(ret);
 	} else {
+		BUG_ON(root == root->fs_info->tree_root);
 		ret = insert_reserved_file_extent(trans, inode,
 						ordered_extent->file_offset,
 						ordered_extent->start,
@@ -1724,9 +1745,15 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 	ret = btrfs_update_inode(trans, root, inode);
 	BUG_ON(ret);
 out:
-	btrfs_delalloc_release_metadata(inode, ordered_extent->len);
-	if (trans)
-		btrfs_end_transaction(trans, root);
+	if (nolock) {
+		if (trans)
+			btrfs_end_transaction_nolock(trans, root);
+	} else {
+		btrfs_delalloc_release_metadata(inode, ordered_extent->len);
+		if (trans)
+			btrfs_end_transaction(trans, root);
+	}
+
 	/* once for us */
 	btrfs_put_ordered_extent(ordered_extent);
 	/* once for the tree */
@@ -1841,7 +1868,7 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
 	bio->bi_size = 0;
 
 	bio_add_page(bio, page, failrec->len, start - page_offset(page));
-	if (failed_bio->bi_rw & (1 << BIO_RW))
+	if (failed_bio->bi_rw & REQ_WRITE)
 		rw = WRITE;
 	else
 		rw = READ;
@@ -2237,7 +2264,6 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
 {
 	struct btrfs_path *path;
 	struct extent_buffer *leaf;
-	struct btrfs_item *item;
 	struct btrfs_key key, found_key;
 	struct btrfs_trans_handle *trans;
 	struct inode *inode;
@@ -2275,7 +2301,6 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
 
 		/* pull out the item */
 		leaf = path->nodes[0];
-		item = btrfs_item_nr(leaf, path->slots[0]);
 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
 
 		/* make sure the item matches what we want */
@@ -2651,7 +2676,8 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
 
 	ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
 					   dir, index);
-	BUG_ON(ret);
+	if (ret == -ENOENT)
+		ret = 0;
 err:
 	btrfs_free_path(path);
 	if (ret)
@@ -2672,8 +2698,8 @@ static int check_path_shared(struct btrfs_root *root,
 {
 	struct extent_buffer *eb;
 	int level;
-	int ret;
 	u64 refs = 1;
+	int uninitialized_var(ret);
 
 	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
 		if (!path->nodes[level])
@@ -2686,7 +2712,7 @@ static int check_path_shared(struct btrfs_root *root,
 		if (refs > 1)
 			return 1;
 	}
-	return 0;
+	return ret; /* XXX callers? */
 }
 
 /*
@@ -2938,7 +2964,6 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
 	dir->i_mtime = dir->i_ctime = CURRENT_TIME;
 	ret = btrfs_update_inode(trans, root, dir);
 	BUG_ON(ret);
-	dir->i_sb->s_dirt = 1;
 
 	btrfs_free_path(path);
 	return 0;
@@ -3197,7 +3222,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 
 	BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
 
-	if (root->ref_cows)
+	if (root->ref_cows || root == root->fs_info->tree_root)
 		btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
 
 	path = btrfs_alloc_path();
@@ -3345,7 +3370,8 @@ delete:
 		} else {
 			break;
 		}
-		if (found_extent && root->ref_cows) {
+		if (found_extent && (root->ref_cows ||
+				     root == root->fs_info->tree_root)) {
 			btrfs_set_path_blocking(path);
 			ret = btrfs_free_extent(trans, root, extent_start,
 						extent_num_bytes, 0,
@@ -3656,17 +3682,19 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 		if (err)
 			return err;
 	}
-	attr->ia_valid &= ~ATTR_SIZE;
 
-	if (attr->ia_valid)
-		err = inode_setattr(inode, attr);
+	if (attr->ia_valid) {
+		setattr_copy(inode, attr);
+		mark_inode_dirty(inode);
+
+		if (attr->ia_valid & ATTR_MODE)
+			err = btrfs_acl_chmod(inode);
+	}
 
-	if (!err && ((attr->ia_valid & ATTR_MODE)))
-		err = btrfs_acl_chmod(inode);
 	return err;
 }
 
-void btrfs_delete_inode(struct inode *inode)
+void btrfs_evict_inode(struct inode *inode)
 {
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -3674,10 +3702,15 @@ void btrfs_delete_inode(struct inode *inode)
 	int ret;
 
 	truncate_inode_pages(&inode->i_data, 0);
+	if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 ||
+			       root == root->fs_info->tree_root))
+		goto no_delete;
+
 	if (is_bad_inode(inode)) {
 		btrfs_orphan_del(NULL, inode);
 		goto no_delete;
 	}
+	/* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */
 	btrfs_wait_ordered_range(inode, 0, (u64)-1);
 
 	if (root->fs_info->log_root_recovering) {
@@ -3727,7 +3760,7 @@ void btrfs_delete_inode(struct inode *inode)
 	btrfs_end_transaction(trans, root);
 	btrfs_btree_balance_dirty(root, nr);
 no_delete:
-	clear_inode(inode);
+	end_writeback(inode);
 	return;
 }
 
@@ -3844,7 +3877,7 @@ again:
 	p = &root->inode_tree.rb_node;
 	parent = NULL;
 
-	if (hlist_unhashed(&inode->i_hash))
+	if (inode_unhashed(inode))
 		return;
 
 	spin_lock(&root->inode_lock);
@@ -3858,7 +3891,7 @@ again:
 			p = &parent->rb_right;
 		else {
 			WARN_ON(!(entry->vfs_inode.i_state &
-				  (I_WILL_FREE | I_FREEING | I_CLEAR)));
+				  (I_WILL_FREE | I_FREEING)));
 			rb_erase(parent, &root->inode_tree);
 			RB_CLEAR_NODE(parent);
 			spin_unlock(&root->inode_lock);
@@ -3883,7 +3916,14 @@ static void inode_tree_del(struct inode *inode)
 	}
 	spin_unlock(&root->inode_lock);
 
-	if (empty && btrfs_root_refs(&root->root_item) == 0) {
+	/*
+	 * Free space cache has inodes in the tree root, but the tree root has a
+	 * root_refs of 0, so this could end up dropping the tree root as a
+	 * snapshot, so we need the extra !root->fs_info->tree_root check to
+	 * make sure we don't drop it.
+	 */
+	if (empty && btrfs_root_refs(&root->root_item) == 0 &&
+	    root != root->fs_info->tree_root) {
 		synchronize_srcu(&root->fs_info->subvol_srcu);
 		spin_lock(&root->inode_lock);
 		empty = RB_EMPTY_ROOT(&root->inode_tree);
@@ -3937,7 +3977,7 @@ again:
 			if (atomic_read(&inode->i_count) > 1)
 				d_prune_aliases(inode);
 			/*
-			 * btrfs_drop_inode will remove it from
+			 * btrfs_drop_inode will have it removed from
 			 * the inode cache when its usage count
 			 * hits zero.
 			 */
@@ -4277,14 +4317,24 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_trans_handle *trans;
 	int ret = 0;
+	bool nolock = false;
 
 	if (BTRFS_I(inode)->dummy_inode)
 		return 0;
 
+	smp_mb();
+	nolock = (root->fs_info->closing && root == root->fs_info->tree_root);
+
 	if (wbc->sync_mode == WB_SYNC_ALL) {
-		trans = btrfs_join_transaction(root, 1);
+		if (nolock)
+			trans = btrfs_join_transaction_nolock(root, 1);
+		else
+			trans = btrfs_join_transaction(root, 1);
 		btrfs_set_trans_block_group(trans, inode);
-		ret = btrfs_commit_transaction(trans, root);
+		if (nolock)
+			ret = btrfs_end_transaction_nolock(trans, root);
+		else
+			ret = btrfs_commit_transaction(trans, root);
 	}
 	return ret;
 }
@@ -4753,7 +4803,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 	}
 
 	btrfs_set_trans_block_group(trans, dir);
-	atomic_inc(&inode->i_count);
+	ihold(inode);
 
 	err = btrfs_add_nondir(trans, dentry, inode, 1, index);
 
@@ -5640,9 +5690,8 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_dio_private *dip;
 	struct bio_vec *bvec = bio->bi_io_vec;
-	u64 start;
 	int skip_sum;
-	int write = rw & (1 << BIO_RW);
+	int write = rw & REQ_WRITE;
 	int ret = 0;
 
 	skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
@@ -5666,7 +5715,6 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
 	dip->inode = inode;
 	dip->logical_offset = file_offset;
 
-	start = dip->logical_offset;
 	dip->bytes = 0;
 	do {
 		dip->bytes += bvec->bv_len;
@@ -6303,6 +6351,21 @@ void btrfs_destroy_inode(struct inode *inode)
 		spin_unlock(&root->fs_info->ordered_extent_lock);
 	}
 
+	if (root == root->fs_info->tree_root) {
+		struct btrfs_block_group_cache *block_group;
+
+		block_group = btrfs_lookup_block_group(root->fs_info,
+						BTRFS_I(inode)->block_group);
+		if (block_group && block_group->inode == inode) {
+			spin_lock(&block_group->lock);
+			block_group->inode = NULL;
+			spin_unlock(&block_group->lock);
+			btrfs_put_block_group(block_group);
+		} else if (block_group) {
+			btrfs_put_block_group(block_group);
+		}
+	}
+
 	spin_lock(&root->orphan_lock);
 	if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
 		printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n",
@@ -6331,13 +6394,15 @@ free:
 	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
 }
 
-void btrfs_drop_inode(struct inode *inode)
+int btrfs_drop_inode(struct inode *inode)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	if (inode->i_nlink > 0 && btrfs_root_refs(&root->root_item) == 0)
-		generic_delete_inode(inode);
+
+	if (btrfs_root_refs(&root->root_item) == 0 &&
+	    root != root->fs_info->tree_root)
+		return 1;
 	else
-		generic_drop_inode(inode);
+		return generic_drop_inode(inode);
 }
 
 static void init_once(void *foo)
@@ -6603,7 +6668,8 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
 	return 0;
 }
 
-int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput)
+int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput,
+				   int sync)
 {
 	struct btrfs_inode *binode;
 	struct inode *inode = NULL;
@@ -6625,7 +6691,26 @@ int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput)
 	spin_unlock(&root->fs_info->delalloc_lock);
 
 	if (inode) {
-		write_inode_now(inode, 0);
+		if (sync) {
+			filemap_write_and_wait(inode->i_mapping);
+			/*
+			 * We have to do this because compression doesn't
+			 * actually set PG_writeback until it submits the pages
+			 * for IO, which happens in an async thread, so we could
+			 * race and not actually wait for any writeback pages
+			 * because they've not been submitted yet.  Technically
+			 * this could still be the case for the ordered stuff
+			 * since the async thread may not have started to do its
+			 * work yet.  If this becomes the case then we need to
+			 * figure out a way to make sure that in writepage we
+			 * wait for any async pages to be submitted before
+			 * returning so that fdatawait does what its supposed to
+			 * do.
+			 */
+			btrfs_wait_ordered_range(inode, 0, (u64)-1);
+		} else {
+			filemap_flush(inode->i_mapping);
+		}
 		if (delay_iput)
 			btrfs_add_delayed_iput(inode);
 		else
@@ -6751,27 +6836,33 @@ out_unlock:
 	return err;
 }
 
-int btrfs_prealloc_file_range(struct inode *inode, int mode,
-			      u64 start, u64 num_bytes, u64 min_size,
-			      loff_t actual_len, u64 *alloc_hint)
+static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
+				       u64 start, u64 num_bytes, u64 min_size,
+				       loff_t actual_len, u64 *alloc_hint,
+				       struct btrfs_trans_handle *trans)
 {
-	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_key ins;
 	u64 cur_offset = start;
 	int ret = 0;
+	bool own_trans = true;
 
+	if (trans)
+		own_trans = false;
 	while (num_bytes > 0) {
-		trans = btrfs_start_transaction(root, 3);
-		if (IS_ERR(trans)) {
-			ret = PTR_ERR(trans);
-			break;
+		if (own_trans) {
+			trans = btrfs_start_transaction(root, 3);
+			if (IS_ERR(trans)) {
+				ret = PTR_ERR(trans);
+				break;
+			}
 		}
 
 		ret = btrfs_reserve_extent(trans, root, num_bytes, min_size,
 					   0, *alloc_hint, (u64)-1, &ins, 1);
 		if (ret) {
-			btrfs_end_transaction(trans, root);
+			if (own_trans)
+				btrfs_end_transaction(trans, root);
 			break;
 		}
 
@@ -6804,11 +6895,30 @@ int btrfs_prealloc_file_range(struct inode *inode, int mode,
 		ret = btrfs_update_inode(trans, root, inode);
 		BUG_ON(ret);
 
-		btrfs_end_transaction(trans, root);
+		if (own_trans)
+			btrfs_end_transaction(trans, root);
 	}
 	return ret;
 }
 
+int btrfs_prealloc_file_range(struct inode *inode, int mode,
+			      u64 start, u64 num_bytes, u64 min_size,
+			      loff_t actual_len, u64 *alloc_hint)
+{
+	return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
+					   min_size, actual_len, alloc_hint,
+					   NULL);
+}
+
+int btrfs_prealloc_file_range_trans(struct inode *inode,
+				    struct btrfs_trans_handle *trans, int mode,
+				    u64 start, u64 num_bytes, u64 min_size,
+				    loff_t actual_len, u64 *alloc_hint)
+{
+	return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
+					   min_size, actual_len, alloc_hint, trans);
+}
+
 static long btrfs_fallocate(struct inode *inode, int mode,
 			    loff_t offset, loff_t len)
 {
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 9254b3d58db..463d91b4dd3 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -224,7 +224,8 @@ static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
 
 static noinline int create_subvol(struct btrfs_root *root,
 				  struct dentry *dentry,
-				  char *name, int namelen)
+				  char *name, int namelen,
+				  u64 *async_transid)
 {
 	struct btrfs_trans_handle *trans;
 	struct btrfs_key key;
@@ -338,13 +339,19 @@ static noinline int create_subvol(struct btrfs_root *root,
 
 	d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry));
 fail:
-	err = btrfs_commit_transaction(trans, root);
+	if (async_transid) {
+		*async_transid = trans->transid;
+		err = btrfs_commit_transaction_async(trans, root, 1);
+	} else {
+		err = btrfs_commit_transaction(trans, root);
+	}
 	if (err && !ret)
 		ret = err;
 	return ret;
 }
 
-static int create_snapshot(struct btrfs_root *root, struct dentry *dentry)
+static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
+			   char *name, int namelen, u64 *async_transid)
 {
 	struct inode *inode;
 	struct btrfs_pending_snapshot *pending_snapshot;
@@ -373,7 +380,14 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry)
 
 	list_add(&pending_snapshot->list,
 		 &trans->transaction->pending_snapshots);
-	ret = btrfs_commit_transaction(trans, root->fs_info->extent_root);
+	if (async_transid) {
+		*async_transid = trans->transid;
+		ret = btrfs_commit_transaction_async(trans,
+				     root->fs_info->extent_root, 1);
+	} else {
+		ret = btrfs_commit_transaction(trans,
+					       root->fs_info->extent_root);
+	}
 	BUG_ON(ret);
 
 	ret = pending_snapshot->error;
@@ -395,6 +409,76 @@ fail:
 	return ret;
 }
 
+/*  copy of check_sticky in fs/namei.c()
+* It's inline, so penalty for filesystems that don't use sticky bit is
+* minimal.
+*/
+static inline int btrfs_check_sticky(struct inode *dir, struct inode *inode)
+{
+	uid_t fsuid = current_fsuid();
+
+	if (!(dir->i_mode & S_ISVTX))
+		return 0;
+	if (inode->i_uid == fsuid)
+		return 0;
+	if (dir->i_uid == fsuid)
+		return 0;
+	return !capable(CAP_FOWNER);
+}
+
+/*  copy of may_delete in fs/namei.c()
+ *	Check whether we can remove a link victim from directory dir, check
+ *  whether the type of victim is right.
+ *  1. We can't do it if dir is read-only (done in permission())
+ *  2. We should have write and exec permissions on dir
+ *  3. We can't remove anything from append-only dir
+ *  4. We can't do anything with immutable dir (done in permission())
+ *  5. If the sticky bit on dir is set we should either
+ *	a. be owner of dir, or
+ *	b. be owner of victim, or
+ *	c. have CAP_FOWNER capability
+ *  6. If the victim is append-only or immutable we can't do antyhing with
+ *     links pointing to it.
+ *  7. If we were asked to remove a directory and victim isn't one - ENOTDIR.
+ *  8. If we were asked to remove a non-directory and victim isn't one - EISDIR.
+ *  9. We can't remove a root or mountpoint.
+ * 10. We don't allow removal of NFS sillyrenamed files; it's handled by
+ *     nfs_async_unlink().
+ */
+
+static int btrfs_may_delete(struct inode *dir,struct dentry *victim,int isdir)
+{
+	int error;
+
+	if (!victim->d_inode)
+		return -ENOENT;
+
+	BUG_ON(victim->d_parent->d_inode != dir);
+	audit_inode_child(victim, dir);
+
+	error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
+	if (error)
+		return error;
+	if (IS_APPEND(dir))
+		return -EPERM;
+	if (btrfs_check_sticky(dir, victim->d_inode)||
+		IS_APPEND(victim->d_inode)||
+	    IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode))
+		return -EPERM;
+	if (isdir) {
+		if (!S_ISDIR(victim->d_inode->i_mode))
+			return -ENOTDIR;
+		if (IS_ROOT(victim))
+			return -EBUSY;
+	} else if (S_ISDIR(victim->d_inode->i_mode))
+		return -EISDIR;
+	if (IS_DEADDIR(dir))
+		return -ENOENT;
+	if (victim->d_flags & DCACHE_NFSFS_RENAMED)
+		return -EBUSY;
+	return 0;
+}
+
 /* copy of may_create in fs/namei.c() */
 static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
 {
@@ -412,7 +496,8 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
  */
 static noinline int btrfs_mksubvol(struct path *parent,
 				   char *name, int namelen,
-				   struct btrfs_root *snap_src)
+				   struct btrfs_root *snap_src,
+				   u64 *async_transid)
 {
 	struct inode *dir  = parent->dentry->d_inode;
 	struct dentry *dentry;
@@ -443,10 +528,11 @@ static noinline int btrfs_mksubvol(struct path *parent,
 		goto out_up_read;
 
 	if (snap_src) {
-		error = create_snapshot(snap_src, dentry);
+		error = create_snapshot(snap_src, dentry,
+					name, namelen, async_transid);
 	} else {
 		error = create_subvol(BTRFS_I(dir)->root, dentry,
-				      name, namelen);
+				      name, namelen, async_transid);
 	}
 	if (!error)
 		fsnotify_mkdir(dir, dentry);
@@ -708,7 +794,6 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
 	char *sizestr;
 	char *devstr = NULL;
 	int ret = 0;
-	int namelen;
 	int mod = 0;
 
 	if (root->fs_info->sb->s_flags & MS_RDONLY)
@@ -722,7 +807,6 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
 		return PTR_ERR(vol_args);
 
 	vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
-	namelen = strlen(vol_args->name);
 
 	mutex_lock(&root->fs_info->volume_mutex);
 	sizestr = vol_args->name;
@@ -801,11 +885,13 @@ out_unlock:
 	return ret;
 }
 
-static noinline int btrfs_ioctl_snap_create(struct file *file,
-					    void __user *arg, int subvol)
+static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
+						    char *name,
+						    unsigned long fd,
+						    int subvol,
+						    u64 *transid)
 {
 	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
-	struct btrfs_ioctl_vol_args *vol_args;
 	struct file *src_file;
 	int namelen;
 	int ret = 0;
@@ -813,23 +899,18 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
 	if (root->fs_info->sb->s_flags & MS_RDONLY)
 		return -EROFS;
 
-	vol_args = memdup_user(arg, sizeof(*vol_args));
-	if (IS_ERR(vol_args))
-		return PTR_ERR(vol_args);
-
-	vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
-	namelen = strlen(vol_args->name);
-	if (strchr(vol_args->name, '/')) {
+	namelen = strlen(name);
+	if (strchr(name, '/')) {
 		ret = -EINVAL;
 		goto out;
 	}
 
 	if (subvol) {
-		ret = btrfs_mksubvol(&file->f_path, vol_args->name, namelen,
-				     NULL);
+		ret = btrfs_mksubvol(&file->f_path, name, namelen,
+				     NULL, transid);
 	} else {
 		struct inode *src_inode;
-		src_file = fget(vol_args->fd);
+		src_file = fget(fd);
 		if (!src_file) {
 			ret = -EINVAL;
 			goto out;
@@ -843,12 +924,56 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
 			fput(src_file);
 			goto out;
 		}
-		ret = btrfs_mksubvol(&file->f_path, vol_args->name, namelen,
-				     BTRFS_I(src_inode)->root);
+		ret = btrfs_mksubvol(&file->f_path, name, namelen,
+				     BTRFS_I(src_inode)->root,
+				     transid);
 		fput(src_file);
 	}
 out:
+	return ret;
+}
+
+static noinline int btrfs_ioctl_snap_create(struct file *file,
+					    void __user *arg, int subvol,
+					    int async)
+{
+	struct btrfs_ioctl_vol_args *vol_args = NULL;
+	struct btrfs_ioctl_async_vol_args *async_vol_args = NULL;
+	char *name;
+	u64 fd;
+	u64 transid = 0;
+	int ret;
+
+	if (async) {
+		async_vol_args = memdup_user(arg, sizeof(*async_vol_args));
+		if (IS_ERR(async_vol_args))
+			return PTR_ERR(async_vol_args);
+
+		name = async_vol_args->name;
+		fd = async_vol_args->fd;
+		async_vol_args->name[BTRFS_SNAPSHOT_NAME_MAX] = '\0';
+	} else {
+		vol_args = memdup_user(arg, sizeof(*vol_args));
+		if (IS_ERR(vol_args))
+			return PTR_ERR(vol_args);
+		name = vol_args->name;
+		fd = vol_args->fd;
+		vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
+	}
+
+	ret = btrfs_ioctl_snap_create_transid(file, name, fd,
+					      subvol, &transid);
+
+	if (!ret && async) {
+		if (copy_to_user(arg +
+				offsetof(struct btrfs_ioctl_async_vol_args,
+				transid), &transid, sizeof(transid)))
+			return -EFAULT;
+	}
+
 	kfree(vol_args);
+	kfree(async_vol_args);
+
 	return ret;
 }
 
@@ -1073,14 +1198,10 @@ static noinline int btrfs_ioctl_tree_search(struct file *file,
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	args = kmalloc(sizeof(*args), GFP_KERNEL);
-	if (!args)
-		return -ENOMEM;
+	args = memdup_user(argp, sizeof(*args));
+	if (IS_ERR(args))
+		return PTR_ERR(args);
 
-	if (copy_from_user(args, argp, sizeof(*args))) {
-		kfree(args);
-		return -EFAULT;
-	}
 	inode = fdentry(file)->d_inode;
 	ret = search_ioctl(inode, args);
 	if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
@@ -1188,14 +1309,10 @@ static noinline int btrfs_ioctl_ino_lookup(struct file *file,
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	args = kmalloc(sizeof(*args), GFP_KERNEL);
-	if (!args)
-		return -ENOMEM;
+	args = memdup_user(argp, sizeof(*args));
+	if (IS_ERR(args))
+		return PTR_ERR(args);
 
-	if (copy_from_user(args, argp, sizeof(*args))) {
-		kfree(args);
-		return -EFAULT;
-	}
 	inode = fdentry(file)->d_inode;
 
 	if (args->treeid == 0)
@@ -1227,9 +1344,6 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
 	int ret;
 	int err = 0;
 
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
 	vol_args = memdup_user(arg, sizeof(*vol_args));
 	if (IS_ERR(vol_args))
 		return PTR_ERR(vol_args);
@@ -1259,13 +1373,51 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
 	}
 
 	inode = dentry->d_inode;
+	dest = BTRFS_I(inode)->root;
+	if (!capable(CAP_SYS_ADMIN)){
+		/*
+		 * Regular user.  Only allow this with a special mount
+		 * option, when the user has write+exec access to the
+		 * subvol root, and when rmdir(2) would have been
+		 * allowed.
+		 *
+		 * Note that this is _not_ check that the subvol is
+		 * empty or doesn't contain data that we wouldn't
+		 * otherwise be able to delete.
+		 *
+		 * Users who want to delete empty subvols should try
+		 * rmdir(2).
+		 */
+		err = -EPERM;
+		if (!btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED))
+			goto out_dput;
+
+		/*
+		 * Do not allow deletion if the parent dir is the same
+		 * as the dir to be deleted.  That means the ioctl
+		 * must be called on the dentry referencing the root
+		 * of the subvol, not a random directory contained
+		 * within it.
+		 */
+		err = -EINVAL;
+		if (root == dest)
+			goto out_dput;
+
+		err = inode_permission(inode, MAY_WRITE | MAY_EXEC);
+		if (err)
+			goto out_dput;
+
+		/* check if subvolume may be deleted by a non-root user */
+		err = btrfs_may_delete(dir, dentry, 1);
+		if (err)
+			goto out_dput;
+	}
+
 	if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) {
 		err = -EINVAL;
 		goto out_dput;
 	}
 
-	dest = BTRFS_I(inode)->root;
-
 	mutex_lock(&inode->i_mutex);
 	err = d_invalidate(dentry);
 	if (err)
@@ -1304,7 +1456,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
 		BUG_ON(ret);
 	}
 
-	ret = btrfs_commit_transaction(trans, root);
+	ret = btrfs_end_transaction(trans, root);
 	BUG_ON(ret);
 	inode->i_flags |= S_DEAD;
 out_up_write:
@@ -1502,11 +1654,11 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 	path->reada = 2;
 
 	if (inode < src) {
-		mutex_lock(&inode->i_mutex);
-		mutex_lock(&src->i_mutex);
+		mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
+		mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD);
 	} else {
-		mutex_lock(&src->i_mutex);
-		mutex_lock(&inode->i_mutex);
+		mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT);
+		mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
 	}
 
 	/* determine range to clone */
@@ -1530,13 +1682,15 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 	while (1) {
 		struct btrfs_ordered_extent *ordered;
 		lock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
-		ordered = btrfs_lookup_first_ordered_extent(inode, off+len);
-		if (BTRFS_I(src)->delalloc_bytes == 0 && !ordered)
+		ordered = btrfs_lookup_first_ordered_extent(src, off+len);
+		if (!ordered &&
+		    !test_range_bit(&BTRFS_I(src)->io_tree, off, off+len,
+				   EXTENT_DELALLOC, 0, NULL))
 			break;
 		unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
 		if (ordered)
 			btrfs_put_ordered_extent(ordered);
-		btrfs_wait_ordered_range(src, off, off+len);
+		btrfs_wait_ordered_range(src, off, len);
 	}
 
 	/* clone data */
@@ -1605,7 +1759,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 			}
 			btrfs_release_path(root, path);
 
-			if (key.offset + datal < off ||
+			if (key.offset + datal <= off ||
 			    key.offset >= off+len)
 				goto next;
 
@@ -1879,6 +2033,22 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
 	return 0;
 }
 
+static void get_block_group_info(struct list_head *groups_list,
+				 struct btrfs_ioctl_space_info *space)
+{
+	struct btrfs_block_group_cache *block_group;
+
+	space->total_bytes = 0;
+	space->used_bytes = 0;
+	space->flags = 0;
+	list_for_each_entry(block_group, groups_list, list) {
+		space->flags = block_group->flags;
+		space->total_bytes += block_group->key.offset;
+		space->used_bytes +=
+			btrfs_block_group_used(&block_group->item);
+	}
+}
+
 long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
 {
 	struct btrfs_ioctl_space_args space_args;
@@ -1887,27 +2057,56 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
 	struct btrfs_ioctl_space_info *dest_orig;
 	struct btrfs_ioctl_space_info *user_dest;
 	struct btrfs_space_info *info;
+	u64 types[] = {BTRFS_BLOCK_GROUP_DATA,
+		       BTRFS_BLOCK_GROUP_SYSTEM,
+		       BTRFS_BLOCK_GROUP_METADATA,
+		       BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA};
+	int num_types = 4;
 	int alloc_size;
 	int ret = 0;
 	int slot_count = 0;
+	int i, c;
 
 	if (copy_from_user(&space_args,
 			   (struct btrfs_ioctl_space_args __user *)arg,
 			   sizeof(space_args)))
 		return -EFAULT;
 
-	/* first we count slots */
-	rcu_read_lock();
-	list_for_each_entry_rcu(info, &root->fs_info->space_info, list)
-		slot_count++;
-	rcu_read_unlock();
+	for (i = 0; i < num_types; i++) {
+		struct btrfs_space_info *tmp;
+
+		info = NULL;
+		rcu_read_lock();
+		list_for_each_entry_rcu(tmp, &root->fs_info->space_info,
+					list) {
+			if (tmp->flags == types[i]) {
+				info = tmp;
+				break;
+			}
+		}
+		rcu_read_unlock();
+
+		if (!info)
+			continue;
+
+		down_read(&info->groups_sem);
+		for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
+			if (!list_empty(&info->block_groups[c]))
+				slot_count++;
+		}
+		up_read(&info->groups_sem);
+	}
 
 	/* space_slots == 0 means they are asking for a count */
 	if (space_args.space_slots == 0) {
 		space_args.total_spaces = slot_count;
 		goto out;
 	}
+
+	slot_count = min_t(int, space_args.space_slots, slot_count);
+
 	alloc_size = sizeof(*dest) * slot_count;
+
 	/* we generally have at most 6 or so space infos, one for each raid
 	 * level.  So, a whole page should be more than enough for everyone
 	 */
@@ -1921,27 +2120,34 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
 	dest_orig = dest;
 
 	/* now we have a buffer to copy into */
-	rcu_read_lock();
-	list_for_each_entry_rcu(info, &root->fs_info->space_info, list) {
-		/* make sure we don't copy more than we allocated
-		 * in our buffer
-		 */
-		if (slot_count == 0)
-			break;
-		slot_count--;
-
-		/* make sure userland has enough room in their buffer */
-		if (space_args.total_spaces >= space_args.space_slots)
-			break;
+	for (i = 0; i < num_types; i++) {
+		struct btrfs_space_info *tmp;
+
+		info = NULL;
+		rcu_read_lock();
+		list_for_each_entry_rcu(tmp, &root->fs_info->space_info,
+					list) {
+			if (tmp->flags == types[i]) {
+				info = tmp;
+				break;
+			}
+		}
+		rcu_read_unlock();
 
-		space.flags = info->flags;
-		space.total_bytes = info->total_bytes;
-		space.used_bytes = info->bytes_used;
-		memcpy(dest, &space, sizeof(space));
-		dest++;
-		space_args.total_spaces++;
+		if (!info)
+			continue;
+		down_read(&info->groups_sem);
+		for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
+			if (!list_empty(&info->block_groups[c])) {
+				get_block_group_info(&info->block_groups[c],
+						     &space);
+				memcpy(dest, &space, sizeof(space));
+				dest++;
+				space_args.total_spaces++;
+			}
+		}
+		up_read(&info->groups_sem);
 	}
-	rcu_read_unlock();
 
 	user_dest = (struct btrfs_ioctl_space_info *)
 		(arg + sizeof(struct btrfs_ioctl_space_args));
@@ -1984,6 +2190,36 @@ long btrfs_ioctl_trans_end(struct file *file)
 	return 0;
 }
 
+static noinline long btrfs_ioctl_start_sync(struct file *file, void __user *argp)
+{
+	struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root;
+	struct btrfs_trans_handle *trans;
+	u64 transid;
+
+	trans = btrfs_start_transaction(root, 0);
+	transid = trans->transid;
+	btrfs_commit_transaction_async(trans, root, 0);
+
+	if (argp)
+		if (copy_to_user(argp, &transid, sizeof(transid)))
+			return -EFAULT;
+	return 0;
+}
+
+static noinline long btrfs_ioctl_wait_sync(struct file *file, void __user *argp)
+{
+	struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root;
+	u64 transid;
+
+	if (argp) {
+		if (copy_from_user(&transid, argp, sizeof(transid)))
+			return -EFAULT;
+	} else {
+		transid = 0;  /* current trans */
+	}
+	return btrfs_wait_for_commit(root, transid);
+}
+
 long btrfs_ioctl(struct file *file, unsigned int
 		cmd, unsigned long arg)
 {
@@ -1998,9 +2234,11 @@ long btrfs_ioctl(struct file *file, unsigned int
 	case FS_IOC_GETVERSION:
 		return btrfs_ioctl_getversion(file, argp);
 	case BTRFS_IOC_SNAP_CREATE:
-		return btrfs_ioctl_snap_create(file, argp, 0);
+		return btrfs_ioctl_snap_create(file, argp, 0, 0);
+	case BTRFS_IOC_SNAP_CREATE_ASYNC:
+		return btrfs_ioctl_snap_create(file, argp, 0, 1);
 	case BTRFS_IOC_SUBVOL_CREATE:
-		return btrfs_ioctl_snap_create(file, argp, 1);
+		return btrfs_ioctl_snap_create(file, argp, 1, 0);
 	case BTRFS_IOC_SNAP_DESTROY:
 		return btrfs_ioctl_snap_destroy(file, argp);
 	case BTRFS_IOC_DEFAULT_SUBVOL:
@@ -2034,6 +2272,10 @@ long btrfs_ioctl(struct file *file, unsigned int
 	case BTRFS_IOC_SYNC:
 		btrfs_sync_fs(file->f_dentry->d_sb, 1);
 		return 0;
+	case BTRFS_IOC_START_SYNC:
+		return btrfs_ioctl_start_sync(file, argp);
+	case BTRFS_IOC_WAIT_SYNC:
+		return btrfs_ioctl_wait_sync(file, argp);
 	}
 
 	return -ENOTTY;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 424694aa517..17c99ebdf96 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -22,14 +22,21 @@
 
 #define BTRFS_IOCTL_MAGIC 0x94
 #define BTRFS_VOL_NAME_MAX 255
-#define BTRFS_PATH_NAME_MAX 4087
 
 /* this should be 4k */
+#define BTRFS_PATH_NAME_MAX 4087
 struct btrfs_ioctl_vol_args {
 	__s64 fd;
 	char name[BTRFS_PATH_NAME_MAX + 1];
 };
 
+#define BTRFS_SNAPSHOT_NAME_MAX 4079
+struct btrfs_ioctl_async_vol_args {
+	__s64 fd;
+	__u64 transid;
+	char name[BTRFS_SNAPSHOT_NAME_MAX + 1];
+};
+
 #define BTRFS_INO_LOOKUP_PATH_MAX 4080
 struct btrfs_ioctl_ino_lookup_args {
 	__u64 treeid;
@@ -178,4 +185,8 @@ struct btrfs_ioctl_space_args {
 #define BTRFS_IOC_DEFAULT_SUBVOL _IOW(BTRFS_IOCTL_MAGIC, 19, u64)
 #define BTRFS_IOC_SPACE_INFO _IOWR(BTRFS_IOCTL_MAGIC, 20, \
 				    struct btrfs_ioctl_space_args)
+#define BTRFS_IOC_START_SYNC _IOR(BTRFS_IOCTL_MAGIC, 24, __u64)
+#define BTRFS_IOC_WAIT_SYNC  _IOW(BTRFS_IOCTL_MAGIC, 22, __u64)
+#define BTRFS_IOC_SNAP_CREATE_ASYNC _IOW(BTRFS_IOCTL_MAGIC, 23, \
+				   struct btrfs_ioctl_async_vol_args)
 #endif
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index e56c72bc5ad..f4621f6deca 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -526,7 +526,6 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
 {
 	u64 end;
 	u64 orig_end;
-	u64 wait_end;
 	struct btrfs_ordered_extent *ordered;
 	int found;
 
@@ -537,7 +536,6 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
 		if (orig_end > INT_LIMIT(loff_t))
 			orig_end = INT_LIMIT(loff_t);
 	}
-	wait_end = orig_end;
 again:
 	/* start IO across the range first to instantiate any delalloc
 	 * extents
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index b37d723b9d4..045c9c2b2d7 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -29,6 +29,7 @@
 #include "locking.h"
 #include "btrfs_inode.h"
 #include "async-thread.h"
+#include "free-space-cache.h"
 
 /*
  * backref_node, mapping_node and tree_block start with this
@@ -178,8 +179,6 @@ struct reloc_control {
 	u64 search_start;
 	u64 extents_found;
 
-	int block_rsv_retries;
-
 	unsigned int stage:8;
 	unsigned int create_reloc_tree:1;
 	unsigned int merge_reloc_tree:1;
@@ -2133,7 +2132,6 @@ int prepare_to_merge(struct reloc_control *rc, int err)
 	LIST_HEAD(reloc_roots);
 	u64 num_bytes = 0;
 	int ret;
-	int retries = 0;
 
 	mutex_lock(&root->fs_info->trans_mutex);
 	rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
@@ -2143,7 +2141,7 @@ again:
 	if (!err) {
 		num_bytes = rc->merging_rsv_size;
 		ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv,
-					  num_bytes, &retries);
+					  num_bytes);
 		if (ret)
 			err = ret;
 	}
@@ -2155,7 +2153,6 @@ again:
 			btrfs_end_transaction(trans, rc->extent_root);
 			btrfs_block_rsv_release(rc->extent_root,
 						rc->block_rsv, num_bytes);
-			retries = 0;
 			goto again;
 		}
 	}
@@ -2405,15 +2402,13 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,
 	num_bytes = calcu_metadata_size(rc, node, 1) * 2;
 
 	trans->block_rsv = rc->block_rsv;
-	ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes,
-				  &rc->block_rsv_retries);
+	ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes);
 	if (ret) {
 		if (ret == -EAGAIN)
 			rc->commit_transaction = 1;
 		return ret;
 	}
 
-	rc->block_rsv_retries = 0;
 	return 0;
 }
 
@@ -3099,6 +3094,8 @@ static int add_tree_block(struct reloc_control *rc,
 		BUG_ON(item_size != sizeof(struct btrfs_extent_item_v0));
 		ret = get_ref_objectid_v0(rc, path, extent_key,
 					  &ref_owner, NULL);
+		if (ret < 0)
+			return ret;
 		BUG_ON(ref_owner >= BTRFS_MAX_LEVEL);
 		level = (int)ref_owner;
 		/* FIXME: get real generation */
@@ -3191,6 +3188,54 @@ static int block_use_full_backref(struct reloc_control *rc,
 	return ret;
 }
 
+static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
+				    struct inode *inode, u64 ino)
+{
+	struct btrfs_key key;
+	struct btrfs_path *path;
+	struct btrfs_root *root = fs_info->tree_root;
+	struct btrfs_trans_handle *trans;
+	unsigned long nr;
+	int ret = 0;
+
+	if (inode)
+		goto truncate;
+
+	key.objectid = ino;
+	key.type = BTRFS_INODE_ITEM_KEY;
+	key.offset = 0;
+
+	inode = btrfs_iget(fs_info->sb, &key, root, NULL);
+	if (!inode || IS_ERR(inode) || is_bad_inode(inode)) {
+		if (inode && !IS_ERR(inode))
+			iput(inode);
+		return -ENOENT;
+	}
+
+truncate:
+	path = btrfs_alloc_path();
+	if (!path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	trans = btrfs_join_transaction(root, 0);
+	if (IS_ERR(trans)) {
+		btrfs_free_path(path);
+		goto out;
+	}
+
+	ret = btrfs_truncate_free_space_cache(root, trans, path, inode);
+
+	btrfs_free_path(path);
+	nr = trans->blocks_used;
+	btrfs_end_transaction(trans, root);
+	btrfs_btree_balance_dirty(root, nr);
+out:
+	iput(inode);
+	return ret;
+}
+
 /*
  * helper to add tree blocks for backref of type BTRFS_EXTENT_DATA_REF_KEY
  * this function scans fs tree to find blocks reference the data extent
@@ -3217,15 +3262,27 @@ static int find_data_references(struct reloc_control *rc,
 	int counted;
 	int ret;
 
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
-
 	ref_root = btrfs_extent_data_ref_root(leaf, ref);
 	ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref);
 	ref_offset = btrfs_extent_data_ref_offset(leaf, ref);
 	ref_count = btrfs_extent_data_ref_count(leaf, ref);
 
+	/*
+	 * This is an extent belonging to the free space cache, lets just delete
+	 * it and redo the search.
+	 */
+	if (ref_root == BTRFS_ROOT_TREE_OBJECTID) {
+		ret = delete_block_group_cache(rc->extent_root->fs_info,
+					       NULL, ref_objectid);
+		if (ret != -ENOENT)
+			return ret;
+		ret = 0;
+	}
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
 	root = read_fs_root(rc->extent_root->fs_info, ref_root);
 	if (IS_ERR(root)) {
 		err = PTR_ERR(root);
@@ -3554,8 +3611,7 @@ int prepare_to_relocate(struct reloc_control *rc)
 	 * is no reservation in transaction handle.
 	 */
 	ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv,
-				  rc->extent_root->nodesize * 256,
-				  &rc->block_rsv_retries);
+				  rc->extent_root->nodesize * 256);
 	if (ret)
 		return ret;
 
@@ -3567,7 +3623,6 @@ int prepare_to_relocate(struct reloc_control *rc)
 	rc->extents_found = 0;
 	rc->nodes_relocated = 0;
 	rc->merging_rsv_size = 0;
-	rc->block_rsv_retries = 0;
 
 	rc->create_reloc_tree = 1;
 	set_reloc_control(rc);
@@ -3860,6 +3915,8 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
 {
 	struct btrfs_fs_info *fs_info = extent_root->fs_info;
 	struct reloc_control *rc;
+	struct inode *inode;
+	struct btrfs_path *path;
 	int ret;
 	int rw = 0;
 	int err = 0;
@@ -3882,6 +3939,26 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
 		rw = 1;
 	}
 
+	path = btrfs_alloc_path();
+	if (!path) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	inode = lookup_free_space_inode(fs_info->tree_root, rc->block_group,
+					path);
+	btrfs_free_path(path);
+
+	if (!IS_ERR(inode))
+		ret = delete_block_group_cache(fs_info, inode, 0);
+	else
+		ret = PTR_ERR(inode);
+
+	if (ret && ret != -ENOENT) {
+		err = ret;
+		goto out;
+	}
+
 	rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
 	if (IS_ERR(rc->data_inode)) {
 		err = PTR_ERR(rc->data_inode);
@@ -4143,7 +4220,7 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
 		btrfs_add_ordered_sum(inode, ordered, sums);
 	}
 	btrfs_put_ordered_extent(ordered);
-	return 0;
+	return ret;
 }
 
 void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 2d958be761c..6a1086e83ff 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -181,7 +181,6 @@ int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root
 int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid)
 {
 	struct btrfs_root *dead_root;
-	struct btrfs_item *item;
 	struct btrfs_root_item *ri;
 	struct btrfs_key key;
 	struct btrfs_key found_key;
@@ -214,7 +213,6 @@ again:
 			nritems = btrfs_header_nritems(leaf);
 			slot = path->slots[0];
 		}
-		item = btrfs_item_nr(leaf, slot);
 		btrfs_item_key_to_cpu(leaf, &key, slot);
 		if (btrfs_key_type(&key) != BTRFS_ROOT_ITEM_KEY)
 			goto next;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index f2393b39031..8299a25ffc8 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -61,6 +61,8 @@ static void btrfs_put_super(struct super_block *sb)
 
 	ret = close_ctree(root);
 	sb->s_fs_info = NULL;
+
+	(void)ret; /* FIXME: need to fix VFS to return error? */
 }
 
 enum {
@@ -68,7 +70,8 @@ enum {
 	Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd,
 	Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress,
 	Opt_compress_force, Opt_notreelog, Opt_ratio, Opt_flushoncommit,
-	Opt_discard, Opt_err,
+	Opt_discard, Opt_space_cache, Opt_clear_cache, Opt_err,
+	Opt_user_subvol_rm_allowed,
 };
 
 static match_table_t tokens = {
@@ -92,6 +95,9 @@ static match_table_t tokens = {
 	{Opt_flushoncommit, "flushoncommit"},
 	{Opt_ratio, "metadata_ratio=%d"},
 	{Opt_discard, "discard"},
+	{Opt_space_cache, "space_cache"},
+	{Opt_clear_cache, "clear_cache"},
+	{Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"},
 	{Opt_err, NULL},
 };
 
@@ -235,6 +241,16 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 		case Opt_discard:
 			btrfs_set_opt(info->mount_opt, DISCARD);
 			break;
+		case Opt_space_cache:
+			printk(KERN_INFO "btrfs: enabling disk space caching\n");
+			btrfs_set_opt(info->mount_opt, SPACE_CACHE);
+		case Opt_clear_cache:
+			printk(KERN_INFO "btrfs: force clearing of disk cache\n");
+			btrfs_set_opt(info->mount_opt, CLEAR_CACHE);
+			break;
+		case Opt_user_subvol_rm_allowed:
+			btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED);
+			break;
 		case Opt_err:
 			printk(KERN_INFO "btrfs: unrecognized mount option "
 			       "'%s'\n", p);
@@ -380,7 +396,7 @@ static struct dentry *get_default_root(struct super_block *sb,
 find_root:
 	new_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
 	if (IS_ERR(new_root))
-		return ERR_PTR(PTR_ERR(new_root));
+		return ERR_CAST(new_root);
 
 	if (btrfs_root_refs(&new_root->root_item) == 0)
 		return ERR_PTR(-ENOENT);
@@ -436,7 +452,6 @@ static int btrfs_fill_super(struct super_block *sb,
 {
 	struct inode *inode;
 	struct dentry *root_dentry;
-	struct btrfs_super_block *disk_super;
 	struct btrfs_root *tree_root;
 	struct btrfs_key key;
 	int err;
@@ -458,7 +473,6 @@ static int btrfs_fill_super(struct super_block *sb,
 		return PTR_ERR(tree_root);
 	}
 	sb->s_fs_info = tree_root;
-	disk_super = &tree_root->fs_info->super_copy;
 
 	key.objectid = BTRFS_FIRST_FREE_OBJECTID;
 	key.type = BTRFS_INODE_ITEM_KEY;
@@ -560,8 +574,8 @@ static int btrfs_test_super(struct super_block *s, void *data)
  * Note:  This is based on get_sb_bdev from fs/super.c with a few additions
  *	  for multiple device setup.  Make sure to keep it in sync.
  */
-static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
-		const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
+		const char *dev_name, void *data)
 {
 	struct block_device *bdev = NULL;
 	struct super_block *s;
@@ -571,7 +585,6 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
 	char *subvol_name = NULL;
 	u64 subvol_objectid = 0;
 	int error = 0;
-	int found = 0;
 
 	if (!(flags & MS_RDONLY))
 		mode |= FMODE_WRITE;
@@ -580,7 +593,7 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
 					  &subvol_name, &subvol_objectid,
 					  &fs_devices);
 	if (error)
-		return error;
+		return ERR_PTR(error);
 
 	error = btrfs_scan_one_device(dev_name, mode, fs_type, &fs_devices);
 	if (error)
@@ -607,7 +620,6 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
 			goto error_close_devices;
 		}
 
-		found = 1;
 		btrfs_close_devices(fs_devices);
 	} else {
 		char b[BDEVNAME_SIZE];
@@ -629,7 +641,7 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
 	if (IS_ERR(root)) {
 		error = PTR_ERR(root);
 		deactivate_locked_super(s);
-		goto error;
+		goto error_free_subvol_name;
 	}
 	/* if they gave us a subvolume name bind mount into that */
 	if (strcmp(subvol_name, ".")) {
@@ -643,24 +655,21 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
 			deactivate_locked_super(s);
 			error = PTR_ERR(new_root);
 			dput(root);
-			goto error_close_devices;
+			goto error_free_subvol_name;
 		}
 		if (!new_root->d_inode) {
 			dput(root);
 			dput(new_root);
 			deactivate_locked_super(s);
 			error = -ENXIO;
-			goto error_close_devices;
+			goto error_free_subvol_name;
 		}
 		dput(root);
 		root = new_root;
 	}
 
-	mnt->mnt_sb = s;
-	mnt->mnt_root = root;
-
 	kfree(subvol_name);
-	return 0;
+	return root;
 
 error_s:
 	error = PTR_ERR(s);
@@ -668,8 +677,7 @@ error_close_devices:
 	btrfs_close_devices(fs_devices);
 error_free_subvol_name:
 	kfree(subvol_name);
-error:
-	return error;
+	return ERR_PTR(error);
 }
 
 static int btrfs_remount(struct super_block *sb, int *flags, char *data)
@@ -716,18 +724,25 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	struct list_head *head = &root->fs_info->space_info;
 	struct btrfs_space_info *found;
 	u64 total_used = 0;
+	u64 total_used_data = 0;
 	int bits = dentry->d_sb->s_blocksize_bits;
 	__be32 *fsid = (__be32 *)root->fs_info->fsid;
 
 	rcu_read_lock();
-	list_for_each_entry_rcu(found, head, list)
+	list_for_each_entry_rcu(found, head, list) {
+		if (found->flags & (BTRFS_BLOCK_GROUP_METADATA |
+				    BTRFS_BLOCK_GROUP_SYSTEM))
+			total_used_data += found->disk_total;
+		else
+			total_used_data += found->disk_used;
 		total_used += found->disk_used;
+	}
 	rcu_read_unlock();
 
 	buf->f_namelen = BTRFS_NAME_LEN;
 	buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
 	buf->f_bfree = buf->f_blocks - (total_used >> bits);
-	buf->f_bavail = buf->f_bfree;
+	buf->f_bavail = buf->f_blocks - (total_used_data >> bits);
 	buf->f_bsize = dentry->d_sb->s_blocksize;
 	buf->f_type = BTRFS_SUPER_MAGIC;
 
@@ -746,7 +761,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 static struct file_system_type btrfs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "btrfs",
-	.get_sb		= btrfs_get_sb,
+	.mount		= btrfs_mount,
 	.kill_sb	= kill_anon_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
@@ -797,7 +812,7 @@ static int btrfs_unfreeze(struct super_block *sb)
 
 static const struct super_operations btrfs_super_ops = {
 	.drop_inode	= btrfs_drop_inode,
-	.delete_inode	= btrfs_delete_inode,
+	.evict_inode	= btrfs_evict_inode,
 	.put_super	= btrfs_put_super,
 	.sync_fs	= btrfs_sync_fs,
 	.show_options	= btrfs_show_options,
@@ -815,6 +830,7 @@ static const struct file_operations btrfs_ctl_fops = {
 	.unlocked_ioctl	 = btrfs_control_ioctl,
 	.compat_ioctl = btrfs_control_ioctl,
 	.owner	 = THIS_MODULE,
+	.llseek = noop_llseek,
 };
 
 static struct miscdevice btrfs_misc = {
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 66e4c66cc63..1fffbc017bd 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -163,6 +163,7 @@ enum btrfs_trans_type {
 	TRANS_START,
 	TRANS_JOIN,
 	TRANS_USERSPACE,
+	TRANS_JOIN_NOLOCK,
 };
 
 static int may_wait_transaction(struct btrfs_root *root, int type)
@@ -179,14 +180,14 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
 {
 	struct btrfs_trans_handle *h;
 	struct btrfs_transaction *cur_trans;
-	int retries = 0;
 	int ret;
 again:
 	h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
 	if (!h)
 		return ERR_PTR(-ENOMEM);
 
-	mutex_lock(&root->fs_info->trans_mutex);
+	if (type != TRANS_JOIN_NOLOCK)
+		mutex_lock(&root->fs_info->trans_mutex);
 	if (may_wait_transaction(root, type))
 		wait_current_trans(root);
 
@@ -195,7 +196,8 @@ again:
 
 	cur_trans = root->fs_info->running_transaction;
 	cur_trans->use_count++;
-	mutex_unlock(&root->fs_info->trans_mutex);
+	if (type != TRANS_JOIN_NOLOCK)
+		mutex_unlock(&root->fs_info->trans_mutex);
 
 	h->transid = cur_trans->transid;
 	h->transaction = cur_trans;
@@ -212,8 +214,7 @@ again:
 	}
 
 	if (num_items > 0) {
-		ret = btrfs_trans_reserve_metadata(h, root, num_items,
-						   &retries);
+		ret = btrfs_trans_reserve_metadata(h, root, num_items);
 		if (ret == -EAGAIN) {
 			btrfs_commit_transaction(h, root);
 			goto again;
@@ -224,9 +225,11 @@ again:
 		}
 	}
 
-	mutex_lock(&root->fs_info->trans_mutex);
+	if (type != TRANS_JOIN_NOLOCK)
+		mutex_lock(&root->fs_info->trans_mutex);
 	record_root_in_trans(h, root);
-	mutex_unlock(&root->fs_info->trans_mutex);
+	if (type != TRANS_JOIN_NOLOCK)
+		mutex_unlock(&root->fs_info->trans_mutex);
 
 	if (!current->journal_info && type != TRANS_USERSPACE)
 		current->journal_info = h;
@@ -244,6 +247,12 @@ struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
 	return start_transaction(root, 0, TRANS_JOIN);
 }
 
+struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root,
+							  int num_blocks)
+{
+	return start_transaction(root, 0, TRANS_JOIN_NOLOCK);
+}
+
 struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
 							 int num_blocks)
 {
@@ -270,6 +279,58 @@ static noinline int wait_for_commit(struct btrfs_root *root,
 	return 0;
 }
 
+int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
+{
+	struct btrfs_transaction *cur_trans = NULL, *t;
+	int ret;
+
+	mutex_lock(&root->fs_info->trans_mutex);
+
+	ret = 0;
+	if (transid) {
+		if (transid <= root->fs_info->last_trans_committed)
+			goto out_unlock;
+
+		/* find specified transaction */
+		list_for_each_entry(t, &root->fs_info->trans_list, list) {
+			if (t->transid == transid) {
+				cur_trans = t;
+				break;
+			}
+			if (t->transid > transid)
+				break;
+		}
+		ret = -EINVAL;
+		if (!cur_trans)
+			goto out_unlock;  /* bad transid */
+	} else {
+		/* find newest transaction that is committing | committed */
+		list_for_each_entry_reverse(t, &root->fs_info->trans_list,
+					    list) {
+			if (t->in_commit) {
+				if (t->commit_done)
+					goto out_unlock;
+				cur_trans = t;
+				break;
+			}
+		}
+		if (!cur_trans)
+			goto out_unlock;  /* nothing committing|committed */
+	}
+
+	cur_trans->use_count++;
+	mutex_unlock(&root->fs_info->trans_mutex);
+
+	wait_for_commit(root, cur_trans);
+
+	mutex_lock(&root->fs_info->trans_mutex);
+	put_transaction(cur_trans);
+	ret = 0;
+out_unlock:
+	mutex_unlock(&root->fs_info->trans_mutex);
+	return ret;
+}
+
 #if 0
 /*
  * rate limit against the drop_snapshot code.  This helps to slow down new
@@ -348,7 +409,7 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
 }
 
 static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
-			  struct btrfs_root *root, int throttle)
+			  struct btrfs_root *root, int throttle, int lock)
 {
 	struct btrfs_transaction *cur_trans = trans->transaction;
 	struct btrfs_fs_info *info = root->fs_info;
@@ -376,26 +437,29 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 
 	btrfs_trans_release_metadata(trans, root);
 
-	if (!root->fs_info->open_ioctl_trans &&
+	if (lock && !root->fs_info->open_ioctl_trans &&
 	    should_end_transaction(trans, root))
 		trans->transaction->blocked = 1;
 
-	if (cur_trans->blocked && !cur_trans->in_commit) {
+	if (lock && cur_trans->blocked && !cur_trans->in_commit) {
 		if (throttle)
 			return btrfs_commit_transaction(trans, root);
 		else
 			wake_up_process(info->transaction_kthread);
 	}
 
-	mutex_lock(&info->trans_mutex);
+	if (lock)
+		mutex_lock(&info->trans_mutex);
 	WARN_ON(cur_trans != info->running_transaction);
 	WARN_ON(cur_trans->num_writers < 1);
 	cur_trans->num_writers--;
 
+	smp_mb();
 	if (waitqueue_active(&cur_trans->writer_wait))
 		wake_up(&cur_trans->writer_wait);
 	put_transaction(cur_trans);
-	mutex_unlock(&info->trans_mutex);
+	if (lock)
+		mutex_unlock(&info->trans_mutex);
 
 	if (current->journal_info == trans)
 		current->journal_info = NULL;
@@ -411,13 +475,19 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 int btrfs_end_transaction(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root)
 {
-	return __btrfs_end_transaction(trans, root, 0);
+	return __btrfs_end_transaction(trans, root, 0, 1);
 }
 
 int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root)
 {
-	return __btrfs_end_transaction(trans, root, 1);
+	return __btrfs_end_transaction(trans, root, 1, 1);
+}
+
+int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root)
+{
+	return __btrfs_end_transaction(trans, root, 0, 0);
 }
 
 /*
@@ -836,7 +906,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	struct extent_buffer *tmp;
 	struct extent_buffer *old;
 	int ret;
-	int retries = 0;
 	u64 to_reserve = 0;
 	u64 index = 0;
 	u64 objectid;
@@ -858,7 +927,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 
 	if (to_reserve > 0) {
 		ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv,
-					  to_reserve, &retries);
+					  to_reserve);
 		if (ret) {
 			pending->error = ret;
 			goto fail;
@@ -966,6 +1035,8 @@ static void update_super_roots(struct btrfs_root *root)
 	super->root = root_item->bytenr;
 	super->generation = root_item->generation;
 	super->root_level = root_item->level;
+	if (super->cache_generation != 0 || btrfs_test_opt(root, SPACE_CACHE))
+		super->cache_generation = root_item->generation;
 }
 
 int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
@@ -988,11 +1059,127 @@ int btrfs_transaction_blocked(struct btrfs_fs_info *info)
 	return ret;
 }
 
+/*
+ * wait for the current transaction commit to start and block subsequent
+ * transaction joins
+ */
+static void wait_current_trans_commit_start(struct btrfs_root *root,
+					    struct btrfs_transaction *trans)
+{
+	DEFINE_WAIT(wait);
+
+	if (trans->in_commit)
+		return;
+
+	while (1) {
+		prepare_to_wait(&root->fs_info->transaction_blocked_wait, &wait,
+				TASK_UNINTERRUPTIBLE);
+		if (trans->in_commit) {
+			finish_wait(&root->fs_info->transaction_blocked_wait,
+				    &wait);
+			break;
+		}
+		mutex_unlock(&root->fs_info->trans_mutex);
+		schedule();
+		mutex_lock(&root->fs_info->trans_mutex);
+		finish_wait(&root->fs_info->transaction_blocked_wait, &wait);
+	}
+}
+
+/*
+ * wait for the current transaction to start and then become unblocked.
+ * caller holds ref.
+ */
+static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root,
+					 struct btrfs_transaction *trans)
+{
+	DEFINE_WAIT(wait);
+
+	if (trans->commit_done || (trans->in_commit && !trans->blocked))
+		return;
+
+	while (1) {
+		prepare_to_wait(&root->fs_info->transaction_wait, &wait,
+				TASK_UNINTERRUPTIBLE);
+		if (trans->commit_done ||
+		    (trans->in_commit && !trans->blocked)) {
+			finish_wait(&root->fs_info->transaction_wait,
+				    &wait);
+			break;
+		}
+		mutex_unlock(&root->fs_info->trans_mutex);
+		schedule();
+		mutex_lock(&root->fs_info->trans_mutex);
+		finish_wait(&root->fs_info->transaction_wait,
+			    &wait);
+	}
+}
+
+/*
+ * commit transactions asynchronously. once btrfs_commit_transaction_async
+ * returns, any subsequent transaction will not be allowed to join.
+ */
+struct btrfs_async_commit {
+	struct btrfs_trans_handle *newtrans;
+	struct btrfs_root *root;
+	struct delayed_work work;
+};
+
+static void do_async_commit(struct work_struct *work)
+{
+	struct btrfs_async_commit *ac =
+		container_of(work, struct btrfs_async_commit, work.work);
+
+	btrfs_commit_transaction(ac->newtrans, ac->root);
+	kfree(ac);
+}
+
+int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *root,
+				   int wait_for_unblock)
+{
+	struct btrfs_async_commit *ac;
+	struct btrfs_transaction *cur_trans;
+
+	ac = kmalloc(sizeof(*ac), GFP_NOFS);
+	BUG_ON(!ac);
+
+	INIT_DELAYED_WORK(&ac->work, do_async_commit);
+	ac->root = root;
+	ac->newtrans = btrfs_join_transaction(root, 0);
+
+	/* take transaction reference */
+	mutex_lock(&root->fs_info->trans_mutex);
+	cur_trans = trans->transaction;
+	cur_trans->use_count++;
+	mutex_unlock(&root->fs_info->trans_mutex);
+
+	btrfs_end_transaction(trans, root);
+	schedule_delayed_work(&ac->work, 0);
+
+	/* wait for transaction to start and unblock */
+	mutex_lock(&root->fs_info->trans_mutex);
+	if (wait_for_unblock)
+		wait_current_trans_commit_start_and_unblock(root, cur_trans);
+	else
+		wait_current_trans_commit_start(root, cur_trans);
+	put_transaction(cur_trans);
+	mutex_unlock(&root->fs_info->trans_mutex);
+
+	return 0;
+}
+
+/*
+ * btrfs_transaction state sequence:
+ *    in_commit = 0, blocked = 0  (initial)
+ *    in_commit = 1, blocked = 1
+ *    blocked = 0
+ *    commit_done = 1
+ */
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root)
 {
 	unsigned long joined = 0;
-	unsigned long timeout = 1;
 	struct btrfs_transaction *cur_trans;
 	struct btrfs_transaction *prev_trans = NULL;
 	DEFINE_WAIT(wait);
@@ -1039,6 +1226,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
 	trans->transaction->in_commit = 1;
 	trans->transaction->blocked = 1;
+	wake_up(&root->fs_info->transaction_blocked_wait);
+
 	if (cur_trans->list.prev != &root->fs_info->trans_list) {
 		prev_trans = list_entry(cur_trans->list.prev,
 					struct btrfs_transaction, list);
@@ -1063,11 +1252,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 			snap_pending = 1;
 
 		WARN_ON(cur_trans != trans->transaction);
-		if (cur_trans->num_writers > 1)
-			timeout = MAX_SCHEDULE_TIMEOUT;
-		else if (should_grow)
-			timeout = 1;
-
 		mutex_unlock(&root->fs_info->trans_mutex);
 
 		if (flush_on_commit || snap_pending) {
@@ -1089,8 +1273,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 				TASK_UNINTERRUPTIBLE);
 
 		smp_mb();
-		if (cur_trans->num_writers > 1 || should_grow)
-			schedule_timeout(timeout);
+		if (cur_trans->num_writers > 1)
+			schedule_timeout(MAX_SCHEDULE_TIMEOUT);
+		else if (should_grow)
+			schedule_timeout(1);
 
 		mutex_lock(&root->fs_info->trans_mutex);
 		finish_wait(&cur_trans->writer_wait, &wait);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index e104986d0bf..f104b57ad4e 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -87,12 +87,17 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
 
 int btrfs_end_transaction(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root);
+int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root);
 struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
 						   int num_items);
 struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
 						  int num_blocks);
+struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root,
+							  int num_blocks);
 struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
 							 int num_blocks);
+int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid);
 int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
 				     struct btrfs_root *root);
 int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
@@ -104,6 +109,9 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly);
 int btrfs_clean_old_snapshots(struct btrfs_root *root);
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root);
+int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *root,
+				   int wait_for_unblock);
 int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root);
 int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index f7ac8e013ed..992ab425599 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -36,7 +36,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 	int ret = 0;
 	int wret;
 	int level;
-	int orig_level;
 	int is_extent = 0;
 	int next_key_ret = 0;
 	u64 last_ret = 0;
@@ -64,7 +63,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 		return -ENOMEM;
 
 	level = btrfs_header_level(root->node);
-	orig_level = level;
 
 	if (level == 0)
 		goto out;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index fb102a9aee9..a29f19384a2 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -786,7 +786,6 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
 {
 	struct inode *dir;
 	int ret;
-	struct btrfs_key location;
 	struct btrfs_inode_ref *ref;
 	struct btrfs_dir_item *di;
 	struct inode *inode;
@@ -795,10 +794,6 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
 	unsigned long ref_ptr;
 	unsigned long ref_end;
 
-	location.objectid = key->objectid;
-	location.type = BTRFS_INODE_ITEM_KEY;
-	location.offset = 0;
-
 	/*
 	 * it is possible that we didn't log all the parent directories
 	 * for a given inode.  If we don't find the dir, just don't
@@ -1583,7 +1578,6 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
 	struct btrfs_path *path;
 	struct btrfs_root *root = wc->replay_dest;
 	struct btrfs_key key;
-	u32 item_size;
 	int level;
 	int i;
 	int ret;
@@ -1601,7 +1595,6 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
 	nritems = btrfs_header_nritems(eb);
 	for (i = 0; i < nritems; i++) {
 		btrfs_item_key_to_cpu(eb, &key, i);
-		item_size = btrfs_item_size_nr(eb, i);
 
 		/* inode keys are done during the first stage */
 		if (key.type == BTRFS_INODE_ITEM_KEY &&
@@ -1668,7 +1661,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
 				   struct walk_control *wc)
 {
 	u64 root_owner;
-	u64 root_gen;
 	u64 bytenr;
 	u64 ptr_gen;
 	struct extent_buffer *next;
@@ -1698,7 +1690,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
 
 		parent = path->nodes[*level];
 		root_owner = btrfs_header_owner(parent);
-		root_gen = btrfs_header_generation(parent);
 
 		next = btrfs_find_create_tree_block(root, bytenr, blocksize);
 
@@ -1749,7 +1740,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
 				 struct walk_control *wc)
 {
 	u64 root_owner;
-	u64 root_gen;
 	int i;
 	int slot;
 	int ret;
@@ -1757,8 +1747,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
 	for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
 		slot = path->slots[i];
 		if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
-			struct extent_buffer *node;
-			node = path->nodes[i];
 			path->slots[i]++;
 			*level = i;
 			WARN_ON(*level == 0);
@@ -1771,7 +1759,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
 				parent = path->nodes[*level + 1];
 
 			root_owner = btrfs_header_owner(parent);
-			root_gen = btrfs_header_generation(parent);
 			wc->process_func(root, path->nodes[*level], wc,
 				 btrfs_header_generation(path->nodes[*level]));
 			if (wc->free) {
@@ -2273,7 +2260,7 @@ fail:
 	}
 	btrfs_end_log_trans(root);
 
-	return 0;
+	return err;
 }
 
 /* see comments for btrfs_del_dir_entries_in_log */
@@ -2729,7 +2716,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 	struct btrfs_key max_key;
 	struct btrfs_root *log = root->log_root;
 	struct extent_buffer *src = NULL;
-	u32 size;
 	int err = 0;
 	int ret;
 	int nritems;
@@ -2793,7 +2779,6 @@ again:
 			break;
 
 		src = path->nodes[0];
-		size = btrfs_item_size_nr(src, path->slots[0]);
 		if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
 			ins_nr++;
 			goto next_slot;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index d6e3af8be95..cc04dc1445d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -258,7 +258,7 @@ loop_lock:
 
 		BUG_ON(atomic_read(&cur->bi_cnt) == 0);
 
-		if (bio_rw_flagged(cur, BIO_RW_SYNCIO))
+		if (cur->bi_rw & REQ_SYNC)
 			num_sync_run++;
 
 		submit_bio(cur->bi_rw, cur);
@@ -398,7 +398,6 @@ static noinline int device_list_add(const char *path,
 		device->work.func = pending_bios_fn;
 		memcpy(device->uuid, disk_super->dev_item.uuid,
 		       BTRFS_UUID_SIZE);
-		device->barriers = 1;
 		spin_lock_init(&device->io_lock);
 		device->name = kstrdup(path, GFP_NOFS);
 		if (!device->name) {
@@ -462,7 +461,6 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
 		device->devid = orig_dev->devid;
 		device->work.func = pending_bios_fn;
 		memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid));
-		device->barriers = 1;
 		spin_lock_init(&device->io_lock);
 		INIT_LIST_HEAD(&device->dev_list);
 		INIT_LIST_HEAD(&device->dev_alloc_list);
@@ -1489,7 +1487,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	trans = btrfs_start_transaction(root, 0);
 	lock_chunks(root);
 
-	device->barriers = 1;
 	device->writeable = 1;
 	device->work.func = pending_bios_fn;
 	generate_random_uuid(device->uuid);
@@ -1901,7 +1898,6 @@ int btrfs_balance(struct btrfs_root *dev_root)
 	u64 size_to_free;
 	struct btrfs_path *path;
 	struct btrfs_key key;
-	struct btrfs_chunk *chunk;
 	struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root;
 	struct btrfs_trans_handle *trans;
 	struct btrfs_key found_key;
@@ -1965,9 +1961,6 @@ int btrfs_balance(struct btrfs_root *dev_root)
 		if (found_key.objectid != key.objectid)
 			break;
 
-		chunk = btrfs_item_ptr(path->nodes[0],
-				       path->slots[0],
-				       struct btrfs_chunk);
 		/* chunk zero is special */
 		if (found_key.offset == 0)
 			break;
@@ -2651,7 +2644,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 	int max_errors = 0;
 	struct btrfs_multi_bio *multi = NULL;
 
-	if (multi_ret && !(rw & (1 << BIO_RW)))
+	if (multi_ret && !(rw & REQ_WRITE))
 		stripes_allocated = 1;
 again:
 	if (multi_ret) {
@@ -2687,7 +2680,7 @@ again:
 		mirror_num = 0;
 
 	/* if our multi bio struct is too small, back off and try again */
-	if (rw & (1 << BIO_RW)) {
+	if (rw & REQ_WRITE) {
 		if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
 				 BTRFS_BLOCK_GROUP_DUP)) {
 			stripes_required = map->num_stripes;
@@ -2697,7 +2690,7 @@ again:
 			max_errors = 1;
 		}
 	}
-	if (multi_ret && (rw & (1 << BIO_RW)) &&
+	if (multi_ret && (rw & REQ_WRITE) &&
 	    stripes_allocated < stripes_required) {
 		stripes_allocated = map->num_stripes;
 		free_extent_map(em);
@@ -2733,7 +2726,7 @@ again:
 	num_stripes = 1;
 	stripe_index = 0;
 	if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
-		if (unplug_page || (rw & (1 << BIO_RW)))
+		if (unplug_page || (rw & REQ_WRITE))
 			num_stripes = map->num_stripes;
 		else if (mirror_num)
 			stripe_index = mirror_num - 1;
@@ -2744,7 +2737,7 @@ again:
 		}
 
 	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
-		if (rw & (1 << BIO_RW))
+		if (rw & REQ_WRITE)
 			num_stripes = map->num_stripes;
 		else if (mirror_num)
 			stripe_index = mirror_num - 1;
@@ -2755,7 +2748,7 @@ again:
 		stripe_index = do_div(stripe_nr, factor);
 		stripe_index *= map->sub_stripes;
 
-		if (unplug_page || (rw & (1 << BIO_RW)))
+		if (unplug_page || (rw & REQ_WRITE))
 			num_stripes = map->sub_stripes;
 		else if (mirror_num)
 			stripe_index += mirror_num - 1;
@@ -2945,7 +2938,7 @@ static noinline int schedule_bio(struct btrfs_root *root,
 	struct btrfs_pending_bios *pending_bios;
 
 	/* don't bother with additional async steps for reads, right now */
-	if (!(rw & (1 << BIO_RW))) {
+	if (!(rw & REQ_WRITE)) {
 		bio_get(bio);
 		submit_bio(rw, bio);
 		bio_put(bio);
@@ -2964,7 +2957,7 @@ static noinline int schedule_bio(struct btrfs_root *root,
 	bio->bi_rw |= rw;
 
 	spin_lock(&device->io_lock);
-	if (bio_rw_flagged(bio, BIO_RW_SYNCIO))
+	if (bio->bi_rw & REQ_SYNC)
 		pending_bios = &device->pending_sync_bios;
 	else
 		pending_bios = &device->pending_bios;
@@ -3034,8 +3027,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 		}
 		bio->bi_sector = multi->stripes[dev_nr].physical >> 9;
 		dev = multi->stripes[dev_nr].dev;
-		BUG_ON(rw == WRITE && !dev->writeable);
-		if (dev && dev->bdev) {
+		if (dev && dev->bdev && (rw != WRITE || dev->writeable)) {
 			bio->bi_bdev = dev->bdev;
 			if (async_submit)
 				schedule_bio(root, dev, rw, bio);
@@ -3084,7 +3076,6 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
 		return NULL;
 	list_add(&device->dev_list,
 		 &fs_devices->devices);
-	device->barriers = 1;
 	device->dev_root = root->fs_info->dev_root;
 	device->devid = devid;
 	device->work.func = pending_bios_fn;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 31b0fabdd2e..2b638b6e4ee 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -42,7 +42,6 @@ struct btrfs_device {
 	int running_pending;
 	u64 generation;
 
-	int barriers;
 	int writeable;
 	int in_fs_metadata;
 
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 88ecbb21587..698fdd2c739 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -178,7 +178,6 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
 	struct inode *inode = dentry->d_inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_path *path;
-	struct btrfs_item *item;
 	struct extent_buffer *leaf;
 	struct btrfs_dir_item *di;
 	int ret = 0, slot, advance;
@@ -234,7 +233,6 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
 		}
 		advance = 1;
 
-		item = btrfs_item_nr(leaf, slot);
 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
 
 		/* check to make sure this item is what we want */
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 3e2b90eaa23..b9cd5445f71 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -199,8 +199,6 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
 	int nr_pages = 0;
 	struct page *in_page = NULL;
 	struct page *out_page = NULL;
-	int out_written = 0;
-	int in_read = 0;
 	unsigned long bytes_left;
 
 	*out_pages = 0;
@@ -233,9 +231,6 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
 	workspace->def_strm.avail_out = PAGE_CACHE_SIZE;
 	workspace->def_strm.avail_in = min(len, PAGE_CACHE_SIZE);
 
-	out_written = 0;
-	in_read = 0;
-
 	while (workspace->def_strm.total_in < len) {
 		ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH);
 		if (ret != Z_OK) {
diff --git a/fs/buffer.c b/fs/buffer.c
index d54812b198e..5930e382959 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -156,7 +156,7 @@ void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 	if (uptodate) {
 		set_buffer_uptodate(bh);
 	} else {
-		if (!buffer_eopnotsupp(bh) && !quiet_error(bh)) {
+		if (!quiet_error(bh)) {
 			buffer_io_error(bh);
 			printk(KERN_WARNING "lost page write due to "
 					"I/O error on %s\n",
@@ -770,11 +770,12 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
 				spin_unlock(lock);
 				/*
 				 * Ensure any pending I/O completes so that
-				 * ll_rw_block() actually writes the current
-				 * contents - it is a noop if I/O is still in
-				 * flight on potentially older contents.
+				 * write_dirty_buffer() actually writes the
+				 * current contents - it is a noop if I/O is
+				 * still in flight on potentially older
+				 * contents.
 				 */
-				ll_rw_block(SWRITE_SYNC_PLUG, 1, &bh);
+				write_dirty_buffer(bh, WRITE_SYNC_PLUG);
 
 				/*
 				 * Kick off IO for the previous mapping. Note
@@ -904,7 +905,6 @@ try_again:
 
 		bh->b_state = 0;
 		atomic_set(&bh->b_count, 0);
-		bh->b_private = NULL;
 		bh->b_size = size;
 
 		/* Link the buffer to its page */
@@ -1705,7 +1705,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
 		 * and kswapd activity, but those code paths have their own
 		 * higher-level throttling.
 		 */
-		if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
+		if (wbc->sync_mode != WB_SYNC_NONE) {
 			lock_buffer(bh);
 		} else if (!trylock_buffer(bh)) {
 			redirty_page_for_writepage(wbc, page);
@@ -1833,9 +1833,12 @@ void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
 }
 EXPORT_SYMBOL(page_zero_new_buffers);
 
-static int __block_prepare_write(struct inode *inode, struct page *page,
-		unsigned from, unsigned to, get_block_t *get_block)
+int __block_write_begin(struct page *page, loff_t pos, unsigned len,
+		get_block_t *get_block)
 {
+	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+	unsigned to = from + len;
+	struct inode *inode = page->mapping->host;
 	unsigned block_start, block_end;
 	sector_t block;
 	int err = 0;
@@ -1908,10 +1911,13 @@ static int __block_prepare_write(struct inode *inode, struct page *page,
 		if (!buffer_uptodate(*wait_bh))
 			err = -EIO;
 	}
-	if (unlikely(err))
+	if (unlikely(err)) {
 		page_zero_new_buffers(page, from, to);
+		ClearPageUptodate(page);
+	}
 	return err;
 }
+EXPORT_SYMBOL(__block_write_begin);
 
 static int __block_commit_write(struct inode *inode, struct page *page,
 		unsigned from, unsigned to)
@@ -1949,90 +1955,32 @@ static int __block_commit_write(struct inode *inode, struct page *page,
 }
 
 /*
- * Filesystems implementing the new truncate sequence should use the
- * _newtrunc postfix variant which won't incorrectly call vmtruncate.
+ * block_write_begin takes care of the basic task of block allocation and
+ * bringing partial write blocks uptodate first.
+ *
  * The filesystem needs to handle block truncation upon failure.
  */
-int block_write_begin_newtrunc(struct file *file, struct address_space *mapping,
-			loff_t pos, unsigned len, unsigned flags,
-			struct page **pagep, void **fsdata,
-			get_block_t *get_block)
+int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
+		unsigned flags, struct page **pagep, get_block_t *get_block)
 {
-	struct inode *inode = mapping->host;
-	int status = 0;
+	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
 	struct page *page;
-	pgoff_t index;
-	unsigned start, end;
-	int ownpage = 0;
+	int status;
 
-	index = pos >> PAGE_CACHE_SHIFT;
-	start = pos & (PAGE_CACHE_SIZE - 1);
-	end = start + len;
-
-	page = *pagep;
-	if (page == NULL) {
-		ownpage = 1;
-		page = grab_cache_page_write_begin(mapping, index, flags);
-		if (!page) {
-			status = -ENOMEM;
-			goto out;
-		}
-		*pagep = page;
-	} else
-		BUG_ON(!PageLocked(page));
+	page = grab_cache_page_write_begin(mapping, index, flags);
+	if (!page)
+		return -ENOMEM;
 
-	status = __block_prepare_write(inode, page, start, end, get_block);
+	status = __block_write_begin(page, pos, len, get_block);
 	if (unlikely(status)) {
-		ClearPageUptodate(page);
-
-		if (ownpage) {
-			unlock_page(page);
-			page_cache_release(page);
-			*pagep = NULL;
-		}
+		unlock_page(page);
+		page_cache_release(page);
+		page = NULL;
 	}
 
-out:
+	*pagep = page;
 	return status;
 }
-EXPORT_SYMBOL(block_write_begin_newtrunc);
-
-/*
- * block_write_begin takes care of the basic task of block allocation and
- * bringing partial write blocks uptodate first.
- *
- * If *pagep is not NULL, then block_write_begin uses the locked page
- * at *pagep rather than allocating its own. In this case, the page will
- * not be unlocked or deallocated on failure.
- */
-int block_write_begin(struct file *file, struct address_space *mapping,
-			loff_t pos, unsigned len, unsigned flags,
-			struct page **pagep, void **fsdata,
-			get_block_t *get_block)
-{
-	int ret;
-
-	ret = block_write_begin_newtrunc(file, mapping, pos, len, flags,
-					pagep, fsdata, get_block);
-
-	/*
-	 * prepare_write() may have instantiated a few blocks
-	 * outside i_size.  Trim these off again. Don't need
-	 * i_size_read because we hold i_mutex.
-	 *
-	 * Filesystems which pass down their own page also cannot
-	 * call into vmtruncate here because it would lead to lock
-	 * inversion problems (*pagep is locked). This is a further
-	 * example of where the old truncate sequence is inadequate.
-	 */
-	if (unlikely(ret) && *pagep == NULL) {
-		loff_t isize = mapping->host->i_size;
-		if (pos + len > isize)
-			vmtruncate(mapping->host, isize);
-	}
-
-	return ret;
-}
 EXPORT_SYMBOL(block_write_begin);
 
 int block_write_end(struct file *file, struct address_space *mapping,
@@ -2351,7 +2299,7 @@ out:
  * For moronic filesystems that do not allow holes in file.
  * We may have to extend the file.
  */
-int cont_write_begin_newtrunc(struct file *file, struct address_space *mapping,
+int cont_write_begin(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
 			struct page **pagep, void **fsdata,
 			get_block_t *get_block, loff_t *bytes)
@@ -2363,7 +2311,7 @@ int cont_write_begin_newtrunc(struct file *file, struct address_space *mapping,
 
 	err = cont_expand_zero(file, mapping, pos, bytes);
 	if (err)
-		goto out;
+		return err;
 
 	zerofrom = *bytes & ~PAGE_CACHE_MASK;
 	if (pos+len > *bytes && zerofrom & (blocksize-1)) {
@@ -2371,44 +2319,10 @@ int cont_write_begin_newtrunc(struct file *file, struct address_space *mapping,
 		(*bytes)++;
 	}
 
-	*pagep = NULL;
-	err = block_write_begin_newtrunc(file, mapping, pos, len,
-				flags, pagep, fsdata, get_block);
-out:
-	return err;
-}
-EXPORT_SYMBOL(cont_write_begin_newtrunc);
-
-int cont_write_begin(struct file *file, struct address_space *mapping,
-			loff_t pos, unsigned len, unsigned flags,
-			struct page **pagep, void **fsdata,
-			get_block_t *get_block, loff_t *bytes)
-{
-	int ret;
-
-	ret = cont_write_begin_newtrunc(file, mapping, pos, len, flags,
-					pagep, fsdata, get_block, bytes);
-	if (unlikely(ret)) {
-		loff_t isize = mapping->host->i_size;
-		if (pos + len > isize)
-			vmtruncate(mapping->host, isize);
-	}
-
-	return ret;
+	return block_write_begin(mapping, pos, len, flags, pagep, get_block);
 }
 EXPORT_SYMBOL(cont_write_begin);
 
-int block_prepare_write(struct page *page, unsigned from, unsigned to,
-			get_block_t *get_block)
-{
-	struct inode *inode = page->mapping->host;
-	int err = __block_prepare_write(inode, page, from, to, get_block);
-	if (err)
-		ClearPageUptodate(page);
-	return err;
-}
-EXPORT_SYMBOL(block_prepare_write);
-
 int block_commit_write(struct page *page, unsigned from, unsigned to)
 {
 	struct inode *inode = page->mapping->host;
@@ -2457,7 +2371,7 @@ block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
 	else
 		end = PAGE_CACHE_SIZE;
 
-	ret = block_prepare_write(page, 0, end, get_block);
+	ret = __block_write_begin(page, 0, end, get_block);
 	if (!ret)
 		ret = block_commit_write(page, 0, end);
 
@@ -2510,11 +2424,11 @@ static void attach_nobh_buffers(struct page *page, struct buffer_head *head)
 }
 
 /*
- * Filesystems implementing the new truncate sequence should use the
- * _newtrunc postfix variant which won't incorrectly call vmtruncate.
+ * On entry, the page is fully not uptodate.
+ * On exit the page is fully uptodate in the areas outside (from,to)
  * The filesystem needs to handle block truncation upon failure.
  */
-int nobh_write_begin_newtrunc(struct file *file, struct address_space *mapping,
+int nobh_write_begin(struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
 			struct page **pagep, void **fsdata,
 			get_block_t *get_block)
@@ -2544,11 +2458,10 @@ int nobh_write_begin_newtrunc(struct file *file, struct address_space *mapping,
 	*fsdata = NULL;
 
 	if (page_has_buffers(page)) {
-		unlock_page(page);
-		page_cache_release(page);
-		*pagep = NULL;
-		return block_write_begin_newtrunc(file, mapping, pos, len,
-					flags, pagep, fsdata, get_block);
+		ret = __block_write_begin(page, pos, len, get_block);
+		if (unlikely(ret))
+			goto out_release;
+		return ret;
 	}
 
 	if (PageMappedToDisk(page))
@@ -2654,35 +2567,6 @@ out_release:
 
 	return ret;
 }
-EXPORT_SYMBOL(nobh_write_begin_newtrunc);
-
-/*
- * On entry, the page is fully not uptodate.
- * On exit the page is fully uptodate in the areas outside (from,to)
- */
-int nobh_write_begin(struct file *file, struct address_space *mapping,
-			loff_t pos, unsigned len, unsigned flags,
-			struct page **pagep, void **fsdata,
-			get_block_t *get_block)
-{
-	int ret;
-
-	ret = nobh_write_begin_newtrunc(file, mapping, pos, len, flags,
-					pagep, fsdata, get_block);
-
-	/*
-	 * prepare_write() may have instantiated a few blocks
-	 * outside i_size.  Trim these off again. Don't need
-	 * i_size_read because we hold i_mutex.
-	 */
-	if (unlikely(ret)) {
-		loff_t isize = mapping->host->i_size;
-		if (pos + len > isize)
-			vmtruncate(mapping->host, isize);
-	}
-
-	return ret;
-}
 EXPORT_SYMBOL(nobh_write_begin);
 
 int nobh_write_end(struct file *file, struct address_space *mapping,
@@ -2998,7 +2882,6 @@ static void end_bio_bh_io_sync(struct bio *bio, int err)
 
 	if (err == -EOPNOTSUPP) {
 		set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
-		set_bit(BH_Eopnotsupp, &bh->b_state);
 	}
 
 	if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags)))
@@ -3020,13 +2903,6 @@ int submit_bh(int rw, struct buffer_head * bh)
 	BUG_ON(buffer_unwritten(bh));
 
 	/*
-	 * Mask in barrier bit for a write (could be either a WRITE or a
-	 * WRITE_SYNC
-	 */
-	if (buffer_ordered(bh) && (rw & WRITE))
-		rw |= WRITE_BARRIER;
-
-	/*
 	 * Only clear out a write error when rewriting
 	 */
 	if (test_set_buffer_req(bh) && (rw & WRITE))
@@ -3064,22 +2940,21 @@ EXPORT_SYMBOL(submit_bh);
 
 /**
  * ll_rw_block: low-level access to block devices (DEPRECATED)
- * @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead)
+ * @rw: whether to %READ or %WRITE or maybe %READA (readahead)
  * @nr: number of &struct buffer_heads in the array
  * @bhs: array of pointers to &struct buffer_head
  *
  * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
  * requests an I/O operation on them, either a %READ or a %WRITE.  The third
- * %SWRITE is like %WRITE only we make sure that the *current* data in buffers
- * are sent to disk. The fourth %READA option is described in the documentation
- * for generic_make_request() which ll_rw_block() calls.
+ * %READA option is described in the documentation for generic_make_request()
+ * which ll_rw_block() calls.
  *
  * This function drops any buffer that it cannot get a lock on (with the
- * BH_Lock state bit) unless SWRITE is required, any buffer that appears to be
- * clean when doing a write request, and any buffer that appears to be
- * up-to-date when doing read request.  Further it marks as clean buffers that
- * are processed for writing (the buffer cache won't assume that they are
- * actually clean until the buffer gets unlocked).
+ * BH_Lock state bit), any buffer that appears to be clean when doing a write
+ * request, and any buffer that appears to be up-to-date when doing read
+ * request.  Further it marks as clean buffers that are processed for
+ * writing (the buffer cache won't assume that they are actually clean
+ * until the buffer gets unlocked).
  *
  * ll_rw_block sets b_end_io to simple completion handler that marks
  * the buffer up-to-date (if approriate), unlocks the buffer and wakes
@@ -3095,20 +2970,13 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
 	for (i = 0; i < nr; i++) {
 		struct buffer_head *bh = bhs[i];
 
-		if (rw == SWRITE || rw == SWRITE_SYNC || rw == SWRITE_SYNC_PLUG)
-			lock_buffer(bh);
-		else if (!trylock_buffer(bh))
+		if (!trylock_buffer(bh))
 			continue;
-
-		if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC ||
-		    rw == SWRITE_SYNC_PLUG) {
+		if (rw == WRITE) {
 			if (test_clear_buffer_dirty(bh)) {
 				bh->b_end_io = end_buffer_write_sync;
 				get_bh(bh);
-				if (rw == SWRITE_SYNC)
-					submit_bh(WRITE_SYNC, bh);
-				else
-					submit_bh(WRITE, bh);
+				submit_bh(WRITE, bh);
 				continue;
 			}
 		} else {
@@ -3124,12 +2992,25 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
 }
 EXPORT_SYMBOL(ll_rw_block);
 
+void write_dirty_buffer(struct buffer_head *bh, int rw)
+{
+	lock_buffer(bh);
+	if (!test_clear_buffer_dirty(bh)) {
+		unlock_buffer(bh);
+		return;
+	}
+	bh->b_end_io = end_buffer_write_sync;
+	get_bh(bh);
+	submit_bh(rw, bh);
+}
+EXPORT_SYMBOL(write_dirty_buffer);
+
 /*
  * For a data-integrity writeout, we need to wait upon any in-progress I/O
  * and then start new I/O and then wait upon it.  The caller must have a ref on
  * the buffer_head.
  */
-int sync_dirty_buffer(struct buffer_head *bh)
+int __sync_dirty_buffer(struct buffer_head *bh, int rw)
 {
 	int ret = 0;
 
@@ -3138,12 +3019,8 @@ int sync_dirty_buffer(struct buffer_head *bh)
 	if (test_clear_buffer_dirty(bh)) {
 		get_bh(bh);
 		bh->b_end_io = end_buffer_write_sync;
-		ret = submit_bh(WRITE_SYNC, bh);
+		ret = submit_bh(rw, bh);
 		wait_on_buffer(bh);
-		if (buffer_eopnotsupp(bh)) {
-			clear_buffer_eopnotsupp(bh);
-			ret = -EOPNOTSUPP;
-		}
 		if (!ret && !buffer_uptodate(bh))
 			ret = -EIO;
 	} else {
@@ -3151,6 +3028,12 @@ int sync_dirty_buffer(struct buffer_head *bh)
 	}
 	return ret;
 }
+EXPORT_SYMBOL(__sync_dirty_buffer);
+
+int sync_dirty_buffer(struct buffer_head *bh)
+{
+	return __sync_dirty_buffer(bh, WRITE_SYNC);
+}
 EXPORT_SYMBOL(sync_dirty_buffer);
 
 /*
diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c
index 2906077ac79..a2603e7c0bb 100644
--- a/fs/cachefiles/bind.c
+++ b/fs/cachefiles/bind.c
@@ -146,7 +146,7 @@ static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache)
 		goto error_unsupported;
 
 	/* get the cache size and blocksize */
-	ret = vfs_statfs(root, &stats);
+	ret = vfs_statfs(&path, &stats);
 	if (ret < 0)
 		goto error_unsupported;
 
diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c
index c2413561ea7..0a1467b1551 100644
--- a/fs/cachefiles/daemon.c
+++ b/fs/cachefiles/daemon.c
@@ -55,6 +55,7 @@ const struct file_operations cachefiles_daemon_fops = {
 	.read		= cachefiles_daemon_read,
 	.write		= cachefiles_daemon_write,
 	.poll		= cachefiles_daemon_poll,
+	.llseek		= noop_llseek,
 };
 
 struct cachefiles_daemon_cmd {
@@ -552,8 +553,7 @@ static int cachefiles_daemon_tag(struct cachefiles_cache *cache, char *args)
  */
 static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args)
 {
-	struct fs_struct *fs;
-	struct dentry *dir;
+	struct path path;
 	const struct cred *saved_cred;
 	int ret;
 
@@ -573,24 +573,21 @@ static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args)
 	}
 
 	/* extract the directory dentry from the cwd */
-	fs = current->fs;
-	read_lock(&fs->lock);
-	dir = dget(fs->pwd.dentry);
-	read_unlock(&fs->lock);
+	get_fs_pwd(current->fs, &path);
 
-	if (!S_ISDIR(dir->d_inode->i_mode))
+	if (!S_ISDIR(path.dentry->d_inode->i_mode))
 		goto notdir;
 
 	cachefiles_begin_secure(cache, &saved_cred);
-	ret = cachefiles_cull(cache, dir, args);
+	ret = cachefiles_cull(cache, path.dentry, args);
 	cachefiles_end_secure(cache, saved_cred);
 
-	dput(dir);
+	path_put(&path);
 	_leave(" = %d", ret);
 	return ret;
 
 notdir:
-	dput(dir);
+	path_put(&path);
 	kerror("cull command requires dirfd to be a directory");
 	return -ENOTDIR;
 
@@ -628,8 +625,7 @@ inval:
  */
 static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args)
 {
-	struct fs_struct *fs;
-	struct dentry *dir;
+	struct path path;
 	const struct cred *saved_cred;
 	int ret;
 
@@ -649,24 +645,21 @@ static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args)
 	}
 
 	/* extract the directory dentry from the cwd */
-	fs = current->fs;
-	read_lock(&fs->lock);
-	dir = dget(fs->pwd.dentry);
-	read_unlock(&fs->lock);
+	get_fs_pwd(current->fs, &path);
 
-	if (!S_ISDIR(dir->d_inode->i_mode))
+	if (!S_ISDIR(path.dentry->d_inode->i_mode))
 		goto notdir;
 
 	cachefiles_begin_secure(cache, &saved_cred);
-	ret = cachefiles_check_in_use(cache, dir, args);
+	ret = cachefiles_check_in_use(cache, path.dentry, args);
 	cachefiles_end_secure(cache, saved_cred);
 
-	dput(dir);
+	path_put(&path);
 	//_leave(" = %d", ret);
 	return ret;
 
 notdir:
-	dput(dir);
+	path_put(&path);
 	kerror("inuse command requires dirfd to be a directory");
 	return -ENOTDIR;
 
@@ -683,6 +676,10 @@ int cachefiles_has_space(struct cachefiles_cache *cache,
 			 unsigned fnr, unsigned bnr)
 {
 	struct kstatfs stats;
+	struct path path = {
+		.mnt	= cache->mnt,
+		.dentry	= cache->mnt->mnt_root,
+	};
 	int ret;
 
 	//_enter("{%llu,%llu,%llu,%llu,%llu,%llu},%u,%u",
@@ -697,7 +694,7 @@ int cachefiles_has_space(struct cachefiles_cache *cache,
 	/* find out how many pages of blockdev are available */
 	memset(&stats, 0, sizeof(stats));
 
-	ret = vfs_statfs(cache->mnt->mnt_root, &stats);
+	ret = vfs_statfs(&path, &stats);
 	if (ret < 0) {
 		if (ret == -EIO)
 			cachefiles_io_error(cache, "statfs failed");
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index a8cd821226d..bd6bc1bde2d 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -267,13 +267,6 @@ do {									\
 #define dbgprintk(FMT, ...) \
 	printk(KERN_DEBUG "[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__)
 
-/* make sure we maintain the format strings, even when debugging is disabled */
-static inline void _dbprintk(const char *fmt, ...)
-	__attribute__((format(printf, 1, 2)));
-static inline void _dbprintk(const char *fmt, ...)
-{
-}
-
 #define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
 #define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
 #define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__)
@@ -304,9 +297,9 @@ do {							\
 } while (0)
 
 #else
-#define _enter(FMT, ...) _dbprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
-#define _leave(FMT, ...) _dbprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
-#define _debug(FMT, ...) _dbprintk(FMT, ##__VA_ARGS__)
+#define _enter(FMT, ...) no_printk("==> %s("FMT")", __func__, ##__VA_ARGS__)
+#define _leave(FMT, ...) no_printk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
+#define _debug(FMT, ...) no_printk(FMT, ##__VA_ARGS__)
 #endif
 
 #if 1 /* defined(__KDEBUGALL) */
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index f4a7840bf42..42c7fafc8bf 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -37,9 +37,9 @@ void __cachefiles_printk_object(struct cachefiles_object *object,
 
 	printk(KERN_ERR "%sobject: OBJ%x\n",
 	       prefix, object->fscache.debug_id);
-	printk(KERN_ERR "%sobjstate=%s fl=%lx swfl=%lx ev=%lx[%lx]\n",
+	printk(KERN_ERR "%sobjstate=%s fl=%lx wbusy=%x ev=%lx[%lx]\n",
 	       prefix, fscache_object_states[object->fscache.state],
-	       object->fscache.flags, object->fscache.work.flags,
+	       object->fscache.flags, work_busy(&object->fscache.work),
 	       object->fscache.events,
 	       object->fscache.event_mask & FSCACHE_OBJECT_EVENTS_MASK);
 	printk(KERN_ERR "%sops=%u inp=%u exc=%u\n",
@@ -212,7 +212,7 @@ wait_for_old_object:
 
 		/* if the object we're waiting for is queued for processing,
 		 * then just put ourselves on the queue behind it */
-		if (slow_work_is_queued(&xobject->fscache.work)) {
+		if (work_pending(&xobject->fscache.work)) {
 			_debug("queue OBJ%x behind OBJ%x immediately",
 			       object->fscache.debug_id,
 			       xobject->fscache.debug_id);
@@ -220,8 +220,7 @@ wait_for_old_object:
 		}
 
 		/* otherwise we sleep until either the object we're waiting for
-		 * is done, or the slow-work facility wants the thread back to
-		 * do other work */
+		 * is done, or the fscache_object is congested */
 		wq = bit_waitqueue(&xobject->flags, CACHEFILES_OBJECT_ACTIVE);
 		init_wait(&wait);
 		requeue = false;
@@ -229,8 +228,8 @@ wait_for_old_object:
 			prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
 			if (!test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags))
 				break;
-			requeue = slow_work_sleep_till_thread_needed(
-				&object->fscache.work, &timeout);
+
+			requeue = fscache_object_sleep_till_congested(&timeout);
 		} while (timeout > 0 && !requeue);
 		finish_wait(wq, &wait);
 
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 0f0d41fbb03..0e3c0924cc3 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -422,7 +422,7 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
 	shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits;
 
 	op->op.flags &= FSCACHE_OP_KEEP_FLAGS;
-	op->op.flags |= FSCACHE_OP_FAST;
+	op->op.flags |= FSCACHE_OP_ASYNC;
 	op->op.processor = cachefiles_read_copier;
 
 	pagevec_init(&pagevec, 0);
@@ -729,7 +729,7 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op,
 	pagevec_init(&pagevec, 0);
 
 	op->op.flags &= FSCACHE_OP_KEEP_FLAGS;
-	op->op.flags |= FSCACHE_OP_FAST;
+	op->op.flags |= FSCACHE_OP_ASYNC;
 	op->op.processor = cachefiles_read_copier;
 
 	INIT_LIST_HEAD(&backpages);
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
index bc87b9c1d27..9eb134ea6eb 100644
--- a/fs/ceph/Kconfig
+++ b/fs/ceph/Kconfig
@@ -1,8 +1,11 @@
 config CEPH_FS
         tristate "Ceph distributed file system (EXPERIMENTAL)"
 	depends on INET && EXPERIMENTAL
+	select CEPH_LIB
 	select LIBCRC32C
 	select CRYPTO_AES
+	select CRYPTO
+	default n
 	help
 	  Choose Y or M here to include support for mounting the
 	  experimental Ceph distributed file system.  Ceph is an extremely
@@ -13,15 +16,3 @@ config CEPH_FS
 
 	  If unsure, say N.
 
-config CEPH_FS_PRETTYDEBUG
-	bool "Include file:line in ceph debug output"
-	depends on CEPH_FS
-	default n
-	help
-	  If you say Y here, debug output will include a filename and
-	  line to aid debugging.  This icnreases kernel size and slows
-	  execution slightly when debug call sites are enabled (e.g.,
-	  via CONFIG_DYNAMIC_DEBUG).
-
-	  If unsure, say N.
-
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
index 6a660e610be..9e6c4f2e8ff 100644
--- a/fs/ceph/Makefile
+++ b/fs/ceph/Makefile
@@ -6,17 +6,10 @@ ifneq ($(KERNELRELEASE),)
 
 obj-$(CONFIG_CEPH_FS) += ceph.o
 
-ceph-objs := super.o inode.o dir.o file.o addr.o ioctl.o \
+ceph-objs := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
 	export.o caps.o snap.o xattr.o \
-	messenger.o msgpool.o buffer.o pagelist.o \
-	mds_client.o mdsmap.o \
-	mon_client.o \
-	osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \
-	debugfs.o \
-	auth.o auth_none.o \
-	crypto.o armor.o \
-	auth_x.o \
-	ceph_fs.o ceph_strings.o ceph_hash.o ceph_frag.o
+	mds_client.o mdsmap.o strings.o ceph_frag.o \
+	debugfs.o
 
 else
 #Otherwise we were called directly from the command
diff --git a/fs/ceph/README b/fs/ceph/README
deleted file mode 100644
index 18352fab37c..00000000000
--- a/fs/ceph/README
+++ /dev/null
@@ -1,20 +0,0 @@
-#
-# The following files are shared by (and manually synchronized
-# between) the Ceph userland and kernel client.
-#
-# userland                  kernel
-src/include/ceph_fs.h	    fs/ceph/ceph_fs.h
-src/include/ceph_fs.cc	    fs/ceph/ceph_fs.c
-src/include/msgr.h	    fs/ceph/msgr.h
-src/include/rados.h	    fs/ceph/rados.h
-src/include/ceph_strings.cc fs/ceph/ceph_strings.c
-src/include/ceph_frag.h	    fs/ceph/ceph_frag.h
-src/include/ceph_frag.cc    fs/ceph/ceph_frag.c
-src/include/ceph_hash.h	    fs/ceph/ceph_hash.h
-src/include/ceph_hash.cc    fs/ceph/ceph_hash.c
-src/crush/crush.c	    fs/ceph/crush/crush.c
-src/crush/crush.h	    fs/ceph/crush/crush.h
-src/crush/mapper.c	    fs/ceph/crush/mapper.c
-src/crush/mapper.h	    fs/ceph/crush/mapper.h
-src/crush/hash.h	    fs/ceph/crush/hash.h
-src/crush/hash.c	    fs/ceph/crush/hash.c
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index d9c60b84949..e9c874abc9e 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1,4 +1,4 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 
 #include <linux/backing-dev.h>
 #include <linux/fs.h>
@@ -10,7 +10,8 @@
 #include <linux/task_io_accounting_ops.h>
 
 #include "super.h"
-#include "osd_client.h"
+#include "mds_client.h"
+#include <linux/ceph/osd_client.h>
 
 /*
  * Ceph address space ops.
@@ -87,7 +88,7 @@ static int ceph_set_page_dirty(struct page *page)
 
 	/* dirty the head */
 	spin_lock(&inode->i_lock);
-	if (ci->i_wrbuffer_ref_head == 0)
+	if (ci->i_head_snapc == NULL)
 		ci->i_head_snapc = ceph_get_snap_context(snapc);
 	++ci->i_wrbuffer_ref_head;
 	if (ci->i_wrbuffer_ref == 0)
@@ -105,13 +106,7 @@ static int ceph_set_page_dirty(struct page *page)
 	spin_lock_irq(&mapping->tree_lock);
 	if (page->mapping) {	/* Race with truncate? */
 		WARN_ON_ONCE(!PageUptodate(page));
-
-		if (mapping_cap_account_dirty(mapping)) {
-			__inc_zone_page_state(page, NR_FILE_DIRTY);
-			__inc_bdi_stat(mapping->backing_dev_info,
-					BDI_RECLAIMABLE);
-			task_io_account_write(PAGE_CACHE_SIZE);
-		}
+		account_page_dirtied(page, page->mapping);
 		radix_tree_tag_set(&mapping->page_tree,
 				page_index(page), PAGECACHE_TAG_DIRTY);
 
@@ -199,7 +194,8 @@ static int readpage_nounlock(struct file *filp, struct page *page)
 {
 	struct inode *inode = filp->f_dentry->d_inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
+	struct ceph_osd_client *osdc = 
+		&ceph_inode_to_client(inode)->client->osdc;
 	int err = 0;
 	u64 len = PAGE_CACHE_SIZE;
 
@@ -271,7 +267,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
 {
 	struct inode *inode = file->f_dentry->d_inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
+	struct ceph_osd_client *osdc =
+		&ceph_inode_to_client(inode)->client->osdc;
 	int rc = 0;
 	struct page **pages;
 	loff_t offset;
@@ -309,7 +306,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
 			zero_user_segment(page, s, PAGE_CACHE_SIZE);
 		}
 
-		if (add_to_page_cache_lru(page, mapping, page->index, GFP_NOFS)) {
+		if (add_to_page_cache_lru(page, mapping, page->index,
+					  GFP_NOFS)) {
 			page_cache_release(page);
 			dout("readpages %p add_to_page_cache failed %p\n",
 			     inode, page);
@@ -351,7 +349,7 @@ static struct ceph_snap_context *get_oldest_context(struct inode *inode,
 			break;
 		}
 	}
-	if (!snapc && ci->i_head_snapc) {
+	if (!snapc && ci->i_wrbuffer_ref_head) {
 		snapc = ceph_get_snap_context(ci->i_head_snapc);
 		dout(" head snapc %p has %d dirty pages\n",
 		     snapc, ci->i_wrbuffer_ref_head);
@@ -370,7 +368,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 {
 	struct inode *inode;
 	struct ceph_inode_info *ci;
-	struct ceph_client *client;
+	struct ceph_fs_client *fsc;
 	struct ceph_osd_client *osdc;
 	loff_t page_off = page->index << PAGE_CACHE_SHIFT;
 	int len = PAGE_CACHE_SIZE;
@@ -388,8 +386,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 	}
 	inode = page->mapping->host;
 	ci = ceph_inode(inode);
-	client = ceph_inode_to_client(inode);
-	osdc = &client->osdc;
+	fsc = ceph_inode_to_client(inode);
+	osdc = &fsc->client->osdc;
 
 	/* verify this is a writeable snap context */
 	snapc = (void *)page->private;
@@ -416,13 +414,13 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 	if (i_size < page_off + len)
 		len = i_size - page_off;
 
-	dout("writepage %p page %p index %lu on %llu~%u\n",
-	     inode, page, page->index, page_off, len);
+	dout("writepage %p page %p index %lu on %llu~%u snapc %p\n",
+	     inode, page, page->index, page_off, len, snapc);
 
-	writeback_stat = atomic_long_inc_return(&client->writeback_count);
+	writeback_stat = atomic_long_inc_return(&fsc->writeback_count);
 	if (writeback_stat >
-	    CONGESTION_ON_THRESH(client->mount_args->congestion_kb))
-		set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC);
+	    CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))
+		set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC);
 
 	set_page_writeback(page);
 	err = ceph_osdc_writepages(osdc, ceph_vino(inode),
@@ -501,7 +499,7 @@ static void writepages_finish(struct ceph_osd_request *req,
 	struct address_space *mapping = inode->i_mapping;
 	__s32 rc = -EIO;
 	u64 bytes = 0;
-	struct ceph_client *client = ceph_inode_to_client(inode);
+	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
 	long writeback_stat;
 	unsigned issued = ceph_caps_issued(ci);
 
@@ -534,10 +532,10 @@ static void writepages_finish(struct ceph_osd_request *req,
 		WARN_ON(!PageUptodate(page));
 
 		writeback_stat =
-			atomic_long_dec_return(&client->writeback_count);
+			atomic_long_dec_return(&fsc->writeback_count);
 		if (writeback_stat <
-		    CONGESTION_OFF_THRESH(client->mount_args->congestion_kb))
-			clear_bdi_congested(&client->backing_dev_info,
+		    CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb))
+			clear_bdi_congested(&fsc->backing_dev_info,
 					    BLK_RW_ASYNC);
 
 		ceph_put_snap_context((void *)page->private);
@@ -552,7 +550,7 @@ static void writepages_finish(struct ceph_osd_request *req,
 		 * page truncation thread, possibly losing some data that
 		 * raced its way in
 		 */
-		if ((issued & CEPH_CAP_FILE_CACHE) == 0)
+		if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0)
 			generic_error_remove_page(inode->i_mapping, page);
 
 		unlock_page(page);
@@ -574,13 +572,13 @@ static void writepages_finish(struct ceph_osd_request *req,
  * mempool.  we avoid the mempool if we can because req->r_num_pages
  * may be less than the maximum write size.
  */
-static void alloc_page_vec(struct ceph_client *client,
+static void alloc_page_vec(struct ceph_fs_client *fsc,
 			   struct ceph_osd_request *req)
 {
 	req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages,
 			       GFP_NOFS);
 	if (!req->r_pages) {
-		req->r_pages = mempool_alloc(client->wb_pagevec_pool, GFP_NOFS);
+		req->r_pages = mempool_alloc(fsc->wb_pagevec_pool, GFP_NOFS);
 		req->r_pages_from_pool = 1;
 		WARN_ON(!req->r_pages);
 	}
@@ -593,9 +591,8 @@ static int ceph_writepages_start(struct address_space *mapping,
 				 struct writeback_control *wbc)
 {
 	struct inode *inode = mapping->host;
-	struct backing_dev_info *bdi = mapping->backing_dev_info;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_client *client;
+	struct ceph_fs_client *fsc;
 	pgoff_t index, start, end;
 	int range_whole = 0;
 	int should_loop = 1;
@@ -622,26 +619,19 @@ static int ceph_writepages_start(struct address_space *mapping,
 	     wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
 	     (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
 
-	client = ceph_inode_to_client(inode);
-	if (client->mount_state == CEPH_MOUNT_SHUTDOWN) {
+	fsc = ceph_inode_to_client(inode);
+	if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) {
 		pr_warning("writepage_start %p on forced umount\n", inode);
 		return -EIO; /* we're in a forced umount, don't write! */
 	}
-	if (client->mount_args->wsize && client->mount_args->wsize < wsize)
-		wsize = client->mount_args->wsize;
+	if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize)
+		wsize = fsc->mount_options->wsize;
 	if (wsize < PAGE_CACHE_SIZE)
 		wsize = PAGE_CACHE_SIZE;
 	max_pages_ever = wsize >> PAGE_CACHE_SHIFT;
 
 	pagevec_init(&pvec, 0);
 
-	/* ?? */
-	if (wbc->nonblocking && bdi_write_congested(bdi)) {
-		dout(" writepages congested\n");
-		wbc->encountered_congestion = 1;
-		goto out_final;
-	}
-
 	/* where to start/end? */
 	if (wbc->range_cyclic) {
 		start = mapping->writeback_index; /* Start from prev offset */
@@ -771,9 +761,10 @@ get_more_pages:
 			/* ok */
 			if (locked_pages == 0) {
 				/* prepare async write request */
-				offset = page->index << PAGE_CACHE_SHIFT;
+				offset = (unsigned long long)page->index
+					<< PAGE_CACHE_SHIFT;
 				len = wsize;
-				req = ceph_osdc_new_request(&client->osdc,
+				req = ceph_osdc_new_request(&fsc->client->osdc,
 					    &ci->i_layout,
 					    ceph_vino(inode),
 					    offset, &len,
@@ -786,7 +777,7 @@ get_more_pages:
 					    &inode->i_mtime, true, 1);
 				max_pages = req->r_num_pages;
 
-				alloc_page_vec(client, req);
+				alloc_page_vec(fsc, req);
 				req->r_callback = writepages_finish;
 				req->r_inode = inode;
 			}
@@ -797,9 +788,12 @@ get_more_pages:
 			dout("%p will write page %p idx %lu\n",
 			     inode, page, page->index);
 
-			writeback_stat = atomic_long_inc_return(&client->writeback_count);
-			if (writeback_stat > CONGESTION_ON_THRESH(client->mount_args->congestion_kb)) {
-				set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC);
+			writeback_stat =
+			       atomic_long_inc_return(&fsc->writeback_count);
+			if (writeback_stat > CONGESTION_ON_THRESH(
+				    fsc->mount_options->congestion_kb)) {
+				set_bdi_congested(&fsc->backing_dev_info,
+						  BLK_RW_ASYNC);
 			}
 
 			set_page_writeback(page);
@@ -847,7 +841,7 @@ get_more_pages:
 		op->payload_len = cpu_to_le32(len);
 		req->r_request->hdr.data_len = cpu_to_le32(len);
 
-		ceph_osdc_start_request(&client->osdc, req, true);
+		ceph_osdc_start_request(&fsc->client->osdc, req, true);
 		req = NULL;
 
 		/* continue? */
@@ -883,7 +877,6 @@ out:
 		rc = 0;  /* vfs expects us to return 0 */
 	ceph_put_snap_context(snapc);
 	dout("writepages done, rc = %d\n", rc);
-out_final:
 	return rc;
 }
 
@@ -916,7 +909,7 @@ static int ceph_update_writeable_page(struct file *file,
 {
 	struct inode *inode = file->f_dentry->d_inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
 	loff_t page_off = pos & PAGE_CACHE_MASK;
 	int pos_in_page = pos & ~PAGE_CACHE_MASK;
 	int end_in_page = pos_in_page + len;
@@ -1036,7 +1029,7 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
 		*pagep = page;
 
 		dout("write_begin file %p inode %p page %p %d~%d\n", file,
-	     	inode, page, (int)pos, (int)len);
+		     inode, page, (int)pos, (int)len);
 
 		r = ceph_update_writeable_page(file, pos, len, page);
 	} while (r == -EAGAIN);
@@ -1054,8 +1047,8 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
 			  struct page *page, void *fsdata)
 {
 	struct inode *inode = file->f_dentry->d_inode;
-	struct ceph_client *client = ceph_inode_to_client(inode);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
 	int check_cap = 0;
 
@@ -1124,7 +1117,7 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	struct inode *inode = vma->vm_file->f_dentry->d_inode;
 	struct page *page = vmf->page;
-	struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
 	loff_t off = page->index << PAGE_CACHE_SHIFT;
 	loff_t size, len;
 	int ret;
diff --git a/fs/ceph/armor.c b/fs/ceph/armor.c
deleted file mode 100644
index 67b2c030924..00000000000
--- a/fs/ceph/armor.c
+++ /dev/null
@@ -1,99 +0,0 @@
-
-#include <linux/errno.h>
-
-/*
- * base64 encode/decode.
- */
-
-const char *pem_key = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
-
-static int encode_bits(int c)
-{
-	return pem_key[c];
-}
-
-static int decode_bits(char c)
-{
-	if (c >= 'A' && c <= 'Z')
-		return c - 'A';
-	if (c >= 'a' && c <= 'z')
-		return c - 'a' + 26;
-	if (c >= '0' && c <= '9')
-		return c - '0' + 52;
-	if (c == '+')
-		return 62;
-	if (c == '/')
-		return 63;
-	if (c == '=')
-		return 0; /* just non-negative, please */
-	return -EINVAL;
-}
-
-int ceph_armor(char *dst, const char *src, const char *end)
-{
-	int olen = 0;
-	int line = 0;
-
-	while (src < end) {
-		unsigned char a, b, c;
-
-		a = *src++;
-		*dst++ = encode_bits(a >> 2);
-		if (src < end) {
-			b = *src++;
-			*dst++ = encode_bits(((a & 3) << 4) | (b >> 4));
-			if (src < end) {
-				c = *src++;
-				*dst++ = encode_bits(((b & 15) << 2) |
-						     (c >> 6));
-				*dst++ = encode_bits(c & 63);
-			} else {
-				*dst++ = encode_bits((b & 15) << 2);
-				*dst++ = '=';
-			}
-		} else {
-			*dst++ = encode_bits(((a & 3) << 4));
-			*dst++ = '=';
-			*dst++ = '=';
-		}
-		olen += 4;
-		line += 4;
-		if (line == 64) {
-			line = 0;
-			*(dst++) = '\n';
-			olen++;
-		}
-	}
-	return olen;
-}
-
-int ceph_unarmor(char *dst, const char *src, const char *end)
-{
-	int olen = 0;
-
-	while (src < end) {
-		int a, b, c, d;
-
-		if (src < end && src[0] == '\n')
-			src++;
-		if (src + 4 > end)
-			return -EINVAL;
-		a = decode_bits(src[0]);
-		b = decode_bits(src[1]);
-		c = decode_bits(src[2]);
-		d = decode_bits(src[3]);
-		if (a < 0 || b < 0 || c < 0 || d < 0)
-			return -EINVAL;
-
-		*dst++ = (a << 2) | (b >> 4);
-		if (src[2] == '=')
-			return olen + 1;
-		*dst++ = ((b & 15) << 4) | (c >> 2);
-		if (src[3] == '=')
-			return olen + 2;
-		*dst++ = ((c & 3) << 6) | d;
-		olen += 3;
-		src += 4;
-	}
-	return olen;
-}
diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c
deleted file mode 100644
index 89490beaf53..00000000000
--- a/fs/ceph/auth.c
+++ /dev/null
@@ -1,259 +0,0 @@
-#include "ceph_debug.h"
-
-#include <linux/module.h>
-#include <linux/err.h>
-#include <linux/slab.h>
-
-#include "types.h"
-#include "auth_none.h"
-#include "auth_x.h"
-#include "decode.h"
-#include "super.h"
-
-#include "messenger.h"
-
-/*
- * get protocol handler
- */
-static u32 supported_protocols[] = {
-	CEPH_AUTH_NONE,
-	CEPH_AUTH_CEPHX
-};
-
-int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol)
-{
-	switch (protocol) {
-	case CEPH_AUTH_NONE:
-		return ceph_auth_none_init(ac);
-	case CEPH_AUTH_CEPHX:
-		return ceph_x_init(ac);
-	default:
-		return -ENOENT;
-	}
-}
-
-/*
- * setup, teardown.
- */
-struct ceph_auth_client *ceph_auth_init(const char *name, const char *secret)
-{
-	struct ceph_auth_client *ac;
-	int ret;
-
-	dout("auth_init name '%s' secret '%s'\n", name, secret);
-
-	ret = -ENOMEM;
-	ac = kzalloc(sizeof(*ac), GFP_NOFS);
-	if (!ac)
-		goto out;
-
-	ac->negotiating = true;
-	if (name)
-		ac->name = name;
-	else
-		ac->name = CEPH_AUTH_NAME_DEFAULT;
-	dout("auth_init name %s secret %s\n", ac->name, secret);
-	ac->secret = secret;
-	return ac;
-
-out:
-	return ERR_PTR(ret);
-}
-
-void ceph_auth_destroy(struct ceph_auth_client *ac)
-{
-	dout("auth_destroy %p\n", ac);
-	if (ac->ops)
-		ac->ops->destroy(ac);
-	kfree(ac);
-}
-
-/*
- * Reset occurs when reconnecting to the monitor.
- */
-void ceph_auth_reset(struct ceph_auth_client *ac)
-{
-	dout("auth_reset %p\n", ac);
-	if (ac->ops && !ac->negotiating)
-		ac->ops->reset(ac);
-	ac->negotiating = true;
-}
-
-int ceph_entity_name_encode(const char *name, void **p, void *end)
-{
-	int len = strlen(name);
-
-	if (*p + 2*sizeof(u32) + len > end)
-		return -ERANGE;
-	ceph_encode_32(p, CEPH_ENTITY_TYPE_CLIENT);
-	ceph_encode_32(p, len);
-	ceph_encode_copy(p, name, len);
-	return 0;
-}
-
-/*
- * Initiate protocol negotiation with monitor.  Include entity name
- * and list supported protocols.
- */
-int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len)
-{
-	struct ceph_mon_request_header *monhdr = buf;
-	void *p = monhdr + 1, *end = buf + len, *lenp;
-	int i, num;
-	int ret;
-
-	dout("auth_build_hello\n");
-	monhdr->have_version = 0;
-	monhdr->session_mon = cpu_to_le16(-1);
-	monhdr->session_mon_tid = 0;
-
-	ceph_encode_32(&p, 0);  /* no protocol, yet */
-
-	lenp = p;
-	p += sizeof(u32);
-
-	ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
-	ceph_encode_8(&p, 1);
-	num = ARRAY_SIZE(supported_protocols);
-	ceph_encode_32(&p, num);
-	ceph_decode_need(&p, end, num * sizeof(u32), bad);
-	for (i = 0; i < num; i++)
-		ceph_encode_32(&p, supported_protocols[i]);
-
-	ret = ceph_entity_name_encode(ac->name, &p, end);
-	if (ret < 0)
-		return ret;
-	ceph_decode_need(&p, end, sizeof(u64), bad);
-	ceph_encode_64(&p, ac->global_id);
-
-	ceph_encode_32(&lenp, p - lenp - sizeof(u32));
-	return p - buf;
-
-bad:
-	return -ERANGE;
-}
-
-int ceph_build_auth_request(struct ceph_auth_client *ac,
-			   void *msg_buf, size_t msg_len)
-{
-	struct ceph_mon_request_header *monhdr = msg_buf;
-	void *p = monhdr + 1;
-	void *end = msg_buf + msg_len;
-	int ret;
-
-	monhdr->have_version = 0;
-	monhdr->session_mon = cpu_to_le16(-1);
-	monhdr->session_mon_tid = 0;
-
-	ceph_encode_32(&p, ac->protocol);
-
-	ret = ac->ops->build_request(ac, p + sizeof(u32), end);
-	if (ret < 0) {
-		pr_err("error %d building auth method %s request\n", ret,
-		       ac->ops->name);
-		return ret;
-	}
-	dout(" built request %d bytes\n", ret);
-	ceph_encode_32(&p, ret);
-	return p + ret - msg_buf;
-}
-
-/*
- * Handle auth message from monitor.
- */
-int ceph_handle_auth_reply(struct ceph_auth_client *ac,
-			   void *buf, size_t len,
-			   void *reply_buf, size_t reply_len)
-{
-	void *p = buf;
-	void *end = buf + len;
-	int protocol;
-	s32 result;
-	u64 global_id;
-	void *payload, *payload_end;
-	int payload_len;
-	char *result_msg;
-	int result_msg_len;
-	int ret = -EINVAL;
-
-	dout("handle_auth_reply %p %p\n", p, end);
-	ceph_decode_need(&p, end, sizeof(u32) * 3 + sizeof(u64), bad);
-	protocol = ceph_decode_32(&p);
-	result = ceph_decode_32(&p);
-	global_id = ceph_decode_64(&p);
-	payload_len = ceph_decode_32(&p);
-	payload = p;
-	p += payload_len;
-	ceph_decode_need(&p, end, sizeof(u32), bad);
-	result_msg_len = ceph_decode_32(&p);
-	result_msg = p;
-	p += result_msg_len;
-	if (p != end)
-		goto bad;
-
-	dout(" result %d '%.*s' gid %llu len %d\n", result, result_msg_len,
-	     result_msg, global_id, payload_len);
-
-	payload_end = payload + payload_len;
-
-	if (global_id && ac->global_id != global_id) {
-		dout(" set global_id %lld -> %lld\n", ac->global_id, global_id);
-		ac->global_id = global_id;
-	}
-
-	if (ac->negotiating) {
-		/* server does not support our protocols? */
-		if (!protocol && result < 0) {
-			ret = result;
-			goto out;
-		}
-		/* set up (new) protocol handler? */
-		if (ac->protocol && ac->protocol != protocol) {
-			ac->ops->destroy(ac);
-			ac->protocol = 0;
-			ac->ops = NULL;
-		}
-		if (ac->protocol != protocol) {
-			ret = ceph_auth_init_protocol(ac, protocol);
-			if (ret) {
-				pr_err("error %d on auth protocol %d init\n",
-				       ret, protocol);
-				goto out;
-			}
-		}
-
-		ac->negotiating = false;
-	}
-
-	ret = ac->ops->handle_reply(ac, result, payload, payload_end);
-	if (ret == -EAGAIN) {
-		return ceph_build_auth_request(ac, reply_buf, reply_len);
-	} else if (ret) {
-		pr_err("auth method '%s' error %d\n", ac->ops->name, ret);
-		return ret;
-	}
-	return 0;
-
-bad:
-	pr_err("failed to decode auth msg\n");
-out:
-	return ret;
-}
-
-int ceph_build_auth(struct ceph_auth_client *ac,
-		    void *msg_buf, size_t msg_len)
-{
-	if (!ac->protocol)
-		return ceph_auth_build_hello(ac, msg_buf, msg_len);
-	BUG_ON(!ac->ops);
-	if (ac->ops->should_authenticate(ac))
-		return ceph_build_auth_request(ac, msg_buf, msg_len);
-	return 0;
-}
-
-int ceph_auth_is_authenticated(struct ceph_auth_client *ac)
-{
-	if (!ac->ops)
-		return 0;
-	return ac->ops->is_authenticated(ac);
-}
diff --git a/fs/ceph/auth.h b/fs/ceph/auth.h
deleted file mode 100644
index d38a2fb4a13..00000000000
--- a/fs/ceph/auth.h
+++ /dev/null
@@ -1,92 +0,0 @@
-#ifndef _FS_CEPH_AUTH_H
-#define _FS_CEPH_AUTH_H
-
-#include "types.h"
-#include "buffer.h"
-
-/*
- * Abstract interface for communicating with the authenticate module.
- * There is some handshake that takes place between us and the monitor
- * to acquire the necessary keys.  These are used to generate an
- * 'authorizer' that we use when connecting to a service (mds, osd).
- */
-
-struct ceph_auth_client;
-struct ceph_authorizer;
-
-struct ceph_auth_client_ops {
-	const char *name;
-
-	/*
-	 * true if we are authenticated and can connect to
-	 * services.
-	 */
-	int (*is_authenticated)(struct ceph_auth_client *ac);
-
-	/*
-	 * true if we should (re)authenticate, e.g., when our tickets
-	 * are getting old and crusty.
-	 */
-	int (*should_authenticate)(struct ceph_auth_client *ac);
-
-	/*
-	 * build requests and process replies during monitor
-	 * handshake.  if handle_reply returns -EAGAIN, we build
-	 * another request.
-	 */
-	int (*build_request)(struct ceph_auth_client *ac, void *buf, void *end);
-	int (*handle_reply)(struct ceph_auth_client *ac, int result,
-			    void *buf, void *end);
-
-	/*
-	 * Create authorizer for connecting to a service, and verify
-	 * the response to authenticate the service.
-	 */
-	int (*create_authorizer)(struct ceph_auth_client *ac, int peer_type,
-				 struct ceph_authorizer **a,
-				 void **buf, size_t *len,
-				 void **reply_buf, size_t *reply_len);
-	int (*verify_authorizer_reply)(struct ceph_auth_client *ac,
-				       struct ceph_authorizer *a, size_t len);
-	void (*destroy_authorizer)(struct ceph_auth_client *ac,
-				   struct ceph_authorizer *a);
-	void (*invalidate_authorizer)(struct ceph_auth_client *ac,
-				      int peer_type);
-
-	/* reset when we (re)connect to a monitor */
-	void (*reset)(struct ceph_auth_client *ac);
-
-	void (*destroy)(struct ceph_auth_client *ac);
-};
-
-struct ceph_auth_client {
-	u32 protocol;           /* CEPH_AUTH_* */
-	void *private;          /* for use by protocol implementation */
-	const struct ceph_auth_client_ops *ops;  /* null iff protocol==0 */
-
-	bool negotiating;       /* true if negotiating protocol */
-	const char *name;       /* entity name */
-	u64 global_id;          /* our unique id in system */
-	const char *secret;     /* our secret key */
-	unsigned want_keys;     /* which services we want */
-};
-
-extern struct ceph_auth_client *ceph_auth_init(const char *name,
-					       const char *secret);
-extern void ceph_auth_destroy(struct ceph_auth_client *ac);
-
-extern void ceph_auth_reset(struct ceph_auth_client *ac);
-
-extern int ceph_auth_build_hello(struct ceph_auth_client *ac,
-				 void *buf, size_t len);
-extern int ceph_handle_auth_reply(struct ceph_auth_client *ac,
-				  void *buf, size_t len,
-				  void *reply_buf, size_t reply_len);
-extern int ceph_entity_name_encode(const char *name, void **p, void *end);
-
-extern int ceph_build_auth(struct ceph_auth_client *ac,
-		    void *msg_buf, size_t msg_len);
-
-extern int ceph_auth_is_authenticated(struct ceph_auth_client *ac);
-
-#endif
diff --git a/fs/ceph/auth_none.c b/fs/ceph/auth_none.c
deleted file mode 100644
index ad1dc21286c..00000000000
--- a/fs/ceph/auth_none.c
+++ /dev/null
@@ -1,131 +0,0 @@
-
-#include "ceph_debug.h"
-
-#include <linux/err.h>
-#include <linux/module.h>
-#include <linux/random.h>
-#include <linux/slab.h>
-
-#include "auth_none.h"
-#include "auth.h"
-#include "decode.h"
-
-static void reset(struct ceph_auth_client *ac)
-{
-	struct ceph_auth_none_info *xi = ac->private;
-
-	xi->starting = true;
-	xi->built_authorizer = false;
-}
-
-static void destroy(struct ceph_auth_client *ac)
-{
-	kfree(ac->private);
-	ac->private = NULL;
-}
-
-static int is_authenticated(struct ceph_auth_client *ac)
-{
-	struct ceph_auth_none_info *xi = ac->private;
-
-	return !xi->starting;
-}
-
-static int should_authenticate(struct ceph_auth_client *ac)
-{
-	struct ceph_auth_none_info *xi = ac->private;
-
-	return xi->starting;
-}
-
-/*
- * the generic auth code decode the global_id, and we carry no actual
- * authenticate state, so nothing happens here.
- */
-static int handle_reply(struct ceph_auth_client *ac, int result,
-			void *buf, void *end)
-{
-	struct ceph_auth_none_info *xi = ac->private;
-
-	xi->starting = false;
-	return result;
-}
-
-/*
- * build an 'authorizer' with our entity_name and global_id.  we can
- * reuse a single static copy since it is identical for all services
- * we connect to.
- */
-static int ceph_auth_none_create_authorizer(
-	struct ceph_auth_client *ac, int peer_type,
-	struct ceph_authorizer **a,
-	void **buf, size_t *len,
-	void **reply_buf, size_t *reply_len)
-{
-	struct ceph_auth_none_info *ai = ac->private;
-	struct ceph_none_authorizer *au = &ai->au;
-	void *p, *end;
-	int ret;
-
-	if (!ai->built_authorizer) {
-		p = au->buf;
-		end = p + sizeof(au->buf);
-		ceph_encode_8(&p, 1);
-		ret = ceph_entity_name_encode(ac->name, &p, end - 8);
-		if (ret < 0)
-			goto bad;
-		ceph_decode_need(&p, end, sizeof(u64), bad2);
-		ceph_encode_64(&p, ac->global_id);
-		au->buf_len = p - (void *)au->buf;
-		ai->built_authorizer = true;
-		dout("built authorizer len %d\n", au->buf_len);
-	}
-
-	*a = (struct ceph_authorizer *)au;
-	*buf = au->buf;
-	*len = au->buf_len;
-	*reply_buf = au->reply_buf;
-	*reply_len = sizeof(au->reply_buf);
-	return 0;
-
-bad2:
-	ret = -ERANGE;
-bad:
-	return ret;
-}
-
-static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac,
-				      struct ceph_authorizer *a)
-{
-	/* nothing to do */
-}
-
-static const struct ceph_auth_client_ops ceph_auth_none_ops = {
-	.name = "none",
-	.reset = reset,
-	.destroy = destroy,
-	.is_authenticated = is_authenticated,
-	.should_authenticate = should_authenticate,
-	.handle_reply = handle_reply,
-	.create_authorizer = ceph_auth_none_create_authorizer,
-	.destroy_authorizer = ceph_auth_none_destroy_authorizer,
-};
-
-int ceph_auth_none_init(struct ceph_auth_client *ac)
-{
-	struct ceph_auth_none_info *xi;
-
-	dout("ceph_auth_none_init %p\n", ac);
-	xi = kzalloc(sizeof(*xi), GFP_NOFS);
-	if (!xi)
-		return -ENOMEM;
-
-	xi->starting = true;
-	xi->built_authorizer = false;
-
-	ac->protocol = CEPH_AUTH_NONE;
-	ac->private = xi;
-	ac->ops = &ceph_auth_none_ops;
-	return 0;
-}
-
diff --git a/fs/ceph/auth_none.h b/fs/ceph/auth_none.h
deleted file mode 100644
index 8164df1a08b..00000000000
--- a/fs/ceph/auth_none.h
+++ /dev/null
@@ -1,30 +0,0 @@
-#ifndef _FS_CEPH_AUTH_NONE_H
-#define _FS_CEPH_AUTH_NONE_H
-
-#include <linux/slab.h>
-
-#include "auth.h"
-
-/*
- * null security mode.
- *
- * we use a single static authorizer that simply encodes our entity name
- * and global id.
- */
-
-struct ceph_none_authorizer {
-	char buf[128];
-	int buf_len;
-	char reply_buf[0];
-};
-
-struct ceph_auth_none_info {
-	bool starting;
-	bool built_authorizer;
-	struct ceph_none_authorizer au;   /* we only need one; it's static */
-};
-
-extern int ceph_auth_none_init(struct ceph_auth_client *ac);
-
-#endif
-
diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c
deleted file mode 100644
index 6d44053ecff..00000000000
--- a/fs/ceph/auth_x.c
+++ /dev/null
@@ -1,684 +0,0 @@
-
-#include "ceph_debug.h"
-
-#include <linux/err.h>
-#include <linux/module.h>
-#include <linux/random.h>
-#include <linux/slab.h>
-
-#include "auth_x.h"
-#include "auth_x_protocol.h"
-#include "crypto.h"
-#include "auth.h"
-#include "decode.h"
-
-#define TEMP_TICKET_BUF_LEN	256
-
-static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed);
-
-static int ceph_x_is_authenticated(struct ceph_auth_client *ac)
-{
-	struct ceph_x_info *xi = ac->private;
-	int need;
-
-	ceph_x_validate_tickets(ac, &need);
-	dout("ceph_x_is_authenticated want=%d need=%d have=%d\n",
-	     ac->want_keys, need, xi->have_keys);
-	return (ac->want_keys & xi->have_keys) == ac->want_keys;
-}
-
-static int ceph_x_should_authenticate(struct ceph_auth_client *ac)
-{
-	struct ceph_x_info *xi = ac->private;
-	int need;
-
-	ceph_x_validate_tickets(ac, &need);
-	dout("ceph_x_should_authenticate want=%d need=%d have=%d\n",
-	     ac->want_keys, need, xi->have_keys);
-	return need != 0;
-}
-
-static int ceph_x_encrypt_buflen(int ilen)
-{
-	return sizeof(struct ceph_x_encrypt_header) + ilen + 16 +
-		sizeof(u32);
-}
-
-static int ceph_x_encrypt(struct ceph_crypto_key *secret,
-			  void *ibuf, int ilen, void *obuf, size_t olen)
-{
-	struct ceph_x_encrypt_header head = {
-		.struct_v = 1,
-		.magic = cpu_to_le64(CEPHX_ENC_MAGIC)
-	};
-	size_t len = olen - sizeof(u32);
-	int ret;
-
-	ret = ceph_encrypt2(secret, obuf + sizeof(u32), &len,
-			    &head, sizeof(head), ibuf, ilen);
-	if (ret)
-		return ret;
-	ceph_encode_32(&obuf, len);
-	return len + sizeof(u32);
-}
-
-static int ceph_x_decrypt(struct ceph_crypto_key *secret,
-			  void **p, void *end, void *obuf, size_t olen)
-{
-	struct ceph_x_encrypt_header head;
-	size_t head_len = sizeof(head);
-	int len, ret;
-
-	len = ceph_decode_32(p);
-	if (*p + len > end)
-		return -EINVAL;
-
-	dout("ceph_x_decrypt len %d\n", len);
-	ret = ceph_decrypt2(secret, &head, &head_len, obuf, &olen,
-			    *p, len);
-	if (ret)
-		return ret;
-	if (head.struct_v != 1 || le64_to_cpu(head.magic) != CEPHX_ENC_MAGIC)
-		return -EPERM;
-	*p += len;
-	return olen;
-}
-
-/*
- * get existing (or insert new) ticket handler
- */
-struct ceph_x_ticket_handler *get_ticket_handler(struct ceph_auth_client *ac,
-						 int service)
-{
-	struct ceph_x_ticket_handler *th;
-	struct ceph_x_info *xi = ac->private;
-	struct rb_node *parent = NULL, **p = &xi->ticket_handlers.rb_node;
-
-	while (*p) {
-		parent = *p;
-		th = rb_entry(parent, struct ceph_x_ticket_handler, node);
-		if (service < th->service)
-			p = &(*p)->rb_left;
-		else if (service > th->service)
-			p = &(*p)->rb_right;
-		else
-			return th;
-	}
-
-	/* add it */
-	th = kzalloc(sizeof(*th), GFP_NOFS);
-	if (!th)
-		return ERR_PTR(-ENOMEM);
-	th->service = service;
-	rb_link_node(&th->node, parent, p);
-	rb_insert_color(&th->node, &xi->ticket_handlers);
-	return th;
-}
-
-static void remove_ticket_handler(struct ceph_auth_client *ac,
-				  struct ceph_x_ticket_handler *th)
-{
-	struct ceph_x_info *xi = ac->private;
-
-	dout("remove_ticket_handler %p %d\n", th, th->service);
-	rb_erase(&th->node, &xi->ticket_handlers);
-	ceph_crypto_key_destroy(&th->session_key);
-	if (th->ticket_blob)
-		ceph_buffer_put(th->ticket_blob);
-	kfree(th);
-}
-
-static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
-				    struct ceph_crypto_key *secret,
-				    void *buf, void *end)
-{
-	struct ceph_x_info *xi = ac->private;
-	int num;
-	void *p = buf;
-	int ret;
-	char *dbuf;
-	char *ticket_buf;
-	u8 reply_struct_v;
-
-	dbuf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS);
-	if (!dbuf)
-		return -ENOMEM;
-
-	ret = -ENOMEM;
-	ticket_buf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS);
-	if (!ticket_buf)
-		goto out_dbuf;
-
-	ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
-	reply_struct_v = ceph_decode_8(&p);
-	if (reply_struct_v != 1)
-		goto bad;
-	num = ceph_decode_32(&p);
-	dout("%d tickets\n", num);
-	while (num--) {
-		int type;
-		u8 tkt_struct_v, blob_struct_v;
-		struct ceph_x_ticket_handler *th;
-		void *dp, *dend;
-		int dlen;
-		char is_enc;
-		struct timespec validity;
-		struct ceph_crypto_key old_key;
-		void *tp, *tpend;
-		struct ceph_timespec new_validity;
-		struct ceph_crypto_key new_session_key;
-		struct ceph_buffer *new_ticket_blob;
-		unsigned long new_expires, new_renew_after;
-		u64 new_secret_id;
-
-		ceph_decode_need(&p, end, sizeof(u32) + 1, bad);
-
-		type = ceph_decode_32(&p);
-		dout(" ticket type %d %s\n", type, ceph_entity_type_name(type));
-
-		tkt_struct_v = ceph_decode_8(&p);
-		if (tkt_struct_v != 1)
-			goto bad;
-
-		th = get_ticket_handler(ac, type);
-		if (IS_ERR(th)) {
-			ret = PTR_ERR(th);
-			goto out;
-		}
-
-		/* blob for me */
-		dlen = ceph_x_decrypt(secret, &p, end, dbuf,
-				      TEMP_TICKET_BUF_LEN);
-		if (dlen <= 0) {
-			ret = dlen;
-			goto out;
-		}
-		dout(" decrypted %d bytes\n", dlen);
-		dend = dbuf + dlen;
-		dp = dbuf;
-
-		tkt_struct_v = ceph_decode_8(&dp);
-		if (tkt_struct_v != 1)
-			goto bad;
-
-		memcpy(&old_key, &th->session_key, sizeof(old_key));
-		ret = ceph_crypto_key_decode(&new_session_key, &dp, dend);
-		if (ret)
-			goto out;
-
-		ceph_decode_copy(&dp, &new_validity, sizeof(new_validity));
-		ceph_decode_timespec(&validity, &new_validity);
-		new_expires = get_seconds() + validity.tv_sec;
-		new_renew_after = new_expires - (validity.tv_sec / 4);
-		dout(" expires=%lu renew_after=%lu\n", new_expires,
-		     new_renew_after);
-
-		/* ticket blob for service */
-		ceph_decode_8_safe(&p, end, is_enc, bad);
-		tp = ticket_buf;
-		if (is_enc) {
-			/* encrypted */
-			dout(" encrypted ticket\n");
-			dlen = ceph_x_decrypt(&old_key, &p, end, ticket_buf,
-					      TEMP_TICKET_BUF_LEN);
-			if (dlen < 0) {
-				ret = dlen;
-				goto out;
-			}
-			dlen = ceph_decode_32(&tp);
-		} else {
-			/* unencrypted */
-			ceph_decode_32_safe(&p, end, dlen, bad);
-			ceph_decode_need(&p, end, dlen, bad);
-			ceph_decode_copy(&p, ticket_buf, dlen);
-		}
-		tpend = tp + dlen;
-		dout(" ticket blob is %d bytes\n", dlen);
-		ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad);
-		blob_struct_v = ceph_decode_8(&tp);
-		new_secret_id = ceph_decode_64(&tp);
-		ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend);
-		if (ret)
-			goto out;
-
-		/* all is well, update our ticket */
-		ceph_crypto_key_destroy(&th->session_key);
-		if (th->ticket_blob)
-			ceph_buffer_put(th->ticket_blob);
-		th->session_key = new_session_key;
-		th->ticket_blob = new_ticket_blob;
-		th->validity = new_validity;
-		th->secret_id = new_secret_id;
-		th->expires = new_expires;
-		th->renew_after = new_renew_after;
-		dout(" got ticket service %d (%s) secret_id %lld len %d\n",
-		     type, ceph_entity_type_name(type), th->secret_id,
-		     (int)th->ticket_blob->vec.iov_len);
-		xi->have_keys |= th->service;
-	}
-
-	ret = 0;
-out:
-	kfree(ticket_buf);
-out_dbuf:
-	kfree(dbuf);
-	return ret;
-
-bad:
-	ret = -EINVAL;
-	goto out;
-}
-
-static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
-				   struct ceph_x_ticket_handler *th,
-				   struct ceph_x_authorizer *au)
-{
-	int maxlen;
-	struct ceph_x_authorize_a *msg_a;
-	struct ceph_x_authorize_b msg_b;
-	void *p, *end;
-	int ret;
-	int ticket_blob_len =
-		(th->ticket_blob ? th->ticket_blob->vec.iov_len : 0);
-
-	dout("build_authorizer for %s %p\n",
-	     ceph_entity_type_name(th->service), au);
-
-	maxlen = sizeof(*msg_a) + sizeof(msg_b) +
-		ceph_x_encrypt_buflen(ticket_blob_len);
-	dout("  need len %d\n", maxlen);
-	if (au->buf && au->buf->alloc_len < maxlen) {
-		ceph_buffer_put(au->buf);
-		au->buf = NULL;
-	}
-	if (!au->buf) {
-		au->buf = ceph_buffer_new(maxlen, GFP_NOFS);
-		if (!au->buf)
-			return -ENOMEM;
-	}
-	au->service = th->service;
-
-	msg_a = au->buf->vec.iov_base;
-	msg_a->struct_v = 1;
-	msg_a->global_id = cpu_to_le64(ac->global_id);
-	msg_a->service_id = cpu_to_le32(th->service);
-	msg_a->ticket_blob.struct_v = 1;
-	msg_a->ticket_blob.secret_id = cpu_to_le64(th->secret_id);
-	msg_a->ticket_blob.blob_len = cpu_to_le32(ticket_blob_len);
-	if (ticket_blob_len) {
-		memcpy(msg_a->ticket_blob.blob, th->ticket_blob->vec.iov_base,
-		       th->ticket_blob->vec.iov_len);
-	}
-	dout(" th %p secret_id %lld %lld\n", th, th->secret_id,
-	     le64_to_cpu(msg_a->ticket_blob.secret_id));
-
-	p = msg_a + 1;
-	p += ticket_blob_len;
-	end = au->buf->vec.iov_base + au->buf->vec.iov_len;
-
-	get_random_bytes(&au->nonce, sizeof(au->nonce));
-	msg_b.struct_v = 1;
-	msg_b.nonce = cpu_to_le64(au->nonce);
-	ret = ceph_x_encrypt(&th->session_key, &msg_b, sizeof(msg_b),
-			     p, end - p);
-	if (ret < 0)
-		goto out_buf;
-	p += ret;
-	au->buf->vec.iov_len = p - au->buf->vec.iov_base;
-	dout(" built authorizer nonce %llx len %d\n", au->nonce,
-	     (int)au->buf->vec.iov_len);
-	BUG_ON(au->buf->vec.iov_len > maxlen);
-	return 0;
-
-out_buf:
-	ceph_buffer_put(au->buf);
-	au->buf = NULL;
-	return ret;
-}
-
-static int ceph_x_encode_ticket(struct ceph_x_ticket_handler *th,
-				void **p, void *end)
-{
-	ceph_decode_need(p, end, 1 + sizeof(u64), bad);
-	ceph_encode_8(p, 1);
-	ceph_encode_64(p, th->secret_id);
-	if (th->ticket_blob) {
-		const char *buf = th->ticket_blob->vec.iov_base;
-		u32 len = th->ticket_blob->vec.iov_len;
-
-		ceph_encode_32_safe(p, end, len, bad);
-		ceph_encode_copy_safe(p, end, buf, len, bad);
-	} else {
-		ceph_encode_32_safe(p, end, 0, bad);
-	}
-
-	return 0;
-bad:
-	return -ERANGE;
-}
-
-static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed)
-{
-	int want = ac->want_keys;
-	struct ceph_x_info *xi = ac->private;
-	int service;
-
-	*pneed = ac->want_keys & ~(xi->have_keys);
-
-	for (service = 1; service <= want; service <<= 1) {
-		struct ceph_x_ticket_handler *th;
-
-		if (!(ac->want_keys & service))
-			continue;
-
-		if (*pneed & service)
-			continue;
-
-		th = get_ticket_handler(ac, service);
-
-		if (!th) {
-			*pneed |= service;
-			continue;
-		}
-
-		if (get_seconds() >= th->renew_after)
-			*pneed |= service;
-		if (get_seconds() >= th->expires)
-			xi->have_keys &= ~service;
-	}
-}
-
-
-static int ceph_x_build_request(struct ceph_auth_client *ac,
-				void *buf, void *end)
-{
-	struct ceph_x_info *xi = ac->private;
-	int need;
-	struct ceph_x_request_header *head = buf;
-	int ret;
-	struct ceph_x_ticket_handler *th =
-		get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
-
-	ceph_x_validate_tickets(ac, &need);
-
-	dout("build_request want %x have %x need %x\n",
-	     ac->want_keys, xi->have_keys, need);
-
-	if (need & CEPH_ENTITY_TYPE_AUTH) {
-		struct ceph_x_authenticate *auth = (void *)(head + 1);
-		void *p = auth + 1;
-		struct ceph_x_challenge_blob tmp;
-		char tmp_enc[40];
-		u64 *u;
-
-		if (p > end)
-			return -ERANGE;
-
-		dout(" get_auth_session_key\n");
-		head->op = cpu_to_le16(CEPHX_GET_AUTH_SESSION_KEY);
-
-		/* encrypt and hash */
-		get_random_bytes(&auth->client_challenge, sizeof(u64));
-		tmp.client_challenge = auth->client_challenge;
-		tmp.server_challenge = cpu_to_le64(xi->server_challenge);
-		ret = ceph_x_encrypt(&xi->secret, &tmp, sizeof(tmp),
-				     tmp_enc, sizeof(tmp_enc));
-		if (ret < 0)
-			return ret;
-
-		auth->struct_v = 1;
-		auth->key = 0;
-		for (u = (u64 *)tmp_enc; u + 1 <= (u64 *)(tmp_enc + ret); u++)
-			auth->key ^= *u;
-		dout(" server_challenge %llx client_challenge %llx key %llx\n",
-		     xi->server_challenge, le64_to_cpu(auth->client_challenge),
-		     le64_to_cpu(auth->key));
-
-		/* now encode the old ticket if exists */
-		ret = ceph_x_encode_ticket(th, &p, end);
-		if (ret < 0)
-			return ret;
-
-		return p - buf;
-	}
-
-	if (need) {
-		void *p = head + 1;
-		struct ceph_x_service_ticket_request *req;
-
-		if (p > end)
-			return -ERANGE;
-		head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY);
-
-		BUG_ON(!th);
-		ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer);
-		if (ret)
-			return ret;
-		ceph_encode_copy(&p, xi->auth_authorizer.buf->vec.iov_base,
-				 xi->auth_authorizer.buf->vec.iov_len);
-
-		req = p;
-		req->keys = cpu_to_le32(need);
-		p += sizeof(*req);
-		return p - buf;
-	}
-
-	return 0;
-}
-
-static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
-			       void *buf, void *end)
-{
-	struct ceph_x_info *xi = ac->private;
-	struct ceph_x_reply_header *head = buf;
-	struct ceph_x_ticket_handler *th;
-	int len = end - buf;
-	int op;
-	int ret;
-
-	if (result)
-		return result;  /* XXX hmm? */
-
-	if (xi->starting) {
-		/* it's a hello */
-		struct ceph_x_server_challenge *sc = buf;
-
-		if (len != sizeof(*sc))
-			return -EINVAL;
-		xi->server_challenge = le64_to_cpu(sc->server_challenge);
-		dout("handle_reply got server challenge %llx\n",
-		     xi->server_challenge);
-		xi->starting = false;
-		xi->have_keys &= ~CEPH_ENTITY_TYPE_AUTH;
-		return -EAGAIN;
-	}
-
-	op = le16_to_cpu(head->op);
-	result = le32_to_cpu(head->result);
-	dout("handle_reply op %d result %d\n", op, result);
-	switch (op) {
-	case CEPHX_GET_AUTH_SESSION_KEY:
-		/* verify auth key */
-		ret = ceph_x_proc_ticket_reply(ac, &xi->secret,
-					       buf + sizeof(*head), end);
-		break;
-
-	case CEPHX_GET_PRINCIPAL_SESSION_KEY:
-		th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
-		BUG_ON(!th);
-		ret = ceph_x_proc_ticket_reply(ac, &th->session_key,
-					       buf + sizeof(*head), end);
-		break;
-
-	default:
-		return -EINVAL;
-	}
-	if (ret)
-		return ret;
-	if (ac->want_keys == xi->have_keys)
-		return 0;
-	return -EAGAIN;
-}
-
-static int ceph_x_create_authorizer(
-	struct ceph_auth_client *ac, int peer_type,
-	struct ceph_authorizer **a,
-	void **buf, size_t *len,
-	void **reply_buf, size_t *reply_len)
-{
-	struct ceph_x_authorizer *au;
-	struct ceph_x_ticket_handler *th;
-	int ret;
-
-	th = get_ticket_handler(ac, peer_type);
-	if (IS_ERR(th))
-		return PTR_ERR(th);
-
-	au = kzalloc(sizeof(*au), GFP_NOFS);
-	if (!au)
-		return -ENOMEM;
-
-	ret = ceph_x_build_authorizer(ac, th, au);
-	if (ret) {
-		kfree(au);
-		return ret;
-	}
-
-	*a = (struct ceph_authorizer *)au;
-	*buf = au->buf->vec.iov_base;
-	*len = au->buf->vec.iov_len;
-	*reply_buf = au->reply_buf;
-	*reply_len = sizeof(au->reply_buf);
-	return 0;
-}
-
-static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac,
-					  struct ceph_authorizer *a, size_t len)
-{
-	struct ceph_x_authorizer *au = (void *)a;
-	struct ceph_x_ticket_handler *th;
-	int ret = 0;
-	struct ceph_x_authorize_reply reply;
-	void *p = au->reply_buf;
-	void *end = p + sizeof(au->reply_buf);
-
-	th = get_ticket_handler(ac, au->service);
-	if (!th)
-		return -EIO;  /* hrm! */
-	ret = ceph_x_decrypt(&th->session_key, &p, end, &reply, sizeof(reply));
-	if (ret < 0)
-		return ret;
-	if (ret != sizeof(reply))
-		return -EPERM;
-
-	if (au->nonce + 1 != le64_to_cpu(reply.nonce_plus_one))
-		ret = -EPERM;
-	else
-		ret = 0;
-	dout("verify_authorizer_reply nonce %llx got %llx ret %d\n",
-	     au->nonce, le64_to_cpu(reply.nonce_plus_one), ret);
-	return ret;
-}
-
-static void ceph_x_destroy_authorizer(struct ceph_auth_client *ac,
-				      struct ceph_authorizer *a)
-{
-	struct ceph_x_authorizer *au = (void *)a;
-
-	ceph_buffer_put(au->buf);
-	kfree(au);
-}
-
-
-static void ceph_x_reset(struct ceph_auth_client *ac)
-{
-	struct ceph_x_info *xi = ac->private;
-
-	dout("reset\n");
-	xi->starting = true;
-	xi->server_challenge = 0;
-}
-
-static void ceph_x_destroy(struct ceph_auth_client *ac)
-{
-	struct ceph_x_info *xi = ac->private;
-	struct rb_node *p;
-
-	dout("ceph_x_destroy %p\n", ac);
-	ceph_crypto_key_destroy(&xi->secret);
-
-	while ((p = rb_first(&xi->ticket_handlers)) != NULL) {
-		struct ceph_x_ticket_handler *th =
-			rb_entry(p, struct ceph_x_ticket_handler, node);
-		remove_ticket_handler(ac, th);
-	}
-
-	if (xi->auth_authorizer.buf)
-		ceph_buffer_put(xi->auth_authorizer.buf);
-
-	kfree(ac->private);
-	ac->private = NULL;
-}
-
-static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
-				   int peer_type)
-{
-	struct ceph_x_ticket_handler *th;
-
-	th = get_ticket_handler(ac, peer_type);
-	if (th && !IS_ERR(th))
-		remove_ticket_handler(ac, th);
-}
-
-
-static const struct ceph_auth_client_ops ceph_x_ops = {
-	.name = "x",
-	.is_authenticated = ceph_x_is_authenticated,
-	.should_authenticate = ceph_x_should_authenticate,
-	.build_request = ceph_x_build_request,
-	.handle_reply = ceph_x_handle_reply,
-	.create_authorizer = ceph_x_create_authorizer,
-	.verify_authorizer_reply = ceph_x_verify_authorizer_reply,
-	.destroy_authorizer = ceph_x_destroy_authorizer,
-	.invalidate_authorizer = ceph_x_invalidate_authorizer,
-	.reset =  ceph_x_reset,
-	.destroy = ceph_x_destroy,
-};
-
-
-int ceph_x_init(struct ceph_auth_client *ac)
-{
-	struct ceph_x_info *xi;
-	int ret;
-
-	dout("ceph_x_init %p\n", ac);
-	ret = -ENOMEM;
-	xi = kzalloc(sizeof(*xi), GFP_NOFS);
-	if (!xi)
-		goto out;
-
-	ret = -EINVAL;
-	if (!ac->secret) {
-		pr_err("no secret set (for auth_x protocol)\n");
-		goto out_nomem;
-	}
-
-	ret = ceph_crypto_key_unarmor(&xi->secret, ac->secret);
-	if (ret)
-		goto out_nomem;
-
-	xi->starting = true;
-	xi->ticket_handlers = RB_ROOT;
-
-	ac->protocol = CEPH_AUTH_CEPHX;
-	ac->private = xi;
-	ac->ops = &ceph_x_ops;
-	return 0;
-
-out_nomem:
-	kfree(xi);
-out:
-	return ret;
-}
-
-
diff --git a/fs/ceph/auth_x.h b/fs/ceph/auth_x.h
deleted file mode 100644
index ff6f8180e68..00000000000
--- a/fs/ceph/auth_x.h
+++ /dev/null
@@ -1,49 +0,0 @@
-#ifndef _FS_CEPH_AUTH_X_H
-#define _FS_CEPH_AUTH_X_H
-
-#include <linux/rbtree.h>
-
-#include "crypto.h"
-#include "auth.h"
-#include "auth_x_protocol.h"
-
-/*
- * Handle ticket for a single service.
- */
-struct ceph_x_ticket_handler {
-	struct rb_node node;
-	unsigned service;
-
-	struct ceph_crypto_key session_key;
-	struct ceph_timespec validity;
-
-	u64 secret_id;
-	struct ceph_buffer *ticket_blob;
-
-	unsigned long renew_after, expires;
-};
-
-
-struct ceph_x_authorizer {
-	struct ceph_buffer *buf;
-	unsigned service;
-	u64 nonce;
-	char reply_buf[128];  /* big enough for encrypted blob */
-};
-
-struct ceph_x_info {
-	struct ceph_crypto_key secret;
-
-	bool starting;
-	u64 server_challenge;
-
-	unsigned have_keys;
-	struct rb_root ticket_handlers;
-
-	struct ceph_x_authorizer auth_authorizer;
-};
-
-extern int ceph_x_init(struct ceph_auth_client *ac);
-
-#endif
-
diff --git a/fs/ceph/auth_x_protocol.h b/fs/ceph/auth_x_protocol.h
deleted file mode 100644
index 671d30576c4..00000000000
--- a/fs/ceph/auth_x_protocol.h
+++ /dev/null
@@ -1,90 +0,0 @@
-#ifndef __FS_CEPH_AUTH_X_PROTOCOL
-#define __FS_CEPH_AUTH_X_PROTOCOL
-
-#define CEPHX_GET_AUTH_SESSION_KEY      0x0100
-#define CEPHX_GET_PRINCIPAL_SESSION_KEY 0x0200
-#define CEPHX_GET_ROTATING_KEY          0x0400
-
-/* common bits */
-struct ceph_x_ticket_blob {
-	__u8 struct_v;
-	__le64 secret_id;
-	__le32 blob_len;
-	char blob[];
-} __attribute__ ((packed));
-
-
-/* common request/reply headers */
-struct ceph_x_request_header {
-	__le16 op;
-} __attribute__ ((packed));
-
-struct ceph_x_reply_header {
-	__le16 op;
-	__le32 result;
-} __attribute__ ((packed));
-
-
-/* authenticate handshake */
-
-/* initial hello (no reply header) */
-struct ceph_x_server_challenge {
-	__u8 struct_v;
-	__le64 server_challenge;
-} __attribute__ ((packed));
-
-struct ceph_x_authenticate {
-	__u8 struct_v;
-	__le64 client_challenge;
-	__le64 key;
-	/* ticket blob */
-} __attribute__ ((packed));
-
-struct ceph_x_service_ticket_request {
-	__u8 struct_v;
-	__le32 keys;
-} __attribute__ ((packed));
-
-struct ceph_x_challenge_blob {
-	__le64 server_challenge;
-	__le64 client_challenge;
-} __attribute__ ((packed));
-
-
-
-/* authorize handshake */
-
-/*
- * The authorizer consists of two pieces:
- *  a - service id, ticket blob
- *  b - encrypted with session key
- */
-struct ceph_x_authorize_a {
-	__u8 struct_v;
-	__le64 global_id;
-	__le32 service_id;
-	struct ceph_x_ticket_blob ticket_blob;
-} __attribute__ ((packed));
-
-struct ceph_x_authorize_b {
-	__u8 struct_v;
-	__le64 nonce;
-} __attribute__ ((packed));
-
-struct ceph_x_authorize_reply {
-	__u8 struct_v;
-	__le64 nonce_plus_one;
-} __attribute__ ((packed));
-
-
-/*
- * encyption bundle
- */
-#define CEPHX_ENC_MAGIC 0xff009cad8826aa55ull
-
-struct ceph_x_encrypt_header {
-	__u8 struct_v;
-	__le64 magic;
-} __attribute__ ((packed));
-
-#endif
diff --git a/fs/ceph/buffer.c b/fs/ceph/buffer.c
deleted file mode 100644
index c67535d70aa..00000000000
--- a/fs/ceph/buffer.c
+++ /dev/null
@@ -1,81 +0,0 @@
-
-#include "ceph_debug.h"
-
-#include <linux/slab.h>
-
-#include "buffer.h"
-#include "decode.h"
-
-struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp)
-{
-	struct ceph_buffer *b;
-
-	b = kmalloc(sizeof(*b), gfp);
-	if (!b)
-		return NULL;
-
-	b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN);
-	if (b->vec.iov_base) {
-		b->is_vmalloc = false;
-	} else {
-		b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL);
-		if (!b->vec.iov_base) {
-			kfree(b);
-			return NULL;
-		}
-		b->is_vmalloc = true;
-	}
-
-	kref_init(&b->kref);
-	b->alloc_len = len;
-	b->vec.iov_len = len;
-	dout("buffer_new %p\n", b);
-	return b;
-}
-
-void ceph_buffer_release(struct kref *kref)
-{
-	struct ceph_buffer *b = container_of(kref, struct ceph_buffer, kref);
-
-	dout("buffer_release %p\n", b);
-	if (b->vec.iov_base) {
-		if (b->is_vmalloc)
-			vfree(b->vec.iov_base);
-		else
-			kfree(b->vec.iov_base);
-	}
-	kfree(b);
-}
-
-int ceph_buffer_alloc(struct ceph_buffer *b, int len, gfp_t gfp)
-{
-	b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN);
-	if (b->vec.iov_base) {
-		b->is_vmalloc = false;
-	} else {
-		b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL);
-		b->is_vmalloc = true;
-	}
-	if (!b->vec.iov_base)
-		return -ENOMEM;
-	b->alloc_len = len;
-	b->vec.iov_len = len;
-	return 0;
-}
-
-int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end)
-{
-	size_t len;
-
-	ceph_decode_need(p, end, sizeof(u32), bad);
-	len = ceph_decode_32(p);
-	dout("decode_buffer len %d\n", (int)len);
-	ceph_decode_need(p, end, len, bad);
-	*b = ceph_buffer_new(len, GFP_NOFS);
-	if (!*b)
-		return -ENOMEM;
-	ceph_decode_copy(p, (*b)->vec.iov_base, len);
-	return 0;
-bad:
-	return -EINVAL;
-}
diff --git a/fs/ceph/buffer.h b/fs/ceph/buffer.h
deleted file mode 100644
index 58d19014068..00000000000
--- a/fs/ceph/buffer.h
+++ /dev/null
@@ -1,39 +0,0 @@
-#ifndef __FS_CEPH_BUFFER_H
-#define __FS_CEPH_BUFFER_H
-
-#include <linux/kref.h>
-#include <linux/mm.h>
-#include <linux/vmalloc.h>
-#include <linux/types.h>
-#include <linux/uio.h>
-
-/*
- * a simple reference counted buffer.
- *
- * use kmalloc for small sizes (<= one page), vmalloc for larger
- * sizes.
- */
-struct ceph_buffer {
-	struct kref kref;
-	struct kvec vec;
-	size_t alloc_len;
-	bool is_vmalloc;
-};
-
-extern struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp);
-extern void ceph_buffer_release(struct kref *kref);
-
-static inline struct ceph_buffer *ceph_buffer_get(struct ceph_buffer *b)
-{
-	kref_get(&b->kref);
-	return b;
-}
-
-static inline void ceph_buffer_put(struct ceph_buffer *b)
-{
-	kref_put(&b->kref, ceph_buffer_release);
-}
-
-extern int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end);
-
-#endif
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index b81be9a5648..98ab13e2b71 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1,4 +1,4 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 
 #include <linux/fs.h>
 #include <linux/kernel.h>
@@ -9,8 +9,9 @@
 #include <linux/writeback.h>
 
 #include "super.h"
-#include "decode.h"
-#include "messenger.h"
+#include "mds_client.h"
+#include <linux/ceph/decode.h>
+#include <linux/ceph/messenger.h>
 
 /*
  * Capability management
@@ -113,58 +114,41 @@ const char *ceph_cap_string(int caps)
 	return cap_str[i];
 }
 
-/*
- * Cap reservations
- *
- * Maintain a global pool of preallocated struct ceph_caps, referenced
- * by struct ceph_caps_reservations.  This ensures that we preallocate
- * memory needed to successfully process an MDS response.  (If an MDS
- * sends us cap information and we fail to process it, we will have
- * problems due to the client and MDS being out of sync.)
- *
- * Reservations are 'owned' by a ceph_cap_reservation context.
- */
-static spinlock_t caps_list_lock;
-static struct list_head caps_list;  /* unused (reserved or unreserved) */
-static int caps_total_count;        /* total caps allocated */
-static int caps_use_count;          /* in use */
-static int caps_reserve_count;      /* unused, reserved */
-static int caps_avail_count;        /* unused, unreserved */
-static int caps_min_count;          /* keep at least this many (unreserved) */
-
-void __init ceph_caps_init(void)
+void ceph_caps_init(struct ceph_mds_client *mdsc)
 {
-	INIT_LIST_HEAD(&caps_list);
-	spin_lock_init(&caps_list_lock);
+	INIT_LIST_HEAD(&mdsc->caps_list);
+	spin_lock_init(&mdsc->caps_list_lock);
 }
 
-void ceph_caps_finalize(void)
+void ceph_caps_finalize(struct ceph_mds_client *mdsc)
 {
 	struct ceph_cap *cap;
 
-	spin_lock(&caps_list_lock);
-	while (!list_empty(&caps_list)) {
-		cap = list_first_entry(&caps_list, struct ceph_cap, caps_item);
+	spin_lock(&mdsc->caps_list_lock);
+	while (!list_empty(&mdsc->caps_list)) {
+		cap = list_first_entry(&mdsc->caps_list,
+				       struct ceph_cap, caps_item);
 		list_del(&cap->caps_item);
 		kmem_cache_free(ceph_cap_cachep, cap);
 	}
-	caps_total_count = 0;
-	caps_avail_count = 0;
-	caps_use_count = 0;
-	caps_reserve_count = 0;
-	caps_min_count = 0;
-	spin_unlock(&caps_list_lock);
+	mdsc->caps_total_count = 0;
+	mdsc->caps_avail_count = 0;
+	mdsc->caps_use_count = 0;
+	mdsc->caps_reserve_count = 0;
+	mdsc->caps_min_count = 0;
+	spin_unlock(&mdsc->caps_list_lock);
 }
 
-void ceph_adjust_min_caps(int delta)
+void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta)
 {
-	spin_lock(&caps_list_lock);
-	caps_min_count += delta;
-	BUG_ON(caps_min_count < 0);
-	spin_unlock(&caps_list_lock);
+	spin_lock(&mdsc->caps_list_lock);
+	mdsc->caps_min_count += delta;
+	BUG_ON(mdsc->caps_min_count < 0);
+	spin_unlock(&mdsc->caps_list_lock);
 }
 
-int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need)
+int ceph_reserve_caps(struct ceph_mds_client *mdsc,
+		      struct ceph_cap_reservation *ctx, int need)
 {
 	int i;
 	struct ceph_cap *cap;
@@ -176,16 +160,17 @@ int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need)
 	dout("reserve caps ctx=%p need=%d\n", ctx, need);
 
 	/* first reserve any caps that are already allocated */
-	spin_lock(&caps_list_lock);
-	if (caps_avail_count >= need)
+	spin_lock(&mdsc->caps_list_lock);
+	if (mdsc->caps_avail_count >= need)
 		have = need;
 	else
-		have = caps_avail_count;
-	caps_avail_count -= have;
-	caps_reserve_count += have;
-	BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
-	       caps_avail_count);
-	spin_unlock(&caps_list_lock);
+		have = mdsc->caps_avail_count;
+	mdsc->caps_avail_count -= have;
+	mdsc->caps_reserve_count += have;
+	BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
+					 mdsc->caps_reserve_count +
+					 mdsc->caps_avail_count);
+	spin_unlock(&mdsc->caps_list_lock);
 
 	for (i = have; i < need; i++) {
 		cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
@@ -198,19 +183,20 @@ int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need)
 	}
 	BUG_ON(have + alloc != need);
 
-	spin_lock(&caps_list_lock);
-	caps_total_count += alloc;
-	caps_reserve_count += alloc;
-	list_splice(&newcaps, &caps_list);
+	spin_lock(&mdsc->caps_list_lock);
+	mdsc->caps_total_count += alloc;
+	mdsc->caps_reserve_count += alloc;
+	list_splice(&newcaps, &mdsc->caps_list);
 
-	BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
-	       caps_avail_count);
-	spin_unlock(&caps_list_lock);
+	BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
+					 mdsc->caps_reserve_count +
+					 mdsc->caps_avail_count);
+	spin_unlock(&mdsc->caps_list_lock);
 
 	ctx->count = need;
 	dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n",
-	     ctx, caps_total_count, caps_use_count, caps_reserve_count,
-	     caps_avail_count);
+	     ctx, mdsc->caps_total_count, mdsc->caps_use_count,
+	     mdsc->caps_reserve_count, mdsc->caps_avail_count);
 	return 0;
 
 out_alloc_count:
@@ -220,26 +206,29 @@ out_alloc_count:
 	return ret;
 }
 
-int ceph_unreserve_caps(struct ceph_cap_reservation *ctx)
+int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
+			struct ceph_cap_reservation *ctx)
 {
 	dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count);
 	if (ctx->count) {
-		spin_lock(&caps_list_lock);
-		BUG_ON(caps_reserve_count < ctx->count);
-		caps_reserve_count -= ctx->count;
-		caps_avail_count += ctx->count;
+		spin_lock(&mdsc->caps_list_lock);
+		BUG_ON(mdsc->caps_reserve_count < ctx->count);
+		mdsc->caps_reserve_count -= ctx->count;
+		mdsc->caps_avail_count += ctx->count;
 		ctx->count = 0;
 		dout("unreserve caps %d = %d used + %d resv + %d avail\n",
-		     caps_total_count, caps_use_count, caps_reserve_count,
-		     caps_avail_count);
-		BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
-		       caps_avail_count);
-		spin_unlock(&caps_list_lock);
+		     mdsc->caps_total_count, mdsc->caps_use_count,
+		     mdsc->caps_reserve_count, mdsc->caps_avail_count);
+		BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
+						 mdsc->caps_reserve_count +
+						 mdsc->caps_avail_count);
+		spin_unlock(&mdsc->caps_list_lock);
 	}
 	return 0;
 }
 
-static struct ceph_cap *get_cap(struct ceph_cap_reservation *ctx)
+static struct ceph_cap *get_cap(struct ceph_mds_client *mdsc,
+				struct ceph_cap_reservation *ctx)
 {
 	struct ceph_cap *cap = NULL;
 
@@ -247,71 +236,74 @@ static struct ceph_cap *get_cap(struct ceph_cap_reservation *ctx)
 	if (!ctx) {
 		cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
 		if (cap) {
-			caps_use_count++;
-			caps_total_count++;
+			mdsc->caps_use_count++;
+			mdsc->caps_total_count++;
 		}
 		return cap;
 	}
 
-	spin_lock(&caps_list_lock);
+	spin_lock(&mdsc->caps_list_lock);
 	dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n",
-	     ctx, ctx->count, caps_total_count, caps_use_count,
-	     caps_reserve_count, caps_avail_count);
+	     ctx, ctx->count, mdsc->caps_total_count, mdsc->caps_use_count,
+	     mdsc->caps_reserve_count, mdsc->caps_avail_count);
 	BUG_ON(!ctx->count);
-	BUG_ON(ctx->count > caps_reserve_count);
-	BUG_ON(list_empty(&caps_list));
+	BUG_ON(ctx->count > mdsc->caps_reserve_count);
+	BUG_ON(list_empty(&mdsc->caps_list));
 
 	ctx->count--;
-	caps_reserve_count--;
-	caps_use_count++;
+	mdsc->caps_reserve_count--;
+	mdsc->caps_use_count++;
 
-	cap = list_first_entry(&caps_list, struct ceph_cap, caps_item);
+	cap = list_first_entry(&mdsc->caps_list, struct ceph_cap, caps_item);
 	list_del(&cap->caps_item);
 
-	BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
-	       caps_avail_count);
-	spin_unlock(&caps_list_lock);
+	BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
+	       mdsc->caps_reserve_count + mdsc->caps_avail_count);
+	spin_unlock(&mdsc->caps_list_lock);
 	return cap;
 }
 
-void ceph_put_cap(struct ceph_cap *cap)
+void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap)
 {
-	spin_lock(&caps_list_lock);
+	spin_lock(&mdsc->caps_list_lock);
 	dout("put_cap %p %d = %d used + %d resv + %d avail\n",
-	     cap, caps_total_count, caps_use_count,
-	     caps_reserve_count, caps_avail_count);
-	caps_use_count--;
+	     cap, mdsc->caps_total_count, mdsc->caps_use_count,
+	     mdsc->caps_reserve_count, mdsc->caps_avail_count);
+	mdsc->caps_use_count--;
 	/*
 	 * Keep some preallocated caps around (ceph_min_count), to
 	 * avoid lots of free/alloc churn.
 	 */
-	if (caps_avail_count >= caps_reserve_count + caps_min_count) {
-		caps_total_count--;
+	if (mdsc->caps_avail_count >= mdsc->caps_reserve_count +
+				      mdsc->caps_min_count) {
+		mdsc->caps_total_count--;
 		kmem_cache_free(ceph_cap_cachep, cap);
 	} else {
-		caps_avail_count++;
-		list_add(&cap->caps_item, &caps_list);
+		mdsc->caps_avail_count++;
+		list_add(&cap->caps_item, &mdsc->caps_list);
 	}
 
-	BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
-	       caps_avail_count);
-	spin_unlock(&caps_list_lock);
+	BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
+	       mdsc->caps_reserve_count + mdsc->caps_avail_count);
+	spin_unlock(&mdsc->caps_list_lock);
 }
 
-void ceph_reservation_status(struct ceph_client *client,
+void ceph_reservation_status(struct ceph_fs_client *fsc,
 			     int *total, int *avail, int *used, int *reserved,
 			     int *min)
 {
+	struct ceph_mds_client *mdsc = fsc->mdsc;
+
 	if (total)
-		*total = caps_total_count;
+		*total = mdsc->caps_total_count;
 	if (avail)
-		*avail = caps_avail_count;
+		*avail = mdsc->caps_avail_count;
 	if (used)
-		*used = caps_use_count;
+		*used = mdsc->caps_use_count;
 	if (reserved)
-		*reserved = caps_reserve_count;
+		*reserved = mdsc->caps_reserve_count;
 	if (min)
-		*min = caps_min_count;
+		*min = mdsc->caps_min_count;
 }
 
 /*
@@ -336,22 +328,29 @@ static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds)
 	return NULL;
 }
 
+struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, int mds)
+{
+	struct ceph_cap *cap;
+
+	spin_lock(&ci->vfs_inode.i_lock);
+	cap = __get_cap_for_mds(ci, mds);
+	spin_unlock(&ci->vfs_inode.i_lock);
+	return cap;
+}
+
 /*
- * Return id of any MDS with a cap, preferably FILE_WR|WRBUFFER|EXCL, else
- * -1.
+ * Return id of any MDS with a cap, preferably FILE_WR|BUFFER|EXCL, else -1.
  */
-static int __ceph_get_cap_mds(struct ceph_inode_info *ci, u32 *mseq)
+static int __ceph_get_cap_mds(struct ceph_inode_info *ci)
 {
 	struct ceph_cap *cap;
 	int mds = -1;
 	struct rb_node *p;
 
-	/* prefer mds with WR|WRBUFFER|EXCL caps */
+	/* prefer mds with WR|BUFFER|EXCL caps */
 	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
 		cap = rb_entry(p, struct ceph_cap, ci_node);
 		mds = cap->mds;
-		if (mseq)
-			*mseq = cap->mseq;
 		if (cap->issued & (CEPH_CAP_FILE_WR |
 				   CEPH_CAP_FILE_BUFFER |
 				   CEPH_CAP_FILE_EXCL))
@@ -364,7 +363,7 @@ int ceph_get_cap_mds(struct inode *inode)
 {
 	int mds;
 	spin_lock(&inode->i_lock);
-	mds = __ceph_get_cap_mds(ceph_inode(inode), NULL);
+	mds = __ceph_get_cap_mds(ceph_inode(inode));
 	spin_unlock(&inode->i_lock);
 	return mds;
 }
@@ -401,7 +400,7 @@ static void __insert_cap_node(struct ceph_inode_info *ci,
 static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
 			       struct ceph_inode_info *ci)
 {
-	struct ceph_mount_args *ma = mdsc->client->mount_args;
+	struct ceph_mount_options *ma = mdsc->fsc->mount_options;
 
 	ci->i_hold_caps_min = round_jiffies(jiffies +
 					    ma->caps_wanted_delay_min * HZ);
@@ -483,8 +482,8 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
 	 * Each time we receive FILE_CACHE anew, we increment
 	 * i_rdcache_gen.
 	 */
-	if ((issued & CEPH_CAP_FILE_CACHE) &&
-	    (had & CEPH_CAP_FILE_CACHE) == 0)
+	if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
+	    (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0)
 		ci->i_rdcache_gen++;
 
 	/*
@@ -517,7 +516,7 @@ int ceph_add_cap(struct inode *inode,
 		 unsigned seq, unsigned mseq, u64 realmino, int flags,
 		 struct ceph_cap_reservation *caps_reservation)
 {
-	struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_cap *new_cap = NULL;
 	struct ceph_cap *cap;
@@ -543,7 +542,7 @@ retry:
 			new_cap = NULL;
 		} else {
 			spin_unlock(&inode->i_lock);
-			new_cap = get_cap(caps_reservation);
+			new_cap = get_cap(mdsc, caps_reservation);
 			if (new_cap == NULL)
 				return -ENOMEM;
 			goto retry;
@@ -588,6 +587,7 @@ retry:
 		} else {
 			pr_err("ceph_add_cap: couldn't find snap realm %llx\n",
 			       realmino);
+			WARN_ON(!realm);
 		}
 	}
 
@@ -815,7 +815,7 @@ int __ceph_caps_used(struct ceph_inode_info *ci)
 		used |= CEPH_CAP_PIN;
 	if (ci->i_rd_ref)
 		used |= CEPH_CAP_FILE_RD;
-	if (ci->i_rdcache_ref || ci->i_rdcache_gen)
+	if (ci->i_rdcache_ref || ci->vfs_inode.i_data.nrpages)
 		used |= CEPH_CAP_FILE_CACHE;
 	if (ci->i_wr_ref)
 		used |= CEPH_CAP_FILE_WR;
@@ -831,7 +831,7 @@ int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
 {
 	int want = 0;
 	int mode;
-	for (mode = 0; mode < 4; mode++)
+	for (mode = 0; mode < CEPH_FILE_MODE_NUM; mode++)
 		if (ci->i_nr_by_mode[mode])
 			want |= ceph_caps_for_mode(mode);
 	return want;
@@ -874,7 +874,7 @@ void __ceph_remove_cap(struct ceph_cap *cap)
 	struct ceph_mds_session *session = cap->session;
 	struct ceph_inode_info *ci = cap->ci;
 	struct ceph_mds_client *mdsc =
-		&ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
+		ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
 	int removed = 0;
 
 	dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
@@ -901,7 +901,7 @@ void __ceph_remove_cap(struct ceph_cap *cap)
 		ci->i_auth_cap = NULL;
 
 	if (removed)
-		ceph_put_cap(cap);
+		ceph_put_cap(mdsc, cap);
 
 	if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) {
 		struct ceph_snap_realm *realm = ci->i_snap_realm;
@@ -1083,6 +1083,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
 	gid_t gid;
 	struct ceph_mds_session *session;
 	u64 xattr_version = 0;
+	struct ceph_buffer *xattr_blob = NULL;
 	int delayed = 0;
 	u64 flush_tid = 0;
 	int i;
@@ -1143,6 +1144,10 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
 		for (i = 0; i < CEPH_CAP_BITS; i++)
 			if (flushing & (1 << i))
 				ci->i_cap_flush_tid[i] = flush_tid;
+
+		follows = ci->i_head_snapc->seq;
+	} else {
+		follows = 0;
 	}
 
 	keep = cap->implemented;
@@ -1156,14 +1161,14 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
 	mtime = inode->i_mtime;
 	atime = inode->i_atime;
 	time_warp_seq = ci->i_time_warp_seq;
-	follows = ci->i_snap_realm->cached_context->seq;
 	uid = inode->i_uid;
 	gid = inode->i_gid;
 	mode = inode->i_mode;
 
-	if (dropping & CEPH_CAP_XATTR_EXCL) {
+	if (flushing & CEPH_CAP_XATTR_EXCL) {
 		__ceph_build_xattrs_blob(ci);
-		xattr_version = ci->i_xattrs.version + 1;
+		xattr_blob = ci->i_xattrs.blob;
+		xattr_version = ci->i_xattrs.version;
 	}
 
 	spin_unlock(&inode->i_lock);
@@ -1171,9 +1176,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
 	ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id,
 		op, keep, want, flushing, seq, flush_tid, issue_seq, mseq,
 		size, max_size, &mtime, &atime, time_warp_seq,
-		uid, gid, mode,
-		xattr_version,
-		(flushing & CEPH_CAP_XATTR_EXCL) ? ci->i_xattrs.blob : NULL,
+		uid, gid, mode, xattr_version, xattr_blob,
 		follows);
 	if (ret < 0) {
 		dout("error sending cap msg, must requeue %p\n", inode);
@@ -1193,16 +1196,22 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
  * asynchronously back to the MDS once sync writes complete and dirty
  * data is written out.
  *
+ * Unless @again is true, skip cap_snaps that were already sent to
+ * the MDS (i.e., during this session).
+ *
  * Called under i_lock.  Takes s_mutex as needed.
  */
 void __ceph_flush_snaps(struct ceph_inode_info *ci,
-			struct ceph_mds_session **psession)
+			struct ceph_mds_session **psession,
+			int again)
+		__releases(ci->vfs_inode->i_lock)
+		__acquires(ci->vfs_inode->i_lock)
 {
 	struct inode *inode = &ci->vfs_inode;
 	int mds;
 	struct ceph_cap_snap *capsnap;
 	u32 mseq;
-	struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
 	struct ceph_mds_session *session = NULL; /* if session != NULL, we hold
 						    session->s_mutex */
 	u64 next_follows = 0;  /* keep track of how far we've gotten through the
@@ -1223,7 +1232,7 @@ retry:
 		 * pages to be written out.
 		 */
 		if (capsnap->dirty_pages || capsnap->writing)
-			continue;
+			break;
 
 		/*
 		 * if cap writeback already occurred, we should have dropped
@@ -1232,7 +1241,20 @@ retry:
 		BUG_ON(capsnap->dirty == 0);
 
 		/* pick mds, take s_mutex */
-		mds = __ceph_get_cap_mds(ci, &mseq);
+		if (ci->i_auth_cap == NULL) {
+			dout("no auth cap (migrating?), doing nothing\n");
+			goto out;
+		}
+
+		/* only flush each capsnap once */
+		if (!again && !list_empty(&capsnap->flushing_item)) {
+			dout("already flushed %p, skipping\n", capsnap);
+			continue;
+		}
+
+		mds = ci->i_auth_cap->session->s_mds;
+		mseq = ci->i_auth_cap->mseq;
+
 		if (session && session->s_mds != mds) {
 			dout("oops, wrong session %p mutex\n", session);
 			mutex_unlock(&session->s_mutex);
@@ -1251,8 +1273,8 @@ retry:
 			}
 			/*
 			 * if session == NULL, we raced against a cap
-			 * deletion.  retry, and we'll get a better
-			 * @mds value next time.
+			 * deletion or migration.  retry, and we'll
+			 * get a better @mds value next time.
 			 */
 			spin_lock(&inode->i_lock);
 			goto retry;
@@ -1266,8 +1288,8 @@ retry:
 			      &session->s_cap_snaps_flushing);
 		spin_unlock(&inode->i_lock);
 
-		dout("flush_snaps %p cap_snap %p follows %lld size %llu\n",
-		     inode, capsnap, next_follows, capsnap->size);
+		dout("flush_snaps %p cap_snap %p follows %lld tid %llu\n",
+		     inode, capsnap, capsnap->follows, capsnap->flush_tid);
 		send_cap_msg(session, ceph_vino(inode).ino, 0,
 			     CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0,
 			     capsnap->dirty, 0, capsnap->flush_tid, 0, mseq,
@@ -1275,7 +1297,7 @@ retry:
 			     &capsnap->mtime, &capsnap->atime,
 			     capsnap->time_warp_seq,
 			     capsnap->uid, capsnap->gid, capsnap->mode,
-			     0, NULL,
+			     capsnap->xattr_version, capsnap->xattr_blob,
 			     capsnap->follows);
 
 		next_follows = capsnap->follows + 1;
@@ -1290,6 +1312,7 @@ retry:
 	list_del_init(&ci->i_snap_flush_item);
 	spin_unlock(&mdsc->snap_flush_lock);
 
+out:
 	if (psession)
 		*psession = session;
 	else if (session) {
@@ -1303,7 +1326,7 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci)
 	struct inode *inode = &ci->vfs_inode;
 
 	spin_lock(&inode->i_lock);
-	__ceph_flush_snaps(ci, NULL);
+	__ceph_flush_snaps(ci, NULL, 0);
 	spin_unlock(&inode->i_lock);
 }
 
@@ -1314,7 +1337,7 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci)
 void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
 {
 	struct ceph_mds_client *mdsc =
-		&ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
+		ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
 	struct inode *inode = &ci->vfs_inode;
 	int was = ci->i_dirty_caps;
 	int dirty = 0;
@@ -1324,7 +1347,11 @@ void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
 	     ceph_cap_string(was | mask));
 	ci->i_dirty_caps |= mask;
 	if (was == 0) {
-		dout(" inode %p now dirty\n", &ci->vfs_inode);
+		if (!ci->i_head_snapc)
+			ci->i_head_snapc = ceph_get_snap_context(
+				ci->i_snap_realm->cached_context);
+		dout(" inode %p now dirty snapc %p\n", &ci->vfs_inode,
+			ci->i_head_snapc);
 		BUG_ON(!list_empty(&ci->i_dirty_item));
 		spin_lock(&mdsc->cap_dirty_lock);
 		list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
@@ -1352,7 +1379,7 @@ void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
 static int __mark_caps_flushing(struct inode *inode,
 				 struct ceph_mds_session *session)
 {
-	struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
+	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	int flushing;
 
@@ -1390,17 +1417,6 @@ static int __mark_caps_flushing(struct inode *inode,
 /*
  * try to invalidate mapping pages without blocking.
  */
-static int mapping_is_empty(struct address_space *mapping)
-{
-	struct page *page = find_get_page(mapping, 0);
-
-	if (!page)
-		return 1;
-
-	put_page(page);
-	return 0;
-}
-
 static int try_nonblocking_invalidate(struct inode *inode)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
@@ -1410,7 +1426,7 @@ static int try_nonblocking_invalidate(struct inode *inode)
 	invalidate_mapping_pages(&inode->i_data, 0, -1);
 	spin_lock(&inode->i_lock);
 
-	if (mapping_is_empty(&inode->i_data) &&
+	if (inode->i_data.nrpages == 0 &&
 	    invalidating_gen == ci->i_rdcache_gen) {
 		/* success. */
 		dout("try_nonblocking_invalidate %p success\n", inode);
@@ -1435,10 +1451,9 @@ static int try_nonblocking_invalidate(struct inode *inode)
  */
 void ceph_check_caps(struct ceph_inode_info *ci, int flags,
 		     struct ceph_mds_session *session)
-	__releases(session->s_mutex)
 {
-	struct ceph_client *client = ceph_inode_to_client(&ci->vfs_inode);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct inode *inode = &ci->vfs_inode;
 	struct ceph_cap *cap;
 	int file_wanted, used;
@@ -1463,7 +1478,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
 
 	/* flush snaps first time around only */
 	if (!list_empty(&ci->i_cap_snaps))
-		__ceph_flush_snaps(ci, &session);
+		__ceph_flush_snaps(ci, &session, 0);
 	goto retry_locked;
 retry:
 	spin_lock(&inode->i_lock);
@@ -1508,13 +1523,15 @@ retry_locked:
 	 */
 	if ((!is_delayed || mdsc->stopping) &&
 	    ci->i_wrbuffer_ref == 0 &&               /* no dirty pages... */
-	    ci->i_rdcache_gen &&                     /* may have cached pages */
+	    inode->i_data.nrpages &&                 /* have cached pages */
 	    (file_wanted == 0 ||                     /* no open files */
-	     (revoking & CEPH_CAP_FILE_CACHE)) &&     /*  or revoking cache */
+	     (revoking & (CEPH_CAP_FILE_CACHE|
+			  CEPH_CAP_FILE_LAZYIO))) && /*  or revoking cache */
 	    !tried_invalidate) {
 		dout("check_caps trying to invalidate on %p\n", inode);
 		if (try_nonblocking_invalidate(inode) < 0) {
-			if (revoking & CEPH_CAP_FILE_CACHE) {
+			if (revoking & (CEPH_CAP_FILE_CACHE|
+					CEPH_CAP_FILE_LAZYIO)) {
 				dout("check_caps queuing invalidate\n");
 				queue_invalidate = 1;
 				ci->i_rdcache_revoking = ci->i_rdcache_gen;
@@ -1679,7 +1696,7 @@ ack:
 static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session,
 			  unsigned *flush_tid)
 {
-	struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
+	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	int unlock_session = session ? 0 : 1;
 	int flushing = 0;
@@ -1845,7 +1862,7 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
 				       caps_are_flushed(inode, flush_tid));
 	} else {
 		struct ceph_mds_client *mdsc =
-			&ceph_sb_to_client(inode->i_sb)->mdsc;
+			ceph_sb_to_client(inode->i_sb)->mdsc;
 
 		spin_lock(&inode->i_lock);
 		if (__ceph_caps_dirty(ci))
@@ -1878,7 +1895,7 @@ static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,
 		if (cap && cap->session == session) {
 			dout("kick_flushing_caps %p cap %p capsnap %p\n", inode,
 			     cap, capsnap);
-			__ceph_flush_snaps(ci, &session);
+			__ceph_flush_snaps(ci, &session, 1);
 		} else {
 			pr_err("%p auth cap %p not mds%d ???\n", inode,
 			       cap, session->s_mds);
@@ -2181,7 +2198,9 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
 
 	if (ci->i_head_snapc == snapc) {
 		ci->i_wrbuffer_ref_head -= nr;
-		if (!ci->i_wrbuffer_ref_head) {
+		if (ci->i_wrbuffer_ref_head == 0 &&
+		    ci->i_dirty_caps == 0 && ci->i_flushing_caps == 0) {
+			BUG_ON(!ci->i_head_snapc);
 			ceph_put_snap_context(ci->i_head_snapc);
 			ci->i_head_snapc = NULL;
 		}
@@ -2250,12 +2269,12 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
 			     struct ceph_mds_session *session,
 			     struct ceph_cap *cap,
 			     struct ceph_buffer *xattr_buf)
-	__releases(inode->i_lock)
-	__releases(session->s_mutex)
+		__releases(inode->i_lock)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	int mds = session->s_mds;
-	int seq = le32_to_cpu(grant->seq);
+	unsigned seq = le32_to_cpu(grant->seq);
+	unsigned issue_seq = le32_to_cpu(grant->issue_seq);
 	int newcaps = le32_to_cpu(grant->caps);
 	int issued, implemented, used, wanted, dirty;
 	u64 size = le64_to_cpu(grant->size);
@@ -2267,8 +2286,8 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
 	int revoked_rdcache = 0;
 	int queue_invalidate = 0;
 
-	dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
-	     inode, cap, mds, seq, ceph_cap_string(newcaps));
+	dout("handle_cap_grant inode %p cap %p mds%d seq %u/%u %s\n",
+	     inode, cap, mds, seq, issue_seq, ceph_cap_string(newcaps));
 	dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
 		inode->i_size);
 
@@ -2278,6 +2297,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
 	 * will invalidate _after_ writeback.)
 	 */
 	if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
+	    (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
 	    !ci->i_wrbuffer_ref) {
 		if (try_nonblocking_invalidate(inode) == 0) {
 			revoked_rdcache = 1;
@@ -2363,21 +2383,29 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
 	}
 
 	cap->seq = seq;
+	cap->issue_seq = issue_seq;
 
 	/* file layout may have changed */
 	ci->i_layout = grant->layout;
 
 	/* revocation, grant, or no-op? */
 	if (cap->issued & ~newcaps) {
-		dout("revocation: %s -> %s\n", ceph_cap_string(cap->issued),
-		     ceph_cap_string(newcaps));
-		if ((used & ~newcaps) & CEPH_CAP_FILE_BUFFER)
-			writeback = 1; /* will delay ack */
-		else if (dirty & ~newcaps)
-			check_caps = 1;  /* initiate writeback in check_caps */
-		else if (((used & ~newcaps) & CEPH_CAP_FILE_CACHE) == 0 ||
-			   revoked_rdcache)
-			check_caps = 2;     /* send revoke ack in check_caps */
+		int revoking = cap->issued & ~newcaps;
+
+		dout("revocation: %s -> %s (revoking %s)\n",
+		     ceph_cap_string(cap->issued),
+		     ceph_cap_string(newcaps),
+		     ceph_cap_string(revoking));
+		if (revoking & used & CEPH_CAP_FILE_BUFFER)
+			writeback = 1;  /* initiate writeback; will delay ack */
+		else if (revoking == CEPH_CAP_FILE_CACHE &&
+			 (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
+			 queue_invalidate)
+			; /* do nothing yet, invalidation will be queued */
+		else if (cap == ci->i_auth_cap)
+			check_caps = 1; /* check auth cap only */
+		else
+			check_caps = 2; /* check all caps */
 		cap->issued = newcaps;
 		cap->implemented |= newcaps;
 	} else if (cap->issued == newcaps) {
@@ -2427,7 +2455,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
 	__releases(inode->i_lock)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
+	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
 	unsigned seq = le32_to_cpu(m->seq);
 	int dirty = le32_to_cpu(m->dirty);
 	int cleaned = 0;
@@ -2467,6 +2495,11 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
 			dout(" inode %p now clean\n", inode);
 			BUG_ON(!list_empty(&ci->i_dirty_item));
 			drop = 1;
+			if (ci->i_wrbuffer_ref_head == 0) {
+				BUG_ON(!ci->i_head_snapc);
+				ceph_put_snap_context(ci->i_head_snapc);
+				ci->i_head_snapc = NULL;
+			}
 		} else {
 			BUG_ON(list_empty(&ci->i_dirty_item));
 		}
@@ -2568,7 +2601,8 @@ static void handle_cap_trunc(struct inode *inode,
  * caller holds s_mutex
  */
 static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
-			      struct ceph_mds_session *session)
+			      struct ceph_mds_session *session,
+			      int *open_target_sessions)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	int mds = session->s_mds;
@@ -2600,6 +2634,12 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
 			ci->i_cap_exporting_mds = mds;
 			ci->i_cap_exporting_mseq = mseq;
 			ci->i_cap_exporting_issued = cap->issued;
+
+			/*
+			 * make sure we have open sessions with all possible
+			 * export targets, so that we get the matching IMPORT
+			 */
+			*open_target_sessions = 1;
 		}
 		__ceph_remove_cap(cap);
 	}
@@ -2663,7 +2703,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 		      struct ceph_msg *msg)
 {
 	struct ceph_mds_client *mdsc = session->s_mdsc;
-	struct super_block *sb = mdsc->client->sb;
+	struct super_block *sb = mdsc->fsc->sb;
 	struct inode *inode;
 	struct ceph_cap *cap;
 	struct ceph_mds_caps *h;
@@ -2675,6 +2715,10 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 	u64 size, max_size;
 	u64 tid;
 	void *snaptrace;
+	size_t snaptrace_len;
+	void *flock;
+	u32 flock_len;
+	int open_target_sessions = 0;
 
 	dout("handle_caps from mds%d\n", mds);
 
@@ -2683,7 +2727,6 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 	if (msg->front.iov_len < sizeof(*h))
 		goto bad;
 	h = msg->front.iov_base;
-	snaptrace = h + 1;
 	op = le32_to_cpu(h->op);
 	vino.ino = le64_to_cpu(h->ino);
 	vino.snap = CEPH_NOSNAP;
@@ -2693,6 +2736,21 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 	size = le64_to_cpu(h->size);
 	max_size = le64_to_cpu(h->max_size);
 
+	snaptrace = h + 1;
+	snaptrace_len = le32_to_cpu(h->snap_trace_len);
+
+	if (le16_to_cpu(msg->hdr.version) >= 2) {
+		void *p, *end;
+
+		p = snaptrace + snaptrace_len;
+		end = msg->front.iov_base + msg->front.iov_len;
+		ceph_decode_32_safe(&p, end, flock_len, bad);
+		flock = p;
+	} else {
+		flock = NULL;
+		flock_len = 0;
+	}
+
 	mutex_lock(&session->s_mutex);
 	session->s_seq++;
 	dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
@@ -2708,15 +2766,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 		if (op == CEPH_CAP_OP_IMPORT)
 			__queue_cap_release(session, vino.ino, cap_id,
 					    mseq, seq);
-
-		/*
-		 * send any full release message to try to move things
-		 * along for the mds (who clearly thinks we still have this
-		 * cap).
-		 */
-		ceph_add_cap_releases(mdsc, session, -1);
-		ceph_send_cap_releases(mdsc, session);
-		goto done;
+		goto flush_cap_releases;
 	}
 
 	/* these will work even if we don't have a cap yet */
@@ -2726,12 +2776,12 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 		goto done;
 
 	case CEPH_CAP_OP_EXPORT:
-		handle_cap_export(inode, h, session);
+		handle_cap_export(inode, h, session, &open_target_sessions);
 		goto done;
 
 	case CEPH_CAP_OP_IMPORT:
 		handle_cap_import(mdsc, inode, h, session,
-				  snaptrace, le32_to_cpu(h->snap_trace_len));
+				  snaptrace, snaptrace_len);
 		ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY,
 				session);
 		goto done_unlocked;
@@ -2744,7 +2794,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 		dout(" no cap on %p ino %llx.%llx from mds%d\n",
 		     inode, ceph_ino(inode), ceph_snap(inode), mds);
 		spin_unlock(&inode->i_lock);
-		goto done;
+		goto flush_cap_releases;
 	}
 
 	/* note that each of these drops i_lock for us */
@@ -2768,11 +2818,24 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 		       ceph_cap_op_name(op));
 	}
 
+	goto done;
+
+flush_cap_releases:
+	/*
+	 * send any full release message to try to move things
+	 * along for the mds (who clearly thinks we still have this
+	 * cap).
+	 */
+	ceph_add_cap_releases(mdsc, session);
+	ceph_send_cap_releases(mdsc, session);
+
 done:
 	mutex_unlock(&session->s_mutex);
 done_unlocked:
 	if (inode)
 		iput(inode);
+	if (open_target_sessions)
+		ceph_mdsc_open_export_target_sessions(mdsc, session);
 	return;
 
 bad:
diff --git a/fs/ceph/ceph_debug.h b/fs/ceph/ceph_debug.h
deleted file mode 100644
index 1818c230561..00000000000
--- a/fs/ceph/ceph_debug.h
+++ /dev/null
@@ -1,37 +0,0 @@
-#ifndef _FS_CEPH_DEBUG_H
-#define _FS_CEPH_DEBUG_H
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#ifdef CONFIG_CEPH_FS_PRETTYDEBUG
-
-/*
- * wrap pr_debug to include a filename:lineno prefix on each line.
- * this incurs some overhead (kernel size and execution time) due to
- * the extra function call at each call site.
- */
-
-# if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG)
-extern const char *ceph_file_part(const char *s, int len);
-#  define dout(fmt, ...)						\
-	pr_debug(" %12.12s:%-4d : " fmt,				\
-		 ceph_file_part(__FILE__, sizeof(__FILE__)),		\
-		 __LINE__, ##__VA_ARGS__)
-# else
-/* faux printk call just to see any compiler warnings. */
-#  define dout(fmt, ...)	do {				\
-		if (0)						\
-			printk(KERN_DEBUG fmt, ##__VA_ARGS__);	\
-	} while (0)
-# endif
-
-#else
-
-/*
- * or, just wrap pr_debug
- */
-# define dout(fmt, ...)	pr_debug(" " fmt, ##__VA_ARGS__)
-
-#endif
-
-#endif
diff --git a/fs/ceph/ceph_frag.c b/fs/ceph/ceph_frag.c
index ab6cf35c409..bdce8b1fbd0 100644
--- a/fs/ceph/ceph_frag.c
+++ b/fs/ceph/ceph_frag.c
@@ -1,7 +1,8 @@
 /*
  * Ceph 'frag' type
  */
-#include "types.h"
+#include <linux/module.h>
+#include <linux/ceph/types.h>
 
 int ceph_frag_compare(__u32 a, __u32 b)
 {
diff --git a/fs/ceph/ceph_frag.h b/fs/ceph/ceph_frag.h
deleted file mode 100644
index 793f50cb7c2..00000000000
--- a/fs/ceph/ceph_frag.h
+++ /dev/null
@@ -1,109 +0,0 @@
-#ifndef _FS_CEPH_FRAG_H
-#define _FS_CEPH_FRAG_H
-
-/*
- * "Frags" are a way to describe a subset of a 32-bit number space,
- * using a mask and a value to match against that mask.  Any given frag
- * (subset of the number space) can be partitioned into 2^n sub-frags.
- *
- * Frags are encoded into a 32-bit word:
- *   8 upper bits = "bits"
- *  24 lower bits = "value"
- * (We could go to 5+27 bits, but who cares.)
- *
- * We use the _most_ significant bits of the 24 bit value.  This makes
- * values logically sort.
- *
- * Unfortunately, because the "bits" field is still in the high bits, we
- * can't sort encoded frags numerically.  However, it does allow you
- * to feed encoded frags as values into frag_contains_value.
- */
-static inline __u32 ceph_frag_make(__u32 b, __u32 v)
-{
-	return (b << 24) |
-		(v & (0xffffffu << (24-b)) & 0xffffffu);
-}
-static inline __u32 ceph_frag_bits(__u32 f)
-{
-	return f >> 24;
-}
-static inline __u32 ceph_frag_value(__u32 f)
-{
-	return f & 0xffffffu;
-}
-static inline __u32 ceph_frag_mask(__u32 f)
-{
-	return (0xffffffu << (24-ceph_frag_bits(f))) & 0xffffffu;
-}
-static inline __u32 ceph_frag_mask_shift(__u32 f)
-{
-	return 24 - ceph_frag_bits(f);
-}
-
-static inline int ceph_frag_contains_value(__u32 f, __u32 v)
-{
-	return (v & ceph_frag_mask(f)) == ceph_frag_value(f);
-}
-static inline int ceph_frag_contains_frag(__u32 f, __u32 sub)
-{
-	/* is sub as specific as us, and contained by us? */
-	return ceph_frag_bits(sub) >= ceph_frag_bits(f) &&
-	       (ceph_frag_value(sub) & ceph_frag_mask(f)) == ceph_frag_value(f);
-}
-
-static inline __u32 ceph_frag_parent(__u32 f)
-{
-	return ceph_frag_make(ceph_frag_bits(f) - 1,
-			 ceph_frag_value(f) & (ceph_frag_mask(f) << 1));
-}
-static inline int ceph_frag_is_left_child(__u32 f)
-{
-	return ceph_frag_bits(f) > 0 &&
-		(ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 0;
-}
-static inline int ceph_frag_is_right_child(__u32 f)
-{
-	return ceph_frag_bits(f) > 0 &&
-		(ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 1;
-}
-static inline __u32 ceph_frag_sibling(__u32 f)
-{
-	return ceph_frag_make(ceph_frag_bits(f),
-		      ceph_frag_value(f) ^ (0x1000000 >> ceph_frag_bits(f)));
-}
-static inline __u32 ceph_frag_left_child(__u32 f)
-{
-	return ceph_frag_make(ceph_frag_bits(f)+1, ceph_frag_value(f));
-}
-static inline __u32 ceph_frag_right_child(__u32 f)
-{
-	return ceph_frag_make(ceph_frag_bits(f)+1,
-	      ceph_frag_value(f) | (0x1000000 >> (1+ceph_frag_bits(f))));
-}
-static inline __u32 ceph_frag_make_child(__u32 f, int by, int i)
-{
-	int newbits = ceph_frag_bits(f) + by;
-	return ceph_frag_make(newbits,
-			 ceph_frag_value(f) | (i << (24 - newbits)));
-}
-static inline int ceph_frag_is_leftmost(__u32 f)
-{
-	return ceph_frag_value(f) == 0;
-}
-static inline int ceph_frag_is_rightmost(__u32 f)
-{
-	return ceph_frag_value(f) == ceph_frag_mask(f);
-}
-static inline __u32 ceph_frag_next(__u32 f)
-{
-	return ceph_frag_make(ceph_frag_bits(f),
-			 ceph_frag_value(f) + (0x1000000 >> ceph_frag_bits(f)));
-}
-
-/*
- * comparator to sort frags logically, as when traversing the
- * number space in ascending order...
- */
-int ceph_frag_compare(__u32 a, __u32 b);
-
-#endif
diff --git a/fs/ceph/ceph_fs.c b/fs/ceph/ceph_fs.c
deleted file mode 100644
index 79d76bc4303..00000000000
--- a/fs/ceph/ceph_fs.c
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Some non-inline ceph helpers
- */
-#include "types.h"
-
-/*
- * return true if @layout appears to be valid
- */
-int ceph_file_layout_is_valid(const struct ceph_file_layout *layout)
-{
-	__u32 su = le32_to_cpu(layout->fl_stripe_unit);
-	__u32 sc = le32_to_cpu(layout->fl_stripe_count);
-	__u32 os = le32_to_cpu(layout->fl_object_size);
-
-	/* stripe unit, object size must be non-zero, 64k increment */
-	if (!su || (su & (CEPH_MIN_STRIPE_UNIT-1)))
-		return 0;
-	if (!os || (os & (CEPH_MIN_STRIPE_UNIT-1)))
-		return 0;
-	/* object size must be a multiple of stripe unit */
-	if (os < su || os % su)
-		return 0;
-	/* stripe count must be non-zero */
-	if (!sc)
-		return 0;
-	return 1;
-}
-
-
-int ceph_flags_to_mode(int flags)
-{
-#ifdef O_DIRECTORY  /* fixme */
-	if ((flags & O_DIRECTORY) == O_DIRECTORY)
-		return CEPH_FILE_MODE_PIN;
-#endif
-#ifdef O_LAZY
-	if (flags & O_LAZY)
-		return CEPH_FILE_MODE_LAZY;
-#endif
-	if ((flags & O_APPEND) == O_APPEND)
-		flags |= O_WRONLY;
-
-	flags &= O_ACCMODE;
-	if ((flags & O_RDWR) == O_RDWR)
-		return CEPH_FILE_MODE_RDWR;
-	if ((flags & O_WRONLY) == O_WRONLY)
-		return CEPH_FILE_MODE_WR;
-	return CEPH_FILE_MODE_RD;
-}
-
-int ceph_caps_for_mode(int mode)
-{
-	switch (mode) {
-	case CEPH_FILE_MODE_PIN:
-		return CEPH_CAP_PIN;
-	case CEPH_FILE_MODE_RD:
-		return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
-			CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE;
-	case CEPH_FILE_MODE_RDWR:
-		return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
-			CEPH_CAP_FILE_EXCL |
-			CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE |
-			CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
-			CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
-			CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
-	case CEPH_FILE_MODE_WR:
-		return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
-			CEPH_CAP_FILE_EXCL |
-			CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
-			CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
-			CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
-	}
-	return 0;
-}
diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
deleted file mode 100644
index 2fa992eaf7d..00000000000
--- a/fs/ceph/ceph_fs.h
+++ /dev/null
@@ -1,705 +0,0 @@
-/*
- * ceph_fs.h - Ceph constants and data types to share between kernel and
- * user space.
- *
- * Most types in this file are defined as little-endian, and are
- * primarily intended to describe data structures that pass over the
- * wire or that are stored on disk.
- *
- * LGPL2
- */
-
-#ifndef _FS_CEPH_CEPH_FS_H
-#define _FS_CEPH_CEPH_FS_H
-
-#include "msgr.h"
-#include "rados.h"
-
-/*
- * Ceph release version
- */
-#define CEPH_VERSION_MAJOR 0
-#define CEPH_VERSION_MINOR 20
-#define CEPH_VERSION_PATCH 0
-
-#define _CEPH_STRINGIFY(x) #x
-#define CEPH_STRINGIFY(x) _CEPH_STRINGIFY(x)
-#define CEPH_MAKE_VERSION(x, y, z) CEPH_STRINGIFY(x) "." CEPH_STRINGIFY(y) \
-	"." CEPH_STRINGIFY(z)
-#define CEPH_VERSION CEPH_MAKE_VERSION(CEPH_VERSION_MAJOR, \
-				       CEPH_VERSION_MINOR, CEPH_VERSION_PATCH)
-
-/*
- * subprotocol versions.  when specific messages types or high-level
- * protocols change, bump the affected components.  we keep rev
- * internal cluster protocols separately from the public,
- * client-facing protocol.
- */
-#define CEPH_OSD_PROTOCOL     8 /* cluster internal */
-#define CEPH_MDS_PROTOCOL    12 /* cluster internal */
-#define CEPH_MON_PROTOCOL     5 /* cluster internal */
-#define CEPH_OSDC_PROTOCOL   24 /* server/client */
-#define CEPH_MDSC_PROTOCOL   32 /* server/client */
-#define CEPH_MONC_PROTOCOL   15 /* server/client */
-
-
-#define CEPH_INO_ROOT  1
-#define CEPH_INO_CEPH  2        /* hidden .ceph dir */
-
-/* arbitrary limit on max # of monitors (cluster of 3 is typical) */
-#define CEPH_MAX_MON   31
-
-
-/*
- * feature bits
- */
-#define CEPH_FEATURE_UID        1
-#define CEPH_FEATURE_NOSRCADDR  2
-#define CEPH_FEATURE_FLOCK      4
-
-#define CEPH_FEATURE_SUPPORTED_MON  CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR
-#define CEPH_FEATURE_REQUIRED_MON   CEPH_FEATURE_UID
-#define CEPH_FEATURE_SUPPORTED_MDS  CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR|CEPH_FEATURE_FLOCK
-#define CEPH_FEATURE_REQUIRED_MDS   CEPH_FEATURE_UID
-#define CEPH_FEATURE_SUPPORTED_OSD  CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR
-#define CEPH_FEATURE_REQUIRED_OSD   CEPH_FEATURE_UID
-#define CEPH_FEATURE_SUPPORTED_CLIENT CEPH_FEATURE_NOSRCADDR
-#define CEPH_FEATURE_REQUIRED_CLIENT CEPH_FEATURE_NOSRCADDR
-
-
-/*
- * ceph_file_layout - describe data layout for a file/inode
- */
-struct ceph_file_layout {
-	/* file -> object mapping */
-	__le32 fl_stripe_unit;     /* stripe unit, in bytes.  must be multiple
-				      of page size. */
-	__le32 fl_stripe_count;    /* over this many objects */
-	__le32 fl_object_size;     /* until objects are this big, then move to
-				      new objects */
-	__le32 fl_cas_hash;        /* 0 = none; 1 = sha256 */
-
-	/* pg -> disk layout */
-	__le32 fl_object_stripe_unit;  /* for per-object parity, if any */
-
-	/* object -> pg layout */
-	__le32 fl_pg_preferred; /* preferred primary for pg (-1 for none) */
-	__le32 fl_pg_pool;      /* namespace, crush ruleset, rep level */
-} __attribute__ ((packed));
-
-#define CEPH_MIN_STRIPE_UNIT 65536
-
-int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
-
-
-/* crypto algorithms */
-#define CEPH_CRYPTO_NONE 0x0
-#define CEPH_CRYPTO_AES  0x1
-
-/* security/authentication protocols */
-#define CEPH_AUTH_UNKNOWN	0x0
-#define CEPH_AUTH_NONE	 	0x1
-#define CEPH_AUTH_CEPHX	 	0x2
-
-#define CEPH_AUTH_UID_DEFAULT ((__u64) -1)
-
-
-/*********************************************
- * message layer
- */
-
-/*
- * message types
- */
-
-/* misc */
-#define CEPH_MSG_SHUTDOWN               1
-#define CEPH_MSG_PING                   2
-
-/* client <-> monitor */
-#define CEPH_MSG_MON_MAP                4
-#define CEPH_MSG_MON_GET_MAP            5
-#define CEPH_MSG_STATFS                 13
-#define CEPH_MSG_STATFS_REPLY           14
-#define CEPH_MSG_MON_SUBSCRIBE          15
-#define CEPH_MSG_MON_SUBSCRIBE_ACK      16
-#define CEPH_MSG_AUTH			17
-#define CEPH_MSG_AUTH_REPLY		18
-
-/* client <-> mds */
-#define CEPH_MSG_MDS_MAP                21
-
-#define CEPH_MSG_CLIENT_SESSION         22
-#define CEPH_MSG_CLIENT_RECONNECT       23
-
-#define CEPH_MSG_CLIENT_REQUEST         24
-#define CEPH_MSG_CLIENT_REQUEST_FORWARD 25
-#define CEPH_MSG_CLIENT_REPLY           26
-#define CEPH_MSG_CLIENT_CAPS            0x310
-#define CEPH_MSG_CLIENT_LEASE           0x311
-#define CEPH_MSG_CLIENT_SNAP            0x312
-#define CEPH_MSG_CLIENT_CAPRELEASE      0x313
-
-/* pool ops */
-#define CEPH_MSG_POOLOP_REPLY           48
-#define CEPH_MSG_POOLOP                 49
-
-
-/* osd */
-#define CEPH_MSG_OSD_MAP          41
-#define CEPH_MSG_OSD_OP           42
-#define CEPH_MSG_OSD_OPREPLY      43
-
-/* pool operations */
-enum {
-  POOL_OP_CREATE			= 0x01,
-  POOL_OP_DELETE			= 0x02,
-  POOL_OP_AUID_CHANGE			= 0x03,
-  POOL_OP_CREATE_SNAP			= 0x11,
-  POOL_OP_DELETE_SNAP			= 0x12,
-  POOL_OP_CREATE_UNMANAGED_SNAP		= 0x21,
-  POOL_OP_DELETE_UNMANAGED_SNAP		= 0x22,
-};
-
-struct ceph_mon_request_header {
-	__le64 have_version;
-	__le16 session_mon;
-	__le64 session_mon_tid;
-} __attribute__ ((packed));
-
-struct ceph_mon_statfs {
-	struct ceph_mon_request_header monhdr;
-	struct ceph_fsid fsid;
-} __attribute__ ((packed));
-
-struct ceph_statfs {
-	__le64 kb, kb_used, kb_avail;
-	__le64 num_objects;
-} __attribute__ ((packed));
-
-struct ceph_mon_statfs_reply {
-	struct ceph_fsid fsid;
-	__le64 version;
-	struct ceph_statfs st;
-} __attribute__ ((packed));
-
-const char *ceph_pool_op_name(int op);
-
-struct ceph_mon_poolop {
-	struct ceph_mon_request_header monhdr;
-	struct ceph_fsid fsid;
-	__le32 pool;
-	__le32 op;
-	__le64 auid;
-	__le64 snapid;
-	__le32 name_len;
-} __attribute__ ((packed));
-
-struct ceph_mon_poolop_reply {
-	struct ceph_mon_request_header monhdr;
-	struct ceph_fsid fsid;
-	__le32 reply_code;
-	__le32 epoch;
-	char has_data;
-	char data[0];
-} __attribute__ ((packed));
-
-struct ceph_mon_unmanaged_snap {
-	__le64 snapid;
-} __attribute__ ((packed));
-
-struct ceph_osd_getmap {
-	struct ceph_mon_request_header monhdr;
-	struct ceph_fsid fsid;
-	__le32 start;
-} __attribute__ ((packed));
-
-struct ceph_mds_getmap {
-	struct ceph_mon_request_header monhdr;
-	struct ceph_fsid fsid;
-} __attribute__ ((packed));
-
-struct ceph_client_mount {
-	struct ceph_mon_request_header monhdr;
-} __attribute__ ((packed));
-
-struct ceph_mon_subscribe_item {
-	__le64 have_version;	__le64 have;
-	__u8 onetime;
-} __attribute__ ((packed));
-
-struct ceph_mon_subscribe_ack {
-	__le32 duration;         /* seconds */
-	struct ceph_fsid fsid;
-} __attribute__ ((packed));
-
-/*
- * mds states
- *   > 0 -> in
- *  <= 0 -> out
- */
-#define CEPH_MDS_STATE_DNE          0  /* down, does not exist. */
-#define CEPH_MDS_STATE_STOPPED     -1  /* down, once existed, but no subtrees.
-					  empty log. */
-#define CEPH_MDS_STATE_BOOT        -4  /* up, boot announcement. */
-#define CEPH_MDS_STATE_STANDBY     -5  /* up, idle.  waiting for assignment. */
-#define CEPH_MDS_STATE_CREATING    -6  /* up, creating MDS instance. */
-#define CEPH_MDS_STATE_STARTING    -7  /* up, starting previously stopped mds */
-#define CEPH_MDS_STATE_STANDBY_REPLAY -8 /* up, tailing active node's journal */
-
-#define CEPH_MDS_STATE_REPLAY       8  /* up, replaying journal. */
-#define CEPH_MDS_STATE_RESOLVE      9  /* up, disambiguating distributed
-					  operations (import, rename, etc.) */
-#define CEPH_MDS_STATE_RECONNECT    10 /* up, reconnect to clients */
-#define CEPH_MDS_STATE_REJOIN       11 /* up, rejoining distributed cache */
-#define CEPH_MDS_STATE_CLIENTREPLAY 12 /* up, replaying client operations */
-#define CEPH_MDS_STATE_ACTIVE       13 /* up, active */
-#define CEPH_MDS_STATE_STOPPING     14 /* up, but exporting metadata */
-
-extern const char *ceph_mds_state_name(int s);
-
-
-/*
- * metadata lock types.
- *  - these are bitmasks.. we can compose them
- *  - they also define the lock ordering by the MDS
- *  - a few of these are internal to the mds
- */
-#define CEPH_LOCK_DVERSION    1
-#define CEPH_LOCK_DN          2
-#define CEPH_LOCK_ISNAP       16
-#define CEPH_LOCK_IVERSION    32    /* mds internal */
-#define CEPH_LOCK_IFILE       64
-#define CEPH_LOCK_IAUTH       128
-#define CEPH_LOCK_ILINK       256
-#define CEPH_LOCK_IDFT        512   /* dir frag tree */
-#define CEPH_LOCK_INEST       1024  /* mds internal */
-#define CEPH_LOCK_IXATTR      2048
-#define CEPH_LOCK_INO         8192  /* immutable inode bits; not a lock */
-
-/* client_session ops */
-enum {
-	CEPH_SESSION_REQUEST_OPEN,
-	CEPH_SESSION_OPEN,
-	CEPH_SESSION_REQUEST_CLOSE,
-	CEPH_SESSION_CLOSE,
-	CEPH_SESSION_REQUEST_RENEWCAPS,
-	CEPH_SESSION_RENEWCAPS,
-	CEPH_SESSION_STALE,
-	CEPH_SESSION_RECALL_STATE,
-};
-
-extern const char *ceph_session_op_name(int op);
-
-struct ceph_mds_session_head {
-	__le32 op;
-	__le64 seq;
-	struct ceph_timespec stamp;
-	__le32 max_caps, max_leases;
-} __attribute__ ((packed));
-
-/* client_request */
-/*
- * metadata ops.
- *  & 0x001000 -> write op
- *  & 0x010000 -> follow symlink (e.g. stat(), not lstat()).
- &  & 0x100000 -> use weird ino/path trace
- */
-#define CEPH_MDS_OP_WRITE        0x001000
-enum {
-	CEPH_MDS_OP_LOOKUP     = 0x00100,
-	CEPH_MDS_OP_GETATTR    = 0x00101,
-	CEPH_MDS_OP_LOOKUPHASH = 0x00102,
-	CEPH_MDS_OP_LOOKUPPARENT = 0x00103,
-
-	CEPH_MDS_OP_SETXATTR   = 0x01105,
-	CEPH_MDS_OP_RMXATTR    = 0x01106,
-	CEPH_MDS_OP_SETLAYOUT  = 0x01107,
-	CEPH_MDS_OP_SETATTR    = 0x01108,
-
-	CEPH_MDS_OP_MKNOD      = 0x01201,
-	CEPH_MDS_OP_LINK       = 0x01202,
-	CEPH_MDS_OP_UNLINK     = 0x01203,
-	CEPH_MDS_OP_RENAME     = 0x01204,
-	CEPH_MDS_OP_MKDIR      = 0x01220,
-	CEPH_MDS_OP_RMDIR      = 0x01221,
-	CEPH_MDS_OP_SYMLINK    = 0x01222,
-
-	CEPH_MDS_OP_CREATE     = 0x01301,
-	CEPH_MDS_OP_OPEN       = 0x00302,
-	CEPH_MDS_OP_READDIR    = 0x00305,
-
-	CEPH_MDS_OP_LOOKUPSNAP = 0x00400,
-	CEPH_MDS_OP_MKSNAP     = 0x01400,
-	CEPH_MDS_OP_RMSNAP     = 0x01401,
-	CEPH_MDS_OP_LSSNAP     = 0x00402,
-};
-
-extern const char *ceph_mds_op_name(int op);
-
-
-#define CEPH_SETATTR_MODE   1
-#define CEPH_SETATTR_UID    2
-#define CEPH_SETATTR_GID    4
-#define CEPH_SETATTR_MTIME  8
-#define CEPH_SETATTR_ATIME 16
-#define CEPH_SETATTR_SIZE  32
-#define CEPH_SETATTR_CTIME 64
-
-union ceph_mds_request_args {
-	struct {
-		__le32 mask;                 /* CEPH_CAP_* */
-	} __attribute__ ((packed)) getattr;
-	struct {
-		__le32 mode;
-		__le32 uid;
-		__le32 gid;
-		struct ceph_timespec mtime;
-		struct ceph_timespec atime;
-		__le64 size, old_size;       /* old_size needed by truncate */
-		__le32 mask;                 /* CEPH_SETATTR_* */
-	} __attribute__ ((packed)) setattr;
-	struct {
-		__le32 frag;                 /* which dir fragment */
-		__le32 max_entries;          /* how many dentries to grab */
-		__le32 max_bytes;
-	} __attribute__ ((packed)) readdir;
-	struct {
-		__le32 mode;
-		__le32 rdev;
-	} __attribute__ ((packed)) mknod;
-	struct {
-		__le32 mode;
-	} __attribute__ ((packed)) mkdir;
-	struct {
-		__le32 flags;
-		__le32 mode;
-		__le32 stripe_unit;          /* layout for newly created file */
-		__le32 stripe_count;         /* ... */
-		__le32 object_size;
-		__le32 file_replication;
-		__le32 preferred;
-	} __attribute__ ((packed)) open;
-	struct {
-		__le32 flags;
-	} __attribute__ ((packed)) setxattr;
-	struct {
-		struct ceph_file_layout layout;
-	} __attribute__ ((packed)) setlayout;
-} __attribute__ ((packed));
-
-#define CEPH_MDS_FLAG_REPLAY        1  /* this is a replayed op */
-#define CEPH_MDS_FLAG_WANT_DENTRY   2  /* want dentry in reply */
-
-struct ceph_mds_request_head {
-	__le64 oldest_client_tid;
-	__le32 mdsmap_epoch;           /* on client */
-	__le32 flags;                  /* CEPH_MDS_FLAG_* */
-	__u8 num_retry, num_fwd;       /* count retry, fwd attempts */
-	__le16 num_releases;           /* # include cap/lease release records */
-	__le32 op;                     /* mds op code */
-	__le32 caller_uid, caller_gid;
-	__le64 ino;                    /* use this ino for openc, mkdir, mknod,
-					  etc. (if replaying) */
-	union ceph_mds_request_args args;
-} __attribute__ ((packed));
-
-/* cap/lease release record */
-struct ceph_mds_request_release {
-	__le64 ino, cap_id;            /* ino and unique cap id */
-	__le32 caps, wanted;           /* new issued, wanted */
-	__le32 seq, issue_seq, mseq;
-	__le32 dname_seq;              /* if releasing a dentry lease, a */
-	__le32 dname_len;              /* string follows. */
-} __attribute__ ((packed));
-
-/* client reply */
-struct ceph_mds_reply_head {
-	__le32 op;
-	__le32 result;
-	__le32 mdsmap_epoch;
-	__u8 safe;                     /* true if committed to disk */
-	__u8 is_dentry, is_target;     /* true if dentry, target inode records
-					  are included with reply */
-} __attribute__ ((packed));
-
-/* one for each node split */
-struct ceph_frag_tree_split {
-	__le32 frag;                   /* this frag splits... */
-	__le32 by;                     /* ...by this many bits */
-} __attribute__ ((packed));
-
-struct ceph_frag_tree_head {
-	__le32 nsplits;                /* num ceph_frag_tree_split records */
-	struct ceph_frag_tree_split splits[];
-} __attribute__ ((packed));
-
-/* capability issue, for bundling with mds reply */
-struct ceph_mds_reply_cap {
-	__le32 caps, wanted;           /* caps issued, wanted */
-	__le64 cap_id;
-	__le32 seq, mseq;
-	__le64 realm;                  /* snap realm */
-	__u8 flags;                    /* CEPH_CAP_FLAG_* */
-} __attribute__ ((packed));
-
-#define CEPH_CAP_FLAG_AUTH  1          /* cap is issued by auth mds */
-
-/* inode record, for bundling with mds reply */
-struct ceph_mds_reply_inode {
-	__le64 ino;
-	__le64 snapid;
-	__le32 rdev;
-	__le64 version;                /* inode version */
-	__le64 xattr_version;          /* version for xattr blob */
-	struct ceph_mds_reply_cap cap; /* caps issued for this inode */
-	struct ceph_file_layout layout;
-	struct ceph_timespec ctime, mtime, atime;
-	__le32 time_warp_seq;
-	__le64 size, max_size, truncate_size;
-	__le32 truncate_seq;
-	__le32 mode, uid, gid;
-	__le32 nlink;
-	__le64 files, subdirs, rbytes, rfiles, rsubdirs;  /* dir stats */
-	struct ceph_timespec rctime;
-	struct ceph_frag_tree_head fragtree;  /* (must be at end of struct) */
-} __attribute__ ((packed));
-/* followed by frag array, then symlink string, then xattr blob */
-
-/* reply_lease follows dname, and reply_inode */
-struct ceph_mds_reply_lease {
-	__le16 mask;            /* lease type(s) */
-	__le32 duration_ms;     /* lease duration */
-	__le32 seq;
-} __attribute__ ((packed));
-
-struct ceph_mds_reply_dirfrag {
-	__le32 frag;            /* fragment */
-	__le32 auth;            /* auth mds, if this is a delegation point */
-	__le32 ndist;           /* number of mds' this is replicated on */
-	__le32 dist[];
-} __attribute__ ((packed));
-
-/* file access modes */
-#define CEPH_FILE_MODE_PIN        0
-#define CEPH_FILE_MODE_RD         1
-#define CEPH_FILE_MODE_WR         2
-#define CEPH_FILE_MODE_RDWR       3  /* RD | WR */
-#define CEPH_FILE_MODE_LAZY       4  /* lazy io */
-#define CEPH_FILE_MODE_NUM        8  /* bc these are bit fields.. mostly */
-
-int ceph_flags_to_mode(int flags);
-
-
-/* capability bits */
-#define CEPH_CAP_PIN         1  /* no specific capabilities beyond the pin */
-
-/* generic cap bits */
-#define CEPH_CAP_GSHARED     1  /* client can reads */
-#define CEPH_CAP_GEXCL       2  /* client can read and update */
-#define CEPH_CAP_GCACHE      4  /* (file) client can cache reads */
-#define CEPH_CAP_GRD         8  /* (file) client can read */
-#define CEPH_CAP_GWR        16  /* (file) client can write */
-#define CEPH_CAP_GBUFFER    32  /* (file) client can buffer writes */
-#define CEPH_CAP_GWREXTEND  64  /* (file) client can extend EOF */
-#define CEPH_CAP_GLAZYIO   128  /* (file) client can perform lazy io */
-
-/* per-lock shift */
-#define CEPH_CAP_SAUTH      2
-#define CEPH_CAP_SLINK      4
-#define CEPH_CAP_SXATTR     6
-#define CEPH_CAP_SFILE      8   /* goes at the end (uses >2 cap bits) */
-
-#define CEPH_CAP_BITS       16
-
-/* composed values */
-#define CEPH_CAP_AUTH_SHARED  (CEPH_CAP_GSHARED  << CEPH_CAP_SAUTH)
-#define CEPH_CAP_AUTH_EXCL     (CEPH_CAP_GEXCL     << CEPH_CAP_SAUTH)
-#define CEPH_CAP_LINK_SHARED  (CEPH_CAP_GSHARED  << CEPH_CAP_SLINK)
-#define CEPH_CAP_LINK_EXCL     (CEPH_CAP_GEXCL     << CEPH_CAP_SLINK)
-#define CEPH_CAP_XATTR_SHARED (CEPH_CAP_GSHARED  << CEPH_CAP_SXATTR)
-#define CEPH_CAP_XATTR_EXCL    (CEPH_CAP_GEXCL     << CEPH_CAP_SXATTR)
-#define CEPH_CAP_FILE(x)    (x << CEPH_CAP_SFILE)
-#define CEPH_CAP_FILE_SHARED   (CEPH_CAP_GSHARED   << CEPH_CAP_SFILE)
-#define CEPH_CAP_FILE_EXCL     (CEPH_CAP_GEXCL     << CEPH_CAP_SFILE)
-#define CEPH_CAP_FILE_CACHE    (CEPH_CAP_GCACHE    << CEPH_CAP_SFILE)
-#define CEPH_CAP_FILE_RD       (CEPH_CAP_GRD       << CEPH_CAP_SFILE)
-#define CEPH_CAP_FILE_WR       (CEPH_CAP_GWR       << CEPH_CAP_SFILE)
-#define CEPH_CAP_FILE_BUFFER   (CEPH_CAP_GBUFFER   << CEPH_CAP_SFILE)
-#define CEPH_CAP_FILE_WREXTEND (CEPH_CAP_GWREXTEND << CEPH_CAP_SFILE)
-#define CEPH_CAP_FILE_LAZYIO   (CEPH_CAP_GLAZYIO   << CEPH_CAP_SFILE)
-
-/* cap masks (for getattr) */
-#define CEPH_STAT_CAP_INODE    CEPH_CAP_PIN
-#define CEPH_STAT_CAP_TYPE     CEPH_CAP_PIN  /* mode >> 12 */
-#define CEPH_STAT_CAP_SYMLINK  CEPH_CAP_PIN
-#define CEPH_STAT_CAP_UID      CEPH_CAP_AUTH_SHARED
-#define CEPH_STAT_CAP_GID      CEPH_CAP_AUTH_SHARED
-#define CEPH_STAT_CAP_MODE     CEPH_CAP_AUTH_SHARED
-#define CEPH_STAT_CAP_NLINK    CEPH_CAP_LINK_SHARED
-#define CEPH_STAT_CAP_LAYOUT   CEPH_CAP_FILE_SHARED
-#define CEPH_STAT_CAP_MTIME    CEPH_CAP_FILE_SHARED
-#define CEPH_STAT_CAP_SIZE     CEPH_CAP_FILE_SHARED
-#define CEPH_STAT_CAP_ATIME    CEPH_CAP_FILE_SHARED  /* fixme */
-#define CEPH_STAT_CAP_XATTR    CEPH_CAP_XATTR_SHARED
-#define CEPH_STAT_CAP_INODE_ALL (CEPH_CAP_PIN |			\
-				 CEPH_CAP_AUTH_SHARED |	\
-				 CEPH_CAP_LINK_SHARED |	\
-				 CEPH_CAP_FILE_SHARED |	\
-				 CEPH_CAP_XATTR_SHARED)
-
-#define CEPH_CAP_ANY_SHARED (CEPH_CAP_AUTH_SHARED |			\
-			      CEPH_CAP_LINK_SHARED |			\
-			      CEPH_CAP_XATTR_SHARED |			\
-			      CEPH_CAP_FILE_SHARED)
-#define CEPH_CAP_ANY_RD   (CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_RD |	\
-			   CEPH_CAP_FILE_CACHE)
-
-#define CEPH_CAP_ANY_EXCL (CEPH_CAP_AUTH_EXCL |		\
-			   CEPH_CAP_LINK_EXCL |		\
-			   CEPH_CAP_XATTR_EXCL |	\
-			   CEPH_CAP_FILE_EXCL)
-#define CEPH_CAP_ANY_FILE_WR (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |	\
-			      CEPH_CAP_FILE_EXCL)
-#define CEPH_CAP_ANY_WR   (CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_FILE_WR)
-#define CEPH_CAP_ANY      (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \
-			   CEPH_CAP_ANY_FILE_WR | CEPH_CAP_PIN)
-
-#define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \
-			CEPH_LOCK_IXATTR)
-
-int ceph_caps_for_mode(int mode);
-
-enum {
-	CEPH_CAP_OP_GRANT,         /* mds->client grant */
-	CEPH_CAP_OP_REVOKE,        /* mds->client revoke */
-	CEPH_CAP_OP_TRUNC,         /* mds->client trunc notify */
-	CEPH_CAP_OP_EXPORT,        /* mds has exported the cap */
-	CEPH_CAP_OP_IMPORT,        /* mds has imported the cap */
-	CEPH_CAP_OP_UPDATE,        /* client->mds update */
-	CEPH_CAP_OP_DROP,          /* client->mds drop cap bits */
-	CEPH_CAP_OP_FLUSH,         /* client->mds cap writeback */
-	CEPH_CAP_OP_FLUSH_ACK,     /* mds->client flushed */
-	CEPH_CAP_OP_FLUSHSNAP,     /* client->mds flush snapped metadata */
-	CEPH_CAP_OP_FLUSHSNAP_ACK, /* mds->client flushed snapped metadata */
-	CEPH_CAP_OP_RELEASE,       /* client->mds release (clean) cap */
-	CEPH_CAP_OP_RENEW,         /* client->mds renewal request */
-};
-
-extern const char *ceph_cap_op_name(int op);
-
-/*
- * caps message, used for capability callbacks, acks, requests, etc.
- */
-struct ceph_mds_caps {
-	__le32 op;                  /* CEPH_CAP_OP_* */
-	__le64 ino, realm;
-	__le64 cap_id;
-	__le32 seq, issue_seq;
-	__le32 caps, wanted, dirty; /* latest issued/wanted/dirty */
-	__le32 migrate_seq;
-	__le64 snap_follows;
-	__le32 snap_trace_len;
-
-	/* authlock */
-	__le32 uid, gid, mode;
-
-	/* linklock */
-	__le32 nlink;
-
-	/* xattrlock */
-	__le32 xattr_len;
-	__le64 xattr_version;
-
-	/* filelock */
-	__le64 size, max_size, truncate_size;
-	__le32 truncate_seq;
-	struct ceph_timespec mtime, atime, ctime;
-	struct ceph_file_layout layout;
-	__le32 time_warp_seq;
-} __attribute__ ((packed));
-
-/* cap release msg head */
-struct ceph_mds_cap_release {
-	__le32 num;                /* number of cap_items that follow */
-} __attribute__ ((packed));
-
-struct ceph_mds_cap_item {
-	__le64 ino;
-	__le64 cap_id;
-	__le32 migrate_seq, seq;
-} __attribute__ ((packed));
-
-#define CEPH_MDS_LEASE_REVOKE           1  /*    mds  -> client */
-#define CEPH_MDS_LEASE_RELEASE          2  /* client  -> mds    */
-#define CEPH_MDS_LEASE_RENEW            3  /* client <-> mds    */
-#define CEPH_MDS_LEASE_REVOKE_ACK       4  /* client  -> mds    */
-
-extern const char *ceph_lease_op_name(int o);
-
-/* lease msg header */
-struct ceph_mds_lease {
-	__u8 action;            /* CEPH_MDS_LEASE_* */
-	__le16 mask;            /* which lease */
-	__le64 ino;
-	__le64 first, last;     /* snap range */
-	__le32 seq;
-	__le32 duration_ms;     /* duration of renewal */
-} __attribute__ ((packed));
-/* followed by a __le32+string for dname */
-
-/* client reconnect */
-struct ceph_mds_cap_reconnect {
-	__le64 cap_id;
-	__le32 wanted;
-	__le32 issued;
-	__le64 size;
-	struct ceph_timespec mtime, atime;
-	__le64 snaprealm;
-	__le64 pathbase;        /* base ino for our path to this ino */
-} __attribute__ ((packed));
-/* followed by encoded string */
-
-struct ceph_mds_snaprealm_reconnect {
-	__le64 ino;     /* snap realm base */
-	__le64 seq;     /* snap seq for this snap realm */
-	__le64 parent;  /* parent realm */
-} __attribute__ ((packed));
-
-/*
- * snaps
- */
-enum {
-	CEPH_SNAP_OP_UPDATE,  /* CREATE or DESTROY */
-	CEPH_SNAP_OP_CREATE,
-	CEPH_SNAP_OP_DESTROY,
-	CEPH_SNAP_OP_SPLIT,
-};
-
-extern const char *ceph_snap_op_name(int o);
-
-/* snap msg header */
-struct ceph_mds_snap_head {
-	__le32 op;                /* CEPH_SNAP_OP_* */
-	__le64 split;             /* ino to split off, if any */
-	__le32 num_split_inos;    /* # inos belonging to new child realm */
-	__le32 num_split_realms;  /* # child realms udner new child realm */
-	__le32 trace_len;         /* size of snap trace blob */
-} __attribute__ ((packed));
-/* followed by split ino list, then split realms, then the trace blob */
-
-/*
- * encode info about a snaprealm, as viewed by a client
- */
-struct ceph_mds_snap_realm {
-	__le64 ino;           /* ino */
-	__le64 created;       /* snap: when created */
-	__le64 parent;        /* ino: parent realm */
-	__le64 parent_since;  /* snap: same parent since */
-	__le64 seq;           /* snap: version */
-	__le32 num_snaps;
-	__le32 num_prior_parent_snaps;
-} __attribute__ ((packed));
-/* followed by my snap list, then prior parent snap list */
-
-#endif
diff --git a/fs/ceph/ceph_hash.c b/fs/ceph/ceph_hash.c
deleted file mode 100644
index bd570015d14..00000000000
--- a/fs/ceph/ceph_hash.c
+++ /dev/null
@@ -1,118 +0,0 @@
-
-#include "types.h"
-
-/*
- * Robert Jenkin's hash function.
- * http://burtleburtle.net/bob/hash/evahash.html
- * This is in the public domain.
- */
-#define mix(a, b, c)						\
-	do {							\
-		a = a - b;  a = a - c;  a = a ^ (c >> 13);	\
-		b = b - c;  b = b - a;  b = b ^ (a << 8);	\
-		c = c - a;  c = c - b;  c = c ^ (b >> 13);	\
-		a = a - b;  a = a - c;  a = a ^ (c >> 12);	\
-		b = b - c;  b = b - a;  b = b ^ (a << 16);	\
-		c = c - a;  c = c - b;  c = c ^ (b >> 5);	\
-		a = a - b;  a = a - c;  a = a ^ (c >> 3);	\
-		b = b - c;  b = b - a;  b = b ^ (a << 10);	\
-		c = c - a;  c = c - b;  c = c ^ (b >> 15);	\
-	} while (0)
-
-unsigned ceph_str_hash_rjenkins(const char *str, unsigned length)
-{
-	const unsigned char *k = (const unsigned char *)str;
-	__u32 a, b, c;  /* the internal state */
-	__u32 len;      /* how many key bytes still need mixing */
-
-	/* Set up the internal state */
-	len = length;
-	a = 0x9e3779b9;      /* the golden ratio; an arbitrary value */
-	b = a;
-	c = 0;               /* variable initialization of internal state */
-
-	/* handle most of the key */
-	while (len >= 12) {
-		a = a + (k[0] + ((__u32)k[1] << 8) + ((__u32)k[2] << 16) +
-			 ((__u32)k[3] << 24));
-		b = b + (k[4] + ((__u32)k[5] << 8) + ((__u32)k[6] << 16) +
-			 ((__u32)k[7] << 24));
-		c = c + (k[8] + ((__u32)k[9] << 8) + ((__u32)k[10] << 16) +
-			 ((__u32)k[11] << 24));
-		mix(a, b, c);
-		k = k + 12;
-		len = len - 12;
-	}
-
-	/* handle the last 11 bytes */
-	c = c + length;
-	switch (len) {            /* all the case statements fall through */
-	case 11:
-		c = c + ((__u32)k[10] << 24);
-	case 10:
-		c = c + ((__u32)k[9] << 16);
-	case 9:
-		c = c + ((__u32)k[8] << 8);
-		/* the first byte of c is reserved for the length */
-	case 8:
-		b = b + ((__u32)k[7] << 24);
-	case 7:
-		b = b + ((__u32)k[6] << 16);
-	case 6:
-		b = b + ((__u32)k[5] << 8);
-	case 5:
-		b = b + k[4];
-	case 4:
-		a = a + ((__u32)k[3] << 24);
-	case 3:
-		a = a + ((__u32)k[2] << 16);
-	case 2:
-		a = a + ((__u32)k[1] << 8);
-	case 1:
-		a = a + k[0];
-		/* case 0: nothing left to add */
-	}
-	mix(a, b, c);
-
-	return c;
-}
-
-/*
- * linux dcache hash
- */
-unsigned ceph_str_hash_linux(const char *str, unsigned length)
-{
-	unsigned long hash = 0;
-	unsigned char c;
-
-	while (length--) {
-		c = *str++;
-		hash = (hash + (c << 4) + (c >> 4)) * 11;
-	}
-	return hash;
-}
-
-
-unsigned ceph_str_hash(int type, const char *s, unsigned len)
-{
-	switch (type) {
-	case CEPH_STR_HASH_LINUX:
-		return ceph_str_hash_linux(s, len);
-	case CEPH_STR_HASH_RJENKINS:
-		return ceph_str_hash_rjenkins(s, len);
-	default:
-		return -1;
-	}
-}
-
-const char *ceph_str_hash_name(int type)
-{
-	switch (type) {
-	case CEPH_STR_HASH_LINUX:
-		return "linux";
-	case CEPH_STR_HASH_RJENKINS:
-		return "rjenkins";
-	default:
-		return "unknown";
-	}
-}
diff --git a/fs/ceph/ceph_hash.h b/fs/ceph/ceph_hash.h
deleted file mode 100644
index 5ac470c433c..00000000000
--- a/fs/ceph/ceph_hash.h
+++ /dev/null
@@ -1,13 +0,0 @@
-#ifndef _FS_CEPH_HASH_H
-#define _FS_CEPH_HASH_H
-
-#define CEPH_STR_HASH_LINUX      0x1  /* linux dcache hash */
-#define CEPH_STR_HASH_RJENKINS   0x2  /* robert jenkins' */
-
-extern unsigned ceph_str_hash_linux(const char *s, unsigned len);
-extern unsigned ceph_str_hash_rjenkins(const char *s, unsigned len);
-
-extern unsigned ceph_str_hash(int type, const char *s, unsigned len);
-extern const char *ceph_str_hash_name(int type);
-
-#endif
diff --git a/fs/ceph/crush/crush.c b/fs/ceph/crush/crush.c
deleted file mode 100644
index fabd302e577..00000000000
--- a/fs/ceph/crush/crush.c
+++ /dev/null
@@ -1,151 +0,0 @@
-
-#ifdef __KERNEL__
-# include <linux/slab.h>
-#else
-# include <stdlib.h>
-# include <assert.h>
-# define kfree(x) do { if (x) free(x); } while (0)
-# define BUG_ON(x) assert(!(x))
-#endif
-
-#include "crush.h"
-
-const char *crush_bucket_alg_name(int alg)
-{
-	switch (alg) {
-	case CRUSH_BUCKET_UNIFORM: return "uniform";
-	case CRUSH_BUCKET_LIST: return "list";
-	case CRUSH_BUCKET_TREE: return "tree";
-	case CRUSH_BUCKET_STRAW: return "straw";
-	default: return "unknown";
-	}
-}
-
-/**
- * crush_get_bucket_item_weight - Get weight of an item in given bucket
- * @b: bucket pointer
- * @p: item index in bucket
- */
-int crush_get_bucket_item_weight(struct crush_bucket *b, int p)
-{
-	if (p >= b->size)
-		return 0;
-
-	switch (b->alg) {
-	case CRUSH_BUCKET_UNIFORM:
-		return ((struct crush_bucket_uniform *)b)->item_weight;
-	case CRUSH_BUCKET_LIST:
-		return ((struct crush_bucket_list *)b)->item_weights[p];
-	case CRUSH_BUCKET_TREE:
-		if (p & 1)
-			return ((struct crush_bucket_tree *)b)->node_weights[p];
-		return 0;
-	case CRUSH_BUCKET_STRAW:
-		return ((struct crush_bucket_straw *)b)->item_weights[p];
-	}
-	return 0;
-}
-
-/**
- * crush_calc_parents - Calculate parent vectors for the given crush map.
- * @map: crush_map pointer
- */
-void crush_calc_parents(struct crush_map *map)
-{
-	int i, b, c;
-
-	for (b = 0; b < map->max_buckets; b++) {
-		if (map->buckets[b] == NULL)
-			continue;
-		for (i = 0; i < map->buckets[b]->size; i++) {
-			c = map->buckets[b]->items[i];
-			BUG_ON(c >= map->max_devices ||
-			       c < -map->max_buckets);
-			if (c >= 0)
-				map->device_parents[c] = map->buckets[b]->id;
-			else
-				map->bucket_parents[-1-c] = map->buckets[b]->id;
-		}
-	}
-}
-
-void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b)
-{
-	kfree(b->h.perm);
-	kfree(b->h.items);
-	kfree(b);
-}
-
-void crush_destroy_bucket_list(struct crush_bucket_list *b)
-{
-	kfree(b->item_weights);
-	kfree(b->sum_weights);
-	kfree(b->h.perm);
-	kfree(b->h.items);
-	kfree(b);
-}
-
-void crush_destroy_bucket_tree(struct crush_bucket_tree *b)
-{
-	kfree(b->node_weights);
-	kfree(b);
-}
-
-void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
-{
-	kfree(b->straws);
-	kfree(b->item_weights);
-	kfree(b->h.perm);
-	kfree(b->h.items);
-	kfree(b);
-}
-
-void crush_destroy_bucket(struct crush_bucket *b)
-{
-	switch (b->alg) {
-	case CRUSH_BUCKET_UNIFORM:
-		crush_destroy_bucket_uniform((struct crush_bucket_uniform *)b);
-		break;
-	case CRUSH_BUCKET_LIST:
-		crush_destroy_bucket_list((struct crush_bucket_list *)b);
-		break;
-	case CRUSH_BUCKET_TREE:
-		crush_destroy_bucket_tree((struct crush_bucket_tree *)b);
-		break;
-	case CRUSH_BUCKET_STRAW:
-		crush_destroy_bucket_straw((struct crush_bucket_straw *)b);
-		break;
-	}
-}
-
-/**
- * crush_destroy - Destroy a crush_map
- * @map: crush_map pointer
- */
-void crush_destroy(struct crush_map *map)
-{
-	int b;
-
-	/* buckets */
-	if (map->buckets) {
-		for (b = 0; b < map->max_buckets; b++) {
-			if (map->buckets[b] == NULL)
-				continue;
-			crush_destroy_bucket(map->buckets[b]);
-		}
-		kfree(map->buckets);
-	}
-
-	/* rules */
-	if (map->rules) {
-		for (b = 0; b < map->max_rules; b++)
-			kfree(map->rules[b]);
-		kfree(map->rules);
-	}
-
-	kfree(map->bucket_parents);
-	kfree(map->device_parents);
-	kfree(map);
-}
-
-
diff --git a/fs/ceph/crush/crush.h b/fs/ceph/crush/crush.h
deleted file mode 100644
index dcd7e752370..00000000000
--- a/fs/ceph/crush/crush.h
+++ /dev/null
@@ -1,180 +0,0 @@
-#ifndef _CRUSH_CRUSH_H
-#define _CRUSH_CRUSH_H
-
-#include <linux/types.h>
-
-/*
- * CRUSH is a pseudo-random data distribution algorithm that
- * efficiently distributes input values (typically, data objects)
- * across a heterogeneous, structured storage cluster.
- *
- * The algorithm was originally described in detail in this paper
- * (although the algorithm has evolved somewhat since then):
- *
- *     http://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf
- *
- * LGPL2
- */
-
-
-#define CRUSH_MAGIC 0x00010000ul   /* for detecting algorithm revisions */
-
-
-#define CRUSH_MAX_DEPTH 10  /* max crush hierarchy depth */
-#define CRUSH_MAX_SET   10  /* max size of a mapping result */
-
-
-/*
- * CRUSH uses user-defined "rules" to describe how inputs should be
- * mapped to devices.  A rule consists of sequence of steps to perform
- * to generate the set of output devices.
- */
-struct crush_rule_step {
-	__u32 op;
-	__s32 arg1;
-	__s32 arg2;
-};
-
-/* step op codes */
-enum {
-	CRUSH_RULE_NOOP = 0,
-	CRUSH_RULE_TAKE = 1,          /* arg1 = value to start with */
-	CRUSH_RULE_CHOOSE_FIRSTN = 2, /* arg1 = num items to pick */
-				      /* arg2 = type */
-	CRUSH_RULE_CHOOSE_INDEP = 3,  /* same */
-	CRUSH_RULE_EMIT = 4,          /* no args */
-	CRUSH_RULE_CHOOSE_LEAF_FIRSTN = 6,
-	CRUSH_RULE_CHOOSE_LEAF_INDEP = 7,
-};
-
-/*
- * for specifying choose num (arg1) relative to the max parameter
- * passed to do_rule
- */
-#define CRUSH_CHOOSE_N            0
-#define CRUSH_CHOOSE_N_MINUS(x)   (-(x))
-
-/*
- * The rule mask is used to describe what the rule is intended for.
- * Given a ruleset and size of output set, we search through the
- * rule list for a matching rule_mask.
- */
-struct crush_rule_mask {
-	__u8 ruleset;
-	__u8 type;
-	__u8 min_size;
-	__u8 max_size;
-};
-
-struct crush_rule {
-	__u32 len;
-	struct crush_rule_mask mask;
-	struct crush_rule_step steps[0];
-};
-
-#define crush_rule_size(len) (sizeof(struct crush_rule) + \
-			      (len)*sizeof(struct crush_rule_step))
-
-
-
-/*
- * A bucket is a named container of other items (either devices or
- * other buckets).  Items within a bucket are chosen using one of a
- * few different algorithms.  The table summarizes how the speed of
- * each option measures up against mapping stability when items are
- * added or removed.
- *
- *  Bucket Alg     Speed       Additions    Removals
- *  ------------------------------------------------
- *  uniform         O(1)       poor         poor
- *  list            O(n)       optimal      poor
- *  tree            O(log n)   good         good
- *  straw           O(n)       optimal      optimal
- */
-enum {
-	CRUSH_BUCKET_UNIFORM = 1,
-	CRUSH_BUCKET_LIST = 2,
-	CRUSH_BUCKET_TREE = 3,
-	CRUSH_BUCKET_STRAW = 4
-};
-extern const char *crush_bucket_alg_name(int alg);
-
-struct crush_bucket {
-	__s32 id;        /* this'll be negative */
-	__u16 type;      /* non-zero; type=0 is reserved for devices */
-	__u8 alg;        /* one of CRUSH_BUCKET_* */
-	__u8 hash;       /* which hash function to use, CRUSH_HASH_* */
-	__u32 weight;    /* 16-bit fixed point */
-	__u32 size;      /* num items */
-	__s32 *items;
-
-	/*
-	 * cached random permutation: used for uniform bucket and for
-	 * the linear search fallback for the other bucket types.
-	 */
-	__u32 perm_x;  /* @x for which *perm is defined */
-	__u32 perm_n;  /* num elements of *perm that are permuted/defined */
-	__u32 *perm;
-};
-
-struct crush_bucket_uniform {
-	struct crush_bucket h;
-	__u32 item_weight;  /* 16-bit fixed point; all items equally weighted */
-};
-
-struct crush_bucket_list {
-	struct crush_bucket h;
-	__u32 *item_weights;  /* 16-bit fixed point */
-	__u32 *sum_weights;   /* 16-bit fixed point.  element i is sum
-				 of weights 0..i, inclusive */
-};
-
-struct crush_bucket_tree {
-	struct crush_bucket h;  /* note: h.size is _tree_ size, not number of
-				   actual items */
-	__u8 num_nodes;
-	__u32 *node_weights;
-};
-
-struct crush_bucket_straw {
-	struct crush_bucket h;
-	__u32 *item_weights;   /* 16-bit fixed point */
-	__u32 *straws;         /* 16-bit fixed point */
-};
-
-
-
-/*
- * CRUSH map includes all buckets, rules, etc.
- */
-struct crush_map {
-	struct crush_bucket **buckets;
-	struct crush_rule **rules;
-
-	/*
-	 * Parent pointers to identify the parent bucket a device or
-	 * bucket in the hierarchy.  If an item appears more than
-	 * once, this is the _last_ time it appeared (where buckets
-	 * are processed in bucket id order, from -1 on down to
-	 * -max_buckets.
-	 */
-	__u32 *bucket_parents;
-	__u32 *device_parents;
-
-	__s32 max_buckets;
-	__u32 max_rules;
-	__s32 max_devices;
-};
-
-
-/* crush.c */
-extern int crush_get_bucket_item_weight(struct crush_bucket *b, int pos);
-extern void crush_calc_parents(struct crush_map *map);
-extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b);
-extern void crush_destroy_bucket_list(struct crush_bucket_list *b);
-extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b);
-extern void crush_destroy_bucket_straw(struct crush_bucket_straw *b);
-extern void crush_destroy_bucket(struct crush_bucket *b);
-extern void crush_destroy(struct crush_map *map);
-
-#endif
diff --git a/fs/ceph/crush/hash.c b/fs/ceph/crush/hash.c
deleted file mode 100644
index 5873aed694b..00000000000
--- a/fs/ceph/crush/hash.c
+++ /dev/null
@@ -1,149 +0,0 @@
-
-#include <linux/types.h>
-#include "hash.h"
-
-/*
- * Robert Jenkins' function for mixing 32-bit values
- * http://burtleburtle.net/bob/hash/evahash.html
- * a, b = random bits, c = input and output
- */
-#define crush_hashmix(a, b, c) do {			\
-		a = a-b;  a = a-c;  a = a^(c>>13);	\
-		b = b-c;  b = b-a;  b = b^(a<<8);	\
-		c = c-a;  c = c-b;  c = c^(b>>13);	\
-		a = a-b;  a = a-c;  a = a^(c>>12);	\
-		b = b-c;  b = b-a;  b = b^(a<<16);	\
-		c = c-a;  c = c-b;  c = c^(b>>5);	\
-		a = a-b;  a = a-c;  a = a^(c>>3);	\
-		b = b-c;  b = b-a;  b = b^(a<<10);	\
-		c = c-a;  c = c-b;  c = c^(b>>15);	\
-	} while (0)
-
-#define crush_hash_seed 1315423911
-
-static __u32 crush_hash32_rjenkins1(__u32 a)
-{
-	__u32 hash = crush_hash_seed ^ a;
-	__u32 b = a;
-	__u32 x = 231232;
-	__u32 y = 1232;
-	crush_hashmix(b, x, hash);
-	crush_hashmix(y, a, hash);
-	return hash;
-}
-
-static __u32 crush_hash32_rjenkins1_2(__u32 a, __u32 b)
-{
-	__u32 hash = crush_hash_seed ^ a ^ b;
-	__u32 x = 231232;
-	__u32 y = 1232;
-	crush_hashmix(a, b, hash);
-	crush_hashmix(x, a, hash);
-	crush_hashmix(b, y, hash);
-	return hash;
-}
-
-static __u32 crush_hash32_rjenkins1_3(__u32 a, __u32 b, __u32 c)
-{
-	__u32 hash = crush_hash_seed ^ a ^ b ^ c;
-	__u32 x = 231232;
-	__u32 y = 1232;
-	crush_hashmix(a, b, hash);
-	crush_hashmix(c, x, hash);
-	crush_hashmix(y, a, hash);
-	crush_hashmix(b, x, hash);
-	crush_hashmix(y, c, hash);
-	return hash;
-}
-
-static __u32 crush_hash32_rjenkins1_4(__u32 a, __u32 b, __u32 c, __u32 d)
-{
-	__u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d;
-	__u32 x = 231232;
-	__u32 y = 1232;
-	crush_hashmix(a, b, hash);
-	crush_hashmix(c, d, hash);
-	crush_hashmix(a, x, hash);
-	crush_hashmix(y, b, hash);
-	crush_hashmix(c, x, hash);
-	crush_hashmix(y, d, hash);
-	return hash;
-}
-
-static __u32 crush_hash32_rjenkins1_5(__u32 a, __u32 b, __u32 c, __u32 d,
-				      __u32 e)
-{
-	__u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d ^ e;
-	__u32 x = 231232;
-	__u32 y = 1232;
-	crush_hashmix(a, b, hash);
-	crush_hashmix(c, d, hash);
-	crush_hashmix(e, x, hash);
-	crush_hashmix(y, a, hash);
-	crush_hashmix(b, x, hash);
-	crush_hashmix(y, c, hash);
-	crush_hashmix(d, x, hash);
-	crush_hashmix(y, e, hash);
-	return hash;
-}
-
-
-__u32 crush_hash32(int type, __u32 a)
-{
-	switch (type) {
-	case CRUSH_HASH_RJENKINS1:
-		return crush_hash32_rjenkins1(a);
-	default:
-		return 0;
-	}
-}
-
-__u32 crush_hash32_2(int type, __u32 a, __u32 b)
-{
-	switch (type) {
-	case CRUSH_HASH_RJENKINS1:
-		return crush_hash32_rjenkins1_2(a, b);
-	default:
-		return 0;
-	}
-}
-
-__u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c)
-{
-	switch (type) {
-	case CRUSH_HASH_RJENKINS1:
-		return crush_hash32_rjenkins1_3(a, b, c);
-	default:
-		return 0;
-	}
-}
-
-__u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d)
-{
-	switch (type) {
-	case CRUSH_HASH_RJENKINS1:
-		return crush_hash32_rjenkins1_4(a, b, c, d);
-	default:
-		return 0;
-	}
-}
-
-__u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d, __u32 e)
-{
-	switch (type) {
-	case CRUSH_HASH_RJENKINS1:
-		return crush_hash32_rjenkins1_5(a, b, c, d, e);
-	default:
-		return 0;
-	}
-}
-
-const char *crush_hash_name(int type)
-{
-	switch (type) {
-	case CRUSH_HASH_RJENKINS1:
-		return "rjenkins1";
-	default:
-		return "unknown";
-	}
-}
diff --git a/fs/ceph/crush/hash.h b/fs/ceph/crush/hash.h
deleted file mode 100644
index ff48e110e4b..00000000000
--- a/fs/ceph/crush/hash.h
+++ /dev/null
@@ -1,17 +0,0 @@
-#ifndef _CRUSH_HASH_H
-#define _CRUSH_HASH_H
-
-#define CRUSH_HASH_RJENKINS1   0
-
-#define CRUSH_HASH_DEFAULT CRUSH_HASH_RJENKINS1
-
-extern const char *crush_hash_name(int type);
-
-extern __u32 crush_hash32(int type, __u32 a);
-extern __u32 crush_hash32_2(int type, __u32 a, __u32 b);
-extern __u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c);
-extern __u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d);
-extern __u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d,
-			    __u32 e);
-
-#endif
diff --git a/fs/ceph/crush/mapper.c b/fs/ceph/crush/mapper.c
deleted file mode 100644
index a4eec133258..00000000000
--- a/fs/ceph/crush/mapper.c
+++ /dev/null
@@ -1,609 +0,0 @@
-
-#ifdef __KERNEL__
-# include <linux/string.h>
-# include <linux/slab.h>
-# include <linux/bug.h>
-# include <linux/kernel.h>
-# ifndef dprintk
-#  define dprintk(args...)
-# endif
-#else
-# include <string.h>
-# include <stdio.h>
-# include <stdlib.h>
-# include <assert.h>
-# define BUG_ON(x) assert(!(x))
-# define dprintk(args...) /* printf(args) */
-# define kmalloc(x, f) malloc(x)
-# define kfree(x) free(x)
-#endif
-
-#include "crush.h"
-#include "hash.h"
-
-/*
- * Implement the core CRUSH mapping algorithm.
- */
-
-/**
- * crush_find_rule - find a crush_rule id for a given ruleset, type, and size.
- * @map: the crush_map
- * @ruleset: the storage ruleset id (user defined)
- * @type: storage ruleset type (user defined)
- * @size: output set size
- */
-int crush_find_rule(struct crush_map *map, int ruleset, int type, int size)
-{
-	int i;
-
-	for (i = 0; i < map->max_rules; i++) {
-		if (map->rules[i] &&
-		    map->rules[i]->mask.ruleset == ruleset &&
-		    map->rules[i]->mask.type == type &&
-		    map->rules[i]->mask.min_size <= size &&
-		    map->rules[i]->mask.max_size >= size)
-			return i;
-	}
-	return -1;
-}
-
-
-/*
- * bucket choose methods
- *
- * For each bucket algorithm, we have a "choose" method that, given a
- * crush input @x and replica position (usually, position in output set) @r,
- * will produce an item in the bucket.
- */
-
-/*
- * Choose based on a random permutation of the bucket.
- *
- * We used to use some prime number arithmetic to do this, but it
- * wasn't very random, and had some other bad behaviors.  Instead, we
- * calculate an actual random permutation of the bucket members.
- * Since this is expensive, we optimize for the r=0 case, which
- * captures the vast majority of calls.
- */
-static int bucket_perm_choose(struct crush_bucket *bucket,
-			      int x, int r)
-{
-	unsigned pr = r % bucket->size;
-	unsigned i, s;
-
-	/* start a new permutation if @x has changed */
-	if (bucket->perm_x != x || bucket->perm_n == 0) {
-		dprintk("bucket %d new x=%d\n", bucket->id, x);
-		bucket->perm_x = x;
-
-		/* optimize common r=0 case */
-		if (pr == 0) {
-			s = crush_hash32_3(bucket->hash, x, bucket->id, 0) %
-				bucket->size;
-			bucket->perm[0] = s;
-			bucket->perm_n = 0xffff;   /* magic value, see below */
-			goto out;
-		}
-
-		for (i = 0; i < bucket->size; i++)
-			bucket->perm[i] = i;
-		bucket->perm_n = 0;
-	} else if (bucket->perm_n == 0xffff) {
-		/* clean up after the r=0 case above */
-		for (i = 1; i < bucket->size; i++)
-			bucket->perm[i] = i;
-		bucket->perm[bucket->perm[0]] = 0;
-		bucket->perm_n = 1;
-	}
-
-	/* calculate permutation up to pr */
-	for (i = 0; i < bucket->perm_n; i++)
-		dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]);
-	while (bucket->perm_n <= pr) {
-		unsigned p = bucket->perm_n;
-		/* no point in swapping the final entry */
-		if (p < bucket->size - 1) {
-			i = crush_hash32_3(bucket->hash, x, bucket->id, p) %
-				(bucket->size - p);
-			if (i) {
-				unsigned t = bucket->perm[p + i];
-				bucket->perm[p + i] = bucket->perm[p];
-				bucket->perm[p] = t;
-			}
-			dprintk(" perm_choose swap %d with %d\n", p, p+i);
-		}
-		bucket->perm_n++;
-	}
-	for (i = 0; i < bucket->size; i++)
-		dprintk(" perm_choose  %d: %d\n", i, bucket->perm[i]);
-
-	s = bucket->perm[pr];
-out:
-	dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id,
-		bucket->size, x, r, pr, s);
-	return bucket->items[s];
-}
-
-/* uniform */
-static int bucket_uniform_choose(struct crush_bucket_uniform *bucket,
-				 int x, int r)
-{
-	return bucket_perm_choose(&bucket->h, x, r);
-}
-
-/* list */
-static int bucket_list_choose(struct crush_bucket_list *bucket,
-			      int x, int r)
-{
-	int i;
-
-	for (i = bucket->h.size-1; i >= 0; i--) {
-		__u64 w = crush_hash32_4(bucket->h.hash,x, bucket->h.items[i],
-					 r, bucket->h.id);
-		w &= 0xffff;
-		dprintk("list_choose i=%d x=%d r=%d item %d weight %x "
-			"sw %x rand %llx",
-			i, x, r, bucket->h.items[i], bucket->item_weights[i],
-			bucket->sum_weights[i], w);
-		w *= bucket->sum_weights[i];
-		w = w >> 16;
-		/*dprintk(" scaled %llx\n", w);*/
-		if (w < bucket->item_weights[i])
-			return bucket->h.items[i];
-	}
-
-	BUG_ON(1);
-	return 0;
-}
-
-
-/* (binary) tree */
-static int height(int n)
-{
-	int h = 0;
-	while ((n & 1) == 0) {
-		h++;
-		n = n >> 1;
-	}
-	return h;
-}
-
-static int left(int x)
-{
-	int h = height(x);
-	return x - (1 << (h-1));
-}
-
-static int right(int x)
-{
-	int h = height(x);
-	return x + (1 << (h-1));
-}
-
-static int terminal(int x)
-{
-	return x & 1;
-}
-
-static int bucket_tree_choose(struct crush_bucket_tree *bucket,
-			      int x, int r)
-{
-	int n, l;
-	__u32 w;
-	__u64 t;
-
-	/* start at root */
-	n = bucket->num_nodes >> 1;
-
-	while (!terminal(n)) {
-		/* pick point in [0, w) */
-		w = bucket->node_weights[n];
-		t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r,
-					  bucket->h.id) * (__u64)w;
-		t = t >> 32;
-
-		/* descend to the left or right? */
-		l = left(n);
-		if (t < bucket->node_weights[l])
-			n = l;
-		else
-			n = right(n);
-	}
-
-	return bucket->h.items[n >> 1];
-}
-
-
-/* straw */
-
-static int bucket_straw_choose(struct crush_bucket_straw *bucket,
-			       int x, int r)
-{
-	int i;
-	int high = 0;
-	__u64 high_draw = 0;
-	__u64 draw;
-
-	for (i = 0; i < bucket->h.size; i++) {
-		draw = crush_hash32_3(bucket->h.hash, x, bucket->h.items[i], r);
-		draw &= 0xffff;
-		draw *= bucket->straws[i];
-		if (i == 0 || draw > high_draw) {
-			high = i;
-			high_draw = draw;
-		}
-	}
-	return bucket->h.items[high];
-}
-
-static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
-{
-	dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
-	switch (in->alg) {
-	case CRUSH_BUCKET_UNIFORM:
-		return bucket_uniform_choose((struct crush_bucket_uniform *)in,
-					  x, r);
-	case CRUSH_BUCKET_LIST:
-		return bucket_list_choose((struct crush_bucket_list *)in,
-					  x, r);
-	case CRUSH_BUCKET_TREE:
-		return bucket_tree_choose((struct crush_bucket_tree *)in,
-					  x, r);
-	case CRUSH_BUCKET_STRAW:
-		return bucket_straw_choose((struct crush_bucket_straw *)in,
-					   x, r);
-	default:
-		BUG_ON(1);
-		return in->items[0];
-	}
-}
-
-/*
- * true if device is marked "out" (failed, fully offloaded)
- * of the cluster
- */
-static int is_out(struct crush_map *map, __u32 *weight, int item, int x)
-{
-	if (weight[item] >= 0x10000)
-		return 0;
-	if (weight[item] == 0)
-		return 1;
-	if ((crush_hash32_2(CRUSH_HASH_RJENKINS1, x, item) & 0xffff)
-	    < weight[item])
-		return 0;
-	return 1;
-}
-
-/**
- * crush_choose - choose numrep distinct items of given type
- * @map: the crush_map
- * @bucket: the bucket we are choose an item from
- * @x: crush input value
- * @numrep: the number of items to choose
- * @type: the type of item to choose
- * @out: pointer to output vector
- * @outpos: our position in that vector
- * @firstn: true if choosing "first n" items, false if choosing "indep"
- * @recurse_to_leaf: true if we want one device under each item of given type
- * @out2: second output vector for leaf items (if @recurse_to_leaf)
- */
-static int crush_choose(struct crush_map *map,
-			struct crush_bucket *bucket,
-			__u32 *weight,
-			int x, int numrep, int type,
-			int *out, int outpos,
-			int firstn, int recurse_to_leaf,
-			int *out2)
-{
-	int rep;
-	int ftotal, flocal;
-	int retry_descent, retry_bucket, skip_rep;
-	struct crush_bucket *in = bucket;
-	int r;
-	int i;
-	int item = 0;
-	int itemtype;
-	int collide, reject;
-	const int orig_tries = 5; /* attempts before we fall back to search */
-
-	dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
-		bucket->id, x, outpos, numrep);
-
-	for (rep = outpos; rep < numrep; rep++) {
-		/* keep trying until we get a non-out, non-colliding item */
-		ftotal = 0;
-		skip_rep = 0;
-		do {
-			retry_descent = 0;
-			in = bucket;               /* initial bucket */
-
-			/* choose through intervening buckets */
-			flocal = 0;
-			do {
-				collide = 0;
-				retry_bucket = 0;
-				r = rep;
-				if (in->alg == CRUSH_BUCKET_UNIFORM) {
-					/* be careful */
-					if (firstn || numrep >= in->size)
-						/* r' = r + f_total */
-						r += ftotal;
-					else if (in->size % numrep == 0)
-						/* r'=r+(n+1)*f_local */
-						r += (numrep+1) *
-							(flocal+ftotal);
-					else
-						/* r' = r + n*f_local */
-						r += numrep * (flocal+ftotal);
-				} else {
-					if (firstn)
-						/* r' = r + f_total */
-						r += ftotal;
-					else
-						/* r' = r + n*f_local */
-						r += numrep * (flocal+ftotal);
-				}
-
-				/* bucket choose */
-				if (in->size == 0) {
-					reject = 1;
-					goto reject;
-				}
-				if (flocal >= (in->size>>1) &&
-				    flocal > orig_tries)
-					item = bucket_perm_choose(in, x, r);
-				else
-					item = crush_bucket_choose(in, x, r);
-				BUG_ON(item >= map->max_devices);
-
-				/* desired type? */
-				if (item < 0)
-					itemtype = map->buckets[-1-item]->type;
-				else
-					itemtype = 0;
-				dprintk("  item %d type %d\n", item, itemtype);
-
-				/* keep going? */
-				if (itemtype != type) {
-					BUG_ON(item >= 0 ||
-					       (-1-item) >= map->max_buckets);
-					in = map->buckets[-1-item];
-					retry_bucket = 1;
-					continue;
-				}
-
-				/* collision? */
-				for (i = 0; i < outpos; i++) {
-					if (out[i] == item) {
-						collide = 1;
-						break;
-					}
-				}
-
-				reject = 0;
-				if (recurse_to_leaf) {
-					if (item < 0) {
-						if (crush_choose(map,
-							 map->buckets[-1-item],
-							 weight,
-							 x, outpos+1, 0,
-							 out2, outpos,
-							 firstn, 0,
-							 NULL) <= outpos)
-							/* didn't get leaf */
-							reject = 1;
-					} else {
-						/* we already have a leaf! */
-						out2[outpos] = item;
-					}
-				}
-
-				if (!reject) {
-					/* out? */
-					if (itemtype == 0)
-						reject = is_out(map, weight,
-								item, x);
-					else
-						reject = 0;
-				}
-
-reject:
-				if (reject || collide) {
-					ftotal++;
-					flocal++;
-
-					if (collide && flocal < 3)
-						/* retry locally a few times */
-						retry_bucket = 1;
-					else if (flocal < in->size + orig_tries)
-						/* exhaustive bucket search */
-						retry_bucket = 1;
-					else if (ftotal < 20)
-						/* then retry descent */
-						retry_descent = 1;
-					else
-						/* else give up */
-						skip_rep = 1;
-					dprintk("  reject %d  collide %d  "
-						"ftotal %d  flocal %d\n",
-						reject, collide, ftotal,
-						flocal);
-				}
-			} while (retry_bucket);
-		} while (retry_descent);
-
-		if (skip_rep) {
-			dprintk("skip rep\n");
-			continue;
-		}
-
-		dprintk("CHOOSE got %d\n", item);
-		out[outpos] = item;
-		outpos++;
-	}
-
-	dprintk("CHOOSE returns %d\n", outpos);
-	return outpos;
-}
-
-
-/**
- * crush_do_rule - calculate a mapping with the given input and rule
- * @map: the crush_map
- * @ruleno: the rule id
- * @x: hash input
- * @result: pointer to result vector
- * @result_max: maximum result size
- * @force: force initial replica choice; -1 for none
- */
-int crush_do_rule(struct crush_map *map,
-		  int ruleno, int x, int *result, int result_max,
-		  int force, __u32 *weight)
-{
-	int result_len;
-	int force_context[CRUSH_MAX_DEPTH];
-	int force_pos = -1;
-	int a[CRUSH_MAX_SET];
-	int b[CRUSH_MAX_SET];
-	int c[CRUSH_MAX_SET];
-	int recurse_to_leaf;
-	int *w;
-	int wsize = 0;
-	int *o;
-	int osize;
-	int *tmp;
-	struct crush_rule *rule;
-	int step;
-	int i, j;
-	int numrep;
-	int firstn;
-	int rc = -1;
-
-	BUG_ON(ruleno >= map->max_rules);
-
-	rule = map->rules[ruleno];
-	result_len = 0;
-	w = a;
-	o = b;
-
-	/*
-	 * determine hierarchical context of force, if any.  note
-	 * that this may or may not correspond to the specific types
-	 * referenced by the crush rule.
-	 */
-	if (force >= 0) {
-		if (force >= map->max_devices ||
-		    map->device_parents[force] == 0) {
-			/*dprintk("CRUSH: forcefed device dne\n");*/
-			rc = -1;  /* force fed device dne */
-			goto out;
-		}
-		if (!is_out(map, weight, force, x)) {
-			while (1) {
-				force_context[++force_pos] = force;
-				if (force >= 0)
-					force = map->device_parents[force];
-				else
-					force = map->bucket_parents[-1-force];
-				if (force == 0)
-					break;
-			}
-		}
-	}
-
-	for (step = 0; step < rule->len; step++) {
-		firstn = 0;
-		switch (rule->steps[step].op) {
-		case CRUSH_RULE_TAKE:
-			w[0] = rule->steps[step].arg1;
-			if (force_pos >= 0) {
-				BUG_ON(force_context[force_pos] != w[0]);
-				force_pos--;
-			}
-			wsize = 1;
-			break;
-
-		case CRUSH_RULE_CHOOSE_LEAF_FIRSTN:
-		case CRUSH_RULE_CHOOSE_FIRSTN:
-			firstn = 1;
-		case CRUSH_RULE_CHOOSE_LEAF_INDEP:
-		case CRUSH_RULE_CHOOSE_INDEP:
-			BUG_ON(wsize == 0);
-
-			recurse_to_leaf =
-				rule->steps[step].op ==
-				 CRUSH_RULE_CHOOSE_LEAF_FIRSTN ||
-				rule->steps[step].op ==
-				CRUSH_RULE_CHOOSE_LEAF_INDEP;
-
-			/* reset output */
-			osize = 0;
-
-			for (i = 0; i < wsize; i++) {
-				/*
-				 * see CRUSH_N, CRUSH_N_MINUS macros.
-				 * basically, numrep <= 0 means relative to
-				 * the provided result_max
-				 */
-				numrep = rule->steps[step].arg1;
-				if (numrep <= 0) {
-					numrep += result_max;
-					if (numrep <= 0)
-						continue;
-				}
-				j = 0;
-				if (osize == 0 && force_pos >= 0) {
-					/* skip any intermediate types */
-					while (force_pos &&
-					       force_context[force_pos] < 0 &&
-					       rule->steps[step].arg2 !=
-					       map->buckets[-1 -
-					       force_context[force_pos]]->type)
-						force_pos--;
-					o[osize] = force_context[force_pos];
-					if (recurse_to_leaf)
-						c[osize] = force_context[0];
-					j++;
-					force_pos--;
-				}
-				osize += crush_choose(map,
-						      map->buckets[-1-w[i]],
-						      weight,
-						      x, numrep,
-						      rule->steps[step].arg2,
-						      o+osize, j,
-						      firstn,
-						      recurse_to_leaf, c+osize);
-			}
-
-			if (recurse_to_leaf)
-				/* copy final _leaf_ values to output set */
-				memcpy(o, c, osize*sizeof(*o));
-
-			/* swap t and w arrays */
-			tmp = o;
-			o = w;
-			w = tmp;
-			wsize = osize;
-			break;
-
-
-		case CRUSH_RULE_EMIT:
-			for (i = 0; i < wsize && result_len < result_max; i++) {
-				result[result_len] = w[i];
-				result_len++;
-			}
-			wsize = 0;
-			break;
-
-		default:
-			BUG_ON(1);
-		}
-	}
-	rc = result_len;
-
-out:
-	return rc;
-}
-
-
diff --git a/fs/ceph/crush/mapper.h b/fs/ceph/crush/mapper.h
deleted file mode 100644
index 98e90046fd9..00000000000
--- a/fs/ceph/crush/mapper.h
+++ /dev/null
@@ -1,20 +0,0 @@
-#ifndef _CRUSH_MAPPER_H
-#define _CRUSH_MAPPER_H
-
-/*
- * CRUSH functions for find rules and then mapping an input to an
- * output set.
- *
- * LGPL2
- */
-
-#include "crush.h"
-
-extern int crush_find_rule(struct crush_map *map, int pool, int type, int size);
-extern int crush_do_rule(struct crush_map *map,
-			 int ruleno,
-			 int x, int *result, int result_max,
-			 int forcefeed,    /* -1 for none */
-			 __u32 *weights);
-
-#endif
diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c
deleted file mode 100644
index f704b3b6242..00000000000
--- a/fs/ceph/crypto.c
+++ /dev/null
@@ -1,409 +0,0 @@
-
-#include "ceph_debug.h"
-
-#include <linux/err.h>
-#include <linux/scatterlist.h>
-#include <linux/slab.h>
-#include <crypto/hash.h>
-
-#include "crypto.h"
-#include "decode.h"
-
-int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end)
-{
-	if (*p + sizeof(u16) + sizeof(key->created) +
-	    sizeof(u16) + key->len > end)
-		return -ERANGE;
-	ceph_encode_16(p, key->type);
-	ceph_encode_copy(p, &key->created, sizeof(key->created));
-	ceph_encode_16(p, key->len);
-	ceph_encode_copy(p, key->key, key->len);
-	return 0;
-}
-
-int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end)
-{
-	ceph_decode_need(p, end, 2*sizeof(u16) + sizeof(key->created), bad);
-	key->type = ceph_decode_16(p);
-	ceph_decode_copy(p, &key->created, sizeof(key->created));
-	key->len = ceph_decode_16(p);
-	ceph_decode_need(p, end, key->len, bad);
-	key->key = kmalloc(key->len, GFP_NOFS);
-	if (!key->key)
-		return -ENOMEM;
-	ceph_decode_copy(p, key->key, key->len);
-	return 0;
-
-bad:
-	dout("failed to decode crypto key\n");
-	return -EINVAL;
-}
-
-int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *inkey)
-{
-	int inlen = strlen(inkey);
-	int blen = inlen * 3 / 4;
-	void *buf, *p;
-	int ret;
-
-	dout("crypto_key_unarmor %s\n", inkey);
-	buf = kmalloc(blen, GFP_NOFS);
-	if (!buf)
-		return -ENOMEM;
-	blen = ceph_unarmor(buf, inkey, inkey+inlen);
-	if (blen < 0) {
-		kfree(buf);
-		return blen;
-	}
-
-	p = buf;
-	ret = ceph_crypto_key_decode(key, &p, p + blen);
-	kfree(buf);
-	if (ret)
-		return ret;
-	dout("crypto_key_unarmor key %p type %d len %d\n", key,
-	     key->type, key->len);
-	return 0;
-}
-
-
-
-#define AES_KEY_SIZE 16
-
-static struct crypto_blkcipher *ceph_crypto_alloc_cipher(void)
-{
-	return crypto_alloc_blkcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC);
-}
-
-const u8 *aes_iv = "cephsageyudagreg";
-
-int ceph_aes_encrypt(const void *key, int key_len, void *dst, size_t *dst_len,
-		     const void *src, size_t src_len)
-{
-	struct scatterlist sg_in[2], sg_out[1];
-	struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
-	struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
-	int ret;
-	void *iv;
-	int ivsize;
-	size_t zero_padding = (0x10 - (src_len & 0x0f));
-	char pad[16];
-
-	if (IS_ERR(tfm))
-		return PTR_ERR(tfm);
-
-	memset(pad, zero_padding, zero_padding);
-
-	*dst_len = src_len + zero_padding;
-
-	crypto_blkcipher_setkey((void *)tfm, key, key_len);
-	sg_init_table(sg_in, 2);
-	sg_set_buf(&sg_in[0], src, src_len);
-	sg_set_buf(&sg_in[1], pad, zero_padding);
-	sg_init_table(sg_out, 1);
-	sg_set_buf(sg_out, dst, *dst_len);
-	iv = crypto_blkcipher_crt(tfm)->iv;
-	ivsize = crypto_blkcipher_ivsize(tfm);
-
-	memcpy(iv, aes_iv, ivsize);
-	/*
-	print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
-		       key, key_len, 1);
-	print_hex_dump(KERN_ERR, "enc src: ", DUMP_PREFIX_NONE, 16, 1,
-			src, src_len, 1);
-	print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
-			pad, zero_padding, 1);
-	*/
-	ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
-				     src_len + zero_padding);
-	crypto_free_blkcipher(tfm);
-	if (ret < 0)
-		pr_err("ceph_aes_crypt failed %d\n", ret);
-	/*
-	print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1,
-		       dst, *dst_len, 1);
-	*/
-	return 0;
-}
-
-int ceph_aes_encrypt2(const void *key, int key_len, void *dst, size_t *dst_len,
-		      const void *src1, size_t src1_len,
-		      const void *src2, size_t src2_len)
-{
-	struct scatterlist sg_in[3], sg_out[1];
-	struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
-	struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
-	int ret;
-	void *iv;
-	int ivsize;
-	size_t zero_padding = (0x10 - ((src1_len + src2_len) & 0x0f));
-	char pad[16];
-
-	if (IS_ERR(tfm))
-		return PTR_ERR(tfm);
-
-	memset(pad, zero_padding, zero_padding);
-
-	*dst_len = src1_len + src2_len + zero_padding;
-
-	crypto_blkcipher_setkey((void *)tfm, key, key_len);
-	sg_init_table(sg_in, 3);
-	sg_set_buf(&sg_in[0], src1, src1_len);
-	sg_set_buf(&sg_in[1], src2, src2_len);
-	sg_set_buf(&sg_in[2], pad, zero_padding);
-	sg_init_table(sg_out, 1);
-	sg_set_buf(sg_out, dst, *dst_len);
-	iv = crypto_blkcipher_crt(tfm)->iv;
-	ivsize = crypto_blkcipher_ivsize(tfm);
-
-	memcpy(iv, aes_iv, ivsize);
-	/*
-	print_hex_dump(KERN_ERR, "enc  key: ", DUMP_PREFIX_NONE, 16, 1,
-		       key, key_len, 1);
-	print_hex_dump(KERN_ERR, "enc src1: ", DUMP_PREFIX_NONE, 16, 1,
-			src1, src1_len, 1);
-	print_hex_dump(KERN_ERR, "enc src2: ", DUMP_PREFIX_NONE, 16, 1,
-			src2, src2_len, 1);
-	print_hex_dump(KERN_ERR, "enc  pad: ", DUMP_PREFIX_NONE, 16, 1,
-			pad, zero_padding, 1);
-	*/
-	ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
-				     src1_len + src2_len + zero_padding);
-	crypto_free_blkcipher(tfm);
-	if (ret < 0)
-		pr_err("ceph_aes_crypt2 failed %d\n", ret);
-	/*
-	print_hex_dump(KERN_ERR, "enc  out: ", DUMP_PREFIX_NONE, 16, 1,
-		       dst, *dst_len, 1);
-	*/
-	return 0;
-}
-
-int ceph_aes_decrypt(const void *key, int key_len, void *dst, size_t *dst_len,
-		     const void *src, size_t src_len)
-{
-	struct scatterlist sg_in[1], sg_out[2];
-	struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
-	struct blkcipher_desc desc = { .tfm = tfm };
-	char pad[16];
-	void *iv;
-	int ivsize;
-	int ret;
-	int last_byte;
-
-	if (IS_ERR(tfm))
-		return PTR_ERR(tfm);
-
-	crypto_blkcipher_setkey((void *)tfm, key, key_len);
-	sg_init_table(sg_in, 1);
-	sg_init_table(sg_out, 2);
-	sg_set_buf(sg_in, src, src_len);
-	sg_set_buf(&sg_out[0], dst, *dst_len);
-	sg_set_buf(&sg_out[1], pad, sizeof(pad));
-
-	iv = crypto_blkcipher_crt(tfm)->iv;
-	ivsize = crypto_blkcipher_ivsize(tfm);
-
-	memcpy(iv, aes_iv, ivsize);
-
-	/*
-	print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1,
-		       key, key_len, 1);
-	print_hex_dump(KERN_ERR, "dec  in: ", DUMP_PREFIX_NONE, 16, 1,
-		       src, src_len, 1);
-	*/
-
-	ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
-	crypto_free_blkcipher(tfm);
-	if (ret < 0) {
-		pr_err("ceph_aes_decrypt failed %d\n", ret);
-		return ret;
-	}
-
-	if (src_len <= *dst_len)
-		last_byte = ((char *)dst)[src_len - 1];
-	else
-		last_byte = pad[src_len - *dst_len - 1];
-	if (last_byte <= 16 && src_len >= last_byte) {
-		*dst_len = src_len - last_byte;
-	} else {
-		pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
-		       last_byte, (int)src_len);
-		return -EPERM;  /* bad padding */
-	}
-	/*
-	print_hex_dump(KERN_ERR, "dec out: ", DUMP_PREFIX_NONE, 16, 1,
-		       dst, *dst_len, 1);
-	*/
-	return 0;
-}
-
-int ceph_aes_decrypt2(const void *key, int key_len,
-		      void *dst1, size_t *dst1_len,
-		      void *dst2, size_t *dst2_len,
-		      const void *src, size_t src_len)
-{
-	struct scatterlist sg_in[1], sg_out[3];
-	struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
-	struct blkcipher_desc desc = { .tfm = tfm };
-	char pad[16];
-	void *iv;
-	int ivsize;
-	int ret;
-	int last_byte;
-
-	if (IS_ERR(tfm))
-		return PTR_ERR(tfm);
-
-	sg_init_table(sg_in, 1);
-	sg_set_buf(sg_in, src, src_len);
-	sg_init_table(sg_out, 3);
-	sg_set_buf(&sg_out[0], dst1, *dst1_len);
-	sg_set_buf(&sg_out[1], dst2, *dst2_len);
-	sg_set_buf(&sg_out[2], pad, sizeof(pad));
-
-	crypto_blkcipher_setkey((void *)tfm, key, key_len);
-	iv = crypto_blkcipher_crt(tfm)->iv;
-	ivsize = crypto_blkcipher_ivsize(tfm);
-
-	memcpy(iv, aes_iv, ivsize);
-
-	/*
-	print_hex_dump(KERN_ERR, "dec  key: ", DUMP_PREFIX_NONE, 16, 1,
-		       key, key_len, 1);
-	print_hex_dump(KERN_ERR, "dec   in: ", DUMP_PREFIX_NONE, 16, 1,
-		       src, src_len, 1);
-	*/
-
-	ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
-	crypto_free_blkcipher(tfm);
-	if (ret < 0) {
-		pr_err("ceph_aes_decrypt failed %d\n", ret);
-		return ret;
-	}
-
-	if (src_len <= *dst1_len)
-		last_byte = ((char *)dst1)[src_len - 1];
-	else if (src_len <= *dst1_len + *dst2_len)
-		last_byte = ((char *)dst2)[src_len - *dst1_len - 1];
-	else
-		last_byte = pad[src_len - *dst1_len - *dst2_len - 1];
-	if (last_byte <= 16 && src_len >= last_byte) {
-		src_len -= last_byte;
-	} else {
-		pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
-		       last_byte, (int)src_len);
-		return -EPERM;  /* bad padding */
-	}
-
-	if (src_len < *dst1_len) {
-		*dst1_len = src_len;
-		*dst2_len = 0;
-	} else {
-		*dst2_len = src_len - *dst1_len;
-	}
-	/*
-	print_hex_dump(KERN_ERR, "dec  out1: ", DUMP_PREFIX_NONE, 16, 1,
-		       dst1, *dst1_len, 1);
-	print_hex_dump(KERN_ERR, "dec  out2: ", DUMP_PREFIX_NONE, 16, 1,
-		       dst2, *dst2_len, 1);
-	*/
-
-	return 0;
-}
-
-
-int ceph_decrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
-		 const void *src, size_t src_len)
-{
-	switch (secret->type) {
-	case CEPH_CRYPTO_NONE:
-		if (*dst_len < src_len)
-			return -ERANGE;
-		memcpy(dst, src, src_len);
-		*dst_len = src_len;
-		return 0;
-
-	case CEPH_CRYPTO_AES:
-		return ceph_aes_decrypt(secret->key, secret->len, dst,
-					dst_len, src, src_len);
-
-	default:
-		return -EINVAL;
-	}
-}
-
-int ceph_decrypt2(struct ceph_crypto_key *secret,
-			void *dst1, size_t *dst1_len,
-			void *dst2, size_t *dst2_len,
-			const void *src, size_t src_len)
-{
-	size_t t;
-
-	switch (secret->type) {
-	case CEPH_CRYPTO_NONE:
-		if (*dst1_len + *dst2_len < src_len)
-			return -ERANGE;
-		t = min(*dst1_len, src_len);
-		memcpy(dst1, src, t);
-		*dst1_len = t;
-		src += t;
-		src_len -= t;
-		if (src_len) {
-			t = min(*dst2_len, src_len);
-			memcpy(dst2, src, t);
-			*dst2_len = t;
-		}
-		return 0;
-
-	case CEPH_CRYPTO_AES:
-		return ceph_aes_decrypt2(secret->key, secret->len,
-					 dst1, dst1_len, dst2, dst2_len,
-					 src, src_len);
-
-	default:
-		return -EINVAL;
-	}
-}
-
-int ceph_encrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
-		 const void *src, size_t src_len)
-{
-	switch (secret->type) {
-	case CEPH_CRYPTO_NONE:
-		if (*dst_len < src_len)
-			return -ERANGE;
-		memcpy(dst, src, src_len);
-		*dst_len = src_len;
-		return 0;
-
-	case CEPH_CRYPTO_AES:
-		return ceph_aes_encrypt(secret->key, secret->len, dst,
-					dst_len, src, src_len);
-
-	default:
-		return -EINVAL;
-	}
-}
-
-int ceph_encrypt2(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
-		  const void *src1, size_t src1_len,
-		  const void *src2, size_t src2_len)
-{
-	switch (secret->type) {
-	case CEPH_CRYPTO_NONE:
-		if (*dst_len < src1_len + src2_len)
-			return -ERANGE;
-		memcpy(dst, src1, src1_len);
-		memcpy(dst + src1_len, src2, src2_len);
-		*dst_len = src1_len + src2_len;
-		return 0;
-
-	case CEPH_CRYPTO_AES:
-		return ceph_aes_encrypt2(secret->key, secret->len, dst, dst_len,
-					 src1, src1_len, src2, src2_len);
-
-	default:
-		return -EINVAL;
-	}
-}
diff --git a/fs/ceph/crypto.h b/fs/ceph/crypto.h
deleted file mode 100644
index 40b502e6bd8..00000000000
--- a/fs/ceph/crypto.h
+++ /dev/null
@@ -1,48 +0,0 @@
-#ifndef _FS_CEPH_CRYPTO_H
-#define _FS_CEPH_CRYPTO_H
-
-#include "types.h"
-#include "buffer.h"
-
-/*
- * cryptographic secret
- */
-struct ceph_crypto_key {
-	int type;
-	struct ceph_timespec created;
-	int len;
-	void *key;
-};
-
-static inline void ceph_crypto_key_destroy(struct ceph_crypto_key *key)
-{
-	kfree(key->key);
-}
-
-extern int ceph_crypto_key_encode(struct ceph_crypto_key *key,
-				  void **p, void *end);
-extern int ceph_crypto_key_decode(struct ceph_crypto_key *key,
-				  void **p, void *end);
-extern int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *in);
-
-/* crypto.c */
-extern int ceph_decrypt(struct ceph_crypto_key *secret,
-			void *dst, size_t *dst_len,
-			const void *src, size_t src_len);
-extern int ceph_encrypt(struct ceph_crypto_key *secret,
-			void *dst, size_t *dst_len,
-			const void *src, size_t src_len);
-extern int ceph_decrypt2(struct ceph_crypto_key *secret,
-			void *dst1, size_t *dst1_len,
-			void *dst2, size_t *dst2_len,
-			const void *src, size_t src_len);
-extern int ceph_encrypt2(struct ceph_crypto_key *secret,
-			 void *dst, size_t *dst_len,
-			 const void *src1, size_t src1_len,
-			 const void *src2, size_t src2_len);
-
-/* armor.c */
-extern int ceph_armor(char *dst, const void *src, const void *end);
-extern int ceph_unarmor(void *dst, const char *src, const char *end);
-
-#endif
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index f2f5332ddbb..7ae1b3d55b5 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -1,4 +1,4 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 
 #include <linux/device.h>
 #include <linux/slab.h>
@@ -7,143 +7,49 @@
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
 
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/mon_client.h>
+#include <linux/ceph/auth.h>
+#include <linux/ceph/debugfs.h>
+
 #include "super.h"
-#include "mds_client.h"
-#include "mon_client.h"
-#include "auth.h"
 
 #ifdef CONFIG_DEBUG_FS
 
-/*
- * Implement /sys/kernel/debug/ceph fun
- *
- * /sys/kernel/debug/ceph/client*  - an instance of the ceph client
- *      .../osdmap      - current osdmap
- *      .../mdsmap      - current mdsmap
- *      .../monmap      - current monmap
- *      .../osdc        - active osd requests
- *      .../mdsc        - active mds requests
- *      .../monc        - mon client state
- *      .../dentry_lru  - dump contents of dentry lru
- *      .../caps        - expose cap (reservation) stats
- *      .../bdi         - symlink to ../../bdi/something
- */
-
-static struct dentry *ceph_debugfs_dir;
-
-static int monmap_show(struct seq_file *s, void *p)
-{
-	int i;
-	struct ceph_client *client = s->private;
-
-	if (client->monc.monmap == NULL)
-		return 0;
-
-	seq_printf(s, "epoch %d\n", client->monc.monmap->epoch);
-	for (i = 0; i < client->monc.monmap->num_mon; i++) {
-		struct ceph_entity_inst *inst =
-			&client->monc.monmap->mon_inst[i];
-
-		seq_printf(s, "\t%s%lld\t%s\n",
-			   ENTITY_NAME(inst->name),
-			   pr_addr(&inst->addr.in_addr));
-	}
-	return 0;
-}
+#include "mds_client.h"
 
 static int mdsmap_show(struct seq_file *s, void *p)
 {
 	int i;
-	struct ceph_client *client = s->private;
+	struct ceph_fs_client *fsc = s->private;
 
-	if (client->mdsc.mdsmap == NULL)
+	if (fsc->mdsc == NULL || fsc->mdsc->mdsmap == NULL)
 		return 0;
-	seq_printf(s, "epoch %d\n", client->mdsc.mdsmap->m_epoch);
-	seq_printf(s, "root %d\n", client->mdsc.mdsmap->m_root);
+	seq_printf(s, "epoch %d\n", fsc->mdsc->mdsmap->m_epoch);
+	seq_printf(s, "root %d\n", fsc->mdsc->mdsmap->m_root);
 	seq_printf(s, "session_timeout %d\n",
-		       client->mdsc.mdsmap->m_session_timeout);
+		       fsc->mdsc->mdsmap->m_session_timeout);
 	seq_printf(s, "session_autoclose %d\n",
-		       client->mdsc.mdsmap->m_session_autoclose);
-	for (i = 0; i < client->mdsc.mdsmap->m_max_mds; i++) {
+		       fsc->mdsc->mdsmap->m_session_autoclose);
+	for (i = 0; i < fsc->mdsc->mdsmap->m_max_mds; i++) {
 		struct ceph_entity_addr *addr =
-			&client->mdsc.mdsmap->m_info[i].addr;
-		int state = client->mdsc.mdsmap->m_info[i].state;
+			&fsc->mdsc->mdsmap->m_info[i].addr;
+		int state = fsc->mdsc->mdsmap->m_info[i].state;
 
-		seq_printf(s, "\tmds%d\t%s\t(%s)\n", i, pr_addr(&addr->in_addr),
+		seq_printf(s, "\tmds%d\t%s\t(%s)\n", i,
+			       ceph_pr_addr(&addr->in_addr),
 			       ceph_mds_state_name(state));
 	}
 	return 0;
 }
 
-static int osdmap_show(struct seq_file *s, void *p)
-{
-	int i;
-	struct ceph_client *client = s->private;
-	struct rb_node *n;
-
-	if (client->osdc.osdmap == NULL)
-		return 0;
-	seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch);
-	seq_printf(s, "flags%s%s\n",
-		   (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ?
-		   " NEARFULL" : "",
-		   (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ?
-		   " FULL" : "");
-	for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) {
-		struct ceph_pg_pool_info *pool =
-			rb_entry(n, struct ceph_pg_pool_info, node);
-		seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n",
-			   pool->id, pool->v.pg_num, pool->pg_num_mask,
-			   pool->v.lpg_num, pool->lpg_num_mask);
-	}
-	for (i = 0; i < client->osdc.osdmap->max_osd; i++) {
-		struct ceph_entity_addr *addr =
-			&client->osdc.osdmap->osd_addr[i];
-		int state = client->osdc.osdmap->osd_state[i];
-		char sb[64];
-
-		seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n",
-			   i, pr_addr(&addr->in_addr),
-			   ((client->osdc.osdmap->osd_weight[i]*100) >> 16),
-			   ceph_osdmap_state_str(sb, sizeof(sb), state));
-	}
-	return 0;
-}
-
-static int monc_show(struct seq_file *s, void *p)
-{
-	struct ceph_client *client = s->private;
-	struct ceph_mon_generic_request *req;
-	struct ceph_mon_client *monc = &client->monc;
-	struct rb_node *rp;
-
-	mutex_lock(&monc->mutex);
-
-	if (monc->have_mdsmap)
-		seq_printf(s, "have mdsmap %u\n", (unsigned)monc->have_mdsmap);
-	if (monc->have_osdmap)
-		seq_printf(s, "have osdmap %u\n", (unsigned)monc->have_osdmap);
-	if (monc->want_next_osdmap)
-		seq_printf(s, "want next osdmap\n");
-
-	for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) {
-		__u16 op;
-		req = rb_entry(rp, struct ceph_mon_generic_request, node);
-		op = le16_to_cpu(req->request->hdr.type);
-		if (op == CEPH_MSG_STATFS)
-			seq_printf(s, "%lld statfs\n", req->tid);
-		else
-			seq_printf(s, "%lld unknown\n", req->tid);
-	}
-
-	mutex_unlock(&monc->mutex);
-	return 0;
-}
-
+/*
+ * mdsc debugfs
+ */
 static int mdsc_show(struct seq_file *s, void *p)
 {
-	struct ceph_client *client = s->private;
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = s->private;
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_mds_request *req;
 	struct rb_node *rp;
 	int pathlen;
@@ -171,6 +77,8 @@ static int mdsc_show(struct seq_file *s, void *p)
 		} else if (req->r_dentry) {
 			path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
 						    &pathbase, 0);
+			if (IS_ERR(path))
+				path = NULL;
 			spin_lock(&req->r_dentry->d_lock);
 			seq_printf(s, " #%llx/%.*s (%s)",
 				   ceph_ino(req->r_dentry->d_parent->d_inode),
@@ -187,6 +95,8 @@ static int mdsc_show(struct seq_file *s, void *p)
 		if (req->r_old_dentry) {
 			path = ceph_mdsc_build_path(req->r_old_dentry, &pathlen,
 						    &pathbase, 0);
+			if (IS_ERR(path))
+				path = NULL;
 			spin_lock(&req->r_old_dentry->d_lock);
 			seq_printf(s, " #%llx/%.*s (%s)",
 			   ceph_ino(req->r_old_dentry->d_parent->d_inode),
@@ -210,61 +120,12 @@ static int mdsc_show(struct seq_file *s, void *p)
 	return 0;
 }
 
-static int osdc_show(struct seq_file *s, void *pp)
-{
-	struct ceph_client *client = s->private;
-	struct ceph_osd_client *osdc = &client->osdc;
-	struct rb_node *p;
-
-	mutex_lock(&osdc->request_mutex);
-	for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
-		struct ceph_osd_request *req;
-		struct ceph_osd_request_head *head;
-		struct ceph_osd_op *op;
-		int num_ops;
-		int opcode, olen;
-		int i;
-
-		req = rb_entry(p, struct ceph_osd_request, r_node);
-
-		seq_printf(s, "%lld\tosd%d\t%d.%x\t", req->r_tid,
-			   req->r_osd ? req->r_osd->o_osd : -1,
-			   le32_to_cpu(req->r_pgid.pool),
-			   le16_to_cpu(req->r_pgid.ps));
-
-		head = req->r_request->front.iov_base;
-		op = (void *)(head + 1);
-
-		num_ops = le16_to_cpu(head->num_ops);
-		olen = le32_to_cpu(head->object_len);
-		seq_printf(s, "%.*s", olen,
-			   (const char *)(head->ops + num_ops));
-
-		if (req->r_reassert_version.epoch)
-			seq_printf(s, "\t%u'%llu",
-			   (unsigned)le32_to_cpu(req->r_reassert_version.epoch),
-			   le64_to_cpu(req->r_reassert_version.version));
-		else
-			seq_printf(s, "\t");
-
-		for (i = 0; i < num_ops; i++) {
-			opcode = le16_to_cpu(op->op);
-			seq_printf(s, "\t%s", ceph_osd_op_name(opcode));
-			op++;
-		}
-
-		seq_printf(s, "\n");
-	}
-	mutex_unlock(&osdc->request_mutex);
-	return 0;
-}
-
 static int caps_show(struct seq_file *s, void *p)
 {
-	struct ceph_client *client = s->private;
+	struct ceph_fs_client *fsc = s->private;
 	int total, avail, used, reserved, min;
 
-	ceph_reservation_status(client, &total, &avail, &used, &reserved, &min);
+	ceph_reservation_status(fsc, &total, &avail, &used, &reserved, &min);
 	seq_printf(s, "total\t\t%d\n"
 		   "avail\t\t%d\n"
 		   "used\t\t%d\n"
@@ -276,8 +137,8 @@ static int caps_show(struct seq_file *s, void *p)
 
 static int dentry_lru_show(struct seq_file *s, void *ptr)
 {
-	struct ceph_client *client = s->private;
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = s->private;
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_dentry_info *di;
 
 	spin_lock(&mdsc->dentry_lru_lock);
@@ -291,199 +152,125 @@ static int dentry_lru_show(struct seq_file *s, void *ptr)
 	return 0;
 }
 
-#define DEFINE_SHOW_FUNC(name) 						\
-static int name##_open(struct inode *inode, struct file *file)		\
-{									\
-	struct seq_file *sf;						\
-	int ret;							\
-									\
-	ret = single_open(file, name, NULL);				\
-	sf = file->private_data;					\
-	sf->private = inode->i_private;					\
-	return ret;							\
-}									\
-									\
-static const struct file_operations name##_fops = {			\
-	.open		= name##_open,					\
-	.read		= seq_read,					\
-	.llseek		= seq_lseek,					\
-	.release	= single_release,				\
-};
-
-DEFINE_SHOW_FUNC(monmap_show)
-DEFINE_SHOW_FUNC(mdsmap_show)
-DEFINE_SHOW_FUNC(osdmap_show)
-DEFINE_SHOW_FUNC(monc_show)
-DEFINE_SHOW_FUNC(mdsc_show)
-DEFINE_SHOW_FUNC(osdc_show)
-DEFINE_SHOW_FUNC(dentry_lru_show)
-DEFINE_SHOW_FUNC(caps_show)
+CEPH_DEFINE_SHOW_FUNC(mdsmap_show)
+CEPH_DEFINE_SHOW_FUNC(mdsc_show)
+CEPH_DEFINE_SHOW_FUNC(caps_show)
+CEPH_DEFINE_SHOW_FUNC(dentry_lru_show)
+
 
+/*
+ * debugfs
+ */
 static int congestion_kb_set(void *data, u64 val)
 {
-	struct ceph_client *client = (struct ceph_client *)data;
-
-	if (client)
-		client->mount_args->congestion_kb = (int)val;
+	struct ceph_fs_client *fsc = (struct ceph_fs_client *)data;
 
+	fsc->mount_options->congestion_kb = (int)val;
 	return 0;
 }
 
 static int congestion_kb_get(void *data, u64 *val)
 {
-	struct ceph_client *client = (struct ceph_client *)data;
-
-	if (client)
-		*val = (u64)client->mount_args->congestion_kb;
+	struct ceph_fs_client *fsc = (struct ceph_fs_client *)data;
 
+	*val = (u64)fsc->mount_options->congestion_kb;
 	return 0;
 }
 
-
 DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get,
 			congestion_kb_set, "%llu\n");
 
-int __init ceph_debugfs_init(void)
-{
-	ceph_debugfs_dir = debugfs_create_dir("ceph", NULL);
-	if (!ceph_debugfs_dir)
-		return -ENOMEM;
-	return 0;
-}
 
-void ceph_debugfs_cleanup(void)
+void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
 {
-	debugfs_remove(ceph_debugfs_dir);
+	dout("ceph_fs_debugfs_cleanup\n");
+	debugfs_remove(fsc->debugfs_bdi);
+	debugfs_remove(fsc->debugfs_congestion_kb);
+	debugfs_remove(fsc->debugfs_mdsmap);
+	debugfs_remove(fsc->debugfs_caps);
+	debugfs_remove(fsc->debugfs_mdsc);
+	debugfs_remove(fsc->debugfs_dentry_lru);
 }
 
-int ceph_debugfs_client_init(struct ceph_client *client)
+int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
 {
-	int ret = 0;
-	char name[80];
-
-	snprintf(name, sizeof(name), FSID_FORMAT ".client%lld",
-		 PR_FSID(&client->fsid), client->monc.auth->global_id);
-
-	client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir);
-	if (!client->debugfs_dir)
+	char name[100];
+	int err = -ENOMEM;
+
+	dout("ceph_fs_debugfs_init\n");
+	fsc->debugfs_congestion_kb =
+		debugfs_create_file("writeback_congestion_kb",
+				    0600,
+				    fsc->client->debugfs_dir,
+				    fsc,
+				    &congestion_kb_fops);
+	if (!fsc->debugfs_congestion_kb)
 		goto out;
 
-	client->monc.debugfs_file = debugfs_create_file("monc",
-						      0600,
-						      client->debugfs_dir,
-						      client,
-						      &monc_show_fops);
-	if (!client->monc.debugfs_file)
-		goto out;
-
-	client->mdsc.debugfs_file = debugfs_create_file("mdsc",
-						      0600,
-						      client->debugfs_dir,
-						      client,
-						      &mdsc_show_fops);
-	if (!client->mdsc.debugfs_file)
-		goto out;
+	dout("a\n");
 
-	client->osdc.debugfs_file = debugfs_create_file("osdc",
-						      0600,
-						      client->debugfs_dir,
-						      client,
-						      &osdc_show_fops);
-	if (!client->osdc.debugfs_file)
+	snprintf(name, sizeof(name), "../../bdi/%s",
+		 dev_name(fsc->backing_dev_info.dev));
+	fsc->debugfs_bdi =
+		debugfs_create_symlink("bdi",
+				       fsc->client->debugfs_dir,
+				       name);
+	if (!fsc->debugfs_bdi)
 		goto out;
 
-	client->debugfs_monmap = debugfs_create_file("monmap",
+	dout("b\n");
+	fsc->debugfs_mdsmap = debugfs_create_file("mdsmap",
 					0600,
-					client->debugfs_dir,
-					client,
-					&monmap_show_fops);
-	if (!client->debugfs_monmap)
-		goto out;
-
-	client->debugfs_mdsmap = debugfs_create_file("mdsmap",
-					0600,
-					client->debugfs_dir,
-					client,
+					fsc->client->debugfs_dir,
+					fsc,
 					&mdsmap_show_fops);
-	if (!client->debugfs_mdsmap)
-		goto out;
-
-	client->debugfs_osdmap = debugfs_create_file("osdmap",
-					0600,
-					client->debugfs_dir,
-					client,
-					&osdmap_show_fops);
-	if (!client->debugfs_osdmap)
+	if (!fsc->debugfs_mdsmap)
 		goto out;
 
-	client->debugfs_dentry_lru = debugfs_create_file("dentry_lru",
-					0600,
-					client->debugfs_dir,
-					client,
-					&dentry_lru_show_fops);
-	if (!client->debugfs_dentry_lru)
+	dout("ca\n");
+	fsc->debugfs_mdsc = debugfs_create_file("mdsc",
+						0600,
+						fsc->client->debugfs_dir,
+						fsc,
+						&mdsc_show_fops);
+	if (!fsc->debugfs_mdsc)
 		goto out;
 
-	client->debugfs_caps = debugfs_create_file("caps",
+	dout("da\n");
+	fsc->debugfs_caps = debugfs_create_file("caps",
 						   0400,
-						   client->debugfs_dir,
-						   client,
+						   fsc->client->debugfs_dir,
+						   fsc,
 						   &caps_show_fops);
-	if (!client->debugfs_caps)
+	if (!fsc->debugfs_caps)
 		goto out;
 
-	client->debugfs_congestion_kb = debugfs_create_file("writeback_congestion_kb",
-						   0600,
-						   client->debugfs_dir,
-						   client,
-						   &congestion_kb_fops);
-	if (!client->debugfs_congestion_kb)
+	dout("ea\n");
+	fsc->debugfs_dentry_lru = debugfs_create_file("dentry_lru",
+					0600,
+					fsc->client->debugfs_dir,
+					fsc,
+					&dentry_lru_show_fops);
+	if (!fsc->debugfs_dentry_lru)
 		goto out;
 
-	sprintf(name, "../../bdi/%s", dev_name(client->sb->s_bdi->dev));
-	client->debugfs_bdi = debugfs_create_symlink("bdi", client->debugfs_dir,
-						     name);
-
 	return 0;
 
 out:
-	ceph_debugfs_client_cleanup(client);
-	return ret;
+	ceph_fs_debugfs_cleanup(fsc);
+	return err;
 }
 
-void ceph_debugfs_client_cleanup(struct ceph_client *client)
-{
-	debugfs_remove(client->debugfs_bdi);
-	debugfs_remove(client->debugfs_caps);
-	debugfs_remove(client->debugfs_dentry_lru);
-	debugfs_remove(client->debugfs_osdmap);
-	debugfs_remove(client->debugfs_mdsmap);
-	debugfs_remove(client->debugfs_monmap);
-	debugfs_remove(client->osdc.debugfs_file);
-	debugfs_remove(client->mdsc.debugfs_file);
-	debugfs_remove(client->monc.debugfs_file);
-	debugfs_remove(client->debugfs_congestion_kb);
-	debugfs_remove(client->debugfs_dir);
-}
-
-#else  // CONFIG_DEBUG_FS
-
-int __init ceph_debugfs_init(void)
-{
-	return 0;
-}
 
-void ceph_debugfs_cleanup(void)
-{
-}
+#else  /* CONFIG_DEBUG_FS */
 
-int ceph_debugfs_client_init(struct ceph_client *client)
+int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
 {
 	return 0;
 }
 
-void ceph_debugfs_client_cleanup(struct ceph_client *client)
+void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
 {
 }
 
-#endif  // CONFIG_DEBUG_FS
+#endif  /* CONFIG_DEBUG_FS */
diff --git a/fs/ceph/decode.h b/fs/ceph/decode.h
deleted file mode 100644
index 65b3e022eaf..00000000000
--- a/fs/ceph/decode.h
+++ /dev/null
@@ -1,194 +0,0 @@
-#ifndef __CEPH_DECODE_H
-#define __CEPH_DECODE_H
-
-#include <asm/unaligned.h>
-#include <linux/time.h>
-
-#include "types.h"
-
-/*
- * in all cases,
- *   void **p     pointer to position pointer
- *   void *end    pointer to end of buffer (last byte + 1)
- */
-
-static inline u64 ceph_decode_64(void **p)
-{
-	u64 v = get_unaligned_le64(*p);
-	*p += sizeof(u64);
-	return v;
-}
-static inline u32 ceph_decode_32(void **p)
-{
-	u32 v = get_unaligned_le32(*p);
-	*p += sizeof(u32);
-	return v;
-}
-static inline u16 ceph_decode_16(void **p)
-{
-	u16 v = get_unaligned_le16(*p);
-	*p += sizeof(u16);
-	return v;
-}
-static inline u8 ceph_decode_8(void **p)
-{
-	u8 v = *(u8 *)*p;
-	(*p)++;
-	return v;
-}
-static inline void ceph_decode_copy(void **p, void *pv, size_t n)
-{
-	memcpy(pv, *p, n);
-	*p += n;
-}
-
-/*
- * bounds check input.
- */
-#define ceph_decode_need(p, end, n, bad)		\
-	do {						\
-		if (unlikely(*(p) + (n) > (end))) 	\
-			goto bad;			\
-	} while (0)
-
-#define ceph_decode_64_safe(p, end, v, bad)			\
-	do {							\
-		ceph_decode_need(p, end, sizeof(u64), bad);	\
-		v = ceph_decode_64(p);				\
-	} while (0)
-#define ceph_decode_32_safe(p, end, v, bad)			\
-	do {							\
-		ceph_decode_need(p, end, sizeof(u32), bad);	\
-		v = ceph_decode_32(p);				\
-	} while (0)
-#define ceph_decode_16_safe(p, end, v, bad)			\
-	do {							\
-		ceph_decode_need(p, end, sizeof(u16), bad);	\
-		v = ceph_decode_16(p);				\
-	} while (0)
-#define ceph_decode_8_safe(p, end, v, bad)			\
-	do {							\
-		ceph_decode_need(p, end, sizeof(u8), bad);	\
-		v = ceph_decode_8(p);				\
-	} while (0)
-
-#define ceph_decode_copy_safe(p, end, pv, n, bad)		\
-	do {							\
-		ceph_decode_need(p, end, n, bad);		\
-		ceph_decode_copy(p, pv, n);			\
-	} while (0)
-
-/*
- * struct ceph_timespec <-> struct timespec
- */
-static inline void ceph_decode_timespec(struct timespec *ts,
-					const struct ceph_timespec *tv)
-{
-	ts->tv_sec = le32_to_cpu(tv->tv_sec);
-	ts->tv_nsec = le32_to_cpu(tv->tv_nsec);
-}
-static inline void ceph_encode_timespec(struct ceph_timespec *tv,
-					const struct timespec *ts)
-{
-	tv->tv_sec = cpu_to_le32(ts->tv_sec);
-	tv->tv_nsec = cpu_to_le32(ts->tv_nsec);
-}
-
-/*
- * sockaddr_storage <-> ceph_sockaddr
- */
-static inline void ceph_encode_addr(struct ceph_entity_addr *a)
-{
-	a->in_addr.ss_family = htons(a->in_addr.ss_family);
-}
-static inline void ceph_decode_addr(struct ceph_entity_addr *a)
-{
-	a->in_addr.ss_family = ntohs(a->in_addr.ss_family);
-	WARN_ON(a->in_addr.ss_family == 512);
-}
-
-/*
- * encoders
- */
-static inline void ceph_encode_64(void **p, u64 v)
-{
-	put_unaligned_le64(v, (__le64 *)*p);
-	*p += sizeof(u64);
-}
-static inline void ceph_encode_32(void **p, u32 v)
-{
-	put_unaligned_le32(v, (__le32 *)*p);
-	*p += sizeof(u32);
-}
-static inline void ceph_encode_16(void **p, u16 v)
-{
-	put_unaligned_le16(v, (__le16 *)*p);
-	*p += sizeof(u16);
-}
-static inline void ceph_encode_8(void **p, u8 v)
-{
-	*(u8 *)*p = v;
-	(*p)++;
-}
-static inline void ceph_encode_copy(void **p, const void *s, int len)
-{
-	memcpy(*p, s, len);
-	*p += len;
-}
-
-/*
- * filepath, string encoders
- */
-static inline void ceph_encode_filepath(void **p, void *end,
-					u64 ino, const char *path)
-{
-	u32 len = path ? strlen(path) : 0;
-	BUG_ON(*p + sizeof(ino) + sizeof(len) + len > end);
-	ceph_encode_8(p, 1);
-	ceph_encode_64(p, ino);
-	ceph_encode_32(p, len);
-	if (len)
-		memcpy(*p, path, len);
-	*p += len;
-}
-
-static inline void ceph_encode_string(void **p, void *end,
-				      const char *s, u32 len)
-{
-	BUG_ON(*p + sizeof(len) + len > end);
-	ceph_encode_32(p, len);
-	if (len)
-		memcpy(*p, s, len);
-	*p += len;
-}
-
-#define ceph_encode_need(p, end, n, bad)		\
-	do {						\
-		if (unlikely(*(p) + (n) > (end))) 	\
-			goto bad;			\
-	} while (0)
-
-#define ceph_encode_64_safe(p, end, v, bad)			\
-	do {							\
-		ceph_encode_need(p, end, sizeof(u64), bad);	\
-		ceph_encode_64(p, v);				\
-	} while (0)
-#define ceph_encode_32_safe(p, end, v, bad)			\
-	do {							\
-		ceph_encode_need(p, end, sizeof(u32), bad);	\
-		ceph_encode_32(p, v);			\
-	} while (0)
-#define ceph_encode_16_safe(p, end, v, bad)			\
-	do {							\
-		ceph_encode_need(p, end, sizeof(u16), bad);	\
-		ceph_encode_16(p, v);			\
-	} while (0)
-
-#define ceph_encode_copy_safe(p, end, pv, n, bad)		\
-	do {							\
-		ceph_encode_need(p, end, n, bad);		\
-		ceph_encode_copy(p, pv, n);			\
-	} while (0)
-
-
-#endif
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index f94ed3c7f6a..e0a2dc6fcaf 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1,4 +1,4 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 
 #include <linux/spinlock.h>
 #include <linux/fs_struct.h>
@@ -7,6 +7,7 @@
 #include <linux/sched.h>
 
 #include "super.h"
+#include "mds_client.h"
 
 /*
  * Directory operations: readdir, lookup, create, link, unlink,
@@ -27,7 +28,7 @@
 
 const struct inode_operations ceph_dir_iops;
 const struct file_operations ceph_dir_fops;
-struct dentry_operations ceph_dentry_ops;
+const struct dentry_operations ceph_dentry_ops;
 
 /*
  * Initialize ceph dentry state.
@@ -46,7 +47,7 @@ int ceph_init_dentry(struct dentry *dentry)
 	else
 		dentry->d_op = &ceph_snap_dentry_ops;
 
-	di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS);
+	di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS | __GFP_ZERO);
 	if (!di)
 		return -ENOMEM;          /* oh well */
 
@@ -95,7 +96,6 @@ static unsigned fpos_off(loff_t p)
 static int __dcache_readdir(struct file *filp,
 			    void *dirent, filldir_t filldir)
 {
-	struct inode *inode = filp->f_dentry->d_inode;
 	struct ceph_file_info *fi = filp->private_data;
 	struct dentry *parent = filp->f_dentry;
 	struct inode *dir = parent->d_inode;
@@ -151,7 +151,6 @@ more:
 
 	atomic_inc(&dentry->d_count);
 	spin_unlock(&dcache_lock);
-	spin_unlock(&inode->i_lock);
 
 	dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos,
 	     dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
@@ -169,35 +168,30 @@ more:
 		} else {
 			dput(last);
 		}
-		last = NULL;
 	}
-
-	spin_lock(&inode->i_lock);
-	spin_lock(&dcache_lock);
-
 	last = dentry;
 
 	if (err < 0)
-		goto out_unlock;
+		goto out;
 
-	p = p->prev;
 	filp->f_pos++;
 
 	/* make sure a dentry wasn't dropped while we didn't have dcache_lock */
-	if ((ceph_inode(dir)->i_ceph_flags & CEPH_I_COMPLETE))
-		goto more;
-	dout(" lost I_COMPLETE on %p; falling back to mds\n", dir);
-	err = -EAGAIN;
+	if (!ceph_i_test(dir, CEPH_I_COMPLETE)) {
+		dout(" lost I_COMPLETE on %p; falling back to mds\n", dir);
+		err = -EAGAIN;
+		goto out;
+	}
+
+	spin_lock(&dcache_lock);
+	p = p->prev;	/* advance to next dentry */
+	goto more;
 
 out_unlock:
 	spin_unlock(&dcache_lock);
-
-	if (last) {
-		spin_unlock(&inode->i_lock);
+out:
+	if (last)
 		dput(last);
-		spin_lock(&inode->i_lock);
-	}
-
 	return err;
 }
 
@@ -225,15 +219,15 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	struct ceph_file_info *fi = filp->private_data;
 	struct inode *inode = filp->f_dentry->d_inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_client *client = ceph_inode_to_client(inode);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	unsigned frag = fpos_frag(filp->f_pos);
 	int off = fpos_off(filp->f_pos);
 	int err;
 	u32 ftype;
 	struct ceph_mds_reply_info_parsed *rinfo;
-	const int max_entries = client->mount_args->max_readdir;
-	const int max_bytes = client->mount_args->max_readdir_bytes;
+	const int max_entries = fsc->mount_options->max_readdir;
+	const int max_bytes = fsc->mount_options->max_readdir_bytes;
 
 	dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off);
 	if (fi->at_end)
@@ -265,17 +259,17 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	/* can we use the dcache? */
 	spin_lock(&inode->i_lock);
 	if ((filp->f_pos == 2 || fi->dentry) &&
-	    !ceph_test_opt(client, NOASYNCREADDIR) &&
+	    !ceph_test_mount_opt(fsc, NOASYNCREADDIR) &&
 	    ceph_snap(inode) != CEPH_SNAPDIR &&
 	    (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
 	    __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
+		spin_unlock(&inode->i_lock);
 		err = __dcache_readdir(filp, dirent, filldir);
-		if (err != -EAGAIN) {
-			spin_unlock(&inode->i_lock);
+		if (err != -EAGAIN)
 			return err;
-		}
+	} else {
+		spin_unlock(&inode->i_lock);
 	}
-	spin_unlock(&inode->i_lock);
 	if (fi->dentry) {
 		err = note_last_dentry(fi, fi->dentry->d_name.name,
 				       fi->dentry->d_name.len);
@@ -485,14 +479,13 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin)
 struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
 				  struct dentry *dentry, int err)
 {
-	struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
+	struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
 	struct inode *parent = dentry->d_parent->d_inode;
 
 	/* .snap dir? */
 	if (err == -ENOENT &&
-	    ceph_vino(parent).ino != CEPH_INO_ROOT && /* no .snap in root dir */
 	    strcmp(dentry->d_name.name,
-		   client->mount_args->snapdir_name) == 0) {
+		   fsc->mount_options->snapdir_name) == 0) {
 		struct inode *inode = ceph_get_snapdir(parent);
 		dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n",
 		     dentry, dentry->d_name.len, dentry->d_name.name, inode);
@@ -537,8 +530,8 @@ static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
 static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
 				  struct nameidata *nd)
 {
-	struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_mds_request *req;
 	int op;
 	int err;
@@ -570,7 +563,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
 		spin_lock(&dir->i_lock);
 		dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags);
 		if (strncmp(dentry->d_name.name,
-			    client->mount_args->snapdir_name,
+			    fsc->mount_options->snapdir_name,
 			    dentry->d_name.len) &&
 		    !is_root_ceph_dentry(dir, dentry) &&
 		    (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
@@ -627,8 +620,8 @@ int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)
 static int ceph_mknod(struct inode *dir, struct dentry *dentry,
 		      int mode, dev_t rdev)
 {
-	struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_mds_request *req;
 	int err;
 
@@ -683,8 +676,8 @@ static int ceph_create(struct inode *dir, struct dentry *dentry, int mode,
 static int ceph_symlink(struct inode *dir, struct dentry *dentry,
 			    const char *dest)
 {
-	struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_mds_request *req;
 	int err;
 
@@ -714,8 +707,8 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry,
 
 static int ceph_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 {
-	struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_mds_request *req;
 	int err = -EROFS;
 	int op;
@@ -756,8 +749,8 @@ out:
 static int ceph_link(struct dentry *old_dentry, struct inode *dir,
 		     struct dentry *dentry)
 {
-	struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_mds_request *req;
 	int err;
 
@@ -811,8 +804,8 @@ static int drop_caps_for_unlink(struct inode *inode)
  */
 static int ceph_unlink(struct inode *dir, struct dentry *dentry)
 {
-	struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct inode *inode = dentry->d_inode;
 	struct ceph_mds_request *req;
 	int err = -EROFS;
@@ -852,8 +845,8 @@ out:
 static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
 		       struct inode *new_dir, struct dentry *new_dentry)
 {
-	struct ceph_client *client = ceph_sb_to_client(old_dir->i_sb);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(old_dir->i_sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_mds_request *req;
 	int err;
 
@@ -1019,11 +1012,15 @@ out_touch:
 static void ceph_dentry_release(struct dentry *dentry)
 {
 	struct ceph_dentry_info *di = ceph_dentry(dentry);
-	struct inode *parent_inode = dentry->d_parent->d_inode;
-	u64 snapid = ceph_snap(parent_inode);
+	struct inode *parent_inode = NULL;
+	u64 snapid = CEPH_NOSNAP;
 
+	if (!IS_ROOT(dentry)) {
+		parent_inode = dentry->d_parent->d_inode;
+		if (parent_inode)
+			snapid = ceph_snap(parent_inode);
+	}
 	dout("dentry_release %p parent %p\n", dentry, parent_inode);
-
 	if (parent_inode && snapid != CEPH_SNAPDIR) {
 		struct ceph_inode_info *ci = ceph_inode(parent_inode);
 
@@ -1070,7 +1067,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	int left;
 
-	if (!ceph_test_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
+	if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
 		return -EISDIR;
 
 	if (!cf->dir_info) {
@@ -1171,7 +1168,7 @@ void ceph_dentry_lru_add(struct dentry *dn)
 	dout("dentry_lru_add %p %p '%.*s'\n", di, dn,
 	     dn->d_name.len, dn->d_name.name);
 	if (di) {
-		mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
+		mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
 		spin_lock(&mdsc->dentry_lru_lock);
 		list_add_tail(&di->lru, &mdsc->dentry_lru);
 		mdsc->num_dentry++;
@@ -1187,7 +1184,7 @@ void ceph_dentry_lru_touch(struct dentry *dn)
 	dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn,
 	     dn->d_name.len, dn->d_name.name, di->offset);
 	if (di) {
-		mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
+		mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
 		spin_lock(&mdsc->dentry_lru_lock);
 		list_move_tail(&di->lru, &mdsc->dentry_lru);
 		spin_unlock(&mdsc->dentry_lru_lock);
@@ -1202,7 +1199,7 @@ void ceph_dentry_lru_del(struct dentry *dn)
 	dout("dentry_lru_del %p %p '%.*s'\n", di, dn,
 	     dn->d_name.len, dn->d_name.name);
 	if (di) {
-		mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
+		mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
 		spin_lock(&mdsc->dentry_lru_lock);
 		list_del_init(&di->lru);
 		mdsc->num_dentry--;
@@ -1239,16 +1236,16 @@ const struct inode_operations ceph_dir_iops = {
 	.create = ceph_create,
 };
 
-struct dentry_operations ceph_dentry_ops = {
+const struct dentry_operations ceph_dentry_ops = {
 	.d_revalidate = ceph_d_revalidate,
 	.d_release = ceph_dentry_release,
 };
 
-struct dentry_operations ceph_snapdir_dentry_ops = {
+const struct dentry_operations ceph_snapdir_dentry_ops = {
 	.d_revalidate = ceph_snapdir_d_revalidate,
 	.d_release = ceph_dentry_release,
 };
 
-struct dentry_operations ceph_snap_dentry_ops = {
+const struct dentry_operations ceph_snap_dentry_ops = {
 	.d_release = ceph_dentry_release,
 };
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 4480cb1c63e..2297d942699 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -1,10 +1,11 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 
 #include <linux/exportfs.h>
 #include <linux/slab.h>
 #include <asm/unaligned.h>
 
 #include "super.h"
+#include "mds_client.h"
 
 /*
  * NFS export support
@@ -42,32 +43,37 @@ struct ceph_nfs_confh {
 static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
 			  int connectable)
 {
+	int type;
 	struct ceph_nfs_fh *fh = (void *)rawfh;
 	struct ceph_nfs_confh *cfh = (void *)rawfh;
 	struct dentry *parent = dentry->d_parent;
 	struct inode *inode = dentry->d_inode;
-	int type;
+	int connected_handle_length = sizeof(*cfh)/4;
+	int handle_length = sizeof(*fh)/4;
 
 	/* don't re-export snaps */
 	if (ceph_snap(inode) != CEPH_NOSNAP)
 		return -EINVAL;
 
-	if (*max_len >= sizeof(*cfh)) {
+	if (*max_len >= connected_handle_length) {
 		dout("encode_fh %p connectable\n", dentry);
 		cfh->ino = ceph_ino(dentry->d_inode);
 		cfh->parent_ino = ceph_ino(parent->d_inode);
 		cfh->parent_name_hash = parent->d_name.hash;
-		*max_len = sizeof(*cfh);
+		*max_len = connected_handle_length;
 		type = 2;
-	} else if (*max_len > sizeof(*fh)) {
-		if (connectable)
-			return -ENOSPC;
+	} else if (*max_len >= handle_length) {
+		if (connectable) {
+			*max_len = connected_handle_length;
+			return 255;
+		}
 		dout("encode_fh %p\n", dentry);
 		fh->ino = ceph_ino(dentry->d_inode);
-		*max_len = sizeof(*fh);
+		*max_len = handle_length;
 		type = 1;
 	} else {
-		return -ENOSPC;
+		*max_len = handle_length;
+		return 255;
 	}
 	return type;
 }
@@ -115,7 +121,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
 static struct dentry *__cfh_to_dentry(struct super_block *sb,
 				      struct ceph_nfs_confh *cfh)
 {
-	struct ceph_mds_client *mdsc = &ceph_sb_to_client(sb)->mdsc;
+	struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
 	struct inode *inode;
 	struct dentry *dentry;
 	struct ceph_vino vino;
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 7c08698fad3..e77c28cf369 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1,5 +1,6 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 
+#include <linux/module.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/file.h>
@@ -38,8 +39,8 @@
 static struct ceph_mds_request *
 prepare_open_request(struct super_block *sb, int flags, int create_mode)
 {
-	struct ceph_client *client = ceph_sb_to_client(sb);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_mds_request *req;
 	int want_auth = USE_ANY_MDS;
 	int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN;
@@ -117,8 +118,8 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
 int ceph_open(struct inode *inode, struct file *file)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_client *client = ceph_sb_to_client(inode->i_sb);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_mds_request *req;
 	struct ceph_file_info *cf = file->private_data;
 	struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
@@ -216,8 +217,8 @@ struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
 				struct nameidata *nd, int mode,
 				int locked_dir)
 {
-	struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct file *file = nd->intent.open.file;
 	struct inode *parent_inode = get_dentry_parent_inode(file->f_dentry);
 	struct ceph_mds_request *req;
@@ -270,163 +271,6 @@ int ceph_release(struct inode *inode, struct file *file)
 }
 
 /*
- * build a vector of user pages
- */
-static struct page **get_direct_page_vector(const char __user *data,
-					    int num_pages,
-					    loff_t off, size_t len)
-{
-	struct page **pages;
-	int rc;
-
-	pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
-	if (!pages)
-		return ERR_PTR(-ENOMEM);
-
-	down_read(&current->mm->mmap_sem);
-	rc = get_user_pages(current, current->mm, (unsigned long)data,
-			    num_pages, 0, 0, pages, NULL);
-	up_read(&current->mm->mmap_sem);
-	if (rc < 0)
-		goto fail;
-	return pages;
-
-fail:
-	kfree(pages);
-	return ERR_PTR(rc);
-}
-
-static void put_page_vector(struct page **pages, int num_pages)
-{
-	int i;
-
-	for (i = 0; i < num_pages; i++)
-		put_page(pages[i]);
-	kfree(pages);
-}
-
-void ceph_release_page_vector(struct page **pages, int num_pages)
-{
-	int i;
-
-	for (i = 0; i < num_pages; i++)
-		__free_pages(pages[i], 0);
-	kfree(pages);
-}
-
-/*
- * allocate a vector new pages
- */
-struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags)
-{
-	struct page **pages;
-	int i;
-
-	pages = kmalloc(sizeof(*pages) * num_pages, flags);
-	if (!pages)
-		return ERR_PTR(-ENOMEM);
-	for (i = 0; i < num_pages; i++) {
-		pages[i] = __page_cache_alloc(flags);
-		if (pages[i] == NULL) {
-			ceph_release_page_vector(pages, i);
-			return ERR_PTR(-ENOMEM);
-		}
-	}
-	return pages;
-}
-
-/*
- * copy user data into a page vector
- */
-static int copy_user_to_page_vector(struct page **pages,
-				    const char __user *data,
-				    loff_t off, size_t len)
-{
-	int i = 0;
-	int po = off & ~PAGE_CACHE_MASK;
-	int left = len;
-	int l, bad;
-
-	while (left > 0) {
-		l = min_t(int, PAGE_CACHE_SIZE-po, left);
-		bad = copy_from_user(page_address(pages[i]) + po, data, l);
-		if (bad == l)
-			return -EFAULT;
-		data += l - bad;
-		left -= l - bad;
-		po += l - bad;
-		if (po == PAGE_CACHE_SIZE) {
-			po = 0;
-			i++;
-		}
-	}
-	return len;
-}
-
-/*
- * copy user data from a page vector into a user pointer
- */
-static int copy_page_vector_to_user(struct page **pages, char __user *data,
-				    loff_t off, size_t len)
-{
-	int i = 0;
-	int po = off & ~PAGE_CACHE_MASK;
-	int left = len;
-	int l, bad;
-
-	while (left > 0) {
-		l = min_t(int, left, PAGE_CACHE_SIZE-po);
-		bad = copy_to_user(data, page_address(pages[i]) + po, l);
-		if (bad == l)
-			return -EFAULT;
-		data += l - bad;
-		left -= l - bad;
-		if (po) {
-			po += l - bad;
-			if (po == PAGE_CACHE_SIZE)
-				po = 0;
-		}
-		i++;
-	}
-	return len;
-}
-
-/*
- * Zero an extent within a page vector.  Offset is relative to the
- * start of the first page.
- */
-static void zero_page_vector_range(int off, int len, struct page **pages)
-{
-	int i = off >> PAGE_CACHE_SHIFT;
-
-	off &= ~PAGE_CACHE_MASK;
-
-	dout("zero_page_vector_page %u~%u\n", off, len);
-
-	/* leading partial page? */
-	if (off) {
-		int end = min((int)PAGE_CACHE_SIZE, off + len);
-		dout("zeroing %d %p head from %d\n", i, pages[i],
-		     (int)off);
-		zero_user_segment(pages[i], off, end);
-		len -= (end - off);
-		i++;
-	}
-	while (len >= PAGE_CACHE_SIZE) {
-		dout("zeroing %d %p len=%d\n", i, pages[i], len);
-		zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
-		len -= PAGE_CACHE_SIZE;
-		i++;
-	}
-	/* trailing partial page? */
-	if (len) {
-		dout("zeroing %d %p tail to %d\n", i, pages[i], (int)len);
-		zero_user_segment(pages[i], 0, len);
-	}
-}
-
-
-/*
  * Read a range of bytes striped over one or more objects.  Iterate over
  * objects we stripe over.  (That's not atomic, but good enough for now.)
  *
@@ -438,7 +282,7 @@ static int striped_read(struct inode *inode,
 			struct page **pages, int num_pages,
 			int *checkeof)
 {
-	struct ceph_client *client = ceph_inode_to_client(inode);
+	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	u64 pos, this_len;
 	int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */
@@ -459,7 +303,7 @@ static int striped_read(struct inode *inode,
 
 more:
 	this_len = left;
-	ret = ceph_osdc_readpages(&client->osdc, ceph_vino(inode),
+	ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
 				  &ci->i_layout, pos, &this_len,
 				  ci->i_truncate_seq,
 				  ci->i_truncate_size,
@@ -477,8 +321,8 @@ more:
 
 		if (read < pos - off) {
 			dout(" zero gap %llu to %llu\n", off + read, pos);
-			zero_page_vector_range(page_off + read,
-					       pos - off - read, pages);
+			ceph_zero_page_vector_range(page_off + read,
+						    pos - off - read, pages);
 		}
 		pos += ret;
 		read = pos - off;
@@ -495,8 +339,8 @@ more:
 		/* was original extent fully inside i_size? */
 		if (pos + left <= inode->i_size) {
 			dout("zero tail\n");
-			zero_page_vector_range(page_off + read, len - read,
-					       pages);
+			ceph_zero_page_vector_range(page_off + read, len - read,
+						    pages);
 			read = len;
 			goto out;
 		}
@@ -531,7 +375,7 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data,
 	     (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
 
 	if (file->f_flags & O_DIRECT) {
-		pages = get_direct_page_vector(data, num_pages, off, len);
+		pages = ceph_get_direct_page_vector(data, num_pages, off, len);
 
 		/*
 		 * flush any page cache pages in this range.  this
@@ -552,13 +396,13 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data,
 	ret = striped_read(inode, off, len, pages, num_pages, checkeof);
 
 	if (ret >= 0 && (file->f_flags & O_DIRECT) == 0)
-		ret = copy_page_vector_to_user(pages, data, off, ret);
+		ret = ceph_copy_page_vector_to_user(pages, data, off, ret);
 	if (ret >= 0)
 		*poff = off + ret;
 
 done:
 	if (file->f_flags & O_DIRECT)
-		put_page_vector(pages, num_pages);
+		ceph_put_page_vector(pages, num_pages);
 	else
 		ceph_release_page_vector(pages, num_pages);
 	dout("sync_read result %d\n", ret);
@@ -594,7 +438,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
 {
 	struct inode *inode = file->f_dentry->d_inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_client *client = ceph_inode_to_client(inode);
+	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
 	struct ceph_osd_request *req;
 	struct page **pages;
 	int num_pages;
@@ -642,7 +486,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
 	 */
 more:
 	len = left;
-	req = ceph_osdc_new_request(&client->osdc, &ci->i_layout,
+	req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
 				    ceph_vino(inode), pos, &len,
 				    CEPH_OSD_OP_WRITE, flags,
 				    ci->i_snap_realm->cached_context,
@@ -655,7 +499,7 @@ more:
 	num_pages = calc_pages_for(pos, len);
 
 	if (file->f_flags & O_DIRECT) {
-		pages = get_direct_page_vector(data, num_pages, pos, len);
+		pages = ceph_get_direct_page_vector(data, num_pages, pos, len);
 		if (IS_ERR(pages)) {
 			ret = PTR_ERR(pages);
 			goto out;
@@ -665,7 +509,7 @@ more:
 		 * throw out any page cache pages in this range. this
 		 * may block.
 		 */
-		truncate_inode_pages_range(inode->i_mapping, pos, 
+		truncate_inode_pages_range(inode->i_mapping, pos,
 					   (pos+len) | (PAGE_CACHE_SIZE-1));
 	} else {
 		pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
@@ -673,7 +517,7 @@ more:
 			ret = PTR_ERR(pages);
 			goto out;
 		}
-		ret = copy_user_to_page_vector(pages, data, pos, len);
+		ret = ceph_copy_user_to_page_vector(pages, data, pos, len);
 		if (ret < 0) {
 			ceph_release_page_vector(pages, num_pages);
 			goto out;
@@ -689,7 +533,7 @@ more:
 	req->r_num_pages = num_pages;
 	req->r_inode = inode;
 
-	ret = ceph_osdc_start_request(&client->osdc, req, false);
+	ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
 	if (!ret) {
 		if (req->r_safe_callback) {
 			/*
@@ -697,15 +541,15 @@ more:
 			 * start_request so that a tid has been assigned.
 			 */
 			spin_lock(&ci->i_unsafe_lock);
-			list_add(&ci->i_unsafe_writes, &req->r_unsafe_item);
+			list_add(&req->r_unsafe_item, &ci->i_unsafe_writes);
 			spin_unlock(&ci->i_unsafe_lock);
 			ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
 		}
-		ret = ceph_osdc_wait_request(&client->osdc, req);
+		ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
 	}
 
 	if (file->f_flags & O_DIRECT)
-		put_page_vector(pages, num_pages);
+		ceph_put_page_vector(pages, num_pages);
 	else if (file->f_flags & O_SYNC)
 		ceph_release_page_vector(pages, num_pages);
 
@@ -740,28 +584,32 @@ static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov,
 			     unsigned long nr_segs, loff_t pos)
 {
 	struct file *filp = iocb->ki_filp;
+	struct ceph_file_info *fi = filp->private_data;
 	loff_t *ppos = &iocb->ki_pos;
 	size_t len = iov->iov_len;
 	struct inode *inode = filp->f_dentry->d_inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	void *base = iov->iov_base;
+	void __user *base = iov->iov_base;
 	ssize_t ret;
-	int got = 0;
+	int want, got = 0;
 	int checkeof = 0, read = 0;
 
 	dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
 	     inode, ceph_vinop(inode), pos, (unsigned)len, inode);
 again:
 	__ceph_do_pending_vmtruncate(inode);
-	ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_CACHE,
-			    &got, -1);
+	if (fi->fmode & CEPH_FILE_MODE_LAZY)
+		want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
+	else
+		want = CEPH_CAP_FILE_CACHE;
+	ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1);
 	if (ret < 0)
 		goto out;
 	dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n",
 	     inode, ceph_vinop(inode), pos, (unsigned)len,
 	     ceph_cap_string(got));
 
-	if ((got & CEPH_CAP_FILE_CACHE) == 0 ||
+	if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 ||
 	    (iocb->ki_filp->f_flags & O_DIRECT) ||
 	    (inode->i_sb->s_flags & MS_SYNCHRONOUS))
 		/* hmm, this isn't really async... */
@@ -807,11 +655,13 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
 		       unsigned long nr_segs, loff_t pos)
 {
 	struct file *file = iocb->ki_filp;
+	struct ceph_file_info *fi = file->private_data;
 	struct inode *inode = file->f_dentry->d_inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc;
+	struct ceph_osd_client *osdc =
+		&ceph_sb_to_client(inode->i_sb)->client->osdc;
 	loff_t endoff = pos + iov->iov_len;
-	int got = 0;
+	int want, got = 0;
 	int ret, err;
 
 	if (ceph_snap(inode) != CEPH_NOSNAP)
@@ -824,8 +674,11 @@ retry_snap:
 	dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n",
 	     inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
 	     inode->i_size);
-	ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER,
-			    &got, endoff);
+	if (fi->fmode & CEPH_FILE_MODE_LAZY)
+		want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
+	else
+		want = CEPH_CAP_FILE_BUFFER;
+	ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, endoff);
 	if (ret < 0)
 		goto out;
 
@@ -833,7 +686,7 @@ retry_snap:
 	     inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
 	     ceph_cap_string(got));
 
-	if ((got & CEPH_CAP_FILE_BUFFER) == 0 ||
+	if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 ||
 	    (iocb->ki_filp->f_flags & O_DIRECT) ||
 	    (inode->i_sb->s_flags & MS_SYNCHRONOUS)) {
 		ret = ceph_sync_write(file, iov->iov_base, iov->iov_len,
@@ -930,6 +783,8 @@ const struct file_operations ceph_file_fops = {
 	.aio_write = ceph_aio_write,
 	.mmap = ceph_mmap,
 	.fsync = ceph_fsync,
+	.lock = ceph_lock,
+	.flock = ceph_flock,
 	.splice_read = generic_file_splice_read,
 	.splice_write = generic_file_splice_write,
 	.unlocked_ioctl = ceph_ioctl,
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 389f9dbd994..1d6a45b5a04 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1,4 +1,4 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 
 #include <linux/module.h>
 #include <linux/fs.h>
@@ -13,7 +13,8 @@
 #include <linux/pagevec.h>
 
 #include "super.h"
-#include "decode.h"
+#include "mds_client.h"
+#include <linux/ceph/decode.h>
 
 /*
  * Ceph inode operations
@@ -384,7 +385,7 @@ void ceph_destroy_inode(struct inode *inode)
 	 */
 	if (ci->i_snap_realm) {
 		struct ceph_mds_client *mdsc =
-			&ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
+			ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
 		struct ceph_snap_realm *realm = ci->i_snap_realm;
 
 		dout(" dropping residual ref to snap realm %p\n", realm);
@@ -442,8 +443,9 @@ int ceph_fill_file_size(struct inode *inode, int issued,
 			 * the file is either opened or mmaped
 			 */
 			if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_RD|
-				      CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER|
-				      CEPH_CAP_FILE_EXCL)) ||
+				       CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER|
+				       CEPH_CAP_FILE_EXCL|
+				       CEPH_CAP_FILE_LAZYIO)) ||
 			    mapping_mapped(inode->i_mapping) ||
 			    __ceph_caps_file_wanted(ci)) {
 				ci->i_truncate_pending++;
@@ -676,6 +678,7 @@ static int fill_inode(struct inode *inode,
 		if (ci->i_files == 0 && ci->i_subdirs == 0 &&
 		    ceph_snap(inode) == CEPH_NOSNAP &&
 		    (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) &&
+		    (issued & CEPH_CAP_FILE_EXCL) == 0 &&
 		    (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) {
 			dout(" marking %p complete (empty)\n", inode);
 			ci->i_ceph_flags |= CEPH_I_COMPLETE;
@@ -683,7 +686,7 @@ static int fill_inode(struct inode *inode,
 		}
 
 		/* it may be better to set st_size in getattr instead? */
-		if (ceph_test_opt(ceph_sb_to_client(inode->i_sb), RBYTES))
+		if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), RBYTES))
 			inode->i_size = ci->i_rbytes;
 		break;
 	default:
@@ -843,7 +846,7 @@ static void ceph_set_dentry_offset(struct dentry *dn)
  * the caller) if we fail.
  */
 static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
-				    bool *prehash)
+				    bool *prehash, bool set_offset)
 {
 	struct dentry *realdn;
 
@@ -875,7 +878,8 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
 	}
 	if ((!prehash || *prehash) && d_unhashed(dn))
 		d_rehash(dn);
-	ceph_set_dentry_offset(dn);
+	if (set_offset)
+		ceph_set_dentry_offset(dn);
 out:
 	return dn;
 }
@@ -898,7 +902,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
 	struct inode *in = NULL;
 	struct ceph_mds_reply_inode *ininfo;
 	struct ceph_vino vino;
-	struct ceph_client *client = ceph_sb_to_client(sb);
+	struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
 	int i = 0;
 	int err = 0;
 
@@ -962,7 +966,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
 	 */
 	if (rinfo->head->is_dentry && !req->r_aborted &&
 	    (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name,
-					       client->mount_args->snapdir_name,
+					       fsc->mount_options->snapdir_name,
 					       req->r_dentry->d_name.len))) {
 		/*
 		 * lookup link rename   : null -> possibly existing inode
@@ -1060,7 +1064,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
 				d_delete(dn);
 				goto done;
 			}
-			dn = splice_dentry(dn, in, &have_lease);
+			dn = splice_dentry(dn, in, &have_lease, true);
 			if (IS_ERR(dn)) {
 				err = PTR_ERR(dn);
 				goto done;
@@ -1103,7 +1107,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
 			goto done;
 		}
 		dout(" linking snapped dir %p to dn %p\n", in, dn);
-		dn = splice_dentry(dn, in, NULL);
+		dn = splice_dentry(dn, in, NULL, true);
 		if (IS_ERR(dn)) {
 			err = PTR_ERR(dn);
 			goto done;
@@ -1228,14 +1232,14 @@ retry_lookup:
 			in = dn->d_inode;
 		} else {
 			in = ceph_get_inode(parent->d_sb, vino);
-			if (in == NULL) {
+			if (IS_ERR(in)) {
 				dout("new_inode badness\n");
 				d_delete(dn);
 				dput(dn);
-				err = -ENOMEM;
+				err = PTR_ERR(in);
 				goto out;
 			}
-			dn = splice_dentry(dn, in, NULL);
+			dn = splice_dentry(dn, in, NULL, false);
 			if (IS_ERR(dn))
 				dn = NULL;
 		}
@@ -1530,7 +1534,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
 	struct inode *parent_inode = dentry->d_parent->d_inode;
 	const unsigned int ia_valid = attr->ia_valid;
 	struct ceph_mds_request *req;
-	struct ceph_mds_client *mdsc = &ceph_sb_to_client(dentry->d_sb)->mdsc;
+	struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc;
 	int issued;
 	int release = 0, dirtied = 0;
 	int mask = 0;
@@ -1725,8 +1729,8 @@ out:
  */
 int ceph_do_getattr(struct inode *inode, int mask)
 {
-	struct ceph_client *client = ceph_sb_to_client(inode->i_sb);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_mds_request *req;
 	int err;
 
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index d085f07756b..8888c9ba68d 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -1,8 +1,10 @@
 #include <linux/in.h>
 
-#include "ioctl.h"
 #include "super.h"
-#include "ceph_debug.h"
+#include "mds_client.h"
+#include <linux/ceph/ceph_debug.h>
+
+#include "ioctl.h"
 
 
 /*
@@ -37,7 +39,7 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
 {
 	struct inode *inode = file->f_dentry->d_inode;
 	struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
-	struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
+	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
 	struct ceph_mds_request *req;
 	struct ceph_ioctl_layout l;
 	int err, i;
@@ -90,6 +92,68 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
 }
 
 /*
+ * Set a layout policy on a directory inode. All items in the tree
+ * rooted at this inode will inherit this layout on creation,
+ * (It doesn't apply retroactively )
+ * unless a subdirectory has its own layout policy.
+ */
+static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct ceph_mds_request *req;
+	struct ceph_ioctl_layout l;
+	int err, i;
+	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+
+	/* copy and validate */
+	if (copy_from_user(&l, arg, sizeof(l)))
+		return -EFAULT;
+
+	if ((l.object_size & ~PAGE_MASK) ||
+	    (l.stripe_unit & ~PAGE_MASK) ||
+	    !l.stripe_unit ||
+	    (l.object_size &&
+	        (unsigned)l.object_size % (unsigned)l.stripe_unit))
+		return -EINVAL;
+
+	/* make sure it's a valid data pool */
+	if (l.data_pool > 0) {
+		mutex_lock(&mdsc->mutex);
+		err = -EINVAL;
+		for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++)
+			if (mdsc->mdsmap->m_data_pg_pools[i] == l.data_pool) {
+				err = 0;
+				break;
+			}
+		mutex_unlock(&mdsc->mutex);
+		if (err)
+			return err;
+	}
+
+	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETDIRLAYOUT,
+				       USE_AUTH_MDS);
+
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+	req->r_inode = igrab(inode);
+
+	req->r_args.setlayout.layout.fl_stripe_unit =
+			cpu_to_le32(l.stripe_unit);
+	req->r_args.setlayout.layout.fl_stripe_count =
+			cpu_to_le32(l.stripe_count);
+	req->r_args.setlayout.layout.fl_object_size =
+			cpu_to_le32(l.object_size);
+	req->r_args.setlayout.layout.fl_pg_pool =
+			cpu_to_le32(l.data_pool);
+	req->r_args.setlayout.layout.fl_pg_preferred =
+			cpu_to_le32(l.preferred_osd);
+
+	err = ceph_mdsc_do_request(mdsc, inode, req);
+	ceph_mdsc_put_request(req);
+	return err;
+}
+
+/*
  * Return object name, size/offset information, and location (OSD
  * number, network address) for a given file offset.
  */
@@ -98,7 +162,8 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
 	struct ceph_ioctl_dataloc dl;
 	struct inode *inode = file->f_dentry->d_inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc;
+	struct ceph_osd_client *osdc =
+		&ceph_sb_to_client(inode->i_sb)->client->osdc;
 	u64 len = 1, olen;
 	u64 tmp;
 	struct ceph_object_layout ol;
@@ -143,6 +208,27 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
 	return 0;
 }
 
+static long ceph_ioctl_lazyio(struct file *file)
+{
+	struct ceph_file_info *fi = file->private_data;
+	struct inode *inode = file->f_dentry->d_inode;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+
+	if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) {
+		spin_lock(&inode->i_lock);
+		ci->i_nr_by_mode[fi->fmode]--;
+		fi->fmode |= CEPH_FILE_MODE_LAZY;
+		ci->i_nr_by_mode[fi->fmode]++;
+		spin_unlock(&inode->i_lock);
+		dout("ioctl_layzio: file %p marked lazy\n", file);
+
+		ceph_check_caps(ci, 0, NULL);
+	} else {
+		dout("ioctl_layzio: file %p already lazy\n", file);
+	}
+	return 0;
+}
+
 long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
 	dout("ioctl file %p cmd %u arg %lu\n", file, cmd, arg);
@@ -153,8 +239,15 @@ long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	case CEPH_IOC_SET_LAYOUT:
 		return ceph_ioctl_set_layout(file, (void __user *)arg);
 
+	case CEPH_IOC_SET_LAYOUT_POLICY:
+		return ceph_ioctl_set_layout_policy(file, (void __user *)arg);
+
 	case CEPH_IOC_GET_DATALOC:
 		return ceph_ioctl_get_dataloc(file, (void __user *)arg);
+
+	case CEPH_IOC_LAZYIO:
+		return ceph_ioctl_lazyio(file);
 	}
+
 	return -ENOTTY;
 }
diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h
index 25e4f1a9d05..a6ce54e94eb 100644
--- a/fs/ceph/ioctl.h
+++ b/fs/ceph/ioctl.h
@@ -4,7 +4,7 @@
 #include <linux/ioctl.h>
 #include <linux/types.h>
 
-#define CEPH_IOCTL_MAGIC 0x97
+#define CEPH_IOCTL_MAGIC 0x98
 
 /* just use u64 to align sanely on all archs */
 struct ceph_ioctl_layout {
@@ -17,6 +17,8 @@ struct ceph_ioctl_layout {
 				   struct ceph_ioctl_layout)
 #define CEPH_IOC_SET_LAYOUT _IOW(CEPH_IOCTL_MAGIC, 2,		\
 				   struct ceph_ioctl_layout)
+#define CEPH_IOC_SET_LAYOUT_POLICY _IOW(CEPH_IOCTL_MAGIC, 5,	\
+				   struct ceph_ioctl_layout)
 
 /*
  * Extract identity, address of the OSD and object storing a given
@@ -37,4 +39,6 @@ struct ceph_ioctl_dataloc {
 #define CEPH_IOC_GET_DATALOC _IOWR(CEPH_IOCTL_MAGIC, 3,	\
 				   struct ceph_ioctl_dataloc)
 
+#define CEPH_IOC_LAZYIO _IO(CEPH_IOCTL_MAGIC, 4)
+
 #endif
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
new file mode 100644
index 00000000000..40abde93c34
--- /dev/null
+++ b/fs/ceph/locks.c
@@ -0,0 +1,273 @@
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/file.h>
+#include <linux/namei.h>
+
+#include "super.h"
+#include "mds_client.h"
+#include <linux/ceph/pagelist.h>
+
+/**
+ * Implement fcntl and flock locking functions.
+ */
+static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
+			     u64 pid, u64 pid_ns,
+			     int cmd, u64 start, u64 length, u8 wait)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct ceph_mds_client *mdsc =
+		ceph_sb_to_client(inode->i_sb)->mdsc;
+	struct ceph_mds_request *req;
+	int err;
+
+	req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+	req->r_inode = igrab(inode);
+
+	dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, "
+	     "length: %llu, wait: %d, type`: %d", (int)lock_type,
+	     (int)operation, pid, start, length, wait, cmd);
+
+	req->r_args.filelock_change.rule = lock_type;
+	req->r_args.filelock_change.type = cmd;
+	req->r_args.filelock_change.pid = cpu_to_le64(pid);
+	/* This should be adjusted, but I'm not sure if
+	   namespaces actually get id numbers*/
+	req->r_args.filelock_change.pid_namespace =
+		cpu_to_le64((u64)pid_ns);
+	req->r_args.filelock_change.start = cpu_to_le64(start);
+	req->r_args.filelock_change.length = cpu_to_le64(length);
+	req->r_args.filelock_change.wait = wait;
+
+	err = ceph_mdsc_do_request(mdsc, inode, req);
+	ceph_mdsc_put_request(req);
+	dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, "
+	     "length: %llu, wait: %d, type`: %d err code %d", (int)lock_type,
+	     (int)operation, pid, start, length, wait, cmd, err);
+	return err;
+}
+
+/**
+ * Attempt to set an fcntl lock.
+ * For now, this just goes away to the server. Later it may be more awesome.
+ */
+int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
+{
+	u64 length;
+	u8 lock_cmd;
+	int err;
+	u8 wait = 0;
+	u16 op = CEPH_MDS_OP_SETFILELOCK;
+
+	fl->fl_nspid = get_pid(task_tgid(current));
+	dout("ceph_lock, fl_pid:%d", fl->fl_pid);
+
+	/* set wait bit as appropriate, then make command as Ceph expects it*/
+	if (F_SETLKW == cmd)
+		wait = 1;
+	if (F_GETLK == cmd)
+		op = CEPH_MDS_OP_GETFILELOCK;
+
+	if (F_RDLCK == fl->fl_type)
+		lock_cmd = CEPH_LOCK_SHARED;
+	else if (F_WRLCK == fl->fl_type)
+		lock_cmd = CEPH_LOCK_EXCL;
+	else
+		lock_cmd = CEPH_LOCK_UNLOCK;
+
+	if (LLONG_MAX == fl->fl_end)
+		length = 0;
+	else
+		length = fl->fl_end - fl->fl_start + 1;
+
+	err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
+				(u64)fl->fl_pid,
+				(u64)(unsigned long)fl->fl_nspid,
+				lock_cmd, fl->fl_start,
+				length, wait);
+	if (!err) {
+		dout("mds locked, locking locally");
+		err = posix_lock_file(file, fl, NULL);
+		if (err && (CEPH_MDS_OP_SETFILELOCK == op)) {
+			/* undo! This should only happen if the kernel detects
+			 * local deadlock. */
+			ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
+					  (u64)fl->fl_pid,
+					  (u64)(unsigned long)fl->fl_nspid,
+					  CEPH_LOCK_UNLOCK, fl->fl_start,
+					  length, 0);
+			dout("got %d on posix_lock_file, undid lock", err);
+		}
+	} else {
+		dout("mds returned error code %d", err);
+	}
+	return err;
+}
+
+int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
+{
+	u64 length;
+	u8 lock_cmd;
+	int err;
+	u8 wait = 1;
+
+	fl->fl_nspid = get_pid(task_tgid(current));
+	dout("ceph_flock, fl_pid:%d", fl->fl_pid);
+
+	/* set wait bit, then clear it out of cmd*/
+	if (cmd & LOCK_NB)
+		wait = 0;
+	cmd = cmd & (LOCK_SH | LOCK_EX | LOCK_UN);
+	/* set command sequence that Ceph wants to see:
+	   shared lock, exclusive lock, or unlock */
+	if (LOCK_SH == cmd)
+		lock_cmd = CEPH_LOCK_SHARED;
+	else if (LOCK_EX == cmd)
+		lock_cmd = CEPH_LOCK_EXCL;
+	else
+		lock_cmd = CEPH_LOCK_UNLOCK;
+	/* mds requires start and length rather than start and end */
+	if (LLONG_MAX == fl->fl_end)
+		length = 0;
+	else
+		length = fl->fl_end - fl->fl_start + 1;
+
+	err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK,
+				file, (u64)fl->fl_pid,
+				(u64)(unsigned long)fl->fl_nspid,
+				lock_cmd, fl->fl_start,
+				length, wait);
+	if (!err) {
+		err = flock_lock_file_wait(file, fl);
+		if (err) {
+			ceph_lock_message(CEPH_LOCK_FLOCK,
+					  CEPH_MDS_OP_SETFILELOCK,
+					  file, (u64)fl->fl_pid,
+					  (u64)(unsigned long)fl->fl_nspid,
+					  CEPH_LOCK_UNLOCK, fl->fl_start,
+					  length, 0);
+			dout("got %d on flock_lock_file_wait, undid lock", err);
+		}
+	} else {
+		dout("mds error code %d", err);
+	}
+	return err;
+}
+
+/**
+ * Must be called with BKL already held. Fills in the passed
+ * counter variables, so you can prepare pagelist metadata before calling
+ * ceph_encode_locks.
+ */
+void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
+{
+	struct file_lock *lock;
+
+	*fcntl_count = 0;
+	*flock_count = 0;
+
+	for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
+		if (lock->fl_flags & FL_POSIX)
+			++(*fcntl_count);
+		else if (lock->fl_flags & FL_FLOCK)
+			++(*flock_count);
+	}
+	dout("counted %d flock locks and %d fcntl locks",
+	     *flock_count, *fcntl_count);
+}
+
+/**
+ * Encode the flock and fcntl locks for the given inode into the pagelist.
+ * Format is: #fcntl locks, sequential fcntl locks, #flock locks,
+ * sequential flock locks.
+ * Must be called with lock_flocks() already held.
+ * If we encounter more of a specific lock type than expected,
+ * we return the value 1.
+ */
+int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
+		      int num_fcntl_locks, int num_flock_locks)
+{
+	struct file_lock *lock;
+	struct ceph_filelock cephlock;
+	int err = 0;
+	int seen_fcntl = 0;
+	int seen_flock = 0;
+
+	dout("encoding %d flock and %d fcntl locks", num_flock_locks,
+	     num_fcntl_locks);
+	err = ceph_pagelist_append(pagelist, &num_fcntl_locks, sizeof(u32));
+	if (err)
+		goto fail;
+	for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
+		if (lock->fl_flags & FL_POSIX) {
+			++seen_fcntl;
+			if (seen_fcntl > num_fcntl_locks) {
+				err = -ENOSPC;
+				goto fail;
+			}
+			err = lock_to_ceph_filelock(lock, &cephlock);
+			if (err)
+				goto fail;
+			err = ceph_pagelist_append(pagelist, &cephlock,
+					   sizeof(struct ceph_filelock));
+		}
+		if (err)
+			goto fail;
+	}
+
+	err = ceph_pagelist_append(pagelist, &num_flock_locks, sizeof(u32));
+	if (err)
+		goto fail;
+	for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
+		if (lock->fl_flags & FL_FLOCK) {
+			++seen_flock;
+			if (seen_flock > num_flock_locks) {
+				err = -ENOSPC;
+				goto fail;
+			}
+			err = lock_to_ceph_filelock(lock, &cephlock);
+			if (err)
+				goto fail;
+			err = ceph_pagelist_append(pagelist, &cephlock,
+					   sizeof(struct ceph_filelock));
+		}
+		if (err)
+			goto fail;
+	}
+fail:
+	return err;
+}
+
+/*
+ * Given a pointer to a lock, convert it to a ceph filelock
+ */
+int lock_to_ceph_filelock(struct file_lock *lock,
+			  struct ceph_filelock *cephlock)
+{
+	int err = 0;
+
+	cephlock->start = cpu_to_le64(lock->fl_start);
+	cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1);
+	cephlock->client = cpu_to_le64(0);
+	cephlock->pid = cpu_to_le64(lock->fl_pid);
+	cephlock->pid_namespace =
+	        cpu_to_le64((u64)(unsigned long)lock->fl_nspid);
+
+	switch (lock->fl_type) {
+	case F_RDLCK:
+		cephlock->type = CEPH_LOCK_SHARED;
+		break;
+	case F_WRLCK:
+		cephlock->type = CEPH_LOCK_EXCL;
+		break;
+	case F_UNLCK:
+		cephlock->type = CEPH_LOCK_UNLOCK;
+		break;
+	default:
+		dout("Have unknown lock type %d", lock->fl_type);
+		err = -EINVAL;
+	}
+
+	return err;
+}
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index dd440bd438a..3142b15940c 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1,16 +1,21 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 
+#include <linux/fs.h>
 #include <linux/wait.h>
 #include <linux/slab.h>
 #include <linux/sched.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/smp_lock.h>
 
-#include "mds_client.h"
-#include "mon_client.h"
 #include "super.h"
-#include "messenger.h"
-#include "decode.h"
-#include "auth.h"
-#include "pagelist.h"
+#include "mds_client.h"
+
+#include <linux/ceph/messenger.h>
+#include <linux/ceph/decode.h>
+#include <linux/ceph/pagelist.h>
+#include <linux/ceph/auth.h>
+#include <linux/ceph/debugfs.h>
 
 /*
  * A cluster of MDS (metadata server) daemons is responsible for
@@ -37,6 +42,11 @@
  * are no longer valid.
  */
 
+struct ceph_reconnect_state {
+	struct ceph_pagelist *pagelist;
+	bool flock;
+};
+
 static void __wake_requests(struct ceph_mds_client *mdsc,
 			    struct list_head *head);
 
@@ -280,8 +290,9 @@ void ceph_put_mds_session(struct ceph_mds_session *s)
 	     atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1);
 	if (atomic_dec_and_test(&s->s_ref)) {
 		if (s->s_authorizer)
-			s->s_mdsc->client->monc.auth->ops->destroy_authorizer(
-				s->s_mdsc->client->monc.auth, s->s_authorizer);
+		     s->s_mdsc->fsc->client->monc.auth->ops->destroy_authorizer(
+			     s->s_mdsc->fsc->client->monc.auth,
+			     s->s_authorizer);
 		kfree(s);
 	}
 }
@@ -338,7 +349,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
 	s->s_seq = 0;
 	mutex_init(&s->s_mutex);
 
-	ceph_con_init(mdsc->client->msgr, &s->s_con);
+	ceph_con_init(mdsc->fsc->client->msgr, &s->s_con);
 	s->s_con.private = s;
 	s->s_con.ops = &mds_con_ops;
 	s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS;
@@ -449,7 +460,7 @@ void ceph_mdsc_release_request(struct kref *kref)
 	kfree(req->r_path1);
 	kfree(req->r_path2);
 	put_request_session(req);
-	ceph_unreserve_caps(&req->r_caps_reservation);
+	ceph_unreserve_caps(req->r_mdsc, &req->r_caps_reservation);
 	kfree(req);
 }
 
@@ -512,7 +523,8 @@ static void __register_request(struct ceph_mds_client *mdsc,
 {
 	req->r_tid = ++mdsc->last_tid;
 	if (req->r_num_caps)
-		ceph_reserve_caps(&req->r_caps_reservation, req->r_num_caps);
+		ceph_reserve_caps(mdsc, &req->r_caps_reservation,
+				  req->r_num_caps);
 	dout("__register_request %p tid %lld\n", req, req->r_tid);
 	ceph_mdsc_get_request(req);
 	__insert_request(mdsc, req);
@@ -553,6 +565,13 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
  *
  * Called under mdsc->mutex.
  */
+struct dentry *get_nonsnap_parent(struct dentry *dentry)
+{
+	while (!IS_ROOT(dentry) && ceph_snap(dentry->d_inode) != CEPH_NOSNAP)
+		dentry = dentry->d_parent;
+	return dentry;
+}
+
 static int __choose_mds(struct ceph_mds_client *mdsc,
 			struct ceph_mds_request *req)
 {
@@ -583,14 +602,29 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
 	if (req->r_inode) {
 		inode = req->r_inode;
 	} else if (req->r_dentry) {
-		if (req->r_dentry->d_inode) {
+		struct inode *dir = req->r_dentry->d_parent->d_inode;
+
+		if (dir->i_sb != mdsc->fsc->sb) {
+			/* not this fs! */
+			inode = req->r_dentry->d_inode;
+		} else if (ceph_snap(dir) != CEPH_NOSNAP) {
+			/* direct snapped/virtual snapdir requests
+			 * based on parent dir inode */
+			struct dentry *dn =
+				get_nonsnap_parent(req->r_dentry->d_parent);
+			inode = dn->d_inode;
+			dout("__choose_mds using nonsnap parent %p\n", inode);
+		} else if (req->r_dentry->d_inode) {
+			/* dentry target */
 			inode = req->r_dentry->d_inode;
 		} else {
-			inode = req->r_dentry->d_parent->d_inode;
+			/* dir + name */
+			inode = dir;
 			hash = req->r_dentry->d_name.hash;
 			is_hash = true;
 		}
 	}
+
 	dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash,
 	     (int)hash, mode);
 	if (!inode)
@@ -704,6 +738,51 @@ static int __open_session(struct ceph_mds_client *mdsc,
 }
 
 /*
+ * open sessions for any export targets for the given mds
+ *
+ * called under mdsc->mutex
+ */
+static void __open_export_target_sessions(struct ceph_mds_client *mdsc,
+					  struct ceph_mds_session *session)
+{
+	struct ceph_mds_info *mi;
+	struct ceph_mds_session *ts;
+	int i, mds = session->s_mds;
+	int target;
+
+	if (mds >= mdsc->mdsmap->m_max_mds)
+		return;
+	mi = &mdsc->mdsmap->m_info[mds];
+	dout("open_export_target_sessions for mds%d (%d targets)\n",
+	     session->s_mds, mi->num_export_targets);
+
+	for (i = 0; i < mi->num_export_targets; i++) {
+		target = mi->export_targets[i];
+		ts = __ceph_lookup_mds_session(mdsc, target);
+		if (!ts) {
+			ts = register_session(mdsc, target);
+			if (IS_ERR(ts))
+				return;
+		}
+		if (session->s_state == CEPH_MDS_SESSION_NEW ||
+		    session->s_state == CEPH_MDS_SESSION_CLOSING)
+			__open_session(mdsc, session);
+		else
+			dout(" mds%d target mds%d %p is %s\n", session->s_mds,
+			     i, ts, session_state_name(ts->s_state));
+		ceph_put_mds_session(ts);
+	}
+}
+
+void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
+					   struct ceph_mds_session *session)
+{
+	mutex_lock(&mdsc->mutex);
+	__open_export_target_sessions(mdsc, session);
+	mutex_unlock(&mdsc->mutex);
+}
+
+/*
  * session caps
  */
 
@@ -764,7 +843,7 @@ static int iterate_session_caps(struct ceph_mds_session *session,
 			last_inode = NULL;
 		}
 		if (old_cap) {
-			ceph_put_cap(old_cap);
+			ceph_put_cap(session->s_mdsc, old_cap);
 			old_cap = NULL;
 		}
 
@@ -793,7 +872,7 @@ out:
 	if (last_inode)
 		iput(last_inode);
 	if (old_cap)
-		ceph_put_cap(old_cap);
+		ceph_put_cap(session->s_mdsc, old_cap);
 
 	return ret;
 }
@@ -810,7 +889,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
 	__ceph_remove_cap(cap);
 	if (!__ceph_is_any_real_caps(ci)) {
 		struct ceph_mds_client *mdsc =
-			&ceph_sb_to_client(inode->i_sb)->mdsc;
+			ceph_sb_to_client(inode->i_sb)->mdsc;
 
 		spin_lock(&mdsc->cap_dirty_lock);
 		if (!list_empty(&ci->i_dirty_item)) {
@@ -1067,15 +1146,16 @@ static int trim_caps(struct ceph_mds_client *mdsc,
  * Called under s_mutex.
  */
 int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
-			  struct ceph_mds_session *session,
-			  int extra)
+			  struct ceph_mds_session *session)
 {
-	struct ceph_msg *msg;
+	struct ceph_msg *msg, *partial = NULL;
 	struct ceph_mds_cap_release *head;
 	int err = -ENOMEM;
+	int extra = mdsc->fsc->mount_options->cap_release_safety;
+	int num;
 
-	if (extra < 0)
-		extra = mdsc->client->mount_args->cap_release_safety;
+	dout("add_cap_releases %p mds%d extra %d\n", session, session->s_mds,
+	     extra);
 
 	spin_lock(&session->s_cap_lock);
 
@@ -1084,9 +1164,14 @@ int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
 				       struct ceph_msg,
 				 list_head);
 		head = msg->front.iov_base;
-		extra += CEPH_CAPS_PER_RELEASE - le32_to_cpu(head->num);
+		num = le32_to_cpu(head->num);
+		if (num) {
+			dout(" partial %p with (%d/%d)\n", msg, num,
+			     (int)CEPH_CAPS_PER_RELEASE);
+			extra += CEPH_CAPS_PER_RELEASE - num;
+			partial = msg;
+		}
 	}
-
 	while (session->s_num_cap_releases < session->s_nr_caps + extra) {
 		spin_unlock(&session->s_cap_lock);
 		msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE,
@@ -1103,19 +1188,14 @@ int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
 		session->s_num_cap_releases += CEPH_CAPS_PER_RELEASE;
 	}
 
-	if (!list_empty(&session->s_cap_releases)) {
-		msg = list_first_entry(&session->s_cap_releases,
-				       struct ceph_msg,
-				       list_head);
-		head = msg->front.iov_base;
-		if (head->num) {
-			dout(" queueing non-full %p (%d)\n", msg,
-			     le32_to_cpu(head->num));
-			list_move_tail(&msg->list_head,
-				      &session->s_cap_releases_done);
-			session->s_num_cap_releases -=
-				CEPH_CAPS_PER_RELEASE - le32_to_cpu(head->num);
-		}
+	if (partial) {
+		head = partial->front.iov_base;
+		num = le32_to_cpu(head->num);
+		dout(" queueing partial %p with %d/%d\n", partial, num,
+		     (int)CEPH_CAPS_PER_RELEASE);
+		list_move_tail(&partial->list_head,
+			       &session->s_cap_releases_done);
+		session->s_num_cap_releases -= CEPH_CAPS_PER_RELEASE - num;
 	}
 	err = 0;
 	spin_unlock(&session->s_cap_lock);
@@ -1250,6 +1330,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
 		return ERR_PTR(-ENOMEM);
 
 	mutex_init(&req->r_fill_mutex);
+	req->r_mdsc = mdsc;
 	req->r_started = jiffies;
 	req->r_resend_mds = -1;
 	INIT_LIST_HEAD(&req->r_unsafe_dir_item);
@@ -1580,6 +1661,15 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
 
 	req->r_mds = mds;
 	req->r_attempts++;
+	if (req->r_inode) {
+		struct ceph_cap *cap =
+			ceph_get_cap_for_mds(ceph_inode(req->r_inode), mds);
+
+		if (cap)
+			req->r_sent_on_mseq = cap->mseq;
+		else
+			req->r_sent_on_mseq = -1;
+	}
 	dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req,
 	     req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts);
 
@@ -1914,21 +2004,40 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
 	result = le32_to_cpu(head->result);
 
 	/*
-	 * Tolerate 2 consecutive ESTALEs from the same mds.
-	 * FIXME: we should be looking at the cap migrate_seq.
+	 * Handle an ESTALE
+	 * if we're not talking to the authority, send to them
+	 * if the authority has changed while we weren't looking,
+	 * send to new authority
+	 * Otherwise we just have to return an ESTALE
 	 */
 	if (result == -ESTALE) {
-		req->r_direct_mode = USE_AUTH_MDS;
-		req->r_num_stale++;
-		if (req->r_num_stale <= 2) {
+		dout("got ESTALE on request %llu", req->r_tid);
+		if (!req->r_inode) {
+			/* do nothing; not an authority problem */
+		} else if (req->r_direct_mode != USE_AUTH_MDS) {
+			dout("not using auth, setting for that now");
+			req->r_direct_mode = USE_AUTH_MDS;
 			__do_request(mdsc, req);
 			mutex_unlock(&mdsc->mutex);
 			goto out;
+		} else  {
+			struct ceph_inode_info *ci = ceph_inode(req->r_inode);
+			struct ceph_cap *cap =
+				ceph_get_cap_for_mds(ci, req->r_mds);;
+
+			dout("already using auth");
+			if ((!cap || cap != ci->i_auth_cap) ||
+			    (cap->mseq != req->r_sent_on_mseq)) {
+				dout("but cap changed, so resending");
+				__do_request(mdsc, req);
+				mutex_unlock(&mdsc->mutex);
+				goto out;
+			}
 		}
-	} else {
-		req->r_num_stale = 0;
+		dout("have to return ESTALE on request %llu", req->r_tid);
 	}
 
+
 	if (head->safe) {
 		req->r_got_safe = true;
 		__unregister_request(mdsc, req);
@@ -1981,11 +2090,11 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
 
 	/* insert trace into our cache */
 	mutex_lock(&req->r_fill_mutex);
-	err = ceph_fill_trace(mdsc->client->sb, req, req->r_session);
+	err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session);
 	if (err == 0) {
 		if (result == 0 && rinfo->dir_nr)
 			ceph_readdir_prepopulate(req, req->r_session);
-		ceph_unreserve_caps(&req->r_caps_reservation);
+		ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
 	}
 	mutex_unlock(&req->r_fill_mutex);
 
@@ -2005,7 +2114,7 @@ out_err:
 	}
 	mutex_unlock(&mdsc->mutex);
 
-	ceph_add_cap_releases(mdsc, req->r_session, -1);
+	ceph_add_cap_releases(mdsc, req->r_session);
 	mutex_unlock(&session->s_mutex);
 
 	/* kick calling process */
@@ -2126,7 +2235,7 @@ static void handle_session(struct ceph_mds_session *session,
 			pr_info("mds%d reconnect denied\n", session->s_mds);
 		remove_session_caps(session);
 		wake = 1; /* for good measure */
-		complete_all(&mdsc->session_close_waiters);
+		wake_up_all(&mdsc->session_close_wq);
 		kick_requests(mdsc, mds);
 		break;
 
@@ -2193,9 +2302,14 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
 static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
 			  void *arg)
 {
-	struct ceph_mds_cap_reconnect rec;
+	union {
+		struct ceph_mds_cap_reconnect v2;
+		struct ceph_mds_cap_reconnect_v1 v1;
+	} rec;
+	size_t reclen;
 	struct ceph_inode_info *ci;
-	struct ceph_pagelist *pagelist = arg;
+	struct ceph_reconnect_state *recon_state = arg;
+	struct ceph_pagelist *pagelist = recon_state->pagelist;
 	char *path;
 	int pathlen, err;
 	u64 pathbase;
@@ -2215,7 +2329,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
 		path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase, 0);
 		if (IS_ERR(path)) {
 			err = PTR_ERR(path);
-			BUG_ON(err);
+			goto out_dput;
 		}
 	} else {
 		path = NULL;
@@ -2223,25 +2337,71 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
 	}
 	err = ceph_pagelist_encode_string(pagelist, path, pathlen);
 	if (err)
-		goto out;
+		goto out_free;
 
 	spin_lock(&inode->i_lock);
 	cap->seq = 0;        /* reset cap seq */
 	cap->issue_seq = 0;  /* and issue_seq */
-	rec.cap_id = cpu_to_le64(cap->cap_id);
-	rec.pathbase = cpu_to_le64(pathbase);
-	rec.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
-	rec.issued = cpu_to_le32(cap->issued);
-	rec.size = cpu_to_le64(inode->i_size);
-	ceph_encode_timespec(&rec.mtime, &inode->i_mtime);
-	ceph_encode_timespec(&rec.atime, &inode->i_atime);
-	rec.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
+
+	if (recon_state->flock) {
+		rec.v2.cap_id = cpu_to_le64(cap->cap_id);
+		rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
+		rec.v2.issued = cpu_to_le32(cap->issued);
+		rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
+		rec.v2.pathbase = cpu_to_le64(pathbase);
+		rec.v2.flock_len = 0;
+		reclen = sizeof(rec.v2);
+	} else {
+		rec.v1.cap_id = cpu_to_le64(cap->cap_id);
+		rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
+		rec.v1.issued = cpu_to_le32(cap->issued);
+		rec.v1.size = cpu_to_le64(inode->i_size);
+		ceph_encode_timespec(&rec.v1.mtime, &inode->i_mtime);
+		ceph_encode_timespec(&rec.v1.atime, &inode->i_atime);
+		rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
+		rec.v1.pathbase = cpu_to_le64(pathbase);
+		reclen = sizeof(rec.v1);
+	}
 	spin_unlock(&inode->i_lock);
 
-	err = ceph_pagelist_append(pagelist, &rec, sizeof(rec));
+	if (recon_state->flock) {
+		int num_fcntl_locks, num_flock_locks;
+		struct ceph_pagelist_cursor trunc_point;
+
+		ceph_pagelist_set_cursor(pagelist, &trunc_point);
+		do {
+			lock_flocks();
+			ceph_count_locks(inode, &num_fcntl_locks,
+					 &num_flock_locks);
+			rec.v2.flock_len = (2*sizeof(u32) +
+					    (num_fcntl_locks+num_flock_locks) *
+					    sizeof(struct ceph_filelock));
+			unlock_flocks();
+
+			/* pre-alloc pagelist */
+			ceph_pagelist_truncate(pagelist, &trunc_point);
+			err = ceph_pagelist_append(pagelist, &rec, reclen);
+			if (!err)
+				err = ceph_pagelist_reserve(pagelist,
+							    rec.v2.flock_len);
+
+			/* encode locks */
+			if (!err) {
+				lock_flocks();
+				err = ceph_encode_locks(inode,
+							pagelist,
+							num_fcntl_locks,
+							num_flock_locks);
+				unlock_flocks();
+			}
+		} while (err == -ENOSPC);
+	} else {
+		err = ceph_pagelist_append(pagelist, &rec, reclen);
+	}
 
-out:
+out_free:
 	kfree(path);
+out_dput:
 	dput(dentry);
 	return err;
 }
@@ -2267,6 +2427,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
 	int mds = session->s_mds;
 	int err = -ENOMEM;
 	struct ceph_pagelist *pagelist;
+	struct ceph_reconnect_state recon_state;
 
 	pr_info("mds%d reconnect start\n", mds);
 
@@ -2301,7 +2462,10 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
 	err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps);
 	if (err)
 		goto fail;
-	err = iterate_session_caps(session, encode_caps_cb, pagelist);
+
+	recon_state.pagelist = pagelist;
+	recon_state.flock = session->s_con.peer_features & CEPH_FEATURE_FLOCK;
+	err = iterate_session_caps(session, encode_caps_cb, &recon_state);
 	if (err < 0)
 		goto fail;
 
@@ -2326,6 +2490,8 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
 	}
 
 	reply->pagelist = pagelist;
+	if (recon_state.flock)
+		reply->hdr.version = cpu_to_le16(2);
 	reply->hdr.data_len = cpu_to_le32(pagelist->length);
 	reply->nr_pages = calc_pages_for(0, pagelist->length);
 	ceph_con_send(&session->s_con, reply);
@@ -2376,9 +2542,11 @@ static void check_new_map(struct ceph_mds_client *mdsc,
 		oldstate = ceph_mdsmap_get_state(oldmap, i);
 		newstate = ceph_mdsmap_get_state(newmap, i);
 
-		dout("check_new_map mds%d state %s -> %s (session %s)\n",
+		dout("check_new_map mds%d state %s%s -> %s%s (session %s)\n",
 		     i, ceph_mds_state_name(oldstate),
+		     ceph_mdsmap_is_laggy(oldmap, i) ? " (laggy)" : "",
 		     ceph_mds_state_name(newstate),
+		     ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "",
 		     session_state_name(s->s_state));
 
 		if (memcmp(ceph_mdsmap_get_addr(oldmap, i),
@@ -2428,6 +2596,21 @@ static void check_new_map(struct ceph_mds_client *mdsc,
 			wake_up_session_caps(s, 1);
 		}
 	}
+
+	for (i = 0; i < newmap->m_max_mds && i < mdsc->max_sessions; i++) {
+		s = mdsc->sessions[i];
+		if (!s)
+			continue;
+		if (!ceph_mdsmap_is_laggy(newmap, i))
+			continue;
+		if (s->s_state == CEPH_MDS_SESSION_OPEN ||
+		    s->s_state == CEPH_MDS_SESSION_HUNG ||
+		    s->s_state == CEPH_MDS_SESSION_CLOSING) {
+			dout(" connecting to export targets of laggy mds%d\n",
+			     i);
+			__open_export_target_sessions(mdsc, s);
+		}
+	}
 }
 
 
@@ -2451,7 +2634,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
 			 struct ceph_mds_session *session,
 			 struct ceph_msg *msg)
 {
-	struct super_block *sb = mdsc->client->sb;
+	struct super_block *sb = mdsc->fsc->sb;
 	struct inode *inode;
 	struct ceph_inode_info *ci;
 	struct dentry *parent, *dentry;
@@ -2715,7 +2898,7 @@ static void delayed_work(struct work_struct *work)
 			send_renew_caps(mdsc, s);
 		else
 			ceph_con_keepalive(&s->s_con);
-		ceph_add_cap_releases(mdsc, s, -1);
+		ceph_add_cap_releases(mdsc, s);
 		if (s->s_state == CEPH_MDS_SESSION_OPEN ||
 		    s->s_state == CEPH_MDS_SESSION_HUNG)
 			ceph_send_cap_releases(mdsc, s);
@@ -2729,17 +2912,23 @@ static void delayed_work(struct work_struct *work)
 	schedule_delayed(mdsc);
 }
 
+int ceph_mdsc_init(struct ceph_fs_client *fsc)
 
-int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
 {
-	mdsc->client = client;
+	struct ceph_mds_client *mdsc;
+
+	mdsc = kzalloc(sizeof(struct ceph_mds_client), GFP_NOFS);
+	if (!mdsc)
+		return -ENOMEM;
+	mdsc->fsc = fsc;
+	fsc->mdsc = mdsc;
 	mutex_init(&mdsc->mutex);
 	mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
 	if (mdsc->mdsmap == NULL)
 		return -ENOMEM;
 
 	init_completion(&mdsc->safe_umount_waiters);
-	init_completion(&mdsc->session_close_waiters);
+	init_waitqueue_head(&mdsc->session_close_wq);
 	INIT_LIST_HEAD(&mdsc->waiting_for_map);
 	mdsc->sessions = NULL;
 	mdsc->max_sessions = 0;
@@ -2764,6 +2953,9 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
 	spin_lock_init(&mdsc->dentry_lru_lock);
 	INIT_LIST_HEAD(&mdsc->dentry_lru);
 
+	ceph_caps_init(mdsc);
+	ceph_adjust_min_caps(mdsc, fsc->min_caps);
+
 	return 0;
 }
 
@@ -2774,7 +2966,7 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
 static void wait_requests(struct ceph_mds_client *mdsc)
 {
 	struct ceph_mds_request *req;
-	struct ceph_client *client = mdsc->client;
+	struct ceph_fs_client *fsc = mdsc->fsc;
 
 	mutex_lock(&mdsc->mutex);
 	if (__get_oldest_req(mdsc)) {
@@ -2782,7 +2974,7 @@ static void wait_requests(struct ceph_mds_client *mdsc)
 
 		dout("wait_requests waiting for requests\n");
 		wait_for_completion_timeout(&mdsc->safe_umount_waiters,
-				    client->mount_args->mount_timeout * HZ);
+				    fsc->client->options->mount_timeout * HZ);
 
 		/* tear down remaining requests */
 		mutex_lock(&mdsc->mutex);
@@ -2865,7 +3057,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
 {
 	u64 want_tid, want_flush;
 
-	if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN)
+	if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
 		return;
 
 	dout("sync\n");
@@ -2881,6 +3073,23 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
 	wait_event(mdsc->cap_flushing_wq, check_cap_flush(mdsc, want_flush));
 }
 
+/*
+ * true if all sessions are closed, or we force unmount
+ */
+bool done_closing_sessions(struct ceph_mds_client *mdsc)
+{
+	int i, n = 0;
+
+	if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
+		return true;
+
+	mutex_lock(&mdsc->mutex);
+	for (i = 0; i < mdsc->max_sessions; i++)
+		if (mdsc->sessions[i])
+			n++;
+	mutex_unlock(&mdsc->mutex);
+	return n == 0;
+}
 
 /*
  * called after sb is ro.
@@ -2889,45 +3098,32 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
 {
 	struct ceph_mds_session *session;
 	int i;
-	int n;
-	struct ceph_client *client = mdsc->client;
-	unsigned long started, timeout = client->mount_args->mount_timeout * HZ;
+	struct ceph_fs_client *fsc = mdsc->fsc;
+	unsigned long timeout = fsc->client->options->mount_timeout * HZ;
 
 	dout("close_sessions\n");
 
-	mutex_lock(&mdsc->mutex);
-
 	/* close sessions */
-	started = jiffies;
-	while (time_before(jiffies, started + timeout)) {
-		dout("closing sessions\n");
-		n = 0;
-		for (i = 0; i < mdsc->max_sessions; i++) {
-			session = __ceph_lookup_mds_session(mdsc, i);
-			if (!session)
-				continue;
-			mutex_unlock(&mdsc->mutex);
-			mutex_lock(&session->s_mutex);
-			__close_session(mdsc, session);
-			mutex_unlock(&session->s_mutex);
-			ceph_put_mds_session(session);
-			mutex_lock(&mdsc->mutex);
-			n++;
-		}
-		if (n == 0)
-			break;
-
-		if (client->mount_state == CEPH_MOUNT_SHUTDOWN)
-			break;
-
-		dout("waiting for sessions to close\n");
+	mutex_lock(&mdsc->mutex);
+	for (i = 0; i < mdsc->max_sessions; i++) {
+		session = __ceph_lookup_mds_session(mdsc, i);
+		if (!session)
+			continue;
 		mutex_unlock(&mdsc->mutex);
-		wait_for_completion_timeout(&mdsc->session_close_waiters,
-					    timeout);
+		mutex_lock(&session->s_mutex);
+		__close_session(mdsc, session);
+		mutex_unlock(&session->s_mutex);
+		ceph_put_mds_session(session);
 		mutex_lock(&mdsc->mutex);
 	}
+	mutex_unlock(&mdsc->mutex);
+
+	dout("waiting for sessions to close\n");
+	wait_event_timeout(mdsc->session_close_wq, done_closing_sessions(mdsc),
+			   timeout);
 
 	/* tear down remaining sessions */
+	mutex_lock(&mdsc->mutex);
 	for (i = 0; i < mdsc->max_sessions; i++) {
 		if (mdsc->sessions[i]) {
 			session = get_session(mdsc->sessions[i]);
@@ -2940,9 +3136,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
 			mutex_lock(&mdsc->mutex);
 		}
 	}
-
 	WARN_ON(!list_empty(&mdsc->cap_delay_list));
-
 	mutex_unlock(&mdsc->mutex);
 
 	ceph_cleanup_empty_realms(mdsc);
@@ -2952,13 +3146,23 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
 	dout("stopped\n");
 }
 
-void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
+static void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
 {
 	dout("stop\n");
 	cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
 	if (mdsc->mdsmap)
 		ceph_mdsmap_destroy(mdsc->mdsmap);
 	kfree(mdsc->sessions);
+	ceph_caps_finalize(mdsc);
+}
+
+void ceph_mdsc_destroy(struct ceph_fs_client *fsc)
+{
+	struct ceph_mds_client *mdsc = fsc->mdsc;
+
+	ceph_mdsc_stop(mdsc);
+	fsc->mdsc = NULL;
+	kfree(mdsc);
 }
 
 
@@ -2977,14 +3181,14 @@ void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
 
 	ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad);
 	ceph_decode_copy(&p, &fsid, sizeof(fsid));
-	if (ceph_check_fsid(mdsc->client, &fsid) < 0)
+	if (ceph_check_fsid(mdsc->fsc->client, &fsid) < 0)
 		return;
 	epoch = ceph_decode_32(&p);
 	maplen = ceph_decode_32(&p);
 	dout("handle_map epoch %u len %d\n", epoch, (int)maplen);
 
 	/* do we need it? */
-	ceph_monc_got_mdsmap(&mdsc->client->monc, epoch);
+	ceph_monc_got_mdsmap(&mdsc->fsc->client->monc, epoch);
 	mutex_lock(&mdsc->mutex);
 	if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) {
 		dout("handle_map epoch %u <= our %u\n",
@@ -3008,7 +3212,7 @@ void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
 	} else {
 		mdsc->mdsmap = newmap;  /* first mds map */
 	}
-	mdsc->client->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size;
+	mdsc->fsc->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size;
 
 	__wake_requests(mdsc, &mdsc->waiting_for_map);
 
@@ -3109,7 +3313,7 @@ static int get_authorizer(struct ceph_connection *con,
 {
 	struct ceph_mds_session *s = con->private;
 	struct ceph_mds_client *mdsc = s->s_mdsc;
-	struct ceph_auth_client *ac = mdsc->client->monc.auth;
+	struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
 	int ret = 0;
 
 	if (force_new && s->s_authorizer) {
@@ -3143,7 +3347,7 @@ static int verify_authorizer_reply(struct ceph_connection *con, int len)
 {
 	struct ceph_mds_session *s = con->private;
 	struct ceph_mds_client *mdsc = s->s_mdsc;
-	struct ceph_auth_client *ac = mdsc->client->monc.auth;
+	struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
 
 	return ac->ops->verify_authorizer_reply(ac, s->s_authorizer, len);
 }
@@ -3152,12 +3356,12 @@ static int invalidate_authorizer(struct ceph_connection *con)
 {
 	struct ceph_mds_session *s = con->private;
 	struct ceph_mds_client *mdsc = s->s_mdsc;
-	struct ceph_auth_client *ac = mdsc->client->monc.auth;
+	struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
 
 	if (ac->ops->invalidate_authorizer)
 		ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS);
 
-	return ceph_monc_validate_auth(&mdsc->client->monc);
+	return ceph_monc_validate_auth(&mdsc->fsc->client->monc);
 }
 
 static const struct ceph_connection_operations mds_con_ops = {
@@ -3170,7 +3374,4 @@ static const struct ceph_connection_operations mds_con_ops = {
 	.peer_reset = peer_reset,
 };
 
-
-
-
 /* eof */
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 952410c60d0..d66d63c7235 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -8,9 +8,9 @@
 #include <linux/rbtree.h>
 #include <linux/spinlock.h>
 
-#include "types.h"
-#include "messenger.h"
-#include "mdsmap.h"
+#include <linux/ceph/types.h>
+#include <linux/ceph/messenger.h>
+#include <linux/ceph/mdsmap.h>
 
 /*
  * Some lock dependencies:
@@ -26,7 +26,7 @@
  *
  */
 
-struct ceph_client;
+struct ceph_fs_client;
 struct ceph_cap;
 
 /*
@@ -151,6 +151,7 @@ typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc,
 struct ceph_mds_request {
 	u64 r_tid;                   /* transaction id */
 	struct rb_node r_node;
+	struct ceph_mds_client *r_mdsc;
 
 	int r_op;                    /* mds op code */
 	int r_mds;
@@ -207,8 +208,8 @@ struct ceph_mds_request {
 
 	int               r_attempts;   /* resend attempts */
 	int               r_num_fwd;    /* number of forward attempts */
-	int               r_num_stale;
 	int               r_resend_mds; /* mds to resend to next, if any*/
+	u32               r_sent_on_mseq; /* cap mseq request was sent at*/
 
 	struct kref       r_kref;
 	struct list_head  r_wait;
@@ -229,11 +230,12 @@ struct ceph_mds_request {
  * mds client state
  */
 struct ceph_mds_client {
-	struct ceph_client      *client;
+	struct ceph_fs_client  *fsc;
 	struct mutex            mutex;         /* all nested structures */
 
 	struct ceph_mdsmap      *mdsmap;
-	struct completion       safe_umount_waiters, session_close_waiters;
+	struct completion       safe_umount_waiters;
+	wait_queue_head_t       session_close_wq;
 	struct list_head        waiting_for_map;
 
 	struct ceph_mds_session **sessions;    /* NULL for mds if no session */
@@ -267,10 +269,26 @@ struct ceph_mds_client {
 	spinlock_t        cap_dirty_lock;   /* protects above items */
 	wait_queue_head_t cap_flushing_wq;
 
-#ifdef CONFIG_DEBUG_FS
-	struct dentry 	  *debugfs_file;
-#endif
-
+	/*
+	 * Cap reservations
+	 *
+	 * Maintain a global pool of preallocated struct ceph_caps, referenced
+	 * by struct ceph_caps_reservations.  This ensures that we preallocate
+	 * memory needed to successfully process an MDS response.  (If an MDS
+	 * sends us cap information and we fail to process it, we will have
+	 * problems due to the client and MDS being out of sync.)
+	 *
+	 * Reservations are 'owned' by a ceph_cap_reservation context.
+	 */
+	spinlock_t	caps_list_lock;
+	struct		list_head caps_list; /* unused (reserved or
+						unreserved) */
+	int		caps_total_count;    /* total caps allocated */
+	int		caps_use_count;      /* in use */
+	int		caps_reserve_count;  /* unused, reserved */
+	int		caps_avail_count;    /* unused, unreserved */
+	int		caps_min_count;      /* keep at least this many
+						(unreserved) */
 	spinlock_t	  dentry_lru_lock;
 	struct list_head  dentry_lru;
 	int		  num_dentry;
@@ -293,10 +311,9 @@ extern void ceph_put_mds_session(struct ceph_mds_session *s);
 extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc,
 			     struct ceph_msg *msg, int mds);
 
-extern int ceph_mdsc_init(struct ceph_mds_client *mdsc,
-			   struct ceph_client *client);
+extern int ceph_mdsc_init(struct ceph_fs_client *fsc);
 extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
-extern void ceph_mdsc_stop(struct ceph_mds_client *mdsc);
+extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc);
 
 extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
 
@@ -324,8 +341,7 @@ static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
 }
 
 extern int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
-				 struct ceph_mds_session *session,
-				 int extra);
+				 struct ceph_mds_session *session);
 extern void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
 				   struct ceph_mds_session *session);
 
@@ -343,4 +359,7 @@ extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
 extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc,
 				 struct ceph_msg *msg);
 
+extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
+					  struct ceph_mds_session *session);
+
 #endif
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index c4c498e6dfe..73b7d44e8a3 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -1,4 +1,4 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 
 #include <linux/bug.h>
 #include <linux/err.h>
@@ -6,9 +6,9 @@
 #include <linux/slab.h>
 #include <linux/types.h>
 
-#include "mdsmap.h"
-#include "messenger.h"
-#include "decode.h"
+#include <linux/ceph/mdsmap.h>
+#include <linux/ceph/messenger.h>
+#include <linux/ceph/decode.h>
 
 #include "super.h"
 
@@ -85,6 +85,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
 		struct ceph_entity_addr addr;
 		u32 num_export_targets;
 		void *pexport_targets = NULL;
+		struct ceph_timespec laggy_since;
 
 		ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad);
 		global_id = ceph_decode_64(p);
@@ -103,7 +104,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
 		state_seq = ceph_decode_64(p);
 		ceph_decode_copy(p, &addr, sizeof(addr));
 		ceph_decode_addr(&addr);
-		*p += sizeof(struct ceph_timespec);
+		ceph_decode_copy(p, &laggy_since, sizeof(laggy_since));
 		*p += sizeof(u32);
 		ceph_decode_32_safe(p, end, namelen, bad);
 		*p += namelen;
@@ -116,12 +117,16 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
 		}
 
 		dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n",
-		     i+1, n, global_id, mds, inc, pr_addr(&addr.in_addr),
+		     i+1, n, global_id, mds, inc,
+		     ceph_pr_addr(&addr.in_addr),
 		     ceph_mds_state_name(state));
 		if (mds >= 0 && mds < m->m_max_mds && state > 0) {
 			m->m_info[mds].global_id = global_id;
 			m->m_info[mds].state = state;
 			m->m_info[mds].addr = addr;
+			m->m_info[mds].laggy =
+				(laggy_since.tv_sec != 0 ||
+				 laggy_since.tv_nsec != 0);
 			m->m_info[mds].num_export_targets = num_export_targets;
 			if (num_export_targets) {
 				m->m_info[mds].export_targets =
diff --git a/fs/ceph/mdsmap.h b/fs/ceph/mdsmap.h
deleted file mode 100644
index eacc131aa5c..00000000000
--- a/fs/ceph/mdsmap.h
+++ /dev/null
@@ -1,54 +0,0 @@
-#ifndef _FS_CEPH_MDSMAP_H
-#define _FS_CEPH_MDSMAP_H
-
-#include "types.h"
-
-/*
- * mds map - describe servers in the mds cluster.
- *
- * we limit fields to those the client actually xcares about
- */
-struct ceph_mds_info {
-	u64 global_id;
-	struct ceph_entity_addr addr;
-	s32 state;
-	int num_export_targets;
-	u32 *export_targets;
-};
-
-struct ceph_mdsmap {
-	u32 m_epoch, m_client_epoch, m_last_failure;
-	u32 m_root;
-	u32 m_session_timeout;          /* seconds */
-	u32 m_session_autoclose;        /* seconds */
-	u64 m_max_file_size;
-	u32 m_max_mds;                  /* size of m_addr, m_state arrays */
-	struct ceph_mds_info *m_info;
-
-	/* which object pools file data can be stored in */
-	int m_num_data_pg_pools;
-	u32 *m_data_pg_pools;
-	u32 m_cas_pg_pool;
-};
-
-static inline struct ceph_entity_addr *
-ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w)
-{
-	if (w >= m->m_max_mds)
-		return NULL;
-	return &m->m_info[w].addr;
-}
-
-static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w)
-{
-	BUG_ON(w < 0);
-	if (w >= m->m_max_mds)
-		return CEPH_MDS_STATE_DNE;
-	return m->m_info[w].state;
-}
-
-extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m);
-extern struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end);
-extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m);
-
-#endif
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
deleted file mode 100644
index 15167b2daa5..00000000000
--- a/fs/ceph/messenger.c
+++ /dev/null
@@ -1,2276 +0,0 @@
-#include "ceph_debug.h"
-
-#include <linux/crc32c.h>
-#include <linux/ctype.h>
-#include <linux/highmem.h>
-#include <linux/inet.h>
-#include <linux/kthread.h>
-#include <linux/net.h>
-#include <linux/slab.h>
-#include <linux/socket.h>
-#include <linux/string.h>
-#include <net/tcp.h>
-
-#include "super.h"
-#include "messenger.h"
-#include "decode.h"
-#include "pagelist.h"
-
-/*
- * Ceph uses the messenger to exchange ceph_msg messages with other
- * hosts in the system.  The messenger provides ordered and reliable
- * delivery.  We tolerate TCP disconnects by reconnecting (with
- * exponential backoff) in the case of a fault (disconnection, bad
- * crc, protocol error).  Acks allow sent messages to be discarded by
- * the sender.
- */
-
-/* static tag bytes (protocol control messages) */
-static char tag_msg = CEPH_MSGR_TAG_MSG;
-static char tag_ack = CEPH_MSGR_TAG_ACK;
-static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;
-
-#ifdef CONFIG_LOCKDEP
-static struct lock_class_key socket_class;
-#endif
-
-
-static void queue_con(struct ceph_connection *con);
-static void con_work(struct work_struct *);
-static void ceph_fault(struct ceph_connection *con);
-
-/*
- * nicely render a sockaddr as a string.
- */
-#define MAX_ADDR_STR 20
-#define MAX_ADDR_STR_LEN 60
-static char addr_str[MAX_ADDR_STR][MAX_ADDR_STR_LEN];
-static DEFINE_SPINLOCK(addr_str_lock);
-static int last_addr_str;
-
-const char *pr_addr(const struct sockaddr_storage *ss)
-{
-	int i;
-	char *s;
-	struct sockaddr_in *in4 = (void *)ss;
-	struct sockaddr_in6 *in6 = (void *)ss;
-
-	spin_lock(&addr_str_lock);
-	i = last_addr_str++;
-	if (last_addr_str == MAX_ADDR_STR)
-		last_addr_str = 0;
-	spin_unlock(&addr_str_lock);
-	s = addr_str[i];
-
-	switch (ss->ss_family) {
-	case AF_INET:
-		snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%u", &in4->sin_addr,
-			 (unsigned int)ntohs(in4->sin_port));
-		break;
-
-	case AF_INET6:
-		snprintf(s, MAX_ADDR_STR_LEN, "[%pI6c]:%u", &in6->sin6_addr,
-			 (unsigned int)ntohs(in6->sin6_port));
-		break;
-
-	default:
-		sprintf(s, "(unknown sockaddr family %d)", (int)ss->ss_family);
-	}
-
-	return s;
-}
-
-static void encode_my_addr(struct ceph_messenger *msgr)
-{
-	memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr));
-	ceph_encode_addr(&msgr->my_enc_addr);
-}
-
-/*
- * work queue for all reading and writing to/from the socket.
- */
-struct workqueue_struct *ceph_msgr_wq;
-
-int __init ceph_msgr_init(void)
-{
-	ceph_msgr_wq = create_workqueue("ceph-msgr");
-	if (IS_ERR(ceph_msgr_wq)) {
-		int ret = PTR_ERR(ceph_msgr_wq);
-		pr_err("msgr_init failed to create workqueue: %d\n", ret);
-		ceph_msgr_wq = NULL;
-		return ret;
-	}
-	return 0;
-}
-
-void ceph_msgr_exit(void)
-{
-	destroy_workqueue(ceph_msgr_wq);
-}
-
-void ceph_msgr_flush()
-{
-	flush_workqueue(ceph_msgr_wq);
-}
-
-
-/*
- * socket callback functions
- */
-
-/* data available on socket, or listen socket received a connect */
-static void ceph_data_ready(struct sock *sk, int count_unused)
-{
-	struct ceph_connection *con =
-		(struct ceph_connection *)sk->sk_user_data;
-	if (sk->sk_state != TCP_CLOSE_WAIT) {
-		dout("ceph_data_ready on %p state = %lu, queueing work\n",
-		     con, con->state);
-		queue_con(con);
-	}
-}
-
-/* socket has buffer space for writing */
-static void ceph_write_space(struct sock *sk)
-{
-	struct ceph_connection *con =
-		(struct ceph_connection *)sk->sk_user_data;
-
-	/* only queue to workqueue if there is data we want to write. */
-	if (test_bit(WRITE_PENDING, &con->state)) {
-		dout("ceph_write_space %p queueing write work\n", con);
-		queue_con(con);
-	} else {
-		dout("ceph_write_space %p nothing to write\n", con);
-	}
-
-	/* since we have our own write_space, clear the SOCK_NOSPACE flag */
-	clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
-}
-
-/* socket's state has changed */
-static void ceph_state_change(struct sock *sk)
-{
-	struct ceph_connection *con =
-		(struct ceph_connection *)sk->sk_user_data;
-
-	dout("ceph_state_change %p state = %lu sk_state = %u\n",
-	     con, con->state, sk->sk_state);
-
-	if (test_bit(CLOSED, &con->state))
-		return;
-
-	switch (sk->sk_state) {
-	case TCP_CLOSE:
-		dout("ceph_state_change TCP_CLOSE\n");
-	case TCP_CLOSE_WAIT:
-		dout("ceph_state_change TCP_CLOSE_WAIT\n");
-		if (test_and_set_bit(SOCK_CLOSED, &con->state) == 0) {
-			if (test_bit(CONNECTING, &con->state))
-				con->error_msg = "connection failed";
-			else
-				con->error_msg = "socket closed";
-			queue_con(con);
-		}
-		break;
-	case TCP_ESTABLISHED:
-		dout("ceph_state_change TCP_ESTABLISHED\n");
-		queue_con(con);
-		break;
-	}
-}
-
-/*
- * set up socket callbacks
- */
-static void set_sock_callbacks(struct socket *sock,
-			       struct ceph_connection *con)
-{
-	struct sock *sk = sock->sk;
-	sk->sk_user_data = (void *)con;
-	sk->sk_data_ready = ceph_data_ready;
-	sk->sk_write_space = ceph_write_space;
-	sk->sk_state_change = ceph_state_change;
-}
-
-
-/*
- * socket helpers
- */
-
-/*
- * initiate connection to a remote socket.
- */
-static struct socket *ceph_tcp_connect(struct ceph_connection *con)
-{
-	struct sockaddr_storage *paddr = &con->peer_addr.in_addr;
-	struct socket *sock;
-	int ret;
-
-	BUG_ON(con->sock);
-	ret = sock_create_kern(con->peer_addr.in_addr.ss_family, SOCK_STREAM,
-			       IPPROTO_TCP, &sock);
-	if (ret)
-		return ERR_PTR(ret);
-	con->sock = sock;
-	sock->sk->sk_allocation = GFP_NOFS;
-
-#ifdef CONFIG_LOCKDEP
-	lockdep_set_class(&sock->sk->sk_lock, &socket_class);
-#endif
-
-	set_sock_callbacks(sock, con);
-
-	dout("connect %s\n", pr_addr(&con->peer_addr.in_addr));
-
-	ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr),
-				 O_NONBLOCK);
-	if (ret == -EINPROGRESS) {
-		dout("connect %s EINPROGRESS sk_state = %u\n",
-		     pr_addr(&con->peer_addr.in_addr),
-		     sock->sk->sk_state);
-		ret = 0;
-	}
-	if (ret < 0) {
-		pr_err("connect %s error %d\n",
-		       pr_addr(&con->peer_addr.in_addr), ret);
-		sock_release(sock);
-		con->sock = NULL;
-		con->error_msg = "connect error";
-	}
-
-	if (ret < 0)
-		return ERR_PTR(ret);
-	return sock;
-}
-
-static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
-{
-	struct kvec iov = {buf, len};
-	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
-
-	return kernel_recvmsg(sock, &msg, &iov, 1, len, msg.msg_flags);
-}
-
-/*
- * write something.  @more is true if caller will be sending more data
- * shortly.
- */
-static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
-		     size_t kvlen, size_t len, int more)
-{
-	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
-
-	if (more)
-		msg.msg_flags |= MSG_MORE;
-	else
-		msg.msg_flags |= MSG_EOR;  /* superfluous, but what the hell */
-
-	return kernel_sendmsg(sock, &msg, iov, kvlen, len);
-}
-
-
-/*
- * Shutdown/close the socket for the given connection.
- */
-static int con_close_socket(struct ceph_connection *con)
-{
-	int rc;
-
-	dout("con_close_socket on %p sock %p\n", con, con->sock);
-	if (!con->sock)
-		return 0;
-	set_bit(SOCK_CLOSED, &con->state);
-	rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR);
-	sock_release(con->sock);
-	con->sock = NULL;
-	clear_bit(SOCK_CLOSED, &con->state);
-	return rc;
-}
-
-/*
- * Reset a connection.  Discard all incoming and outgoing messages
- * and clear *_seq state.
- */
-static void ceph_msg_remove(struct ceph_msg *msg)
-{
-	list_del_init(&msg->list_head);
-	ceph_msg_put(msg);
-}
-static void ceph_msg_remove_list(struct list_head *head)
-{
-	while (!list_empty(head)) {
-		struct ceph_msg *msg = list_first_entry(head, struct ceph_msg,
-							list_head);
-		ceph_msg_remove(msg);
-	}
-}
-
-static void reset_connection(struct ceph_connection *con)
-{
-	/* reset connection, out_queue, msg_ and connect_seq */
-	/* discard existing out_queue and msg_seq */
-	ceph_msg_remove_list(&con->out_queue);
-	ceph_msg_remove_list(&con->out_sent);
-
-	if (con->in_msg) {
-		ceph_msg_put(con->in_msg);
-		con->in_msg = NULL;
-	}
-
-	con->connect_seq = 0;
-	con->out_seq = 0;
-	if (con->out_msg) {
-		ceph_msg_put(con->out_msg);
-		con->out_msg = NULL;
-	}
-	con->out_keepalive_pending = false;
-	con->in_seq = 0;
-	con->in_seq_acked = 0;
-}
-
-/*
- * mark a peer down.  drop any open connections.
- */
-void ceph_con_close(struct ceph_connection *con)
-{
-	dout("con_close %p peer %s\n", con, pr_addr(&con->peer_addr.in_addr));
-	set_bit(CLOSED, &con->state);  /* in case there's queued work */
-	clear_bit(STANDBY, &con->state);  /* avoid connect_seq bump */
-	clear_bit(LOSSYTX, &con->state);  /* so we retry next connect */
-	clear_bit(KEEPALIVE_PENDING, &con->state);
-	clear_bit(WRITE_PENDING, &con->state);
-	mutex_lock(&con->mutex);
-	reset_connection(con);
-	con->peer_global_seq = 0;
-	cancel_delayed_work(&con->work);
-	mutex_unlock(&con->mutex);
-	queue_con(con);
-}
-
-/*
- * Reopen a closed connection, with a new peer address.
- */
-void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr)
-{
-	dout("con_open %p %s\n", con, pr_addr(&addr->in_addr));
-	set_bit(OPENING, &con->state);
-	clear_bit(CLOSED, &con->state);
-	memcpy(&con->peer_addr, addr, sizeof(*addr));
-	con->delay = 0;      /* reset backoff memory */
-	queue_con(con);
-}
-
-/*
- * return true if this connection ever successfully opened
- */
-bool ceph_con_opened(struct ceph_connection *con)
-{
-	return con->connect_seq > 0;
-}
-
-/*
- * generic get/put
- */
-struct ceph_connection *ceph_con_get(struct ceph_connection *con)
-{
-	dout("con_get %p nref = %d -> %d\n", con,
-	     atomic_read(&con->nref), atomic_read(&con->nref) + 1);
-	if (atomic_inc_not_zero(&con->nref))
-		return con;
-	return NULL;
-}
-
-void ceph_con_put(struct ceph_connection *con)
-{
-	dout("con_put %p nref = %d -> %d\n", con,
-	     atomic_read(&con->nref), atomic_read(&con->nref) - 1);
-	BUG_ON(atomic_read(&con->nref) == 0);
-	if (atomic_dec_and_test(&con->nref)) {
-		BUG_ON(con->sock);
-		kfree(con);
-	}
-}
-
-/*
- * initialize a new connection.
- */
-void ceph_con_init(struct ceph_messenger *msgr, struct ceph_connection *con)
-{
-	dout("con_init %p\n", con);
-	memset(con, 0, sizeof(*con));
-	atomic_set(&con->nref, 1);
-	con->msgr = msgr;
-	mutex_init(&con->mutex);
-	INIT_LIST_HEAD(&con->out_queue);
-	INIT_LIST_HEAD(&con->out_sent);
-	INIT_DELAYED_WORK(&con->work, con_work);
-}
-
-
-/*
- * We maintain a global counter to order connection attempts.  Get
- * a unique seq greater than @gt.
- */
-static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt)
-{
-	u32 ret;
-
-	spin_lock(&msgr->global_seq_lock);
-	if (msgr->global_seq < gt)
-		msgr->global_seq = gt;
-	ret = ++msgr->global_seq;
-	spin_unlock(&msgr->global_seq_lock);
-	return ret;
-}
-
-
-/*
- * Prepare footer for currently outgoing message, and finish things
- * off.  Assumes out_kvec* are already valid.. we just add on to the end.
- */
-static void prepare_write_message_footer(struct ceph_connection *con, int v)
-{
-	struct ceph_msg *m = con->out_msg;
-
-	dout("prepare_write_message_footer %p\n", con);
-	con->out_kvec_is_msg = true;
-	con->out_kvec[v].iov_base = &m->footer;
-	con->out_kvec[v].iov_len = sizeof(m->footer);
-	con->out_kvec_bytes += sizeof(m->footer);
-	con->out_kvec_left++;
-	con->out_more = m->more_to_follow;
-	con->out_msg_done = true;
-}
-
-/*
- * Prepare headers for the next outgoing message.
- */
-static void prepare_write_message(struct ceph_connection *con)
-{
-	struct ceph_msg *m;
-	int v = 0;
-
-	con->out_kvec_bytes = 0;
-	con->out_kvec_is_msg = true;
-	con->out_msg_done = false;
-
-	/* Sneak an ack in there first?  If we can get it into the same
-	 * TCP packet that's a good thing. */
-	if (con->in_seq > con->in_seq_acked) {
-		con->in_seq_acked = con->in_seq;
-		con->out_kvec[v].iov_base = &tag_ack;
-		con->out_kvec[v++].iov_len = 1;
-		con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
-		con->out_kvec[v].iov_base = &con->out_temp_ack;
-		con->out_kvec[v++].iov_len = sizeof(con->out_temp_ack);
-		con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
-	}
-
-	m = list_first_entry(&con->out_queue,
-		       struct ceph_msg, list_head);
-	con->out_msg = m;
-	if (test_bit(LOSSYTX, &con->state)) {
-		list_del_init(&m->list_head);
-	} else {
-		/* put message on sent list */
-		ceph_msg_get(m);
-		list_move_tail(&m->list_head, &con->out_sent);
-	}
-
-	/*
-	 * only assign outgoing seq # if we haven't sent this message
-	 * yet.  if it is requeued, resend with it's original seq.
-	 */
-	if (m->needs_out_seq) {
-		m->hdr.seq = cpu_to_le64(++con->out_seq);
-		m->needs_out_seq = false;
-	}
-
-	dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n",
-	     m, con->out_seq, le16_to_cpu(m->hdr.type),
-	     le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len),
-	     le32_to_cpu(m->hdr.data_len),
-	     m->nr_pages);
-	BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len);
-
-	/* tag + hdr + front + middle */
-	con->out_kvec[v].iov_base = &tag_msg;
-	con->out_kvec[v++].iov_len = 1;
-	con->out_kvec[v].iov_base = &m->hdr;
-	con->out_kvec[v++].iov_len = sizeof(m->hdr);
-	con->out_kvec[v++] = m->front;
-	if (m->middle)
-		con->out_kvec[v++] = m->middle->vec;
-	con->out_kvec_left = v;
-	con->out_kvec_bytes += 1 + sizeof(m->hdr) + m->front.iov_len +
-		(m->middle ? m->middle->vec.iov_len : 0);
-	con->out_kvec_cur = con->out_kvec;
-
-	/* fill in crc (except data pages), footer */
-	con->out_msg->hdr.crc =
-		cpu_to_le32(crc32c(0, (void *)&m->hdr,
-				      sizeof(m->hdr) - sizeof(m->hdr.crc)));
-	con->out_msg->footer.flags = CEPH_MSG_FOOTER_COMPLETE;
-	con->out_msg->footer.front_crc =
-		cpu_to_le32(crc32c(0, m->front.iov_base, m->front.iov_len));
-	if (m->middle)
-		con->out_msg->footer.middle_crc =
-			cpu_to_le32(crc32c(0, m->middle->vec.iov_base,
-					   m->middle->vec.iov_len));
-	else
-		con->out_msg->footer.middle_crc = 0;
-	con->out_msg->footer.data_crc = 0;
-	dout("prepare_write_message front_crc %u data_crc %u\n",
-	     le32_to_cpu(con->out_msg->footer.front_crc),
-	     le32_to_cpu(con->out_msg->footer.middle_crc));
-
-	/* is there a data payload? */
-	if (le32_to_cpu(m->hdr.data_len) > 0) {
-		/* initialize page iterator */
-		con->out_msg_pos.page = 0;
-		con->out_msg_pos.page_pos =
-			le16_to_cpu(m->hdr.data_off) & ~PAGE_MASK;
-		con->out_msg_pos.data_pos = 0;
-		con->out_msg_pos.did_page_crc = 0;
-		con->out_more = 1;  /* data + footer will follow */
-	} else {
-		/* no, queue up footer too and be done */
-		prepare_write_message_footer(con, v);
-	}
-
-	set_bit(WRITE_PENDING, &con->state);
-}
-
-/*
- * Prepare an ack.
- */
-static void prepare_write_ack(struct ceph_connection *con)
-{
-	dout("prepare_write_ack %p %llu -> %llu\n", con,
-	     con->in_seq_acked, con->in_seq);
-	con->in_seq_acked = con->in_seq;
-
-	con->out_kvec[0].iov_base = &tag_ack;
-	con->out_kvec[0].iov_len = 1;
-	con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
-	con->out_kvec[1].iov_base = &con->out_temp_ack;
-	con->out_kvec[1].iov_len = sizeof(con->out_temp_ack);
-	con->out_kvec_left = 2;
-	con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
-	con->out_kvec_cur = con->out_kvec;
-	con->out_more = 1;  /* more will follow.. eventually.. */
-	set_bit(WRITE_PENDING, &con->state);
-}
-
-/*
- * Prepare to write keepalive byte.
- */
-static void prepare_write_keepalive(struct ceph_connection *con)
-{
-	dout("prepare_write_keepalive %p\n", con);
-	con->out_kvec[0].iov_base = &tag_keepalive;
-	con->out_kvec[0].iov_len = 1;
-	con->out_kvec_left = 1;
-	con->out_kvec_bytes = 1;
-	con->out_kvec_cur = con->out_kvec;
-	set_bit(WRITE_PENDING, &con->state);
-}
-
-/*
- * Connection negotiation.
- */
-
-static void prepare_connect_authorizer(struct ceph_connection *con)
-{
-	void *auth_buf;
-	int auth_len = 0;
-	int auth_protocol = 0;
-
-	mutex_unlock(&con->mutex);
-	if (con->ops->get_authorizer)
-		con->ops->get_authorizer(con, &auth_buf, &auth_len,
-					 &auth_protocol, &con->auth_reply_buf,
-					 &con->auth_reply_buf_len,
-					 con->auth_retry);
-	mutex_lock(&con->mutex);
-
-	con->out_connect.authorizer_protocol = cpu_to_le32(auth_protocol);
-	con->out_connect.authorizer_len = cpu_to_le32(auth_len);
-
-	con->out_kvec[con->out_kvec_left].iov_base = auth_buf;
-	con->out_kvec[con->out_kvec_left].iov_len = auth_len;
-	con->out_kvec_left++;
-	con->out_kvec_bytes += auth_len;
-}
-
-/*
- * We connected to a peer and are saying hello.
- */
-static void prepare_write_banner(struct ceph_messenger *msgr,
-				 struct ceph_connection *con)
-{
-	int len = strlen(CEPH_BANNER);
-
-	con->out_kvec[0].iov_base = CEPH_BANNER;
-	con->out_kvec[0].iov_len = len;
-	con->out_kvec[1].iov_base = &msgr->my_enc_addr;
-	con->out_kvec[1].iov_len = sizeof(msgr->my_enc_addr);
-	con->out_kvec_left = 2;
-	con->out_kvec_bytes = len + sizeof(msgr->my_enc_addr);
-	con->out_kvec_cur = con->out_kvec;
-	con->out_more = 0;
-	set_bit(WRITE_PENDING, &con->state);
-}
-
-static void prepare_write_connect(struct ceph_messenger *msgr,
-				  struct ceph_connection *con,
-				  int after_banner)
-{
-	unsigned global_seq = get_global_seq(con->msgr, 0);
-	int proto;
-
-	switch (con->peer_name.type) {
-	case CEPH_ENTITY_TYPE_MON:
-		proto = CEPH_MONC_PROTOCOL;
-		break;
-	case CEPH_ENTITY_TYPE_OSD:
-		proto = CEPH_OSDC_PROTOCOL;
-		break;
-	case CEPH_ENTITY_TYPE_MDS:
-		proto = CEPH_MDSC_PROTOCOL;
-		break;
-	default:
-		BUG();
-	}
-
-	dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
-	     con->connect_seq, global_seq, proto);
-
-	con->out_connect.features = cpu_to_le64(CEPH_FEATURE_SUPPORTED_CLIENT);
-	con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
-	con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
-	con->out_connect.global_seq = cpu_to_le32(global_seq);
-	con->out_connect.protocol_version = cpu_to_le32(proto);
-	con->out_connect.flags = 0;
-
-	if (!after_banner) {
-		con->out_kvec_left = 0;
-		con->out_kvec_bytes = 0;
-	}
-	con->out_kvec[con->out_kvec_left].iov_base = &con->out_connect;
-	con->out_kvec[con->out_kvec_left].iov_len = sizeof(con->out_connect);
-	con->out_kvec_left++;
-	con->out_kvec_bytes += sizeof(con->out_connect);
-	con->out_kvec_cur = con->out_kvec;
-	con->out_more = 0;
-	set_bit(WRITE_PENDING, &con->state);
-
-	prepare_connect_authorizer(con);
-}
-
-
-/*
- * write as much of pending kvecs to the socket as we can.
- *  1 -> done
- *  0 -> socket full, but more to do
- * <0 -> error
- */
-static int write_partial_kvec(struct ceph_connection *con)
-{
-	int ret;
-
-	dout("write_partial_kvec %p %d left\n", con, con->out_kvec_bytes);
-	while (con->out_kvec_bytes > 0) {
-		ret = ceph_tcp_sendmsg(con->sock, con->out_kvec_cur,
-				       con->out_kvec_left, con->out_kvec_bytes,
-				       con->out_more);
-		if (ret <= 0)
-			goto out;
-		con->out_kvec_bytes -= ret;
-		if (con->out_kvec_bytes == 0)
-			break;            /* done */
-		while (ret > 0) {
-			if (ret >= con->out_kvec_cur->iov_len) {
-				ret -= con->out_kvec_cur->iov_len;
-				con->out_kvec_cur++;
-				con->out_kvec_left--;
-			} else {
-				con->out_kvec_cur->iov_len -= ret;
-				con->out_kvec_cur->iov_base += ret;
-				ret = 0;
-				break;
-			}
-		}
-	}
-	con->out_kvec_left = 0;
-	con->out_kvec_is_msg = false;
-	ret = 1;
-out:
-	dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con,
-	     con->out_kvec_bytes, con->out_kvec_left, ret);
-	return ret;  /* done! */
-}
-
-/*
- * Write as much message data payload as we can.  If we finish, queue
- * up the footer.
- *  1 -> done, footer is now queued in out_kvec[].
- *  0 -> socket full, but more to do
- * <0 -> error
- */
-static int write_partial_msg_pages(struct ceph_connection *con)
-{
-	struct ceph_msg *msg = con->out_msg;
-	unsigned data_len = le32_to_cpu(msg->hdr.data_len);
-	size_t len;
-	int crc = con->msgr->nocrc;
-	int ret;
-
-	dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n",
-	     con, con->out_msg, con->out_msg_pos.page, con->out_msg->nr_pages,
-	     con->out_msg_pos.page_pos);
-
-	while (con->out_msg_pos.page < con->out_msg->nr_pages) {
-		struct page *page = NULL;
-		void *kaddr = NULL;
-
-		/*
-		 * if we are calculating the data crc (the default), we need
-		 * to map the page.  if our pages[] has been revoked, use the
-		 * zero page.
-		 */
-		if (msg->pages) {
-			page = msg->pages[con->out_msg_pos.page];
-			if (crc)
-				kaddr = kmap(page);
-		} else if (msg->pagelist) {
-			page = list_first_entry(&msg->pagelist->head,
-						struct page, lru);
-			if (crc)
-				kaddr = kmap(page);
-		} else {
-			page = con->msgr->zero_page;
-			if (crc)
-				kaddr = page_address(con->msgr->zero_page);
-		}
-		len = min((int)(PAGE_SIZE - con->out_msg_pos.page_pos),
-			  (int)(data_len - con->out_msg_pos.data_pos));
-		if (crc && !con->out_msg_pos.did_page_crc) {
-			void *base = kaddr + con->out_msg_pos.page_pos;
-			u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc);
-
-			BUG_ON(kaddr == NULL);
-			con->out_msg->footer.data_crc =
-				cpu_to_le32(crc32c(tmpcrc, base, len));
-			con->out_msg_pos.did_page_crc = 1;
-		}
-
-		ret = kernel_sendpage(con->sock, page,
-				      con->out_msg_pos.page_pos, len,
-				      MSG_DONTWAIT | MSG_NOSIGNAL |
-				      MSG_MORE);
-
-		if (crc && (msg->pages || msg->pagelist))
-			kunmap(page);
-
-		if (ret <= 0)
-			goto out;
-
-		con->out_msg_pos.data_pos += ret;
-		con->out_msg_pos.page_pos += ret;
-		if (ret == len) {
-			con->out_msg_pos.page_pos = 0;
-			con->out_msg_pos.page++;
-			con->out_msg_pos.did_page_crc = 0;
-			if (msg->pagelist)
-				list_move_tail(&page->lru,
-					       &msg->pagelist->head);
-		}
-	}
-
-	dout("write_partial_msg_pages %p msg %p done\n", con, msg);
-
-	/* prepare and queue up footer, too */
-	if (!crc)
-		con->out_msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC;
-	con->out_kvec_bytes = 0;
-	con->out_kvec_left = 0;
-	con->out_kvec_cur = con->out_kvec;
-	prepare_write_message_footer(con, 0);
-	ret = 1;
-out:
-	return ret;
-}
-
-/*
- * write some zeros
- */
-static int write_partial_skip(struct ceph_connection *con)
-{
-	int ret;
-
-	while (con->out_skip > 0) {
-		struct kvec iov = {
-			.iov_base = page_address(con->msgr->zero_page),
-			.iov_len = min(con->out_skip, (int)PAGE_CACHE_SIZE)
-		};
-
-		ret = ceph_tcp_sendmsg(con->sock, &iov, 1, iov.iov_len, 1);
-		if (ret <= 0)
-			goto out;
-		con->out_skip -= ret;
-	}
-	ret = 1;
-out:
-	return ret;
-}
-
-/*
- * Prepare to read connection handshake, or an ack.
- */
-static void prepare_read_banner(struct ceph_connection *con)
-{
-	dout("prepare_read_banner %p\n", con);
-	con->in_base_pos = 0;
-}
-
-static void prepare_read_connect(struct ceph_connection *con)
-{
-	dout("prepare_read_connect %p\n", con);
-	con->in_base_pos = 0;
-}
-
-static void prepare_read_ack(struct ceph_connection *con)
-{
-	dout("prepare_read_ack %p\n", con);
-	con->in_base_pos = 0;
-}
-
-static void prepare_read_tag(struct ceph_connection *con)
-{
-	dout("prepare_read_tag %p\n", con);
-	con->in_base_pos = 0;
-	con->in_tag = CEPH_MSGR_TAG_READY;
-}
-
-/*
- * Prepare to read a message.
- */
-static int prepare_read_message(struct ceph_connection *con)
-{
-	dout("prepare_read_message %p\n", con);
-	BUG_ON(con->in_msg != NULL);
-	con->in_base_pos = 0;
-	con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0;
-	return 0;
-}
-
-
-static int read_partial(struct ceph_connection *con,
-			int *to, int size, void *object)
-{
-	*to += size;
-	while (con->in_base_pos < *to) {
-		int left = *to - con->in_base_pos;
-		int have = size - left;
-		int ret = ceph_tcp_recvmsg(con->sock, object + have, left);
-		if (ret <= 0)
-			return ret;
-		con->in_base_pos += ret;
-	}
-	return 1;
-}
-
-
-/*
- * Read all or part of the connect-side handshake on a new connection
- */
-static int read_partial_banner(struct ceph_connection *con)
-{
-	int ret, to = 0;
-
-	dout("read_partial_banner %p at %d\n", con, con->in_base_pos);
-
-	/* peer's banner */
-	ret = read_partial(con, &to, strlen(CEPH_BANNER), con->in_banner);
-	if (ret <= 0)
-		goto out;
-	ret = read_partial(con, &to, sizeof(con->actual_peer_addr),
-			   &con->actual_peer_addr);
-	if (ret <= 0)
-		goto out;
-	ret = read_partial(con, &to, sizeof(con->peer_addr_for_me),
-			   &con->peer_addr_for_me);
-	if (ret <= 0)
-		goto out;
-out:
-	return ret;
-}
-
-static int read_partial_connect(struct ceph_connection *con)
-{
-	int ret, to = 0;
-
-	dout("read_partial_connect %p at %d\n", con, con->in_base_pos);
-
-	ret = read_partial(con, &to, sizeof(con->in_reply), &con->in_reply);
-	if (ret <= 0)
-		goto out;
-	ret = read_partial(con, &to, le32_to_cpu(con->in_reply.authorizer_len),
-			   con->auth_reply_buf);
-	if (ret <= 0)
-		goto out;
-
-	dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n",
-	     con, (int)con->in_reply.tag,
-	     le32_to_cpu(con->in_reply.connect_seq),
-	     le32_to_cpu(con->in_reply.global_seq));
-out:
-	return ret;
-
-}
-
-/*
- * Verify the hello banner looks okay.
- */
-static int verify_hello(struct ceph_connection *con)
-{
-	if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) {
-		pr_err("connect to %s got bad banner\n",
-		       pr_addr(&con->peer_addr.in_addr));
-		con->error_msg = "protocol error, bad banner";
-		return -1;
-	}
-	return 0;
-}
-
-static bool addr_is_blank(struct sockaddr_storage *ss)
-{
-	switch (ss->ss_family) {
-	case AF_INET:
-		return ((struct sockaddr_in *)ss)->sin_addr.s_addr == 0;
-	case AF_INET6:
-		return
-		     ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[0] == 0 &&
-		     ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[1] == 0 &&
-		     ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[2] == 0 &&
-		     ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[3] == 0;
-	}
-	return false;
-}
-
-static int addr_port(struct sockaddr_storage *ss)
-{
-	switch (ss->ss_family) {
-	case AF_INET:
-		return ntohs(((struct sockaddr_in *)ss)->sin_port);
-	case AF_INET6:
-		return ntohs(((struct sockaddr_in6 *)ss)->sin6_port);
-	}
-	return 0;
-}
-
-static void addr_set_port(struct sockaddr_storage *ss, int p)
-{
-	switch (ss->ss_family) {
-	case AF_INET:
-		((struct sockaddr_in *)ss)->sin_port = htons(p);
-	case AF_INET6:
-		((struct sockaddr_in6 *)ss)->sin6_port = htons(p);
-	}
-}
-
-/*
- * Parse an ip[:port] list into an addr array.  Use the default
- * monitor port if a port isn't specified.
- */
-int ceph_parse_ips(const char *c, const char *end,
-		   struct ceph_entity_addr *addr,
-		   int max_count, int *count)
-{
-	int i;
-	const char *p = c;
-
-	dout("parse_ips on '%.*s'\n", (int)(end-c), c);
-	for (i = 0; i < max_count; i++) {
-		const char *ipend;
-		struct sockaddr_storage *ss = &addr[i].in_addr;
-		struct sockaddr_in *in4 = (void *)ss;
-		struct sockaddr_in6 *in6 = (void *)ss;
-		int port;
-		char delim = ',';
-
-		if (*p == '[') {
-			delim = ']';
-			p++;
-		}
-
-		memset(ss, 0, sizeof(*ss));
-		if (in4_pton(p, end - p, (u8 *)&in4->sin_addr.s_addr,
-			     delim, &ipend))
-			ss->ss_family = AF_INET;
-		else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr,
-				  delim, &ipend))
-			ss->ss_family = AF_INET6;
-		else
-			goto bad;
-		p = ipend;
-
-		if (delim == ']') {
-			if (*p != ']') {
-				dout("missing matching ']'\n");
-				goto bad;
-			}
-			p++;
-		}
-
-		/* port? */
-		if (p < end && *p == ':') {
-			port = 0;
-			p++;
-			while (p < end && *p >= '0' && *p <= '9') {
-				port = (port * 10) + (*p - '0');
-				p++;
-			}
-			if (port > 65535 || port == 0)
-				goto bad;
-		} else {
-			port = CEPH_MON_PORT;
-		}
-
-		addr_set_port(ss, port);
-
-		dout("parse_ips got %s\n", pr_addr(ss));
-
-		if (p == end)
-			break;
-		if (*p != ',')
-			goto bad;
-		p++;
-	}
-
-	if (p != end)
-		goto bad;
-
-	if (count)
-		*count = i + 1;
-	return 0;
-
-bad:
-	pr_err("parse_ips bad ip '%.*s'\n", (int)(end - c), c);
-	return -EINVAL;
-}
-
-static int process_banner(struct ceph_connection *con)
-{
-	dout("process_banner on %p\n", con);
-
-	if (verify_hello(con) < 0)
-		return -1;
-
-	ceph_decode_addr(&con->actual_peer_addr);
-	ceph_decode_addr(&con->peer_addr_for_me);
-
-	/*
-	 * Make sure the other end is who we wanted.  note that the other
-	 * end may not yet know their ip address, so if it's 0.0.0.0, give
-	 * them the benefit of the doubt.
-	 */
-	if (memcmp(&con->peer_addr, &con->actual_peer_addr,
-		   sizeof(con->peer_addr)) != 0 &&
-	    !(addr_is_blank(&con->actual_peer_addr.in_addr) &&
-	      con->actual_peer_addr.nonce == con->peer_addr.nonce)) {
-		pr_warning("wrong peer, want %s/%lld, got %s/%lld\n",
-			   pr_addr(&con->peer_addr.in_addr),
-			   le64_to_cpu(con->peer_addr.nonce),
-			   pr_addr(&con->actual_peer_addr.in_addr),
-			   le64_to_cpu(con->actual_peer_addr.nonce));
-		con->error_msg = "wrong peer at address";
-		return -1;
-	}
-
-	/*
-	 * did we learn our address?
-	 */
-	if (addr_is_blank(&con->msgr->inst.addr.in_addr)) {
-		int port = addr_port(&con->msgr->inst.addr.in_addr);
-
-		memcpy(&con->msgr->inst.addr.in_addr,
-		       &con->peer_addr_for_me.in_addr,
-		       sizeof(con->peer_addr_for_me.in_addr));
-		addr_set_port(&con->msgr->inst.addr.in_addr, port);
-		encode_my_addr(con->msgr);
-		dout("process_banner learned my addr is %s\n",
-		     pr_addr(&con->msgr->inst.addr.in_addr));
-	}
-
-	set_bit(NEGOTIATING, &con->state);
-	prepare_read_connect(con);
-	return 0;
-}
-
-static void fail_protocol(struct ceph_connection *con)
-{
-	reset_connection(con);
-	set_bit(CLOSED, &con->state);  /* in case there's queued work */
-
-	mutex_unlock(&con->mutex);
-	if (con->ops->bad_proto)
-		con->ops->bad_proto(con);
-	mutex_lock(&con->mutex);
-}
-
-static int process_connect(struct ceph_connection *con)
-{
-	u64 sup_feat = CEPH_FEATURE_SUPPORTED_CLIENT;
-	u64 req_feat = CEPH_FEATURE_REQUIRED_CLIENT;
-	u64 server_feat = le64_to_cpu(con->in_reply.features);
-
-	dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
-
-	switch (con->in_reply.tag) {
-	case CEPH_MSGR_TAG_FEATURES:
-		pr_err("%s%lld %s feature set mismatch,"
-		       " my %llx < server's %llx, missing %llx\n",
-		       ENTITY_NAME(con->peer_name),
-		       pr_addr(&con->peer_addr.in_addr),
-		       sup_feat, server_feat, server_feat & ~sup_feat);
-		con->error_msg = "missing required protocol features";
-		fail_protocol(con);
-		return -1;
-
-	case CEPH_MSGR_TAG_BADPROTOVER:
-		pr_err("%s%lld %s protocol version mismatch,"
-		       " my %d != server's %d\n",
-		       ENTITY_NAME(con->peer_name),
-		       pr_addr(&con->peer_addr.in_addr),
-		       le32_to_cpu(con->out_connect.protocol_version),
-		       le32_to_cpu(con->in_reply.protocol_version));
-		con->error_msg = "protocol version mismatch";
-		fail_protocol(con);
-		return -1;
-
-	case CEPH_MSGR_TAG_BADAUTHORIZER:
-		con->auth_retry++;
-		dout("process_connect %p got BADAUTHORIZER attempt %d\n", con,
-		     con->auth_retry);
-		if (con->auth_retry == 2) {
-			con->error_msg = "connect authorization failure";
-			reset_connection(con);
-			set_bit(CLOSED, &con->state);
-			return -1;
-		}
-		con->auth_retry = 1;
-		prepare_write_connect(con->msgr, con, 0);
-		prepare_read_connect(con);
-		break;
-
-	case CEPH_MSGR_TAG_RESETSESSION:
-		/*
-		 * If we connected with a large connect_seq but the peer
-		 * has no record of a session with us (no connection, or
-		 * connect_seq == 0), they will send RESETSESION to indicate
-		 * that they must have reset their session, and may have
-		 * dropped messages.
-		 */
-		dout("process_connect got RESET peer seq %u\n",
-		     le32_to_cpu(con->in_connect.connect_seq));
-		pr_err("%s%lld %s connection reset\n",
-		       ENTITY_NAME(con->peer_name),
-		       pr_addr(&con->peer_addr.in_addr));
-		reset_connection(con);
-		prepare_write_connect(con->msgr, con, 0);
-		prepare_read_connect(con);
-
-		/* Tell ceph about it. */
-		mutex_unlock(&con->mutex);
-		pr_info("reset on %s%lld\n", ENTITY_NAME(con->peer_name));
-		if (con->ops->peer_reset)
-			con->ops->peer_reset(con);
-		mutex_lock(&con->mutex);
-		break;
-
-	case CEPH_MSGR_TAG_RETRY_SESSION:
-		/*
-		 * If we sent a smaller connect_seq than the peer has, try
-		 * again with a larger value.
-		 */
-		dout("process_connect got RETRY my seq = %u, peer_seq = %u\n",
-		     le32_to_cpu(con->out_connect.connect_seq),
-		     le32_to_cpu(con->in_connect.connect_seq));
-		con->connect_seq = le32_to_cpu(con->in_connect.connect_seq);
-		prepare_write_connect(con->msgr, con, 0);
-		prepare_read_connect(con);
-		break;
-
-	case CEPH_MSGR_TAG_RETRY_GLOBAL:
-		/*
-		 * If we sent a smaller global_seq than the peer has, try
-		 * again with a larger value.
-		 */
-		dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n",
-		     con->peer_global_seq,
-		     le32_to_cpu(con->in_connect.global_seq));
-		get_global_seq(con->msgr,
-			       le32_to_cpu(con->in_connect.global_seq));
-		prepare_write_connect(con->msgr, con, 0);
-		prepare_read_connect(con);
-		break;
-
-	case CEPH_MSGR_TAG_READY:
-		if (req_feat & ~server_feat) {
-			pr_err("%s%lld %s protocol feature mismatch,"
-			       " my required %llx > server's %llx, need %llx\n",
-			       ENTITY_NAME(con->peer_name),
-			       pr_addr(&con->peer_addr.in_addr),
-			       req_feat, server_feat, req_feat & ~server_feat);
-			con->error_msg = "missing required protocol features";
-			fail_protocol(con);
-			return -1;
-		}
-		clear_bit(CONNECTING, &con->state);
-		con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
-		con->connect_seq++;
-		con->peer_features = server_feat;
-		dout("process_connect got READY gseq %d cseq %d (%d)\n",
-		     con->peer_global_seq,
-		     le32_to_cpu(con->in_reply.connect_seq),
-		     con->connect_seq);
-		WARN_ON(con->connect_seq !=
-			le32_to_cpu(con->in_reply.connect_seq));
-
-		if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
-			set_bit(LOSSYTX, &con->state);
-
-		prepare_read_tag(con);
-		break;
-
-	case CEPH_MSGR_TAG_WAIT:
-		/*
-		 * If there is a connection race (we are opening
-		 * connections to each other), one of us may just have
-		 * to WAIT.  This shouldn't happen if we are the
-		 * client.
-		 */
-		pr_err("process_connect peer connecting WAIT\n");
-
-	default:
-		pr_err("connect protocol error, will retry\n");
-		con->error_msg = "protocol error, garbage tag during connect";
-		return -1;
-	}
-	return 0;
-}
-
-
-/*
- * read (part of) an ack
- */
-static int read_partial_ack(struct ceph_connection *con)
-{
-	int to = 0;
-
-	return read_partial(con, &to, sizeof(con->in_temp_ack),
-			    &con->in_temp_ack);
-}
-
-
-/*
- * We can finally discard anything that's been acked.
- */
-static void process_ack(struct ceph_connection *con)
-{
-	struct ceph_msg *m;
-	u64 ack = le64_to_cpu(con->in_temp_ack);
-	u64 seq;
-
-	while (!list_empty(&con->out_sent)) {
-		m = list_first_entry(&con->out_sent, struct ceph_msg,
-				     list_head);
-		seq = le64_to_cpu(m->hdr.seq);
-		if (seq > ack)
-			break;
-		dout("got ack for seq %llu type %d at %p\n", seq,
-		     le16_to_cpu(m->hdr.type), m);
-		ceph_msg_remove(m);
-	}
-	prepare_read_tag(con);
-}
-
-
-
-
-static int read_partial_message_section(struct ceph_connection *con,
-					struct kvec *section, unsigned int sec_len,
-					u32 *crc)
-{
-	int left;
-	int ret;
-
-	BUG_ON(!section);
-
-	while (section->iov_len < sec_len) {
-		BUG_ON(section->iov_base == NULL);
-		left = sec_len - section->iov_len;
-		ret = ceph_tcp_recvmsg(con->sock, (char *)section->iov_base +
-				       section->iov_len, left);
-		if (ret <= 0)
-			return ret;
-		section->iov_len += ret;
-		if (section->iov_len == sec_len)
-			*crc = crc32c(0, section->iov_base,
-				      section->iov_len);
-	}
-
-	return 1;
-}
-
-static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
-				struct ceph_msg_header *hdr,
-				int *skip);
-/*
- * read (part of) a message.
- */
-static int read_partial_message(struct ceph_connection *con)
-{
-	struct ceph_msg *m = con->in_msg;
-	void *p;
-	int ret;
-	int to, left;
-	unsigned front_len, middle_len, data_len, data_off;
-	int datacrc = con->msgr->nocrc;
-	int skip;
-	u64 seq;
-
-	dout("read_partial_message con %p msg %p\n", con, m);
-
-	/* header */
-	while (con->in_base_pos < sizeof(con->in_hdr)) {
-		left = sizeof(con->in_hdr) - con->in_base_pos;
-		ret = ceph_tcp_recvmsg(con->sock,
-				       (char *)&con->in_hdr + con->in_base_pos,
-				       left);
-		if (ret <= 0)
-			return ret;
-		con->in_base_pos += ret;
-		if (con->in_base_pos == sizeof(con->in_hdr)) {
-			u32 crc = crc32c(0, (void *)&con->in_hdr,
-				 sizeof(con->in_hdr) - sizeof(con->in_hdr.crc));
-			if (crc != le32_to_cpu(con->in_hdr.crc)) {
-				pr_err("read_partial_message bad hdr "
-				       " crc %u != expected %u\n",
-				       crc, con->in_hdr.crc);
-				return -EBADMSG;
-			}
-		}
-	}
-	front_len = le32_to_cpu(con->in_hdr.front_len);
-	if (front_len > CEPH_MSG_MAX_FRONT_LEN)
-		return -EIO;
-	middle_len = le32_to_cpu(con->in_hdr.middle_len);
-	if (middle_len > CEPH_MSG_MAX_DATA_LEN)
-		return -EIO;
-	data_len = le32_to_cpu(con->in_hdr.data_len);
-	if (data_len > CEPH_MSG_MAX_DATA_LEN)
-		return -EIO;
-	data_off = le16_to_cpu(con->in_hdr.data_off);
-
-	/* verify seq# */
-	seq = le64_to_cpu(con->in_hdr.seq);
-	if ((s64)seq - (s64)con->in_seq < 1) {
-		pr_info("skipping %s%lld %s seq %lld, expected %lld\n",
-			ENTITY_NAME(con->peer_name),
-			pr_addr(&con->peer_addr.in_addr),
-			seq, con->in_seq + 1);
-		con->in_base_pos = -front_len - middle_len - data_len -
-			sizeof(m->footer);
-		con->in_tag = CEPH_MSGR_TAG_READY;
-		con->in_seq++;
-		return 0;
-	} else if ((s64)seq - (s64)con->in_seq > 1) {
-		pr_err("read_partial_message bad seq %lld expected %lld\n",
-		       seq, con->in_seq + 1);
-		con->error_msg = "bad message sequence # for incoming message";
-		return -EBADMSG;
-	}
-
-	/* allocate message? */
-	if (!con->in_msg) {
-		dout("got hdr type %d front %d data %d\n", con->in_hdr.type,
-		     con->in_hdr.front_len, con->in_hdr.data_len);
-		skip = 0;
-		con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip);
-		if (skip) {
-			/* skip this message */
-			dout("alloc_msg said skip message\n");
-			BUG_ON(con->in_msg);
-			con->in_base_pos = -front_len - middle_len - data_len -
-				sizeof(m->footer);
-			con->in_tag = CEPH_MSGR_TAG_READY;
-			con->in_seq++;
-			return 0;
-		}
-		if (!con->in_msg) {
-			con->error_msg =
-				"error allocating memory for incoming message";
-			return -ENOMEM;
-		}
-		m = con->in_msg;
-		m->front.iov_len = 0;    /* haven't read it yet */
-		if (m->middle)
-			m->middle->vec.iov_len = 0;
-
-		con->in_msg_pos.page = 0;
-		con->in_msg_pos.page_pos = data_off & ~PAGE_MASK;
-		con->in_msg_pos.data_pos = 0;
-	}
-
-	/* front */
-	ret = read_partial_message_section(con, &m->front, front_len,
-					   &con->in_front_crc);
-	if (ret <= 0)
-		return ret;
-
-	/* middle */
-	if (m->middle) {
-		ret = read_partial_message_section(con, &m->middle->vec, middle_len,
-						   &con->in_middle_crc);
-		if (ret <= 0)
-			return ret;
-	}
-
-	/* (page) data */
-	while (con->in_msg_pos.data_pos < data_len) {
-		left = min((int)(data_len - con->in_msg_pos.data_pos),
-			   (int)(PAGE_SIZE - con->in_msg_pos.page_pos));
-		BUG_ON(m->pages == NULL);
-		p = kmap(m->pages[con->in_msg_pos.page]);
-		ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos,
-				       left);
-		if (ret > 0 && datacrc)
-			con->in_data_crc =
-				crc32c(con->in_data_crc,
-					  p + con->in_msg_pos.page_pos, ret);
-		kunmap(m->pages[con->in_msg_pos.page]);
-		if (ret <= 0)
-			return ret;
-		con->in_msg_pos.data_pos += ret;
-		con->in_msg_pos.page_pos += ret;
-		if (con->in_msg_pos.page_pos == PAGE_SIZE) {
-			con->in_msg_pos.page_pos = 0;
-			con->in_msg_pos.page++;
-		}
-	}
-
-	/* footer */
-	to = sizeof(m->hdr) + sizeof(m->footer);
-	while (con->in_base_pos < to) {
-		left = to - con->in_base_pos;
-		ret = ceph_tcp_recvmsg(con->sock, (char *)&m->footer +
-				       (con->in_base_pos - sizeof(m->hdr)),
-				       left);
-		if (ret <= 0)
-			return ret;
-		con->in_base_pos += ret;
-	}
-	dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n",
-	     m, front_len, m->footer.front_crc, middle_len,
-	     m->footer.middle_crc, data_len, m->footer.data_crc);
-
-	/* crc ok? */
-	if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) {
-		pr_err("read_partial_message %p front crc %u != exp. %u\n",
-		       m, con->in_front_crc, m->footer.front_crc);
-		return -EBADMSG;
-	}
-	if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) {
-		pr_err("read_partial_message %p middle crc %u != exp %u\n",
-		       m, con->in_middle_crc, m->footer.middle_crc);
-		return -EBADMSG;
-	}
-	if (datacrc &&
-	    (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 &&
-	    con->in_data_crc != le32_to_cpu(m->footer.data_crc)) {
-		pr_err("read_partial_message %p data crc %u != exp. %u\n", m,
-		       con->in_data_crc, le32_to_cpu(m->footer.data_crc));
-		return -EBADMSG;
-	}
-
-	return 1; /* done! */
-}
-
-/*
- * Process message.  This happens in the worker thread.  The callback should
- * be careful not to do anything that waits on other incoming messages or it
- * may deadlock.
- */
-static void process_message(struct ceph_connection *con)
-{
-	struct ceph_msg *msg;
-
-	msg = con->in_msg;
-	con->in_msg = NULL;
-
-	/* if first message, set peer_name */
-	if (con->peer_name.type == 0)
-		con->peer_name = msg->hdr.src;
-
-	con->in_seq++;
-	mutex_unlock(&con->mutex);
-
-	dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n",
-	     msg, le64_to_cpu(msg->hdr.seq),
-	     ENTITY_NAME(msg->hdr.src),
-	     le16_to_cpu(msg->hdr.type),
-	     ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
-	     le32_to_cpu(msg->hdr.front_len),
-	     le32_to_cpu(msg->hdr.data_len),
-	     con->in_front_crc, con->in_middle_crc, con->in_data_crc);
-	con->ops->dispatch(con, msg);
-
-	mutex_lock(&con->mutex);
-	prepare_read_tag(con);
-}
-
-
-/*
- * Write something to the socket.  Called in a worker thread when the
- * socket appears to be writeable and we have something ready to send.
- */
-static int try_write(struct ceph_connection *con)
-{
-	struct ceph_messenger *msgr = con->msgr;
-	int ret = 1;
-
-	dout("try_write start %p state %lu nref %d\n", con, con->state,
-	     atomic_read(&con->nref));
-
-more:
-	dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
-
-	/* open the socket first? */
-	if (con->sock == NULL) {
-		/*
-		 * if we were STANDBY and are reconnecting _this_
-		 * connection, bump connect_seq now.  Always bump
-		 * global_seq.
-		 */
-		if (test_and_clear_bit(STANDBY, &con->state))
-			con->connect_seq++;
-
-		prepare_write_banner(msgr, con);
-		prepare_write_connect(msgr, con, 1);
-		prepare_read_banner(con);
-		set_bit(CONNECTING, &con->state);
-		clear_bit(NEGOTIATING, &con->state);
-
-		BUG_ON(con->in_msg);
-		con->in_tag = CEPH_MSGR_TAG_READY;
-		dout("try_write initiating connect on %p new state %lu\n",
-		     con, con->state);
-		con->sock = ceph_tcp_connect(con);
-		if (IS_ERR(con->sock)) {
-			con->sock = NULL;
-			con->error_msg = "connect error";
-			ret = -1;
-			goto out;
-		}
-	}
-
-more_kvec:
-	/* kvec data queued? */
-	if (con->out_skip) {
-		ret = write_partial_skip(con);
-		if (ret <= 0)
-			goto done;
-		if (ret < 0) {
-			dout("try_write write_partial_skip err %d\n", ret);
-			goto done;
-		}
-	}
-	if (con->out_kvec_left) {
-		ret = write_partial_kvec(con);
-		if (ret <= 0)
-			goto done;
-	}
-
-	/* msg pages? */
-	if (con->out_msg) {
-		if (con->out_msg_done) {
-			ceph_msg_put(con->out_msg);
-			con->out_msg = NULL;   /* we're done with this one */
-			goto do_next;
-		}
-
-		ret = write_partial_msg_pages(con);
-		if (ret == 1)
-			goto more_kvec;  /* we need to send the footer, too! */
-		if (ret == 0)
-			goto done;
-		if (ret < 0) {
-			dout("try_write write_partial_msg_pages err %d\n",
-			     ret);
-			goto done;
-		}
-	}
-
-do_next:
-	if (!test_bit(CONNECTING, &con->state)) {
-		/* is anything else pending? */
-		if (!list_empty(&con->out_queue)) {
-			prepare_write_message(con);
-			goto more;
-		}
-		if (con->in_seq > con->in_seq_acked) {
-			prepare_write_ack(con);
-			goto more;
-		}
-		if (test_and_clear_bit(KEEPALIVE_PENDING, &con->state)) {
-			prepare_write_keepalive(con);
-			goto more;
-		}
-	}
-
-	/* Nothing to do! */
-	clear_bit(WRITE_PENDING, &con->state);
-	dout("try_write nothing else to write.\n");
-done:
-	ret = 0;
-out:
-	dout("try_write done on %p\n", con);
-	return ret;
-}
-
-
-
-/*
- * Read what we can from the socket.
- */
-static int try_read(struct ceph_connection *con)
-{
-	int ret = -1;
-
-	if (!con->sock)
-		return 0;
-
-	if (test_bit(STANDBY, &con->state))
-		return 0;
-
-	dout("try_read start on %p\n", con);
-
-more:
-	dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
-	     con->in_base_pos);
-	if (test_bit(CONNECTING, &con->state)) {
-		if (!test_bit(NEGOTIATING, &con->state)) {
-			dout("try_read connecting\n");
-			ret = read_partial_banner(con);
-			if (ret <= 0)
-				goto done;
-			if (process_banner(con) < 0) {
-				ret = -1;
-				goto out;
-			}
-		}
-		ret = read_partial_connect(con);
-		if (ret <= 0)
-			goto done;
-		if (process_connect(con) < 0) {
-			ret = -1;
-			goto out;
-		}
-		goto more;
-	}
-
-	if (con->in_base_pos < 0) {
-		/*
-		 * skipping + discarding content.
-		 *
-		 * FIXME: there must be a better way to do this!
-		 */
-		static char buf[1024];
-		int skip = min(1024, -con->in_base_pos);
-		dout("skipping %d / %d bytes\n", skip, -con->in_base_pos);
-		ret = ceph_tcp_recvmsg(con->sock, buf, skip);
-		if (ret <= 0)
-			goto done;
-		con->in_base_pos += ret;
-		if (con->in_base_pos)
-			goto more;
-	}
-	if (con->in_tag == CEPH_MSGR_TAG_READY) {
-		/*
-		 * what's next?
-		 */
-		ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1);
-		if (ret <= 0)
-			goto done;
-		dout("try_read got tag %d\n", (int)con->in_tag);
-		switch (con->in_tag) {
-		case CEPH_MSGR_TAG_MSG:
-			prepare_read_message(con);
-			break;
-		case CEPH_MSGR_TAG_ACK:
-			prepare_read_ack(con);
-			break;
-		case CEPH_MSGR_TAG_CLOSE:
-			set_bit(CLOSED, &con->state);   /* fixme */
-			goto done;
-		default:
-			goto bad_tag;
-		}
-	}
-	if (con->in_tag == CEPH_MSGR_TAG_MSG) {
-		ret = read_partial_message(con);
-		if (ret <= 0) {
-			switch (ret) {
-			case -EBADMSG:
-				con->error_msg = "bad crc";
-				ret = -EIO;
-				goto out;
-			case -EIO:
-				con->error_msg = "io error";
-				goto out;
-			default:
-				goto done;
-			}
-		}
-		if (con->in_tag == CEPH_MSGR_TAG_READY)
-			goto more;
-		process_message(con);
-		goto more;
-	}
-	if (con->in_tag == CEPH_MSGR_TAG_ACK) {
-		ret = read_partial_ack(con);
-		if (ret <= 0)
-			goto done;
-		process_ack(con);
-		goto more;
-	}
-
-done:
-	ret = 0;
-out:
-	dout("try_read done on %p\n", con);
-	return ret;
-
-bad_tag:
-	pr_err("try_read bad con->in_tag = %d\n", (int)con->in_tag);
-	con->error_msg = "protocol error, garbage tag";
-	ret = -1;
-	goto out;
-}
-
-
-/*
- * Atomically queue work on a connection.  Bump @con reference to
- * avoid races with connection teardown.
- *
- * There is some trickery going on with QUEUED and BUSY because we
- * only want a _single_ thread operating on each connection at any
- * point in time, but we want to use all available CPUs.
- *
- * The worker thread only proceeds if it can atomically set BUSY.  It
- * clears QUEUED and does it's thing.  When it thinks it's done, it
- * clears BUSY, then rechecks QUEUED.. if it's set again, it loops
- * (tries again to set BUSY).
- *
- * To queue work, we first set QUEUED, _then_ if BUSY isn't set, we
- * try to queue work.  If that fails (work is already queued, or BUSY)
- * we give up (work also already being done or is queued) but leave QUEUED
- * set so that the worker thread will loop if necessary.
- */
-static void queue_con(struct ceph_connection *con)
-{
-	if (test_bit(DEAD, &con->state)) {
-		dout("queue_con %p ignoring: DEAD\n",
-		     con);
-		return;
-	}
-
-	if (!con->ops->get(con)) {
-		dout("queue_con %p ref count 0\n", con);
-		return;
-	}
-
-	set_bit(QUEUED, &con->state);
-	if (test_bit(BUSY, &con->state)) {
-		dout("queue_con %p - already BUSY\n", con);
-		con->ops->put(con);
-	} else if (!queue_work(ceph_msgr_wq, &con->work.work)) {
-		dout("queue_con %p - already queued\n", con);
-		con->ops->put(con);
-	} else {
-		dout("queue_con %p\n", con);
-	}
-}
-
-/*
- * Do some work on a connection.  Drop a connection ref when we're done.
- */
-static void con_work(struct work_struct *work)
-{
-	struct ceph_connection *con = container_of(work, struct ceph_connection,
-						   work.work);
-	int backoff = 0;
-
-more:
-	if (test_and_set_bit(BUSY, &con->state) != 0) {
-		dout("con_work %p BUSY already set\n", con);
-		goto out;
-	}
-	dout("con_work %p start, clearing QUEUED\n", con);
-	clear_bit(QUEUED, &con->state);
-
-	mutex_lock(&con->mutex);
-
-	if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */
-		dout("con_work CLOSED\n");
-		con_close_socket(con);
-		goto done;
-	}
-	if (test_and_clear_bit(OPENING, &con->state)) {
-		/* reopen w/ new peer */
-		dout("con_work OPENING\n");
-		con_close_socket(con);
-	}
-
-	if (test_and_clear_bit(SOCK_CLOSED, &con->state) ||
-	    try_read(con) < 0 ||
-	    try_write(con) < 0) {
-		mutex_unlock(&con->mutex);
-		backoff = 1;
-		ceph_fault(con);     /* error/fault path */
-		goto done_unlocked;
-	}
-
-done:
-	mutex_unlock(&con->mutex);
-
-done_unlocked:
-	clear_bit(BUSY, &con->state);
-	dout("con->state=%lu\n", con->state);
-	if (test_bit(QUEUED, &con->state)) {
-		if (!backoff || test_bit(OPENING, &con->state)) {
-			dout("con_work %p QUEUED reset, looping\n", con);
-			goto more;
-		}
-		dout("con_work %p QUEUED reset, but just faulted\n", con);
-		clear_bit(QUEUED, &con->state);
-	}
-	dout("con_work %p done\n", con);
-
-out:
-	con->ops->put(con);
-}
-
-
-/*
- * Generic error/fault handler.  A retry mechanism is used with
- * exponential backoff
- */
-static void ceph_fault(struct ceph_connection *con)
-{
-	pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
-	       pr_addr(&con->peer_addr.in_addr), con->error_msg);
-	dout("fault %p state %lu to peer %s\n",
-	     con, con->state, pr_addr(&con->peer_addr.in_addr));
-
-	if (test_bit(LOSSYTX, &con->state)) {
-		dout("fault on LOSSYTX channel\n");
-		goto out;
-	}
-
-	mutex_lock(&con->mutex);
-	if (test_bit(CLOSED, &con->state))
-		goto out_unlock;
-
-	con_close_socket(con);
-
-	if (con->in_msg) {
-		ceph_msg_put(con->in_msg);
-		con->in_msg = NULL;
-	}
-
-	/* Requeue anything that hasn't been acked */
-	list_splice_init(&con->out_sent, &con->out_queue);
-
-	/* If there are no messages in the queue, place the connection
-	 * in a STANDBY state (i.e., don't try to reconnect just yet). */
-	if (list_empty(&con->out_queue) && !con->out_keepalive_pending) {
-		dout("fault setting STANDBY\n");
-		set_bit(STANDBY, &con->state);
-	} else {
-		/* retry after a delay. */
-		if (con->delay == 0)
-			con->delay = BASE_DELAY_INTERVAL;
-		else if (con->delay < MAX_DELAY_INTERVAL)
-			con->delay *= 2;
-		dout("fault queueing %p delay %lu\n", con, con->delay);
-		con->ops->get(con);
-		if (queue_delayed_work(ceph_msgr_wq, &con->work,
-				       round_jiffies_relative(con->delay)) == 0)
-			con->ops->put(con);
-	}
-
-out_unlock:
-	mutex_unlock(&con->mutex);
-out:
-	/*
-	 * in case we faulted due to authentication, invalidate our
-	 * current tickets so that we can get new ones.
-         */
-	if (con->auth_retry && con->ops->invalidate_authorizer) {
-		dout("calling invalidate_authorizer()\n");
-		con->ops->invalidate_authorizer(con);
-	}
-
-	if (con->ops->fault)
-		con->ops->fault(con);
-}
-
-
-
-/*
- * create a new messenger instance
- */
-struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr)
-{
-	struct ceph_messenger *msgr;
-
-	msgr = kzalloc(sizeof(*msgr), GFP_KERNEL);
-	if (msgr == NULL)
-		return ERR_PTR(-ENOMEM);
-
-	spin_lock_init(&msgr->global_seq_lock);
-
-	/* the zero page is needed if a request is "canceled" while the message
-	 * is being written over the socket */
-	msgr->zero_page = __page_cache_alloc(GFP_KERNEL | __GFP_ZERO);
-	if (!msgr->zero_page) {
-		kfree(msgr);
-		return ERR_PTR(-ENOMEM);
-	}
-	kmap(msgr->zero_page);
-
-	if (myaddr)
-		msgr->inst.addr = *myaddr;
-
-	/* select a random nonce */
-	msgr->inst.addr.type = 0;
-	get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce));
-	encode_my_addr(msgr);
-
-	dout("messenger_create %p\n", msgr);
-	return msgr;
-}
-
-void ceph_messenger_destroy(struct ceph_messenger *msgr)
-{
-	dout("destroy %p\n", msgr);
-	kunmap(msgr->zero_page);
-	__free_page(msgr->zero_page);
-	kfree(msgr);
-	dout("destroyed messenger %p\n", msgr);
-}
-
-/*
- * Queue up an outgoing message on the given connection.
- */
-void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
-{
-	if (test_bit(CLOSED, &con->state)) {
-		dout("con_send %p closed, dropping %p\n", con, msg);
-		ceph_msg_put(msg);
-		return;
-	}
-
-	/* set src+dst */
-	msg->hdr.src = con->msgr->inst.name;
-
-	BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len));
-
-	msg->needs_out_seq = true;
-
-	/* queue */
-	mutex_lock(&con->mutex);
-	BUG_ON(!list_empty(&msg->list_head));
-	list_add_tail(&msg->list_head, &con->out_queue);
-	dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg,
-	     ENTITY_NAME(con->peer_name), le16_to_cpu(msg->hdr.type),
-	     ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
-	     le32_to_cpu(msg->hdr.front_len),
-	     le32_to_cpu(msg->hdr.middle_len),
-	     le32_to_cpu(msg->hdr.data_len));
-	mutex_unlock(&con->mutex);
-
-	/* if there wasn't anything waiting to send before, queue
-	 * new work */
-	if (test_and_set_bit(WRITE_PENDING, &con->state) == 0)
-		queue_con(con);
-}
-
-/*
- * Revoke a message that was previously queued for send
- */
-void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg)
-{
-	mutex_lock(&con->mutex);
-	if (!list_empty(&msg->list_head)) {
-		dout("con_revoke %p msg %p - was on queue\n", con, msg);
-		list_del_init(&msg->list_head);
-		ceph_msg_put(msg);
-		msg->hdr.seq = 0;
-	}
-	if (con->out_msg == msg) {
-		dout("con_revoke %p msg %p - was sending\n", con, msg);
-		con->out_msg = NULL;
-		if (con->out_kvec_is_msg) {
-			con->out_skip = con->out_kvec_bytes;
-			con->out_kvec_is_msg = false;
-		}
-		ceph_msg_put(msg);
-		msg->hdr.seq = 0;
-	}
-	mutex_unlock(&con->mutex);
-}
-
-/*
- * Revoke a message that we may be reading data into
- */
-void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg)
-{
-	mutex_lock(&con->mutex);
-	if (con->in_msg && con->in_msg == msg) {
-		unsigned front_len = le32_to_cpu(con->in_hdr.front_len);
-		unsigned middle_len = le32_to_cpu(con->in_hdr.middle_len);
-		unsigned data_len = le32_to_cpu(con->in_hdr.data_len);
-
-		/* skip rest of message */
-		dout("con_revoke_pages %p msg %p revoked\n", con, msg);
-			con->in_base_pos = con->in_base_pos -
-				sizeof(struct ceph_msg_header) -
-				front_len -
-				middle_len -
-				data_len -
-				sizeof(struct ceph_msg_footer);
-		ceph_msg_put(con->in_msg);
-		con->in_msg = NULL;
-		con->in_tag = CEPH_MSGR_TAG_READY;
-		con->in_seq++;
-	} else {
-		dout("con_revoke_pages %p msg %p pages %p no-op\n",
-		     con, con->in_msg, msg);
-	}
-	mutex_unlock(&con->mutex);
-}
-
-/*
- * Queue a keepalive byte to ensure the tcp connection is alive.
- */
-void ceph_con_keepalive(struct ceph_connection *con)
-{
-	if (test_and_set_bit(KEEPALIVE_PENDING, &con->state) == 0 &&
-	    test_and_set_bit(WRITE_PENDING, &con->state) == 0)
-		queue_con(con);
-}
-
-
-/*
- * construct a new message with given type, size
- * the new msg has a ref count of 1.
- */
-struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags)
-{
-	struct ceph_msg *m;
-
-	m = kmalloc(sizeof(*m), flags);
-	if (m == NULL)
-		goto out;
-	kref_init(&m->kref);
-	INIT_LIST_HEAD(&m->list_head);
-
-	m->hdr.tid = 0;
-	m->hdr.type = cpu_to_le16(type);
-	m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT);
-	m->hdr.version = 0;
-	m->hdr.front_len = cpu_to_le32(front_len);
-	m->hdr.middle_len = 0;
-	m->hdr.data_len = 0;
-	m->hdr.data_off = 0;
-	m->hdr.reserved = 0;
-	m->footer.front_crc = 0;
-	m->footer.middle_crc = 0;
-	m->footer.data_crc = 0;
-	m->footer.flags = 0;
-	m->front_max = front_len;
-	m->front_is_vmalloc = false;
-	m->more_to_follow = false;
-	m->pool = NULL;
-
-	/* front */
-	if (front_len) {
-		if (front_len > PAGE_CACHE_SIZE) {
-			m->front.iov_base = __vmalloc(front_len, flags,
-						      PAGE_KERNEL);
-			m->front_is_vmalloc = true;
-		} else {
-			m->front.iov_base = kmalloc(front_len, flags);
-		}
-		if (m->front.iov_base == NULL) {
-			pr_err("msg_new can't allocate %d bytes\n",
-			     front_len);
-			goto out2;
-		}
-	} else {
-		m->front.iov_base = NULL;
-	}
-	m->front.iov_len = front_len;
-
-	/* middle */
-	m->middle = NULL;
-
-	/* data */
-	m->nr_pages = 0;
-	m->pages = NULL;
-	m->pagelist = NULL;
-
-	dout("ceph_msg_new %p front %d\n", m, front_len);
-	return m;
-
-out2:
-	ceph_msg_put(m);
-out:
-	pr_err("msg_new can't create type %d front %d\n", type, front_len);
-	return NULL;
-}
-
-/*
- * Allocate "middle" portion of a message, if it is needed and wasn't
- * allocated by alloc_msg.  This allows us to read a small fixed-size
- * per-type header in the front and then gracefully fail (i.e.,
- * propagate the error to the caller based on info in the front) when
- * the middle is too large.
- */
-static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg)
-{
-	int type = le16_to_cpu(msg->hdr.type);
-	int middle_len = le32_to_cpu(msg->hdr.middle_len);
-
-	dout("alloc_middle %p type %d %s middle_len %d\n", msg, type,
-	     ceph_msg_type_name(type), middle_len);
-	BUG_ON(!middle_len);
-	BUG_ON(msg->middle);
-
-	msg->middle = ceph_buffer_new(middle_len, GFP_NOFS);
-	if (!msg->middle)
-		return -ENOMEM;
-	return 0;
-}
-
-/*
- * Generic message allocator, for incoming messages.
- */
-static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
-				struct ceph_msg_header *hdr,
-				int *skip)
-{
-	int type = le16_to_cpu(hdr->type);
-	int front_len = le32_to_cpu(hdr->front_len);
-	int middle_len = le32_to_cpu(hdr->middle_len);
-	struct ceph_msg *msg = NULL;
-	int ret;
-
-	if (con->ops->alloc_msg) {
-		mutex_unlock(&con->mutex);
-		msg = con->ops->alloc_msg(con, hdr, skip);
-		mutex_lock(&con->mutex);
-		if (!msg || *skip)
-			return NULL;
-	}
-	if (!msg) {
-		*skip = 0;
-		msg = ceph_msg_new(type, front_len, GFP_NOFS);
-		if (!msg) {
-			pr_err("unable to allocate msg type %d len %d\n",
-			       type, front_len);
-			return NULL;
-		}
-	}
-	memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
-
-	if (middle_len && !msg->middle) {
-		ret = ceph_alloc_middle(con, msg);
-		if (ret < 0) {
-			ceph_msg_put(msg);
-			return NULL;
-		}
-	}
-
-	return msg;
-}
-
-
-/*
- * Free a generically kmalloc'd message.
- */
-void ceph_msg_kfree(struct ceph_msg *m)
-{
-	dout("msg_kfree %p\n", m);
-	if (m->front_is_vmalloc)
-		vfree(m->front.iov_base);
-	else
-		kfree(m->front.iov_base);
-	kfree(m);
-}
-
-/*
- * Drop a msg ref.  Destroy as needed.
- */
-void ceph_msg_last_put(struct kref *kref)
-{
-	struct ceph_msg *m = container_of(kref, struct ceph_msg, kref);
-
-	dout("ceph_msg_put last one on %p\n", m);
-	WARN_ON(!list_empty(&m->list_head));
-
-	/* drop middle, data, if any */
-	if (m->middle) {
-		ceph_buffer_put(m->middle);
-		m->middle = NULL;
-	}
-	m->nr_pages = 0;
-	m->pages = NULL;
-
-	if (m->pagelist) {
-		ceph_pagelist_release(m->pagelist);
-		kfree(m->pagelist);
-		m->pagelist = NULL;
-	}
-
-	if (m->pool)
-		ceph_msgpool_put(m->pool, m);
-	else
-		ceph_msg_kfree(m);
-}
-
-void ceph_msg_dump(struct ceph_msg *msg)
-{
-	pr_debug("msg_dump %p (front_max %d nr_pages %d)\n", msg,
-		 msg->front_max, msg->nr_pages);
-	print_hex_dump(KERN_DEBUG, "header: ",
-		       DUMP_PREFIX_OFFSET, 16, 1,
-		       &msg->hdr, sizeof(msg->hdr), true);
-	print_hex_dump(KERN_DEBUG, " front: ",
-		       DUMP_PREFIX_OFFSET, 16, 1,
-		       msg->front.iov_base, msg->front.iov_len, true);
-	if (msg->middle)
-		print_hex_dump(KERN_DEBUG, "middle: ",
-			       DUMP_PREFIX_OFFSET, 16, 1,
-			       msg->middle->vec.iov_base,
-			       msg->middle->vec.iov_len, true);
-	print_hex_dump(KERN_DEBUG, "footer: ",
-		       DUMP_PREFIX_OFFSET, 16, 1,
-		       &msg->footer, sizeof(msg->footer), true);
-}
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
deleted file mode 100644
index 76fbc957bc1..00000000000
--- a/fs/ceph/messenger.h
+++ /dev/null
@@ -1,253 +0,0 @@
-#ifndef __FS_CEPH_MESSENGER_H
-#define __FS_CEPH_MESSENGER_H
-
-#include <linux/kref.h>
-#include <linux/mutex.h>
-#include <linux/net.h>
-#include <linux/radix-tree.h>
-#include <linux/uio.h>
-#include <linux/version.h>
-#include <linux/workqueue.h>
-
-#include "types.h"
-#include "buffer.h"
-
-struct ceph_msg;
-struct ceph_connection;
-
-extern struct workqueue_struct *ceph_msgr_wq;       /* receive work queue */
-
-/*
- * Ceph defines these callbacks for handling connection events.
- */
-struct ceph_connection_operations {
-	struct ceph_connection *(*get)(struct ceph_connection *);
-	void (*put)(struct ceph_connection *);
-
-	/* handle an incoming message. */
-	void (*dispatch) (struct ceph_connection *con, struct ceph_msg *m);
-
-	/* authorize an outgoing connection */
-	int (*get_authorizer) (struct ceph_connection *con,
-			       void **buf, int *len, int *proto,
-			       void **reply_buf, int *reply_len, int force_new);
-	int (*verify_authorizer_reply) (struct ceph_connection *con, int len);
-	int (*invalidate_authorizer)(struct ceph_connection *con);
-
-	/* protocol version mismatch */
-	void (*bad_proto) (struct ceph_connection *con);
-
-	/* there was some error on the socket (disconnect, whatever) */
-	void (*fault) (struct ceph_connection *con);
-
-	/* a remote host as terminated a message exchange session, and messages
-	 * we sent (or they tried to send us) may be lost. */
-	void (*peer_reset) (struct ceph_connection *con);
-
-	struct ceph_msg * (*alloc_msg) (struct ceph_connection *con,
-					struct ceph_msg_header *hdr,
-					int *skip);
-};
-
-/* use format string %s%d */
-#define ENTITY_NAME(n) ceph_entity_type_name((n).type), le64_to_cpu((n).num)
-
-struct ceph_messenger {
-	struct ceph_entity_inst inst;    /* my name+address */
-	struct ceph_entity_addr my_enc_addr;
-	struct page *zero_page;          /* used in certain error cases */
-
-	bool nocrc;
-
-	/*
-	 * the global_seq counts connections i (attempt to) initiate
-	 * in order to disambiguate certain connect race conditions.
-	 */
-	u32 global_seq;
-	spinlock_t global_seq_lock;
-};
-
-/*
- * a single message.  it contains a header (src, dest, message type, etc.),
- * footer (crc values, mainly), a "front" message body, and possibly a
- * data payload (stored in some number of pages).
- */
-struct ceph_msg {
-	struct ceph_msg_header hdr;	/* header */
-	struct ceph_msg_footer footer;	/* footer */
-	struct kvec front;              /* unaligned blobs of message */
-	struct ceph_buffer *middle;
-	struct page **pages;            /* data payload.  NOT OWNER. */
-	unsigned nr_pages;              /* size of page array */
-	struct ceph_pagelist *pagelist; /* instead of pages */
-	struct list_head list_head;
-	struct kref kref;
-	bool front_is_vmalloc;
-	bool more_to_follow;
-	bool needs_out_seq;
-	int front_max;
-
-	struct ceph_msgpool *pool;
-};
-
-struct ceph_msg_pos {
-	int page, page_pos;  /* which page; offset in page */
-	int data_pos;        /* offset in data payload */
-	int did_page_crc;    /* true if we've calculated crc for current page */
-};
-
-/* ceph connection fault delay defaults, for exponential backoff */
-#define BASE_DELAY_INTERVAL	(HZ/2)
-#define MAX_DELAY_INTERVAL	(5 * 60 * HZ)
-
-/*
- * ceph_connection state bit flags
- *
- * QUEUED and BUSY are used together to ensure that only a single
- * thread is currently opening, reading or writing data to the socket.
- */
-#define LOSSYTX         0  /* we can close channel or drop messages on errors */
-#define CONNECTING	1
-#define NEGOTIATING	2
-#define KEEPALIVE_PENDING      3
-#define WRITE_PENDING	4  /* we have data ready to send */
-#define QUEUED          5  /* there is work queued on this connection */
-#define BUSY            6  /* work is being done */
-#define STANDBY		8  /* no outgoing messages, socket closed.  we keep
-			    * the ceph_connection around to maintain shared
-			    * state with the peer. */
-#define CLOSED		10 /* we've closed the connection */
-#define SOCK_CLOSED	11 /* socket state changed to closed */
-#define OPENING         13 /* open connection w/ (possibly new) peer */
-#define DEAD            14 /* dead, about to kfree */
-
-/*
- * A single connection with another host.
- *
- * We maintain a queue of outgoing messages, and some session state to
- * ensure that we can preserve the lossless, ordered delivery of
- * messages in the case of a TCP disconnect.
- */
-struct ceph_connection {
-	void *private;
-	atomic_t nref;
-
-	const struct ceph_connection_operations *ops;
-
-	struct ceph_messenger *msgr;
-	struct socket *sock;
-	unsigned long state;	/* connection state (see flags above) */
-	const char *error_msg;  /* error message, if any */
-
-	struct ceph_entity_addr peer_addr; /* peer address */
-	struct ceph_entity_name peer_name; /* peer name */
-	struct ceph_entity_addr peer_addr_for_me;
-	unsigned peer_features;
-	u32 connect_seq;      /* identify the most recent connection
-				 attempt for this connection, client */
-	u32 peer_global_seq;  /* peer's global seq for this connection */
-
-	int auth_retry;       /* true if we need a newer authorizer */
-	void *auth_reply_buf;   /* where to put the authorizer reply */
-	int auth_reply_buf_len;
-
-	struct mutex mutex;
-
-	/* out queue */
-	struct list_head out_queue;
-	struct list_head out_sent;   /* sending or sent but unacked */
-	u64 out_seq;		     /* last message queued for send */
-	bool out_keepalive_pending;
-
-	u64 in_seq, in_seq_acked;  /* last message received, acked */
-
-	/* connection negotiation temps */
-	char in_banner[CEPH_BANNER_MAX_LEN];
-	union {
-		struct {  /* outgoing connection */
-			struct ceph_msg_connect out_connect;
-			struct ceph_msg_connect_reply in_reply;
-		};
-		struct {  /* incoming */
-			struct ceph_msg_connect in_connect;
-			struct ceph_msg_connect_reply out_reply;
-		};
-	};
-	struct ceph_entity_addr actual_peer_addr;
-
-	/* message out temps */
-	struct ceph_msg *out_msg;        /* sending message (== tail of
-					    out_sent) */
-	bool out_msg_done;
-	struct ceph_msg_pos out_msg_pos;
-
-	struct kvec out_kvec[8],         /* sending header/footer data */
-		*out_kvec_cur;
-	int out_kvec_left;   /* kvec's left in out_kvec */
-	int out_skip;        /* skip this many bytes */
-	int out_kvec_bytes;  /* total bytes left */
-	bool out_kvec_is_msg; /* kvec refers to out_msg */
-	int out_more;        /* there is more data after the kvecs */
-	__le64 out_temp_ack; /* for writing an ack */
-
-	/* message in temps */
-	struct ceph_msg_header in_hdr;
-	struct ceph_msg *in_msg;
-	struct ceph_msg_pos in_msg_pos;
-	u32 in_front_crc, in_middle_crc, in_data_crc;  /* calculated crc */
-
-	char in_tag;         /* protocol control byte */
-	int in_base_pos;     /* bytes read */
-	__le64 in_temp_ack;  /* for reading an ack */
-
-	struct delayed_work work;	    /* send|recv work */
-	unsigned long       delay;          /* current delay interval */
-};
-
-
-extern const char *pr_addr(const struct sockaddr_storage *ss);
-extern int ceph_parse_ips(const char *c, const char *end,
-			  struct ceph_entity_addr *addr,
-			  int max_count, int *count);
-
-
-extern int ceph_msgr_init(void);
-extern void ceph_msgr_exit(void);
-extern void ceph_msgr_flush(void);
-
-extern struct ceph_messenger *ceph_messenger_create(
-	struct ceph_entity_addr *myaddr);
-extern void ceph_messenger_destroy(struct ceph_messenger *);
-
-extern void ceph_con_init(struct ceph_messenger *msgr,
-			  struct ceph_connection *con);
-extern void ceph_con_open(struct ceph_connection *con,
-			  struct ceph_entity_addr *addr);
-extern bool ceph_con_opened(struct ceph_connection *con);
-extern void ceph_con_close(struct ceph_connection *con);
-extern void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg);
-extern void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg);
-extern void ceph_con_revoke_message(struct ceph_connection *con,
-				  struct ceph_msg *msg);
-extern void ceph_con_keepalive(struct ceph_connection *con);
-extern struct ceph_connection *ceph_con_get(struct ceph_connection *con);
-extern void ceph_con_put(struct ceph_connection *con);
-
-extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags);
-extern void ceph_msg_kfree(struct ceph_msg *m);
-
-
-static inline struct ceph_msg *ceph_msg_get(struct ceph_msg *msg)
-{
-	kref_get(&msg->kref);
-	return msg;
-}
-extern void ceph_msg_last_put(struct kref *kref);
-static inline void ceph_msg_put(struct ceph_msg *msg)
-{
-	kref_put(&msg->kref, ceph_msg_last_put);
-}
-
-extern void ceph_msg_dump(struct ceph_msg *msg);
-
-#endif
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
deleted file mode 100644
index 54fe01c5070..00000000000
--- a/fs/ceph/mon_client.c
+++ /dev/null
@@ -1,882 +0,0 @@
-#include "ceph_debug.h"
-
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/random.h>
-#include <linux/sched.h>
-
-#include "mon_client.h"
-#include "super.h"
-#include "auth.h"
-#include "decode.h"
-
-/*
- * Interact with Ceph monitor cluster.  Handle requests for new map
- * versions, and periodically resend as needed.  Also implement
- * statfs() and umount().
- *
- * A small cluster of Ceph "monitors" are responsible for managing critical
- * cluster configuration and state information.  An odd number (e.g., 3, 5)
- * of cmon daemons use a modified version of the Paxos part-time parliament
- * algorithm to manage the MDS map (mds cluster membership), OSD map, and
- * list of clients who have mounted the file system.
- *
- * We maintain an open, active session with a monitor at all times in order to
- * receive timely MDSMap updates.  We periodically send a keepalive byte on the
- * TCP socket to ensure we detect a failure.  If the connection does break, we
- * randomly hunt for a new monitor.  Once the connection is reestablished, we
- * resend any outstanding requests.
- */
-
-static const struct ceph_connection_operations mon_con_ops;
-
-static int __validate_auth(struct ceph_mon_client *monc);
-
-/*
- * Decode a monmap blob (e.g., during mount).
- */
-struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
-{
-	struct ceph_monmap *m = NULL;
-	int i, err = -EINVAL;
-	struct ceph_fsid fsid;
-	u32 epoch, num_mon;
-	u16 version;
-	u32 len;
-
-	ceph_decode_32_safe(&p, end, len, bad);
-	ceph_decode_need(&p, end, len, bad);
-
-	dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p));
-
-	ceph_decode_16_safe(&p, end, version, bad);
-
-	ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad);
-	ceph_decode_copy(&p, &fsid, sizeof(fsid));
-	epoch = ceph_decode_32(&p);
-
-	num_mon = ceph_decode_32(&p);
-	ceph_decode_need(&p, end, num_mon*sizeof(m->mon_inst[0]), bad);
-
-	if (num_mon >= CEPH_MAX_MON)
-		goto bad;
-	m = kmalloc(sizeof(*m) + sizeof(m->mon_inst[0])*num_mon, GFP_NOFS);
-	if (m == NULL)
-		return ERR_PTR(-ENOMEM);
-	m->fsid = fsid;
-	m->epoch = epoch;
-	m->num_mon = num_mon;
-	ceph_decode_copy(&p, m->mon_inst, num_mon*sizeof(m->mon_inst[0]));
-	for (i = 0; i < num_mon; i++)
-		ceph_decode_addr(&m->mon_inst[i].addr);
-
-	dout("monmap_decode epoch %d, num_mon %d\n", m->epoch,
-	     m->num_mon);
-	for (i = 0; i < m->num_mon; i++)
-		dout("monmap_decode  mon%d is %s\n", i,
-		     pr_addr(&m->mon_inst[i].addr.in_addr));
-	return m;
-
-bad:
-	dout("monmap_decode failed with %d\n", err);
-	kfree(m);
-	return ERR_PTR(err);
-}
-
-/*
- * return true if *addr is included in the monmap.
- */
-int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr)
-{
-	int i;
-
-	for (i = 0; i < m->num_mon; i++)
-		if (memcmp(addr, &m->mon_inst[i].addr, sizeof(*addr)) == 0)
-			return 1;
-	return 0;
-}
-
-/*
- * Send an auth request.
- */
-static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
-{
-	monc->pending_auth = 1;
-	monc->m_auth->front.iov_len = len;
-	monc->m_auth->hdr.front_len = cpu_to_le32(len);
-	ceph_con_revoke(monc->con, monc->m_auth);
-	ceph_msg_get(monc->m_auth);  /* keep our ref */
-	ceph_con_send(monc->con, monc->m_auth);
-}
-
-/*
- * Close monitor session, if any.
- */
-static void __close_session(struct ceph_mon_client *monc)
-{
-	if (monc->con) {
-		dout("__close_session closing mon%d\n", monc->cur_mon);
-		ceph_con_revoke(monc->con, monc->m_auth);
-		ceph_con_close(monc->con);
-		monc->cur_mon = -1;
-		monc->pending_auth = 0;
-		ceph_auth_reset(monc->auth);
-	}
-}
-
-/*
- * Open a session with a (new) monitor.
- */
-static int __open_session(struct ceph_mon_client *monc)
-{
-	char r;
-	int ret;
-
-	if (monc->cur_mon < 0) {
-		get_random_bytes(&r, 1);
-		monc->cur_mon = r % monc->monmap->num_mon;
-		dout("open_session num=%d r=%d -> mon%d\n",
-		     monc->monmap->num_mon, r, monc->cur_mon);
-		monc->sub_sent = 0;
-		monc->sub_renew_after = jiffies;  /* i.e., expired */
-		monc->want_next_osdmap = !!monc->want_next_osdmap;
-
-		dout("open_session mon%d opening\n", monc->cur_mon);
-		monc->con->peer_name.type = CEPH_ENTITY_TYPE_MON;
-		monc->con->peer_name.num = cpu_to_le64(monc->cur_mon);
-		ceph_con_open(monc->con,
-			      &monc->monmap->mon_inst[monc->cur_mon].addr);
-
-		/* initiatiate authentication handshake */
-		ret = ceph_auth_build_hello(monc->auth,
-					    monc->m_auth->front.iov_base,
-					    monc->m_auth->front_max);
-		__send_prepared_auth_request(monc, ret);
-	} else {
-		dout("open_session mon%d already open\n", monc->cur_mon);
-	}
-	return 0;
-}
-
-static bool __sub_expired(struct ceph_mon_client *monc)
-{
-	return time_after_eq(jiffies, monc->sub_renew_after);
-}
-
-/*
- * Reschedule delayed work timer.
- */
-static void __schedule_delayed(struct ceph_mon_client *monc)
-{
-	unsigned delay;
-
-	if (monc->cur_mon < 0 || __sub_expired(monc))
-		delay = 10 * HZ;
-	else
-		delay = 20 * HZ;
-	dout("__schedule_delayed after %u\n", delay);
-	schedule_delayed_work(&monc->delayed_work, delay);
-}
-
-/*
- * Send subscribe request for mdsmap and/or osdmap.
- */
-static void __send_subscribe(struct ceph_mon_client *monc)
-{
-	dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n",
-	     (unsigned)monc->sub_sent, __sub_expired(monc),
-	     monc->want_next_osdmap);
-	if ((__sub_expired(monc) && !monc->sub_sent) ||
-	    monc->want_next_osdmap == 1) {
-		struct ceph_msg *msg = monc->m_subscribe;
-		struct ceph_mon_subscribe_item *i;
-		void *p, *end;
-
-		p = msg->front.iov_base;
-		end = p + msg->front_max;
-
-		dout("__send_subscribe to 'mdsmap' %u+\n",
-		     (unsigned)monc->have_mdsmap);
-		if (monc->want_next_osdmap) {
-			dout("__send_subscribe to 'osdmap' %u\n",
-			     (unsigned)monc->have_osdmap);
-			ceph_encode_32(&p, 3);
-			ceph_encode_string(&p, end, "osdmap", 6);
-			i = p;
-			i->have = cpu_to_le64(monc->have_osdmap);
-			i->onetime = 1;
-			p += sizeof(*i);
-			monc->want_next_osdmap = 2;  /* requested */
-		} else {
-			ceph_encode_32(&p, 2);
-		}
-		ceph_encode_string(&p, end, "mdsmap", 6);
-		i = p;
-		i->have = cpu_to_le64(monc->have_mdsmap);
-		i->onetime = 0;
-		p += sizeof(*i);
-		ceph_encode_string(&p, end, "monmap", 6);
-		i = p;
-		i->have = 0;
-		i->onetime = 0;
-		p += sizeof(*i);
-
-		msg->front.iov_len = p - msg->front.iov_base;
-		msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
-		ceph_con_revoke(monc->con, msg);
-		ceph_con_send(monc->con, ceph_msg_get(msg));
-
-		monc->sub_sent = jiffies | 1;  /* never 0 */
-	}
-}
-
-static void handle_subscribe_ack(struct ceph_mon_client *monc,
-				 struct ceph_msg *msg)
-{
-	unsigned seconds;
-	struct ceph_mon_subscribe_ack *h = msg->front.iov_base;
-
-	if (msg->front.iov_len < sizeof(*h))
-		goto bad;
-	seconds = le32_to_cpu(h->duration);
-
-	mutex_lock(&monc->mutex);
-	if (monc->hunting) {
-		pr_info("mon%d %s session established\n",
-			monc->cur_mon, pr_addr(&monc->con->peer_addr.in_addr));
-		monc->hunting = false;
-	}
-	dout("handle_subscribe_ack after %d seconds\n", seconds);
-	monc->sub_renew_after = monc->sub_sent + (seconds >> 1)*HZ - 1;
-	monc->sub_sent = 0;
-	mutex_unlock(&monc->mutex);
-	return;
-bad:
-	pr_err("got corrupt subscribe-ack msg\n");
-	ceph_msg_dump(msg);
-}
-
-/*
- * Keep track of which maps we have
- */
-int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got)
-{
-	mutex_lock(&monc->mutex);
-	monc->have_mdsmap = got;
-	mutex_unlock(&monc->mutex);
-	return 0;
-}
-
-int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got)
-{
-	mutex_lock(&monc->mutex);
-	monc->have_osdmap = got;
-	monc->want_next_osdmap = 0;
-	mutex_unlock(&monc->mutex);
-	return 0;
-}
-
-/*
- * Register interest in the next osdmap
- */
-void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
-{
-	dout("request_next_osdmap have %u\n", monc->have_osdmap);
-	mutex_lock(&monc->mutex);
-	if (!monc->want_next_osdmap)
-		monc->want_next_osdmap = 1;
-	if (monc->want_next_osdmap < 2)
-		__send_subscribe(monc);
-	mutex_unlock(&monc->mutex);
-}
-
-/*
- *
- */
-int ceph_monc_open_session(struct ceph_mon_client *monc)
-{
-	if (!monc->con) {
-		monc->con = kmalloc(sizeof(*monc->con), GFP_KERNEL);
-		if (!monc->con)
-			return -ENOMEM;
-		ceph_con_init(monc->client->msgr, monc->con);
-		monc->con->private = monc;
-		monc->con->ops = &mon_con_ops;
-	}
-
-	mutex_lock(&monc->mutex);
-	__open_session(monc);
-	__schedule_delayed(monc);
-	mutex_unlock(&monc->mutex);
-	return 0;
-}
-
-/*
- * The monitor responds with mount ack indicate mount success.  The
- * included client ticket allows the client to talk to MDSs and OSDs.
- */
-static void ceph_monc_handle_map(struct ceph_mon_client *monc,
-				 struct ceph_msg *msg)
-{
-	struct ceph_client *client = monc->client;
-	struct ceph_monmap *monmap = NULL, *old = monc->monmap;
-	void *p, *end;
-
-	mutex_lock(&monc->mutex);
-
-	dout("handle_monmap\n");
-	p = msg->front.iov_base;
-	end = p + msg->front.iov_len;
-
-	monmap = ceph_monmap_decode(p, end);
-	if (IS_ERR(monmap)) {
-		pr_err("problem decoding monmap, %d\n",
-		       (int)PTR_ERR(monmap));
-		goto out;
-	}
-
-	if (ceph_check_fsid(monc->client, &monmap->fsid) < 0) {
-		kfree(monmap);
-		goto out;
-	}
-
-	client->monc.monmap = monmap;
-	kfree(old);
-
-out:
-	mutex_unlock(&monc->mutex);
-	wake_up_all(&client->auth_wq);
-}
-
-/*
- * statfs
- */
-static struct ceph_mon_generic_request *__lookup_generic_req(
-	struct ceph_mon_client *monc, u64 tid)
-{
-	struct ceph_mon_generic_request *req;
-	struct rb_node *n = monc->generic_request_tree.rb_node;
-
-	while (n) {
-		req = rb_entry(n, struct ceph_mon_generic_request, node);
-		if (tid < req->tid)
-			n = n->rb_left;
-		else if (tid > req->tid)
-			n = n->rb_right;
-		else
-			return req;
-	}
-	return NULL;
-}
-
-static void __insert_generic_request(struct ceph_mon_client *monc,
-			    struct ceph_mon_generic_request *new)
-{
-	struct rb_node **p = &monc->generic_request_tree.rb_node;
-	struct rb_node *parent = NULL;
-	struct ceph_mon_generic_request *req = NULL;
-
-	while (*p) {
-		parent = *p;
-		req = rb_entry(parent, struct ceph_mon_generic_request, node);
-		if (new->tid < req->tid)
-			p = &(*p)->rb_left;
-		else if (new->tid > req->tid)
-			p = &(*p)->rb_right;
-		else
-			BUG();
-	}
-
-	rb_link_node(&new->node, parent, p);
-	rb_insert_color(&new->node, &monc->generic_request_tree);
-}
-
-static void release_generic_request(struct kref *kref)
-{
-	struct ceph_mon_generic_request *req =
-		container_of(kref, struct ceph_mon_generic_request, kref);
-
-	if (req->reply)
-		ceph_msg_put(req->reply);
-	if (req->request)
-		ceph_msg_put(req->request);
-
-	kfree(req);
-}
-
-static void put_generic_request(struct ceph_mon_generic_request *req)
-{
-	kref_put(&req->kref, release_generic_request);
-}
-
-static void get_generic_request(struct ceph_mon_generic_request *req)
-{
-	kref_get(&req->kref);
-}
-
-static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
-					 struct ceph_msg_header *hdr,
-					 int *skip)
-{
-	struct ceph_mon_client *monc = con->private;
-	struct ceph_mon_generic_request *req;
-	u64 tid = le64_to_cpu(hdr->tid);
-	struct ceph_msg *m;
-
-	mutex_lock(&monc->mutex);
-	req = __lookup_generic_req(monc, tid);
-	if (!req) {
-		dout("get_generic_reply %lld dne\n", tid);
-		*skip = 1;
-		m = NULL;
-	} else {
-		dout("get_generic_reply %lld got %p\n", tid, req->reply);
-		m = ceph_msg_get(req->reply);
-		/*
-		 * we don't need to track the connection reading into
-		 * this reply because we only have one open connection
-		 * at a time, ever.
-		 */
-	}
-	mutex_unlock(&monc->mutex);
-	return m;
-}
-
-static void handle_statfs_reply(struct ceph_mon_client *monc,
-				struct ceph_msg *msg)
-{
-	struct ceph_mon_generic_request *req;
-	struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
-	u64 tid = le64_to_cpu(msg->hdr.tid);
-
-	if (msg->front.iov_len != sizeof(*reply))
-		goto bad;
-	dout("handle_statfs_reply %p tid %llu\n", msg, tid);
-
-	mutex_lock(&monc->mutex);
-	req = __lookup_generic_req(monc, tid);
-	if (req) {
-		*(struct ceph_statfs *)req->buf = reply->st;
-		req->result = 0;
-		get_generic_request(req);
-	}
-	mutex_unlock(&monc->mutex);
-	if (req) {
-		complete_all(&req->completion);
-		put_generic_request(req);
-	}
-	return;
-
-bad:
-	pr_err("corrupt generic reply, no tid\n");
-	ceph_msg_dump(msg);
-}
-
-/*
- * Do a synchronous statfs().
- */
-int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
-{
-	struct ceph_mon_generic_request *req;
-	struct ceph_mon_statfs *h;
-	int err;
-
-	req = kzalloc(sizeof(*req), GFP_NOFS);
-	if (!req)
-		return -ENOMEM;
-
-	kref_init(&req->kref);
-	req->buf = buf;
-	init_completion(&req->completion);
-
-	err = -ENOMEM;
-	req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS);
-	if (!req->request)
-		goto out;
-	req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS);
-	if (!req->reply)
-		goto out;
-
-	/* fill out request */
-	h = req->request->front.iov_base;
-	h->monhdr.have_version = 0;
-	h->monhdr.session_mon = cpu_to_le16(-1);
-	h->monhdr.session_mon_tid = 0;
-	h->fsid = monc->monmap->fsid;
-
-	/* register request */
-	mutex_lock(&monc->mutex);
-	req->tid = ++monc->last_tid;
-	req->request->hdr.tid = cpu_to_le64(req->tid);
-	__insert_generic_request(monc, req);
-	monc->num_generic_requests++;
-	mutex_unlock(&monc->mutex);
-
-	/* send request and wait */
-	ceph_con_send(monc->con, ceph_msg_get(req->request));
-	err = wait_for_completion_interruptible(&req->completion);
-
-	mutex_lock(&monc->mutex);
-	rb_erase(&req->node, &monc->generic_request_tree);
-	monc->num_generic_requests--;
-	mutex_unlock(&monc->mutex);
-
-	if (!err)
-		err = req->result;
-
-out:
-	kref_put(&req->kref, release_generic_request);
-	return err;
-}
-
-/*
- * Resend pending statfs requests.
- */
-static void __resend_generic_request(struct ceph_mon_client *monc)
-{
-	struct ceph_mon_generic_request *req;
-	struct rb_node *p;
-
-	for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) {
-		req = rb_entry(p, struct ceph_mon_generic_request, node);
-		ceph_con_revoke(monc->con, req->request);
-		ceph_con_send(monc->con, ceph_msg_get(req->request));
-	}
-}
-
-/*
- * Delayed work.  If we haven't mounted yet, retry.  Otherwise,
- * renew/retry subscription as needed (in case it is timing out, or we
- * got an ENOMEM).  And keep the monitor connection alive.
- */
-static void delayed_work(struct work_struct *work)
-{
-	struct ceph_mon_client *monc =
-		container_of(work, struct ceph_mon_client, delayed_work.work);
-
-	dout("monc delayed_work\n");
-	mutex_lock(&monc->mutex);
-	if (monc->hunting) {
-		__close_session(monc);
-		__open_session(monc);  /* continue hunting */
-	} else {
-		ceph_con_keepalive(monc->con);
-
-		__validate_auth(monc);
-
-		if (monc->auth->ops->is_authenticated(monc->auth))
-			__send_subscribe(monc);
-	}
-	__schedule_delayed(monc);
-	mutex_unlock(&monc->mutex);
-}
-
-/*
- * On startup, we build a temporary monmap populated with the IPs
- * provided by mount(2).
- */
-static int build_initial_monmap(struct ceph_mon_client *monc)
-{
-	struct ceph_mount_args *args = monc->client->mount_args;
-	struct ceph_entity_addr *mon_addr = args->mon_addr;
-	int num_mon = args->num_mon;
-	int i;
-
-	/* build initial monmap */
-	monc->monmap = kzalloc(sizeof(*monc->monmap) +
-			       num_mon*sizeof(monc->monmap->mon_inst[0]),
-			       GFP_KERNEL);
-	if (!monc->monmap)
-		return -ENOMEM;
-	for (i = 0; i < num_mon; i++) {
-		monc->monmap->mon_inst[i].addr = mon_addr[i];
-		monc->monmap->mon_inst[i].addr.nonce = 0;
-		monc->monmap->mon_inst[i].name.type =
-			CEPH_ENTITY_TYPE_MON;
-		monc->monmap->mon_inst[i].name.num = cpu_to_le64(i);
-	}
-	monc->monmap->num_mon = num_mon;
-	monc->have_fsid = false;
-
-	/* release addr memory */
-	kfree(args->mon_addr);
-	args->mon_addr = NULL;
-	args->num_mon = 0;
-	return 0;
-}
-
-int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
-{
-	int err = 0;
-
-	dout("init\n");
-	memset(monc, 0, sizeof(*monc));
-	monc->client = cl;
-	monc->monmap = NULL;
-	mutex_init(&monc->mutex);
-
-	err = build_initial_monmap(monc);
-	if (err)
-		goto out;
-
-	monc->con = NULL;
-
-	/* authentication */
-	monc->auth = ceph_auth_init(cl->mount_args->name,
-				    cl->mount_args->secret);
-	if (IS_ERR(monc->auth))
-		return PTR_ERR(monc->auth);
-	monc->auth->want_keys =
-		CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |
-		CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS;
-
-	/* msgs */
-	err = -ENOMEM;
-	monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK,
-				     sizeof(struct ceph_mon_subscribe_ack),
-				     GFP_NOFS);
-	if (!monc->m_subscribe_ack)
-		goto out_monmap;
-
-	monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS);
-	if (!monc->m_subscribe)
-		goto out_subscribe_ack;
-
-	monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS);
-	if (!monc->m_auth_reply)
-		goto out_subscribe;
-
-	monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS);
-	monc->pending_auth = 0;
-	if (!monc->m_auth)
-		goto out_auth_reply;
-
-	monc->cur_mon = -1;
-	monc->hunting = true;
-	monc->sub_renew_after = jiffies;
-	monc->sub_sent = 0;
-
-	INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
-	monc->generic_request_tree = RB_ROOT;
-	monc->num_generic_requests = 0;
-	monc->last_tid = 0;
-
-	monc->have_mdsmap = 0;
-	monc->have_osdmap = 0;
-	monc->want_next_osdmap = 1;
-	return 0;
-
-out_auth_reply:
-	ceph_msg_put(monc->m_auth_reply);
-out_subscribe:
-	ceph_msg_put(monc->m_subscribe);
-out_subscribe_ack:
-	ceph_msg_put(monc->m_subscribe_ack);
-out_monmap:
-	kfree(monc->monmap);
-out:
-	return err;
-}
-
-void ceph_monc_stop(struct ceph_mon_client *monc)
-{
-	dout("stop\n");
-	cancel_delayed_work_sync(&monc->delayed_work);
-
-	mutex_lock(&monc->mutex);
-	__close_session(monc);
-	if (monc->con) {
-		monc->con->private = NULL;
-		monc->con->ops->put(monc->con);
-		monc->con = NULL;
-	}
-	mutex_unlock(&monc->mutex);
-
-	ceph_auth_destroy(monc->auth);
-
-	ceph_msg_put(monc->m_auth);
-	ceph_msg_put(monc->m_auth_reply);
-	ceph_msg_put(monc->m_subscribe);
-	ceph_msg_put(monc->m_subscribe_ack);
-
-	kfree(monc->monmap);
-}
-
-static void handle_auth_reply(struct ceph_mon_client *monc,
-			      struct ceph_msg *msg)
-{
-	int ret;
-	int was_auth = 0;
-
-	mutex_lock(&monc->mutex);
-	if (monc->auth->ops)
-		was_auth = monc->auth->ops->is_authenticated(monc->auth);
-	monc->pending_auth = 0;
-	ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
-				     msg->front.iov_len,
-				     monc->m_auth->front.iov_base,
-				     monc->m_auth->front_max);
-	if (ret < 0) {
-		monc->client->auth_err = ret;
-		wake_up_all(&monc->client->auth_wq);
-	} else if (ret > 0) {
-		__send_prepared_auth_request(monc, ret);
-	} else if (!was_auth && monc->auth->ops->is_authenticated(monc->auth)) {
-		dout("authenticated, starting session\n");
-
-		monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
-		monc->client->msgr->inst.name.num =
-					cpu_to_le64(monc->auth->global_id);
-
-		__send_subscribe(monc);
-		__resend_generic_request(monc);
-	}
-	mutex_unlock(&monc->mutex);
-}
-
-static int __validate_auth(struct ceph_mon_client *monc)
-{
-	int ret;
-
-	if (monc->pending_auth)
-		return 0;
-
-	ret = ceph_build_auth(monc->auth, monc->m_auth->front.iov_base,
-			      monc->m_auth->front_max);
-	if (ret <= 0)
-		return ret; /* either an error, or no need to authenticate */
-	__send_prepared_auth_request(monc, ret);
-	return 0;
-}
-
-int ceph_monc_validate_auth(struct ceph_mon_client *monc)
-{
-	int ret;
-
-	mutex_lock(&monc->mutex);
-	ret = __validate_auth(monc);
-	mutex_unlock(&monc->mutex);
-	return ret;
-}
-
-/*
- * handle incoming message
- */
-static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
-{
-	struct ceph_mon_client *monc = con->private;
-	int type = le16_to_cpu(msg->hdr.type);
-
-	if (!monc)
-		return;
-
-	switch (type) {
-	case CEPH_MSG_AUTH_REPLY:
-		handle_auth_reply(monc, msg);
-		break;
-
-	case CEPH_MSG_MON_SUBSCRIBE_ACK:
-		handle_subscribe_ack(monc, msg);
-		break;
-
-	case CEPH_MSG_STATFS_REPLY:
-		handle_statfs_reply(monc, msg);
-		break;
-
-	case CEPH_MSG_MON_MAP:
-		ceph_monc_handle_map(monc, msg);
-		break;
-
-	case CEPH_MSG_MDS_MAP:
-		ceph_mdsc_handle_map(&monc->client->mdsc, msg);
-		break;
-
-	case CEPH_MSG_OSD_MAP:
-		ceph_osdc_handle_map(&monc->client->osdc, msg);
-		break;
-
-	default:
-		pr_err("received unknown message type %d %s\n", type,
-		       ceph_msg_type_name(type));
-	}
-	ceph_msg_put(msg);
-}
-
-/*
- * Allocate memory for incoming message
- */
-static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
-				      struct ceph_msg_header *hdr,
-				      int *skip)
-{
-	struct ceph_mon_client *monc = con->private;
-	int type = le16_to_cpu(hdr->type);
-	int front_len = le32_to_cpu(hdr->front_len);
-	struct ceph_msg *m = NULL;
-
-	*skip = 0;
-
-	switch (type) {
-	case CEPH_MSG_MON_SUBSCRIBE_ACK:
-		m = ceph_msg_get(monc->m_subscribe_ack);
-		break;
-	case CEPH_MSG_STATFS_REPLY:
-		return get_generic_reply(con, hdr, skip);
-	case CEPH_MSG_AUTH_REPLY:
-		m = ceph_msg_get(monc->m_auth_reply);
-		break;
-	case CEPH_MSG_MON_MAP:
-	case CEPH_MSG_MDS_MAP:
-	case CEPH_MSG_OSD_MAP:
-		m = ceph_msg_new(type, front_len, GFP_NOFS);
-		break;
-	}
-
-	if (!m) {
-		pr_info("alloc_msg unknown type %d\n", type);
-		*skip = 1;
-	}
-	return m;
-}
-
-/*
- * If the monitor connection resets, pick a new monitor and resubmit
- * any pending requests.
- */
-static void mon_fault(struct ceph_connection *con)
-{
-	struct ceph_mon_client *monc = con->private;
-
-	if (!monc)
-		return;
-
-	dout("mon_fault\n");
-	mutex_lock(&monc->mutex);
-	if (!con->private)
-		goto out;
-
-	if (monc->con && !monc->hunting)
-		pr_info("mon%d %s session lost, "
-			"hunting for new mon\n", monc->cur_mon,
-			pr_addr(&monc->con->peer_addr.in_addr));
-
-	__close_session(monc);
-	if (!monc->hunting) {
-		/* start hunting */
-		monc->hunting = true;
-		__open_session(monc);
-	} else {
-		/* already hunting, let's wait a bit */
-		__schedule_delayed(monc);
-	}
-out:
-	mutex_unlock(&monc->mutex);
-}
-
-static const struct ceph_connection_operations mon_con_ops = {
-	.get = ceph_con_get,
-	.put = ceph_con_put,
-	.dispatch = dispatch,
-	.fault = mon_fault,
-	.alloc_msg = mon_alloc_msg,
-};
diff --git a/fs/ceph/mon_client.h b/fs/ceph/mon_client.h
deleted file mode 100644
index 174d794321d..00000000000
--- a/fs/ceph/mon_client.h
+++ /dev/null
@@ -1,116 +0,0 @@
-#ifndef _FS_CEPH_MON_CLIENT_H
-#define _FS_CEPH_MON_CLIENT_H
-
-#include <linux/completion.h>
-#include <linux/kref.h>
-#include <linux/rbtree.h>
-
-#include "messenger.h"
-
-struct ceph_client;
-struct ceph_mount_args;
-struct ceph_auth_client;
-
-/*
- * The monitor map enumerates the set of all monitors.
- */
-struct ceph_monmap {
-	struct ceph_fsid fsid;
-	u32 epoch;
-	u32 num_mon;
-	struct ceph_entity_inst mon_inst[0];
-};
-
-struct ceph_mon_client;
-struct ceph_mon_generic_request;
-
-
-/*
- * Generic mechanism for resending monitor requests.
- */
-typedef void (*ceph_monc_request_func_t)(struct ceph_mon_client *monc,
-					 int newmon);
-
-/* a pending monitor request */
-struct ceph_mon_request {
-	struct ceph_mon_client *monc;
-	struct delayed_work delayed_work;
-	unsigned long delay;
-	ceph_monc_request_func_t do_request;
-};
-
-/*
- * ceph_mon_generic_request is being used for the statfs and poolop requests
- * which are bening done a bit differently because we need to get data back
- * to the caller
- */
-struct ceph_mon_generic_request {
-	struct kref kref;
-	u64 tid;
-	struct rb_node node;
-	int result;
-	void *buf;
-	struct completion completion;
-	struct ceph_msg *request;  /* original request */
-	struct ceph_msg *reply;    /* and reply */
-};
-
-struct ceph_mon_client {
-	struct ceph_client *client;
-	struct ceph_monmap *monmap;
-
-	struct mutex mutex;
-	struct delayed_work delayed_work;
-
-	struct ceph_auth_client *auth;
-	struct ceph_msg *m_auth, *m_auth_reply, *m_subscribe, *m_subscribe_ack;
-	int pending_auth;
-
-	bool hunting;
-	int cur_mon;                       /* last monitor i contacted */
-	unsigned long sub_sent, sub_renew_after;
-	struct ceph_connection *con;
-	bool have_fsid;
-
-	/* pending generic requests */
-	struct rb_root generic_request_tree;
-	int num_generic_requests;
-	u64 last_tid;
-
-	/* mds/osd map */
-	int want_next_osdmap; /* 1 = want, 2 = want+asked */
-	u32 have_osdmap, have_mdsmap;
-
-#ifdef CONFIG_DEBUG_FS
-	struct dentry *debugfs_file;
-#endif
-};
-
-extern struct ceph_monmap *ceph_monmap_decode(void *p, void *end);
-extern int ceph_monmap_contains(struct ceph_monmap *m,
-				struct ceph_entity_addr *addr);
-
-extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl);
-extern void ceph_monc_stop(struct ceph_mon_client *monc);
-
-/*
- * The model here is to indicate that we need a new map of at least
- * epoch @want, and also call in when we receive a map.  We will
- * periodically rerequest the map from the monitor cluster until we
- * get what we want.
- */
-extern int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 have);
-extern int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 have);
-
-extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc);
-
-extern int ceph_monc_do_statfs(struct ceph_mon_client *monc,
-			       struct ceph_statfs *buf);
-
-extern int ceph_monc_open_session(struct ceph_mon_client *monc);
-
-extern int ceph_monc_validate_auth(struct ceph_mon_client *monc);
-
-
-
-#endif
diff --git a/fs/ceph/msgpool.c b/fs/ceph/msgpool.c
deleted file mode 100644
index dd65a643813..00000000000
--- a/fs/ceph/msgpool.c
+++ /dev/null
@@ -1,64 +0,0 @@
-#include "ceph_debug.h"
-
-#include <linux/err.h>
-#include <linux/sched.h>
-#include <linux/types.h>
-#include <linux/vmalloc.h>
-
-#include "msgpool.h"
-
-static void *alloc_fn(gfp_t gfp_mask, void *arg)
-{
-	struct ceph_msgpool *pool = arg;
-	void *p;
-
-	p = ceph_msg_new(0, pool->front_len, gfp_mask);
-	if (!p)
-		pr_err("msgpool %s alloc failed\n", pool->name);
-	return p;
-}
-
-static void free_fn(void *element, void *arg)
-{
-	ceph_msg_put(element);
-}
-
-int ceph_msgpool_init(struct ceph_msgpool *pool,
-		      int front_len, int size, bool blocking, const char *name)
-{
-	pool->front_len = front_len;
-	pool->pool = mempool_create(size, alloc_fn, free_fn, pool);
-	if (!pool->pool)
-		return -ENOMEM;
-	pool->name = name;
-	return 0;
-}
-
-void ceph_msgpool_destroy(struct ceph_msgpool *pool)
-{
-	mempool_destroy(pool->pool);
-}
-
-struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool,
-				  int front_len)
-{
-	if (front_len > pool->front_len) {
-		pr_err("msgpool_get pool %s need front %d, pool size is %d\n",
-		       pool->name, front_len, pool->front_len);
-		WARN_ON(1);
-
-		/* try to alloc a fresh message */
-		return ceph_msg_new(0, front_len, GFP_NOFS);
-	}
-
-	return mempool_alloc(pool->pool, GFP_NOFS);
-}
-
-void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg)
-{
-	/* reset msg front_len; user may have changed it */
-	msg->front.iov_len = pool->front_len;
-	msg->hdr.front_len = cpu_to_le32(pool->front_len);
-
-	kref_init(&msg->kref);  /* retake single ref */
-}
diff --git a/fs/ceph/msgpool.h b/fs/ceph/msgpool.h
deleted file mode 100644
index a362605f936..00000000000
--- a/fs/ceph/msgpool.h
+++ /dev/null
@@ -1,25 +0,0 @@
-#ifndef _FS_CEPH_MSGPOOL
-#define _FS_CEPH_MSGPOOL
-
-#include <linux/mempool.h>
-#include "messenger.h"
-
-/*
- * we use memory pools for preallocating messages we may receive, to
- * avoid unexpected OOM conditions.
- */
-struct ceph_msgpool {
-	const char *name;
-	mempool_t *pool;
-	int front_len;          /* preallocated payload size */
-};
-
-extern int ceph_msgpool_init(struct ceph_msgpool *pool,
-			     int front_len, int size, bool blocking,
-			     const char *name);
-extern void ceph_msgpool_destroy(struct ceph_msgpool *pool);
-extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *,
-					 int front_len);
-extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *);
-
-#endif
diff --git a/fs/ceph/msgr.h b/fs/ceph/msgr.h
deleted file mode 100644
index 892a0298dfd..00000000000
--- a/fs/ceph/msgr.h
+++ /dev/null
@@ -1,175 +0,0 @@
-#ifndef __MSGR_H
-#define __MSGR_H
-
-/*
- * Data types for message passing layer used by Ceph.
- */
-
-#define CEPH_MON_PORT    6789  /* default monitor port */
-
-/*
- * client-side processes will try to bind to ports in this
- * range, simply for the benefit of tools like nmap or wireshark
- * that would like to identify the protocol.
- */
-#define CEPH_PORT_FIRST  6789
-#define CEPH_PORT_START  6800  /* non-monitors start here */
-#define CEPH_PORT_LAST   6900
-
-/*
- * tcp connection banner.  include a protocol version. and adjust
- * whenever the wire protocol changes.  try to keep this string length
- * constant.
- */
-#define CEPH_BANNER "ceph v027"
-#define CEPH_BANNER_MAX_LEN 30
-
-
-/*
- * Rollover-safe type and comparator for 32-bit sequence numbers.
- * Comparator returns -1, 0, or 1.
- */
-typedef __u32 ceph_seq_t;
-
-static inline __s32 ceph_seq_cmp(__u32 a, __u32 b)
-{
-       return (__s32)a - (__s32)b;
-}
-
-
-/*
- * entity_name -- logical name for a process participating in the
- * network, e.g. 'mds0' or 'osd3'.
- */
-struct ceph_entity_name {
-	__u8 type;      /* CEPH_ENTITY_TYPE_* */
-	__le64 num;
-} __attribute__ ((packed));
-
-#define CEPH_ENTITY_TYPE_MON    0x01
-#define CEPH_ENTITY_TYPE_MDS    0x02
-#define CEPH_ENTITY_TYPE_OSD    0x04
-#define CEPH_ENTITY_TYPE_CLIENT 0x08
-#define CEPH_ENTITY_TYPE_AUTH   0x20
-
-#define CEPH_ENTITY_TYPE_ANY    0xFF
-
-extern const char *ceph_entity_type_name(int type);
-
-/*
- * entity_addr -- network address
- */
-struct ceph_entity_addr {
-	__le32 type;
-	__le32 nonce;  /* unique id for process (e.g. pid) */
-	struct sockaddr_storage in_addr;
-} __attribute__ ((packed));
-
-struct ceph_entity_inst {
-	struct ceph_entity_name name;
-	struct ceph_entity_addr addr;
-} __attribute__ ((packed));
-
-
-/* used by message exchange protocol */
-#define CEPH_MSGR_TAG_READY         1  /* server->client: ready for messages */
-#define CEPH_MSGR_TAG_RESETSESSION  2  /* server->client: reset, try again */
-#define CEPH_MSGR_TAG_WAIT          3  /* server->client: wait for racing
-					  incoming connection */
-#define CEPH_MSGR_TAG_RETRY_SESSION 4  /* server->client + cseq: try again
-					  with higher cseq */
-#define CEPH_MSGR_TAG_RETRY_GLOBAL  5  /* server->client + gseq: try again
-					  with higher gseq */
-#define CEPH_MSGR_TAG_CLOSE         6  /* closing pipe */
-#define CEPH_MSGR_TAG_MSG           7  /* message */
-#define CEPH_MSGR_TAG_ACK           8  /* message ack */
-#define CEPH_MSGR_TAG_KEEPALIVE     9  /* just a keepalive byte! */
-#define CEPH_MSGR_TAG_BADPROTOVER  10  /* bad protocol version */
-#define CEPH_MSGR_TAG_BADAUTHORIZER 11 /* bad authorizer */
-#define CEPH_MSGR_TAG_FEATURES      12 /* insufficient features */
-
-
-/*
- * connection negotiation
- */
-struct ceph_msg_connect {
-	__le64 features;     /* supported feature bits */
-	__le32 host_type;    /* CEPH_ENTITY_TYPE_* */
-	__le32 global_seq;   /* count connections initiated by this host */
-	__le32 connect_seq;  /* count connections initiated in this session */
-	__le32 protocol_version;
-	__le32 authorizer_protocol;
-	__le32 authorizer_len;
-	__u8  flags;         /* CEPH_MSG_CONNECT_* */
-} __attribute__ ((packed));
-
-struct ceph_msg_connect_reply {
-	__u8 tag;
-	__le64 features;     /* feature bits for this session */
-	__le32 global_seq;
-	__le32 connect_seq;
-	__le32 protocol_version;
-	__le32 authorizer_len;
-	__u8 flags;
-} __attribute__ ((packed));
-
-#define CEPH_MSG_CONNECT_LOSSY  1  /* messages i send may be safely dropped */
-
-
-/*
- * message header
- */
-struct ceph_msg_header_old {
-	__le64 seq;       /* message seq# for this session */
-	__le64 tid;       /* transaction id */
-	__le16 type;      /* message type */
-	__le16 priority;  /* priority.  higher value == higher priority */
-	__le16 version;   /* version of message encoding */
-
-	__le32 front_len; /* bytes in main payload */
-	__le32 middle_len;/* bytes in middle payload */
-	__le32 data_len;  /* bytes of data payload */
-	__le16 data_off;  /* sender: include full offset;
-			     receiver: mask against ~PAGE_MASK */
-
-	struct ceph_entity_inst src, orig_src;
-	__le32 reserved;
-	__le32 crc;       /* header crc32c */
-} __attribute__ ((packed));
-
-struct ceph_msg_header {
-	__le64 seq;       /* message seq# for this session */
-	__le64 tid;       /* transaction id */
-	__le16 type;      /* message type */
-	__le16 priority;  /* priority.  higher value == higher priority */
-	__le16 version;   /* version of message encoding */
-
-	__le32 front_len; /* bytes in main payload */
-	__le32 middle_len;/* bytes in middle payload */
-	__le32 data_len;  /* bytes of data payload */
-	__le16 data_off;  /* sender: include full offset;
-			     receiver: mask against ~PAGE_MASK */
-
-	struct ceph_entity_name src;
-	__le32 reserved;
-	__le32 crc;       /* header crc32c */
-} __attribute__ ((packed));
-
-#define CEPH_MSG_PRIO_LOW     64
-#define CEPH_MSG_PRIO_DEFAULT 127
-#define CEPH_MSG_PRIO_HIGH    196
-#define CEPH_MSG_PRIO_HIGHEST 255
-
-/*
- * follows data payload
- */
-struct ceph_msg_footer {
-	__le32 front_crc, middle_crc, data_crc;
-	__u8 flags;
-} __attribute__ ((packed));
-
-#define CEPH_MSG_FOOTER_COMPLETE  (1<<0)   /* msg wasn't aborted */
-#define CEPH_MSG_FOOTER_NOCRC     (1<<1)   /* no data crc */
-
-
-#endif
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
deleted file mode 100644
index e3852234789..00000000000
--- a/fs/ceph/osd_client.c
+++ /dev/null
@@ -1,1542 +0,0 @@
-#include "ceph_debug.h"
-
-#include <linux/err.h>
-#include <linux/highmem.h>
-#include <linux/mm.h>
-#include <linux/pagemap.h>
-#include <linux/slab.h>
-#include <linux/uaccess.h>
-
-#include "super.h"
-#include "osd_client.h"
-#include "messenger.h"
-#include "decode.h"
-#include "auth.h"
-
-#define OSD_OP_FRONT_LEN	4096
-#define OSD_OPREPLY_FRONT_LEN	512
-
-static const struct ceph_connection_operations osd_con_ops;
-static int __kick_requests(struct ceph_osd_client *osdc,
-			  struct ceph_osd *kickosd);
-
-static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd);
-
-/*
- * Implement client access to distributed object storage cluster.
- *
- * All data objects are stored within a cluster/cloud of OSDs, or
- * "object storage devices."  (Note that Ceph OSDs have _nothing_ to
- * do with the T10 OSD extensions to SCSI.)  Ceph OSDs are simply
- * remote daemons serving up and coordinating consistent and safe
- * access to storage.
- *
- * Cluster membership and the mapping of data objects onto storage devices
- * are described by the osd map.
- *
- * We keep track of pending OSD requests (read, write), resubmit
- * requests to different OSDs when the cluster topology/data layout
- * change, or retry the affected requests when the communications
- * channel with an OSD is reset.
- */
-
-/*
- * calculate the mapping of a file extent onto an object, and fill out the
- * request accordingly.  shorten extent as necessary if it crosses an
- * object boundary.
- *
- * fill osd op in request message.
- */
-static void calc_layout(struct ceph_osd_client *osdc,
-			struct ceph_vino vino, struct ceph_file_layout *layout,
-			u64 off, u64 *plen,
-			struct ceph_osd_request *req)
-{
-	struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
-	struct ceph_osd_op *op = (void *)(reqhead + 1);
-	u64 orig_len = *plen;
-	u64 objoff, objlen;    /* extent in object */
-	u64 bno;
-
-	reqhead->snapid = cpu_to_le64(vino.snap);
-
-	/* object extent? */
-	ceph_calc_file_object_mapping(layout, off, plen, &bno,
-				      &objoff, &objlen);
-	if (*plen < orig_len)
-		dout(" skipping last %llu, final file extent %llu~%llu\n",
-		     orig_len - *plen, off, *plen);
-
-	sprintf(req->r_oid, "%llx.%08llx", vino.ino, bno);
-	req->r_oid_len = strlen(req->r_oid);
-
-	op->extent.offset = cpu_to_le64(objoff);
-	op->extent.length = cpu_to_le64(objlen);
-	req->r_num_pages = calc_pages_for(off, *plen);
-
-	dout("calc_layout %s (%d) %llu~%llu (%d pages)\n",
-	     req->r_oid, req->r_oid_len, objoff, objlen, req->r_num_pages);
-}
-
-/*
- * requests
- */
-void ceph_osdc_release_request(struct kref *kref)
-{
-	struct ceph_osd_request *req = container_of(kref,
-						    struct ceph_osd_request,
-						    r_kref);
-
-	if (req->r_request)
-		ceph_msg_put(req->r_request);
-	if (req->r_reply)
-		ceph_msg_put(req->r_reply);
-	if (req->r_con_filling_msg) {
-		dout("release_request revoking pages %p from con %p\n",
-		     req->r_pages, req->r_con_filling_msg);
-		ceph_con_revoke_message(req->r_con_filling_msg,
-				      req->r_reply);
-		ceph_con_put(req->r_con_filling_msg);
-	}
-	if (req->r_own_pages)
-		ceph_release_page_vector(req->r_pages,
-					 req->r_num_pages);
-	ceph_put_snap_context(req->r_snapc);
-	if (req->r_mempool)
-		mempool_free(req, req->r_osdc->req_mempool);
-	else
-		kfree(req);
-}
-
-/*
- * build new request AND message, calculate layout, and adjust file
- * extent as needed.
- *
- * if the file was recently truncated, we include information about its
- * old and new size so that the object can be updated appropriately.  (we
- * avoid synchronously deleting truncated objects because it's slow.)
- *
- * if @do_sync, include a 'startsync' command so that the osd will flush
- * data quickly.
- */
-struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
-					       struct ceph_file_layout *layout,
-					       struct ceph_vino vino,
-					       u64 off, u64 *plen,
-					       int opcode, int flags,
-					       struct ceph_snap_context *snapc,
-					       int do_sync,
-					       u32 truncate_seq,
-					       u64 truncate_size,
-					       struct timespec *mtime,
-					       bool use_mempool, int num_reply)
-{
-	struct ceph_osd_request *req;
-	struct ceph_msg *msg;
-	struct ceph_osd_request_head *head;
-	struct ceph_osd_op *op;
-	void *p;
-	int num_op = 1 + do_sync;
-	size_t msg_size = sizeof(*head) + num_op*sizeof(*op);
-	int i;
-
-	if (use_mempool) {
-		req = mempool_alloc(osdc->req_mempool, GFP_NOFS);
-		memset(req, 0, sizeof(*req));
-	} else {
-		req = kzalloc(sizeof(*req), GFP_NOFS);
-	}
-	if (req == NULL)
-		return NULL;
-
-	req->r_osdc = osdc;
-	req->r_mempool = use_mempool;
-	kref_init(&req->r_kref);
-	init_completion(&req->r_completion);
-	init_completion(&req->r_safe_completion);
-	INIT_LIST_HEAD(&req->r_unsafe_item);
-	req->r_flags = flags;
-
-	WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0);
-
-	/* create reply message */
-	if (use_mempool)
-		msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
-	else
-		msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY,
-				   OSD_OPREPLY_FRONT_LEN, GFP_NOFS);
-	if (!msg) {
-		ceph_osdc_put_request(req);
-		return NULL;
-	}
-	req->r_reply = msg;
-
-	/* create request message; allow space for oid */
-	msg_size += 40;
-	if (snapc)
-		msg_size += sizeof(u64) * snapc->num_snaps;
-	if (use_mempool)
-		msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
-	else
-		msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, GFP_NOFS);
-	if (!msg) {
-		ceph_osdc_put_request(req);
-		return NULL;
-	}
-	msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP);
-	memset(msg->front.iov_base, 0, msg->front.iov_len);
-	head = msg->front.iov_base;
-	op = (void *)(head + 1);
-	p = (void *)(op + num_op);
-
-	req->r_request = msg;
-	req->r_snapc = ceph_get_snap_context(snapc);
-
-	head->client_inc = cpu_to_le32(1); /* always, for now. */
-	head->flags = cpu_to_le32(flags);
-	if (flags & CEPH_OSD_FLAG_WRITE)
-		ceph_encode_timespec(&head->mtime, mtime);
-	head->num_ops = cpu_to_le16(num_op);
-	op->op = cpu_to_le16(opcode);
-
-	/* calculate max write size */
-	calc_layout(osdc, vino, layout, off, plen, req);
-	req->r_file_layout = *layout;  /* keep a copy */
-
-	if (flags & CEPH_OSD_FLAG_WRITE) {
-		req->r_request->hdr.data_off = cpu_to_le16(off);
-		req->r_request->hdr.data_len = cpu_to_le32(*plen);
-		op->payload_len = cpu_to_le32(*plen);
-	}
-	op->extent.truncate_size = cpu_to_le64(truncate_size);
-	op->extent.truncate_seq = cpu_to_le32(truncate_seq);
-
-	/* fill in oid */
-	head->object_len = cpu_to_le32(req->r_oid_len);
-	memcpy(p, req->r_oid, req->r_oid_len);
-	p += req->r_oid_len;
-
-	if (do_sync) {
-		op++;
-		op->op = cpu_to_le16(CEPH_OSD_OP_STARTSYNC);
-	}
-	if (snapc) {
-		head->snap_seq = cpu_to_le64(snapc->seq);
-		head->num_snaps = cpu_to_le32(snapc->num_snaps);
-		for (i = 0; i < snapc->num_snaps; i++) {
-			put_unaligned_le64(snapc->snaps[i], p);
-			p += sizeof(u64);
-		}
-	}
-
-	BUG_ON(p > msg->front.iov_base + msg->front.iov_len);
-	msg_size = p - msg->front.iov_base;
-	msg->front.iov_len = msg_size;
-	msg->hdr.front_len = cpu_to_le32(msg_size);
-	return req;
-}
-
-/*
- * We keep osd requests in an rbtree, sorted by ->r_tid.
- */
-static void __insert_request(struct ceph_osd_client *osdc,
-			     struct ceph_osd_request *new)
-{
-	struct rb_node **p = &osdc->requests.rb_node;
-	struct rb_node *parent = NULL;
-	struct ceph_osd_request *req = NULL;
-
-	while (*p) {
-		parent = *p;
-		req = rb_entry(parent, struct ceph_osd_request, r_node);
-		if (new->r_tid < req->r_tid)
-			p = &(*p)->rb_left;
-		else if (new->r_tid > req->r_tid)
-			p = &(*p)->rb_right;
-		else
-			BUG();
-	}
-
-	rb_link_node(&new->r_node, parent, p);
-	rb_insert_color(&new->r_node, &osdc->requests);
-}
-
-static struct ceph_osd_request *__lookup_request(struct ceph_osd_client *osdc,
-						 u64 tid)
-{
-	struct ceph_osd_request *req;
-	struct rb_node *n = osdc->requests.rb_node;
-
-	while (n) {
-		req = rb_entry(n, struct ceph_osd_request, r_node);
-		if (tid < req->r_tid)
-			n = n->rb_left;
-		else if (tid > req->r_tid)
-			n = n->rb_right;
-		else
-			return req;
-	}
-	return NULL;
-}
-
-static struct ceph_osd_request *
-__lookup_request_ge(struct ceph_osd_client *osdc,
-		    u64 tid)
-{
-	struct ceph_osd_request *req;
-	struct rb_node *n = osdc->requests.rb_node;
-
-	while (n) {
-		req = rb_entry(n, struct ceph_osd_request, r_node);
-		if (tid < req->r_tid) {
-			if (!n->rb_left)
-				return req;
-			n = n->rb_left;
-		} else if (tid > req->r_tid) {
-			n = n->rb_right;
-		} else {
-			return req;
-		}
-	}
-	return NULL;
-}
-
-
-/*
- * If the osd connection drops, we need to resubmit all requests.
- */
-static void osd_reset(struct ceph_connection *con)
-{
-	struct ceph_osd *osd = con->private;
-	struct ceph_osd_client *osdc;
-
-	if (!osd)
-		return;
-	dout("osd_reset osd%d\n", osd->o_osd);
-	osdc = osd->o_osdc;
-	down_read(&osdc->map_sem);
-	kick_requests(osdc, osd);
-	up_read(&osdc->map_sem);
-}
-
-/*
- * Track open sessions with osds.
- */
-static struct ceph_osd *create_osd(struct ceph_osd_client *osdc)
-{
-	struct ceph_osd *osd;
-
-	osd = kzalloc(sizeof(*osd), GFP_NOFS);
-	if (!osd)
-		return NULL;
-
-	atomic_set(&osd->o_ref, 1);
-	osd->o_osdc = osdc;
-	INIT_LIST_HEAD(&osd->o_requests);
-	INIT_LIST_HEAD(&osd->o_osd_lru);
-	osd->o_incarnation = 1;
-
-	ceph_con_init(osdc->client->msgr, &osd->o_con);
-	osd->o_con.private = osd;
-	osd->o_con.ops = &osd_con_ops;
-	osd->o_con.peer_name.type = CEPH_ENTITY_TYPE_OSD;
-
-	INIT_LIST_HEAD(&osd->o_keepalive_item);
-	return osd;
-}
-
-static struct ceph_osd *get_osd(struct ceph_osd *osd)
-{
-	if (atomic_inc_not_zero(&osd->o_ref)) {
-		dout("get_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref)-1,
-		     atomic_read(&osd->o_ref));
-		return osd;
-	} else {
-		dout("get_osd %p FAIL\n", osd);
-		return NULL;
-	}
-}
-
-static void put_osd(struct ceph_osd *osd)
-{
-	dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
-	     atomic_read(&osd->o_ref) - 1);
-	if (atomic_dec_and_test(&osd->o_ref)) {
-		struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth;
-
-		if (osd->o_authorizer)
-			ac->ops->destroy_authorizer(ac, osd->o_authorizer);
-		kfree(osd);
-	}
-}
-
-/*
- * remove an osd from our map
- */
-static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
-{
-	dout("__remove_osd %p\n", osd);
-	BUG_ON(!list_empty(&osd->o_requests));
-	rb_erase(&osd->o_node, &osdc->osds);
-	list_del_init(&osd->o_osd_lru);
-	ceph_con_close(&osd->o_con);
-	put_osd(osd);
-}
-
-static void __move_osd_to_lru(struct ceph_osd_client *osdc,
-			      struct ceph_osd *osd)
-{
-	dout("__move_osd_to_lru %p\n", osd);
-	BUG_ON(!list_empty(&osd->o_osd_lru));
-	list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
-	osd->lru_ttl = jiffies + osdc->client->mount_args->osd_idle_ttl * HZ;
-}
-
-static void __remove_osd_from_lru(struct ceph_osd *osd)
-{
-	dout("__remove_osd_from_lru %p\n", osd);
-	if (!list_empty(&osd->o_osd_lru))
-		list_del_init(&osd->o_osd_lru);
-}
-
-static void remove_old_osds(struct ceph_osd_client *osdc, int remove_all)
-{
-	struct ceph_osd *osd, *nosd;
-
-	dout("__remove_old_osds %p\n", osdc);
-	mutex_lock(&osdc->request_mutex);
-	list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) {
-		if (!remove_all && time_before(jiffies, osd->lru_ttl))
-			break;
-		__remove_osd(osdc, osd);
-	}
-	mutex_unlock(&osdc->request_mutex);
-}
-
-/*
- * reset osd connect
- */
-static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
-{
-	struct ceph_osd_request *req;
-	int ret = 0;
-
-	dout("__reset_osd %p osd%d\n", osd, osd->o_osd);
-	if (list_empty(&osd->o_requests)) {
-		__remove_osd(osdc, osd);
-	} else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd],
-			  &osd->o_con.peer_addr,
-			  sizeof(osd->o_con.peer_addr)) == 0 &&
-		   !ceph_con_opened(&osd->o_con)) {
-		dout(" osd addr hasn't changed and connection never opened,"
-		     " letting msgr retry");
-		/* touch each r_stamp for handle_timeout()'s benfit */
-		list_for_each_entry(req, &osd->o_requests, r_osd_item)
-			req->r_stamp = jiffies;
-		ret = -EAGAIN;
-	} else {
-		ceph_con_close(&osd->o_con);
-		ceph_con_open(&osd->o_con, &osdc->osdmap->osd_addr[osd->o_osd]);
-		osd->o_incarnation++;
-	}
-	return ret;
-}
-
-static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new)
-{
-	struct rb_node **p = &osdc->osds.rb_node;
-	struct rb_node *parent = NULL;
-	struct ceph_osd *osd = NULL;
-
-	while (*p) {
-		parent = *p;
-		osd = rb_entry(parent, struct ceph_osd, o_node);
-		if (new->o_osd < osd->o_osd)
-			p = &(*p)->rb_left;
-		else if (new->o_osd > osd->o_osd)
-			p = &(*p)->rb_right;
-		else
-			BUG();
-	}
-
-	rb_link_node(&new->o_node, parent, p);
-	rb_insert_color(&new->o_node, &osdc->osds);
-}
-
-static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o)
-{
-	struct ceph_osd *osd;
-	struct rb_node *n = osdc->osds.rb_node;
-
-	while (n) {
-		osd = rb_entry(n, struct ceph_osd, o_node);
-		if (o < osd->o_osd)
-			n = n->rb_left;
-		else if (o > osd->o_osd)
-			n = n->rb_right;
-		else
-			return osd;
-	}
-	return NULL;
-}
-
-static void __schedule_osd_timeout(struct ceph_osd_client *osdc)
-{
-	schedule_delayed_work(&osdc->timeout_work,
-			osdc->client->mount_args->osd_keepalive_timeout * HZ);
-}
-
-static void __cancel_osd_timeout(struct ceph_osd_client *osdc)
-{
-	cancel_delayed_work(&osdc->timeout_work);
-}
-
-/*
- * Register request, assign tid.  If this is the first request, set up
- * the timeout event.
- */
-static void register_request(struct ceph_osd_client *osdc,
-			     struct ceph_osd_request *req)
-{
-	mutex_lock(&osdc->request_mutex);
-	req->r_tid = ++osdc->last_tid;
-	req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
-	INIT_LIST_HEAD(&req->r_req_lru_item);
-
-	dout("register_request %p tid %lld\n", req, req->r_tid);
-	__insert_request(osdc, req);
-	ceph_osdc_get_request(req);
-	osdc->num_requests++;
-
-	if (osdc->num_requests == 1) {
-		dout(" first request, scheduling timeout\n");
-		__schedule_osd_timeout(osdc);
-	}
-	mutex_unlock(&osdc->request_mutex);
-}
-
-/*
- * called under osdc->request_mutex
- */
-static void __unregister_request(struct ceph_osd_client *osdc,
-				 struct ceph_osd_request *req)
-{
-	dout("__unregister_request %p tid %lld\n", req, req->r_tid);
-	rb_erase(&req->r_node, &osdc->requests);
-	osdc->num_requests--;
-
-	if (req->r_osd) {
-		/* make sure the original request isn't in flight. */
-		ceph_con_revoke(&req->r_osd->o_con, req->r_request);
-
-		list_del_init(&req->r_osd_item);
-		if (list_empty(&req->r_osd->o_requests))
-			__move_osd_to_lru(osdc, req->r_osd);
-		req->r_osd = NULL;
-	}
-
-	ceph_osdc_put_request(req);
-
-	list_del_init(&req->r_req_lru_item);
-	if (osdc->num_requests == 0) {
-		dout(" no requests, canceling timeout\n");
-		__cancel_osd_timeout(osdc);
-	}
-}
-
-/*
- * Cancel a previously queued request message
- */
-static void __cancel_request(struct ceph_osd_request *req)
-{
-	if (req->r_sent) {
-		ceph_con_revoke(&req->r_osd->o_con, req->r_request);
-		req->r_sent = 0;
-	}
-	list_del_init(&req->r_req_lru_item);
-}
-
-/*
- * Pick an osd (the first 'up' osd in the pg), allocate the osd struct
- * (as needed), and set the request r_osd appropriately.  If there is
- * no up osd, set r_osd to NULL.
- *
- * Return 0 if unchanged, 1 if changed, or negative on error.
- *
- * Caller should hold map_sem for read and request_mutex.
- */
-static int __map_osds(struct ceph_osd_client *osdc,
-		      struct ceph_osd_request *req)
-{
-	struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
-	struct ceph_pg pgid;
-	int acting[CEPH_PG_MAX_SIZE];
-	int o = -1, num = 0;
-	int err;
-
-	dout("map_osds %p tid %lld\n", req, req->r_tid);
-	err = ceph_calc_object_layout(&reqhead->layout, req->r_oid,
-				      &req->r_file_layout, osdc->osdmap);
-	if (err)
-		return err;
-	pgid = reqhead->layout.ol_pgid;
-	req->r_pgid = pgid;
-
-	err = ceph_calc_pg_acting(osdc->osdmap, pgid, acting);
-	if (err > 0) {
-		o = acting[0];
-		num = err;
-	}
-
-	if ((req->r_osd && req->r_osd->o_osd == o &&
-	     req->r_sent >= req->r_osd->o_incarnation &&
-	     req->r_num_pg_osds == num &&
-	     memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) ||
-	    (req->r_osd == NULL && o == -1))
-		return 0;  /* no change */
-
-	dout("map_osds tid %llu pgid %d.%x osd%d (was osd%d)\n",
-	     req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o,
-	     req->r_osd ? req->r_osd->o_osd : -1);
-
-	/* record full pg acting set */
-	memcpy(req->r_pg_osds, acting, sizeof(acting[0]) * num);
-	req->r_num_pg_osds = num;
-
-	if (req->r_osd) {
-		__cancel_request(req);
-		list_del_init(&req->r_osd_item);
-		req->r_osd = NULL;
-	}
-
-	req->r_osd = __lookup_osd(osdc, o);
-	if (!req->r_osd && o >= 0) {
-		err = -ENOMEM;
-		req->r_osd = create_osd(osdc);
-		if (!req->r_osd)
-			goto out;
-
-		dout("map_osds osd %p is osd%d\n", req->r_osd, o);
-		req->r_osd->o_osd = o;
-		req->r_osd->o_con.peer_name.num = cpu_to_le64(o);
-		__insert_osd(osdc, req->r_osd);
-
-		ceph_con_open(&req->r_osd->o_con, &osdc->osdmap->osd_addr[o]);
-	}
-
-	if (req->r_osd) {
-		__remove_osd_from_lru(req->r_osd);
-		list_add(&req->r_osd_item, &req->r_osd->o_requests);
-	}
-	err = 1;   /* osd or pg changed */
-
-out:
-	return err;
-}
-
-/*
- * caller should hold map_sem (for read) and request_mutex
- */
-static int __send_request(struct ceph_osd_client *osdc,
-			  struct ceph_osd_request *req)
-{
-	struct ceph_osd_request_head *reqhead;
-	int err;
-
-	err = __map_osds(osdc, req);
-	if (err < 0)
-		return err;
-	if (req->r_osd == NULL) {
-		dout("send_request %p no up osds in pg\n", req);
-		ceph_monc_request_next_osdmap(&osdc->client->monc);
-		return 0;
-	}
-
-	dout("send_request %p tid %llu to osd%d flags %d\n",
-	     req, req->r_tid, req->r_osd->o_osd, req->r_flags);
-
-	reqhead = req->r_request->front.iov_base;
-	reqhead->osdmap_epoch = cpu_to_le32(osdc->osdmap->epoch);
-	reqhead->flags |= cpu_to_le32(req->r_flags);  /* e.g., RETRY */
-	reqhead->reassert_version = req->r_reassert_version;
-
-	req->r_stamp = jiffies;
-	list_move_tail(&osdc->req_lru, &req->r_req_lru_item);
-
-	ceph_msg_get(req->r_request); /* send consumes a ref */
-	ceph_con_send(&req->r_osd->o_con, req->r_request);
-	req->r_sent = req->r_osd->o_incarnation;
-	return 0;
-}
-
-/*
- * Timeout callback, called every N seconds when 1 or more osd
- * requests has been active for more than N seconds.  When this
- * happens, we ping all OSDs with requests who have timed out to
- * ensure any communications channel reset is detected.  Reset the
- * request timeouts another N seconds in the future as we go.
- * Reschedule the timeout event another N seconds in future (unless
- * there are no open requests).
- */
-static void handle_timeout(struct work_struct *work)
-{
-	struct ceph_osd_client *osdc =
-		container_of(work, struct ceph_osd_client, timeout_work.work);
-	struct ceph_osd_request *req, *last_req = NULL;
-	struct ceph_osd *osd;
-	unsigned long timeout = osdc->client->mount_args->osd_timeout * HZ;
-	unsigned long keepalive =
-		osdc->client->mount_args->osd_keepalive_timeout * HZ;
-	unsigned long last_stamp = 0;
-	struct rb_node *p;
-	struct list_head slow_osds;
-
-	dout("timeout\n");
-	down_read(&osdc->map_sem);
-
-	ceph_monc_request_next_osdmap(&osdc->client->monc);
-
-	mutex_lock(&osdc->request_mutex);
-	for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
-		req = rb_entry(p, struct ceph_osd_request, r_node);
-
-		if (req->r_resend) {
-			int err;
-
-			dout("osdc resending prev failed %lld\n", req->r_tid);
-			err = __send_request(osdc, req);
-			if (err)
-				dout("osdc failed again on %lld\n", req->r_tid);
-			else
-				req->r_resend = false;
-			continue;
-		}
-	}
-
-	/*
-	 * reset osds that appear to be _really_ unresponsive.  this
-	 * is a failsafe measure.. we really shouldn't be getting to
-	 * this point if the system is working properly.  the monitors
-	 * should mark the osd as failed and we should find out about
-	 * it from an updated osd map.
-	 */
-	while (timeout && !list_empty(&osdc->req_lru)) {
-		req = list_entry(osdc->req_lru.next, struct ceph_osd_request,
-				 r_req_lru_item);
-
-		if (time_before(jiffies, req->r_stamp + timeout))
-			break;
-
-		BUG_ON(req == last_req && req->r_stamp == last_stamp);
-		last_req = req;
-		last_stamp = req->r_stamp;
-
-		osd = req->r_osd;
-		BUG_ON(!osd);
-		pr_warning(" tid %llu timed out on osd%d, will reset osd\n",
-			   req->r_tid, osd->o_osd);
-		__kick_requests(osdc, osd);
-	}
-
-	/*
-	 * ping osds that are a bit slow.  this ensures that if there
-	 * is a break in the TCP connection we will notice, and reopen
-	 * a connection with that osd (from the fault callback).
-	 */
-	INIT_LIST_HEAD(&slow_osds);
-	list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) {
-		if (time_before(jiffies, req->r_stamp + keepalive))
-			break;
-
-		osd = req->r_osd;
-		BUG_ON(!osd);
-		dout(" tid %llu is slow, will send keepalive on osd%d\n",
-		     req->r_tid, osd->o_osd);
-		list_move_tail(&osd->o_keepalive_item, &slow_osds);
-	}
-	while (!list_empty(&slow_osds)) {
-		osd = list_entry(slow_osds.next, struct ceph_osd,
-				 o_keepalive_item);
-		list_del_init(&osd->o_keepalive_item);
-		ceph_con_keepalive(&osd->o_con);
-	}
-
-	__schedule_osd_timeout(osdc);
-	mutex_unlock(&osdc->request_mutex);
-
-	up_read(&osdc->map_sem);
-}
-
-static void handle_osds_timeout(struct work_struct *work)
-{
-	struct ceph_osd_client *osdc =
-		container_of(work, struct ceph_osd_client,
-			     osds_timeout_work.work);
-	unsigned long delay =
-		osdc->client->mount_args->osd_idle_ttl * HZ >> 2;
-
-	dout("osds timeout\n");
-	down_read(&osdc->map_sem);
-	remove_old_osds(osdc, 0);
-	up_read(&osdc->map_sem);
-
-	schedule_delayed_work(&osdc->osds_timeout_work,
-			      round_jiffies_relative(delay));
-}
-
-/*
- * handle osd op reply.  either call the callback if it is specified,
- * or do the completion to wake up the waiting thread.
- */
-static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
-			 struct ceph_connection *con)
-{
-	struct ceph_osd_reply_head *rhead = msg->front.iov_base;
-	struct ceph_osd_request *req;
-	u64 tid;
-	int numops, object_len, flags;
-	s32 result;
-
-	tid = le64_to_cpu(msg->hdr.tid);
-	if (msg->front.iov_len < sizeof(*rhead))
-		goto bad;
-	numops = le32_to_cpu(rhead->num_ops);
-	object_len = le32_to_cpu(rhead->object_len);
-	result = le32_to_cpu(rhead->result);
-	if (msg->front.iov_len != sizeof(*rhead) + object_len +
-	    numops * sizeof(struct ceph_osd_op))
-		goto bad;
-	dout("handle_reply %p tid %llu result %d\n", msg, tid, (int)result);
-
-	/* lookup */
-	mutex_lock(&osdc->request_mutex);
-	req = __lookup_request(osdc, tid);
-	if (req == NULL) {
-		dout("handle_reply tid %llu dne\n", tid);
-		mutex_unlock(&osdc->request_mutex);
-		return;
-	}
-	ceph_osdc_get_request(req);
-	flags = le32_to_cpu(rhead->flags);
-
-	/*
-	 * if this connection filled our message, drop our reference now, to
-	 * avoid a (safe but slower) revoke later.
-	 */
-	if (req->r_con_filling_msg == con && req->r_reply == msg) {
-		dout(" dropping con_filling_msg ref %p\n", con);
-		req->r_con_filling_msg = NULL;
-		ceph_con_put(con);
-	}
-
-	if (!req->r_got_reply) {
-		unsigned bytes;
-
-		req->r_result = le32_to_cpu(rhead->result);
-		bytes = le32_to_cpu(msg->hdr.data_len);
-		dout("handle_reply result %d bytes %d\n", req->r_result,
-		     bytes);
-		if (req->r_result == 0)
-			req->r_result = bytes;
-
-		/* in case this is a write and we need to replay, */
-		req->r_reassert_version = rhead->reassert_version;
-
-		req->r_got_reply = 1;
-	} else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) {
-		dout("handle_reply tid %llu dup ack\n", tid);
-		mutex_unlock(&osdc->request_mutex);
-		goto done;
-	}
-
-	dout("handle_reply tid %llu flags %d\n", tid, flags);
-
-	/* either this is a read, or we got the safe response */
-	if (result < 0 ||
-	    (flags & CEPH_OSD_FLAG_ONDISK) ||
-	    ((flags & CEPH_OSD_FLAG_WRITE) == 0))
-		__unregister_request(osdc, req);
-
-	mutex_unlock(&osdc->request_mutex);
-
-	if (req->r_callback)
-		req->r_callback(req, msg);
-	else
-		complete_all(&req->r_completion);
-
-	if (flags & CEPH_OSD_FLAG_ONDISK) {
-		if (req->r_safe_callback)
-			req->r_safe_callback(req, msg);
-		complete_all(&req->r_safe_completion);  /* fsync waiter */
-	}
-
-done:
-	ceph_osdc_put_request(req);
-	return;
-
-bad:
-	pr_err("corrupt osd_op_reply got %d %d expected %d\n",
-	       (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len),
-	       (int)sizeof(*rhead));
-	ceph_msg_dump(msg);
-}
-
-
-static int __kick_requests(struct ceph_osd_client *osdc,
-			  struct ceph_osd *kickosd)
-{
-	struct ceph_osd_request *req;
-	struct rb_node *p, *n;
-	int needmap = 0;
-	int err;
-
-	dout("kick_requests osd%d\n", kickosd ? kickosd->o_osd : -1);
-	if (kickosd) {
-		err = __reset_osd(osdc, kickosd);
-		if (err == -EAGAIN)
-			return 1;
-	} else {
-		for (p = rb_first(&osdc->osds); p; p = n) {
-			struct ceph_osd *osd =
-				rb_entry(p, struct ceph_osd, o_node);
-
-			n = rb_next(p);
-			if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
-			    memcmp(&osd->o_con.peer_addr,
-				   ceph_osd_addr(osdc->osdmap,
-						 osd->o_osd),
-				   sizeof(struct ceph_entity_addr)) != 0)
-				__reset_osd(osdc, osd);
-		}
-	}
-
-	for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
-		req = rb_entry(p, struct ceph_osd_request, r_node);
-
-		if (req->r_resend) {
-			dout(" r_resend set on tid %llu\n", req->r_tid);
-			__cancel_request(req);
-			goto kick;
-		}
-		if (req->r_osd && kickosd == req->r_osd) {
-			__cancel_request(req);
-			goto kick;
-		}
-
-		err = __map_osds(osdc, req);
-		if (err == 0)
-			continue;  /* no change */
-		if (err < 0) {
-			/*
-			 * FIXME: really, we should set the request
-			 * error and fail if this isn't a 'nofail'
-			 * request, but that's a fair bit more
-			 * complicated to do.  So retry!
-			 */
-			dout(" setting r_resend on %llu\n", req->r_tid);
-			req->r_resend = true;
-			continue;
-		}
-		if (req->r_osd == NULL) {
-			dout("tid %llu maps to no valid osd\n", req->r_tid);
-			needmap++;  /* request a newer map */
-			continue;
-		}
-
-kick:
-		dout("kicking %p tid %llu osd%d\n", req, req->r_tid,
-		     req->r_osd ? req->r_osd->o_osd : -1);
-		req->r_flags |= CEPH_OSD_FLAG_RETRY;
-		err = __send_request(osdc, req);
-		if (err) {
-			dout(" setting r_resend on %llu\n", req->r_tid);
-			req->r_resend = true;
-		}
-	}
-
-	return needmap;
-}
-
-/*
- * Resubmit osd requests whose osd or osd address has changed.  Request
- * a new osd map if osds are down, or we are otherwise unable to determine
- * how to direct a request.
- *
- * Close connections to down osds.
- *
- * If @who is specified, resubmit requests for that specific osd.
- *
- * Caller should hold map_sem for read and request_mutex.
- */
-static void kick_requests(struct ceph_osd_client *osdc,
-			  struct ceph_osd *kickosd)
-{
-	int needmap;
-
-	mutex_lock(&osdc->request_mutex);
-	needmap = __kick_requests(osdc, kickosd);
-	mutex_unlock(&osdc->request_mutex);
-
-	if (needmap) {
-		dout("%d requests for down osds, need new map\n", needmap);
-		ceph_monc_request_next_osdmap(&osdc->client->monc);
-	}
-
-}
-/*
- * Process updated osd map.
- *
- * The message contains any number of incremental and full maps, normally
- * indicating some sort of topology change in the cluster.  Kick requests
- * off to different OSDs as needed.
- */
-void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
-{
-	void *p, *end, *next;
-	u32 nr_maps, maplen;
-	u32 epoch;
-	struct ceph_osdmap *newmap = NULL, *oldmap;
-	int err;
-	struct ceph_fsid fsid;
-
-	dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0);
-	p = msg->front.iov_base;
-	end = p + msg->front.iov_len;
-
-	/* verify fsid */
-	ceph_decode_need(&p, end, sizeof(fsid), bad);
-	ceph_decode_copy(&p, &fsid, sizeof(fsid));
-	if (ceph_check_fsid(osdc->client, &fsid) < 0)
-		return;
-
-	down_write(&osdc->map_sem);
-
-	/* incremental maps */
-	ceph_decode_32_safe(&p, end, nr_maps, bad);
-	dout(" %d inc maps\n", nr_maps);
-	while (nr_maps > 0) {
-		ceph_decode_need(&p, end, 2*sizeof(u32), bad);
-		epoch = ceph_decode_32(&p);
-		maplen = ceph_decode_32(&p);
-		ceph_decode_need(&p, end, maplen, bad);
-		next = p + maplen;
-		if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) {
-			dout("applying incremental map %u len %d\n",
-			     epoch, maplen);
-			newmap = osdmap_apply_incremental(&p, next,
-							  osdc->osdmap,
-							  osdc->client->msgr);
-			if (IS_ERR(newmap)) {
-				err = PTR_ERR(newmap);
-				goto bad;
-			}
-			BUG_ON(!newmap);
-			if (newmap != osdc->osdmap) {
-				ceph_osdmap_destroy(osdc->osdmap);
-				osdc->osdmap = newmap;
-			}
-		} else {
-			dout("ignoring incremental map %u len %d\n",
-			     epoch, maplen);
-		}
-		p = next;
-		nr_maps--;
-	}
-	if (newmap)
-		goto done;
-
-	/* full maps */
-	ceph_decode_32_safe(&p, end, nr_maps, bad);
-	dout(" %d full maps\n", nr_maps);
-	while (nr_maps) {
-		ceph_decode_need(&p, end, 2*sizeof(u32), bad);
-		epoch = ceph_decode_32(&p);
-		maplen = ceph_decode_32(&p);
-		ceph_decode_need(&p, end, maplen, bad);
-		if (nr_maps > 1) {
-			dout("skipping non-latest full map %u len %d\n",
-			     epoch, maplen);
-		} else if (osdc->osdmap && osdc->osdmap->epoch >= epoch) {
-			dout("skipping full map %u len %d, "
-			     "older than our %u\n", epoch, maplen,
-			     osdc->osdmap->epoch);
-		} else {
-			dout("taking full map %u len %d\n", epoch, maplen);
-			newmap = osdmap_decode(&p, p+maplen);
-			if (IS_ERR(newmap)) {
-				err = PTR_ERR(newmap);
-				goto bad;
-			}
-			BUG_ON(!newmap);
-			oldmap = osdc->osdmap;
-			osdc->osdmap = newmap;
-			if (oldmap)
-				ceph_osdmap_destroy(oldmap);
-		}
-		p += maplen;
-		nr_maps--;
-	}
-
-done:
-	downgrade_write(&osdc->map_sem);
-	ceph_monc_got_osdmap(&osdc->client->monc, osdc->osdmap->epoch);
-	if (newmap)
-		kick_requests(osdc, NULL);
-	up_read(&osdc->map_sem);
-	wake_up_all(&osdc->client->auth_wq);
-	return;
-
-bad:
-	pr_err("osdc handle_map corrupt msg\n");
-	ceph_msg_dump(msg);
-	up_write(&osdc->map_sem);
-	return;
-}
-
-/*
- * Register request, send initial attempt.
- */
-int ceph_osdc_start_request(struct ceph_osd_client *osdc,
-			    struct ceph_osd_request *req,
-			    bool nofail)
-{
-	int rc = 0;
-
-	req->r_request->pages = req->r_pages;
-	req->r_request->nr_pages = req->r_num_pages;
-
-	register_request(osdc, req);
-
-	down_read(&osdc->map_sem);
-	mutex_lock(&osdc->request_mutex);
-	/*
-	 * a racing kick_requests() may have sent the message for us
-	 * while we dropped request_mutex above, so only send now if
-	 * the request still han't been touched yet.
-	 */
-	if (req->r_sent == 0) {
-		rc = __send_request(osdc, req);
-		if (rc) {
-			if (nofail) {
-				dout("osdc_start_request failed send, "
-				     " marking %lld\n", req->r_tid);
-				req->r_resend = true;
-				rc = 0;
-			} else {
-				__unregister_request(osdc, req);
-			}
-		}
-	}
-	mutex_unlock(&osdc->request_mutex);
-	up_read(&osdc->map_sem);
-	return rc;
-}
-
-/*
- * wait for a request to complete
- */
-int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
-			   struct ceph_osd_request *req)
-{
-	int rc;
-
-	rc = wait_for_completion_interruptible(&req->r_completion);
-	if (rc < 0) {
-		mutex_lock(&osdc->request_mutex);
-		__cancel_request(req);
-		__unregister_request(osdc, req);
-		mutex_unlock(&osdc->request_mutex);
-		dout("wait_request tid %llu canceled/timed out\n", req->r_tid);
-		return rc;
-	}
-
-	dout("wait_request tid %llu result %d\n", req->r_tid, req->r_result);
-	return req->r_result;
-}
-
-/*
- * sync - wait for all in-flight requests to flush.  avoid starvation.
- */
-void ceph_osdc_sync(struct ceph_osd_client *osdc)
-{
-	struct ceph_osd_request *req;
-	u64 last_tid, next_tid = 0;
-
-	mutex_lock(&osdc->request_mutex);
-	last_tid = osdc->last_tid;
-	while (1) {
-		req = __lookup_request_ge(osdc, next_tid);
-		if (!req)
-			break;
-		if (req->r_tid > last_tid)
-			break;
-
-		next_tid = req->r_tid + 1;
-		if ((req->r_flags & CEPH_OSD_FLAG_WRITE) == 0)
-			continue;
-
-		ceph_osdc_get_request(req);
-		mutex_unlock(&osdc->request_mutex);
-		dout("sync waiting on tid %llu (last is %llu)\n",
-		     req->r_tid, last_tid);
-		wait_for_completion(&req->r_safe_completion);
-		mutex_lock(&osdc->request_mutex);
-		ceph_osdc_put_request(req);
-	}
-	mutex_unlock(&osdc->request_mutex);
-	dout("sync done (thru tid %llu)\n", last_tid);
-}
-
-/*
- * init, shutdown
- */
-int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
-{
-	int err;
-
-	dout("init\n");
-	osdc->client = client;
-	osdc->osdmap = NULL;
-	init_rwsem(&osdc->map_sem);
-	init_completion(&osdc->map_waiters);
-	osdc->last_requested_map = 0;
-	mutex_init(&osdc->request_mutex);
-	osdc->last_tid = 0;
-	osdc->osds = RB_ROOT;
-	INIT_LIST_HEAD(&osdc->osd_lru);
-	osdc->requests = RB_ROOT;
-	INIT_LIST_HEAD(&osdc->req_lru);
-	osdc->num_requests = 0;
-	INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
-	INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
-
-	schedule_delayed_work(&osdc->osds_timeout_work,
-	   round_jiffies_relative(osdc->client->mount_args->osd_idle_ttl * HZ));
-
-	err = -ENOMEM;
-	osdc->req_mempool = mempool_create_kmalloc_pool(10,
-					sizeof(struct ceph_osd_request));
-	if (!osdc->req_mempool)
-		goto out;
-
-	err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true,
-				"osd_op");
-	if (err < 0)
-		goto out_mempool;
-	err = ceph_msgpool_init(&osdc->msgpool_op_reply,
-				OSD_OPREPLY_FRONT_LEN, 10, true,
-				"osd_op_reply");
-	if (err < 0)
-		goto out_msgpool;
-	return 0;
-
-out_msgpool:
-	ceph_msgpool_destroy(&osdc->msgpool_op);
-out_mempool:
-	mempool_destroy(osdc->req_mempool);
-out:
-	return err;
-}
-
-void ceph_osdc_stop(struct ceph_osd_client *osdc)
-{
-	cancel_delayed_work_sync(&osdc->timeout_work);
-	cancel_delayed_work_sync(&osdc->osds_timeout_work);
-	if (osdc->osdmap) {
-		ceph_osdmap_destroy(osdc->osdmap);
-		osdc->osdmap = NULL;
-	}
-	remove_old_osds(osdc, 1);
-	mempool_destroy(osdc->req_mempool);
-	ceph_msgpool_destroy(&osdc->msgpool_op);
-	ceph_msgpool_destroy(&osdc->msgpool_op_reply);
-}
-
-/*
- * Read some contiguous pages.  If we cross a stripe boundary, shorten
- * *plen.  Return number of bytes read, or error.
- */
-int ceph_osdc_readpages(struct ceph_osd_client *osdc,
-			struct ceph_vino vino, struct ceph_file_layout *layout,
-			u64 off, u64 *plen,
-			u32 truncate_seq, u64 truncate_size,
-			struct page **pages, int num_pages)
-{
-	struct ceph_osd_request *req;
-	int rc = 0;
-
-	dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
-	     vino.snap, off, *plen);
-	req = ceph_osdc_new_request(osdc, layout, vino, off, plen,
-				    CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
-				    NULL, 0, truncate_seq, truncate_size, NULL,
-				    false, 1);
-	if (!req)
-		return -ENOMEM;
-
-	/* it may be a short read due to an object boundary */
-	req->r_pages = pages;
-	num_pages = calc_pages_for(off, *plen);
-	req->r_num_pages = num_pages;
-
-	dout("readpages  final extent is %llu~%llu (%d pages)\n",
-	     off, *plen, req->r_num_pages);
-
-	rc = ceph_osdc_start_request(osdc, req, false);
-	if (!rc)
-		rc = ceph_osdc_wait_request(osdc, req);
-
-	ceph_osdc_put_request(req);
-	dout("readpages result %d\n", rc);
-	return rc;
-}
-
-/*
- * do a synchronous write on N pages
- */
-int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
-			 struct ceph_file_layout *layout,
-			 struct ceph_snap_context *snapc,
-			 u64 off, u64 len,
-			 u32 truncate_seq, u64 truncate_size,
-			 struct timespec *mtime,
-			 struct page **pages, int num_pages,
-			 int flags, int do_sync, bool nofail)
-{
-	struct ceph_osd_request *req;
-	int rc = 0;
-
-	BUG_ON(vino.snap != CEPH_NOSNAP);
-	req = ceph_osdc_new_request(osdc, layout, vino, off, &len,
-				    CEPH_OSD_OP_WRITE,
-				    flags | CEPH_OSD_FLAG_ONDISK |
-					    CEPH_OSD_FLAG_WRITE,
-				    snapc, do_sync,
-				    truncate_seq, truncate_size, mtime,
-				    nofail, 1);
-	if (!req)
-		return -ENOMEM;
-
-	/* it may be a short write due to an object boundary */
-	req->r_pages = pages;
-	req->r_num_pages = calc_pages_for(off, len);
-	dout("writepages %llu~%llu (%d pages)\n", off, len,
-	     req->r_num_pages);
-
-	rc = ceph_osdc_start_request(osdc, req, nofail);
-	if (!rc)
-		rc = ceph_osdc_wait_request(osdc, req);
-
-	ceph_osdc_put_request(req);
-	if (rc == 0)
-		rc = len;
-	dout("writepages result %d\n", rc);
-	return rc;
-}
-
-/*
- * handle incoming message
- */
-static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
-{
-	struct ceph_osd *osd = con->private;
-	struct ceph_osd_client *osdc;
-	int type = le16_to_cpu(msg->hdr.type);
-
-	if (!osd)
-		goto out;
-	osdc = osd->o_osdc;
-
-	switch (type) {
-	case CEPH_MSG_OSD_MAP:
-		ceph_osdc_handle_map(osdc, msg);
-		break;
-	case CEPH_MSG_OSD_OPREPLY:
-		handle_reply(osdc, msg, con);
-		break;
-
-	default:
-		pr_err("received unknown message type %d %s\n", type,
-		       ceph_msg_type_name(type));
-	}
-out:
-	ceph_msg_put(msg);
-}
-
-/*
- * lookup and return message for incoming reply.  set up reply message
- * pages.
- */
-static struct ceph_msg *get_reply(struct ceph_connection *con,
-				  struct ceph_msg_header *hdr,
-				  int *skip)
-{
-	struct ceph_osd *osd = con->private;
-	struct ceph_osd_client *osdc = osd->o_osdc;
-	struct ceph_msg *m;
-	struct ceph_osd_request *req;
-	int front = le32_to_cpu(hdr->front_len);
-	int data_len = le32_to_cpu(hdr->data_len);
-	u64 tid;
-
-	tid = le64_to_cpu(hdr->tid);
-	mutex_lock(&osdc->request_mutex);
-	req = __lookup_request(osdc, tid);
-	if (!req) {
-		*skip = 1;
-		m = NULL;
-		pr_info("get_reply unknown tid %llu from osd%d\n", tid,
-			osd->o_osd);
-		goto out;
-	}
-
-	if (req->r_con_filling_msg) {
-		dout("get_reply revoking msg %p from old con %p\n",
-		     req->r_reply, req->r_con_filling_msg);
-		ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply);
-		ceph_con_put(req->r_con_filling_msg);
-		req->r_con_filling_msg = NULL;
-	}
-
-	if (front > req->r_reply->front.iov_len) {
-		pr_warning("get_reply front %d > preallocated %d\n",
-			   front, (int)req->r_reply->front.iov_len);
-		m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, GFP_NOFS);
-		if (!m)
-			goto out;
-		ceph_msg_put(req->r_reply);
-		req->r_reply = m;
-	}
-	m = ceph_msg_get(req->r_reply);
-
-	if (data_len > 0) {
-		unsigned data_off = le16_to_cpu(hdr->data_off);
-		int want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
-
-		if (unlikely(req->r_num_pages < want)) {
-			pr_warning("tid %lld reply %d > expected %d pages\n",
-				   tid, want, m->nr_pages);
-			*skip = 1;
-			ceph_msg_put(m);
-			m = NULL;
-			goto out;
-		}
-		m->pages = req->r_pages;
-		m->nr_pages = req->r_num_pages;
-	}
-	*skip = 0;
-	req->r_con_filling_msg = ceph_con_get(con);
-	dout("get_reply tid %lld %p\n", tid, m);
-
-out:
-	mutex_unlock(&osdc->request_mutex);
-	return m;
-
-}
-
-static struct ceph_msg *alloc_msg(struct ceph_connection *con,
-				  struct ceph_msg_header *hdr,
-				  int *skip)
-{
-	struct ceph_osd *osd = con->private;
-	int type = le16_to_cpu(hdr->type);
-	int front = le32_to_cpu(hdr->front_len);
-
-	switch (type) {
-	case CEPH_MSG_OSD_MAP:
-		return ceph_msg_new(type, front, GFP_NOFS);
-	case CEPH_MSG_OSD_OPREPLY:
-		return get_reply(con, hdr, skip);
-	default:
-		pr_info("alloc_msg unexpected msg type %d from osd%d\n", type,
-			osd->o_osd);
-		*skip = 1;
-		return NULL;
-	}
-}
-
-/*
- * Wrappers to refcount containing ceph_osd struct
- */
-static struct ceph_connection *get_osd_con(struct ceph_connection *con)
-{
-	struct ceph_osd *osd = con->private;
-	if (get_osd(osd))
-		return con;
-	return NULL;
-}
-
-static void put_osd_con(struct ceph_connection *con)
-{
-	struct ceph_osd *osd = con->private;
-	put_osd(osd);
-}
-
-/*
- * authentication
- */
-static int get_authorizer(struct ceph_connection *con,
-	                  void **buf, int *len, int *proto,
-	                  void **reply_buf, int *reply_len, int force_new)
-{
-	struct ceph_osd *o = con->private;
-	struct ceph_osd_client *osdc = o->o_osdc;
-	struct ceph_auth_client *ac = osdc->client->monc.auth;
-	int ret = 0;
-
-	if (force_new && o->o_authorizer) {
-		ac->ops->destroy_authorizer(ac, o->o_authorizer);
-		o->o_authorizer = NULL;
-	}
-	if (o->o_authorizer == NULL) {
-		ret = ac->ops->create_authorizer(
-			ac, CEPH_ENTITY_TYPE_OSD,
-			&o->o_authorizer,
-			&o->o_authorizer_buf,
-			&o->o_authorizer_buf_len,
-			&o->o_authorizer_reply_buf,
-			&o->o_authorizer_reply_buf_len);
-		if (ret)
-		return ret;
-	}
-
-	*proto = ac->protocol;
-	*buf = o->o_authorizer_buf;
-	*len = o->o_authorizer_buf_len;
-	*reply_buf = o->o_authorizer_reply_buf;
-	*reply_len = o->o_authorizer_reply_buf_len;
-	return 0;
-}
-
-
-static int verify_authorizer_reply(struct ceph_connection *con, int len)
-{
-	struct ceph_osd *o = con->private;
-	struct ceph_osd_client *osdc = o->o_osdc;
-	struct ceph_auth_client *ac = osdc->client->monc.auth;
-
-	return ac->ops->verify_authorizer_reply(ac, o->o_authorizer, len);
-}
-
-static int invalidate_authorizer(struct ceph_connection *con)
-{
-	struct ceph_osd *o = con->private;
-	struct ceph_osd_client *osdc = o->o_osdc;
-	struct ceph_auth_client *ac = osdc->client->monc.auth;
-
-	if (ac->ops->invalidate_authorizer)
-		ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD);
-
-	return ceph_monc_validate_auth(&osdc->client->monc);
-}
-
-static const struct ceph_connection_operations osd_con_ops = {
-	.get = get_osd_con,
-	.put = put_osd_con,
-	.dispatch = dispatch,
-	.get_authorizer = get_authorizer,
-	.verify_authorizer_reply = verify_authorizer_reply,
-	.invalidate_authorizer = invalidate_authorizer,
-	.alloc_msg = alloc_msg,
-	.fault = osd_reset,
-};
diff --git a/fs/ceph/osd_client.h b/fs/ceph/osd_client.h
deleted file mode 100644
index ce776989ef6..00000000000
--- a/fs/ceph/osd_client.h
+++ /dev/null
@@ -1,167 +0,0 @@
-#ifndef _FS_CEPH_OSD_CLIENT_H
-#define _FS_CEPH_OSD_CLIENT_H
-
-#include <linux/completion.h>
-#include <linux/kref.h>
-#include <linux/mempool.h>
-#include <linux/rbtree.h>
-
-#include "types.h"
-#include "osdmap.h"
-#include "messenger.h"
-
-struct ceph_msg;
-struct ceph_snap_context;
-struct ceph_osd_request;
-struct ceph_osd_client;
-struct ceph_authorizer;
-
-/*
- * completion callback for async writepages
- */
-typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *,
-				     struct ceph_msg *);
-
-/* a given osd we're communicating with */
-struct ceph_osd {
-	atomic_t o_ref;
-	struct ceph_osd_client *o_osdc;
-	int o_osd;
-	int o_incarnation;
-	struct rb_node o_node;
-	struct ceph_connection o_con;
-	struct list_head o_requests;
-	struct list_head o_osd_lru;
-	struct ceph_authorizer *o_authorizer;
-	void *o_authorizer_buf, *o_authorizer_reply_buf;
-	size_t o_authorizer_buf_len, o_authorizer_reply_buf_len;
-	unsigned long lru_ttl;
-	int o_marked_for_keepalive;
-	struct list_head o_keepalive_item;
-};
-
-/* an in-flight request */
-struct ceph_osd_request {
-	u64             r_tid;              /* unique for this client */
-	struct rb_node  r_node;
-	struct list_head r_req_lru_item;
-	struct list_head r_osd_item;
-	struct ceph_osd *r_osd;
-	struct ceph_pg   r_pgid;
-	int              r_pg_osds[CEPH_PG_MAX_SIZE];
-	int              r_num_pg_osds;
-
-	struct ceph_connection *r_con_filling_msg;
-
-	struct ceph_msg  *r_request, *r_reply;
-	int               r_result;
-	int               r_flags;     /* any additional flags for the osd */
-	u32               r_sent;      /* >0 if r_request is sending/sent */
-	int               r_got_reply;
-
-	struct ceph_osd_client *r_osdc;
-	struct kref       r_kref;
-	bool              r_mempool;
-	struct completion r_completion, r_safe_completion;
-	ceph_osdc_callback_t r_callback, r_safe_callback;
-	struct ceph_eversion r_reassert_version;
-	struct list_head  r_unsafe_item;
-
-	struct inode *r_inode;         	      /* for use by callbacks */
-
-	char              r_oid[40];          /* object name */
-	int               r_oid_len;
-	unsigned long     r_stamp;            /* send OR check time */
-	bool              r_resend;           /* msg send failed, needs retry */
-
-	struct ceph_file_layout r_file_layout;
-	struct ceph_snap_context *r_snapc;    /* snap context for writes */
-	unsigned          r_num_pages;        /* size of page array (follows) */
-	struct page     **r_pages;            /* pages for data payload */
-	int               r_pages_from_pool;
-	int               r_own_pages;        /* if true, i own page list */
-};
-
-struct ceph_osd_client {
-	struct ceph_client     *client;
-
-	struct ceph_osdmap     *osdmap;       /* current map */
-	struct rw_semaphore    map_sem;
-	struct completion      map_waiters;
-	u64                    last_requested_map;
-
-	struct mutex           request_mutex;
-	struct rb_root         osds;          /* osds */
-	struct list_head       osd_lru;       /* idle osds */
-	u64                    timeout_tid;   /* tid of timeout triggering rq */
-	u64                    last_tid;      /* tid of last request */
-	struct rb_root         requests;      /* pending requests */
-	struct list_head       req_lru;	      /* pending requests lru */
-	int                    num_requests;
-	struct delayed_work    timeout_work;
-	struct delayed_work    osds_timeout_work;
-#ifdef CONFIG_DEBUG_FS
-	struct dentry 	       *debugfs_file;
-#endif
-
-	mempool_t              *req_mempool;
-
-	struct ceph_msgpool	msgpool_op;
-	struct ceph_msgpool	msgpool_op_reply;
-};
-
-extern int ceph_osdc_init(struct ceph_osd_client *osdc,
-			  struct ceph_client *client);
-extern void ceph_osdc_stop(struct ceph_osd_client *osdc);
-
-extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
-				   struct ceph_msg *msg);
-extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
-				 struct ceph_msg *msg);
-
-extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
-				      struct ceph_file_layout *layout,
-				      struct ceph_vino vino,
-				      u64 offset, u64 *len, int op, int flags,
-				      struct ceph_snap_context *snapc,
-				      int do_sync, u32 truncate_seq,
-				      u64 truncate_size,
-				      struct timespec *mtime,
-				      bool use_mempool, int num_reply);
-
-static inline void ceph_osdc_get_request(struct ceph_osd_request *req)
-{
-	kref_get(&req->r_kref);
-}
-extern void ceph_osdc_release_request(struct kref *kref);
-static inline void ceph_osdc_put_request(struct ceph_osd_request *req)
-{
-	kref_put(&req->r_kref, ceph_osdc_release_request);
-}
-
-extern int ceph_osdc_start_request(struct ceph_osd_client *osdc,
-				   struct ceph_osd_request *req,
-				   bool nofail);
-extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
-				  struct ceph_osd_request *req);
-extern void ceph_osdc_sync(struct ceph_osd_client *osdc);
-
-extern int ceph_osdc_readpages(struct ceph_osd_client *osdc,
-			       struct ceph_vino vino,
-			       struct ceph_file_layout *layout,
-			       u64 off, u64 *plen,
-			       u32 truncate_seq, u64 truncate_size,
-			       struct page **pages, int nr_pages);
-
-extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
-				struct ceph_vino vino,
-				struct ceph_file_layout *layout,
-				struct ceph_snap_context *sc,
-				u64 off, u64 len,
-				u32 truncate_seq, u64 truncate_size,
-				struct timespec *mtime,
-				struct page **pages, int nr_pages,
-				int flags, int do_sync, bool nofail);
-
-#endif
-
diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
deleted file mode 100644
index 416d46adbf8..00000000000
--- a/fs/ceph/osdmap.c
+++ /dev/null
@@ -1,1087 +0,0 @@
-
-#include "ceph_debug.h"
-
-#include <linux/slab.h>
-#include <asm/div64.h>
-
-#include "super.h"
-#include "osdmap.h"
-#include "crush/hash.h"
-#include "crush/mapper.h"
-#include "decode.h"
-
-char *ceph_osdmap_state_str(char *str, int len, int state)
-{
-	int flag = 0;
-
-	if (!len)
-		goto done;
-
-	*str = '\0';
-	if (state) {
-		if (state & CEPH_OSD_EXISTS) {
-			snprintf(str, len, "exists");
-			flag = 1;
-		}
-		if (state & CEPH_OSD_UP) {
-			snprintf(str, len, "%s%s%s", str, (flag ? ", " : ""),
-				 "up");
-			flag = 1;
-		}
-	} else {
-		snprintf(str, len, "doesn't exist");
-	}
-done:
-	return str;
-}
-
-/* maps */
-
-static int calc_bits_of(unsigned t)
-{
-	int b = 0;
-	while (t) {
-		t = t >> 1;
-		b++;
-	}
-	return b;
-}
-
-/*
- * the foo_mask is the smallest value 2^n-1 that is >= foo.
- */
-static void calc_pg_masks(struct ceph_pg_pool_info *pi)
-{
-	pi->pg_num_mask = (1 << calc_bits_of(le32_to_cpu(pi->v.pg_num)-1)) - 1;
-	pi->pgp_num_mask =
-		(1 << calc_bits_of(le32_to_cpu(pi->v.pgp_num)-1)) - 1;
-	pi->lpg_num_mask =
-		(1 << calc_bits_of(le32_to_cpu(pi->v.lpg_num)-1)) - 1;
-	pi->lpgp_num_mask =
-		(1 << calc_bits_of(le32_to_cpu(pi->v.lpgp_num)-1)) - 1;
-}
-
-/*
- * decode crush map
- */
-static int crush_decode_uniform_bucket(void **p, void *end,
-				       struct crush_bucket_uniform *b)
-{
-	dout("crush_decode_uniform_bucket %p to %p\n", *p, end);
-	ceph_decode_need(p, end, (1+b->h.size) * sizeof(u32), bad);
-	b->item_weight = ceph_decode_32(p);
-	return 0;
-bad:
-	return -EINVAL;
-}
-
-static int crush_decode_list_bucket(void **p, void *end,
-				    struct crush_bucket_list *b)
-{
-	int j;
-	dout("crush_decode_list_bucket %p to %p\n", *p, end);
-	b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
-	if (b->item_weights == NULL)
-		return -ENOMEM;
-	b->sum_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
-	if (b->sum_weights == NULL)
-		return -ENOMEM;
-	ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
-	for (j = 0; j < b->h.size; j++) {
-		b->item_weights[j] = ceph_decode_32(p);
-		b->sum_weights[j] = ceph_decode_32(p);
-	}
-	return 0;
-bad:
-	return -EINVAL;
-}
-
-static int crush_decode_tree_bucket(void **p, void *end,
-				    struct crush_bucket_tree *b)
-{
-	int j;
-	dout("crush_decode_tree_bucket %p to %p\n", *p, end);
-	ceph_decode_32_safe(p, end, b->num_nodes, bad);
-	b->node_weights = kcalloc(b->num_nodes, sizeof(u32), GFP_NOFS);
-	if (b->node_weights == NULL)
-		return -ENOMEM;
-	ceph_decode_need(p, end, b->num_nodes * sizeof(u32), bad);
-	for (j = 0; j < b->num_nodes; j++)
-		b->node_weights[j] = ceph_decode_32(p);
-	return 0;
-bad:
-	return -EINVAL;
-}
-
-static int crush_decode_straw_bucket(void **p, void *end,
-				     struct crush_bucket_straw *b)
-{
-	int j;
-	dout("crush_decode_straw_bucket %p to %p\n", *p, end);
-	b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
-	if (b->item_weights == NULL)
-		return -ENOMEM;
-	b->straws = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
-	if (b->straws == NULL)
-		return -ENOMEM;
-	ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
-	for (j = 0; j < b->h.size; j++) {
-		b->item_weights[j] = ceph_decode_32(p);
-		b->straws[j] = ceph_decode_32(p);
-	}
-	return 0;
-bad:
-	return -EINVAL;
-}
-
-static struct crush_map *crush_decode(void *pbyval, void *end)
-{
-	struct crush_map *c;
-	int err = -EINVAL;
-	int i, j;
-	void **p = &pbyval;
-	void *start = pbyval;
-	u32 magic;
-
-	dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));
-
-	c = kzalloc(sizeof(*c), GFP_NOFS);
-	if (c == NULL)
-		return ERR_PTR(-ENOMEM);
-
-	ceph_decode_need(p, end, 4*sizeof(u32), bad);
-	magic = ceph_decode_32(p);
-	if (magic != CRUSH_MAGIC) {
-		pr_err("crush_decode magic %x != current %x\n",
-		       (unsigned)magic, (unsigned)CRUSH_MAGIC);
-		goto bad;
-	}
-	c->max_buckets = ceph_decode_32(p);
-	c->max_rules = ceph_decode_32(p);
-	c->max_devices = ceph_decode_32(p);
-
-	c->device_parents = kcalloc(c->max_devices, sizeof(u32), GFP_NOFS);
-	if (c->device_parents == NULL)
-		goto badmem;
-	c->bucket_parents = kcalloc(c->max_buckets, sizeof(u32), GFP_NOFS);
-	if (c->bucket_parents == NULL)
-		goto badmem;
-
-	c->buckets = kcalloc(c->max_buckets, sizeof(*c->buckets), GFP_NOFS);
-	if (c->buckets == NULL)
-		goto badmem;
-	c->rules = kcalloc(c->max_rules, sizeof(*c->rules), GFP_NOFS);
-	if (c->rules == NULL)
-		goto badmem;
-
-	/* buckets */
-	for (i = 0; i < c->max_buckets; i++) {
-		int size = 0;
-		u32 alg;
-		struct crush_bucket *b;
-
-		ceph_decode_32_safe(p, end, alg, bad);
-		if (alg == 0) {
-			c->buckets[i] = NULL;
-			continue;
-		}
-		dout("crush_decode bucket %d off %x %p to %p\n",
-		     i, (int)(*p-start), *p, end);
-
-		switch (alg) {
-		case CRUSH_BUCKET_UNIFORM:
-			size = sizeof(struct crush_bucket_uniform);
-			break;
-		case CRUSH_BUCKET_LIST:
-			size = sizeof(struct crush_bucket_list);
-			break;
-		case CRUSH_BUCKET_TREE:
-			size = sizeof(struct crush_bucket_tree);
-			break;
-		case CRUSH_BUCKET_STRAW:
-			size = sizeof(struct crush_bucket_straw);
-			break;
-		default:
-			err = -EINVAL;
-			goto bad;
-		}
-		BUG_ON(size == 0);
-		b = c->buckets[i] = kzalloc(size, GFP_NOFS);
-		if (b == NULL)
-			goto badmem;
-
-		ceph_decode_need(p, end, 4*sizeof(u32), bad);
-		b->id = ceph_decode_32(p);
-		b->type = ceph_decode_16(p);
-		b->alg = ceph_decode_8(p);
-		b->hash = ceph_decode_8(p);
-		b->weight = ceph_decode_32(p);
-		b->size = ceph_decode_32(p);
-
-		dout("crush_decode bucket size %d off %x %p to %p\n",
-		     b->size, (int)(*p-start), *p, end);
-
-		b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS);
-		if (b->items == NULL)
-			goto badmem;
-		b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS);
-		if (b->perm == NULL)
-			goto badmem;
-		b->perm_n = 0;
-
-		ceph_decode_need(p, end, b->size*sizeof(u32), bad);
-		for (j = 0; j < b->size; j++)
-			b->items[j] = ceph_decode_32(p);
-
-		switch (b->alg) {
-		case CRUSH_BUCKET_UNIFORM:
-			err = crush_decode_uniform_bucket(p, end,
-				  (struct crush_bucket_uniform *)b);
-			if (err < 0)
-				goto bad;
-			break;
-		case CRUSH_BUCKET_LIST:
-			err = crush_decode_list_bucket(p, end,
-			       (struct crush_bucket_list *)b);
-			if (err < 0)
-				goto bad;
-			break;
-		case CRUSH_BUCKET_TREE:
-			err = crush_decode_tree_bucket(p, end,
-				(struct crush_bucket_tree *)b);
-			if (err < 0)
-				goto bad;
-			break;
-		case CRUSH_BUCKET_STRAW:
-			err = crush_decode_straw_bucket(p, end,
-				(struct crush_bucket_straw *)b);
-			if (err < 0)
-				goto bad;
-			break;
-		}
-	}
-
-	/* rules */
-	dout("rule vec is %p\n", c->rules);
-	for (i = 0; i < c->max_rules; i++) {
-		u32 yes;
-		struct crush_rule *r;
-
-		ceph_decode_32_safe(p, end, yes, bad);
-		if (!yes) {
-			dout("crush_decode NO rule %d off %x %p to %p\n",
-			     i, (int)(*p-start), *p, end);
-			c->rules[i] = NULL;
-			continue;
-		}
-
-		dout("crush_decode rule %d off %x %p to %p\n",
-		     i, (int)(*p-start), *p, end);
-
-		/* len */
-		ceph_decode_32_safe(p, end, yes, bad);
-#if BITS_PER_LONG == 32
-		err = -EINVAL;
-		if (yes > ULONG_MAX / sizeof(struct crush_rule_step))
-			goto bad;
-#endif
-		r = c->rules[i] = kmalloc(sizeof(*r) +
-					  yes*sizeof(struct crush_rule_step),
-					  GFP_NOFS);
-		if (r == NULL)
-			goto badmem;
-		dout(" rule %d is at %p\n", i, r);
-		r->len = yes;
-		ceph_decode_copy_safe(p, end, &r->mask, 4, bad); /* 4 u8's */
-		ceph_decode_need(p, end, r->len*3*sizeof(u32), bad);
-		for (j = 0; j < r->len; j++) {
-			r->steps[j].op = ceph_decode_32(p);
-			r->steps[j].arg1 = ceph_decode_32(p);
-			r->steps[j].arg2 = ceph_decode_32(p);
-		}
-	}
-
-	/* ignore trailing name maps. */
-
-	dout("crush_decode success\n");
-	return c;
-
-badmem:
-	err = -ENOMEM;
-bad:
-	dout("crush_decode fail %d\n", err);
-	crush_destroy(c);
-	return ERR_PTR(err);
-}
-
-/*
- * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
- * to a set of osds)
- */
-static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
-{
-	u64 a = *(u64 *)&l;
-	u64 b = *(u64 *)&r;
-
-	if (a < b)
-		return -1;
-	if (a > b)
-		return 1;
-	return 0;
-}
-
-static int __insert_pg_mapping(struct ceph_pg_mapping *new,
-			       struct rb_root *root)
-{
-	struct rb_node **p = &root->rb_node;
-	struct rb_node *parent = NULL;
-	struct ceph_pg_mapping *pg = NULL;
-	int c;
-
-	while (*p) {
-		parent = *p;
-		pg = rb_entry(parent, struct ceph_pg_mapping, node);
-		c = pgid_cmp(new->pgid, pg->pgid);
-		if (c < 0)
-			p = &(*p)->rb_left;
-		else if (c > 0)
-			p = &(*p)->rb_right;
-		else
-			return -EEXIST;
-	}
-
-	rb_link_node(&new->node, parent, p);
-	rb_insert_color(&new->node, root);
-	return 0;
-}
-
-static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
-						   struct ceph_pg pgid)
-{
-	struct rb_node *n = root->rb_node;
-	struct ceph_pg_mapping *pg;
-	int c;
-
-	while (n) {
-		pg = rb_entry(n, struct ceph_pg_mapping, node);
-		c = pgid_cmp(pgid, pg->pgid);
-		if (c < 0)
-			n = n->rb_left;
-		else if (c > 0)
-			n = n->rb_right;
-		else
-			return pg;
-	}
-	return NULL;
-}
-
-/*
- * rbtree of pg pool info
- */
-static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new)
-{
-	struct rb_node **p = &root->rb_node;
-	struct rb_node *parent = NULL;
-	struct ceph_pg_pool_info *pi = NULL;
-
-	while (*p) {
-		parent = *p;
-		pi = rb_entry(parent, struct ceph_pg_pool_info, node);
-		if (new->id < pi->id)
-			p = &(*p)->rb_left;
-		else if (new->id > pi->id)
-			p = &(*p)->rb_right;
-		else
-			return -EEXIST;
-	}
-
-	rb_link_node(&new->node, parent, p);
-	rb_insert_color(&new->node, root);
-	return 0;
-}
-
-static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id)
-{
-	struct ceph_pg_pool_info *pi;
-	struct rb_node *n = root->rb_node;
-
-	while (n) {
-		pi = rb_entry(n, struct ceph_pg_pool_info, node);
-		if (id < pi->id)
-			n = n->rb_left;
-		else if (id > pi->id)
-			n = n->rb_right;
-		else
-			return pi;
-	}
-	return NULL;
-}
-
-static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
-{
-	rb_erase(&pi->node, root);
-	kfree(pi->name);
-	kfree(pi);
-}
-
-void __decode_pool(void **p, struct ceph_pg_pool_info *pi)
-{
-	ceph_decode_copy(p, &pi->v, sizeof(pi->v));
-	calc_pg_masks(pi);
-	*p += le32_to_cpu(pi->v.num_snaps) * sizeof(u64);
-	*p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2;
-}
-
-static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
-{
-	struct ceph_pg_pool_info *pi;
-	u32 num, len, pool;
-
-	ceph_decode_32_safe(p, end, num, bad);
-	dout(" %d pool names\n", num);
-	while (num--) {
-		ceph_decode_32_safe(p, end, pool, bad);
-		ceph_decode_32_safe(p, end, len, bad);
-		dout("  pool %d len %d\n", pool, len);
-		pi = __lookup_pg_pool(&map->pg_pools, pool);
-		if (pi) {
-			kfree(pi->name);
-			pi->name = kmalloc(len + 1, GFP_NOFS);
-			if (pi->name) {
-				memcpy(pi->name, *p, len);
-				pi->name[len] = '\0';
-				dout("  name is %s\n", pi->name);
-			}
-		}
-		*p += len;
-	}
-	return 0;
-
-bad:
-	return -EINVAL;
-}
-
-/*
- * osd map
- */
-void ceph_osdmap_destroy(struct ceph_osdmap *map)
-{
-	dout("osdmap_destroy %p\n", map);
-	if (map->crush)
-		crush_destroy(map->crush);
-	while (!RB_EMPTY_ROOT(&map->pg_temp)) {
-		struct ceph_pg_mapping *pg =
-			rb_entry(rb_first(&map->pg_temp),
-				 struct ceph_pg_mapping, node);
-		rb_erase(&pg->node, &map->pg_temp);
-		kfree(pg);
-	}
-	while (!RB_EMPTY_ROOT(&map->pg_pools)) {
-		struct ceph_pg_pool_info *pi =
-			rb_entry(rb_first(&map->pg_pools),
-				 struct ceph_pg_pool_info, node);
-		__remove_pg_pool(&map->pg_pools, pi);
-	}
-	kfree(map->osd_state);
-	kfree(map->osd_weight);
-	kfree(map->osd_addr);
-	kfree(map);
-}
-
-/*
- * adjust max osd value.  reallocate arrays.
- */
-static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
-{
-	u8 *state;
-	struct ceph_entity_addr *addr;
-	u32 *weight;
-
-	state = kcalloc(max, sizeof(*state), GFP_NOFS);
-	addr = kcalloc(max, sizeof(*addr), GFP_NOFS);
-	weight = kcalloc(max, sizeof(*weight), GFP_NOFS);
-	if (state == NULL || addr == NULL || weight == NULL) {
-		kfree(state);
-		kfree(addr);
-		kfree(weight);
-		return -ENOMEM;
-	}
-
-	/* copy old? */
-	if (map->osd_state) {
-		memcpy(state, map->osd_state, map->max_osd*sizeof(*state));
-		memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr));
-		memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight));
-		kfree(map->osd_state);
-		kfree(map->osd_addr);
-		kfree(map->osd_weight);
-	}
-
-	map->osd_state = state;
-	map->osd_weight = weight;
-	map->osd_addr = addr;
-	map->max_osd = max;
-	return 0;
-}
-
-/*
- * decode a full map.
- */
-struct ceph_osdmap *osdmap_decode(void **p, void *end)
-{
-	struct ceph_osdmap *map;
-	u16 version;
-	u32 len, max, i;
-	u8 ev;
-	int err = -EINVAL;
-	void *start = *p;
-	struct ceph_pg_pool_info *pi;
-
-	dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p));
-
-	map = kzalloc(sizeof(*map), GFP_NOFS);
-	if (map == NULL)
-		return ERR_PTR(-ENOMEM);
-	map->pg_temp = RB_ROOT;
-
-	ceph_decode_16_safe(p, end, version, bad);
-	if (version > CEPH_OSDMAP_VERSION) {
-		pr_warning("got unknown v %d > %d of osdmap\n", version,
-			   CEPH_OSDMAP_VERSION);
-		goto bad;
-	}
-
-	ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), bad);
-	ceph_decode_copy(p, &map->fsid, sizeof(map->fsid));
-	map->epoch = ceph_decode_32(p);
-	ceph_decode_copy(p, &map->created, sizeof(map->created));
-	ceph_decode_copy(p, &map->modified, sizeof(map->modified));
-
-	ceph_decode_32_safe(p, end, max, bad);
-	while (max--) {
-		ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad);
-		pi = kzalloc(sizeof(*pi), GFP_NOFS);
-		if (!pi)
-			goto bad;
-		pi->id = ceph_decode_32(p);
-		ev = ceph_decode_8(p); /* encoding version */
-		if (ev > CEPH_PG_POOL_VERSION) {
-			pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
-				   ev, CEPH_PG_POOL_VERSION);
-			kfree(pi);
-			goto bad;
-		}
-		__decode_pool(p, pi);
-		__insert_pg_pool(&map->pg_pools, pi);
-	}
-
-	if (version >= 5 && __decode_pool_names(p, end, map) < 0)
-		goto bad;
-
-	ceph_decode_32_safe(p, end, map->pool_max, bad);
-
-	ceph_decode_32_safe(p, end, map->flags, bad);
-
-	max = ceph_decode_32(p);
-
-	/* (re)alloc osd arrays */
-	err = osdmap_set_max_osd(map, max);
-	if (err < 0)
-		goto bad;
-	dout("osdmap_decode max_osd = %d\n", map->max_osd);
-
-	/* osds */
-	err = -EINVAL;
-	ceph_decode_need(p, end, 3*sizeof(u32) +
-			 map->max_osd*(1 + sizeof(*map->osd_weight) +
-				       sizeof(*map->osd_addr)), bad);
-	*p += 4; /* skip length field (should match max) */
-	ceph_decode_copy(p, map->osd_state, map->max_osd);
-
-	*p += 4; /* skip length field (should match max) */
-	for (i = 0; i < map->max_osd; i++)
-		map->osd_weight[i] = ceph_decode_32(p);
-
-	*p += 4; /* skip length field (should match max) */
-	ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr));
-	for (i = 0; i < map->max_osd; i++)
-		ceph_decode_addr(&map->osd_addr[i]);
-
-	/* pg_temp */
-	ceph_decode_32_safe(p, end, len, bad);
-	for (i = 0; i < len; i++) {
-		int n, j;
-		struct ceph_pg pgid;
-		struct ceph_pg_mapping *pg;
-
-		ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad);
-		ceph_decode_copy(p, &pgid, sizeof(pgid));
-		n = ceph_decode_32(p);
-		ceph_decode_need(p, end, n * sizeof(u32), bad);
-		err = -ENOMEM;
-		pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS);
-		if (!pg)
-			goto bad;
-		pg->pgid = pgid;
-		pg->len = n;
-		for (j = 0; j < n; j++)
-			pg->osds[j] = ceph_decode_32(p);
-
-		err = __insert_pg_mapping(pg, &map->pg_temp);
-		if (err)
-			goto bad;
-		dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, len);
-	}
-
-	/* crush */
-	ceph_decode_32_safe(p, end, len, bad);
-	dout("osdmap_decode crush len %d from off 0x%x\n", len,
-	     (int)(*p - start));
-	ceph_decode_need(p, end, len, bad);
-	map->crush = crush_decode(*p, end);
-	*p += len;
-	if (IS_ERR(map->crush)) {
-		err = PTR_ERR(map->crush);
-		map->crush = NULL;
-		goto bad;
-	}
-
-	/* ignore the rest of the map */
-	*p = end;
-
-	dout("osdmap_decode done %p %p\n", *p, end);
-	return map;
-
-bad:
-	dout("osdmap_decode fail\n");
-	ceph_osdmap_destroy(map);
-	return ERR_PTR(err);
-}
-
-/*
- * decode and apply an incremental map update.
- */
-struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
-					     struct ceph_osdmap *map,
-					     struct ceph_messenger *msgr)
-{
-	struct crush_map *newcrush = NULL;
-	struct ceph_fsid fsid;
-	u32 epoch = 0;
-	struct ceph_timespec modified;
-	u32 len, pool;
-	__s32 new_pool_max, new_flags, max;
-	void *start = *p;
-	int err = -EINVAL;
-	u16 version;
-	struct rb_node *rbp;
-
-	ceph_decode_16_safe(p, end, version, bad);
-	if (version > CEPH_OSDMAP_INC_VERSION) {
-		pr_warning("got unknown v %d > %d of inc osdmap\n", version,
-			   CEPH_OSDMAP_INC_VERSION);
-		goto bad;
-	}
-
-	ceph_decode_need(p, end, sizeof(fsid)+sizeof(modified)+2*sizeof(u32),
-			 bad);
-	ceph_decode_copy(p, &fsid, sizeof(fsid));
-	epoch = ceph_decode_32(p);
-	BUG_ON(epoch != map->epoch+1);
-	ceph_decode_copy(p, &modified, sizeof(modified));
-	new_pool_max = ceph_decode_32(p);
-	new_flags = ceph_decode_32(p);
-
-	/* full map? */
-	ceph_decode_32_safe(p, end, len, bad);
-	if (len > 0) {
-		dout("apply_incremental full map len %d, %p to %p\n",
-		     len, *p, end);
-		return osdmap_decode(p, min(*p+len, end));
-	}
-
-	/* new crush? */
-	ceph_decode_32_safe(p, end, len, bad);
-	if (len > 0) {
-		dout("apply_incremental new crush map len %d, %p to %p\n",
-		     len, *p, end);
-		newcrush = crush_decode(*p, min(*p+len, end));
-		if (IS_ERR(newcrush))
-			return ERR_CAST(newcrush);
-		*p += len;
-	}
-
-	/* new flags? */
-	if (new_flags >= 0)
-		map->flags = new_flags;
-	if (new_pool_max >= 0)
-		map->pool_max = new_pool_max;
-
-	ceph_decode_need(p, end, 5*sizeof(u32), bad);
-
-	/* new max? */
-	max = ceph_decode_32(p);
-	if (max >= 0) {
-		err = osdmap_set_max_osd(map, max);
-		if (err < 0)
-			goto bad;
-	}
-
-	map->epoch++;
-	map->modified = map->modified;
-	if (newcrush) {
-		if (map->crush)
-			crush_destroy(map->crush);
-		map->crush = newcrush;
-		newcrush = NULL;
-	}
-
-	/* new_pool */
-	ceph_decode_32_safe(p, end, len, bad);
-	while (len--) {
-		__u8 ev;
-		struct ceph_pg_pool_info *pi;
-
-		ceph_decode_32_safe(p, end, pool, bad);
-		ceph_decode_need(p, end, 1 + sizeof(pi->v), bad);
-		ev = ceph_decode_8(p);  /* encoding version */
-		if (ev > CEPH_PG_POOL_VERSION) {
-			pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
-				   ev, CEPH_PG_POOL_VERSION);
-			goto bad;
-		}
-		pi = __lookup_pg_pool(&map->pg_pools, pool);
-		if (!pi) {
-			pi = kzalloc(sizeof(*pi), GFP_NOFS);
-			if (!pi) {
-				err = -ENOMEM;
-				goto bad;
-			}
-			pi->id = pool;
-			__insert_pg_pool(&map->pg_pools, pi);
-		}
-		__decode_pool(p, pi);
-	}
-	if (version >= 5 && __decode_pool_names(p, end, map) < 0)
-		goto bad;
-
-	/* old_pool */
-	ceph_decode_32_safe(p, end, len, bad);
-	while (len--) {
-		struct ceph_pg_pool_info *pi;
-
-		ceph_decode_32_safe(p, end, pool, bad);
-		pi = __lookup_pg_pool(&map->pg_pools, pool);
-		if (pi)
-			__remove_pg_pool(&map->pg_pools, pi);
-	}
-
-	/* new_up */
-	err = -EINVAL;
-	ceph_decode_32_safe(p, end, len, bad);
-	while (len--) {
-		u32 osd;
-		struct ceph_entity_addr addr;
-		ceph_decode_32_safe(p, end, osd, bad);
-		ceph_decode_copy_safe(p, end, &addr, sizeof(addr), bad);
-		ceph_decode_addr(&addr);
-		pr_info("osd%d up\n", osd);
-		BUG_ON(osd >= map->max_osd);
-		map->osd_state[osd] |= CEPH_OSD_UP;
-		map->osd_addr[osd] = addr;
-	}
-
-	/* new_down */
-	ceph_decode_32_safe(p, end, len, bad);
-	while (len--) {
-		u32 osd;
-		ceph_decode_32_safe(p, end, osd, bad);
-		(*p)++;  /* clean flag */
-		pr_info("osd%d down\n", osd);
-		if (osd < map->max_osd)
-			map->osd_state[osd] &= ~CEPH_OSD_UP;
-	}
-
-	/* new_weight */
-	ceph_decode_32_safe(p, end, len, bad);
-	while (len--) {
-		u32 osd, off;
-		ceph_decode_need(p, end, sizeof(u32)*2, bad);
-		osd = ceph_decode_32(p);
-		off = ceph_decode_32(p);
-		pr_info("osd%d weight 0x%x %s\n", osd, off,
-		     off == CEPH_OSD_IN ? "(in)" :
-		     (off == CEPH_OSD_OUT ? "(out)" : ""));
-		if (osd < map->max_osd)
-			map->osd_weight[osd] = off;
-	}
-
-	/* new_pg_temp */
-	rbp = rb_first(&map->pg_temp);
-	ceph_decode_32_safe(p, end, len, bad);
-	while (len--) {
-		struct ceph_pg_mapping *pg;
-		int j;
-		struct ceph_pg pgid;
-		u32 pglen;
-		ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
-		ceph_decode_copy(p, &pgid, sizeof(pgid));
-		pglen = ceph_decode_32(p);
-
-		/* remove any? */
-		while (rbp && pgid_cmp(rb_entry(rbp, struct ceph_pg_mapping,
-						node)->pgid, pgid) <= 0) {
-			struct ceph_pg_mapping *cur =
-				rb_entry(rbp, struct ceph_pg_mapping, node);
-			
-			rbp = rb_next(rbp);
-			dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid);
-			rb_erase(&cur->node, &map->pg_temp);
-			kfree(cur);
-		}
-
-		if (pglen) {
-			/* insert */
-			ceph_decode_need(p, end, pglen*sizeof(u32), bad);
-			pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS);
-			if (!pg) {
-				err = -ENOMEM;
-				goto bad;
-			}
-			pg->pgid = pgid;
-			pg->len = pglen;
-			for (j = 0; j < pglen; j++)
-				pg->osds[j] = ceph_decode_32(p);
-			err = __insert_pg_mapping(pg, &map->pg_temp);
-			if (err) {
-				kfree(pg);
-				goto bad;
-			}
-			dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid,
-			     pglen);
-		}
-	}
-	while (rbp) {
-		struct ceph_pg_mapping *cur =
-			rb_entry(rbp, struct ceph_pg_mapping, node);
-
-		rbp = rb_next(rbp);
-		dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid);
-		rb_erase(&cur->node, &map->pg_temp);
-		kfree(cur);
-	}
-
-	/* ignore the rest */
-	*p = end;
-	return map;
-
-bad:
-	pr_err("corrupt inc osdmap epoch %d off %d (%p of %p-%p)\n",
-	       epoch, (int)(*p - start), *p, start, end);
-	print_hex_dump(KERN_DEBUG, "osdmap: ",
-		       DUMP_PREFIX_OFFSET, 16, 1,
-		       start, end - start, true);
-	if (newcrush)
-		crush_destroy(newcrush);
-	return ERR_PTR(err);
-}
-
-
-
-
-/*
- * calculate file layout from given offset, length.
- * fill in correct oid, logical length, and object extent
- * offset, length.
- *
- * for now, we write only a single su, until we can
- * pass a stride back to the caller.
- */
-void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
-				   u64 off, u64 *plen,
-				   u64 *ono,
-				   u64 *oxoff, u64 *oxlen)
-{
-	u32 osize = le32_to_cpu(layout->fl_object_size);
-	u32 su = le32_to_cpu(layout->fl_stripe_unit);
-	u32 sc = le32_to_cpu(layout->fl_stripe_count);
-	u32 bl, stripeno, stripepos, objsetno;
-	u32 su_per_object;
-	u64 t, su_offset;
-
-	dout("mapping %llu~%llu  osize %u fl_su %u\n", off, *plen,
-	     osize, su);
-	su_per_object = osize / su;
-	dout("osize %u / su %u = su_per_object %u\n", osize, su,
-	     su_per_object);
-
-	BUG_ON((su & ~PAGE_MASK) != 0);
-	/* bl = *off / su; */
-	t = off;
-	do_div(t, su);
-	bl = t;
-	dout("off %llu / su %u = bl %u\n", off, su, bl);
-
-	stripeno = bl / sc;
-	stripepos = bl % sc;
-	objsetno = stripeno / su_per_object;
-
-	*ono = objsetno * sc + stripepos;
-	dout("objset %u * sc %u = ono %u\n", objsetno, sc, (unsigned)*ono);
-
-	/* *oxoff = *off % layout->fl_stripe_unit;  # offset in su */
-	t = off;
-	su_offset = do_div(t, su);
-	*oxoff = su_offset + (stripeno % su_per_object) * su;
-
-	/*
-	 * Calculate the length of the extent being written to the selected
-	 * object. This is the minimum of the full length requested (plen) or
-	 * the remainder of the current stripe being written to.
-	 */
-	*oxlen = min_t(u64, *plen, su - su_offset);
-	*plen = *oxlen;
-
-	dout(" obj extent %llu~%llu\n", *oxoff, *oxlen);
-}
-
-/*
- * calculate an object layout (i.e. pgid) from an oid,
- * file_layout, and osdmap
- */
-int ceph_calc_object_layout(struct ceph_object_layout *ol,
-			    const char *oid,
-			    struct ceph_file_layout *fl,
-			    struct ceph_osdmap *osdmap)
-{
-	unsigned num, num_mask;
-	struct ceph_pg pgid;
-	s32 preferred = (s32)le32_to_cpu(fl->fl_pg_preferred);
-	int poolid = le32_to_cpu(fl->fl_pg_pool);
-	struct ceph_pg_pool_info *pool;
-	unsigned ps;
-
-	BUG_ON(!osdmap);
-
-	pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
-	if (!pool)
-		return -EIO;
-	ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid));
-	if (preferred >= 0) {
-		ps += preferred;
-		num = le32_to_cpu(pool->v.lpg_num);
-		num_mask = pool->lpg_num_mask;
-	} else {
-		num = le32_to_cpu(pool->v.pg_num);
-		num_mask = pool->pg_num_mask;
-	}
-
-	pgid.ps = cpu_to_le16(ps);
-	pgid.preferred = cpu_to_le16(preferred);
-	pgid.pool = fl->fl_pg_pool;
-	if (preferred >= 0)
-		dout("calc_object_layout '%s' pgid %d.%xp%d\n", oid, poolid, ps,
-		     (int)preferred);
-	else
-		dout("calc_object_layout '%s' pgid %d.%x\n", oid, poolid, ps);
-
-	ol->ol_pgid = pgid;
-	ol->ol_stripe_unit = fl->fl_object_stripe_unit;
-	return 0;
-}
-
-/*
- * Calculate raw osd vector for the given pgid.  Return pointer to osd
- * array, or NULL on failure.
- */
-static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
-			int *osds, int *num)
-{
-	struct ceph_pg_mapping *pg;
-	struct ceph_pg_pool_info *pool;
-	int ruleno;
-	unsigned poolid, ps, pps;
-	int preferred;
-
-	/* pg_temp? */
-	pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
-	if (pg) {
-		*num = pg->len;
-		return pg->osds;
-	}
-
-	/* crush */
-	poolid = le32_to_cpu(pgid.pool);
-	ps = le16_to_cpu(pgid.ps);
-	preferred = (s16)le16_to_cpu(pgid.preferred);
-
-	/* don't forcefeed bad device ids to crush */
-	if (preferred >= osdmap->max_osd ||
-	    preferred >= osdmap->crush->max_devices)
-		preferred = -1;
-
-	pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
-	if (!pool)
-		return NULL;
-	ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset,
-				 pool->v.type, pool->v.size);
-	if (ruleno < 0) {
-		pr_err("no crush rule pool %d type %d size %d\n",
-		       poolid, pool->v.type, pool->v.size);
-		return NULL;
-	}
-
-	if (preferred >= 0)
-		pps = ceph_stable_mod(ps,
-				      le32_to_cpu(pool->v.lpgp_num),
-				      pool->lpgp_num_mask);
-	else
-		pps = ceph_stable_mod(ps,
-				      le32_to_cpu(pool->v.pgp_num),
-				      pool->pgp_num_mask);
-	pps += poolid;
-	*num = crush_do_rule(osdmap->crush, ruleno, pps, osds,
-			     min_t(int, pool->v.size, *num),
-			     preferred, osdmap->osd_weight);
-	return osds;
-}
-
-/*
- * Return acting set for given pgid.
- */
-int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
-			int *acting)
-{
-	int rawosds[CEPH_PG_MAX_SIZE], *osds;
-	int i, o, num = CEPH_PG_MAX_SIZE;
-
-	osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
-	if (!osds)
-		return -1;
-
-	/* primary is first up osd */
-	o = 0;
-	for (i = 0; i < num; i++)
-		if (ceph_osd_is_up(osdmap, osds[i]))
-			acting[o++] = osds[i];
-	return o;
-}
-
-/*
- * Return primary osd for given pgid, or -1 if none.
- */
-int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
-{
-	int rawosds[CEPH_PG_MAX_SIZE], *osds;
-	int i, num = CEPH_PG_MAX_SIZE;
-
-	osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
-	if (!osds)
-		return -1;
-
-	/* primary is first up osd */
-	for (i = 0; i < num; i++)
-		if (ceph_osd_is_up(osdmap, osds[i]))
-			return osds[i];
-	return -1;
-}
diff --git a/fs/ceph/osdmap.h b/fs/ceph/osdmap.h
deleted file mode 100644
index 970b547e510..00000000000
--- a/fs/ceph/osdmap.h
+++ /dev/null
@@ -1,128 +0,0 @@
-#ifndef _FS_CEPH_OSDMAP_H
-#define _FS_CEPH_OSDMAP_H
-
-#include <linux/rbtree.h>
-#include "types.h"
-#include "ceph_fs.h"
-#include "crush/crush.h"
-
-/*
- * The osd map describes the current membership of the osd cluster and
- * specifies the mapping of objects to placement groups and placement
- * groups to (sets of) osds.  That is, it completely specifies the
- * (desired) distribution of all data objects in the system at some
- * point in time.
- *
- * Each map version is identified by an epoch, which increases monotonically.
- *
- * The map can be updated either via an incremental map (diff) describing
- * the change between two successive epochs, or as a fully encoded map.
- */
-struct ceph_pg_pool_info {
-	struct rb_node node;
-	int id;
-	struct ceph_pg_pool v;
-	int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask;
-	char *name;
-};
-
-struct ceph_pg_mapping {
-	struct rb_node node;
-	struct ceph_pg pgid;
-	int len;
-	int osds[];
-};
-
-struct ceph_osdmap {
-	struct ceph_fsid fsid;
-	u32 epoch;
-	u32 mkfs_epoch;
-	struct ceph_timespec created, modified;
-
-	u32 flags;         /* CEPH_OSDMAP_* */
-
-	u32 max_osd;       /* size of osd_state, _offload, _addr arrays */
-	u8 *osd_state;     /* CEPH_OSD_* */
-	u32 *osd_weight;   /* 0 = failed, 0x10000 = 100% normal */
-	struct ceph_entity_addr *osd_addr;
-
-	struct rb_root pg_temp;
-	struct rb_root pg_pools;
-	u32 pool_max;
-
-	/* the CRUSH map specifies the mapping of placement groups to
-	 * the list of osds that store+replicate them. */
-	struct crush_map *crush;
-};
-
-/*
- * file layout helpers
- */
-#define ceph_file_layout_su(l) ((__s32)le32_to_cpu((l).fl_stripe_unit))
-#define ceph_file_layout_stripe_count(l) \
-	((__s32)le32_to_cpu((l).fl_stripe_count))
-#define ceph_file_layout_object_size(l) ((__s32)le32_to_cpu((l).fl_object_size))
-#define ceph_file_layout_cas_hash(l) ((__s32)le32_to_cpu((l).fl_cas_hash))
-#define ceph_file_layout_object_su(l) \
-	((__s32)le32_to_cpu((l).fl_object_stripe_unit))
-#define ceph_file_layout_pg_preferred(l) \
-	((__s32)le32_to_cpu((l).fl_pg_preferred))
-#define ceph_file_layout_pg_pool(l) \
-	((__s32)le32_to_cpu((l).fl_pg_pool))
-
-static inline unsigned ceph_file_layout_stripe_width(struct ceph_file_layout *l)
-{
-	return le32_to_cpu(l->fl_stripe_unit) *
-		le32_to_cpu(l->fl_stripe_count);
-}
-
-/* "period" == bytes before i start on a new set of objects */
-static inline unsigned ceph_file_layout_period(struct ceph_file_layout *l)
-{
-	return le32_to_cpu(l->fl_object_size) *
-		le32_to_cpu(l->fl_stripe_count);
-}
-
-
-static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd)
-{
-	return (osd < map->max_osd) && (map->osd_state[osd] & CEPH_OSD_UP);
-}
-
-static inline bool ceph_osdmap_flag(struct ceph_osdmap *map, int flag)
-{
-	return map && (map->flags & flag);
-}
-
-extern char *ceph_osdmap_state_str(char *str, int len, int state);
-
-static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map,
-						     int osd)
-{
-	if (osd >= map->max_osd)
-		return NULL;
-	return &map->osd_addr[osd];
-}
-
-extern struct ceph_osdmap *osdmap_decode(void **p, void *end);
-extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
-					    struct ceph_osdmap *map,
-					    struct ceph_messenger *msgr);
-extern void ceph_osdmap_destroy(struct ceph_osdmap *map);
-
-/* calculate mapping of a file extent to an object */
-extern void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
-					  u64 off, u64 *plen,
-					  u64 *bno, u64 *oxoff, u64 *oxlen);
-
-/* calculate mapping of object to a placement group */
-extern int ceph_calc_object_layout(struct ceph_object_layout *ol,
-				   const char *oid,
-				   struct ceph_file_layout *fl,
-				   struct ceph_osdmap *osdmap);
-extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
-			       int *acting);
-extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
-				struct ceph_pg pgid);
-
-#endif
diff --git a/fs/ceph/pagelist.c b/fs/ceph/pagelist.c
deleted file mode 100644
index b6859f47d36..00000000000
--- a/fs/ceph/pagelist.c
+++ /dev/null
@@ -1,55 +0,0 @@
-
-#include <linux/gfp.h>
-#include <linux/pagemap.h>
-#include <linux/highmem.h>
-
-#include "pagelist.h"
-
-int ceph_pagelist_release(struct ceph_pagelist *pl)
-{
-	if (pl->mapped_tail)
-		kunmap(pl->mapped_tail);
-	while (!list_empty(&pl->head)) {
-		struct page *page = list_first_entry(&pl->head, struct page,
-						     lru);
-		list_del(&page->lru);
-		__free_page(page);
-	}
-	return 0;
-}
-
-static int ceph_pagelist_addpage(struct ceph_pagelist *pl)
-{
-	struct page *page = __page_cache_alloc(GFP_NOFS);
-	if (!page)
-		return -ENOMEM;
-	pl->room += PAGE_SIZE;
-	list_add_tail(&page->lru, &pl->head);
-	if (pl->mapped_tail)
-		kunmap(pl->mapped_tail);
-	pl->mapped_tail = kmap(page);
-	return 0;
-}
-
-int ceph_pagelist_append(struct ceph_pagelist *pl, void *buf, size_t len)
-{
-	while (pl->room < len) {
-		size_t bit = pl->room;
-		int ret;
-
-		memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK),
-		       buf, bit);
-		pl->length += bit;
-		pl->room -= bit;
-		buf += bit;
-		len -= bit;
-		ret = ceph_pagelist_addpage(pl);
-		if (ret)
-			return ret;
-	}
-
-	memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), buf, len);
-	pl->length += len;
-	pl->room -= len;
-	return 0;
-}
diff --git a/fs/ceph/pagelist.h b/fs/ceph/pagelist.h
deleted file mode 100644
index e8a4187e108..00000000000
--- a/fs/ceph/pagelist.h
+++ /dev/null
@@ -1,54 +0,0 @@
-#ifndef __FS_CEPH_PAGELIST_H
-#define __FS_CEPH_PAGELIST_H
-
-#include <linux/list.h>
-
-struct ceph_pagelist {
-	struct list_head head;
-	void *mapped_tail;
-	size_t length;
-	size_t room;
-};
-
-static inline void ceph_pagelist_init(struct ceph_pagelist *pl)
-{
-	INIT_LIST_HEAD(&pl->head);
-	pl->mapped_tail = NULL;
-	pl->length = 0;
-	pl->room = 0;
-}
-extern int ceph_pagelist_release(struct ceph_pagelist *pl);
-
-extern int ceph_pagelist_append(struct ceph_pagelist *pl, void *d, size_t l);
-
-static inline int ceph_pagelist_encode_64(struct ceph_pagelist *pl, u64 v)
-{
-	__le64 ev = cpu_to_le64(v);
-	return ceph_pagelist_append(pl, &ev, sizeof(ev));
-}
-static inline int ceph_pagelist_encode_32(struct ceph_pagelist *pl, u32 v)
-{
-	__le32 ev = cpu_to_le32(v);
-	return ceph_pagelist_append(pl, &ev, sizeof(ev));
-}
-static inline int ceph_pagelist_encode_16(struct ceph_pagelist *pl, u16 v)
-{
-	__le16 ev = cpu_to_le16(v);
-	return ceph_pagelist_append(pl, &ev, sizeof(ev));
-}
-static inline int ceph_pagelist_encode_8(struct ceph_pagelist *pl, u8 v)
-{
-	return ceph_pagelist_append(pl, &v, 1);
-}
-static inline int ceph_pagelist_encode_string(struct ceph_pagelist *pl,
-					      char *s, size_t len)
-{
-	int ret = ceph_pagelist_encode_32(pl, len);
-	if (ret)
-		return ret;
-	if (len)
-		return ceph_pagelist_append(pl, s, len);
-	return 0;
-}
-
-#endif
diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h
deleted file mode 100644
index 8fcc023056c..00000000000
--- a/fs/ceph/rados.h
+++ /dev/null
@@ -1,396 +0,0 @@
-#ifndef __RADOS_H
-#define __RADOS_H
-
-/*
- * Data types for the Ceph distributed object storage layer RADOS
- * (Reliable Autonomic Distributed Object Store).
- */
-
-#include "msgr.h"
-
-/*
- * osdmap encoding versions
- */
-#define CEPH_OSDMAP_INC_VERSION     5
-#define CEPH_OSDMAP_INC_VERSION_EXT 5
-#define CEPH_OSDMAP_VERSION         5
-#define CEPH_OSDMAP_VERSION_EXT     5
-
-/*
- * fs id
- */
-struct ceph_fsid {
-	unsigned char fsid[16];
-};
-
-static inline int ceph_fsid_compare(const struct ceph_fsid *a,
-				    const struct ceph_fsid *b)
-{
-	return memcmp(a, b, sizeof(*a));
-}
-
-/*
- * ino, object, etc.
- */
-typedef __le64 ceph_snapid_t;
-#define CEPH_SNAPDIR ((__u64)(-1))  /* reserved for hidden .snap dir */
-#define CEPH_NOSNAP  ((__u64)(-2))  /* "head", "live" revision */
-#define CEPH_MAXSNAP ((__u64)(-3))  /* largest valid snapid */
-
-struct ceph_timespec {
-	__le32 tv_sec;
-	__le32 tv_nsec;
-} __attribute__ ((packed));
-
-
-/*
- * object layout - how objects are mapped into PGs
- */
-#define CEPH_OBJECT_LAYOUT_HASH     1
-#define CEPH_OBJECT_LAYOUT_LINEAR   2
-#define CEPH_OBJECT_LAYOUT_HASHINO  3
-
-/*
- * pg layout -- how PGs are mapped onto (sets of) OSDs
- */
-#define CEPH_PG_LAYOUT_CRUSH  0
-#define CEPH_PG_LAYOUT_HASH   1
-#define CEPH_PG_LAYOUT_LINEAR 2
-#define CEPH_PG_LAYOUT_HYBRID 3
-
-#define CEPH_PG_MAX_SIZE      16  /* max # osds in a single pg */
-
-/*
- * placement group.
- * we encode this into one __le64.
- */
-struct ceph_pg {
-	__le16 preferred; /* preferred primary osd */
-	__le16 ps;        /* placement seed */
-	__le32 pool;      /* object pool */
-} __attribute__ ((packed));
-
-/*
- * pg_pool is a set of pgs storing a pool of objects
- *
- *  pg_num -- base number of pseudorandomly placed pgs
- *
- *  pgp_num -- effective number when calculating pg placement.  this
- * is used for pg_num increases.  new pgs result in data being "split"
- * into new pgs.  for this to proceed smoothly, new pgs are intiially
- * colocated with their parents; that is, pgp_num doesn't increase
- * until the new pgs have successfully split.  only _then_ are the new
- * pgs placed independently.
- *
- *  lpg_num -- localized pg count (per device).  replicas are randomly
- * selected.
- *
- *  lpgp_num -- as above.
- */
-#define CEPH_PG_TYPE_REP     1
-#define CEPH_PG_TYPE_RAID4   2
-#define CEPH_PG_POOL_VERSION 2
-struct ceph_pg_pool {
-	__u8 type;                /* CEPH_PG_TYPE_* */
-	__u8 size;                /* number of osds in each pg */
-	__u8 crush_ruleset;       /* crush placement rule */
-	__u8 object_hash;         /* hash mapping object name to ps */
-	__le32 pg_num, pgp_num;   /* number of pg's */
-	__le32 lpg_num, lpgp_num; /* number of localized pg's */
-	__le32 last_change;       /* most recent epoch changed */
-	__le64 snap_seq;          /* seq for per-pool snapshot */
-	__le32 snap_epoch;        /* epoch of last snap */
-	__le32 num_snaps;
-	__le32 num_removed_snap_intervals; /* if non-empty, NO per-pool snaps */
-	__le64 auid;               /* who owns the pg */
-} __attribute__ ((packed));
-
-/*
- * stable_mod func is used to control number of placement groups.
- * similar to straight-up modulo, but produces a stable mapping as b
- * increases over time.  b is the number of bins, and bmask is the
- * containing power of 2 minus 1.
- *
- * b <= bmask and bmask=(2**n)-1
- * e.g., b=12 -> bmask=15, b=123 -> bmask=127
- */
-static inline int ceph_stable_mod(int x, int b, int bmask)
-{
-	if ((x & bmask) < b)
-		return x & bmask;
-	else
-		return x & (bmask >> 1);
-}
-
-/*
- * object layout - how a given object should be stored.
- */
-struct ceph_object_layout {
-	struct ceph_pg ol_pgid;   /* raw pg, with _full_ ps precision. */
-	__le32 ol_stripe_unit;    /* for per-object parity, if any */
-} __attribute__ ((packed));
-
-/*
- * compound epoch+version, used by storage layer to serialize mutations
- */
-struct ceph_eversion {
-	__le32 epoch;
-	__le64 version;
-} __attribute__ ((packed));
-
-/*
- * osd map bits
- */
-
-/* status bits */
-#define CEPH_OSD_EXISTS 1
-#define CEPH_OSD_UP     2
-
-/* osd weights.  fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */
-#define CEPH_OSD_IN  0x10000
-#define CEPH_OSD_OUT 0
-
-
-/*
- * osd map flag bits
- */
-#define CEPH_OSDMAP_NEARFULL (1<<0)  /* sync writes (near ENOSPC) */
-#define CEPH_OSDMAP_FULL     (1<<1)  /* no data writes (ENOSPC) */
-#define CEPH_OSDMAP_PAUSERD  (1<<2)  /* pause all reads */
-#define CEPH_OSDMAP_PAUSEWR  (1<<3)  /* pause all writes */
-#define CEPH_OSDMAP_PAUSEREC (1<<4)  /* pause recovery */
-
-/*
- * osd ops
- */
-#define CEPH_OSD_OP_MODE       0xf000
-#define CEPH_OSD_OP_MODE_RD    0x1000
-#define CEPH_OSD_OP_MODE_WR    0x2000
-#define CEPH_OSD_OP_MODE_RMW   0x3000
-#define CEPH_OSD_OP_MODE_SUB   0x4000
-
-#define CEPH_OSD_OP_TYPE       0x0f00
-#define CEPH_OSD_OP_TYPE_LOCK  0x0100
-#define CEPH_OSD_OP_TYPE_DATA  0x0200
-#define CEPH_OSD_OP_TYPE_ATTR  0x0300
-#define CEPH_OSD_OP_TYPE_EXEC  0x0400
-#define CEPH_OSD_OP_TYPE_PG    0x0500
-
-enum {
-	/** data **/
-	/* read */
-	CEPH_OSD_OP_READ      = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 1,
-	CEPH_OSD_OP_STAT      = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 2,
-
-	/* fancy read */
-	CEPH_OSD_OP_MASKTRUNC = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 4,
-
-	/* write */
-	CEPH_OSD_OP_WRITE     = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 1,
-	CEPH_OSD_OP_WRITEFULL = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 2,
-	CEPH_OSD_OP_TRUNCATE  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 3,
-	CEPH_OSD_OP_ZERO      = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 4,
-	CEPH_OSD_OP_DELETE    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 5,
-
-	/* fancy write */
-	CEPH_OSD_OP_APPEND    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 6,
-	CEPH_OSD_OP_STARTSYNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 7,
-	CEPH_OSD_OP_SETTRUNC  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 8,
-	CEPH_OSD_OP_TRIMTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 9,
-
-	CEPH_OSD_OP_TMAPUP  = CEPH_OSD_OP_MODE_RMW | CEPH_OSD_OP_TYPE_DATA | 10,
-	CEPH_OSD_OP_TMAPPUT = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 11,
-	CEPH_OSD_OP_TMAPGET = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 12,
-
-	CEPH_OSD_OP_CREATE  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 13,
-
-	/** attrs **/
-	/* read */
-	CEPH_OSD_OP_GETXATTR  = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1,
-	CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2,
-	CEPH_OSD_OP_CMPXATTR  = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 3,
-
-	/* write */
-	CEPH_OSD_OP_SETXATTR  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1,
-	CEPH_OSD_OP_SETXATTRS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 2,
-	CEPH_OSD_OP_RESETXATTRS = CEPH_OSD_OP_MODE_WR|CEPH_OSD_OP_TYPE_ATTR | 3,
-	CEPH_OSD_OP_RMXATTR   = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 4,
-
-	/** subop **/
-	CEPH_OSD_OP_PULL           = CEPH_OSD_OP_MODE_SUB | 1,
-	CEPH_OSD_OP_PUSH           = CEPH_OSD_OP_MODE_SUB | 2,
-	CEPH_OSD_OP_BALANCEREADS   = CEPH_OSD_OP_MODE_SUB | 3,
-	CEPH_OSD_OP_UNBALANCEREADS = CEPH_OSD_OP_MODE_SUB | 4,
-	CEPH_OSD_OP_SCRUB          = CEPH_OSD_OP_MODE_SUB | 5,
-
-	/** lock **/
-	CEPH_OSD_OP_WRLOCK    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1,
-	CEPH_OSD_OP_WRUNLOCK  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 2,
-	CEPH_OSD_OP_RDLOCK    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 3,
-	CEPH_OSD_OP_RDUNLOCK  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 4,
-	CEPH_OSD_OP_UPLOCK    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 5,
-	CEPH_OSD_OP_DNLOCK    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 6,
-
-	/** exec **/
-	CEPH_OSD_OP_CALL    = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_EXEC | 1,
-
-	/** pg **/
-	CEPH_OSD_OP_PGLS      = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_PG | 1,
-};
-
-static inline int ceph_osd_op_type_lock(int op)
-{
-	return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_LOCK;
-}
-static inline int ceph_osd_op_type_data(int op)
-{
-	return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_DATA;
-}
-static inline int ceph_osd_op_type_attr(int op)
-{
-	return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_ATTR;
-}
-static inline int ceph_osd_op_type_exec(int op)
-{
-	return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_EXEC;
-}
-static inline int ceph_osd_op_type_pg(int op)
-{
-	return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_PG;
-}
-
-static inline int ceph_osd_op_mode_subop(int op)
-{
-	return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_SUB;
-}
-static inline int ceph_osd_op_mode_read(int op)
-{
-	return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_RD;
-}
-static inline int ceph_osd_op_mode_modify(int op)
-{
-	return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_WR;
-}
-
-#define CEPH_OSD_TMAP_HDR 'h'
-#define CEPH_OSD_TMAP_SET 's'
-#define CEPH_OSD_TMAP_RM  'r'
-
-extern const char *ceph_osd_op_name(int op);
-
-
-/*
- * osd op flags
- *
- * An op may be READ, WRITE, or READ|WRITE.
- */
-enum {
-	CEPH_OSD_FLAG_ACK = 1,          /* want (or is) "ack" ack */
-	CEPH_OSD_FLAG_ONNVRAM = 2,      /* want (or is) "onnvram" ack */
-	CEPH_OSD_FLAG_ONDISK = 4,       /* want (or is) "ondisk" ack */
-	CEPH_OSD_FLAG_RETRY = 8,        /* resend attempt */
-	CEPH_OSD_FLAG_READ = 16,        /* op may read */
-	CEPH_OSD_FLAG_WRITE = 32,       /* op may write */
-	CEPH_OSD_FLAG_ORDERSNAP = 64,   /* EOLDSNAP if snapc is out of order */
-	CEPH_OSD_FLAG_PEERSTAT = 128,   /* msg includes osd_peer_stat */
-	CEPH_OSD_FLAG_BALANCE_READS = 256,
-	CEPH_OSD_FLAG_PARALLELEXEC = 512, /* execute op in parallel */
-	CEPH_OSD_FLAG_PGOP = 1024,      /* pg op, no object */
-	CEPH_OSD_FLAG_EXEC = 2048,      /* op may exec */
-};
-
-enum {
-	CEPH_OSD_OP_FLAG_EXCL = 1,      /* EXCL object create */
-};
-
-#define EOLDSNAPC    ERESTART  /* ORDERSNAP flag set; writer has old snapc*/
-#define EBLACKLISTED ESHUTDOWN /* blacklisted */
-
-/* xattr comparison */
-enum {
-	CEPH_OSD_CMPXATTR_OP_NOP = 0,
-	CEPH_OSD_CMPXATTR_OP_EQ  = 1,
-	CEPH_OSD_CMPXATTR_OP_NE  = 2,
-	CEPH_OSD_CMPXATTR_OP_GT  = 3,
-	CEPH_OSD_CMPXATTR_OP_GTE = 4,
-	CEPH_OSD_CMPXATTR_OP_LT  = 5,
-	CEPH_OSD_CMPXATTR_OP_LTE = 6
-};
-
-enum {
-	CEPH_OSD_CMPXATTR_MODE_STRING = 1,
-	CEPH_OSD_CMPXATTR_MODE_U64    = 2
-};
-
-/*
- * an individual object operation.  each may be accompanied by some data
- * payload
- */
-struct ceph_osd_op {
-	__le16 op;           /* CEPH_OSD_OP_* */
-	__le32 flags;        /* CEPH_OSD_FLAG_* */
-	union {
-		struct {
-			__le64 offset, length;
-			__le64 truncate_size;
-			__le32 truncate_seq;
-		} __attribute__ ((packed)) extent;
-		struct {
-			__le32 name_len;
-			__le32 value_len;
-			__u8 cmp_op;       /* CEPH_OSD_CMPXATTR_OP_* */
-			__u8 cmp_mode;     /* CEPH_OSD_CMPXATTR_MODE_* */
-		} __attribute__ ((packed)) xattr;
-		struct {
-			__u8 class_len;
-			__u8 method_len;
-			__u8 argc;
-			__le32 indata_len;
-		} __attribute__ ((packed)) cls;
-		struct {
-			__le64 cookie, count;
-		} __attribute__ ((packed)) pgls;
-	};
-	__le32 payload_len;
-} __attribute__ ((packed));
-
-/*
- * osd request message header.  each request may include multiple
- * ceph_osd_op object operations.
- */
-struct ceph_osd_request_head {
-	__le32 client_inc;                 /* client incarnation */
-	struct ceph_object_layout layout;  /* pgid */
-	__le32 osdmap_epoch;               /* client's osdmap epoch */
-
-	__le32 flags;
-
-	struct ceph_timespec mtime;        /* for mutations only */
-	struct ceph_eversion reassert_version; /* if we are replaying op */
-
-	__le32 object_len;     /* length of object name */
-
-	__le64 snapid;         /* snapid to read */
-	__le64 snap_seq;       /* writer's snap context */
-	__le32 num_snaps;
-
-	__le16 num_ops;
-	struct ceph_osd_op ops[];  /* followed by ops[], obj, ticket, snaps */
-} __attribute__ ((packed));
-
-struct ceph_osd_reply_head {
-	__le32 client_inc;                /* client incarnation */
-	__le32 flags;
-	struct ceph_object_layout layout;
-	__le32 osdmap_epoch;
-	struct ceph_eversion reassert_version; /* for replaying uncommitted */
-
-	__le32 result;                    /* result code */
-
-	__le32 object_len;                /* length of object name */
-	__le32 num_ops;
-	struct ceph_osd_op ops[0];  /* ops[], object */
-} __attribute__ ((packed));
-
-
-#endif
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index c0b26b6badb..39c243acd06 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -1,10 +1,12 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 
 #include <linux/sort.h>
 #include <linux/slab.h>
 
 #include "super.h"
-#include "decode.h"
+#include "mds_client.h"
+
+#include <linux/ceph/decode.h>
 
 /*
  * Snapshots in ceph are driven in large part by cooperation from the
@@ -119,6 +121,7 @@ static struct ceph_snap_realm *ceph_create_snap_realm(
 	INIT_LIST_HEAD(&realm->children);
 	INIT_LIST_HEAD(&realm->child_item);
 	INIT_LIST_HEAD(&realm->empty_item);
+	INIT_LIST_HEAD(&realm->dirty_item);
 	INIT_LIST_HEAD(&realm->inodes_with_caps);
 	spin_lock_init(&realm->inodes_with_caps_lock);
 	__insert_snap_realm(&mdsc->snap_realms, realm);
@@ -435,7 +438,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
 {
 	struct inode *inode = &ci->vfs_inode;
 	struct ceph_cap_snap *capsnap;
-	int used;
+	int used, dirty;
 
 	capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS);
 	if (!capsnap) {
@@ -445,6 +448,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
 
 	spin_lock(&inode->i_lock);
 	used = __ceph_caps_used(ci);
+	dirty = __ceph_caps_dirty(ci);
 	if (__ceph_have_pending_cap_snap(ci)) {
 		/* there is no point in queuing multiple "pending" cap_snaps,
 		   as no new writes are allowed to start when pending, so any
@@ -452,27 +456,37 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
 		   cap_snap.  lucky us. */
 		dout("queue_cap_snap %p already pending\n", inode);
 		kfree(capsnap);
-	} else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR)) {
+	} else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR) ||
+		   (dirty & (CEPH_CAP_AUTH_EXCL|CEPH_CAP_XATTR_EXCL|
+			     CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR))) {
 		struct ceph_snap_context *snapc = ci->i_head_snapc;
 
+		dout("queue_cap_snap %p cap_snap %p queuing under %p\n", inode,
+		     capsnap, snapc);
 		igrab(inode);
-
+		
 		atomic_set(&capsnap->nref, 1);
 		capsnap->ci = ci;
 		INIT_LIST_HEAD(&capsnap->ci_item);
 		INIT_LIST_HEAD(&capsnap->flushing_item);
 
-		capsnap->follows = snapc->seq - 1;
+		capsnap->follows = snapc->seq;
 		capsnap->issued = __ceph_caps_issued(ci, NULL);
-		capsnap->dirty = __ceph_caps_dirty(ci);
+		capsnap->dirty = dirty;
 
 		capsnap->mode = inode->i_mode;
 		capsnap->uid = inode->i_uid;
 		capsnap->gid = inode->i_gid;
 
-		/* fixme? */
-		capsnap->xattr_blob = NULL;
-		capsnap->xattr_len = 0;
+		if (dirty & CEPH_CAP_XATTR_EXCL) {
+			__ceph_build_xattrs_blob(ci);
+			capsnap->xattr_blob =
+				ceph_buffer_get(ci->i_xattrs.blob);
+			capsnap->xattr_version = ci->i_xattrs.version;
+		} else {
+			capsnap->xattr_blob = NULL;
+			capsnap->xattr_version = 0;
+		}
 
 		/* dirty page count moved from _head to this cap_snap;
 		   all subsequent writes page dirties occur _after_ this
@@ -480,7 +494,9 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
 		capsnap->dirty_pages = ci->i_wrbuffer_ref_head;
 		ci->i_wrbuffer_ref_head = 0;
 		capsnap->context = snapc;
-		ci->i_head_snapc = NULL;
+		ci->i_head_snapc =
+			ceph_get_snap_context(ci->i_snap_realm->cached_context);
+		dout(" new snapc is %p\n", ci->i_head_snapc);
 		list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
 
 		if (used & CEPH_CAP_FILE_WR) {
@@ -512,7 +528,7 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
 			    struct ceph_cap_snap *capsnap)
 {
 	struct inode *inode = &ci->vfs_inode;
-	struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
+	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
 
 	BUG_ON(capsnap->writing);
 	capsnap->size = inode->i_size;
@@ -539,6 +555,41 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
 	return 1;  /* caller may want to ceph_flush_snaps */
 }
 
+/*
+ * Queue cap_snaps for snap writeback for this realm and its children.
+ * Called under snap_rwsem, so realm topology won't change.
+ */
+static void queue_realm_cap_snaps(struct ceph_snap_realm *realm)
+{
+	struct ceph_inode_info *ci;
+	struct inode *lastinode = NULL;
+	struct ceph_snap_realm *child;
+
+	dout("queue_realm_cap_snaps %p %llx inodes\n", realm, realm->ino);
+
+	spin_lock(&realm->inodes_with_caps_lock);
+	list_for_each_entry(ci, &realm->inodes_with_caps,
+			    i_snap_realm_item) {
+		struct inode *inode = igrab(&ci->vfs_inode);
+		if (!inode)
+			continue;
+		spin_unlock(&realm->inodes_with_caps_lock);
+		if (lastinode)
+			iput(lastinode);
+		lastinode = inode;
+		ceph_queue_cap_snap(ci);
+		spin_lock(&realm->inodes_with_caps_lock);
+	}
+	spin_unlock(&realm->inodes_with_caps_lock);
+	if (lastinode)
+		iput(lastinode);
+
+	dout("queue_realm_cap_snaps %p %llx children\n", realm, realm->ino);
+	list_for_each_entry(child, &realm->children, child_item)
+		queue_realm_cap_snaps(child);
+
+	dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino);
+}
 
 /*
  * Parse and apply a snapblob "snap trace" from the MDS.  This specifies
@@ -556,6 +607,7 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
 	struct ceph_snap_realm *realm;
 	int invalidate = 0;
 	int err = -ENOMEM;
+	LIST_HEAD(dirty_realms);
 
 	dout("update_snap_trace deletion=%d\n", deletion);
 more:
@@ -578,45 +630,6 @@ more:
 		}
 	}
 
-	if (le64_to_cpu(ri->seq) > realm->seq) {
-		dout("update_snap_trace updating %llx %p %lld -> %lld\n",
-		     realm->ino, realm, realm->seq, le64_to_cpu(ri->seq));
-		/*
-		 * if the realm seq has changed, queue a cap_snap for every
-		 * inode with open caps.  we do this _before_ we update
-		 * the realm info so that we prepare for writeback under the
-		 * _previous_ snap context.
-		 *
-		 * ...unless it's a snap deletion!
-		 */
-		if (!deletion) {
-			struct ceph_inode_info *ci;
-			struct inode *lastinode = NULL;
-
-			spin_lock(&realm->inodes_with_caps_lock);
-			list_for_each_entry(ci, &realm->inodes_with_caps,
-					    i_snap_realm_item) {
-				struct inode *inode = igrab(&ci->vfs_inode);
-				if (!inode)
-					continue;
-				spin_unlock(&realm->inodes_with_caps_lock);
-				if (lastinode)
-					iput(lastinode);
-				lastinode = inode;
-				ceph_queue_cap_snap(ci);
-				spin_lock(&realm->inodes_with_caps_lock);
-			}
-			spin_unlock(&realm->inodes_with_caps_lock);
-			if (lastinode)
-				iput(lastinode);
-			dout("update_snap_trace cap_snaps queued\n");
-		}
-
-	} else {
-		dout("update_snap_trace %llx %p seq %lld unchanged\n",
-		     realm->ino, realm, realm->seq);
-	}
-
 	/* ensure the parent is correct */
 	err = adjust_snap_realm_parent(mdsc, realm, le64_to_cpu(ri->parent));
 	if (err < 0)
@@ -624,6 +637,8 @@ more:
 	invalidate += err;
 
 	if (le64_to_cpu(ri->seq) > realm->seq) {
+		dout("update_snap_trace updating %llx %p %lld -> %lld\n",
+		     realm->ino, realm, realm->seq, le64_to_cpu(ri->seq));
 		/* update realm parameters, snap lists */
 		realm->seq = le64_to_cpu(ri->seq);
 		realm->created = le64_to_cpu(ri->created);
@@ -641,9 +656,17 @@ more:
 		if (err < 0)
 			goto fail;
 
+		/* queue realm for cap_snap creation */
+		list_add(&realm->dirty_item, &dirty_realms);
+
 		invalidate = 1;
 	} else if (!realm->cached_context) {
+		dout("update_snap_trace %llx %p seq %lld new\n",
+		     realm->ino, realm, realm->seq);
 		invalidate = 1;
+	} else {
+		dout("update_snap_trace %llx %p seq %lld unchanged\n",
+		     realm->ino, realm, realm->seq);
 	}
 
 	dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino,
@@ -656,6 +679,14 @@ more:
 	if (invalidate)
 		rebuild_snap_realms(realm);
 
+	/*
+	 * queue cap snaps _after_ we've built the new snap contexts,
+	 * so that i_head_snapc can be set appropriately.
+	 */
+	list_for_each_entry(realm, &dirty_realms, dirty_item) {
+		queue_realm_cap_snaps(realm);
+	}
+
 	__cleanup_empty_realms(mdsc);
 	return 0;
 
@@ -688,7 +719,7 @@ static void flush_snaps(struct ceph_mds_client *mdsc)
 		igrab(inode);
 		spin_unlock(&mdsc->snap_flush_lock);
 		spin_lock(&inode->i_lock);
-		__ceph_flush_snaps(ci, &session);
+		__ceph_flush_snaps(ci, &session, 0);
 		spin_unlock(&inode->i_lock);
 		iput(inode);
 		spin_lock(&mdsc->snap_flush_lock);
@@ -718,7 +749,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
 		      struct ceph_mds_session *session,
 		      struct ceph_msg *msg)
 {
-	struct super_block *sb = mdsc->client->sb;
+	struct super_block *sb = mdsc->fsc->sb;
 	int mds = session->s_mds;
 	u64 split;
 	int op;
@@ -789,6 +820,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
 			};
 			struct inode *inode = ceph_find_inode(sb, vino);
 			struct ceph_inode_info *ci;
+			struct ceph_snap_realm *oldrealm;
 
 			if (!inode)
 				continue;
@@ -814,18 +846,19 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
 			dout(" will move %p to split realm %llx %p\n",
 			     inode, realm->ino, realm);
 			/*
-			 * Remove the inode from the realm's inode
-			 * list, but don't add it to the new realm
-			 * yet.  We don't want the cap_snap to be
-			 * queued (again) by ceph_update_snap_trace()
-			 * below.  Queue it _now_, under the old context.
+			 * Move the inode to the new realm
 			 */
 			spin_lock(&realm->inodes_with_caps_lock);
 			list_del_init(&ci->i_snap_realm_item);
+			list_add(&ci->i_snap_realm_item,
+				 &realm->inodes_with_caps);
+			oldrealm = ci->i_snap_realm;
+			ci->i_snap_realm = realm;
 			spin_unlock(&realm->inodes_with_caps_lock);
 			spin_unlock(&inode->i_lock);
 
-			ceph_queue_cap_snap(ci);
+			ceph_get_snap_realm(mdsc, realm);
+			ceph_put_snap_realm(mdsc, oldrealm);
 
 			iput(inode);
 			continue;
@@ -853,43 +886,9 @@ skip_inode:
 	ceph_update_snap_trace(mdsc, p, e,
 			       op == CEPH_SNAP_OP_DESTROY);
 
-	if (op == CEPH_SNAP_OP_SPLIT) {
-		/*
-		 * ok, _now_ add the inodes into the new realm.
-		 */
-		for (i = 0; i < num_split_inos; i++) {
-			struct ceph_vino vino = {
-				.ino = le64_to_cpu(split_inos[i]),
-				.snap = CEPH_NOSNAP,
-			};
-			struct inode *inode = ceph_find_inode(sb, vino);
-			struct ceph_inode_info *ci;
-
-			if (!inode)
-				continue;
-			ci = ceph_inode(inode);
-			spin_lock(&inode->i_lock);
-			if (list_empty(&ci->i_snap_realm_item)) {
-				struct ceph_snap_realm *oldrealm =
-					ci->i_snap_realm;
-
-				dout(" moving %p to split realm %llx %p\n",
-				     inode, realm->ino, realm);
-				spin_lock(&realm->inodes_with_caps_lock);
-				list_add(&ci->i_snap_realm_item,
-					 &realm->inodes_with_caps);
-				ci->i_snap_realm = realm;
-				spin_unlock(&realm->inodes_with_caps_lock);
-				ceph_get_snap_realm(mdsc, realm);
-				ceph_put_snap_realm(mdsc, oldrealm);
-			}
-			spin_unlock(&inode->i_lock);
-			iput(inode);
-		}
-
+	if (op == CEPH_SNAP_OP_SPLIT)
 		/* we took a reference when we created the realm, above */
 		ceph_put_snap_realm(mdsc, realm);
-	}
 
 	__cleanup_empty_realms(mdsc);
 
diff --git a/fs/ceph/ceph_strings.c b/fs/ceph/strings.c
index 7503aee828c..cd5097d7c80 100644
--- a/fs/ceph/ceph_strings.c
+++ b/fs/ceph/strings.c
@@ -1,70 +1,9 @@
 /*
- * Ceph string constants
+ * Ceph fs string constants
  */
-#include "types.h"
+#include <linux/module.h>
+#include <linux/ceph/types.h>
 
-const char *ceph_entity_type_name(int type)
-{
-	switch (type) {
-	case CEPH_ENTITY_TYPE_MDS: return "mds";
-	case CEPH_ENTITY_TYPE_OSD: return "osd";
-	case CEPH_ENTITY_TYPE_MON: return "mon";
-	case CEPH_ENTITY_TYPE_CLIENT: return "client";
-	case CEPH_ENTITY_TYPE_AUTH: return "auth";
-	default: return "unknown";
-	}
-}
-
-const char *ceph_osd_op_name(int op)
-{
-	switch (op) {
-	case CEPH_OSD_OP_READ: return "read";
-	case CEPH_OSD_OP_STAT: return "stat";
-
-	case CEPH_OSD_OP_MASKTRUNC: return "masktrunc";
-
-	case CEPH_OSD_OP_WRITE: return "write";
-	case CEPH_OSD_OP_DELETE: return "delete";
-	case CEPH_OSD_OP_TRUNCATE: return "truncate";
-	case CEPH_OSD_OP_ZERO: return "zero";
-	case CEPH_OSD_OP_WRITEFULL: return "writefull";
-
-	case CEPH_OSD_OP_APPEND: return "append";
-	case CEPH_OSD_OP_STARTSYNC: return "startsync";
-	case CEPH_OSD_OP_SETTRUNC: return "settrunc";
-	case CEPH_OSD_OP_TRIMTRUNC: return "trimtrunc";
-
-	case CEPH_OSD_OP_TMAPUP: return "tmapup";
-	case CEPH_OSD_OP_TMAPGET: return "tmapget";
-	case CEPH_OSD_OP_TMAPPUT: return "tmapput";
-
-	case CEPH_OSD_OP_GETXATTR: return "getxattr";
-	case CEPH_OSD_OP_GETXATTRS: return "getxattrs";
-	case CEPH_OSD_OP_SETXATTR: return "setxattr";
-	case CEPH_OSD_OP_SETXATTRS: return "setxattrs";
-	case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs";
-	case CEPH_OSD_OP_RMXATTR: return "rmxattr";
-	case CEPH_OSD_OP_CMPXATTR: return "cmpxattr";
-
-	case CEPH_OSD_OP_PULL: return "pull";
-	case CEPH_OSD_OP_PUSH: return "push";
-	case CEPH_OSD_OP_BALANCEREADS: return "balance-reads";
-	case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads";
-	case CEPH_OSD_OP_SCRUB: return "scrub";
-
-	case CEPH_OSD_OP_WRLOCK: return "wrlock";
-	case CEPH_OSD_OP_WRUNLOCK: return "wrunlock";
-	case CEPH_OSD_OP_RDLOCK: return "rdlock";
-	case CEPH_OSD_OP_RDUNLOCK: return "rdunlock";
-	case CEPH_OSD_OP_UPLOCK: return "uplock";
-	case CEPH_OSD_OP_DNLOCK: return "dnlock";
-
-	case CEPH_OSD_OP_CALL: return "call";
-
-	case CEPH_OSD_OP_PGLS: return "pgls";
-	}
-	return "???";
-}
 
 const char *ceph_mds_state_name(int s)
 {
@@ -129,6 +68,8 @@ const char *ceph_mds_op_name(int op)
 	case CEPH_MDS_OP_LSSNAP: return "lssnap";
 	case CEPH_MDS_OP_MKSNAP: return "mksnap";
 	case CEPH_MDS_OP_RMSNAP: return "rmsnap";
+	case CEPH_MDS_OP_SETFILELOCK: return "setfilelock";
+	case CEPH_MDS_OP_GETFILELOCK: return "getfilelock";
 	}
 	return "???";
 }
@@ -174,17 +115,3 @@ const char *ceph_snap_op_name(int o)
 	}
 	return "???";
 }
-
-const char *ceph_pool_op_name(int op)
-{
-	switch (op) {
-	case POOL_OP_CREATE: return "create";
-	case POOL_OP_DELETE: return "delete";
-	case POOL_OP_AUID_CHANGE: return "auid change";
-	case POOL_OP_CREATE_SNAP: return "create snap";
-	case POOL_OP_DELETE_SNAP: return "delete snap";
-	case POOL_OP_CREATE_UNMANAGED_SNAP: return "create unmanaged snap";
-	case POOL_OP_DELETE_UNMANAGED_SNAP: return "delete unmanaged snap";
-	}
-	return "???";
-}
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index fa87f51e38e..08b460ae053 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -1,7 +1,8 @@
 
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 
 #include <linux/backing-dev.h>
+#include <linux/ctype.h>
 #include <linux/fs.h>
 #include <linux/inet.h>
 #include <linux/in6.h>
@@ -14,10 +15,13 @@
 #include <linux/statfs.h>
 #include <linux/string.h>
 
-#include "decode.h"
 #include "super.h"
-#include "mon_client.h"
-#include "auth.h"
+#include "mds_client.h"
+
+#include <linux/ceph/decode.h>
+#include <linux/ceph/mon_client.h>
+#include <linux/ceph/auth.h>
+#include <linux/ceph/debugfs.h>
 
 /*
  * Ceph superblock operations
@@ -25,36 +29,22 @@
  * Handle the basics of mounting, unmounting.
  */
 
-
-/*
- * find filename portion of a path (/foo/bar/baz -> baz)
- */
-const char *ceph_file_part(const char *s, int len)
-{
-	const char *e = s + len;
-
-	while (e != s && *(e-1) != '/')
-		e--;
-	return e;
-}
-
-
 /*
  * super ops
  */
 static void ceph_put_super(struct super_block *s)
 {
-	struct ceph_client *client = ceph_sb_to_client(s);
+	struct ceph_fs_client *fsc = ceph_sb_to_client(s);
 
 	dout("put_super\n");
-	ceph_mdsc_close_sessions(&client->mdsc);
+	ceph_mdsc_close_sessions(fsc->mdsc);
 
 	/*
 	 * ensure we release the bdi before put_anon_super releases
 	 * the device name.
 	 */
-	if (s->s_bdi == &client->backing_dev_info) {
-		bdi_unregister(&client->backing_dev_info);
+	if (s->s_bdi == &fsc->backing_dev_info) {
+		bdi_unregister(&fsc->backing_dev_info);
 		s->s_bdi = NULL;
 	}
 
@@ -63,14 +53,14 @@ static void ceph_put_super(struct super_block *s)
 
 static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-	struct ceph_client *client = ceph_inode_to_client(dentry->d_inode);
-	struct ceph_monmap *monmap = client->monc.monmap;
+	struct ceph_fs_client *fsc = ceph_inode_to_client(dentry->d_inode);
+	struct ceph_monmap *monmap = fsc->client->monc.monmap;
 	struct ceph_statfs st;
 	u64 fsid;
 	int err;
 
 	dout("statfs\n");
-	err = ceph_monc_do_statfs(&client->monc, &st);
+	err = ceph_monc_do_statfs(&fsc->client->monc, &st);
 	if (err < 0)
 		return err;
 
@@ -101,236 +91,30 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
 }
 
 
-static int ceph_syncfs(struct super_block *sb, int wait)
-{
-	dout("sync_fs %d\n", wait);
-	ceph_osdc_sync(&ceph_sb_to_client(sb)->osdc);
-	ceph_mdsc_sync(&ceph_sb_to_client(sb)->mdsc);
-	dout("sync_fs %d done\n", wait);
-	return 0;
-}
-
-static int default_congestion_kb(void)
-{
-	int congestion_kb;
-
-	/*
-	 * Copied from NFS
-	 *
-	 * congestion size, scale with available memory.
-	 *
-	 *  64MB:    8192k
-	 * 128MB:   11585k
-	 * 256MB:   16384k
-	 * 512MB:   23170k
-	 *   1GB:   32768k
-	 *   2GB:   46340k
-	 *   4GB:   65536k
-	 *   8GB:   92681k
-	 *  16GB:  131072k
-	 *
-	 * This allows larger machines to have larger/more transfers.
-	 * Limit the default to 256M
-	 */
-	congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
-	if (congestion_kb > 256*1024)
-		congestion_kb = 256*1024;
-
-	return congestion_kb;
-}
-
-/**
- * ceph_show_options - Show mount options in /proc/mounts
- * @m: seq_file to write to
- * @mnt: mount descriptor
- */
-static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
+static int ceph_sync_fs(struct super_block *sb, int wait)
 {
-	struct ceph_client *client = ceph_sb_to_client(mnt->mnt_sb);
-	struct ceph_mount_args *args = client->mount_args;
-
-	if (args->flags & CEPH_OPT_FSID)
-		seq_printf(m, ",fsidmajor=%llu,fsidminor%llu",
-			   le64_to_cpu(*(__le64 *)&args->fsid.fsid[0]),
-			   le64_to_cpu(*(__le64 *)&args->fsid.fsid[8]));
-	if (args->flags & CEPH_OPT_NOSHARE)
-		seq_puts(m, ",noshare");
-	if (args->flags & CEPH_OPT_DIRSTAT)
-		seq_puts(m, ",dirstat");
-	if ((args->flags & CEPH_OPT_RBYTES) == 0)
-		seq_puts(m, ",norbytes");
-	if (args->flags & CEPH_OPT_NOCRC)
-		seq_puts(m, ",nocrc");
-	if (args->flags & CEPH_OPT_NOASYNCREADDIR)
-		seq_puts(m, ",noasyncreaddir");
+	struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
 
-	if (args->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
-		seq_printf(m, ",mount_timeout=%d", args->mount_timeout);
-	if (args->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
-		seq_printf(m, ",osd_idle_ttl=%d", args->osd_idle_ttl);
-	if (args->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT)
-		seq_printf(m, ",osdtimeout=%d", args->osd_timeout);
-	if (args->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
-		seq_printf(m, ",osdkeepalivetimeout=%d",
-			 args->osd_keepalive_timeout);
-	if (args->wsize)
-		seq_printf(m, ",wsize=%d", args->wsize);
-	if (args->rsize != CEPH_MOUNT_RSIZE_DEFAULT)
-		seq_printf(m, ",rsize=%d", args->rsize);
-	if (args->congestion_kb != default_congestion_kb())
-		seq_printf(m, ",write_congestion_kb=%d", args->congestion_kb);
-	if (args->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
-		seq_printf(m, ",caps_wanted_delay_min=%d",
-			 args->caps_wanted_delay_min);
-	if (args->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT)
-		seq_printf(m, ",caps_wanted_delay_max=%d",
-			   args->caps_wanted_delay_max);
-	if (args->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT)
-		seq_printf(m, ",cap_release_safety=%d",
-			   args->cap_release_safety);
-	if (args->max_readdir != CEPH_MAX_READDIR_DEFAULT)
-		seq_printf(m, ",readdir_max_entries=%d", args->max_readdir);
-	if (args->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
-		seq_printf(m, ",readdir_max_bytes=%d", args->max_readdir_bytes);
-	if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
-		seq_printf(m, ",snapdirname=%s", args->snapdir_name);
-	if (args->name)
-		seq_printf(m, ",name=%s", args->name);
-	if (args->secret)
-		seq_puts(m, ",secret=<hidden>");
-	return 0;
-}
-
-/*
- * caches
- */
-struct kmem_cache *ceph_inode_cachep;
-struct kmem_cache *ceph_cap_cachep;
-struct kmem_cache *ceph_dentry_cachep;
-struct kmem_cache *ceph_file_cachep;
-
-static void ceph_inode_init_once(void *foo)
-{
-	struct ceph_inode_info *ci = foo;
-	inode_init_once(&ci->vfs_inode);
-}
-
-static int __init init_caches(void)
-{
-	ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
-				      sizeof(struct ceph_inode_info),
-				      __alignof__(struct ceph_inode_info),
-				      (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
-				      ceph_inode_init_once);
-	if (ceph_inode_cachep == NULL)
-		return -ENOMEM;
-
-	ceph_cap_cachep = KMEM_CACHE(ceph_cap,
-				     SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
-	if (ceph_cap_cachep == NULL)
-		goto bad_cap;
-
-	ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
-					SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
-	if (ceph_dentry_cachep == NULL)
-		goto bad_dentry;
-
-	ceph_file_cachep = KMEM_CACHE(ceph_file_info,
-				      SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
-	if (ceph_file_cachep == NULL)
-		goto bad_file;
+	if (!wait) {
+		dout("sync_fs (non-blocking)\n");
+		ceph_flush_dirty_caps(fsc->mdsc);
+		dout("sync_fs (non-blocking) done\n");
+		return 0;
+	}
 
+	dout("sync_fs (blocking)\n");
+	ceph_osdc_sync(&fsc->client->osdc);
+	ceph_mdsc_sync(fsc->mdsc);
+	dout("sync_fs (blocking) done\n");
 	return 0;
-
-bad_file:
-	kmem_cache_destroy(ceph_dentry_cachep);
-bad_dentry:
-	kmem_cache_destroy(ceph_cap_cachep);
-bad_cap:
-	kmem_cache_destroy(ceph_inode_cachep);
-	return -ENOMEM;
-}
-
-static void destroy_caches(void)
-{
-	kmem_cache_destroy(ceph_inode_cachep);
-	kmem_cache_destroy(ceph_cap_cachep);
-	kmem_cache_destroy(ceph_dentry_cachep);
-	kmem_cache_destroy(ceph_file_cachep);
-}
-
-
-/*
- * ceph_umount_begin - initiate forced umount.  Tear down down the
- * mount, skipping steps that may hang while waiting for server(s).
- */
-static void ceph_umount_begin(struct super_block *sb)
-{
-	struct ceph_client *client = ceph_sb_to_client(sb);
-
-	dout("ceph_umount_begin - starting forced umount\n");
-	if (!client)
-		return;
-	client->mount_state = CEPH_MOUNT_SHUTDOWN;
-	return;
 }
 
-static const struct super_operations ceph_super_ops = {
-	.alloc_inode	= ceph_alloc_inode,
-	.destroy_inode	= ceph_destroy_inode,
-	.write_inode    = ceph_write_inode,
-	.sync_fs        = ceph_syncfs,
-	.put_super	= ceph_put_super,
-	.show_options   = ceph_show_options,
-	.statfs		= ceph_statfs,
-	.umount_begin   = ceph_umount_begin,
-};
-
-
-const char *ceph_msg_type_name(int type)
-{
-	switch (type) {
-	case CEPH_MSG_SHUTDOWN: return "shutdown";
-	case CEPH_MSG_PING: return "ping";
-	case CEPH_MSG_AUTH: return "auth";
-	case CEPH_MSG_AUTH_REPLY: return "auth_reply";
-	case CEPH_MSG_MON_MAP: return "mon_map";
-	case CEPH_MSG_MON_GET_MAP: return "mon_get_map";
-	case CEPH_MSG_MON_SUBSCRIBE: return "mon_subscribe";
-	case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack";
-	case CEPH_MSG_STATFS: return "statfs";
-	case CEPH_MSG_STATFS_REPLY: return "statfs_reply";
-	case CEPH_MSG_MDS_MAP: return "mds_map";
-	case CEPH_MSG_CLIENT_SESSION: return "client_session";
-	case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect";
-	case CEPH_MSG_CLIENT_REQUEST: return "client_request";
-	case CEPH_MSG_CLIENT_REQUEST_FORWARD: return "client_request_forward";
-	case CEPH_MSG_CLIENT_REPLY: return "client_reply";
-	case CEPH_MSG_CLIENT_CAPS: return "client_caps";
-	case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release";
-	case CEPH_MSG_CLIENT_SNAP: return "client_snap";
-	case CEPH_MSG_CLIENT_LEASE: return "client_lease";
-	case CEPH_MSG_OSD_MAP: return "osd_map";
-	case CEPH_MSG_OSD_OP: return "osd_op";
-	case CEPH_MSG_OSD_OPREPLY: return "osd_opreply";
-	default: return "unknown";
-	}
-}
-
-
 /*
  * mount options
  */
 enum {
-	Opt_fsidmajor,
-	Opt_fsidminor,
-	Opt_monport,
 	Opt_wsize,
 	Opt_rsize,
-	Opt_osdtimeout,
-	Opt_osdkeepalivetimeout,
-	Opt_mount_timeout,
-	Opt_osd_idle_ttl,
 	Opt_caps_wanted_delay_min,
 	Opt_caps_wanted_delay_max,
 	Opt_cap_release_safety,
@@ -340,30 +124,18 @@ enum {
 	Opt_last_int,
 	/* int args above */
 	Opt_snapdirname,
-	Opt_name,
-	Opt_secret,
 	Opt_last_string,
 	/* string args above */
-	Opt_ip,
-	Opt_noshare,
 	Opt_dirstat,
 	Opt_nodirstat,
 	Opt_rbytes,
 	Opt_norbytes,
-	Opt_nocrc,
 	Opt_noasyncreaddir,
 };
 
-static match_table_t arg_tokens = {
-	{Opt_fsidmajor, "fsidmajor=%ld"},
-	{Opt_fsidminor, "fsidminor=%ld"},
-	{Opt_monport, "monport=%d"},
+static match_table_t fsopt_tokens = {
 	{Opt_wsize, "wsize=%d"},
 	{Opt_rsize, "rsize=%d"},
-	{Opt_osdtimeout, "osdtimeout=%d"},
-	{Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
-	{Opt_mount_timeout, "mount_timeout=%d"},
-	{Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
 	{Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
 	{Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
 	{Opt_cap_release_safety, "cap_release_safety=%d"},
@@ -372,372 +144,458 @@ static match_table_t arg_tokens = {
 	{Opt_congestion_kb, "write_congestion_kb=%d"},
 	/* int args above */
 	{Opt_snapdirname, "snapdirname=%s"},
-	{Opt_name, "name=%s"},
-	{Opt_secret, "secret=%s"},
 	/* string args above */
-	{Opt_ip, "ip=%s"},
-	{Opt_noshare, "noshare"},
 	{Opt_dirstat, "dirstat"},
 	{Opt_nodirstat, "nodirstat"},
 	{Opt_rbytes, "rbytes"},
 	{Opt_norbytes, "norbytes"},
-	{Opt_nocrc, "nocrc"},
 	{Opt_noasyncreaddir, "noasyncreaddir"},
 	{-1, NULL}
 };
 
-
-static struct ceph_mount_args *parse_mount_args(int flags, char *options,
-						const char *dev_name,
-						const char **path)
+static int parse_fsopt_token(char *c, void *private)
 {
-	struct ceph_mount_args *args;
-	const char *c;
-	int err = -ENOMEM;
+	struct ceph_mount_options *fsopt = private;
 	substring_t argstr[MAX_OPT_ARGS];
+	int token, intval, ret;
+
+	token = match_token((char *)c, fsopt_tokens, argstr);
+	if (token < 0)
+		return -EINVAL;
+
+	if (token < Opt_last_int) {
+		ret = match_int(&argstr[0], &intval);
+		if (ret < 0) {
+			pr_err("bad mount option arg (not int) "
+			       "at '%s'\n", c);
+			return ret;
+		}
+		dout("got int token %d val %d\n", token, intval);
+	} else if (token > Opt_last_int && token < Opt_last_string) {
+		dout("got string token %d val %s\n", token,
+		     argstr[0].from);
+	} else {
+		dout("got token %d\n", token);
+	}
 
-	args = kzalloc(sizeof(*args), GFP_KERNEL);
-	if (!args)
-		return ERR_PTR(-ENOMEM);
-	args->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*args->mon_addr),
-				 GFP_KERNEL);
-	if (!args->mon_addr)
-		goto out;
-
-	dout("parse_mount_args %p, dev_name '%s'\n", args, dev_name);
-
-	/* start with defaults */
-	args->sb_flags = flags;
-	args->flags = CEPH_OPT_DEFAULT;
-	args->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT;
-	args->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
-	args->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */
-	args->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;   /* seconds */
-	args->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
-	args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
-	args->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
-	args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
-	args->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
-	args->max_readdir = CEPH_MAX_READDIR_DEFAULT;
-	args->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
-	args->congestion_kb = default_congestion_kb();
-
-	/* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
-	err = -EINVAL;
-	if (!dev_name)
-		goto out;
-	*path = strstr(dev_name, ":/");
-	if (*path == NULL) {
-		pr_err("device name is missing path (no :/ in %s)\n",
-		       dev_name);
-		goto out;
+	switch (token) {
+	case Opt_snapdirname:
+		kfree(fsopt->snapdir_name);
+		fsopt->snapdir_name = kstrndup(argstr[0].from,
+					       argstr[0].to-argstr[0].from,
+					       GFP_KERNEL);
+		if (!fsopt->snapdir_name)
+			return -ENOMEM;
+		break;
+
+		/* misc */
+	case Opt_wsize:
+		fsopt->wsize = intval;
+		break;
+	case Opt_rsize:
+		fsopt->rsize = intval;
+		break;
+	case Opt_caps_wanted_delay_min:
+		fsopt->caps_wanted_delay_min = intval;
+		break;
+	case Opt_caps_wanted_delay_max:
+		fsopt->caps_wanted_delay_max = intval;
+		break;
+	case Opt_readdir_max_entries:
+		fsopt->max_readdir = intval;
+		break;
+	case Opt_readdir_max_bytes:
+		fsopt->max_readdir_bytes = intval;
+		break;
+	case Opt_congestion_kb:
+		fsopt->congestion_kb = intval;
+		break;
+	case Opt_dirstat:
+		fsopt->flags |= CEPH_MOUNT_OPT_DIRSTAT;
+		break;
+	case Opt_nodirstat:
+		fsopt->flags &= ~CEPH_MOUNT_OPT_DIRSTAT;
+		break;
+	case Opt_rbytes:
+		fsopt->flags |= CEPH_MOUNT_OPT_RBYTES;
+		break;
+	case Opt_norbytes:
+		fsopt->flags &= ~CEPH_MOUNT_OPT_RBYTES;
+		break;
+	case Opt_noasyncreaddir:
+		fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR;
+		break;
+	default:
+		BUG_ON(token);
 	}
+	return 0;
+}
 
-	/* get mon ip(s) */
-	err = ceph_parse_ips(dev_name, *path, args->mon_addr,
-			     CEPH_MAX_MON, &args->num_mon);
-	if (err < 0)
-		goto out;
+static void destroy_mount_options(struct ceph_mount_options *args)
+{
+	dout("destroy_mount_options %p\n", args);
+	kfree(args->snapdir_name);
+	kfree(args);
+}
+
+static int strcmp_null(const char *s1, const char *s2)
+{
+	if (!s1 && !s2)
+		return 0;
+	if (s1 && !s2)
+		return -1;
+	if (!s1 && s2)
+		return 1;
+	return strcmp(s1, s2);
+}
+
+static int compare_mount_options(struct ceph_mount_options *new_fsopt,
+				 struct ceph_options *new_opt,
+				 struct ceph_fs_client *fsc)
+{
+	struct ceph_mount_options *fsopt1 = new_fsopt;
+	struct ceph_mount_options *fsopt2 = fsc->mount_options;
+	int ofs = offsetof(struct ceph_mount_options, snapdir_name);
+	int ret;
+
+	ret = memcmp(fsopt1, fsopt2, ofs);
+	if (ret)
+		return ret;
+
+	ret = strcmp_null(fsopt1->snapdir_name, fsopt2->snapdir_name);
+	if (ret)
+		return ret;
+
+	return ceph_compare_options(new_opt, fsc->client);
+}
+
+static int parse_mount_options(struct ceph_mount_options **pfsopt,
+			       struct ceph_options **popt,
+			       int flags, char *options,
+			       const char *dev_name,
+			       const char **path)
+{
+	struct ceph_mount_options *fsopt;
+	const char *dev_name_end;
+	int err = -ENOMEM;
+
+	fsopt = kzalloc(sizeof(*fsopt), GFP_KERNEL);
+	if (!fsopt)
+		return -ENOMEM;
+
+	dout("parse_mount_options %p, dev_name '%s'\n", fsopt, dev_name);
+
+        fsopt->sb_flags = flags;
+        fsopt->flags = CEPH_MOUNT_OPT_DEFAULT;
+
+        fsopt->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
+        fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
+        fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
+        fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT;
+        fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
+        fsopt->congestion_kb = default_congestion_kb();
+	
+        /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
+        err = -EINVAL;
+        if (!dev_name)
+                goto out;
+        *path = strstr(dev_name, ":/");
+        if (*path == NULL) {
+                pr_err("device name is missing path (no :/ in %s)\n",
+                       dev_name);
+                goto out;
+        }
+	dev_name_end = *path;
+	dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name);
 
 	/* path on server */
 	*path += 2;
 	dout("server path '%s'\n", *path);
 
-	/* parse mount options */
-	while ((c = strsep(&options, ",")) != NULL) {
-		int token, intval, ret;
-		if (!*c)
-			continue;
-		err = -EINVAL;
-		token = match_token((char *)c, arg_tokens, argstr);
-		if (token < 0) {
-			pr_err("bad mount option at '%s'\n", c);
-			goto out;
-		}
-		if (token < Opt_last_int) {
-			ret = match_int(&argstr[0], &intval);
-			if (ret < 0) {
-				pr_err("bad mount option arg (not int) "
-				       "at '%s'\n", c);
-				continue;
-			}
-			dout("got int token %d val %d\n", token, intval);
-		} else if (token > Opt_last_int && token < Opt_last_string) {
-			dout("got string token %d val %s\n", token,
-			     argstr[0].from);
-		} else {
-			dout("got token %d\n", token);
-		}
-		switch (token) {
-		case Opt_fsidmajor:
-			*(__le64 *)&args->fsid.fsid[0] = cpu_to_le64(intval);
-			break;
-		case Opt_fsidminor:
-			*(__le64 *)&args->fsid.fsid[8] = cpu_to_le64(intval);
-			break;
-		case Opt_ip:
-			err = ceph_parse_ips(argstr[0].from,
-					     argstr[0].to,
-					     &args->my_addr,
-					     1, NULL);
-			if (err < 0)
-				goto out;
-			args->flags |= CEPH_OPT_MYIP;
-			break;
-
-		case Opt_snapdirname:
-			kfree(args->snapdir_name);
-			args->snapdir_name = kstrndup(argstr[0].from,
-					      argstr[0].to-argstr[0].from,
-					      GFP_KERNEL);
-			break;
-		case Opt_name:
-			args->name = kstrndup(argstr[0].from,
-					      argstr[0].to-argstr[0].from,
-					      GFP_KERNEL);
-			break;
-		case Opt_secret:
-			args->secret = kstrndup(argstr[0].from,
-						argstr[0].to-argstr[0].from,
-						GFP_KERNEL);
-			break;
-
-			/* misc */
-		case Opt_wsize:
-			args->wsize = intval;
-			break;
-		case Opt_rsize:
-			args->rsize = intval;
-			break;
-		case Opt_osdtimeout:
-			args->osd_timeout = intval;
-			break;
-		case Opt_osdkeepalivetimeout:
-			args->osd_keepalive_timeout = intval;
-			break;
-		case Opt_mount_timeout:
-			args->mount_timeout = intval;
-			break;
-		case Opt_caps_wanted_delay_min:
-			args->caps_wanted_delay_min = intval;
-			break;
-		case Opt_caps_wanted_delay_max:
-			args->caps_wanted_delay_max = intval;
-			break;
-		case Opt_readdir_max_entries:
-			args->max_readdir = intval;
-			break;
-		case Opt_readdir_max_bytes:
-			args->max_readdir_bytes = intval;
-			break;
-		case Opt_congestion_kb:
-			args->congestion_kb = intval;
-			break;
-
-		case Opt_noshare:
-			args->flags |= CEPH_OPT_NOSHARE;
-			break;
-
-		case Opt_dirstat:
-			args->flags |= CEPH_OPT_DIRSTAT;
-			break;
-		case Opt_nodirstat:
-			args->flags &= ~CEPH_OPT_DIRSTAT;
-			break;
-		case Opt_rbytes:
-			args->flags |= CEPH_OPT_RBYTES;
-			break;
-		case Opt_norbytes:
-			args->flags &= ~CEPH_OPT_RBYTES;
-			break;
-		case Opt_nocrc:
-			args->flags |= CEPH_OPT_NOCRC;
-			break;
-		case Opt_noasyncreaddir:
-			args->flags |= CEPH_OPT_NOASYNCREADDIR;
-			break;
-
-		default:
-			BUG_ON(token);
-		}
-	}
-	return args;
+	err = ceph_parse_options(popt, options, dev_name, dev_name_end,
+				 parse_fsopt_token, (void *)fsopt);
+	if (err)
+		goto out;
+
+	/* success */
+	*pfsopt = fsopt;
+	return 0;
 
 out:
-	kfree(args->mon_addr);
-	kfree(args);
-	return ERR_PTR(err);
+	destroy_mount_options(fsopt);
+	return err;
 }
 
-static void destroy_mount_args(struct ceph_mount_args *args)
+/**
+ * ceph_show_options - Show mount options in /proc/mounts
+ * @m: seq_file to write to
+ * @mnt: mount descriptor
+ */
+static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
 {
-	dout("destroy_mount_args %p\n", args);
-	kfree(args->snapdir_name);
-	args->snapdir_name = NULL;
-	kfree(args->name);
-	args->name = NULL;
-	kfree(args->secret);
-	args->secret = NULL;
-	kfree(args);
+	struct ceph_fs_client *fsc = ceph_sb_to_client(mnt->mnt_sb);
+	struct ceph_mount_options *fsopt = fsc->mount_options;
+	struct ceph_options *opt = fsc->client->options;
+
+	if (opt->flags & CEPH_OPT_FSID)
+		seq_printf(m, ",fsid=%pU", &opt->fsid);
+	if (opt->flags & CEPH_OPT_NOSHARE)
+		seq_puts(m, ",noshare");
+	if (opt->flags & CEPH_OPT_NOCRC)
+		seq_puts(m, ",nocrc");
+
+	if (opt->name)
+		seq_printf(m, ",name=%s", opt->name);
+	if (opt->secret)
+		seq_puts(m, ",secret=<hidden>");
+
+	if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
+		seq_printf(m, ",mount_timeout=%d", opt->mount_timeout);
+	if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
+		seq_printf(m, ",osd_idle_ttl=%d", opt->osd_idle_ttl);
+	if (opt->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT)
+		seq_printf(m, ",osdtimeout=%d", opt->osd_timeout);
+	if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
+		seq_printf(m, ",osdkeepalivetimeout=%d",
+			   opt->osd_keepalive_timeout);
+
+	if (fsopt->flags & CEPH_MOUNT_OPT_DIRSTAT)
+		seq_puts(m, ",dirstat");
+	if ((fsopt->flags & CEPH_MOUNT_OPT_RBYTES) == 0)
+		seq_puts(m, ",norbytes");
+	if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR)
+		seq_puts(m, ",noasyncreaddir");
+
+	if (fsopt->wsize)
+		seq_printf(m, ",wsize=%d", fsopt->wsize);
+	if (fsopt->rsize != CEPH_MOUNT_RSIZE_DEFAULT)
+		seq_printf(m, ",rsize=%d", fsopt->rsize);
+	if (fsopt->congestion_kb != default_congestion_kb())
+		seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb);
+	if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
+		seq_printf(m, ",caps_wanted_delay_min=%d",
+			 fsopt->caps_wanted_delay_min);
+	if (fsopt->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT)
+		seq_printf(m, ",caps_wanted_delay_max=%d",
+			   fsopt->caps_wanted_delay_max);
+	if (fsopt->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT)
+		seq_printf(m, ",cap_release_safety=%d",
+			   fsopt->cap_release_safety);
+	if (fsopt->max_readdir != CEPH_MAX_READDIR_DEFAULT)
+		seq_printf(m, ",readdir_max_entries=%d", fsopt->max_readdir);
+	if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
+		seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes);
+	if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
+		seq_printf(m, ",snapdirname=%s", fsopt->snapdir_name);
+	return 0;
 }
 
 /*
- * create a fresh client instance
+ * handle any mon messages the standard library doesn't understand.
+ * return error if we don't either.
  */
-static struct ceph_client *ceph_create_client(struct ceph_mount_args *args)
+static int extra_mon_dispatch(struct ceph_client *client, struct ceph_msg *msg)
 {
-	struct ceph_client *client;
+	struct ceph_fs_client *fsc = client->private;
+	int type = le16_to_cpu(msg->hdr.type);
+
+	switch (type) {
+	case CEPH_MSG_MDS_MAP:
+		ceph_mdsc_handle_map(fsc->mdsc, msg);
+		return 0;
+
+	default:
+		return -1;
+	}
+}
+
+/*
+ * create a new fs client
+ */
+struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
+					struct ceph_options *opt)
+{
+	struct ceph_fs_client *fsc;
 	int err = -ENOMEM;
 
-	client = kzalloc(sizeof(*client), GFP_KERNEL);
-	if (client == NULL)
+	fsc = kzalloc(sizeof(*fsc), GFP_KERNEL);
+	if (!fsc)
 		return ERR_PTR(-ENOMEM);
 
-	mutex_init(&client->mount_mutex);
-
-	init_waitqueue_head(&client->auth_wq);
+	fsc->client = ceph_create_client(opt, fsc);
+	if (IS_ERR(fsc->client)) {
+		err = PTR_ERR(fsc->client);
+		goto fail;
+	}
+	fsc->client->extra_mon_dispatch = extra_mon_dispatch;
+	fsc->client->supported_features |= CEPH_FEATURE_FLOCK;
+	fsc->client->monc.want_mdsmap = 1;
 
-	client->sb = NULL;
-	client->mount_state = CEPH_MOUNT_MOUNTING;
-	client->mount_args = args;
+	fsc->mount_options = fsopt;
 
-	client->msgr = NULL;
+	fsc->sb = NULL;
+	fsc->mount_state = CEPH_MOUNT_MOUNTING;
 
-	client->auth_err = 0;
-	atomic_long_set(&client->writeback_count, 0);
+	atomic_long_set(&fsc->writeback_count, 0);
 
-	err = bdi_init(&client->backing_dev_info);
+	err = bdi_init(&fsc->backing_dev_info);
 	if (err < 0)
-		goto fail;
+		goto fail_client;
 
 	err = -ENOMEM;
-	client->wb_wq = create_workqueue("ceph-writeback");
-	if (client->wb_wq == NULL)
+	fsc->wb_wq = create_workqueue("ceph-writeback");
+	if (fsc->wb_wq == NULL)
 		goto fail_bdi;
-	client->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid");
-	if (client->pg_inv_wq == NULL)
+	fsc->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid");
+	if (fsc->pg_inv_wq == NULL)
 		goto fail_wb_wq;
-	client->trunc_wq = create_singlethread_workqueue("ceph-trunc");
-	if (client->trunc_wq == NULL)
+	fsc->trunc_wq = create_singlethread_workqueue("ceph-trunc");
+	if (fsc->trunc_wq == NULL)
 		goto fail_pg_inv_wq;
 
 	/* set up mempools */
 	err = -ENOMEM;
-	client->wb_pagevec_pool = mempool_create_kmalloc_pool(10,
-			      client->mount_args->wsize >> PAGE_CACHE_SHIFT);
-	if (!client->wb_pagevec_pool)
+	fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10,
+			      fsc->mount_options->wsize >> PAGE_CACHE_SHIFT);
+	if (!fsc->wb_pagevec_pool)
 		goto fail_trunc_wq;
 
 	/* caps */
-	client->min_caps = args->max_readdir;
-	ceph_adjust_min_caps(client->min_caps);
+	fsc->min_caps = fsopt->max_readdir;
+
+	return fsc;
 
-	/* subsystems */
-	err = ceph_monc_init(&client->monc, client);
-	if (err < 0)
-		goto fail_mempool;
-	err = ceph_osdc_init(&client->osdc, client);
-	if (err < 0)
-		goto fail_monc;
-	err = ceph_mdsc_init(&client->mdsc, client);
-	if (err < 0)
-		goto fail_osdc;
-	return client;
-
-fail_osdc:
-	ceph_osdc_stop(&client->osdc);
-fail_monc:
-	ceph_monc_stop(&client->monc);
-fail_mempool:
-	mempool_destroy(client->wb_pagevec_pool);
 fail_trunc_wq:
-	destroy_workqueue(client->trunc_wq);
+	destroy_workqueue(fsc->trunc_wq);
 fail_pg_inv_wq:
-	destroy_workqueue(client->pg_inv_wq);
+	destroy_workqueue(fsc->pg_inv_wq);
 fail_wb_wq:
-	destroy_workqueue(client->wb_wq);
+	destroy_workqueue(fsc->wb_wq);
 fail_bdi:
-	bdi_destroy(&client->backing_dev_info);
+	bdi_destroy(&fsc->backing_dev_info);
+fail_client:
+	ceph_destroy_client(fsc->client);
 fail:
-	kfree(client);
+	kfree(fsc);
 	return ERR_PTR(err);
 }
 
-static void ceph_destroy_client(struct ceph_client *client)
+void destroy_fs_client(struct ceph_fs_client *fsc)
 {
-	dout("destroy_client %p\n", client);
+	dout("destroy_fs_client %p\n", fsc);
 
-	/* unmount */
-	ceph_mdsc_stop(&client->mdsc);
-	ceph_osdc_stop(&client->osdc);
+	destroy_workqueue(fsc->wb_wq);
+	destroy_workqueue(fsc->pg_inv_wq);
+	destroy_workqueue(fsc->trunc_wq);
 
-	/*
-	 * make sure mds and osd connections close out before destroying
-	 * the auth module, which is needed to free those connections'
-	 * ceph_authorizers.
-	 */
-	ceph_msgr_flush();
-
-	ceph_monc_stop(&client->monc);
+	bdi_destroy(&fsc->backing_dev_info);
 
-	ceph_adjust_min_caps(-client->min_caps);
+	mempool_destroy(fsc->wb_pagevec_pool);
 
-	ceph_debugfs_client_cleanup(client);
-	destroy_workqueue(client->wb_wq);
-	destroy_workqueue(client->pg_inv_wq);
-	destroy_workqueue(client->trunc_wq);
+	destroy_mount_options(fsc->mount_options);
 
-	bdi_destroy(&client->backing_dev_info);
+	ceph_fs_debugfs_cleanup(fsc);
 
-	if (client->msgr)
-		ceph_messenger_destroy(client->msgr);
-	mempool_destroy(client->wb_pagevec_pool);
+	ceph_destroy_client(fsc->client);
 
-	destroy_mount_args(client->mount_args);
-
-	kfree(client);
-	dout("destroy_client %p done\n", client);
+	kfree(fsc);
+	dout("destroy_fs_client %p done\n", fsc);
 }
 
 /*
- * Initially learn our fsid, or verify an fsid matches.
+ * caches
  */
-int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
+struct kmem_cache *ceph_inode_cachep;
+struct kmem_cache *ceph_cap_cachep;
+struct kmem_cache *ceph_dentry_cachep;
+struct kmem_cache *ceph_file_cachep;
+
+static void ceph_inode_init_once(void *foo)
 {
-	if (client->have_fsid) {
-		if (ceph_fsid_compare(&client->fsid, fsid)) {
-			pr_err("bad fsid, had " FSID_FORMAT " got " FSID_FORMAT,
-			       PR_FSID(&client->fsid), PR_FSID(fsid));
-			return -1;
-		}
-	} else {
-		pr_info("client%lld fsid " FSID_FORMAT "\n",
-			client->monc.auth->global_id, PR_FSID(fsid));
-		memcpy(&client->fsid, fsid, sizeof(*fsid));
-		ceph_debugfs_client_init(client);
-		client->have_fsid = true;
-	}
+	struct ceph_inode_info *ci = foo;
+	inode_init_once(&ci->vfs_inode);
+}
+
+static int __init init_caches(void)
+{
+	ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
+				      sizeof(struct ceph_inode_info),
+				      __alignof__(struct ceph_inode_info),
+				      (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
+				      ceph_inode_init_once);
+	if (ceph_inode_cachep == NULL)
+		return -ENOMEM;
+
+	ceph_cap_cachep = KMEM_CACHE(ceph_cap,
+				     SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+	if (ceph_cap_cachep == NULL)
+		goto bad_cap;
+
+	ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
+					SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+	if (ceph_dentry_cachep == NULL)
+		goto bad_dentry;
+
+	ceph_file_cachep = KMEM_CACHE(ceph_file_info,
+				      SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+	if (ceph_file_cachep == NULL)
+		goto bad_file;
+
 	return 0;
+
+bad_file:
+	kmem_cache_destroy(ceph_dentry_cachep);
+bad_dentry:
+	kmem_cache_destroy(ceph_cap_cachep);
+bad_cap:
+	kmem_cache_destroy(ceph_inode_cachep);
+	return -ENOMEM;
+}
+
+static void destroy_caches(void)
+{
+	kmem_cache_destroy(ceph_inode_cachep);
+	kmem_cache_destroy(ceph_cap_cachep);
+	kmem_cache_destroy(ceph_dentry_cachep);
+	kmem_cache_destroy(ceph_file_cachep);
 }
 
+
 /*
- * true if we have the mon map (and have thus joined the cluster)
+ * ceph_umount_begin - initiate forced umount.  Tear down down the
+ * mount, skipping steps that may hang while waiting for server(s).
  */
-static int have_mon_and_osd_map(struct ceph_client *client)
+static void ceph_umount_begin(struct super_block *sb)
 {
-	return client->monc.monmap && client->monc.monmap->epoch &&
-	       client->osdc.osdmap && client->osdc.osdmap->epoch;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
+
+	dout("ceph_umount_begin - starting forced umount\n");
+	if (!fsc)
+		return;
+	fsc->mount_state = CEPH_MOUNT_SHUTDOWN;
+	return;
 }
 
+static const struct super_operations ceph_super_ops = {
+	.alloc_inode	= ceph_alloc_inode,
+	.destroy_inode	= ceph_destroy_inode,
+	.write_inode    = ceph_write_inode,
+	.sync_fs        = ceph_sync_fs,
+	.put_super	= ceph_put_super,
+	.show_options   = ceph_show_options,
+	.statfs		= ceph_statfs,
+	.umount_begin   = ceph_umount_begin,
+};
+
 /*
  * Bootstrap mount by opening the root directory.  Note the mount
  * @started time from caller, and time out if this takes too long.
  */
-static struct dentry *open_root_dentry(struct ceph_client *client,
+static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
 				       const char *path,
 				       unsigned long started)
 {
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_mds_request *req = NULL;
 	int err;
 	struct dentry *root;
@@ -751,14 +609,14 @@ static struct dentry *open_root_dentry(struct ceph_client *client,
 	req->r_ino1.ino = CEPH_INO_ROOT;
 	req->r_ino1.snap = CEPH_NOSNAP;
 	req->r_started = started;
-	req->r_timeout = client->mount_args->mount_timeout * HZ;
+	req->r_timeout = fsc->client->options->mount_timeout * HZ;
 	req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
 	req->r_num_caps = 2;
 	err = ceph_mdsc_do_request(mdsc, NULL, req);
 	if (err == 0) {
 		dout("open_root_inode success\n");
 		if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT &&
-		    client->sb->s_root == NULL)
+		    fsc->sb->s_root == NULL)
 			root = d_alloc_root(req->r_target_inode);
 		else
 			root = d_obtain_alias(req->r_target_inode);
@@ -771,105 +629,84 @@ static struct dentry *open_root_dentry(struct ceph_client *client,
 	return root;
 }
 
+
+
+
 /*
  * mount: join the ceph cluster, and open root directory.
  */
-static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt,
+static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
 		      const char *path)
 {
-	struct ceph_entity_addr *myaddr = NULL;
 	int err;
-	unsigned long timeout = client->mount_args->mount_timeout * HZ;
 	unsigned long started = jiffies;  /* note the start time */
 	struct dentry *root;
+	int first = 0;   /* first vfsmount for this super_block */
 
 	dout("mount start\n");
-	mutex_lock(&client->mount_mutex);
-
-	/* initialize the messenger */
-	if (client->msgr == NULL) {
-		if (ceph_test_opt(client, MYIP))
-			myaddr = &client->mount_args->my_addr;
-		client->msgr = ceph_messenger_create(myaddr);
-		if (IS_ERR(client->msgr)) {
-			err = PTR_ERR(client->msgr);
-			client->msgr = NULL;
-			goto out;
-		}
-		client->msgr->nocrc = ceph_test_opt(client, NOCRC);
-	}
+	mutex_lock(&fsc->client->mount_mutex);
 
-	/* open session, and wait for mon, mds, and osd maps */
-	err = ceph_monc_open_session(&client->monc);
+	err = __ceph_open_session(fsc->client, started);
 	if (err < 0)
 		goto out;
 
-	while (!have_mon_and_osd_map(client)) {
-		err = -EIO;
-		if (timeout && time_after_eq(jiffies, started + timeout))
-			goto out;
-
-		/* wait */
-		dout("mount waiting for mon_map\n");
-		err = wait_event_interruptible_timeout(client->auth_wq,
-		       have_mon_and_osd_map(client) || (client->auth_err < 0),
-		       timeout);
-		if (err == -EINTR || err == -ERESTARTSYS)
-			goto out;
-		if (client->auth_err < 0) {
-			err = client->auth_err;
-			goto out;
-		}
-	}
-
 	dout("mount opening root\n");
-	root = open_root_dentry(client, "", started);
+	root = open_root_dentry(fsc, "", started);
 	if (IS_ERR(root)) {
 		err = PTR_ERR(root);
 		goto out;
 	}
-	if (client->sb->s_root)
+	if (fsc->sb->s_root) {
 		dput(root);
-	else
-		client->sb->s_root = root;
+	} else {
+		fsc->sb->s_root = root;
+		first = 1;
+
+		err = ceph_fs_debugfs_init(fsc);
+		if (err < 0)
+			goto fail;
+	}
 
 	if (path[0] == 0) {
 		dget(root);
 	} else {
 		dout("mount opening base mountpoint\n");
-		root = open_root_dentry(client, path, started);
+		root = open_root_dentry(fsc, path, started);
 		if (IS_ERR(root)) {
 			err = PTR_ERR(root);
-			dput(client->sb->s_root);
-			client->sb->s_root = NULL;
-			goto out;
+			goto fail;
 		}
 	}
 
-	mnt->mnt_root = root;
-	mnt->mnt_sb = client->sb;
-
-	client->mount_state = CEPH_MOUNT_MOUNTED;
+	fsc->mount_state = CEPH_MOUNT_MOUNTED;
 	dout("mount success\n");
-	err = 0;
+	mutex_unlock(&fsc->client->mount_mutex);
+	return root;
 
 out:
-	mutex_unlock(&client->mount_mutex);
-	return err;
+	mutex_unlock(&fsc->client->mount_mutex);
+	return ERR_PTR(err);
+
+fail:
+	if (first) {
+		dput(fsc->sb->s_root);
+		fsc->sb->s_root = NULL;
+	}
+	goto out;
 }
 
 static int ceph_set_super(struct super_block *s, void *data)
 {
-	struct ceph_client *client = data;
+	struct ceph_fs_client *fsc = data;
 	int ret;
 
 	dout("set_super %p data %p\n", s, data);
 
-	s->s_flags = client->mount_args->sb_flags;
+	s->s_flags = fsc->mount_options->sb_flags;
 	s->s_maxbytes = 1ULL << 40;  /* temp value until we get mdsmap */
 
-	s->s_fs_info = client;
-	client->sb = s;
+	s->s_fs_info = fsc;
+	fsc->sb = s;
 
 	s->s_op = &ceph_super_ops;
 	s->s_export_op = &ceph_export_ops;
@@ -884,7 +721,7 @@ static int ceph_set_super(struct super_block *s, void *data)
 
 fail:
 	s->s_fs_info = NULL;
-	client->sb = NULL;
+	fsc->sb = NULL;
 	return ret;
 }
 
@@ -893,30 +730,23 @@ fail:
  */
 static int ceph_compare_super(struct super_block *sb, void *data)
 {
-	struct ceph_client *new = data;
-	struct ceph_mount_args *args = new->mount_args;
-	struct ceph_client *other = ceph_sb_to_client(sb);
-	int i;
+	struct ceph_fs_client *new = data;
+	struct ceph_mount_options *fsopt = new->mount_options;
+	struct ceph_options *opt = new->client->options;
+	struct ceph_fs_client *other = ceph_sb_to_client(sb);
 
 	dout("ceph_compare_super %p\n", sb);
-	if (args->flags & CEPH_OPT_FSID) {
-		if (ceph_fsid_compare(&args->fsid, &other->fsid)) {
-			dout("fsid doesn't match\n");
-			return 0;
-		}
-	} else {
-		/* do we share (a) monitor? */
-		for (i = 0; i < new->monc.monmap->num_mon; i++)
-			if (ceph_monmap_contains(other->monc.monmap,
-					 &new->monc.monmap->mon_inst[i].addr))
-				break;
-		if (i == new->monc.monmap->num_mon) {
-			dout("mon ip not part of monmap\n");
-			return 0;
-		}
-		dout("mon ip matches existing sb %p\n", sb);
+
+	if (compare_mount_options(fsopt, opt, other)) {
+		dout("monitor(s)/mount options don't match\n");
+		return 0;
+	}
+	if ((opt->flags & CEPH_OPT_FSID) &&
+	    ceph_fsid_compare(&opt->fsid, &other->client->fsid)) {
+		dout("fsid doesn't match\n");
+		return 0;
 	}
-	if (args->sb_flags != other->mount_args->sb_flags) {
+	if (fsopt->sb_flags != other->mount_options->sb_flags) {
 		dout("flags differ\n");
 		return 0;
 	}
@@ -928,98 +758,113 @@ static int ceph_compare_super(struct super_block *sb, void *data)
  */
 static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
 
-static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client)
+static int ceph_register_bdi(struct super_block *sb,
+			     struct ceph_fs_client *fsc)
 {
 	int err;
 
 	/* set ra_pages based on rsize mount option? */
-	if (client->mount_args->rsize >= PAGE_CACHE_SIZE)
-		client->backing_dev_info.ra_pages =
-			(client->mount_args->rsize + PAGE_CACHE_SIZE - 1)
+	if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE)
+		fsc->backing_dev_info.ra_pages =
+			(fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1)
 			>> PAGE_SHIFT;
-	err = bdi_register(&client->backing_dev_info, NULL, "ceph-%d",
+	err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%d",
 			   atomic_long_inc_return(&bdi_seq));
 	if (!err)
-		sb->s_bdi = &client->backing_dev_info;
+		sb->s_bdi = &fsc->backing_dev_info;
 	return err;
 }
 
-static int ceph_get_sb(struct file_system_type *fs_type,
-		       int flags, const char *dev_name, void *data,
-		       struct vfsmount *mnt)
+static struct dentry *ceph_mount(struct file_system_type *fs_type,
+		       int flags, const char *dev_name, void *data)
 {
 	struct super_block *sb;
-	struct ceph_client *client;
+	struct ceph_fs_client *fsc;
+	struct dentry *res;
 	int err;
 	int (*compare_super)(struct super_block *, void *) = ceph_compare_super;
 	const char *path = NULL;
-	struct ceph_mount_args *args;
+	struct ceph_mount_options *fsopt = NULL;
+	struct ceph_options *opt = NULL;
 
-	dout("ceph_get_sb\n");
-	args = parse_mount_args(flags, data, dev_name, &path);
-	if (IS_ERR(args)) {
-		err = PTR_ERR(args);
+	dout("ceph_mount\n");
+	err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path);
+	if (err < 0) {
+		res = ERR_PTR(err);
 		goto out_final;
 	}
 
 	/* create client (which we may/may not use) */
-	client = ceph_create_client(args);
-	if (IS_ERR(client)) {
-		err = PTR_ERR(client);
+	fsc = create_fs_client(fsopt, opt);
+	if (IS_ERR(fsc)) {
+		res = ERR_CAST(fsc);
+		kfree(fsopt);
+		kfree(opt);
 		goto out_final;
 	}
 
-	if (client->mount_args->flags & CEPH_OPT_NOSHARE)
+	err = ceph_mdsc_init(fsc);
+	if (err < 0) {
+		res = ERR_PTR(err);
+		goto out;
+	}
+
+	if (ceph_test_opt(fsc->client, NOSHARE))
 		compare_super = NULL;
-	sb = sget(fs_type, compare_super, ceph_set_super, client);
+	sb = sget(fs_type, compare_super, ceph_set_super, fsc);
 	if (IS_ERR(sb)) {
-		err = PTR_ERR(sb);
+		res = ERR_CAST(sb);
 		goto out;
 	}
 
-	if (ceph_sb_to_client(sb) != client) {
-		ceph_destroy_client(client);
-		client = ceph_sb_to_client(sb);
-		dout("get_sb got existing client %p\n", client);
+	if (ceph_sb_to_client(sb) != fsc) {
+		ceph_mdsc_destroy(fsc);
+		destroy_fs_client(fsc);
+		fsc = ceph_sb_to_client(sb);
+		dout("get_sb got existing client %p\n", fsc);
 	} else {
-		dout("get_sb using new client %p\n", client);
-		err = ceph_register_bdi(sb, client);
-		if (err < 0)
+		dout("get_sb using new client %p\n", fsc);
+		err = ceph_register_bdi(sb, fsc);
+		if (err < 0) {
+			res = ERR_PTR(err);
 			goto out_splat;
+		}
 	}
 
-	err = ceph_mount(client, mnt, path);
-	if (err < 0)
+	res = ceph_real_mount(fsc, path);
+	if (IS_ERR(res))
 		goto out_splat;
-	dout("root %p inode %p ino %llx.%llx\n", mnt->mnt_root,
-	     mnt->mnt_root->d_inode, ceph_vinop(mnt->mnt_root->d_inode));
-	return 0;
+	dout("root %p inode %p ino %llx.%llx\n", res,
+	     res->d_inode, ceph_vinop(res->d_inode));
+	return res;
 
 out_splat:
-	ceph_mdsc_close_sessions(&client->mdsc);
+	ceph_mdsc_close_sessions(fsc->mdsc);
 	deactivate_locked_super(sb);
 	goto out_final;
 
 out:
-	ceph_destroy_client(client);
+	ceph_mdsc_destroy(fsc);
+	destroy_fs_client(fsc);
 out_final:
-	dout("ceph_get_sb fail %d\n", err);
-	return err;
+	dout("ceph_mount fail %ld\n", PTR_ERR(res));
+	return res;
 }
 
 static void ceph_kill_sb(struct super_block *s)
 {
-	struct ceph_client *client = ceph_sb_to_client(s);
+	struct ceph_fs_client *fsc = ceph_sb_to_client(s);
 	dout("kill_sb %p\n", s);
-	ceph_mdsc_pre_umount(&client->mdsc);
+	ceph_mdsc_pre_umount(fsc->mdsc);
 	kill_anon_super(s);    /* will call put_super after sb is r/o */
-	ceph_destroy_client(client);
+	ceph_mdsc_destroy(fsc);
+	destroy_fs_client(fsc);
 }
 
 static struct file_system_type ceph_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "ceph",
-	.get_sb		= ceph_get_sb,
+	.mount		= ceph_mount,
 	.kill_sb	= ceph_kill_sb,
 	.fs_flags	= FS_RENAME_DOES_D_MOVE,
 };
@@ -1029,38 +874,20 @@ static struct file_system_type ceph_fs_type = {
 
 static int __init init_ceph(void)
 {
-	int ret = 0;
-
-	ret = ceph_debugfs_init();
-	if (ret < 0)
-		goto out;
-
-	ret = ceph_msgr_init();
-	if (ret < 0)
-		goto out_debugfs;
-
-	ret = init_caches();
+	int ret = init_caches();
 	if (ret)
-		goto out_msgr;
-
-	ceph_caps_init();
+		goto out;
 
 	ret = register_filesystem(&ceph_fs_type);
 	if (ret)
 		goto out_icache;
 
-	pr_info("loaded (mon/mds/osd proto %d/%d/%d, osdmap %d/%d %d/%d)\n",
-		CEPH_MONC_PROTOCOL, CEPH_MDSC_PROTOCOL, CEPH_OSDC_PROTOCOL,
-		CEPH_OSDMAP_VERSION, CEPH_OSDMAP_VERSION_EXT,
-		CEPH_OSDMAP_INC_VERSION, CEPH_OSDMAP_INC_VERSION_EXT);
+	pr_info("loaded (mds proto %d)\n", CEPH_MDSC_PROTOCOL);
+
 	return 0;
 
 out_icache:
 	destroy_caches();
-out_msgr:
-	ceph_msgr_exit();
-out_debugfs:
-	ceph_debugfs_cleanup();
 out:
 	return ret;
 }
@@ -1069,10 +896,7 @@ static void __exit exit_ceph(void)
 {
 	dout("exit_ceph\n");
 	unregister_filesystem(&ceph_fs_type);
-	ceph_caps_finalize();
 	destroy_caches();
-	ceph_msgr_exit();
-	ceph_debugfs_cleanup();
 }
 
 module_init(init_ceph);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 10a4a406e88..1886294e12f 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -1,7 +1,7 @@
 #ifndef _FS_CEPH_SUPER_H
 #define _FS_CEPH_SUPER_H
 
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 
 #include <asm/unaligned.h>
 #include <linux/backing-dev.h>
@@ -14,13 +14,7 @@
 #include <linux/writeback.h>
 #include <linux/slab.h>
 
-#include "types.h"
-#include "messenger.h"
-#include "msgpool.h"
-#include "mon_client.h"
-#include "mds_client.h"
-#include "osd_client.h"
-#include "ceph_fs.h"
+#include <linux/ceph/libceph.h>
 
 /* f_type in struct statfs */
 #define CEPH_SUPER_MAGIC 0x00c36400
@@ -30,36 +24,25 @@
 #define CEPH_BLOCK_SHIFT   20  /* 1 MB */
 #define CEPH_BLOCK         (1 << CEPH_BLOCK_SHIFT)
 
-/*
- * mount options
- */
-#define CEPH_OPT_FSID             (1<<0)
-#define CEPH_OPT_NOSHARE          (1<<1) /* don't share client with other sbs */
-#define CEPH_OPT_MYIP             (1<<2) /* specified my ip */
-#define CEPH_OPT_DIRSTAT          (1<<4) /* funky `cat dirname` for stats */
-#define CEPH_OPT_RBYTES           (1<<5) /* dir st_bytes = rbytes */
-#define CEPH_OPT_NOCRC            (1<<6) /* no data crc on writes */
-#define CEPH_OPT_NOASYNCREADDIR   (1<<7) /* no dcache readdir */
+#define CEPH_MOUNT_OPT_DIRSTAT         (1<<4) /* `cat dirname` for stats */
+#define CEPH_MOUNT_OPT_RBYTES          (1<<5) /* dir st_bytes = rbytes */
+#define CEPH_MOUNT_OPT_NOASYNCREADDIR  (1<<7) /* no dcache readdir */
 
-#define CEPH_OPT_DEFAULT   (CEPH_OPT_RBYTES)
+#define CEPH_MOUNT_OPT_DEFAULT    (CEPH_MOUNT_OPT_RBYTES)
 
-#define ceph_set_opt(client, opt) \
-	(client)->mount_args->flags |= CEPH_OPT_##opt;
-#define ceph_test_opt(client, opt) \
-	(!!((client)->mount_args->flags & CEPH_OPT_##opt))
+#define ceph_set_mount_opt(fsc, opt) \
+	(fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt;
+#define ceph_test_mount_opt(fsc, opt) \
+	(!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt))
 
+#define CEPH_MAX_READDIR_DEFAULT        1024
+#define CEPH_MAX_READDIR_BYTES_DEFAULT  (512*1024)
+#define CEPH_SNAPDIRNAME_DEFAULT        ".snap"
 
-struct ceph_mount_args {
-	int sb_flags;
+struct ceph_mount_options {
 	int flags;
-	struct ceph_fsid fsid;
-	struct ceph_entity_addr my_addr;
-	int num_mon;
-	struct ceph_entity_addr *mon_addr;
-	int mount_timeout;
-	int osd_idle_ttl;
-	int osd_timeout;
-	int osd_keepalive_timeout;
+	int sb_flags;
+
 	int wsize;
 	int rsize;            /* max readahead */
 	int congestion_kb;    /* max writeback in flight */
@@ -67,82 +50,25 @@ struct ceph_mount_args {
 	int cap_release_safety;
 	int max_readdir;       /* max readdir result (entires) */
 	int max_readdir_bytes; /* max readdir result (bytes) */
-	char *snapdir_name;   /* default ".snap" */
-	char *name;
-	char *secret;
-};
-
-/*
- * defaults
- */
-#define CEPH_MOUNT_TIMEOUT_DEFAULT  60
-#define CEPH_OSD_TIMEOUT_DEFAULT    60  /* seconds */
-#define CEPH_OSD_KEEPALIVE_DEFAULT  5
-#define CEPH_OSD_IDLE_TTL_DEFAULT    60
-#define CEPH_MOUNT_RSIZE_DEFAULT    (512*1024) /* readahead */
-#define CEPH_MAX_READDIR_DEFAULT    1024
-#define CEPH_MAX_READDIR_BYTES_DEFAULT    (512*1024)
-
-#define CEPH_MSG_MAX_FRONT_LEN	(16*1024*1024)
-#define CEPH_MSG_MAX_DATA_LEN	(16*1024*1024)
-
-#define CEPH_SNAPDIRNAME_DEFAULT ".snap"
-#define CEPH_AUTH_NAME_DEFAULT   "guest"
-/*
- * Delay telling the MDS we no longer want caps, in case we reopen
- * the file.  Delay a minimum amount of time, even if we send a cap
- * message for some other reason.  Otherwise, take the oppotunity to
- * update the mds to avoid sending another message later.
- */
-#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT      5  /* cap release delay */
-#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT     60  /* cap release delay */
-
-#define CEPH_CAP_RELEASE_SAFETY_DEFAULT        (CEPH_CAPS_PER_RELEASE * 4)
-
-/* mount state */
-enum {
-	CEPH_MOUNT_MOUNTING,
-	CEPH_MOUNT_MOUNTED,
-	CEPH_MOUNT_UNMOUNTING,
-	CEPH_MOUNT_UNMOUNTED,
-	CEPH_MOUNT_SHUTDOWN,
-};
 
-/*
- * subtract jiffies
- */
-static inline unsigned long time_sub(unsigned long a, unsigned long b)
-{
-	BUG_ON(time_after(b, a));
-	return (long)a - (long)b;
-}
-
-/*
- * per-filesystem client state
- *
- * possibly shared by multiple mount points, if they are
- * mounting the same ceph filesystem/cluster.
- */
-struct ceph_client {
-	struct ceph_fsid fsid;
-	bool have_fsid;
+	/*
+	 * everything above this point can be memcmp'd; everything below
+	 * is handled in compare_mount_options()
+	 */
 
-	struct mutex mount_mutex;       /* serialize mount attempts */
-	struct ceph_mount_args *mount_args;
+	char *snapdir_name;   /* default ".snap" */
+};
 
+struct ceph_fs_client {
 	struct super_block *sb;
 
-	unsigned long mount_state;
-	wait_queue_head_t auth_wq;
-
-	int auth_err;
+	struct ceph_mount_options *mount_options;
+	struct ceph_client *client;
 
+	unsigned long mount_state;
 	int min_caps;                  /* min caps i added */
 
-	struct ceph_messenger *msgr;   /* messenger instance */
-	struct ceph_mon_client monc;
-	struct ceph_mds_client mdsc;
-	struct ceph_osd_client osdc;
+	struct ceph_mds_client *mdsc;
 
 	/* writeback */
 	mempool_t *wb_pagevec_pool;
@@ -154,14 +80,14 @@ struct ceph_client {
 	struct backing_dev_info backing_dev_info;
 
 #ifdef CONFIG_DEBUG_FS
-	struct dentry *debugfs_monmap;
-	struct dentry *debugfs_mdsmap, *debugfs_osdmap;
-	struct dentry *debugfs_dir, *debugfs_dentry_lru, *debugfs_caps;
+	struct dentry *debugfs_dentry_lru, *debugfs_caps;
 	struct dentry *debugfs_congestion_kb;
 	struct dentry *debugfs_bdi;
+	struct dentry *debugfs_mdsc, *debugfs_mdsmap;
 #endif
 };
 
+
 /*
  * File i/o capability.  This tracks shared state with the metadata
  * server that allows us to cache or writeback attributes or to read
@@ -210,8 +136,7 @@ struct ceph_cap_snap {
 	uid_t uid;
 	gid_t gid;
 
-	void *xattr_blob;
-	int xattr_len;
+	struct ceph_buffer *xattr_blob;
 	u64 xattr_version;
 
 	u64 size;
@@ -223,8 +148,11 @@ struct ceph_cap_snap {
 
 static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
 {
-	if (atomic_dec_and_test(&capsnap->nref))
+	if (atomic_dec_and_test(&capsnap->nref)) {
+		if (capsnap->xattr_blob)
+			ceph_buffer_put(capsnap->xattr_blob);
 		kfree(capsnap);
+	}
 }
 
 /*
@@ -267,6 +195,20 @@ struct ceph_inode_xattr {
 	int should_free_val;
 };
 
+/*
+ * Ceph dentry state
+ */
+struct ceph_dentry_info {
+	struct ceph_mds_session *lease_session;
+	u32 lease_gen, lease_shared_gen;
+	u32 lease_seq;
+	unsigned long lease_renew_after, lease_renew_from;
+	struct list_head lru;
+	struct dentry *dentry;
+	u64 time;
+	u64 offset;
+};
+
 struct ceph_inode_xattrs_info {
 	/*
 	 * (still encoded) xattr blob. we avoid the overhead of parsing
@@ -288,11 +230,6 @@ struct ceph_inode_xattrs_info {
 /*
  * Ceph inode.
  */
-#define CEPH_I_COMPLETE  1  /* we have complete directory cached */
-#define CEPH_I_NODELAY   4  /* do not delay cap release */
-#define CEPH_I_FLUSH     8  /* do not delay flush of dirty metadata */
-#define CEPH_I_NOFLUSH  16  /* do not flush dirty caps */
-
 struct ceph_inode_info {
 	struct ceph_vino i_vino;   /* ceph ino + snap */
 
@@ -336,7 +273,8 @@ struct ceph_inode_info {
 	unsigned i_cap_exporting_issued;
 	struct ceph_cap_reservation i_cap_migration_resv;
 	struct list_head i_cap_snaps;   /* snapped state pending flush to mds */
-	struct ceph_snap_context *i_head_snapc;  /* set if wr_buffer_head > 0 */
+	struct ceph_snap_context *i_head_snapc;  /* set if wr_buffer_head > 0 or
+						    dirty|flushing caps */
 	unsigned i_snap_caps;           /* cap bits for snapped files */
 
 	int i_nr_by_mode[CEPH_FILE_MODE_NUM];  /* open file counts */
@@ -382,6 +320,63 @@ static inline struct ceph_inode_info *ceph_inode(struct inode *inode)
 	return container_of(inode, struct ceph_inode_info, vfs_inode);
 }
 
+static inline struct ceph_vino ceph_vino(struct inode *inode)
+{
+	return ceph_inode(inode)->i_vino;
+}
+
+/*
+ * ino_t is <64 bits on many architectures, blech.
+ *
+ * don't include snap in ino hash, at least for now.
+ */
+static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
+{
+	ino_t ino = (ino_t)vino.ino;  /* ^ (vino.snap << 20); */
+#if BITS_PER_LONG == 32
+	ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8;
+	if (!ino)
+		ino = 1;
+#endif
+	return ino;
+}
+
+/* for printf-style formatting */
+#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
+
+static inline u64 ceph_ino(struct inode *inode)
+{
+	return ceph_inode(inode)->i_vino.ino;
+}
+static inline u64 ceph_snap(struct inode *inode)
+{
+	return ceph_inode(inode)->i_vino.snap;
+}
+
+static inline int ceph_ino_compare(struct inode *inode, void *data)
+{
+	struct ceph_vino *pvino = (struct ceph_vino *)data;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	return ci->i_vino.ino == pvino->ino &&
+		ci->i_vino.snap == pvino->snap;
+}
+
+static inline struct inode *ceph_find_inode(struct super_block *sb,
+					    struct ceph_vino vino)
+{
+	ino_t t = ceph_vino_to_ino(vino);
+	return ilookup5(sb, t, ceph_ino_compare, &vino);
+}
+
+
+/*
+ * Ceph inode.
+ */
+#define CEPH_I_COMPLETE  1  /* we have complete directory cached */
+#define CEPH_I_NODELAY   4  /* do not delay cap release */
+#define CEPH_I_FLUSH     8  /* do not delay flush of dirty metadata */
+#define CEPH_I_NOFLUSH  16  /* do not flush dirty caps */
+
 static inline void ceph_i_clear(struct inode *inode, unsigned mask)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
@@ -405,8 +400,9 @@ static inline bool ceph_i_test(struct inode *inode, unsigned mask)
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	bool r;
 
-	smp_mb();
+	spin_lock(&inode->i_lock);
 	r = (ci->i_ceph_flags & mask) == mask;
+	spin_unlock(&inode->i_lock);
 	return r;
 }
 
@@ -423,20 +419,6 @@ extern u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
 			    struct ceph_inode_frag *pfrag,
 			    int *found);
 
-/*
- * Ceph dentry state
- */
-struct ceph_dentry_info {
-	struct ceph_mds_session *lease_session;
-	u32 lease_gen, lease_shared_gen;
-	u32 lease_seq;
-	unsigned long lease_renew_after, lease_renew_from;
-	struct list_head lru;
-	struct dentry *dentry;
-	u64 time;
-	u64 offset;
-};
-
 static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry)
 {
 	return (struct ceph_dentry_info *)dentry->d_fsdata;
@@ -447,22 +429,6 @@ static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
 	return ((loff_t)frag << 32) | (loff_t)off;
 }
 
-/*
- * ino_t is <64 bits on many architectures, blech.
- *
- * don't include snap in ino hash, at least for now.
- */
-static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
-{
-	ino_t ino = (ino_t)vino.ino;  /* ^ (vino.snap << 20); */
-#if BITS_PER_LONG == 32
-	ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8;
-	if (!ino)
-		ino = 1;
-#endif
-	return ino;
-}
-
 static inline int ceph_set_ino_cb(struct inode *inode, void *data)
 {
 	ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
@@ -470,39 +436,6 @@ static inline int ceph_set_ino_cb(struct inode *inode, void *data)
 	return 0;
 }
 
-static inline struct ceph_vino ceph_vino(struct inode *inode)
-{
-	return ceph_inode(inode)->i_vino;
-}
-
-/* for printf-style formatting */
-#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
-
-static inline u64 ceph_ino(struct inode *inode)
-{
-	return ceph_inode(inode)->i_vino.ino;
-}
-static inline u64 ceph_snap(struct inode *inode)
-{
-	return ceph_inode(inode)->i_vino.snap;
-}
-
-static inline int ceph_ino_compare(struct inode *inode, void *data)
-{
-	struct ceph_vino *pvino = (struct ceph_vino *)data;
-	struct ceph_inode_info *ci = ceph_inode(inode);
-	return ci->i_vino.ino == pvino->ino &&
-		ci->i_vino.snap == pvino->snap;
-}
-
-static inline struct inode *ceph_find_inode(struct super_block *sb,
-					    struct ceph_vino vino)
-{
-	ino_t t = ceph_vino_to_ino(vino);
-	return ilookup5(sb, t, ceph_ino_compare, &vino);
-}
-
-
 /*
  * caps helpers
  */
@@ -560,23 +493,25 @@ static inline int __ceph_caps_wanted(struct ceph_inode_info *ci)
 /* what the mds thinks we want */
 extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci);
 
-extern void ceph_caps_init(void);
-extern void ceph_caps_finalize(void);
-extern void ceph_adjust_min_caps(int delta);
-extern int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need);
-extern int ceph_unreserve_caps(struct ceph_cap_reservation *ctx);
-extern void ceph_reservation_status(struct ceph_client *client,
+extern void ceph_caps_init(struct ceph_mds_client *mdsc);
+extern void ceph_caps_finalize(struct ceph_mds_client *mdsc);
+extern void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta);
+extern int ceph_reserve_caps(struct ceph_mds_client *mdsc,
+			     struct ceph_cap_reservation *ctx, int need);
+extern int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
+			       struct ceph_cap_reservation *ctx);
+extern void ceph_reservation_status(struct ceph_fs_client *client,
 				    int *total, int *avail, int *used,
 				    int *reserved, int *min);
 
-static inline struct ceph_client *ceph_inode_to_client(struct inode *inode)
+static inline struct ceph_fs_client *ceph_inode_to_client(struct inode *inode)
 {
-	return (struct ceph_client *)inode->i_sb->s_fs_info;
+	return (struct ceph_fs_client *)inode->i_sb->s_fs_info;
 }
 
-static inline struct ceph_client *ceph_sb_to_client(struct super_block *sb)
+static inline struct ceph_fs_client *ceph_sb_to_client(struct super_block *sb)
 {
-	return (struct ceph_client *)sb->s_fs_info;
+	return (struct ceph_fs_client *)sb->s_fs_info;
 }
 
 
@@ -606,51 +541,6 @@ struct ceph_file_info {
 
 
 /*
- * snapshots
- */
-
-/*
- * A "snap context" is the set of existing snapshots when we
- * write data.  It is used by the OSD to guide its COW behavior.
- *
- * The ceph_snap_context is refcounted, and attached to each dirty
- * page, indicating which context the dirty data belonged when it was
- * dirtied.
- */
-struct ceph_snap_context {
-	atomic_t nref;
-	u64 seq;
-	int num_snaps;
-	u64 snaps[];
-};
-
-static inline struct ceph_snap_context *
-ceph_get_snap_context(struct ceph_snap_context *sc)
-{
-	/*
-	printk("get_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
-	       atomic_read(&sc->nref)+1);
-	*/
-	if (sc)
-		atomic_inc(&sc->nref);
-	return sc;
-}
-
-static inline void ceph_put_snap_context(struct ceph_snap_context *sc)
-{
-	if (!sc)
-		return;
-	/*
-	printk("put_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
-	       atomic_read(&sc->nref)-1);
-	*/
-	if (atomic_dec_and_test(&sc->nref)) {
-		/*printk(" deleting snap_context %p\n", sc);*/
-		kfree(sc);
-	}
-}
-
-/*
  * A "snap realm" describes a subset of the file hierarchy sharing
  * the same set of snapshots that apply to it.  The realms themselves
  * are organized into a hierarchy, such that children inherit (some of)
@@ -679,6 +569,8 @@ struct ceph_snap_realm {
 
 	struct list_head empty_item;     /* if i have ref==0 */
 
+	struct list_head dirty_item;     /* if realm needs new context */
+
 	/* the current set of snaps for this realm */
 	struct ceph_snap_context *cached_context;
 
@@ -686,16 +578,33 @@ struct ceph_snap_realm {
 	spinlock_t inodes_with_caps_lock;
 };
 
-
-
-/*
- * calculate the number of pages a given length and offset map onto,
- * if we align the data.
- */
-static inline int calc_pages_for(u64 off, u64 len)
+static inline int default_congestion_kb(void)
 {
-	return ((off+len+PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) -
-		(off >> PAGE_CACHE_SHIFT);
+	int congestion_kb;
+
+	/*
+	 * Copied from NFS
+	 *
+	 * congestion size, scale with available memory.
+	 *
+	 *  64MB:    8192k
+	 * 128MB:   11585k
+	 * 256MB:   16384k
+	 * 512MB:   23170k
+	 *   1GB:   32768k
+	 *   2GB:   46340k
+	 *   4GB:   65536k
+	 *   8GB:   92681k
+	 *  16GB:  131072k
+	 *
+	 * This allows larger machines to have larger/more transfers.
+	 * Limit the default to 256M
+	 */
+	congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
+	if (congestion_kb > 256*1024)
+		congestion_kb = 256*1024;
+
+	return congestion_kb;
 }
 
 
@@ -728,23 +637,6 @@ static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci)
 			   ci_item)->writing;
 }
 
-
-/* super.c */
-extern struct kmem_cache *ceph_inode_cachep;
-extern struct kmem_cache *ceph_cap_cachep;
-extern struct kmem_cache *ceph_dentry_cachep;
-extern struct kmem_cache *ceph_file_cachep;
-
-extern const char *ceph_msg_type_name(int type);
-extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid);
-
-#define FSID_FORMAT "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-" \
-	"%02x%02x%02x%02x%02x%02x"
-#define PR_FSID(f) (f)->fsid[0], (f)->fsid[1], (f)->fsid[2], (f)->fsid[3], \
-		(f)->fsid[4], (f)->fsid[5], (f)->fsid[6], (f)->fsid[7],    \
-		(f)->fsid[8], (f)->fsid[9], (f)->fsid[10], (f)->fsid[11],  \
-		(f)->fsid[12], (f)->fsid[13], (f)->fsid[14], (f)->fsid[15]
-
 /* inode.c */
 extern const struct inode_operations ceph_file_iops;
 
@@ -806,20 +698,24 @@ static inline void ceph_remove_cap(struct ceph_cap *cap)
 	__ceph_remove_cap(cap);
 	spin_unlock(&inode->i_lock);
 }
-extern void ceph_put_cap(struct ceph_cap *cap);
+extern void ceph_put_cap(struct ceph_mds_client *mdsc,
+			 struct ceph_cap *cap);
 
 extern void ceph_queue_caps_release(struct inode *inode);
 extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc);
 extern int ceph_fsync(struct file *file, int datasync);
 extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
 				    struct ceph_mds_session *session);
+extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci,
+					     int mds);
 extern int ceph_get_cap_mds(struct inode *inode);
 extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps);
 extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had);
 extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
 				       struct ceph_snap_context *snapc);
 extern void __ceph_flush_snaps(struct ceph_inode_info *ci,
-			       struct ceph_mds_session **psession);
+			       struct ceph_mds_session **psession,
+			       int again);
 extern void ceph_check_caps(struct ceph_inode_info *ci, int flags,
 			    struct ceph_mds_session *session);
 extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc);
@@ -847,17 +743,23 @@ extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
 /* file.c */
 extern const struct file_operations ceph_file_fops;
 extern const struct address_space_operations ceph_aops;
+extern int ceph_copy_to_page_vector(struct page **pages,
+				    const char *data,
+				    loff_t off, size_t len);
+extern int ceph_copy_from_page_vector(struct page **pages,
+				    char *data,
+				    loff_t off, size_t len);
+extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags);
 extern int ceph_open(struct inode *inode, struct file *file);
 extern struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
 				       struct nameidata *nd, int mode,
 				       int locked_dir);
 extern int ceph_release(struct inode *inode, struct file *filp);
-extern void ceph_release_page_vector(struct page **pages, int num_pages);
 
 /* dir.c */
 extern const struct file_operations ceph_dir_fops;
 extern const struct inode_operations ceph_dir_iops;
-extern struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops,
+extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops,
 	ceph_snapdir_dentry_ops;
 
 extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry);
@@ -882,11 +784,13 @@ extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
 /* export.c */
 extern const struct export_operations ceph_export_ops;
 
-/* debugfs.c */
-extern int ceph_debugfs_init(void);
-extern void ceph_debugfs_cleanup(void);
-extern int ceph_debugfs_client_init(struct ceph_client *client);
-extern void ceph_debugfs_client_cleanup(struct ceph_client *client);
+/* locks.c */
+extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl);
+extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl);
+extern void ceph_count_locks(struct inode *inode, int *p_num, int *f_num);
+extern int ceph_encode_locks(struct inode *i, struct ceph_pagelist *p,
+			     int p_locks, int f_locks);
+extern int lock_to_ceph_filelock(struct file_lock *fl, struct ceph_filelock *c);
 
 static inline struct inode *get_dentry_parent_inode(struct dentry *dentry)
 {
@@ -896,4 +800,8 @@ static inline struct inode *get_dentry_parent_inode(struct dentry *dentry)
 	return NULL;
 }
 
+/* debugfs.c */
+extern int ceph_fs_debugfs_init(struct ceph_fs_client *client);
+extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client);
+
 #endif /* _FS_CEPH_SUPER_H */
diff --git a/fs/ceph/types.h b/fs/ceph/types.h
deleted file mode 100644
index 28b35a005ec..00000000000
--- a/fs/ceph/types.h
+++ /dev/null
@@ -1,29 +0,0 @@
-#ifndef _FS_CEPH_TYPES_H
-#define _FS_CEPH_TYPES_H
-
-/* needed before including ceph_fs.h */
-#include <linux/in.h>
-#include <linux/types.h>
-#include <linux/fcntl.h>
-#include <linux/string.h>
-
-#include "ceph_fs.h"
-#include "ceph_frag.h"
-#include "ceph_hash.h"
-
-/*
- * Identify inodes by both their ino AND snapshot id (a u64).
- */
-struct ceph_vino {
-	u64 ino;
-	u64 snap;
-};
-
-
-/* context for the caps reservation mechanism */
-struct ceph_cap_reservation {
-	int count;
-};
-
-
-#endif
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 68aeebc6968..6e12a6ba5f7 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -1,6 +1,9 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
+
 #include "super.h"
-#include "decode.h"
+#include "mds_client.h"
+
+#include <linux/ceph/decode.h>
 
 #include <linux/xattr.h>
 #include <linux/slab.h>
@@ -337,6 +340,8 @@ void __ceph_destroy_xattrs(struct ceph_inode_info *ci)
 }
 
 static int __build_xattrs(struct inode *inode)
+	__releases(inode->i_lock)
+	__acquires(inode->i_lock)
 {
 	u32 namelen;
 	u32 numattr = 0;
@@ -483,6 +488,7 @@ void __ceph_build_xattrs_blob(struct ceph_inode_info *ci)
 		ci->i_xattrs.blob = ci->i_xattrs.prealloc_blob;
 		ci->i_xattrs.prealloc_blob = NULL;
 		ci->i_xattrs.dirty = false;
+		ci->i_xattrs.version++;
 	}
 }
 
@@ -617,12 +623,12 @@ out:
 static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
 			      const char *value, size_t size, int flags)
 {
-	struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
+	struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
 	struct inode *inode = dentry->d_inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct inode *parent_inode = dentry->d_parent->d_inode;
 	struct ceph_mds_request *req;
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	int err;
 	int i, nr_pages;
 	struct page **pages = NULL;
@@ -710,10 +716,9 @@ int ceph_setxattr(struct dentry *dentry, const char *name,
 
 	/* preallocate memory for xattr name, value, index node */
 	err = -ENOMEM;
-	newname = kmalloc(name_len + 1, GFP_NOFS);
+	newname = kmemdup(name, name_len + 1, GFP_NOFS);
 	if (!newname)
 		goto out;
-	memcpy(newname, name, name_len + 1);
 
 	if (val_len) {
 		newval = kmalloc(val_len + 1, GFP_NOFS);
@@ -774,8 +779,8 @@ out:
 
 static int ceph_send_removexattr(struct dentry *dentry, const char *name)
 {
-	struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct inode *inode = dentry->d_inode;
 	struct inode *parent_inode = dentry->d_parent->d_inode;
 	struct ceph_mds_request *req;
diff --git a/fs/char_dev.c b/fs/char_dev.c
index d6db933df2b..e5b9df993b9 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -20,6 +20,7 @@
 #include <linux/cdev.h>
 #include <linux/mutex.h>
 #include <linux/backing-dev.h>
+#include <linux/tty.h>
 
 #include "internal.h"
 
@@ -39,7 +40,9 @@ struct backing_dev_info directly_mappable_cdev_bdi = {
 #endif
 		/* permit direct mmap, for read, write or exec */
 		BDI_CAP_MAP_DIRECT |
-		BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP),
+		BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP |
+		/* no writeback happens */
+		BDI_CAP_NO_ACCT_AND_WRITEBACK),
 };
 
 static struct kobj_map *cdev_map;
@@ -453,6 +456,7 @@ static void cdev_purge(struct cdev *cdev)
  */
 const struct file_operations def_chr_fops = {
 	.open = chrdev_open,
+	.llseek = noop_llseek,
 };
 
 static struct kobject *exact_match(dev_t dev, int *part, void *data)
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 80f35259680..0ed213970ce 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -2,7 +2,9 @@ config CIFS
 	tristate "CIFS support (advanced network filesystem, SMBFS successor)"
 	depends on INET
 	select NLS
-	select SLOW_WORK
+	select CRYPTO
+	select CRYPTO_MD5
+	select CRYPTO_ARC4
 	help
 	  This is the client VFS module for the Common Internet File System
 	  (CIFS) protocol which is the successor to the Server Message Block
@@ -71,14 +73,14 @@ config CIFS_WEAK_PW_HASH
 	  If unsure, say N.
 
 config CIFS_UPCALL
-	  bool "Kerberos/SPNEGO advanced session setup"
-	  depends on CIFS && KEYS
-	  help
-	    Enables an upcall mechanism for CIFS which accesses
-	    userspace helper utilities to provide SPNEGO packaged (RFC 4178)
-	    Kerberos tickets which are needed to mount to certain secure servers
-	    (for which more secure Kerberos authentication is required). If
-	    unsure, say N.
+	bool "Kerberos/SPNEGO advanced session setup"
+	depends on CIFS && KEYS
+	select DNS_RESOLVER
+	help
+	  Enables an upcall mechanism for CIFS which accesses userspace helper
+	  utilities to provide SPNEGO packaged (RFC 4178) Kerberos tickets
+	  which are needed to mount to certain secure servers (for which more
+	  secure Kerberos authentication is required). If unsure, say N.
 
 config CIFS_XATTR
         bool "CIFS extended attributes"
@@ -122,6 +124,7 @@ config CIFS_DEBUG2
 config CIFS_DFS_UPCALL
 	  bool "DFS feature support"
 	  depends on CIFS && KEYS
+	  select DNS_RESOLVER
 	  help
 	    Distributed File System (DFS) support is used to access shares
 	    transparently in an enterprise name space, even if the share
@@ -131,6 +134,15 @@ config CIFS_DFS_UPCALL
 	    IP addresses) which is needed for implicit mounts of DFS junction
 	    points. If unsure, say N.
 
+config CIFS_FSCACHE
+	  bool "Provide CIFS client caching support (EXPERIMENTAL)"
+	  depends on EXPERIMENTAL
+	  depends on CIFS=m && FSCACHE || CIFS=y && FSCACHE=y
+	  help
+	    Makes CIFS FS-Cache capable. Say Y here if you want your CIFS data
+	    to be cached locally on disk through the general filesystem cache
+	    manager. If unsure, say N.
+
 config CIFS_EXPERIMENTAL
 	  bool "CIFS Experimental Features (EXPERIMENTAL)"
 	  depends on CIFS && EXPERIMENTAL
diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index 9948c0030e8..adefa60a9bd 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -11,3 +11,5 @@ cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \
 cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o
 
 cifs-$(CONFIG_CIFS_DFS_UPCALL) += dns_resolve.o cifs_dfs_ref.o
+
+cifs-$(CONFIG_CIFS_FSCACHE) += fscache.o cache.o
diff --git a/fs/cifs/README b/fs/cifs/README
index a727b7cb075..ee68d103654 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -301,6 +301,16 @@ A partial list of the supported mount options follows:
   gid		Set the default gid for inodes (similar to above).
   file_mode     If CIFS Unix extensions are not supported by the server
 		this overrides the default mode for file inodes.
+  fsc		Enable local disk caching using FS-Cache (off by default). This
+  		option could be useful to improve performance on a slow link,
+		heavily loaded server and/or network where reading from the
+		disk is faster than reading from the server (over the network).
+		This could also impact scalability positively as the
+		number of calls to the server are reduced. However, local
+		caching is not suitable for all workloads for e.g. read-once
+		type workloads. So, you need to consider carefully your
+		workload/scenario before using this option. Currently, local
+		disk caching is functional for CIFS files opened as read-only.
   dir_mode      If CIFS Unix extensions are not supported by the server 
 		this overrides the default mode for directory inodes.
   port		attempt to contact the server on this tcp port, before
@@ -517,6 +527,11 @@ A partial list of the supported mount options follows:
 		SFU does).  In the future the bottom 9 bits of the
 		mode also will be emulated using queries of the security
 		descriptor (ACL).
+ mfsymlinks     Enable support for Minshall+French symlinks
+		(see http://wiki.samba.org/index.php/UNIX_Extensions#Minshall.2BFrench_symlinks)
+		This option is ignored when specified together with the
+		'sfu' option. Minshall+French symlinks are used even if
+		the server supports the CIFS Unix Extensions.
  sign           Must use packet signing (helps avoid unwanted data modification
 		by intermediate systems in the route).  Note that signing
 		does not work with lanman or plaintext authentication.
@@ -568,8 +583,9 @@ module can be displayed via modinfo.
 Misc /proc/fs/cifs Flags and Debug Info
 =======================================
 Informational pseudo-files:
-DebugData		Displays information about active CIFS sessions
-			and shares, as well as the cifs.ko version.
+DebugData		Displays information about active CIFS sessions and
+			shares, features enabled as well as the cifs.ko
+			version.
 Stats			Lists summary resource usage information as well as per
 			share statistics, if CONFIG_CIFS_STATS in enabled
 			in the kernel configuration.
diff --git a/fs/cifs/cache.c b/fs/cifs/cache.c
new file mode 100644
index 00000000000..224d7bbd1fc
--- /dev/null
+++ b/fs/cifs/cache.c
@@ -0,0 +1,331 @@
+/*
+ *   fs/cifs/cache.c - CIFS filesystem cache index structure definitions
+ *
+ *   Copyright (c) 2010 Novell, Inc.
+ *   Authors(s): Suresh Jayaraman (sjayaraman@suse.de>
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include "fscache.h"
+#include "cifs_debug.h"
+
+/*
+ * CIFS filesystem definition for FS-Cache
+ */
+struct fscache_netfs cifs_fscache_netfs = {
+	.name = "cifs",
+	.version = 0,
+};
+
+/*
+ * Register CIFS for caching with FS-Cache
+ */
+int cifs_fscache_register(void)
+{
+	return fscache_register_netfs(&cifs_fscache_netfs);
+}
+
+/*
+ * Unregister CIFS for caching
+ */
+void cifs_fscache_unregister(void)
+{
+	fscache_unregister_netfs(&cifs_fscache_netfs);
+}
+
+/*
+ * Key layout of CIFS server cache index object
+ */
+struct cifs_server_key {
+	uint16_t	family;		/* address family */
+	uint16_t	port;		/* IP port */
+	union {
+		struct in_addr	ipv4_addr;
+		struct in6_addr	ipv6_addr;
+	} addr[0];
+};
+
+/*
+ * Server object keyed by {IPaddress,port,family} tuple
+ */
+static uint16_t cifs_server_get_key(const void *cookie_netfs_data,
+				   void *buffer, uint16_t maxbuf)
+{
+	const struct TCP_Server_Info *server = cookie_netfs_data;
+	const struct sockaddr *sa = (struct sockaddr *) &server->addr.sockAddr;
+	struct cifs_server_key *key = buffer;
+	uint16_t key_len = sizeof(struct cifs_server_key);
+
+	memset(key, 0, key_len);
+
+	/*
+	 * Should not be a problem as sin_family/sin6_family overlays
+	 * sa_family field
+	 */
+	switch (sa->sa_family) {
+	case AF_INET:
+		key->family = server->addr.sockAddr.sin_family;
+		key->port = server->addr.sockAddr.sin_port;
+		key->addr[0].ipv4_addr = server->addr.sockAddr.sin_addr;
+		key_len += sizeof(key->addr[0].ipv4_addr);
+		break;
+
+	case AF_INET6:
+		key->family = server->addr.sockAddr6.sin6_family;
+		key->port = server->addr.sockAddr6.sin6_port;
+		key->addr[0].ipv6_addr = server->addr.sockAddr6.sin6_addr;
+		key_len += sizeof(key->addr[0].ipv6_addr);
+		break;
+
+	default:
+		cERROR(1, "CIFS: Unknown network family '%d'", sa->sa_family);
+		key_len = 0;
+		break;
+	}
+
+	return key_len;
+}
+
+/*
+ * Server object for FS-Cache
+ */
+const struct fscache_cookie_def cifs_fscache_server_index_def = {
+	.name = "CIFS.server",
+	.type = FSCACHE_COOKIE_TYPE_INDEX,
+	.get_key = cifs_server_get_key,
+};
+
+/*
+ * Auxiliary data attached to CIFS superblock within the cache
+ */
+struct cifs_fscache_super_auxdata {
+	u64	resource_id;		/* unique server resource id */
+};
+
+static char *extract_sharename(const char *treename)
+{
+	const char *src;
+	char *delim, *dst;
+	int len;
+
+	/* skip double chars at the beginning */
+	src = treename + 2;
+
+	/* share name is always preceded by '\\' now */
+	delim = strchr(src, '\\');
+	if (!delim)
+		return ERR_PTR(-EINVAL);
+	delim++;
+	len = strlen(delim);
+
+	/* caller has to free the memory */
+	dst = kstrndup(delim, len, GFP_KERNEL);
+	if (!dst)
+		return ERR_PTR(-ENOMEM);
+
+	return dst;
+}
+
+/*
+ * Superblock object currently keyed by share name
+ */
+static uint16_t cifs_super_get_key(const void *cookie_netfs_data, void *buffer,
+				   uint16_t maxbuf)
+{
+	const struct cifsTconInfo *tcon = cookie_netfs_data;
+	char *sharename;
+	uint16_t len;
+
+	sharename = extract_sharename(tcon->treeName);
+	if (IS_ERR(sharename)) {
+		cFYI(1, "CIFS: couldn't extract sharename\n");
+		sharename = NULL;
+		return 0;
+	}
+
+	len = strlen(sharename);
+	if (len > maxbuf)
+		return 0;
+
+	memcpy(buffer, sharename, len);
+
+	kfree(sharename);
+
+	return len;
+}
+
+static uint16_t
+cifs_fscache_super_get_aux(const void *cookie_netfs_data, void *buffer,
+			   uint16_t maxbuf)
+{
+	struct cifs_fscache_super_auxdata auxdata;
+	const struct cifsTconInfo *tcon = cookie_netfs_data;
+
+	memset(&auxdata, 0, sizeof(auxdata));
+	auxdata.resource_id = tcon->resource_id;
+
+	if (maxbuf > sizeof(auxdata))
+		maxbuf = sizeof(auxdata);
+
+	memcpy(buffer, &auxdata, maxbuf);
+
+	return maxbuf;
+}
+
+static enum
+fscache_checkaux cifs_fscache_super_check_aux(void *cookie_netfs_data,
+					      const void *data,
+					      uint16_t datalen)
+{
+	struct cifs_fscache_super_auxdata auxdata;
+	const struct cifsTconInfo *tcon = cookie_netfs_data;
+
+	if (datalen != sizeof(auxdata))
+		return FSCACHE_CHECKAUX_OBSOLETE;
+
+	memset(&auxdata, 0, sizeof(auxdata));
+	auxdata.resource_id = tcon->resource_id;
+
+	if (memcmp(data, &auxdata, datalen) != 0)
+		return FSCACHE_CHECKAUX_OBSOLETE;
+
+	return FSCACHE_CHECKAUX_OKAY;
+}
+
+/*
+ * Superblock object for FS-Cache
+ */
+const struct fscache_cookie_def cifs_fscache_super_index_def = {
+	.name = "CIFS.super",
+	.type = FSCACHE_COOKIE_TYPE_INDEX,
+	.get_key = cifs_super_get_key,
+	.get_aux = cifs_fscache_super_get_aux,
+	.check_aux = cifs_fscache_super_check_aux,
+};
+
+/*
+ * Auxiliary data attached to CIFS inode within the cache
+ */
+struct cifs_fscache_inode_auxdata {
+	struct timespec	last_write_time;
+	struct timespec	last_change_time;
+	u64		eof;
+};
+
+static uint16_t cifs_fscache_inode_get_key(const void *cookie_netfs_data,
+					   void *buffer, uint16_t maxbuf)
+{
+	const struct cifsInodeInfo *cifsi = cookie_netfs_data;
+	uint16_t keylen;
+
+	/* use the UniqueId as the key */
+	keylen = sizeof(cifsi->uniqueid);
+	if (keylen > maxbuf)
+		keylen = 0;
+	else
+		memcpy(buffer, &cifsi->uniqueid, keylen);
+
+	return keylen;
+}
+
+static void
+cifs_fscache_inode_get_attr(const void *cookie_netfs_data, uint64_t *size)
+{
+	const struct cifsInodeInfo *cifsi = cookie_netfs_data;
+
+	*size = cifsi->vfs_inode.i_size;
+}
+
+static uint16_t
+cifs_fscache_inode_get_aux(const void *cookie_netfs_data, void *buffer,
+			   uint16_t maxbuf)
+{
+	struct cifs_fscache_inode_auxdata auxdata;
+	const struct cifsInodeInfo *cifsi = cookie_netfs_data;
+
+	memset(&auxdata, 0, sizeof(auxdata));
+	auxdata.eof = cifsi->server_eof;
+	auxdata.last_write_time = cifsi->vfs_inode.i_mtime;
+	auxdata.last_change_time = cifsi->vfs_inode.i_ctime;
+
+	if (maxbuf > sizeof(auxdata))
+		maxbuf = sizeof(auxdata);
+
+	memcpy(buffer, &auxdata, maxbuf);
+
+	return maxbuf;
+}
+
+static enum
+fscache_checkaux cifs_fscache_inode_check_aux(void *cookie_netfs_data,
+					      const void *data,
+					      uint16_t datalen)
+{
+	struct cifs_fscache_inode_auxdata auxdata;
+	struct cifsInodeInfo *cifsi = cookie_netfs_data;
+
+	if (datalen != sizeof(auxdata))
+		return FSCACHE_CHECKAUX_OBSOLETE;
+
+	memset(&auxdata, 0, sizeof(auxdata));
+	auxdata.eof = cifsi->server_eof;
+	auxdata.last_write_time = cifsi->vfs_inode.i_mtime;
+	auxdata.last_change_time = cifsi->vfs_inode.i_ctime;
+
+	if (memcmp(data, &auxdata, datalen) != 0)
+		return FSCACHE_CHECKAUX_OBSOLETE;
+
+	return FSCACHE_CHECKAUX_OKAY;
+}
+
+static void cifs_fscache_inode_now_uncached(void *cookie_netfs_data)
+{
+	struct cifsInodeInfo *cifsi = cookie_netfs_data;
+	struct pagevec pvec;
+	pgoff_t first;
+	int loop, nr_pages;
+
+	pagevec_init(&pvec, 0);
+	first = 0;
+
+	cFYI(1, "cifs inode 0x%p now uncached", cifsi);
+
+	for (;;) {
+		nr_pages = pagevec_lookup(&pvec,
+					  cifsi->vfs_inode.i_mapping, first,
+					  PAGEVEC_SIZE - pagevec_count(&pvec));
+		if (!nr_pages)
+			break;
+
+		for (loop = 0; loop < nr_pages; loop++)
+			ClearPageFsCache(pvec.pages[loop]);
+
+		first = pvec.pages[nr_pages - 1]->index + 1;
+
+		pvec.nr = nr_pages;
+		pagevec_release(&pvec);
+		cond_resched();
+	}
+}
+
+const struct fscache_cookie_def cifs_fscache_inode_object_def = {
+	.name		= "CIFS.uniqueid",
+	.type		= FSCACHE_COOKIE_TYPE_DATAFILE,
+	.get_key	= cifs_fscache_inode_get_key,
+	.get_attr	= cifs_fscache_inode_get_attr,
+	.get_aux	= cifs_fscache_inode_get_aux,
+	.check_aux	= cifs_fscache_inode_check_aux,
+	.now_uncached	= cifs_fscache_inode_now_uncached,
+};
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 4fce6e61b34..103ab8b605b 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -119,11 +119,36 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
 		    "Display Internal CIFS Data Structures for Debugging\n"
 		    "---------------------------------------------------\n");
 	seq_printf(m, "CIFS Version %s\n", CIFS_VERSION);
+	seq_printf(m, "Features: ");
+#ifdef CONFIG_CIFS_DFS_UPCALL
+	seq_printf(m, "dfs");
+	seq_putc(m, ' ');
+#endif
+#ifdef CONFIG_CIFS_FSCACHE
+	seq_printf(m, "fscache");
+	seq_putc(m, ' ');
+#endif
+#ifdef CONFIG_CIFS_WEAK_PW_HASH
+	seq_printf(m, "lanman");
+	seq_putc(m, ' ');
+#endif
+#ifdef CONFIG_CIFS_POSIX
+	seq_printf(m, "posix");
+	seq_putc(m, ' ');
+#endif
+#ifdef CONFIG_CIFS_UPCALL
+	seq_printf(m, "spnego");
+	seq_putc(m, ' ');
+#endif
+#ifdef CONFIG_CIFS_XATTR
+	seq_printf(m, "xattr");
+#endif
+	seq_putc(m, '\n');
 	seq_printf(m, "Active VFS Requests: %d\n", GlobalTotalActiveXid);
 	seq_printf(m, "Servers:");
 
 	i = 0;
-	read_lock(&cifs_tcp_ses_lock);
+	spin_lock(&cifs_tcp_ses_lock);
 	list_for_each(tmp1, &cifs_tcp_ses_list) {
 		server = list_entry(tmp1, struct TCP_Server_Info,
 				    tcp_ses_list);
@@ -205,7 +230,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
 			spin_unlock(&GlobalMid_Lock);
 		}
 	}
-	read_unlock(&cifs_tcp_ses_lock);
+	spin_unlock(&cifs_tcp_ses_lock);
 	seq_putc(m, '\n');
 
 	/* BB add code to dump additional info such as TCP session info now */
@@ -245,7 +270,7 @@ static ssize_t cifs_stats_proc_write(struct file *file,
 		atomic_set(&totBufAllocCount, 0);
 		atomic_set(&totSmBufAllocCount, 0);
 #endif /* CONFIG_CIFS_STATS2 */
-		read_lock(&cifs_tcp_ses_lock);
+		spin_lock(&cifs_tcp_ses_lock);
 		list_for_each(tmp1, &cifs_tcp_ses_list) {
 			server = list_entry(tmp1, struct TCP_Server_Info,
 					    tcp_ses_list);
@@ -278,7 +303,7 @@ static ssize_t cifs_stats_proc_write(struct file *file,
 				}
 			}
 		}
-		read_unlock(&cifs_tcp_ses_lock);
+		spin_unlock(&cifs_tcp_ses_lock);
 	}
 
 	return count;
@@ -318,7 +343,7 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v)
 		GlobalCurrentXid, GlobalMaxActiveXid);
 
 	i = 0;
-	read_lock(&cifs_tcp_ses_lock);
+	spin_lock(&cifs_tcp_ses_lock);
 	list_for_each(tmp1, &cifs_tcp_ses_list) {
 		server = list_entry(tmp1, struct TCP_Server_Info,
 				    tcp_ses_list);
@@ -372,7 +397,7 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v)
 			}
 		}
 	}
-	read_unlock(&cifs_tcp_ses_lock);
+	spin_unlock(&cifs_tcp_ses_lock);
 
 	seq_putc(m, '\n');
 	return 0;
diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h
index aa316891ac0..8942b28cf80 100644
--- a/fs/cifs/cifs_debug.h
+++ b/fs/cifs/cifs_debug.h
@@ -34,7 +34,7 @@ void cifs_dump_mids(struct TCP_Server_Info *);
 extern int traceSMB;		/* flag which enables the function below */
 void dump_smb(struct smb_hdr *, int);
 #define CIFS_INFO	0x01
-#define CIFS_RC  	0x02
+#define CIFS_RC		0x02
 #define CIFS_TIMER	0x04
 
 /*
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index ac19a6f3dae..c68a056f27f 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -44,8 +44,7 @@ static void cifs_dfs_expire_automounts(struct work_struct *work)
 void cifs_dfs_release_automount_timer(void)
 {
 	BUG_ON(!list_empty(&cifs_dfs_automount_list));
-	cancel_delayed_work(&cifs_dfs_automount_task);
-	flush_scheduled_work();
+	cancel_delayed_work_sync(&cifs_dfs_automount_task);
 }
 
 /**
@@ -141,7 +140,7 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
 	}
 
 	rc = dns_resolve_server_name_to_ip(*devname, &srvIP);
-	if (rc != 0) {
+	if (rc < 0) {
 		cERROR(1, "%s: Failed to resolve server part of %s to IP: %d",
 			  __func__, *devname, rc);
 		goto compose_mount_options_err;
@@ -150,8 +149,7 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
 	 * assuming that we have 'unc=' and 'ip=' in
 	 * the original sb_mountdata
 	 */
-	md_len = strlen(sb_mountdata) + strlen(srvIP) +
-		strlen(ref->node_name) + 12;
+	md_len = strlen(sb_mountdata) + rc + strlen(ref->node_name) + 12;
 	mountdata = kzalloc(md_len+1, GFP_KERNEL);
 	if (mountdata == NULL) {
 		rc = -ENOMEM;
@@ -230,28 +228,22 @@ compose_mount_options_err:
 	goto compose_mount_options_out;
 }
 
-
-static struct vfsmount *cifs_dfs_do_refmount(const struct vfsmount *mnt_parent,
-		struct dentry *dentry, const struct dfs_info3_param *ref)
+/**
+ * cifs_dfs_do_refmount - mounts specified path using provided refferal
+ * @cifs_sb:		parent/root superblock
+ * @fullpath:		full path in UNC format
+ * @ref:		server's referral
+ */
+static struct vfsmount *cifs_dfs_do_refmount(struct cifs_sb_info *cifs_sb,
+		const char *fullpath, const struct dfs_info3_param *ref)
 {
-	struct cifs_sb_info *cifs_sb;
 	struct vfsmount *mnt;
 	char *mountdata;
 	char *devname = NULL;
-	char *fullpath;
-
-	cifs_sb = CIFS_SB(dentry->d_inode->i_sb);
-	/*
-	 * this function gives us a path with a double backslash prefix. We
-	 * require a single backslash for DFS.
-	 */
-	fullpath = build_path_from_dentry(dentry);
-	if (!fullpath)
-		return ERR_PTR(-ENOMEM);
 
+	/* strip first '\' from fullpath */
 	mountdata = cifs_compose_mount_options(cifs_sb->mountdata,
 			fullpath + 1, ref, &devname);
-	kfree(fullpath);
 
 	if (IS_ERR(mountdata))
 		return (struct vfsmount *)mountdata;
@@ -313,6 +305,7 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
 	int xid, i;
 	int rc = 0;
 	struct vfsmount *mnt = ERR_PTR(-ENOENT);
+	struct tcon_link *tlink;
 
 	cFYI(1, "in %s", __func__);
 	BUG_ON(IS_ROOT(dentry));
@@ -322,14 +315,6 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
 	dput(nd->path.dentry);
 	nd->path.dentry = dget(dentry);
 
-	cifs_sb = CIFS_SB(dentry->d_inode->i_sb);
-	ses = cifs_sb->tcon->ses;
-
-	if (!ses) {
-		rc = -EINVAL;
-		goto out_err;
-	}
-
 	/*
 	 * The MSDFS spec states that paths in DFS referral requests and
 	 * responses must be prefixed by a single '\' character instead of
@@ -342,10 +327,20 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
 		goto out_err;
 	}
 
-	rc = get_dfs_path(xid, ses , full_path + 1, cifs_sb->local_nls,
+	cifs_sb = CIFS_SB(dentry->d_inode->i_sb);
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink)) {
+		rc = PTR_ERR(tlink);
+		goto out_err;
+	}
+	ses = tlink_tcon(tlink)->ses;
+
+	rc = get_dfs_path(xid, ses, full_path + 1, cifs_sb->local_nls,
 		&num_referrals, &referrals,
 		cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
 
+	cifs_put_tlink(tlink);
+
 	for (i = 0; i < num_referrals; i++) {
 		int len;
 		dump_referral(referrals+i);
@@ -357,8 +352,8 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
 			rc = -EINVAL;
 			goto out_err;
 		}
-		mnt = cifs_dfs_do_refmount(nd->path.mnt,
-				nd->path.dentry, referrals + i);
+		mnt = cifs_dfs_do_refmount(cifs_sb,
+				full_path, referrals + i);
 		cFYI(1, "%s: cifs_dfs_do_refmount:%s , mnt:%p", __func__,
 					referrals[i].node_name, mnt);
 
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 246a167cb91..525ba59a410 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -15,6 +15,8 @@
  *   the GNU Lesser General Public License for more details.
  *
  */
+#include <linux/radix-tree.h>
+
 #ifndef _CIFS_FS_SB_H
 #define _CIFS_FS_SB_H
 
@@ -35,23 +37,29 @@
 #define CIFS_MOUNT_DYNPERM      0x1000 /* allow in-memory only mode setting   */
 #define CIFS_MOUNT_NOPOSIXBRL   0x2000 /* mandatory not posix byte range lock */
 #define CIFS_MOUNT_NOSSYNC      0x4000 /* don't do slow SMBflush on every sync*/
+#define CIFS_MOUNT_FSCACHE	0x8000 /* local caching enabled */
+#define CIFS_MOUNT_MF_SYMLINKS	0x10000 /* Minshall+French Symlinks enabled */
+#define CIFS_MOUNT_MULTIUSER	0x20000 /* multiuser mount */
 
 struct cifs_sb_info {
-	struct cifsTconInfo *tcon;	/* primary mount */
-	struct list_head nested_tcon_q;
+	struct radix_tree_root tlink_tree;
+#define CIFS_TLINK_MASTER_TAG		0	/* is "master" (mount) tcon */
+	spinlock_t tlink_tree_lock;
 	struct nls_table *local_nls;
 	unsigned int rsize;
 	unsigned int wsize;
+	atomic_t active;
 	uid_t	mnt_uid;
 	gid_t	mnt_gid;
 	mode_t	mnt_file_mode;
 	mode_t	mnt_dir_mode;
-	int     mnt_cifs_flags;
+	unsigned int mnt_cifs_flags;
 	int	prepathlen;
 	char   *prepath; /* relative path under the share to mount to */
 #ifdef CONFIG_CIFS_DFS_UPCALL
 	char   *mountdata; /* mount options received at mount time */
 #endif
 	struct backing_dev_info bdi;
+	struct delayed_work prune_tlinks;
 };
 #endif				/* _CIFS_FS_SB_H */
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 379bd7d9c05..87044906cd1 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -84,6 +84,9 @@ struct key_type cifs_spnego_key_type = {
 /* strlen of ";uid=0x" */
 #define UID_KEY_LEN		7
 
+/* strlen of ";creduid=0x" */
+#define CREDUID_KEY_LEN		11
+
 /* strlen of ";user=" */
 #define USER_KEY_LEN		6
 
@@ -107,6 +110,7 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
 		   IP_KEY_LEN + INET6_ADDRSTRLEN +
 		   MAX_MECH_STR_LEN +
 		   UID_KEY_LEN + (sizeof(uid_t) * 2) +
+		   CREDUID_KEY_LEN + (sizeof(uid_t) * 2) +
 		   USER_KEY_LEN + strlen(sesInfo->userName) +
 		   PID_KEY_LEN + (sizeof(pid_t) * 2) + 1;
 
@@ -144,6 +148,9 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
 	sprintf(dp, ";uid=0x%x", sesInfo->linux_uid);
 
 	dp = description + strlen(description);
+	sprintf(dp, ";creduid=0x%x", sesInfo->cred_uid);
+
+	dp = description + strlen(description);
 	sprintf(dp, ";user=%s", sesInfo->userName);
 
 	dp = description + strlen(description);
diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h
index 650638275a6..7fe6b52df50 100644
--- a/fs/cifs/cifs_unicode.h
+++ b/fs/cifs/cifs_unicode.h
@@ -30,6 +30,8 @@
  *     This is a compressed table of upper and lower case conversion.
  *
  */
+#ifndef _CIFS_UNICODE_H
+#define _CIFS_UNICODE_H
 
 #include <asm/byteorder.h>
 #include <linux/types.h>
@@ -67,8 +69,8 @@ extern const struct UniCaseRange CifsUniUpperRange[];
 #endif				/* UNIUPR_NOUPPER */
 
 #ifndef UNIUPR_NOLOWER
-extern signed char UniLowerTable[512];
-extern struct UniCaseRange UniLowerRange[];
+extern signed char CifsUniLowerTable[512];
+extern const struct UniCaseRange CifsUniLowerRange[];
 #endif				/* UNIUPR_NOLOWER */
 
 #ifdef __KERNEL__
@@ -337,15 +339,15 @@ UniStrupr(register wchar_t *upin)
  * UniTolower:  Convert a unicode character to lower case
  */
 static inline wchar_t
-UniTolower(wchar_t uc)
+UniTolower(register wchar_t uc)
 {
-	register struct UniCaseRange *rp;
+	register const struct UniCaseRange *rp;
 
-	if (uc < sizeof(UniLowerTable)) {
+	if (uc < sizeof(CifsUniLowerTable)) {
 		/* Latin characters */
-		return uc + UniLowerTable[uc];	/* Use base tables */
+		return uc + CifsUniLowerTable[uc];	/* Use base tables */
 	} else {
-		rp = UniLowerRange;	/* Use range tables */
+		rp = CifsUniLowerRange;	/* Use range tables */
 		while (rp->start) {
 			if (uc < rp->start)	/* Before start of range */
 				return uc;	/* Uppercase = input */
@@ -374,3 +376,5 @@ UniStrlwr(register wchar_t *upin)
 }
 
 #endif
+
+#endif /* _CIFS_UNICODE_H */
diff --git a/fs/cifs/cifs_uniupr.h b/fs/cifs/cifs_uniupr.h
index 18a9d978e51..0ac7c5a8633 100644
--- a/fs/cifs/cifs_uniupr.h
+++ b/fs/cifs/cifs_uniupr.h
@@ -140,7 +140,7 @@ const struct UniCaseRange CifsUniUpperRange[] = {
 /*
  * Latin lower case
  */
-static signed char CifsUniLowerTable[512] = {
+signed char CifsUniLowerTable[512] = {
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 000-00f */
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 010-01f */
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 020-02f */
@@ -242,12 +242,12 @@ static signed char UniCaseRangeLff20[27] = {
 /*
  * Lower Case Range
  */
-static const struct UniCaseRange CifsUniLowerRange[] = {
-	0x0380, 0x03ab, UniCaseRangeL0380,
-	0x0400, 0x042f, UniCaseRangeL0400,
-	0x0490, 0x04cb, UniCaseRangeL0490,
-	0x1e00, 0x1ff7, UniCaseRangeL1e00,
-	0xff20, 0xff3a, UniCaseRangeLff20,
-	0, 0, 0
+const struct UniCaseRange CifsUniLowerRange[] = {
+	{0x0380, 0x03ab, UniCaseRangeL0380},
+	{0x0400, 0x042f, UniCaseRangeL0400},
+	{0x0490, 0x04cb, UniCaseRangeL0490},
+	{0x1e00, 0x1ff7, UniCaseRangeL1e00},
+	{0xff20, 0xff3a, UniCaseRangeLff20},
+	{0}
 };
 #endif
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 85d7cf7ff2c..c9b4792ae82 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -557,11 +557,16 @@ static struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb,
 {
 	struct cifs_ntsd *pntsd = NULL;
 	int xid, rc;
+	struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
+
+	if (IS_ERR(tlink))
+		return NULL;
 
 	xid = GetXid();
-	rc = CIFSSMBGetCIFSACL(xid, cifs_sb->tcon, fid, &pntsd, pacllen);
+	rc = CIFSSMBGetCIFSACL(xid, tlink_tcon(tlink), fid, &pntsd, pacllen);
 	FreeXid(xid);
 
+	cifs_put_tlink(tlink);
 
 	cFYI(1, "GetCIFSACL rc = %d ACL len %d", rc, *pacllen);
 	return pntsd;
@@ -574,10 +579,16 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
 	int oplock = 0;
 	int xid, rc;
 	__u16 fid;
+	struct cifsTconInfo *tcon;
+	struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
+
+	if (IS_ERR(tlink))
+		return NULL;
 
+	tcon = tlink_tcon(tlink);
 	xid = GetXid();
 
-	rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN, READ_CONTROL, 0,
+	rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, READ_CONTROL, 0,
 			 &fid, &oplock, NULL, cifs_sb->local_nls,
 			 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
 	if (rc) {
@@ -585,11 +596,12 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
 		goto out;
 	}
 
-	rc = CIFSSMBGetCIFSACL(xid, cifs_sb->tcon, fid, &pntsd, pacllen);
+	rc = CIFSSMBGetCIFSACL(xid, tcon, fid, &pntsd, pacllen);
 	cFYI(1, "GetCIFSACL rc = %d ACL len %d", rc, *pacllen);
 
-	CIFSSMBClose(xid, cifs_sb->tcon, fid);
+	CIFSSMBClose(xid, tcon, fid);
  out:
+	cifs_put_tlink(tlink);
 	FreeXid(xid);
 	return pntsd;
 }
@@ -603,7 +615,7 @@ static struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb,
 	struct cifsFileInfo *open_file = NULL;
 
 	if (inode)
-		open_file = find_readable_file(CIFS_I(inode));
+		open_file = find_readable_file(CIFS_I(inode), true);
 	if (!open_file)
 		return get_cifs_acl_by_path(cifs_sb, path, pacllen);
 
@@ -616,10 +628,15 @@ static int set_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, __u16 fid,
 		struct cifs_ntsd *pnntsd, u32 acllen)
 {
 	int xid, rc;
+	struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
+
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
 
 	xid = GetXid();
-	rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen);
+	rc = CIFSSMBSetCIFSACL(xid, tlink_tcon(tlink), fid, pnntsd, acllen);
 	FreeXid(xid);
+	cifs_put_tlink(tlink);
 
 	cFYI(DBG2, "SetCIFSACL rc = %d", rc);
 	return rc;
@@ -631,10 +648,16 @@ static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path,
 	int oplock = 0;
 	int xid, rc;
 	__u16 fid;
+	struct cifsTconInfo *tcon;
+	struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
 
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+
+	tcon = tlink_tcon(tlink);
 	xid = GetXid();
 
-	rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN, WRITE_DAC, 0,
+	rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, WRITE_DAC, 0,
 			 &fid, &oplock, NULL, cifs_sb->local_nls,
 			 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
 	if (rc) {
@@ -642,12 +665,13 @@ static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path,
 		goto out;
 	}
 
-	rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen);
+	rc = CIFSSMBSetCIFSACL(xid, tcon, fid, pnntsd, acllen);
 	cFYI(DBG2, "SetCIFSACL rc = %d", rc);
 
-	CIFSSMBClose(xid, cifs_sb->tcon, fid);
- out:
+	CIFSSMBClose(xid, tcon, fid);
+out:
 	FreeXid(xid);
+	cifs_put_tlink(tlink);
 	return rc;
 }
 
@@ -661,7 +685,7 @@ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
 
 	cFYI(DBG2, "set ACL for %s from mode 0x%x", path, inode->i_mode);
 
-	open_file = find_readable_file(CIFS_I(inode));
+	open_file = find_readable_file(CIFS_I(inode), true);
 	if (!open_file)
 		return set_cifs_acl_by_path(cifs_sb, path, pnntsd, acllen);
 
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 847628dfdc4..f856732161a 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -27,6 +27,7 @@
 #include "md5.h"
 #include "cifs_unicode.h"
 #include "cifsproto.h"
+#include "ntlmssp.h"
 #include <linux/ctype.h>
 #include <linux/random.h>
 
@@ -42,18 +43,32 @@ extern void SMBencrypt(unsigned char *passwd, const unsigned char *c8,
 		       unsigned char *p24);
 
 static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu,
-				    const struct mac_key *key, char *signature)
+				struct TCP_Server_Info *server, char *signature)
 {
-	struct	MD5Context context;
+	int rc;
 
-	if ((cifs_pdu == NULL) || (signature == NULL) || (key == NULL))
+	if (cifs_pdu == NULL || signature == NULL || server == NULL)
 		return -EINVAL;
 
-	cifs_MD5_init(&context);
-	cifs_MD5_update(&context, (char *)&key->data, key->len);
-	cifs_MD5_update(&context, cifs_pdu->Protocol, cifs_pdu->smb_buf_length);
+	if (!server->secmech.sdescmd5) {
+		cERROR(1, "%s: Can't generate signature\n", __func__);
+		return -1;
+	}
+
+	rc = crypto_shash_init(&server->secmech.sdescmd5->shash);
+	if (rc) {
+		cERROR(1, "%s: Oould not init md5\n", __func__);
+		return rc;
+	}
+
+	crypto_shash_update(&server->secmech.sdescmd5->shash,
+		server->session_key.response, server->session_key.len);
+
+	crypto_shash_update(&server->secmech.sdescmd5->shash,
+		cifs_pdu->Protocol, cifs_pdu->smb_buf_length);
+
+	rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature);
 
-	cifs_MD5_final(signature, &context);
 	return 0;
 }
 
@@ -78,8 +93,7 @@ int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server,
 	server->sequence_number++;
 	spin_unlock(&GlobalMid_Lock);
 
-	rc = cifs_calculate_signature(cifs_pdu, &server->mac_signing_key,
-				      smb_signature);
+	rc = cifs_calculate_signature(cifs_pdu, server, smb_signature);
 	if (rc)
 		memset(cifs_pdu->Signature.SecuritySignature, 0, 8);
 	else
@@ -89,16 +103,28 @@ int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server,
 }
 
 static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
-				const struct mac_key *key, char *signature)
+				struct TCP_Server_Info *server, char *signature)
 {
-	struct  MD5Context context;
 	int i;
+	int rc;
 
-	if ((iov == NULL) || (signature == NULL) || (key == NULL))
+	if (iov == NULL || signature == NULL || server == NULL)
 		return -EINVAL;
 
-	cifs_MD5_init(&context);
-	cifs_MD5_update(&context, (char *)&key->data, key->len);
+	if (!server->secmech.sdescmd5) {
+		cERROR(1, "%s: Can't generate signature\n", __func__);
+		return -1;
+	}
+
+	rc = crypto_shash_init(&server->secmech.sdescmd5->shash);
+	if (rc) {
+		cERROR(1, "%s: Oould not init md5\n", __func__);
+		return rc;
+	}
+
+	crypto_shash_update(&server->secmech.sdescmd5->shash,
+		server->session_key.response, server->session_key.len);
+
 	for (i = 0; i < n_vec; i++) {
 		if (iov[i].iov_len == 0)
 			continue;
@@ -111,18 +137,18 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
 		if (i == 0) {
 			if (iov[0].iov_len <= 8) /* cmd field at offset 9 */
 				break; /* nothing to sign or corrupt header */
-			cifs_MD5_update(&context, iov[0].iov_base+4,
-				  iov[0].iov_len-4);
+			crypto_shash_update(&server->secmech.sdescmd5->shash,
+				iov[i].iov_base + 4, iov[i].iov_len - 4);
 		} else
-			cifs_MD5_update(&context, iov[i].iov_base, iov[i].iov_len);
+			crypto_shash_update(&server->secmech.sdescmd5->shash,
+				iov[i].iov_base, iov[i].iov_len);
 	}
 
-	cifs_MD5_final(signature, &context);
+	rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature);
 
-	return 0;
+	return rc;
 }
 
-
 int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
 		   __u32 *pexpected_response_sequence_number)
 {
@@ -145,8 +171,7 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
 	server->sequence_number++;
 	spin_unlock(&GlobalMid_Lock);
 
-	rc = cifs_calc_signature2(iov, n_vec, &server->mac_signing_key,
-				      smb_signature);
+	rc = cifs_calc_signature2(iov, n_vec, server, smb_signature);
 	if (rc)
 		memset(cifs_pdu->Signature.SecuritySignature, 0, 8);
 	else
@@ -156,14 +181,14 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
 }
 
 int cifs_verify_signature(struct smb_hdr *cifs_pdu,
-			  const struct mac_key *mac_key,
+			  struct TCP_Server_Info *server,
 			  __u32 expected_sequence_number)
 {
 	unsigned int rc;
 	char server_response_sig[8];
 	char what_we_think_sig_should_be[20];
 
-	if ((cifs_pdu == NULL) || (mac_key == NULL))
+	if (cifs_pdu == NULL || server == NULL)
 		return -EINVAL;
 
 	if (cifs_pdu->Command == SMB_COM_NEGOTIATE)
@@ -192,7 +217,7 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu,
 					cpu_to_le32(expected_sequence_number);
 	cifs_pdu->Signature.Sequence.Reserved = 0;
 
-	rc = cifs_calculate_signature(cifs_pdu, mac_key,
+	rc = cifs_calculate_signature(cifs_pdu, server,
 		what_we_think_sig_should_be);
 
 	if (rc)
@@ -208,75 +233,28 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu,
 
 }
 
-/* We fill in key by putting in 40 byte array which was allocated by caller */
-int cifs_calculate_mac_key(struct mac_key *key, const char *rn,
-			   const char *password)
+/* first calculate 24 bytes ntlm response and then 16 byte session key */
+int setup_ntlm_response(struct cifsSesInfo *ses)
 {
-	char temp_key[16];
-	if ((key == NULL) || (rn == NULL))
-		return -EINVAL;
+	unsigned int temp_len = CIFS_SESS_KEY_SIZE + CIFS_AUTH_RESP_SIZE;
+	char temp_key[CIFS_SESS_KEY_SIZE];
 
-	E_md4hash(password, temp_key);
-	mdfour(key->data.ntlm, temp_key, 16);
-	memcpy(key->data.ntlm+16, rn, CIFS_SESS_KEY_SIZE);
-	key->len = 40;
-	return 0;
-}
-
-int CalcNTLMv2_partial_mac_key(struct cifsSesInfo *ses,
-			       const struct nls_table *nls_info)
-{
-	char temp_hash[16];
-	struct HMACMD5Context ctx;
-	char *ucase_buf;
-	__le16 *unicode_buf;
-	unsigned int i, user_name_len, dom_name_len;
-
-	if (ses == NULL)
-		return -EINVAL;
-
-	E_md4hash(ses->password, temp_hash);
-
-	hmac_md5_init_limK_to_64(temp_hash, 16, &ctx);
-	user_name_len = strlen(ses->userName);
-	if (user_name_len > MAX_USERNAME_SIZE)
-		return -EINVAL;
-	if (ses->domainName == NULL)
-		return -EINVAL; /* BB should we use CIFS_LINUX_DOM */
-	dom_name_len = strlen(ses->domainName);
-	if (dom_name_len > MAX_USERNAME_SIZE)
+	if (!ses)
 		return -EINVAL;
 
-	ucase_buf = kmalloc((MAX_USERNAME_SIZE+1), GFP_KERNEL);
-	if (ucase_buf == NULL)
-		return -ENOMEM;
-	unicode_buf = kmalloc((MAX_USERNAME_SIZE+1)*4, GFP_KERNEL);
-	if (unicode_buf == NULL) {
-		kfree(ucase_buf);
+	ses->auth_key.response = kmalloc(temp_len, GFP_KERNEL);
+	if (!ses->auth_key.response) {
+		cERROR(1, "NTLM can't allocate (%u bytes) memory", temp_len);
 		return -ENOMEM;
 	}
+	ses->auth_key.len = temp_len;
+
+	SMBNTencrypt(ses->password, ses->server->cryptkey,
+			ses->auth_key.response + CIFS_SESS_KEY_SIZE);
+
+	E_md4hash(ses->password, temp_key);
+	mdfour(ses->auth_key.response, temp_key, CIFS_SESS_KEY_SIZE);
 
-	for (i = 0; i < user_name_len; i++)
-		ucase_buf[i] = nls_info->charset2upper[(int)ses->userName[i]];
-	ucase_buf[i] = 0;
-	user_name_len = cifs_strtoUCS(unicode_buf, ucase_buf,
-				      MAX_USERNAME_SIZE*2, nls_info);
-	unicode_buf[user_name_len] = 0;
-	user_name_len++;
-
-	for (i = 0; i < dom_name_len; i++)
-		ucase_buf[i] = nls_info->charset2upper[(int)ses->domainName[i]];
-	ucase_buf[i] = 0;
-	dom_name_len = cifs_strtoUCS(unicode_buf+user_name_len, ucase_buf,
-				     MAX_USERNAME_SIZE*2, nls_info);
-
-	unicode_buf[user_name_len + dom_name_len] = 0;
-	hmac_md5_update((const unsigned char *) unicode_buf,
-		(user_name_len+dom_name_len)*2, &ctx);
-
-	hmac_md5_final(ses->server->ntlmv2_hash, &ctx);
-	kfree(ucase_buf);
-	kfree(unicode_buf);
 	return 0;
 }
 
@@ -319,109 +297,457 @@ void calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt,
 }
 #endif /* CIFS_WEAK_PW_HASH */
 
-static int calc_ntlmv2_hash(struct cifsSesInfo *ses,
+/* Build a proper attribute value/target info pairs blob.
+ * Fill in netbios and dns domain name and workstation name
+ * and client time (total five av pairs and + one end of fields indicator.
+ * Allocate domain name which gets freed when session struct is deallocated.
+ */
+static int
+build_avpair_blob(struct cifsSesInfo *ses, const struct nls_table *nls_cp)
+{
+	unsigned int dlen;
+	unsigned int wlen;
+	unsigned int size = 6 * sizeof(struct ntlmssp2_name);
+	__le64  curtime;
+	char *defdmname = "WORKGROUP";
+	unsigned char *blobptr;
+	struct ntlmssp2_name *attrptr;
+
+	if (!ses->domainName) {
+		ses->domainName = kstrdup(defdmname, GFP_KERNEL);
+		if (!ses->domainName)
+			return -ENOMEM;
+	}
+
+	dlen = strlen(ses->domainName);
+	wlen = strlen(ses->server->hostname);
+
+	/* The length of this blob is a size which is
+	 * six times the size of a structure which holds name/size +
+	 * two times the unicode length of a domain name +
+	 * two times the unicode length of a server name +
+	 * size of a timestamp (which is 8 bytes).
+	 */
+	ses->auth_key.len = size + 2 * (2 * dlen) + 2 * (2 * wlen) + 8;
+	ses->auth_key.response = kzalloc(ses->auth_key.len, GFP_KERNEL);
+	if (!ses->auth_key.response) {
+		ses->auth_key.len = 0;
+		cERROR(1, "Challenge target info allocation failure");
+		return -ENOMEM;
+	}
+
+	blobptr = ses->auth_key.response;
+	attrptr = (struct ntlmssp2_name *) blobptr;
+
+	attrptr->type = cpu_to_le16(NTLMSSP_AV_NB_DOMAIN_NAME);
+	attrptr->length = cpu_to_le16(2 * dlen);
+	blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
+	cifs_strtoUCS((__le16 *)blobptr, ses->domainName, dlen, nls_cp);
+
+	blobptr += 2 * dlen;
+	attrptr = (struct ntlmssp2_name *) blobptr;
+
+	attrptr->type = cpu_to_le16(NTLMSSP_AV_NB_COMPUTER_NAME);
+	attrptr->length = cpu_to_le16(2 * wlen);
+	blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
+	cifs_strtoUCS((__le16 *)blobptr, ses->server->hostname, wlen, nls_cp);
+
+	blobptr += 2 * wlen;
+	attrptr = (struct ntlmssp2_name *) blobptr;
+
+	attrptr->type = cpu_to_le16(NTLMSSP_AV_DNS_DOMAIN_NAME);
+	attrptr->length = cpu_to_le16(2 * dlen);
+	blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
+	cifs_strtoUCS((__le16 *)blobptr, ses->domainName, dlen, nls_cp);
+
+	blobptr += 2 * dlen;
+	attrptr = (struct ntlmssp2_name *) blobptr;
+
+	attrptr->type = cpu_to_le16(NTLMSSP_AV_DNS_COMPUTER_NAME);
+	attrptr->length = cpu_to_le16(2 * wlen);
+	blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
+	cifs_strtoUCS((__le16 *)blobptr, ses->server->hostname, wlen, nls_cp);
+
+	blobptr += 2 * wlen;
+	attrptr = (struct ntlmssp2_name *) blobptr;
+
+	attrptr->type = cpu_to_le16(NTLMSSP_AV_TIMESTAMP);
+	attrptr->length = cpu_to_le16(sizeof(__le64));
+	blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
+	curtime = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
+	memcpy(blobptr, &curtime, sizeof(__le64));
+
+	return 0;
+}
+
+/* Server has provided av pairs/target info in the type 2 challenge
+ * packet and we have plucked it and stored within smb session.
+ * We parse that blob here to find netbios domain name to be used
+ * as part of ntlmv2 authentication (in Target String), if not already
+ * specified on the command line.
+ * If this function returns without any error but without fetching
+ * domain name, authentication may fail against some server but
+ * may not fail against other (those who are not very particular
+ * about target string i.e. for some, just user name might suffice.
+ */
+static int
+find_domain_name(struct cifsSesInfo *ses, const struct nls_table *nls_cp)
+{
+	unsigned int attrsize;
+	unsigned int type;
+	unsigned int onesize = sizeof(struct ntlmssp2_name);
+	unsigned char *blobptr;
+	unsigned char *blobend;
+	struct ntlmssp2_name *attrptr;
+
+	if (!ses->auth_key.len || !ses->auth_key.response)
+		return 0;
+
+	blobptr = ses->auth_key.response;
+	blobend = blobptr + ses->auth_key.len;
+
+	while (blobptr + onesize < blobend) {
+		attrptr = (struct ntlmssp2_name *) blobptr;
+		type = le16_to_cpu(attrptr->type);
+		if (type == NTLMSSP_AV_EOL)
+			break;
+		blobptr += 2; /* advance attr type */
+		attrsize = le16_to_cpu(attrptr->length);
+		blobptr += 2; /* advance attr size */
+		if (blobptr + attrsize > blobend)
+			break;
+		if (type == NTLMSSP_AV_NB_DOMAIN_NAME) {
+			if (!attrsize)
+				break;
+			if (!ses->domainName) {
+				ses->domainName =
+					kmalloc(attrsize + 1, GFP_KERNEL);
+				if (!ses->domainName)
+						return -ENOMEM;
+				cifs_from_ucs2(ses->domainName,
+					(__le16 *)blobptr, attrsize, attrsize,
+					nls_cp, false);
+				break;
+			}
+		}
+		blobptr += attrsize; /* advance attr  value */
+	}
+
+	return 0;
+}
+
+static int calc_ntlmv2_hash(struct cifsSesInfo *ses, char *ntlmv2_hash,
 			    const struct nls_table *nls_cp)
 {
 	int rc = 0;
 	int len;
-	char nt_hash[16];
-	struct HMACMD5Context *pctxt;
+	char nt_hash[CIFS_NTHASH_SIZE];
 	wchar_t *user;
 	wchar_t *domain;
+	wchar_t *server;
 
-	pctxt = kmalloc(sizeof(struct HMACMD5Context), GFP_KERNEL);
-
-	if (pctxt == NULL)
-		return -ENOMEM;
+	if (!ses->server->secmech.sdeschmacmd5) {
+		cERROR(1, "calc_ntlmv2_hash: can't generate ntlmv2 hash\n");
+		return -1;
+	}
 
 	/* calculate md4 hash of password */
 	E_md4hash(ses->password, nt_hash);
 
-	/* convert Domainname to unicode and uppercase */
-	hmac_md5_init_limK_to_64(nt_hash, 16, pctxt);
+	crypto_shash_setkey(ses->server->secmech.hmacmd5, nt_hash,
+				CIFS_NTHASH_SIZE);
+
+	rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash);
+	if (rc) {
+		cERROR(1, "calc_ntlmv2_hash: could not init hmacmd5\n");
+		return rc;
+	}
 
 	/* convert ses->userName to unicode and uppercase */
 	len = strlen(ses->userName);
 	user = kmalloc(2 + (len * 2), GFP_KERNEL);
-	if (user == NULL)
+	if (user == NULL) {
+		cERROR(1, "calc_ntlmv2_hash: user mem alloc failure\n");
+		rc = -ENOMEM;
 		goto calc_exit_2;
+	}
 	len = cifs_strtoUCS((__le16 *)user, ses->userName, len, nls_cp);
 	UniStrupr(user);
-	hmac_md5_update((char *)user, 2*len, pctxt);
+
+	crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
+				(char *)user, 2 * len);
 
 	/* convert ses->domainName to unicode and uppercase */
 	if (ses->domainName) {
 		len = strlen(ses->domainName);
 
 		domain = kmalloc(2 + (len * 2), GFP_KERNEL);
-		if (domain == NULL)
+		if (domain == NULL) {
+			cERROR(1, "calc_ntlmv2_hash: domain mem alloc failure");
+			rc = -ENOMEM;
 			goto calc_exit_1;
+		}
 		len = cifs_strtoUCS((__le16 *)domain, ses->domainName, len,
 					nls_cp);
-		/* the following line was removed since it didn't work well
-		   with lower cased domain name that passed as an option.
-		   Maybe converting the domain name earlier makes sense */
-		/* UniStrupr(domain); */
-
-		hmac_md5_update((char *)domain, 2*len, pctxt);
-
+		crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
+					(char *)domain, 2 * len);
 		kfree(domain);
+	} else if (ses->serverName) {
+		len = strlen(ses->serverName);
+
+		server = kmalloc(2 + (len * 2), GFP_KERNEL);
+		if (server == NULL) {
+			cERROR(1, "calc_ntlmv2_hash: server mem alloc failure");
+			rc = -ENOMEM;
+			goto calc_exit_1;
+		}
+		len = cifs_strtoUCS((__le16 *)server, ses->serverName, len,
+					nls_cp);
+		crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
+					(char *)server, 2 * len);
+		kfree(server);
 	}
+
+	rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash,
+					ntlmv2_hash);
+
 calc_exit_1:
 	kfree(user);
 calc_exit_2:
-	/* BB FIXME what about bytes 24 through 40 of the signing key?
-	   compare with the NTLM example */
-	hmac_md5_final(ses->server->ntlmv2_hash, pctxt);
+	return rc;
+}
+
+static int
+CalcNTLMv2_response(const struct cifsSesInfo *ses, char *ntlmv2_hash)
+{
+	int rc;
+	unsigned int offset = CIFS_SESS_KEY_SIZE + 8;
+
+	if (!ses->server->secmech.sdeschmacmd5) {
+		cERROR(1, "calc_ntlmv2_hash: can't generate ntlmv2 hash\n");
+		return -1;
+	}
+
+	crypto_shash_setkey(ses->server->secmech.hmacmd5,
+				ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE);
+
+	rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash);
+	if (rc) {
+		cERROR(1, "CalcNTLMv2_response: could not init hmacmd5");
+		return rc;
+	}
+
+	if (ses->server->secType == RawNTLMSSP)
+		memcpy(ses->auth_key.response + offset,
+			ses->ntlmssp->cryptkey, CIFS_SERVER_CHALLENGE_SIZE);
+	else
+		memcpy(ses->auth_key.response + offset,
+			ses->server->cryptkey, CIFS_SERVER_CHALLENGE_SIZE);
+	crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
+		ses->auth_key.response + offset, ses->auth_key.len - offset);
+
+	rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash,
+		ses->auth_key.response + CIFS_SESS_KEY_SIZE);
 
-	kfree(pctxt);
 	return rc;
 }
 
-void setup_ntlmv2_rsp(struct cifsSesInfo *ses, char *resp_buf,
-		      const struct nls_table *nls_cp)
+
+int
+setup_ntlmv2_rsp(struct cifsSesInfo *ses, const struct nls_table *nls_cp)
 {
 	int rc;
-	struct ntlmv2_resp *buf = (struct ntlmv2_resp *)resp_buf;
-	struct HMACMD5Context context;
+	int baselen;
+	unsigned int tilen;
+	struct ntlmv2_resp *buf;
+	char ntlmv2_hash[16];
+	unsigned char *tiblob = NULL; /* target info blob */
+
+	if (ses->server->secType == RawNTLMSSP) {
+		if (!ses->domainName) {
+			rc = find_domain_name(ses, nls_cp);
+			if (rc) {
+				cERROR(1, "error %d finding domain name", rc);
+				goto setup_ntlmv2_rsp_ret;
+			}
+		}
+	} else {
+		rc = build_avpair_blob(ses, nls_cp);
+		if (rc) {
+			cERROR(1, "error %d building av pair blob", rc);
+			goto setup_ntlmv2_rsp_ret;
+		}
+	}
 
+	baselen = CIFS_SESS_KEY_SIZE + sizeof(struct ntlmv2_resp);
+	tilen = ses->auth_key.len;
+	tiblob = ses->auth_key.response;
+
+	ses->auth_key.response = kmalloc(baselen + tilen, GFP_KERNEL);
+	if (!ses->auth_key.response) {
+		rc = ENOMEM;
+		ses->auth_key.len = 0;
+		cERROR(1, "%s: Can't allocate auth blob", __func__);
+		goto setup_ntlmv2_rsp_ret;
+	}
+	ses->auth_key.len += baselen;
+
+	buf = (struct ntlmv2_resp *)
+			(ses->auth_key.response + CIFS_SESS_KEY_SIZE);
 	buf->blob_signature = cpu_to_le32(0x00000101);
 	buf->reserved = 0;
 	buf->time = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
 	get_random_bytes(&buf->client_chal, sizeof(buf->client_chal));
 	buf->reserved2 = 0;
-	buf->names[0].type = cpu_to_le16(NTLMSSP_DOMAIN_TYPE);
-	buf->names[0].length = 0;
-	buf->names[1].type = 0;
-	buf->names[1].length = 0;
 
-	/* calculate buf->ntlmv2_hash */
-	rc = calc_ntlmv2_hash(ses, nls_cp);
-	if (rc)
+	memcpy(ses->auth_key.response + baselen, tiblob, tilen);
+
+	/* calculate ntlmv2_hash */
+	rc = calc_ntlmv2_hash(ses, ntlmv2_hash, nls_cp);
+	if (rc) {
 		cERROR(1, "could not get v2 hash rc %d", rc);
-	CalcNTLMv2_response(ses, resp_buf);
+		goto setup_ntlmv2_rsp_ret;
+	}
+
+	/* calculate first part of the client response (CR1) */
+	rc = CalcNTLMv2_response(ses, ntlmv2_hash);
+	if (rc) {
+		cERROR(1, "Could not calculate CR1  rc: %d", rc);
+		goto setup_ntlmv2_rsp_ret;
+	}
+
+	/* now calculate the session key for NTLMv2 */
+	crypto_shash_setkey(ses->server->secmech.hmacmd5,
+		ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE);
+
+	rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash);
+	if (rc) {
+		cERROR(1, "%s: Could not init hmacmd5\n", __func__);
+		goto setup_ntlmv2_rsp_ret;
+	}
+
+	crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
+		ses->auth_key.response + CIFS_SESS_KEY_SIZE,
+		CIFS_HMAC_MD5_HASH_SIZE);
+
+	rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash,
+		ses->auth_key.response);
+
+setup_ntlmv2_rsp_ret:
+	kfree(tiblob);
+
+	return rc;
+}
+
+int
+calc_seckey(struct cifsSesInfo *ses)
+{
+	int rc;
+	struct crypto_blkcipher *tfm_arc4;
+	struct scatterlist sgin, sgout;
+	struct blkcipher_desc desc;
+	unsigned char sec_key[CIFS_SESS_KEY_SIZE]; /* a nonce */
+
+	get_random_bytes(sec_key, CIFS_SESS_KEY_SIZE);
+
+	tfm_arc4 = crypto_alloc_blkcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC);
+	if (!tfm_arc4 || IS_ERR(tfm_arc4)) {
+		cERROR(1, "could not allocate crypto API arc4\n");
+		return PTR_ERR(tfm_arc4);
+	}
+
+	desc.tfm = tfm_arc4;
+
+	crypto_blkcipher_setkey(tfm_arc4, ses->auth_key.response,
+					CIFS_SESS_KEY_SIZE);
 
-	/* now calculate the MAC key for NTLMv2 */
-	hmac_md5_init_limK_to_64(ses->server->ntlmv2_hash, 16, &context);
-	hmac_md5_update(resp_buf, 16, &context);
-	hmac_md5_final(ses->server->mac_signing_key.data.ntlmv2.key, &context);
+	sg_init_one(&sgin, sec_key, CIFS_SESS_KEY_SIZE);
+	sg_init_one(&sgout, ses->ntlmssp->ciphertext, CIFS_CPHTXT_SIZE);
 
-	memcpy(&ses->server->mac_signing_key.data.ntlmv2.resp, resp_buf,
-	       sizeof(struct ntlmv2_resp));
-	ses->server->mac_signing_key.len = 16 + sizeof(struct ntlmv2_resp);
+	rc = crypto_blkcipher_encrypt(&desc, &sgout, &sgin, CIFS_CPHTXT_SIZE);
+	if (rc) {
+		cERROR(1, "could not encrypt session key rc: %d\n", rc);
+		crypto_free_blkcipher(tfm_arc4);
+		return rc;
+	}
+
+	/* make secondary_key/nonce as session key */
+	memcpy(ses->auth_key.response, sec_key, CIFS_SESS_KEY_SIZE);
+	/* and make len as that of session key only */
+	ses->auth_key.len = CIFS_SESS_KEY_SIZE;
+
+	crypto_free_blkcipher(tfm_arc4);
+
+	return 0;
 }
 
-void CalcNTLMv2_response(const struct cifsSesInfo *ses,
-			 char *v2_session_response)
+void
+cifs_crypto_shash_release(struct TCP_Server_Info *server)
 {
-	struct HMACMD5Context context;
-	/* rest of v2 struct already generated */
-	memcpy(v2_session_response + 8, ses->server->cryptKey, 8);
-	hmac_md5_init_limK_to_64(ses->server->ntlmv2_hash, 16, &context);
+	if (server->secmech.md5)
+		crypto_free_shash(server->secmech.md5);
+
+	if (server->secmech.hmacmd5)
+		crypto_free_shash(server->secmech.hmacmd5);
 
-	hmac_md5_update(v2_session_response+8,
-			sizeof(struct ntlmv2_resp) - 8, &context);
+	kfree(server->secmech.sdeschmacmd5);
+
+	kfree(server->secmech.sdescmd5);
+}
 
-	hmac_md5_final(v2_session_response, &context);
-/*	cifs_dump_mem("v2_sess_rsp: ", v2_session_response, 32); */
+int
+cifs_crypto_shash_allocate(struct TCP_Server_Info *server)
+{
+	int rc;
+	unsigned int size;
+
+	server->secmech.hmacmd5 = crypto_alloc_shash("hmac(md5)", 0, 0);
+	if (!server->secmech.hmacmd5 ||
+			IS_ERR(server->secmech.hmacmd5)) {
+		cERROR(1, "could not allocate crypto hmacmd5\n");
+		return PTR_ERR(server->secmech.hmacmd5);
+	}
+
+	server->secmech.md5 = crypto_alloc_shash("md5", 0, 0);
+	if (!server->secmech.md5 || IS_ERR(server->secmech.md5)) {
+		cERROR(1, "could not allocate crypto md5\n");
+		rc = PTR_ERR(server->secmech.md5);
+		goto crypto_allocate_md5_fail;
+	}
+
+	size = sizeof(struct shash_desc) +
+			crypto_shash_descsize(server->secmech.hmacmd5);
+	server->secmech.sdeschmacmd5 = kmalloc(size, GFP_KERNEL);
+	if (!server->secmech.sdeschmacmd5) {
+		cERROR(1, "cifs_crypto_shash_allocate: can't alloc hmacmd5\n");
+		rc = -ENOMEM;
+		goto crypto_allocate_hmacmd5_sdesc_fail;
+	}
+	server->secmech.sdeschmacmd5->shash.tfm = server->secmech.hmacmd5;
+	server->secmech.sdeschmacmd5->shash.flags = 0x0;
+
+
+	size = sizeof(struct shash_desc) +
+			crypto_shash_descsize(server->secmech.md5);
+	server->secmech.sdescmd5 = kmalloc(size, GFP_KERNEL);
+	if (!server->secmech.sdescmd5) {
+		cERROR(1, "cifs_crypto_shash_allocate: can't alloc md5\n");
+		rc = -ENOMEM;
+		goto crypto_allocate_md5_sdesc_fail;
+	}
+	server->secmech.sdescmd5->shash.tfm = server->secmech.md5;
+	server->secmech.sdescmd5->shash.flags = 0x0;
+
+	return 0;
+
+crypto_allocate_md5_sdesc_fail:
+	kfree(server->secmech.sdeschmacmd5);
+
+crypto_allocate_hmacmd5_sdesc_fail:
+	crypto_free_shash(server->secmech.md5);
+
+crypto_allocate_md5_fail:
+	crypto_free_shash(server->secmech.hmacmd5);
+
+	return rc;
 }
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 2cb1a70214d..75c4eaa7958 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -35,7 +35,7 @@
 #include <linux/delay.h>
 #include <linux/kthread.h>
 #include <linux/freezer.h>
-#include <linux/smp_lock.h>
+#include <net/ipv6.h>
 #include "cifsfs.h"
 #include "cifspdu.h"
 #define DECLARE_GLOBALS_HERE
@@ -45,8 +45,8 @@
 #include "cifs_fs_sb.h"
 #include <linux/mm.h>
 #include <linux/key-type.h>
-#include "dns_resolve.h"
 #include "cifs_spnego.h"
+#include "fscache.h"
 #define CIFS_MAGIC_NUMBER 0xFF534D42	/* the first four bytes of SMB PDUs */
 
 int cifsFYI = 0;
@@ -82,6 +82,24 @@ extern mempool_t *cifs_sm_req_poolp;
 extern mempool_t *cifs_req_poolp;
 extern mempool_t *cifs_mid_poolp;
 
+void
+cifs_sb_active(struct super_block *sb)
+{
+	struct cifs_sb_info *server = CIFS_SB(sb);
+
+	if (atomic_inc_return(&server->active) == 1)
+		atomic_inc(&sb->s_active);
+}
+
+void
+cifs_sb_deactive(struct super_block *sb)
+{
+	struct cifs_sb_info *server = CIFS_SB(sb);
+
+	if (atomic_dec_and_test(&server->active))
+		deactivate_super(sb);
+}
+
 static int
 cifs_read_super(struct super_block *sb, void *data,
 		const char *devname, int silent)
@@ -97,6 +115,9 @@ cifs_read_super(struct super_block *sb, void *data,
 	if (cifs_sb == NULL)
 		return -ENOMEM;
 
+	spin_lock_init(&cifs_sb->tlink_tree_lock);
+	INIT_RADIX_TREE(&cifs_sb->tlink_tree, GFP_KERNEL);
+
 	rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs", BDI_CAP_MAP_COPY);
 	if (rc) {
 		kfree(cifs_sb);
@@ -136,9 +157,6 @@ cifs_read_super(struct super_block *sb, void *data,
 	sb->s_magic = CIFS_MAGIC_NUMBER;
 	sb->s_op = &cifs_super_ops;
 	sb->s_bdi = &cifs_sb->bdi;
-/*	if (cifs_sb->tcon->ses->server->maxBuf > MAX_CIFS_HDR_SIZE + 512)
-	    sb->s_blocksize =
-		cifs_sb->tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE; */
 	sb->s_blocksize = CIFS_MAX_MSGSIZE;
 	sb->s_blocksize_bits = 14;	/* default 2**14 = CIFS_MAX_MSGSIZE */
 	inode = cifs_root_iget(sb, ROOT_I);
@@ -200,8 +218,6 @@ cifs_put_super(struct super_block *sb)
 		return;
 	}
 
-	lock_kernel();
-
 	rc = cifs_umount(sb, cifs_sb);
 	if (rc)
 		cERROR(1, "cifs_umount failed with return code %d", rc);
@@ -215,8 +231,6 @@ cifs_put_super(struct super_block *sb)
 	unload_nls(cifs_sb->local_nls);
 	bdi_destroy(&cifs_sb->bdi);
 	kfree(cifs_sb);
-
-	unlock_kernel();
 }
 
 static int
@@ -224,7 +238,7 @@ cifs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *sb = dentry->d_sb;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
-	struct cifsTconInfo *tcon = cifs_sb->tcon;
+	struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
 	int rc = -EOPNOTSUPP;
 	int xid;
 
@@ -304,7 +318,6 @@ cifs_alloc_inode(struct super_block *sb)
 		return NULL;
 	cifs_inode->cifsAttrs = 0x20;	/* default */
 	cifs_inode->time = 0;
-	cifs_inode->write_behind_rc = 0;
 	/* Until the file is open and we have gotten oplock
 	info back from the server, can not assume caching of
 	file data or metadata */
@@ -329,6 +342,14 @@ cifs_destroy_inode(struct inode *inode)
 }
 
 static void
+cifs_evict_inode(struct inode *inode)
+{
+	truncate_inode_pages(&inode->i_data, 0);
+	end_writeback(inode);
+	cifs_fscache_release_inode_cookie(inode);
+}
+
+static void
 cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server)
 {
 	seq_printf(s, ",addr=");
@@ -358,14 +379,36 @@ static int
 cifs_show_options(struct seq_file *s, struct vfsmount *m)
 {
 	struct cifs_sb_info *cifs_sb = CIFS_SB(m->mnt_sb);
-	struct cifsTconInfo *tcon = cifs_sb->tcon;
+	struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
+	struct sockaddr *srcaddr;
+	srcaddr = (struct sockaddr *)&tcon->ses->server->srcaddr;
 
 	seq_printf(s, ",unc=%s", tcon->treeName);
-	if (tcon->ses->userName)
+
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)
+		seq_printf(s, ",multiuser");
+	else if (tcon->ses->userName)
 		seq_printf(s, ",username=%s", tcon->ses->userName);
+
 	if (tcon->ses->domainName)
 		seq_printf(s, ",domain=%s", tcon->ses->domainName);
 
+	if (srcaddr->sa_family != AF_UNSPEC) {
+		struct sockaddr_in *saddr4;
+		struct sockaddr_in6 *saddr6;
+		saddr4 = (struct sockaddr_in *)srcaddr;
+		saddr6 = (struct sockaddr_in6 *)srcaddr;
+		if (srcaddr->sa_family == AF_INET6)
+			seq_printf(s, ",srcaddr=%pI6c",
+				   &saddr6->sin6_addr);
+		else if (srcaddr->sa_family == AF_INET)
+			seq_printf(s, ",srcaddr=%pI4",
+				   &saddr4->sin_addr.s_addr);
+		else
+			seq_printf(s, ",srcaddr=BAD-AF:%i",
+				   (int)(srcaddr->sa_family));
+	}
+
 	seq_printf(s, ",uid=%d", cifs_sb->mnt_uid);
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)
 		seq_printf(s, ",forceuid");
@@ -414,6 +457,8 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m)
 		seq_printf(s, ",dynperm");
 	if (m->mnt_sb->s_flags & MS_POSIXACL)
 		seq_printf(s, ",acl");
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
+		seq_printf(s, ",mfsymlinks");
 
 	seq_printf(s, ",rsize=%d", cifs_sb->rsize);
 	seq_printf(s, ",wsize=%d", cifs_sb->wsize);
@@ -429,20 +474,18 @@ static void cifs_umount_begin(struct super_block *sb)
 	if (cifs_sb == NULL)
 		return;
 
-	tcon = cifs_sb->tcon;
-	if (tcon == NULL)
-		return;
+	tcon = cifs_sb_master_tcon(cifs_sb);
 
-	read_lock(&cifs_tcp_ses_lock);
+	spin_lock(&cifs_tcp_ses_lock);
 	if ((tcon->tc_count > 1) || (tcon->tidStatus == CifsExiting)) {
 		/* we have other mounts to same share or we have
 		   already tried to force umount this and woken up
 		   all waiting network requests, nothing to do */
-		read_unlock(&cifs_tcp_ses_lock);
+		spin_unlock(&cifs_tcp_ses_lock);
 		return;
 	} else if (tcon->tc_count == 1)
 		tcon->tidStatus = CifsExiting;
-	read_unlock(&cifs_tcp_ses_lock);
+	spin_unlock(&cifs_tcp_ses_lock);
 
 	/* cancel_brl_requests(tcon); */ /* BB mark all brl mids as exiting */
 	/* cancel_notify_requests(tcon); */
@@ -473,14 +516,13 @@ static int cifs_remount(struct super_block *sb, int *flags, char *data)
 	return 0;
 }
 
-void cifs_drop_inode(struct inode *inode)
+static int cifs_drop_inode(struct inode *inode)
 {
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
 
-	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)
-		return generic_drop_inode(inode);
-
-	return generic_delete_inode(inode);
+	/* no serverino => unconditional eviction */
+	return !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) ||
+		generic_drop_inode(inode);
 }
 
 static const struct super_operations cifs_super_ops = {
@@ -489,6 +531,7 @@ static const struct super_operations cifs_super_ops = {
 	.alloc_inode = cifs_alloc_inode,
 	.destroy_inode = cifs_destroy_inode,
 	.drop_inode	= cifs_drop_inode,
+	.evict_inode	= cifs_evict_inode,
 /*	.delete_inode	= cifs_delete_inode,  */  /* Do not need above
 	function unless later we add lazy close of inodes or unless the
 	kernel forgets to call us with the same number of releases (closes)
@@ -501,28 +544,29 @@ static const struct super_operations cifs_super_ops = {
 #endif
 };
 
-static int
-cifs_get_sb(struct file_system_type *fs_type,
-	    int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *
+cifs_do_mount(struct file_system_type *fs_type,
+	    int flags, const char *dev_name, void *data)
 {
 	int rc;
-	struct super_block *sb = sget(fs_type, NULL, set_anon_super, NULL);
+	struct super_block *sb;
+
+	sb = sget(fs_type, NULL, set_anon_super, NULL);
 
 	cFYI(1, "Devname: %s flags: %d ", dev_name, flags);
 
 	if (IS_ERR(sb))
-		return PTR_ERR(sb);
+		return ERR_CAST(sb);
 
 	sb->s_flags = flags;
 
 	rc = cifs_read_super(sb, data, dev_name, flags & MS_SILENT ? 1 : 0);
 	if (rc) {
 		deactivate_locked_super(sb);
-		return rc;
+		return ERR_PTR(rc);
 	}
 	sb->s_flags |= MS_ACTIVE;
-	simple_set_mnt(mnt, sb);
-	return 0;
+	return dget(sb->s_root);
 }
 
 static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
@@ -557,9 +601,10 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin)
 
 static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
 {
-	/* note that this is called by vfs setlease with the BKL held
-	   although I doubt that BKL is needed here in cifs */
+	/* note that this is called by vfs setlease with lock_flocks held
+	   to protect *lease from going away */
 	struct inode *inode = file->f_path.dentry->d_inode;
+	struct cifsFileInfo *cfile = file->private_data;
 
 	if (!(S_ISREG(inode->i_mode)))
 		return -EINVAL;
@@ -570,8 +615,8 @@ static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
 	    ((arg == F_WRLCK) &&
 		(CIFS_I(inode)->clientCanCacheAll)))
 		return generic_setlease(file, arg, lease);
-	else if (CIFS_SB(inode->i_sb)->tcon->local_lease &&
-			!CIFS_I(inode)->clientCanCacheRead)
+	else if (tlink_tcon(cfile->tlink)->local_lease &&
+		 !CIFS_I(inode)->clientCanCacheRead)
 		/* If the server claims to support oplock on this
 		   file, then we still need to check oplock even
 		   if the local_lease mount option is set, but there
@@ -587,7 +632,7 @@ static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
 struct file_system_type cifs_fs_type = {
 	.owner = THIS_MODULE,
 	.name = "cifs",
-	.get_sb = cifs_get_sb,
+	.mount = cifs_do_mount,
 	.kill_sb = kill_anon_super,
 	/*  .fs_flags */
 };
@@ -890,8 +935,8 @@ init_cifs(void)
 	GlobalTotalActiveXid = 0;
 	GlobalMaxActiveXid = 0;
 	memset(Local_System_Name, 0, 15);
-	rwlock_init(&GlobalSMBSeslock);
-	rwlock_init(&cifs_tcp_ses_lock);
+	spin_lock_init(&cifs_tcp_ses_lock);
+	spin_lock_init(&cifs_file_list_lock);
 	spin_lock_init(&GlobalMid_Lock);
 
 	if (cifs_max_pending < 2) {
@@ -902,10 +947,14 @@ init_cifs(void)
 		cFYI(1, "cifs_max_pending set to max of 256");
 	}
 
-	rc = cifs_init_inodecache();
+	rc = cifs_fscache_register();
 	if (rc)
 		goto out_clean_proc;
 
+	rc = cifs_init_inodecache();
+	if (rc)
+		goto out_unreg_fscache;
+
 	rc = cifs_init_mids();
 	if (rc)
 		goto out_destroy_inodecache;
@@ -922,34 +971,22 @@ init_cifs(void)
 	if (rc)
 		goto out_unregister_filesystem;
 #endif
-#ifdef CONFIG_CIFS_DFS_UPCALL
-	rc = cifs_init_dns_resolver();
-	if (rc)
-		goto out_unregister_key_type;
-#endif
-	rc = slow_work_register_user(THIS_MODULE);
-	if (rc)
-		goto out_unregister_resolver_key;
 
 	return 0;
 
- out_unregister_resolver_key:
-#ifdef CONFIG_CIFS_DFS_UPCALL
-	cifs_exit_dns_resolver();
- out_unregister_key_type:
-#endif
 #ifdef CONFIG_CIFS_UPCALL
-	unregister_key_type(&cifs_spnego_key_type);
- out_unregister_filesystem:
-#endif
+out_unregister_filesystem:
 	unregister_filesystem(&cifs_fs_type);
- out_destroy_request_bufs:
+#endif
+out_destroy_request_bufs:
 	cifs_destroy_request_bufs();
- out_destroy_mids:
+out_destroy_mids:
 	cifs_destroy_mids();
- out_destroy_inodecache:
+out_destroy_inodecache:
 	cifs_destroy_inodecache();
- out_clean_proc:
+out_unreg_fscache:
+	cifs_fscache_unregister();
+out_clean_proc:
 	cifs_proc_clean();
 	return rc;
 }
@@ -959,9 +996,9 @@ exit_cifs(void)
 {
 	cFYI(DBG2, "exit_cifs");
 	cifs_proc_clean();
+	cifs_fscache_unregister();
 #ifdef CONFIG_CIFS_DFS_UPCALL
 	cifs_dfs_release_automount_timer();
-	cifs_exit_dns_resolver();
 #endif
 #ifdef CONFIG_CIFS_UPCALL
 	unregister_key_type(&cifs_spnego_key_type);
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index a7eb65c84b1..897b2b2b28b 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -42,10 +42,8 @@ extern const struct address_space_operations cifs_addr_ops;
 extern const struct address_space_operations cifs_addr_ops_smallbuf;
 
 /* Functions related to super block operations */
-/* extern const struct super_operations cifs_super_ops;*/
-extern void cifs_read_inode(struct inode *);
-/*extern void cifs_delete_inode(struct inode *);*/  /* BB not needed yet */
-/* extern void cifs_write_inode(struct inode *); */ /* BB not needed yet */
+extern void cifs_sb_active(struct super_block *sb);
+extern void cifs_sb_deactive(struct super_block *sb);
 
 /* Functions related to inodes */
 extern const struct inode_operations cifs_dir_inode_ops;
@@ -104,7 +102,7 @@ extern int cifs_readlink(struct dentry *direntry, char __user *buffer,
 extern int cifs_symlink(struct inode *inode, struct dentry *direntry,
 			const char *symname);
 extern int	cifs_removexattr(struct dentry *, const char *);
-extern int 	cifs_setxattr(struct dentry *, const char *, const void *,
+extern int	cifs_setxattr(struct dentry *, const char *, const void *,
 			size_t, int);
 extern ssize_t	cifs_getxattr(struct dentry *, const char *, void *, size_t);
 extern ssize_t	cifs_listxattr(struct dentry *, char *, size_t);
@@ -114,5 +112,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
 extern const struct export_operations cifs_export_ops;
 #endif /* EXPERIMENTAL */
 
-#define CIFS_VERSION   "1.64"
+#define CIFS_VERSION   "1.68"
 #endif				/* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index a88479ceaad..f259e4d7612 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -16,12 +16,18 @@
  *   the GNU Lesser General Public License for more details.
  *
  */
+#ifndef _CIFS_GLOB_H
+#define _CIFS_GLOB_H
+
 #include <linux/in.h>
 #include <linux/in6.h>
 #include <linux/slab.h>
-#include <linux/slow-work.h>
+#include <linux/workqueue.h>
 #include "cifs_fs_sb.h"
 #include "cifsacl.h"
+#include <crypto/internal/hash.h>
+#include <linux/scatterlist.h>
+
 /*
  * The sizes of various internal tables and strings
  */
@@ -34,7 +40,7 @@
 #define MAX_SHARE_SIZE  64	/* used to be 20, this should still be enough */
 #define MAX_USERNAME_SIZE 32	/* 32 is to allow for 15 char names + null
 				   termination then *2 for unicode versions */
-#define MAX_PASSWORD_SIZE 16
+#define MAX_PASSWORD_SIZE 512  /* max for windows seems to be 256 wide chars */
 
 #define CIFS_MIN_RCV_POOL 4
 
@@ -71,7 +77,7 @@
  * CIFS vfs client Status information (based on what we know.)
  */
 
- /* associated with each tcp and smb session */
+/* associated with each tcp and smb session */
 enum statusEnum {
 	CifsNew = 0,
 	CifsGood,
@@ -80,8 +86,7 @@ enum statusEnum {
 };
 
 enum securityEnum {
-	PLAINTXT = 0, 		/* Legacy with Plaintext passwords */
-	LANMAN,			/* Legacy LANMAN auth */
+	LANMAN = 0,			/* Legacy LANMAN auth */
 	NTLM,			/* Legacy NTLM012 auth with NTLM hash */
 	NTLMv2,			/* Legacy NTLM auth with NTLMv2 hash */
 	RawNTLMSSP,		/* NTLMSSP without SPNEGO, NTLMv2 hash */
@@ -95,16 +100,31 @@ enum protocolEnum {
 	/* Netbios frames protocol not supported at this time */
 };
 
-struct mac_key {
+struct session_key {
 	unsigned int len;
-	union {
-		char ntlm[CIFS_SESS_KEY_SIZE + 16];
-		char krb5[CIFS_SESS_KEY_SIZE + 16]; /* BB: length correct? */
-		struct {
-			char key[16];
-			struct ntlmv2_resp resp;
-		} ntlmv2;
-	} data;
+	char *response;
+};
+
+/* crypto security descriptor definition */
+struct sdesc {
+	struct shash_desc shash;
+	char ctx[];
+};
+
+/* crypto hashing related structure/fields, not specific to a sec mech */
+struct cifs_secmech {
+	struct crypto_shash *hmacmd5; /* hmac-md5 hash function */
+	struct crypto_shash *md5; /* md5 hash function */
+	struct sdesc *sdeschmacmd5;  /* ctxt to generate ntlmv2 hash, CR1 */
+	struct sdesc *sdescmd5; /* ctxt to generate cifs/smb signature */
+};
+
+/* per smb session structure/fields */
+struct ntlmssp_auth {
+	__u32 client_flags; /* sent by client in type 1 ntlmsssp exchange */
+	__u32 server_flags; /* sent by server in type 2 ntlmssp exchange */
+	unsigned char ciphertext[CIFS_CPHTXT_SIZE]; /* sent to server */
+	char cryptkey[CIFS_CRYPTO_KEY_SIZE]; /* used by ntlmssp */
 };
 
 struct cifs_cred {
@@ -137,12 +157,12 @@ struct TCP_Server_Info {
 		struct sockaddr_in sockAddr;
 		struct sockaddr_in6 sockAddr6;
 	} addr;
+	struct sockaddr_storage srcaddr; /* locally bind to this IP */
 	wait_queue_head_t response_q;
 	wait_queue_head_t request_q; /* if more than maxmpx to srvr must block*/
 	struct list_head pending_mid_q;
 	void *Server_NlsInfo;	/* BB - placeholder for future NLS info  */
 	unsigned short server_codepage;	/* codepage for the server    */
-	unsigned long ip_address;	/* IP addr for the server if known */
 	enum protocolEnum protocolType;
 	char versionMajor;
 	char versionMinor;
@@ -177,32 +197,23 @@ struct TCP_Server_Info {
 	int capabilities; /* allow selective disabling of caps by smb sess */
 	int timeAdj;  /* Adjust for difference in server time zone in sec */
 	__u16 CurrentMid;         /* multiplex id - rotating counter */
-	char cryptKey[CIFS_CRYPTO_KEY_SIZE];
+	char cryptkey[CIFS_CRYPTO_KEY_SIZE]; /* used by ntlm, ntlmv2 etc */
 	/* 16th byte of RFC1001 workstation name is always null */
 	char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL];
 	__u32 sequence_number; /* needed for CIFS PDU signature */
-	struct mac_key mac_signing_key;
-	char ntlmv2_hash[16];
+	struct session_key session_key;
 	unsigned long lstrp; /* when we got last response from this server */
 	u16 dialect; /* dialect index that server chose */
+	struct cifs_secmech secmech; /* crypto sec mech functs, descriptors */
 	/* extended security flavors that server supports */
 	bool	sec_kerberos;		/* supports plain Kerberos */
 	bool	sec_mskerberos;		/* supports legacy MS Kerberos */
 	bool	sec_kerberosu2u;	/* supports U2U Kerberos */
 	bool	sec_ntlmssp;		/* supports NTLMSSP */
-};
-
-/*
- * The following is our shortcut to user information.  We surface the uid,
- * and name. We always get the password on the fly in case it
- * has changed. We also hang a list of sessions owned by this user off here.
- */
-struct cifsUidInfo {
-	struct list_head userList;
-	struct list_head sessionList; /* SMB sessions for this user */
-	uid_t linux_uid;
-	char user[MAX_USERNAME_SIZE + 1];	/* ascii name of user */
-	/* BB may need ptr or callback for PAM or WinBind info */
+	bool session_estab; /* mark when very first sess is established */
+#ifdef CONFIG_CIFS_FSCACHE
+	struct fscache_cookie   *fscache; /* client index cache cookie */
+#endif
 };
 
 /*
@@ -212,9 +223,6 @@ struct cifsSesInfo {
 	struct list_head smb_ses_list;
 	struct list_head tcon_list;
 	struct mutex session_mutex;
-#if 0
-	struct cifsUidInfo *uidInfo;	/* pointer to user info */
-#endif
 	struct TCP_Server_Info *server;	/* pointer to server info */
 	int ses_count;		/* reference counter */
 	enum statusEnum status;
@@ -226,13 +234,16 @@ struct cifsSesInfo {
 	char *serverNOS;	/* name of network operating system of server */
 	char *serverDomain;	/* security realm of server */
 	int Suid;		/* remote smb uid  */
-	uid_t linux_uid;        /* local Linux uid */
+	uid_t linux_uid;        /* overriding owner of files on the mount */
+	uid_t cred_uid;		/* owner of credentials */
 	int capabilities;
 	char serverName[SERVER_NAME_LEN_WITH_NULL * 2];	/* BB make bigger for
 				TCP names - will ipv6 and sctp addresses fit? */
 	char userName[MAX_USERNAME_SIZE + 1];
 	char *domainName;
 	char *password;
+	struct session_key auth_key;
+	struct ntlmssp_auth *ntlmssp; /* ciphertext, flags, server challenge */
 	bool need_reconnect:1; /* connection reset, uid now invalid */
 };
 /* no more than one of the following three session flags may be set */
@@ -311,10 +322,52 @@ struct cifsTconInfo {
 	bool local_lease:1; /* check leases (only) on local system not remote */
 	bool broken_posix_open; /* e.g. Samba server versions < 3.3.2, 3.2.9 */
 	bool need_reconnect:1; /* connection reset, tid now invalid */
+#ifdef CONFIG_CIFS_FSCACHE
+	u64 resource_id;		/* server resource id */
+	struct fscache_cookie *fscache;	/* cookie for share */
+#endif
 	/* BB add field for back pointer to sb struct(s)? */
 };
 
 /*
+ * This is a refcounted and timestamped container for a tcon pointer. The
+ * container holds a tcon reference. It is considered safe to free one of
+ * these when the tl_count goes to 0. The tl_time is the time of the last
+ * "get" on the container.
+ */
+struct tcon_link {
+	unsigned long		tl_index;
+	unsigned long		tl_flags;
+#define TCON_LINK_MASTER	0
+#define TCON_LINK_PENDING	1
+#define TCON_LINK_IN_TREE	2
+	unsigned long		tl_time;
+	atomic_t		tl_count;
+	struct cifsTconInfo	*tl_tcon;
+};
+
+extern struct tcon_link *cifs_sb_tlink(struct cifs_sb_info *cifs_sb);
+
+static inline struct cifsTconInfo *
+tlink_tcon(struct tcon_link *tlink)
+{
+	return tlink->tl_tcon;
+}
+
+extern void cifs_put_tlink(struct tcon_link *tlink);
+
+static inline struct tcon_link *
+cifs_get_tlink(struct tcon_link *tlink)
+{
+	if (tlink && !IS_ERR(tlink))
+		atomic_inc(&tlink->tl_count);
+	return tlink;
+}
+
+/* This function is always expected to succeed */
+extern struct cifsTconInfo *cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb);
+
+/*
  * This info hangs off the cifsFileInfo structure, pointed to by llist.
  * This is used to track byte stream locks on the file
  */
@@ -352,34 +405,29 @@ struct cifsFileInfo {
 	__u16 netfid;		/* file id from remote */
 	/* BB add lock scope info here if needed */ ;
 	/* lock scope id (0 if none) */
-	struct file *pfile; /* needed for writepage */
-	struct inode *pInode; /* needed for oplock break */
-	struct vfsmount *mnt;
+	struct dentry *dentry;
+	unsigned int f_flags;
+	struct tcon_link *tlink;
 	struct mutex lock_mutex;
 	struct list_head llist; /* list of byte range locks we have. */
-	bool closePend:1;	/* file is marked to close */
 	bool invalidHandle:1;	/* file closed via session abend */
 	bool oplock_break_cancelled:1;
-	atomic_t count;		/* reference count */
+	int count;		/* refcount protected by cifs_file_list_lock */
 	struct mutex fh_mutex; /* prevents reopen race after dead ses*/
 	struct cifs_search_info srch_inf;
-	struct slow_work oplock_break; /* slow_work job for oplock breaks */
+	struct work_struct oplock_break; /* work for oplock breaks */
 };
 
-/* Take a reference on the file private data */
+/*
+ * Take a reference on the file private data. Must be called with
+ * cifs_file_list_lock held.
+ */
 static inline void cifsFileInfo_get(struct cifsFileInfo *cifs_file)
 {
-	atomic_inc(&cifs_file->count);
+	++cifs_file->count;
 }
 
-/* Release a reference on the file private data */
-static inline void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
-{
-	if (atomic_dec_and_test(&cifs_file->count)) {
-		iput(cifs_file->pInode);
-		kfree(cifs_file);
-	}
-}
+void cifsFileInfo_put(struct cifsFileInfo *cifs_file);
 
 /*
  * One of these for each file inode
@@ -389,7 +437,6 @@ struct cifsInodeInfo {
 	struct list_head lockList;
 	/* BB add in lists for dirty pages i.e. write caching info for oplock */
 	struct list_head openFileList;
-	int write_behind_rc;
 	__u32 cifsAttrs; /* e.g. DOS archive bit, sparse, compressed, system */
 	unsigned long time;	/* jiffies of last update/check of inode */
 	bool clientCanCacheRead:1;	/* read oplock */
@@ -398,6 +445,9 @@ struct cifsInodeInfo {
 	bool invalid_mapping:1;		/* pagecache is invalid */
 	u64  server_eof;		/* current file size on server */
 	u64  uniqueid;			/* server inode number */
+#ifdef CONFIG_CIFS_FSCACHE
+	struct fscache_cookie *fscache;
+#endif
 	struct inode vfs_inode;
 };
 
@@ -478,16 +528,16 @@ struct oplock_q_entry {
 
 /* for pending dnotify requests */
 struct dir_notify_req {
-       struct list_head lhead;
-       __le16 Pid;
-       __le16 PidHigh;
-       __u16 Mid;
-       __u16 Tid;
-       __u16 Uid;
-       __u16 netfid;
-       __u32 filter; /* CompletionFilter (for multishot) */
-       int multishot;
-       struct file *pfile;
+	struct list_head lhead;
+	__le16 Pid;
+	__le16 PidHigh;
+	__u16 Mid;
+	__u16 Tid;
+	__u16 Uid;
+	__u16 netfid;
+	__u32 filter; /* CompletionFilter (for multishot) */
+	int multishot;
+	struct file *pfile;
 };
 
 struct dfs_info3_param {
@@ -637,7 +687,7 @@ require use of the stronger protocol */
  *  GlobalMid_Lock protects:
  *	list operations on pending_mid_q and oplockQ
  *      updates to XID counters, multiplex id  and SMB sequence numbers
- *  GlobalSMBSesLock protects:
+ *  cifs_file_list_lock protects:
  *	list operations on tcp and SMB session lists and tCon lists
  *  f_owner.lock protects certain per file struct operations
  *  mapping->page_lock protects certain per page operations
@@ -671,7 +721,7 @@ GLOBAL_EXTERN struct list_head		cifs_tcp_ses_list;
  * the reference counters for the server, smb session, and tcon. Finally,
  * changes to the tcon->tidStatus should be done while holding this lock.
  */
-GLOBAL_EXTERN rwlock_t		cifs_tcp_ses_lock;
+GLOBAL_EXTERN spinlock_t		cifs_tcp_ses_lock;
 
 /*
  * This lock protects the cifs_file->llist and cifs_file->flist
@@ -680,7 +730,7 @@ GLOBAL_EXTERN rwlock_t		cifs_tcp_ses_lock;
  * If cifs_tcp_ses_lock and the lock below are both needed to be held, then
  * the cifs_tcp_ses_lock must be grabbed first and released last.
  */
-GLOBAL_EXTERN rwlock_t GlobalSMBSeslock;
+GLOBAL_EXTERN spinlock_t	cifs_file_list_lock;
 
 /* Outstanding dir notify requests */
 GLOBAL_EXTERN struct list_head GlobalDnotifyReqList;
@@ -732,4 +782,10 @@ GLOBAL_EXTERN unsigned int cifs_min_rcv;    /* min size of big ntwrk buf pool */
 GLOBAL_EXTERN unsigned int cifs_min_small;  /* min size of small buf pool */
 GLOBAL_EXTERN unsigned int cifs_max_pending; /* MAX requests at once to server*/
 
+void cifs_oplock_break(struct work_struct *work);
+void cifs_oplock_break_get(struct cifsFileInfo *cfile);
+void cifs_oplock_break_put(struct cifsFileInfo *cfile);
+
 extern const struct slow_work_ops cifs_oplock_break_ops;
+
+#endif	/* _CIFS_GLOB_H */
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index 14d036d8db1..de36b09763a 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -131,9 +131,20 @@
 #define CIFS_CRYPTO_KEY_SIZE (8)
 
 /*
+ * Size of the ntlm client response
+ */
+#define CIFS_AUTH_RESP_SIZE (24)
+
+/*
  * Size of the session key (crypto key encrypted with the password
  */
-#define CIFS_SESS_KEY_SIZE (24)
+#define CIFS_SESS_KEY_SIZE (16)
+
+#define CIFS_CLIENT_CHALLENGE_SIZE (8)
+#define CIFS_SERVER_CHALLENGE_SIZE (8)
+#define CIFS_HMAC_MD5_HASH_SIZE (16)
+#define CIFS_CPHTXT_SIZE (16)
+#define CIFS_NTHASH_SIZE (16)
 
 /*
  * Maximum user name length
@@ -663,7 +674,6 @@ struct ntlmv2_resp {
 	__le64  time;
 	__u64  client_chal; /* random */
 	__u32  reserved2;
-	struct ntlmssp2_name names[2];
 	/* array of name entries could follow ending in minimum 4 byte struct */
 } __attribute__((packed));
 
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index fb6318b8150..edb6d90efdf 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -78,15 +78,18 @@ extern int checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length);
 extern bool is_valid_oplock_break(struct smb_hdr *smb,
 				  struct TCP_Server_Info *);
 extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof);
-extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *);
+extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, bool);
 #ifdef CONFIG_CIFS_EXPERIMENTAL
-extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *);
+extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool);
 #endif
 extern unsigned int smbCalcSize(struct smb_hdr *ptr);
 extern unsigned int smbCalcSize_LE(struct smb_hdr *ptr);
 extern int decode_negTokenInit(unsigned char *security_blob, int length,
 			struct TCP_Server_Info *server);
-extern int cifs_convert_address(char *src, void *dst);
+extern int cifs_convert_address(struct sockaddr *dst, const char *src, int len);
+extern int cifs_set_port(struct sockaddr *addr, const unsigned short int port);
+extern int cifs_fill_sockaddr(struct sockaddr *dst, const char *src, int len,
+				const unsigned short int port);
 extern int map_smb_to_linux_error(struct smb_hdr *smb, int logErr);
 extern void header_assemble(struct smb_hdr *, char /* command */ ,
 			    const struct cifsTconInfo *, int /* length of
@@ -102,12 +105,12 @@ extern u64 cifs_UnixTimeToNT(struct timespec);
 extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time,
 				      int offset);
 
-extern struct cifsFileInfo *cifs_new_fileinfo(struct inode *newinode,
-				__u16 fileHandle, struct file *file,
-				struct vfsmount *mnt, unsigned int oflags);
+extern struct cifsFileInfo *cifs_new_fileinfo(__u16 fileHandle,
+				struct file *file, struct tcon_link *tlink,
+				__u32 oplock);
 extern int cifs_posix_open(char *full_path, struct inode **pinode,
 				struct super_block *sb,
-				int mode, int oflags,
+				int mode, unsigned int f_flags,
 				__u32 *poplock, __u16 *pnetfid, int xid);
 void cifs_fill_uniqueid(struct super_block *sb, struct cifs_fattr *fattr);
 extern void cifs_unix_basic_to_fattr(struct cifs_fattr *fattr,
@@ -359,15 +362,15 @@ extern int cifs_sign_smb(struct smb_hdr *, struct TCP_Server_Info *, __u32 *);
 extern int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *,
 			  __u32 *);
 extern int cifs_verify_signature(struct smb_hdr *,
-				 const struct mac_key *mac_key,
+				 struct TCP_Server_Info *server,
 				__u32 expected_sequence_number);
-extern int cifs_calculate_mac_key(struct mac_key *key, const char *rn,
-				 const char *pass);
-extern int CalcNTLMv2_partial_mac_key(struct cifsSesInfo *,
-			const struct nls_table *);
-extern void CalcNTLMv2_response(const struct cifsSesInfo *, char *);
-extern void setup_ntlmv2_rsp(struct cifsSesInfo *, char *,
-			     const struct nls_table *);
+extern void SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *);
+extern int setup_ntlm_response(struct cifsSesInfo *);
+extern int setup_ntlmv2_rsp(struct cifsSesInfo *, const struct nls_table *);
+extern int cifs_crypto_shash_allocate(struct TCP_Server_Info *);
+extern void cifs_crypto_shash_release(struct TCP_Server_Info *);
+extern int calc_seckey(struct cifsSesInfo *);
+
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
 extern void calc_lanman_hash(const char *password, const char *cryptkey,
 				bool encrypt, char *lnm_session_key);
@@ -407,4 +410,8 @@ extern int CIFSSMBSetPosixACL(const int xid, struct cifsTconInfo *tcon,
 extern int CIFSGetExtAttr(const int xid, struct cifsTconInfo *tcon,
 			const int netfid, __u64 *pExtAttrBits, __u64 *pMask);
 extern void cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb);
+extern bool CIFSCouldBeMFSymlink(const struct cifs_fattr *fattr);
+extern int CIFSCheckMFSymlink(struct cifs_fattr *fattr,
+		const unsigned char *path,
+		struct cifs_sb_info *cifs_sb, int xid);
 #endif			/* _CIFSPROTO_H */
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index c65c3419dd3..2f2632b6df5 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -91,13 +91,13 @@ static void mark_open_files_invalid(struct cifsTconInfo *pTcon)
 	struct list_head *tmp1;
 
 /* list all files open on tree connection and mark them invalid */
-	write_lock(&GlobalSMBSeslock);
+	spin_lock(&cifs_file_list_lock);
 	list_for_each_safe(tmp, tmp1, &pTcon->openFileList) {
 		open_file = list_entry(tmp, struct cifsFileInfo, tlist);
 		open_file->invalidHandle = true;
 		open_file->oplock_break_cancelled = true;
 	}
-	write_unlock(&GlobalSMBSeslock);
+	spin_unlock(&cifs_file_list_lock);
 	/* BB Add call to invalidate_inodes(sb) for all superblocks mounted
 	   to this tcon */
 }
@@ -232,7 +232,7 @@ static int
 small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
 		void **request_buf)
 {
-	int rc = 0;
+	int rc;
 
 	rc = cifs_reconnect_tcon(tcon, smb_command);
 	if (rc)
@@ -250,7 +250,7 @@ small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
 	if (tcon != NULL)
 		cifs_stats_inc(&tcon->num_smbs_sent);
 
-	return rc;
+	return 0;
 }
 
 int
@@ -281,16 +281,9 @@ small_smb_init_no_tc(const int smb_command, const int wct,
 
 /* If the return code is zero, this function must fill in request_buf pointer */
 static int
-smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
-	 void **request_buf /* returned */ ,
-	 void **response_buf /* returned */ )
+__smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
+			void **request_buf, void **response_buf)
 {
-	int rc = 0;
-
-	rc = cifs_reconnect_tcon(tcon, smb_command);
-	if (rc)
-		return rc;
-
 	*request_buf = cifs_buf_get();
 	if (*request_buf == NULL) {
 		/* BB should we add a retry in here if not a writepage? */
@@ -309,7 +302,31 @@ smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
 	if (tcon != NULL)
 		cifs_stats_inc(&tcon->num_smbs_sent);
 
-	return rc;
+	return 0;
+}
+
+/* If the return code is zero, this function must fill in request_buf pointer */
+static int
+smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
+	 void **request_buf, void **response_buf)
+{
+	int rc;
+
+	rc = cifs_reconnect_tcon(tcon, smb_command);
+	if (rc)
+		return rc;
+
+	return __smb_init(smb_command, wct, tcon, request_buf, response_buf);
+}
+
+static int
+smb_init_no_reconnect(int smb_command, int wct, struct cifsTconInfo *tcon,
+			void **request_buf, void **response_buf)
+{
+	if (tcon->ses->need_reconnect || tcon->need_reconnect)
+		return -EHOSTDOWN;
+
+	return __smb_init(smb_command, wct, tcon, request_buf, response_buf);
 }
 
 static int validate_t2(struct smb_t2_rsp *pSMB)
@@ -486,7 +503,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 
 		if (rsp->EncryptionKeyLength ==
 				cpu_to_le16(CIFS_CRYPTO_KEY_SIZE)) {
-			memcpy(server->cryptKey, rsp->EncryptionKey,
+			memcpy(ses->server->cryptkey, rsp->EncryptionKey,
 				CIFS_CRYPTO_KEY_SIZE);
 		} else if (server->secMode & SECMODE_PW_ENCRYPT) {
 			rc = -EIO; /* need cryptkey unless plain text */
@@ -557,7 +574,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 	server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone);
 	server->timeAdj *= 60;
 	if (pSMBr->EncryptionKeyLength == CIFS_CRYPTO_KEY_SIZE) {
-		memcpy(server->cryptKey, pSMBr->u.EncryptionKey,
+		memcpy(ses->server->cryptkey, pSMBr->u.EncryptionKey,
 		       CIFS_CRYPTO_KEY_SIZE);
 	} else if ((pSMBr->hdr.Flags2 & SMBFLG2_EXT_SEC)
 			&& (pSMBr->EncryptionKeyLength == 0)) {
@@ -576,9 +593,9 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 			rc = -EIO;
 			goto neg_err_exit;
 		}
-		read_lock(&cifs_tcp_ses_lock);
+		spin_lock(&cifs_tcp_ses_lock);
 		if (server->srv_count > 1) {
-			read_unlock(&cifs_tcp_ses_lock);
+			spin_unlock(&cifs_tcp_ses_lock);
 			if (memcmp(server->server_GUID,
 				   pSMBr->u.extended_response.
 				   GUID, 16) != 0) {
@@ -588,7 +605,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 					16);
 			}
 		} else {
-			read_unlock(&cifs_tcp_ses_lock);
+			spin_unlock(&cifs_tcp_ses_lock);
 			memcpy(server->server_GUID,
 			       pSMBr->u.extended_response.GUID, 16);
 		}
@@ -603,13 +620,15 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 				rc = 0;
 			else
 				rc = -EINVAL;
-
-			if (server->sec_kerberos || server->sec_mskerberos)
-				server->secType = Kerberos;
-			else if (server->sec_ntlmssp)
-				server->secType = RawNTLMSSP;
-			else
-				rc = -EOPNOTSUPP;
+			if (server->secType == Kerberos) {
+				if (!server->sec_kerberos &&
+						!server->sec_mskerberos)
+					rc = -EOPNOTSUPP;
+			} else if (server->secType == RawNTLMSSP) {
+				if (!server->sec_ntlmssp)
+					rc = -EOPNOTSUPP;
+			} else
+					rc = -EOPNOTSUPP;
 		}
 	} else
 		server->capabilities &= ~CAP_EXTENDED_SECURITY;
@@ -4534,8 +4553,8 @@ CIFSSMBQFSUnixInfo(const int xid, struct cifsTconInfo *tcon)
 
 	cFYI(1, "In QFSUnixInfo");
 QFSUnixRetry:
-	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
-		      (void **) &pSMBr);
+	rc = smb_init_no_reconnect(SMB_COM_TRANSACTION2, 15, tcon,
+				   (void **) &pSMB, (void **) &pSMBr);
 	if (rc)
 		return rc;
 
@@ -4604,8 +4623,8 @@ CIFSSMBSetFSUnixInfo(const int xid, struct cifsTconInfo *tcon, __u64 cap)
 	cFYI(1, "In SETFSUnixInfo");
 SETFSUnixRetry:
 	/* BB switch to small buf init to save memory */
-	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
-		      (void **) &pSMBr);
+	rc = smb_init_no_reconnect(SMB_COM_TRANSACTION2, 15, tcon,
+					(void **) &pSMB, (void **) &pSMBr);
 	if (rc)
 		return rc;
 
diff --git a/fs/cifs/cn_cifs.h b/fs/cifs/cn_cifs.h
deleted file mode 100644
index ea59ccac2eb..00000000000
--- a/fs/cifs/cn_cifs.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- *   fs/cifs/cn_cifs.h
- *
- *   Copyright (c) International Business Machines  Corp., 2002
- *   Author(s): Steve French (sfrench@us.ibm.com)
- *
- *   This library is free software; you can redistribute it and/or modify
- *   it under the terms of the GNU Lesser General Public License as published
- *   by the Free Software Foundation; either version 2.1 of the License, or
- *   (at your option) any later version.
- *
- *   This library is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU Lesser General Public License for more details.
- *
- *   You should have received a copy of the GNU Lesser General Public License
- *   along with this library; if not, write to the Free Software
- *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-#ifndef _CN_CIFS_H
-#define _CN_CIFS_H
-#ifdef CONFIG_CIFS_UPCALL
-#include <linux/types.h>
-#include <linux/connector.h>
-
-struct cifs_upcall {
-	char signature[4]; /* CIFS */
-	enum command {
-		CIFS_GET_IP = 0x00000001,   /* get ip address for hostname */
-		CIFS_GET_SECBLOB = 0x00000002, /* get SPNEGO wrapped blob */
-	} command;
-	/* union cifs upcall data follows */
-};
-#endif /* CIFS_UPCALL */
-#endif /* _CN_CIFS_H */
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 2208f06e4c4..9eb327defa1 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -47,7 +47,7 @@
 #include "ntlmssp.h"
 #include "nterr.h"
 #include "rfc1002pdu.h"
-#include "cn_cifs.h"
+#include "fscache.h"
 
 #define CIFS_PORT 445
 #define RFC1001_PORT 139
@@ -66,6 +66,7 @@ struct smb_vol {
 	char *iocharset;  /* local code page for mapping to and from Unicode */
 	char source_rfc1001_name[16]; /* netbios name of client */
 	char target_rfc1001_name[16]; /* netbios name of server for Win9x/ME */
+	uid_t cred_uid;
 	uid_t linux_uid;
 	gid_t linux_gid;
 	mode_t file_mode;
@@ -97,16 +98,25 @@ struct smb_vol {
 	bool noblocksnd:1;
 	bool noautotune:1;
 	bool nostrictsync:1; /* do not force expensive SMBflush on every sync */
+	bool fsc:1;	/* enable fscache */
+	bool mfsymlinks:1; /* use Minshall+French Symlinks */
+	bool multiuser:1;
 	unsigned int rsize;
 	unsigned int wsize;
 	bool sockopt_tcp_nodelay:1;
 	unsigned short int port;
 	char *prepath;
+	struct sockaddr_storage srcaddr; /* allow binding to a local IP */
 	struct nls_table *local_nls;
 };
 
+/* FIXME: should these be tunable? */
+#define TLINK_ERROR_EXPIRE	(1 * HZ)
+#define TLINK_IDLE_EXPIRE	(600 * HZ)
+
 static int ipv4_connect(struct TCP_Server_Info *server);
 static int ipv6_connect(struct TCP_Server_Info *server);
+static void cifs_prune_tlinks(struct work_struct *work);
 
 /*
  * cifs tcp session reconnection
@@ -140,7 +150,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
 
 	/* before reconnecting the tcp session, mark the smb session (uid)
 		and the tid bad so they are not used until reconnected */
-	read_lock(&cifs_tcp_ses_lock);
+	spin_lock(&cifs_tcp_ses_lock);
 	list_for_each(tmp, &server->smb_ses_list) {
 		ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list);
 		ses->need_reconnect = true;
@@ -150,7 +160,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
 			tcon->need_reconnect = true;
 		}
 	}
-	read_unlock(&cifs_tcp_ses_lock);
+	spin_unlock(&cifs_tcp_ses_lock);
 	/* do not want to be sending data on a socket we are freeing */
 	mutex_lock(&server->srv_mutex);
 	if (server->ssocket) {
@@ -163,6 +173,11 @@ cifs_reconnect(struct TCP_Server_Info *server)
 		sock_release(server->ssocket);
 		server->ssocket = NULL;
 	}
+	server->sequence_number = 0;
+	server->session_estab = false;
+	kfree(server->session_key.response);
+	server->session_key.response = NULL;
+	server->session_key.len = 0;
 
 	spin_lock(&GlobalMid_Lock);
 	list_for_each(tmp, &server->pending_mid_q) {
@@ -195,7 +210,6 @@ cifs_reconnect(struct TCP_Server_Info *server)
 			spin_lock(&GlobalMid_Lock);
 			if (server->tcpStatus != CifsExiting)
 				server->tcpStatus = CifsGood;
-			server->sequence_number = 0;
 			spin_unlock(&GlobalMid_Lock);
 	/*		atomic_set(&server->inFlight,0);*/
 			wake_up(&server->response_q);
@@ -397,7 +411,9 @@ incomplete_rcv:
 			cFYI(1, "call to reconnect done");
 			csocket = server->ssocket;
 			continue;
-		} else if ((length == -ERESTARTSYS) || (length == -EAGAIN)) {
+		} else if (length == -ERESTARTSYS ||
+			   length == -EAGAIN ||
+			   length == -EINTR) {
 			msleep(1); /* minimum sleep to prevent looping
 				allowing socket to clear and app threads to set
 				tcpStatus CifsNeedReconnect if server hung */
@@ -411,18 +427,6 @@ incomplete_rcv:
 			} else
 				continue;
 		} else if (length <= 0) {
-			if (server->tcpStatus == CifsNew) {
-				cFYI(1, "tcp session abend after SMBnegprot");
-				/* some servers kill the TCP session rather than
-				   returning an SMB negprot error, in which
-				   case reconnecting here is not going to help,
-				   and so simply return error to mount */
-				break;
-			}
-			if (!try_to_freeze() && (length == -EINTR)) {
-				cFYI(1, "cifsd thread killed");
-				break;
-			}
 			cFYI(1, "Reconnect after unexpected peek error %d",
 				length);
 			cifs_reconnect(server);
@@ -463,27 +467,19 @@ incomplete_rcv:
 			   an error on SMB negprot response */
 			cFYI(1, "Negative RFC1002 Session Response Error 0x%x)",
 				pdu_length);
-			if (server->tcpStatus == CifsNew) {
-				/* if nack on negprot (rather than
-				ret of smb negprot error) reconnecting
-				not going to help, ret error to mount */
-				break;
-			} else {
-				/* give server a second to
-				clean up before reconnect attempt */
-				msleep(1000);
-				/* always try 445 first on reconnect
-				since we get NACK on some if we ever
-				connected to port 139 (the NACK is
-				since we do not begin with RFC1001
-				session initialize frame) */
-				server->addr.sockAddr.sin_port =
-					htons(CIFS_PORT);
-				cifs_reconnect(server);
-				csocket = server->ssocket;
-				wake_up(&server->response_q);
-				continue;
-			}
+			/* give server a second to clean up  */
+			msleep(1000);
+			/* always try 445 first on reconnect since we get NACK
+			 * on some if we ever connected to port 139 (the NACK
+			 * is since we do not begin with RFC1001 session
+			 * initialize frame)
+			 */
+			cifs_set_port((struct sockaddr *)
+					&server->addr.sockAddr, CIFS_PORT);
+			cifs_reconnect(server);
+			csocket = server->ssocket;
+			wake_up(&server->response_q);
+			continue;
 		} else if (temp != (char) 0) {
 			cERROR(1, "Unknown RFC 1002 frame");
 			cifs_dump_mem(" Received Data: ", (char *)smb_buffer,
@@ -519,8 +515,7 @@ incomplete_rcv:
 		     total_read += length) {
 			length = kernel_recvmsg(csocket, &smb_msg, &iov, 1,
 						pdu_length - total_read, 0);
-			if ((server->tcpStatus == CifsExiting) ||
-			    (length == -EINTR)) {
+			if (server->tcpStatus == CifsExiting) {
 				/* then will exit */
 				reconnect = 2;
 				break;
@@ -531,8 +526,9 @@ incomplete_rcv:
 				/* Now we will reread sock */
 				reconnect = 1;
 				break;
-			} else if ((length == -ERESTARTSYS) ||
-				   (length == -EAGAIN)) {
+			} else if (length == -ERESTARTSYS ||
+				   length == -EAGAIN ||
+				   length == -EINTR) {
 				msleep(1); /* minimum sleep to prevent looping,
 					      allowing socket to clear and app
 					      threads to set tcpStatus
@@ -644,9 +640,9 @@ multi_t2_fnd:
 	} /* end while !EXITING */
 
 	/* take it off the list, if it's not already */
-	write_lock(&cifs_tcp_ses_lock);
+	spin_lock(&cifs_tcp_ses_lock);
 	list_del_init(&server->tcp_ses_list);
-	write_unlock(&cifs_tcp_ses_lock);
+	spin_unlock(&cifs_tcp_ses_lock);
 
 	spin_lock(&GlobalMid_Lock);
 	server->tcpStatus = CifsExiting;
@@ -684,7 +680,7 @@ multi_t2_fnd:
 	 * BB: we shouldn't have to do any of this. It shouldn't be
 	 * possible to exit from the thread with active SMB sessions
 	 */
-	read_lock(&cifs_tcp_ses_lock);
+	spin_lock(&cifs_tcp_ses_lock);
 	if (list_empty(&server->pending_mid_q)) {
 		/* loop through server session structures attached to this and
 		    mark them dead */
@@ -694,7 +690,7 @@ multi_t2_fnd:
 			ses->status = CifsExiting;
 			ses->server = NULL;
 		}
-		read_unlock(&cifs_tcp_ses_lock);
+		spin_unlock(&cifs_tcp_ses_lock);
 	} else {
 		/* although we can not zero the server struct pointer yet,
 		since there are active requests which may depnd on them,
@@ -717,7 +713,7 @@ multi_t2_fnd:
 			}
 		}
 		spin_unlock(&GlobalMid_Lock);
-		read_unlock(&cifs_tcp_ses_lock);
+		spin_unlock(&cifs_tcp_ses_lock);
 		/* 1/8th of sec is more than enough time for them to exit */
 		msleep(125);
 	}
@@ -740,12 +736,12 @@ multi_t2_fnd:
 	if a crazy root user tried to kill cifsd
 	kernel thread explicitly this might happen) */
 	/* BB: This shouldn't be necessary, see above */
-	read_lock(&cifs_tcp_ses_lock);
+	spin_lock(&cifs_tcp_ses_lock);
 	list_for_each(tmp, &server->smb_ses_list) {
 		ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list);
 		ses->server = NULL;
 	}
-	read_unlock(&cifs_tcp_ses_lock);
+	spin_unlock(&cifs_tcp_ses_lock);
 
 	kfree(server->hostname);
 	task_to_wake = xchg(&server->tsk, NULL);
@@ -830,7 +826,8 @@ cifs_parse_mount_options(char *options, const char *devname,
 	/* null target name indicates to use *SMBSERVR default called name
 	   if we end up sending RFC1001 session initialize */
 	vol->target_rfc1001_name[0] = 0;
-	vol->linux_uid = current_uid();  /* use current_euid() instead? */
+	vol->cred_uid = current_uid();
+	vol->linux_uid = current_uid();
 	vol->linux_gid = current_gid();
 
 	/* default to only allowing write access to owner of the mount */
@@ -1060,6 +1057,22 @@ cifs_parse_mount_options(char *options, const char *devname,
 						    "long\n");
 				return 1;
 			}
+		} else if (strnicmp(data, "srcaddr", 7) == 0) {
+			vol->srcaddr.ss_family = AF_UNSPEC;
+
+			if (!value || !*value) {
+				printk(KERN_WARNING "CIFS: srcaddr value"
+				       " not specified.\n");
+				return 1;	/* needs_arg; */
+			}
+			i = cifs_convert_address((struct sockaddr *)&vol->srcaddr,
+						 value, strlen(value));
+			if (i == 0) {
+				printk(KERN_WARNING "CIFS:  Could not parse"
+				       " srcaddr: %s\n",
+				       value);
+				return 1;
+			}
 		} else if (strnicmp(data, "prefixpath", 10) == 0) {
 			if (!value || !*value) {
 				printk(KERN_WARNING
@@ -1257,6 +1270,12 @@ cifs_parse_mount_options(char *options, const char *devname,
 		} else if ((strnicmp(data, "nocase", 6) == 0) ||
 			   (strnicmp(data, "ignorecase", 10)  == 0)) {
 			vol->nocase = 1;
+		} else if (strnicmp(data, "mand", 4) == 0) {
+			/* ignore */
+		} else if (strnicmp(data, "nomand", 6) == 0) {
+			/* ignore */
+		} else if (strnicmp(data, "_netdev", 7) == 0) {
+			/* ignore */
 		} else if (strnicmp(data, "brl", 3) == 0) {
 			vol->nobrl =  0;
 		} else if ((strnicmp(data, "nobrl", 5) == 0) ||
@@ -1331,6 +1350,12 @@ cifs_parse_mount_options(char *options, const char *devname,
 			printk(KERN_WARNING "CIFS: Mount option noac not "
 				"supported. Instead set "
 				"/proc/fs/cifs/LookupCacheEnabled to 0\n");
+		} else if (strnicmp(data, "fsc", 3) == 0) {
+			vol->fsc = true;
+		} else if (strnicmp(data, "mfsymlinks", 10) == 0) {
+			vol->mfsymlinks = true;
+		} else if (strnicmp(data, "multiuser", 8) == 0) {
+			vol->multiuser = true;
 		} else
 			printk(KERN_WARNING "CIFS: Unknown mount option %s\n",
 						data);
@@ -1362,6 +1387,13 @@ cifs_parse_mount_options(char *options, const char *devname,
 			return 1;
 		}
 	}
+
+	if (vol->multiuser && !(vol->secFlg & CIFSSEC_MAY_KRB5)) {
+		cERROR(1, "Multiuser mounts currently require krb5 "
+			  "authentication!");
+		return 1;
+	}
+
 	if (vol->UNCip == NULL)
 		vol->UNCip = &vol->UNC[2];
 
@@ -1380,65 +1412,136 @@ cifs_parse_mount_options(char *options, const char *devname,
 	return 0;
 }
 
+/** Returns true if srcaddr isn't specified and rhs isn't
+ * specified, or if srcaddr is specified and
+ * matches the IP address of the rhs argument.
+ */
+static bool
+srcip_matches(struct sockaddr *srcaddr, struct sockaddr *rhs)
+{
+	switch (srcaddr->sa_family) {
+	case AF_UNSPEC:
+		return (rhs->sa_family == AF_UNSPEC);
+	case AF_INET: {
+		struct sockaddr_in *saddr4 = (struct sockaddr_in *)srcaddr;
+		struct sockaddr_in *vaddr4 = (struct sockaddr_in *)rhs;
+		return (saddr4->sin_addr.s_addr == vaddr4->sin_addr.s_addr);
+	}
+	case AF_INET6: {
+		struct sockaddr_in6 *saddr6 = (struct sockaddr_in6 *)srcaddr;
+		struct sockaddr_in6 *vaddr6 = (struct sockaddr_in6 *)&rhs;
+		return ipv6_addr_equal(&saddr6->sin6_addr, &vaddr6->sin6_addr);
+	}
+	default:
+		WARN_ON(1);
+		return false; /* don't expect to be here */
+	}
+}
+
+
+static bool
+match_address(struct TCP_Server_Info *server, struct sockaddr *addr,
+	      struct sockaddr *srcaddr)
+{
+	struct sockaddr_in *addr4 = (struct sockaddr_in *)addr;
+	struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)addr;
+
+	switch (addr->sa_family) {
+	case AF_INET:
+		if (addr4->sin_addr.s_addr !=
+		    server->addr.sockAddr.sin_addr.s_addr)
+			return false;
+		if (addr4->sin_port &&
+		    addr4->sin_port != server->addr.sockAddr.sin_port)
+			return false;
+		break;
+	case AF_INET6:
+		if (!ipv6_addr_equal(&addr6->sin6_addr,
+				     &server->addr.sockAddr6.sin6_addr))
+			return false;
+		if (addr6->sin6_scope_id !=
+		    server->addr.sockAddr6.sin6_scope_id)
+			return false;
+		if (addr6->sin6_port &&
+		    addr6->sin6_port != server->addr.sockAddr6.sin6_port)
+			return false;
+		break;
+	}
+
+	if (!srcip_matches(srcaddr, (struct sockaddr *)&server->srcaddr))
+		return false;
+
+	return true;
+}
+
+static bool
+match_security(struct TCP_Server_Info *server, struct smb_vol *vol)
+{
+	unsigned int secFlags;
+
+	if (vol->secFlg & (~(CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL)))
+		secFlags = vol->secFlg;
+	else
+		secFlags = global_secflags | vol->secFlg;
+
+	switch (server->secType) {
+	case LANMAN:
+		if (!(secFlags & (CIFSSEC_MAY_LANMAN|CIFSSEC_MAY_PLNTXT)))
+			return false;
+		break;
+	case NTLMv2:
+		if (!(secFlags & CIFSSEC_MAY_NTLMV2))
+			return false;
+		break;
+	case NTLM:
+		if (!(secFlags & CIFSSEC_MAY_NTLM))
+			return false;
+		break;
+	case Kerberos:
+		if (!(secFlags & CIFSSEC_MAY_KRB5))
+			return false;
+		break;
+	case RawNTLMSSP:
+		if (!(secFlags & CIFSSEC_MAY_NTLMSSP))
+			return false;
+		break;
+	default:
+		/* shouldn't happen */
+		return false;
+	}
+
+	/* now check if signing mode is acceptible */
+	if ((secFlags & CIFSSEC_MAY_SIGN) == 0 &&
+	    (server->secMode & SECMODE_SIGN_REQUIRED))
+			return false;
+	else if (((secFlags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) &&
+		 (server->secMode &
+		  (SECMODE_SIGN_ENABLED|SECMODE_SIGN_REQUIRED)) == 0)
+			return false;
+
+	return true;
+}
+
 static struct TCP_Server_Info *
-cifs_find_tcp_session(struct sockaddr_storage *addr, unsigned short int port)
+cifs_find_tcp_session(struct sockaddr *addr, struct smb_vol *vol)
 {
-	struct list_head *tmp;
 	struct TCP_Server_Info *server;
-	struct sockaddr_in *addr4 = (struct sockaddr_in *) addr;
-	struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *) addr;
-
-	write_lock(&cifs_tcp_ses_lock);
-	list_for_each(tmp, &cifs_tcp_ses_list) {
-		server = list_entry(tmp, struct TCP_Server_Info,
-				    tcp_ses_list);
-		/*
-		 * the demux thread can exit on its own while still in CifsNew
-		 * so don't accept any sockets in that state. Since the
-		 * tcpStatus never changes back to CifsNew it's safe to check
-		 * for this without a lock.
-		 */
-		if (server->tcpStatus == CifsNew)
-			continue;
 
-		switch (addr->ss_family) {
-		case AF_INET:
-			if (addr4->sin_addr.s_addr ==
-			    server->addr.sockAddr.sin_addr.s_addr) {
-				addr4->sin_port = htons(port);
-				/* user overrode default port? */
-				if (addr4->sin_port) {
-					if (addr4->sin_port !=
-					    server->addr.sockAddr.sin_port)
-						continue;
-				}
-				break;
-			} else
-				continue;
+	spin_lock(&cifs_tcp_ses_lock);
+	list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
+		if (!match_address(server, addr,
+				   (struct sockaddr *)&vol->srcaddr))
+			continue;
 
-		case AF_INET6:
-			if (ipv6_addr_equal(&addr6->sin6_addr,
-			    &server->addr.sockAddr6.sin6_addr) &&
-			    (addr6->sin6_scope_id ==
-			    server->addr.sockAddr6.sin6_scope_id)) {
-				addr6->sin6_port = htons(port);
-				/* user overrode default port? */
-				if (addr6->sin6_port) {
-					if (addr6->sin6_port !=
-					   server->addr.sockAddr6.sin6_port)
-						continue;
-				}
-				break;
-			} else
-				continue;
-		}
+		if (!match_security(server, vol))
+			continue;
 
 		++server->srv_count;
-		write_unlock(&cifs_tcp_ses_lock);
+		spin_unlock(&cifs_tcp_ses_lock);
 		cFYI(1, "Existing tcp session with server found");
 		return server;
 	}
-	write_unlock(&cifs_tcp_ses_lock);
+	spin_unlock(&cifs_tcp_ses_lock);
 	return NULL;
 }
 
@@ -1447,19 +1550,26 @@ cifs_put_tcp_session(struct TCP_Server_Info *server)
 {
 	struct task_struct *task;
 
-	write_lock(&cifs_tcp_ses_lock);
+	spin_lock(&cifs_tcp_ses_lock);
 	if (--server->srv_count > 0) {
-		write_unlock(&cifs_tcp_ses_lock);
+		spin_unlock(&cifs_tcp_ses_lock);
 		return;
 	}
 
 	list_del_init(&server->tcp_ses_list);
-	write_unlock(&cifs_tcp_ses_lock);
+	spin_unlock(&cifs_tcp_ses_lock);
 
 	spin_lock(&GlobalMid_Lock);
 	server->tcpStatus = CifsExiting;
 	spin_unlock(&GlobalMid_Lock);
 
+	cifs_crypto_shash_release(server);
+	cifs_fscache_release_client_cookie(server);
+
+	kfree(server->session_key.response);
+	server->session_key.response = NULL;
+	server->session_key.len = 0;
+
 	task = xchg(&server->tsk, NULL);
 	if (task)
 		force_sig(SIGKILL, task);
@@ -1479,7 +1589,10 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 	cFYI(1, "UNC: %s ip: %s", volume_info->UNC, volume_info->UNCip);
 
 	if (volume_info->UNCip && volume_info->UNC) {
-		rc = cifs_convert_address(volume_info->UNCip, &addr);
+		rc = cifs_fill_sockaddr((struct sockaddr *)&addr,
+					volume_info->UNCip,
+					strlen(volume_info->UNCip),
+					volume_info->port);
 		if (!rc) {
 			/* we failed translating address */
 			rc = -EINVAL;
@@ -1499,7 +1612,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 	}
 
 	/* see if we already have a matching tcp_ses */
-	tcp_ses = cifs_find_tcp_session(&addr, volume_info->port);
+	tcp_ses = cifs_find_tcp_session((struct sockaddr *)&addr, volume_info);
 	if (tcp_ses)
 		return tcp_ses;
 
@@ -1509,10 +1622,16 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 		goto out_err;
 	}
 
+	rc = cifs_crypto_shash_allocate(tcp_ses);
+	if (rc) {
+		cERROR(1, "could not setup hash structures rc %d", rc);
+		goto out_err;
+	}
+
 	tcp_ses->hostname = extract_hostname(volume_info->UNC);
 	if (IS_ERR(tcp_ses->hostname)) {
 		rc = PTR_ERR(tcp_ses->hostname);
-		goto out_err;
+		goto out_err_crypto_release;
 	}
 
 	tcp_ses->noblocksnd = volume_info->noblocksnd;
@@ -1527,6 +1646,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 		volume_info->source_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL);
 	memcpy(tcp_ses->server_RFC1001_name,
 		volume_info->target_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL);
+	tcp_ses->session_estab = false;
 	tcp_ses->sequence_number = 0;
 	INIT_LIST_HEAD(&tcp_ses->tcp_ses_list);
 	INIT_LIST_HEAD(&tcp_ses->smb_ses_list);
@@ -1537,25 +1657,25 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 	 * no need to spinlock this init of tcpStatus or srv_count
 	 */
 	tcp_ses->tcpStatus = CifsNew;
+	memcpy(&tcp_ses->srcaddr, &volume_info->srcaddr,
+	       sizeof(tcp_ses->srcaddr));
 	++tcp_ses->srv_count;
 
 	if (addr.ss_family == AF_INET6) {
 		cFYI(1, "attempting ipv6 connect");
 		/* BB should we allow ipv6 on port 139? */
 		/* other OS never observed in Wild doing 139 with v6 */
-		sin_server6->sin6_port = htons(volume_info->port);
 		memcpy(&tcp_ses->addr.sockAddr6, sin_server6,
 			sizeof(struct sockaddr_in6));
 		rc = ipv6_connect(tcp_ses);
 	} else {
-		sin_server->sin_port = htons(volume_info->port);
 		memcpy(&tcp_ses->addr.sockAddr, sin_server,
 			sizeof(struct sockaddr_in));
 		rc = ipv4_connect(tcp_ses);
 	}
 	if (rc < 0) {
 		cERROR(1, "Error connecting to socket. Aborting operation");
-		goto out_err;
+		goto out_err_crypto_release;
 	}
 
 	/*
@@ -1569,16 +1689,21 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 		rc = PTR_ERR(tcp_ses->tsk);
 		cERROR(1, "error %d create cifsd thread", rc);
 		module_put(THIS_MODULE);
-		goto out_err;
+		goto out_err_crypto_release;
 	}
 
 	/* thread spawned, put it on the list */
-	write_lock(&cifs_tcp_ses_lock);
+	spin_lock(&cifs_tcp_ses_lock);
 	list_add(&tcp_ses->tcp_ses_list, &cifs_tcp_ses_list);
-	write_unlock(&cifs_tcp_ses_lock);
+	spin_unlock(&cifs_tcp_ses_lock);
+
+	cifs_fscache_get_client_cookie(tcp_ses);
 
 	return tcp_ses;
 
+out_err_crypto_release:
+	cifs_crypto_shash_release(tcp_ses);
+
 out_err:
 	if (tcp_ses) {
 		if (!IS_ERR(tcp_ses->hostname))
@@ -1591,22 +1716,34 @@ out_err:
 }
 
 static struct cifsSesInfo *
-cifs_find_smb_ses(struct TCP_Server_Info *server, char *username)
+cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb_vol *vol)
 {
-	struct list_head *tmp;
 	struct cifsSesInfo *ses;
 
-	write_lock(&cifs_tcp_ses_lock);
-	list_for_each(tmp, &server->smb_ses_list) {
-		ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list);
-		if (strncmp(ses->userName, username, MAX_USERNAME_SIZE))
-			continue;
-
+	spin_lock(&cifs_tcp_ses_lock);
+	list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
+		switch (server->secType) {
+		case Kerberos:
+			if (vol->cred_uid != ses->cred_uid)
+				continue;
+			break;
+		default:
+			/* anything else takes username/password */
+			if (strncmp(ses->userName, vol->username,
+				    MAX_USERNAME_SIZE))
+				continue;
+			if (strlen(vol->username) != 0 &&
+			    ses->password != NULL &&
+			    strncmp(ses->password,
+				    vol->password ? vol->password : "",
+				    MAX_PASSWORD_SIZE))
+				continue;
+		}
 		++ses->ses_count;
-		write_unlock(&cifs_tcp_ses_lock);
+		spin_unlock(&cifs_tcp_ses_lock);
 		return ses;
 	}
-	write_unlock(&cifs_tcp_ses_lock);
+	spin_unlock(&cifs_tcp_ses_lock);
 	return NULL;
 }
 
@@ -1617,14 +1754,14 @@ cifs_put_smb_ses(struct cifsSesInfo *ses)
 	struct TCP_Server_Info *server = ses->server;
 
 	cFYI(1, "%s: ses_count=%d\n", __func__, ses->ses_count);
-	write_lock(&cifs_tcp_ses_lock);
+	spin_lock(&cifs_tcp_ses_lock);
 	if (--ses->ses_count > 0) {
-		write_unlock(&cifs_tcp_ses_lock);
+		spin_unlock(&cifs_tcp_ses_lock);
 		return;
 	}
 
 	list_del_init(&ses->smb_ses_list);
-	write_unlock(&cifs_tcp_ses_lock);
+	spin_unlock(&cifs_tcp_ses_lock);
 
 	if (ses->status == CifsGood) {
 		xid = GetXid();
@@ -1643,13 +1780,10 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
 
 	xid = GetXid();
 
-	ses = cifs_find_smb_ses(server, volume_info->username);
+	ses = cifs_find_smb_ses(server, volume_info);
 	if (ses) {
 		cFYI(1, "Existing smb sess found (status=%d)", ses->status);
 
-		/* existing SMB ses has a server reference already */
-		cifs_put_tcp_session(server);
-
 		mutex_lock(&ses->session_mutex);
 		rc = cifs_negotiate_protocol(xid, ses);
 		if (rc) {
@@ -1672,6 +1806,9 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
 			}
 		}
 		mutex_unlock(&ses->session_mutex);
+
+		/* existing SMB ses has a server reference already */
+		cifs_put_tcp_session(server);
 		FreeXid(xid);
 		return ses;
 	}
@@ -1701,11 +1838,11 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
 			goto get_ses_fail;
 	}
 	if (volume_info->domainname) {
-		int len = strlen(volume_info->domainname);
-		ses->domainName = kmalloc(len + 1, GFP_KERNEL);
-		if (ses->domainName)
-			strcpy(ses->domainName, volume_info->domainname);
+		ses->domainName = kstrdup(volume_info->domainname, GFP_KERNEL);
+		if (!ses->domainName)
+			goto get_ses_fail;
 	}
+	ses->cred_uid = volume_info->cred_uid;
 	ses->linux_uid = volume_info->linux_uid;
 	ses->overrideSecFlg = volume_info->secFlg;
 
@@ -1718,9 +1855,9 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
 		goto get_ses_fail;
 
 	/* success, put it on the list */
-	write_lock(&cifs_tcp_ses_lock);
+	spin_lock(&cifs_tcp_ses_lock);
 	list_add(&ses->smb_ses_list, &server->smb_ses_list);
-	write_unlock(&cifs_tcp_ses_lock);
+	spin_unlock(&cifs_tcp_ses_lock);
 
 	FreeXid(xid);
 	return ses;
@@ -1737,7 +1874,7 @@ cifs_find_tcon(struct cifsSesInfo *ses, const char *unc)
 	struct list_head *tmp;
 	struct cifsTconInfo *tcon;
 
-	write_lock(&cifs_tcp_ses_lock);
+	spin_lock(&cifs_tcp_ses_lock);
 	list_for_each(tmp, &ses->tcon_list) {
 		tcon = list_entry(tmp, struct cifsTconInfo, tcon_list);
 		if (tcon->tidStatus == CifsExiting)
@@ -1746,10 +1883,10 @@ cifs_find_tcon(struct cifsSesInfo *ses, const char *unc)
 			continue;
 
 		++tcon->tc_count;
-		write_unlock(&cifs_tcp_ses_lock);
+		spin_unlock(&cifs_tcp_ses_lock);
 		return tcon;
 	}
-	write_unlock(&cifs_tcp_ses_lock);
+	spin_unlock(&cifs_tcp_ses_lock);
 	return NULL;
 }
 
@@ -1760,19 +1897,20 @@ cifs_put_tcon(struct cifsTconInfo *tcon)
 	struct cifsSesInfo *ses = tcon->ses;
 
 	cFYI(1, "%s: tc_count=%d\n", __func__, tcon->tc_count);
-	write_lock(&cifs_tcp_ses_lock);
+	spin_lock(&cifs_tcp_ses_lock);
 	if (--tcon->tc_count > 0) {
-		write_unlock(&cifs_tcp_ses_lock);
+		spin_unlock(&cifs_tcp_ses_lock);
 		return;
 	}
 
 	list_del_init(&tcon->tcon_list);
-	write_unlock(&cifs_tcp_ses_lock);
+	spin_unlock(&cifs_tcp_ses_lock);
 
 	xid = GetXid();
 	CIFSSMBTDis(xid, tcon);
 	_FreeXid(xid);
 
+	cifs_fscache_release_super_cookie(tcon);
 	tconInfoFree(tcon);
 	cifs_put_smb_ses(ses);
 }
@@ -1839,9 +1977,11 @@ cifs_get_tcon(struct cifsSesInfo *ses, struct smb_vol *volume_info)
 	tcon->nocase = volume_info->nocase;
 	tcon->local_lease = volume_info->local_lease;
 
-	write_lock(&cifs_tcp_ses_lock);
+	spin_lock(&cifs_tcp_ses_lock);
 	list_add(&tcon->tcon_list, &ses->tcon_list);
-	write_unlock(&cifs_tcp_ses_lock);
+	spin_unlock(&cifs_tcp_ses_lock);
+
+	cifs_fscache_get_super_cookie(tcon);
 
 	return tcon;
 
@@ -1850,6 +1990,23 @@ out_fail:
 	return ERR_PTR(rc);
 }
 
+void
+cifs_put_tlink(struct tcon_link *tlink)
+{
+	if (!tlink || IS_ERR(tlink))
+		return;
+
+	if (!atomic_dec_and_test(&tlink->tl_count) ||
+	    test_bit(TCON_LINK_IN_TREE, &tlink->tl_flags)) {
+		tlink->tl_time = jiffies;
+		return;
+	}
+
+	if (!IS_ERR(tlink_tcon(tlink)))
+		cifs_put_tcon(tlink_tcon(tlink));
+	kfree(tlink);
+	return;
+}
 
 int
 get_dfs_path(int xid, struct cifsSesInfo *pSesInfo, const char *old_path,
@@ -1934,6 +2091,33 @@ static void rfc1002mangle(char *target, char *source, unsigned int length)
 
 }
 
+static int
+bind_socket(struct TCP_Server_Info *server)
+{
+	int rc = 0;
+	if (server->srcaddr.ss_family != AF_UNSPEC) {
+		/* Bind to the specified local IP address */
+		struct socket *socket = server->ssocket;
+		rc = socket->ops->bind(socket,
+				       (struct sockaddr *) &server->srcaddr,
+				       sizeof(server->srcaddr));
+		if (rc < 0) {
+			struct sockaddr_in *saddr4;
+			struct sockaddr_in6 *saddr6;
+			saddr4 = (struct sockaddr_in *)&server->srcaddr;
+			saddr6 = (struct sockaddr_in6 *)&server->srcaddr;
+			if (saddr6->sin6_family == AF_INET6)
+				cERROR(1, "cifs: "
+				       "Failed to bind to: %pI6c, error: %d\n",
+				       &saddr6->sin6_addr, rc);
+			else
+				cERROR(1, "cifs: "
+				       "Failed to bind to: %pI4, error: %d\n",
+				       &saddr4->sin_addr.s_addr, rc);
+		}
+	}
+	return rc;
+}
 
 static int
 ipv4_connect(struct TCP_Server_Info *server)
@@ -1959,6 +2143,10 @@ ipv4_connect(struct TCP_Server_Info *server)
 		cifs_reclassify_socket4(socket);
 	}
 
+	rc = bind_socket(server);
+	if (rc < 0)
+		return rc;
+
 	/* user overrode default port */
 	if (server->addr.sockAddr.sin_port) {
 		rc = socket->ops->connect(socket, (struct sockaddr *)
@@ -2121,6 +2309,10 @@ ipv6_connect(struct TCP_Server_Info *server)
 		cifs_reclassify_socket6(socket);
 	}
 
+	rc = bind_socket(server);
+	if (rc < 0)
+		return rc;
+
 	/* user overrode default port */
 	if (server->addr.sockAddr6.sin6_port) {
 		rc = socket->ops->connect(socket,
@@ -2320,6 +2512,8 @@ convert_delimiter(char *path, char delim)
 static void setup_cifs_sb(struct smb_vol *pvolume_info,
 			  struct cifs_sb_info *cifs_sb)
 {
+	INIT_DELAYED_WORK(&cifs_sb->prune_tlinks, cifs_prune_tlinks);
+
 	if (pvolume_info->rsize > CIFSMaxBufSize) {
 		cERROR(1, "rsize %d too large, using MaxBufSize",
 			pvolume_info->rsize);
@@ -2397,10 +2591,23 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
 		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_GID;
 	if (pvolume_info->dynperm)
 		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DYNPERM;
+	if (pvolume_info->fsc)
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_FSCACHE;
+	if (pvolume_info->multiuser)
+		cifs_sb->mnt_cifs_flags |= (CIFS_MOUNT_MULTIUSER |
+					    CIFS_MOUNT_NO_PERM);
 	if (pvolume_info->direct_io) {
 		cFYI(1, "mounting share using direct i/o");
 		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO;
 	}
+	if (pvolume_info->mfsymlinks) {
+		if (pvolume_info->sfu_emul) {
+			cERROR(1,  "mount option mfsymlinks ignored if sfu "
+				   "mount option is used");
+		} else {
+			cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_MF_SYMLINKS;
+		}
+	}
 
 	if ((pvolume_info->cifs_acl) && (pvolume_info->dynperm))
 		cERROR(1, "mount option dynperm ignored if cifsacl "
@@ -2487,6 +2694,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 	struct TCP_Server_Info *srvTcp;
 	char   *full_path;
 	char *mount_data = mount_data_global;
+	struct tcon_link *tlink;
 #ifdef CONFIG_CIFS_DFS_UPCALL
 	struct dfs_info3_param *referrals = NULL;
 	unsigned int num_referrals = 0;
@@ -2498,6 +2706,7 @@ try_mount_again:
 	pSesInfo = NULL;
 	srvTcp = NULL;
 	full_path = NULL;
+	tlink = NULL;
 
 	xid = GetXid();
 
@@ -2573,8 +2782,6 @@ try_mount_again:
 		goto remote_path_check;
 	}
 
-	cifs_sb->tcon = tcon;
-
 	/* do not care if following two calls succeed - informational */
 	if (!tcon->ipc) {
 		CIFSSMBQFSDeviceInfo(xid, tcon);
@@ -2683,6 +2890,38 @@ remote_path_check:
 #endif
 	}
 
+	if (rc)
+		goto mount_fail_check;
+
+	/* now, hang the tcon off of the superblock */
+	tlink = kzalloc(sizeof *tlink, GFP_KERNEL);
+	if (tlink == NULL) {
+		rc = -ENOMEM;
+		goto mount_fail_check;
+	}
+
+	tlink->tl_index = pSesInfo->linux_uid;
+	tlink->tl_tcon = tcon;
+	tlink->tl_time = jiffies;
+	set_bit(TCON_LINK_MASTER, &tlink->tl_flags);
+	set_bit(TCON_LINK_IN_TREE, &tlink->tl_flags);
+
+	rc = radix_tree_preload(GFP_KERNEL);
+	if (rc == -ENOMEM) {
+		kfree(tlink);
+		goto mount_fail_check;
+	}
+
+	spin_lock(&cifs_sb->tlink_tree_lock);
+	radix_tree_insert(&cifs_sb->tlink_tree, pSesInfo->linux_uid, tlink);
+	radix_tree_tag_set(&cifs_sb->tlink_tree, pSesInfo->linux_uid,
+			   CIFS_TLINK_MASTER_TAG);
+	spin_unlock(&cifs_sb->tlink_tree_lock);
+	radix_tree_preload_end();
+
+	queue_delayed_work(system_nrt_wq, &cifs_sb->prune_tlinks,
+				TLINK_IDLE_EXPIRE);
+
 mount_fail_check:
 	/* on error free sesinfo and tcon struct if needed */
 	if (rc) {
@@ -2760,14 +2999,13 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
 		if ((global_secflags & CIFSSEC_MAY_LANMAN) &&
 		    (ses->server->secType == LANMAN))
-			calc_lanman_hash(tcon->password, ses->server->cryptKey,
+			calc_lanman_hash(tcon->password, ses->server->cryptkey,
 					 ses->server->secMode &
 					    SECMODE_PW_ENCRYPT ? true : false,
 					 bcc_ptr);
 		else
 #endif /* CIFS_WEAK_PW_HASH */
-		SMBNTencrypt(tcon->password, ses->server->cryptKey,
-			     bcc_ptr);
+		SMBNTencrypt(tcon->password, ses->server->cryptkey, bcc_ptr);
 
 		bcc_ptr += CIFS_SESS_KEY_SIZE;
 		if (ses->capabilities & CAP_UNICODE) {
@@ -2869,19 +3107,39 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
 int
 cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb)
 {
-	int rc = 0;
+	int i, ret;
 	char *tmp;
+	struct tcon_link *tlink[8];
+	unsigned long index = 0;
+
+	cancel_delayed_work_sync(&cifs_sb->prune_tlinks);
+
+	do {
+		spin_lock(&cifs_sb->tlink_tree_lock);
+		ret = radix_tree_gang_lookup(&cifs_sb->tlink_tree,
+					     (void **)tlink, index,
+					     ARRAY_SIZE(tlink));
+		/* increment index for next pass */
+		if (ret > 0)
+			index = tlink[ret - 1]->tl_index + 1;
+		for (i = 0; i < ret; i++) {
+			cifs_get_tlink(tlink[i]);
+			clear_bit(TCON_LINK_IN_TREE, &tlink[i]->tl_flags);
+			radix_tree_delete(&cifs_sb->tlink_tree,
+							tlink[i]->tl_index);
+		}
+		spin_unlock(&cifs_sb->tlink_tree_lock);
 
-	if (cifs_sb->tcon)
-		cifs_put_tcon(cifs_sb->tcon);
+		for (i = 0; i < ret; i++)
+			cifs_put_tlink(tlink[i]);
+	} while (ret != 0);
 
-	cifs_sb->tcon = NULL;
 	tmp = cifs_sb->prepath;
 	cifs_sb->prepathlen = 0;
 	cifs_sb->prepath = NULL;
 	kfree(tmp);
 
-	return rc;
+	return 0;
 }
 
 int cifs_negotiate_protocol(unsigned int xid, struct cifsSesInfo *ses)
@@ -2932,6 +3190,16 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *ses,
 	if (rc) {
 		cERROR(1, "Send error in SessSetup = %d", rc);
 	} else {
+		mutex_lock(&ses->server->srv_mutex);
+		if (!server->session_estab) {
+			server->session_key.response = ses->auth_key.response;
+			server->session_key.len = ses->auth_key.len;
+			server->sequence_number = 0x2;
+			server->session_estab = true;
+			ses->auth_key.response = NULL;
+		}
+		mutex_unlock(&server->srv_mutex);
+
 		cFYI(1, "CIFS Session Established successfully");
 		spin_lock(&GlobalMid_Lock);
 		ses->status = CifsGood;
@@ -2939,6 +3207,246 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *ses,
 		spin_unlock(&GlobalMid_Lock);
 	}
 
+	kfree(ses->auth_key.response);
+	ses->auth_key.response = NULL;
+	ses->auth_key.len = 0;
+	kfree(ses->ntlmssp);
+	ses->ntlmssp = NULL;
+
 	return rc;
 }
 
+static struct cifsTconInfo *
+cifs_construct_tcon(struct cifs_sb_info *cifs_sb, uid_t fsuid)
+{
+	struct cifsTconInfo *master_tcon = cifs_sb_master_tcon(cifs_sb);
+	struct cifsSesInfo *ses;
+	struct cifsTconInfo *tcon = NULL;
+	struct smb_vol *vol_info;
+	char username[MAX_USERNAME_SIZE + 1];
+
+	vol_info = kzalloc(sizeof(*vol_info), GFP_KERNEL);
+	if (vol_info == NULL) {
+		tcon = ERR_PTR(-ENOMEM);
+		goto out;
+	}
+
+	snprintf(username, MAX_USERNAME_SIZE, "krb50x%x", fsuid);
+	vol_info->username = username;
+	vol_info->local_nls = cifs_sb->local_nls;
+	vol_info->linux_uid = fsuid;
+	vol_info->cred_uid = fsuid;
+	vol_info->UNC = master_tcon->treeName;
+	vol_info->retry = master_tcon->retry;
+	vol_info->nocase = master_tcon->nocase;
+	vol_info->local_lease = master_tcon->local_lease;
+	vol_info->no_linux_ext = !master_tcon->unix_ext;
+
+	/* FIXME: allow for other secFlg settings */
+	vol_info->secFlg = CIFSSEC_MUST_KRB5;
+
+	/* get a reference for the same TCP session */
+	spin_lock(&cifs_tcp_ses_lock);
+	++master_tcon->ses->server->srv_count;
+	spin_unlock(&cifs_tcp_ses_lock);
+
+	ses = cifs_get_smb_ses(master_tcon->ses->server, vol_info);
+	if (IS_ERR(ses)) {
+		tcon = (struct cifsTconInfo *)ses;
+		cifs_put_tcp_session(master_tcon->ses->server);
+		goto out;
+	}
+
+	tcon = cifs_get_tcon(ses, vol_info);
+	if (IS_ERR(tcon)) {
+		cifs_put_smb_ses(ses);
+		goto out;
+	}
+
+	if (ses->capabilities & CAP_UNIX)
+		reset_cifs_unix_caps(0, tcon, NULL, vol_info);
+out:
+	kfree(vol_info);
+
+	return tcon;
+}
+
+static struct tcon_link *
+cifs_sb_master_tlink(struct cifs_sb_info *cifs_sb)
+{
+	struct tcon_link *tlink;
+	unsigned int ret;
+
+	spin_lock(&cifs_sb->tlink_tree_lock);
+	ret = radix_tree_gang_lookup_tag(&cifs_sb->tlink_tree, (void **)&tlink,
+					0, 1, CIFS_TLINK_MASTER_TAG);
+	spin_unlock(&cifs_sb->tlink_tree_lock);
+
+	/* the master tcon should always be present */
+	if (ret == 0)
+		BUG();
+
+	return tlink;
+}
+
+struct cifsTconInfo *
+cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb)
+{
+	return tlink_tcon(cifs_sb_master_tlink(cifs_sb));
+}
+
+static int
+cifs_sb_tcon_pending_wait(void *unused)
+{
+	schedule();
+	return signal_pending(current) ? -ERESTARTSYS : 0;
+}
+
+/*
+ * Find or construct an appropriate tcon given a cifs_sb and the fsuid of the
+ * current task.
+ *
+ * If the superblock doesn't refer to a multiuser mount, then just return
+ * the master tcon for the mount.
+ *
+ * First, search the radix tree for an existing tcon for this fsuid. If one
+ * exists, then check to see if it's pending construction. If it is then wait
+ * for construction to complete. Once it's no longer pending, check to see if
+ * it failed and either return an error or retry construction, depending on
+ * the timeout.
+ *
+ * If one doesn't exist then insert a new tcon_link struct into the tree and
+ * try to construct a new one.
+ */
+struct tcon_link *
+cifs_sb_tlink(struct cifs_sb_info *cifs_sb)
+{
+	int ret;
+	unsigned long fsuid = (unsigned long) current_fsuid();
+	struct tcon_link *tlink, *newtlink;
+
+	if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER))
+		return cifs_get_tlink(cifs_sb_master_tlink(cifs_sb));
+
+	spin_lock(&cifs_sb->tlink_tree_lock);
+	tlink = radix_tree_lookup(&cifs_sb->tlink_tree, fsuid);
+	if (tlink)
+		cifs_get_tlink(tlink);
+	spin_unlock(&cifs_sb->tlink_tree_lock);
+
+	if (tlink == NULL) {
+		newtlink = kzalloc(sizeof(*tlink), GFP_KERNEL);
+		if (newtlink == NULL)
+			return ERR_PTR(-ENOMEM);
+		newtlink->tl_index = fsuid;
+		newtlink->tl_tcon = ERR_PTR(-EACCES);
+		set_bit(TCON_LINK_PENDING, &newtlink->tl_flags);
+		set_bit(TCON_LINK_IN_TREE, &newtlink->tl_flags);
+		cifs_get_tlink(newtlink);
+
+		ret = radix_tree_preload(GFP_KERNEL);
+		if (ret != 0) {
+			kfree(newtlink);
+			return ERR_PTR(ret);
+		}
+
+		spin_lock(&cifs_sb->tlink_tree_lock);
+		/* was one inserted after previous search? */
+		tlink = radix_tree_lookup(&cifs_sb->tlink_tree, fsuid);
+		if (tlink) {
+			cifs_get_tlink(tlink);
+			spin_unlock(&cifs_sb->tlink_tree_lock);
+			radix_tree_preload_end();
+			kfree(newtlink);
+			goto wait_for_construction;
+		}
+		ret = radix_tree_insert(&cifs_sb->tlink_tree, fsuid, newtlink);
+		spin_unlock(&cifs_sb->tlink_tree_lock);
+		radix_tree_preload_end();
+		if (ret) {
+			kfree(newtlink);
+			return ERR_PTR(ret);
+		}
+		tlink = newtlink;
+	} else {
+wait_for_construction:
+		ret = wait_on_bit(&tlink->tl_flags, TCON_LINK_PENDING,
+				  cifs_sb_tcon_pending_wait,
+				  TASK_INTERRUPTIBLE);
+		if (ret) {
+			cifs_put_tlink(tlink);
+			return ERR_PTR(ret);
+		}
+
+		/* if it's good, return it */
+		if (!IS_ERR(tlink->tl_tcon))
+			return tlink;
+
+		/* return error if we tried this already recently */
+		if (time_before(jiffies, tlink->tl_time + TLINK_ERROR_EXPIRE)) {
+			cifs_put_tlink(tlink);
+			return ERR_PTR(-EACCES);
+		}
+
+		if (test_and_set_bit(TCON_LINK_PENDING, &tlink->tl_flags))
+			goto wait_for_construction;
+	}
+
+	tlink->tl_tcon = cifs_construct_tcon(cifs_sb, fsuid);
+	clear_bit(TCON_LINK_PENDING, &tlink->tl_flags);
+	wake_up_bit(&tlink->tl_flags, TCON_LINK_PENDING);
+
+	if (IS_ERR(tlink->tl_tcon)) {
+		cifs_put_tlink(tlink);
+		return ERR_PTR(-EACCES);
+	}
+
+	return tlink;
+}
+
+/*
+ * periodic workqueue job that scans tcon_tree for a superblock and closes
+ * out tcons.
+ */
+static void
+cifs_prune_tlinks(struct work_struct *work)
+{
+	struct cifs_sb_info *cifs_sb = container_of(work, struct cifs_sb_info,
+						    prune_tlinks.work);
+	struct tcon_link *tlink[8];
+	unsigned long now = jiffies;
+	unsigned long index = 0;
+	int i, ret;
+
+	do {
+		spin_lock(&cifs_sb->tlink_tree_lock);
+		ret = radix_tree_gang_lookup(&cifs_sb->tlink_tree,
+					     (void **)tlink, index,
+					     ARRAY_SIZE(tlink));
+		/* increment index for next pass */
+		if (ret > 0)
+			index = tlink[ret - 1]->tl_index + 1;
+		for (i = 0; i < ret; i++) {
+			if (test_bit(TCON_LINK_MASTER, &tlink[i]->tl_flags) ||
+			    atomic_read(&tlink[i]->tl_count) != 0 ||
+			    time_after(tlink[i]->tl_time + TLINK_IDLE_EXPIRE,
+				       now)) {
+				tlink[i] = NULL;
+				continue;
+			}
+			cifs_get_tlink(tlink[i]);
+			clear_bit(TCON_LINK_IN_TREE, &tlink[i]->tl_flags);
+			radix_tree_delete(&cifs_sb->tlink_tree,
+					  tlink[i]->tl_index);
+		}
+		spin_unlock(&cifs_sb->tlink_tree_lock);
+
+		for (i = 0; i < ret; i++) {
+			if (tlink[i] != NULL)
+				cifs_put_tlink(tlink[i]);
+		}
+	} while (ret != 0);
+
+	queue_delayed_work(system_nrt_wq, &cifs_sb->prune_tlinks,
+				TLINK_IDLE_EXPIRE);
+}
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index e7ae78b66fa..3840eddbfb7 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -54,18 +54,18 @@ build_path_from_dentry(struct dentry *direntry)
 	int dfsplen;
 	char *full_path;
 	char dirsep;
-	struct cifs_sb_info *cifs_sb;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb);
+	struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
 
 	if (direntry == NULL)
 		return NULL;  /* not much we can do if dentry is freed and
 		we need to reopen the file after it was closed implicitly
 		when the server crashed */
 
-	cifs_sb = CIFS_SB(direntry->d_sb);
 	dirsep = CIFS_DIR_SEP(cifs_sb);
 	pplen = cifs_sb->prepathlen;
-	if (cifs_sb->tcon && (cifs_sb->tcon->Flags & SMB_SHARE_IS_IN_DFS))
-		dfsplen = strnlen(cifs_sb->tcon->treeName, MAX_TREE_SIZE + 1);
+	if (tcon->Flags & SMB_SHARE_IS_IN_DFS)
+		dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1);
 	else
 		dfsplen = 0;
 cifs_bp_rename_retry:
@@ -117,7 +117,7 @@ cifs_bp_rename_retry:
 	/* BB test paths to Windows with '/' in the midst of prepath */
 
 	if (dfsplen) {
-		strncpy(full_path, cifs_sb->tcon->treeName, dfsplen);
+		strncpy(full_path, tcon->treeName, dfsplen);
 		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) {
 			int i;
 			for (i = 0; i < dfsplen; i++) {
@@ -130,141 +130,6 @@ cifs_bp_rename_retry:
 	return full_path;
 }
 
-/*
- * When called with struct file pointer set to NULL, there is no way we could
- * update file->private_data, but getting it stuck on openFileList provides a
- * way to access it from cifs_fill_filedata and thereby set file->private_data
- * from cifs_open.
- */
-struct cifsFileInfo *
-cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle,
-		  struct file *file, struct vfsmount *mnt, unsigned int oflags)
-{
-	int oplock = 0;
-	struct cifsFileInfo *pCifsFile;
-	struct cifsInodeInfo *pCifsInode;
-	struct cifs_sb_info *cifs_sb = CIFS_SB(mnt->mnt_sb);
-
-	pCifsFile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
-	if (pCifsFile == NULL)
-		return pCifsFile;
-
-	if (oplockEnabled)
-		oplock = REQ_OPLOCK;
-
-	pCifsFile->netfid = fileHandle;
-	pCifsFile->pid = current->tgid;
-	pCifsFile->pInode = igrab(newinode);
-	pCifsFile->mnt = mnt;
-	pCifsFile->pfile = file;
-	pCifsFile->invalidHandle = false;
-	pCifsFile->closePend = false;
-	mutex_init(&pCifsFile->fh_mutex);
-	mutex_init(&pCifsFile->lock_mutex);
-	INIT_LIST_HEAD(&pCifsFile->llist);
-	atomic_set(&pCifsFile->count, 1);
-	slow_work_init(&pCifsFile->oplock_break, &cifs_oplock_break_ops);
-
-	write_lock(&GlobalSMBSeslock);
-	list_add(&pCifsFile->tlist, &cifs_sb->tcon->openFileList);
-	pCifsInode = CIFS_I(newinode);
-	if (pCifsInode) {
-		/* if readable file instance put first in list*/
-		if (oflags & FMODE_READ)
-			list_add(&pCifsFile->flist, &pCifsInode->openFileList);
-		else
-			list_add_tail(&pCifsFile->flist,
-				      &pCifsInode->openFileList);
-
-		if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
-			pCifsInode->clientCanCacheAll = true;
-			pCifsInode->clientCanCacheRead = true;
-			cFYI(1, "Exclusive Oplock inode %p", newinode);
-		} else if ((oplock & 0xF) == OPLOCK_READ)
-				pCifsInode->clientCanCacheRead = true;
-	}
-	write_unlock(&GlobalSMBSeslock);
-
-	file->private_data = pCifsFile;
-
-	return pCifsFile;
-}
-
-int cifs_posix_open(char *full_path, struct inode **pinode,
-			struct super_block *sb, int mode, int oflags,
-			__u32 *poplock, __u16 *pnetfid, int xid)
-{
-	int rc;
-	FILE_UNIX_BASIC_INFO *presp_data;
-	__u32 posix_flags = 0;
-	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
-	struct cifs_fattr fattr;
-
-	cFYI(1, "posix open %s", full_path);
-
-	presp_data = kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL);
-	if (presp_data == NULL)
-		return -ENOMEM;
-
-/* So far cifs posix extensions can only map the following flags.
-   There are other valid fmode oflags such as FMODE_LSEEK, FMODE_PREAD, but
-   so far we do not seem to need them, and we can treat them as local only */
-	if ((oflags & (FMODE_READ | FMODE_WRITE)) ==
-		(FMODE_READ | FMODE_WRITE))
-		posix_flags = SMB_O_RDWR;
-	else if (oflags & FMODE_READ)
-		posix_flags = SMB_O_RDONLY;
-	else if (oflags & FMODE_WRITE)
-		posix_flags = SMB_O_WRONLY;
-	if (oflags & O_CREAT)
-		posix_flags |= SMB_O_CREAT;
-	if (oflags & O_EXCL)
-		posix_flags |= SMB_O_EXCL;
-	if (oflags & O_TRUNC)
-		posix_flags |= SMB_O_TRUNC;
-	/* be safe and imply O_SYNC for O_DSYNC */
-	if (oflags & O_DSYNC)
-		posix_flags |= SMB_O_SYNC;
-	if (oflags & O_DIRECTORY)
-		posix_flags |= SMB_O_DIRECTORY;
-	if (oflags & O_NOFOLLOW)
-		posix_flags |= SMB_O_NOFOLLOW;
-	if (oflags & O_DIRECT)
-		posix_flags |= SMB_O_DIRECT;
-
-	mode &= ~current_umask();
-	rc = CIFSPOSIXCreate(xid, cifs_sb->tcon, posix_flags, mode,
-			pnetfid, presp_data, poplock, full_path,
-			cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
-					CIFS_MOUNT_MAP_SPECIAL_CHR);
-	if (rc)
-		goto posix_open_ret;
-
-	if (presp_data->Type == cpu_to_le32(-1))
-		goto posix_open_ret; /* open ok, caller does qpathinfo */
-
-	if (!pinode)
-		goto posix_open_ret; /* caller does not need info */
-
-	cifs_unix_basic_to_fattr(&fattr, presp_data, cifs_sb);
-
-	/* get new inode and set it up */
-	if (*pinode == NULL) {
-		cifs_fill_uniqueid(sb, &fattr);
-		*pinode = cifs_iget(sb, &fattr);
-		if (!*pinode) {
-			rc = -ENOMEM;
-			goto posix_open_ret;
-		}
-	} else {
-		cifs_fattr_to_inode(*pinode, &fattr);
-	}
-
-posix_open_ret:
-	kfree(presp_data);
-	return rc;
-}
-
 static void setup_cifs_dentry(struct cifsTconInfo *tcon,
 			      struct dentry *direntry,
 			      struct inode *newinode)
@@ -297,6 +162,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 	int desiredAccess = GENERIC_READ | GENERIC_WRITE;
 	__u16 fileHandle;
 	struct cifs_sb_info *cifs_sb;
+	struct tcon_link *tlink;
 	struct cifsTconInfo *tcon;
 	char *full_path = NULL;
 	FILE_ALL_INFO *buf = NULL;
@@ -306,22 +172,26 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 	xid = GetXid();
 
 	cifs_sb = CIFS_SB(inode->i_sb);
-	tcon = cifs_sb->tcon;
-
-	full_path = build_path_from_dentry(direntry);
-	if (full_path == NULL) {
-		rc = -ENOMEM;
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink)) {
 		FreeXid(xid);
-		return rc;
+		return PTR_ERR(tlink);
 	}
+	tcon = tlink_tcon(tlink);
 
 	if (oplockEnabled)
 		oplock = REQ_OPLOCK;
 
 	if (nd && (nd->flags & LOOKUP_OPEN))
-		oflags = nd->intent.open.flags;
+		oflags = nd->intent.open.file->f_flags;
 	else
-		oflags = FMODE_READ | SMB_O_CREAT;
+		oflags = O_RDONLY | O_CREAT;
+
+	full_path = build_path_from_dentry(direntry);
+	if (full_path == NULL) {
+		rc = -ENOMEM;
+		goto cifs_create_out;
+	}
 
 	if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) &&
 	    (CIFS_UNIX_POSIX_PATH_OPS_CAP &
@@ -351,9 +221,9 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 		/* if the file is going to stay open, then we
 		   need to set the desired access properly */
 		desiredAccess = 0;
-		if (oflags & FMODE_READ)
+		if (OPEN_FMODE(oflags) & FMODE_READ)
 			desiredAccess |= GENERIC_READ; /* is this too little? */
-		if (oflags & FMODE_WRITE)
+		if (OPEN_FMODE(oflags) & FMODE_WRITE)
 			desiredAccess |= GENERIC_WRITE;
 
 		if ((oflags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
@@ -371,9 +241,8 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 
 	buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
 	if (buf == NULL) {
-		kfree(full_path);
-		FreeXid(xid);
-		return -ENOMEM;
+		rc = -ENOMEM;
+		goto cifs_create_out;
 	}
 
 	/*
@@ -383,7 +252,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 	if (!tcon->unix_ext && (mode & S_IWUGO) == 0)
 		create_options |= CREATE_OPTION_READONLY;
 
-	if (cifs_sb->tcon->ses->capabilities & CAP_NT_SMBS)
+	if (tcon->ses->capabilities & CAP_NT_SMBS)
 		rc = CIFSSMBOpen(xid, tcon, full_path, disposition,
 			 desiredAccess, create_options,
 			 &fileHandle, &oplock, buf, cifs_sb->local_nls,
@@ -475,8 +344,7 @@ cifs_create_set_dentry:
 			goto cifs_create_out;
 		}
 
-		pfile_info = cifs_new_fileinfo(newinode, fileHandle, filp,
-					       nd->path.mnt, oflags);
+		pfile_info = cifs_new_fileinfo(fileHandle, filp, tlink, oplock);
 		if (pfile_info == NULL) {
 			fput(filp);
 			CIFSSMBClose(xid, tcon, fileHandle);
@@ -489,6 +357,7 @@ cifs_create_set_dentry:
 cifs_create_out:
 	kfree(buf);
 	kfree(full_path);
+	cifs_put_tlink(tlink);
 	FreeXid(xid);
 	return rc;
 }
@@ -499,22 +368,35 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
 	int rc = -EPERM;
 	int xid;
 	struct cifs_sb_info *cifs_sb;
+	struct tcon_link *tlink;
 	struct cifsTconInfo *pTcon;
 	char *full_path = NULL;
 	struct inode *newinode = NULL;
+	int oplock = 0;
+	u16 fileHandle;
+	FILE_ALL_INFO *buf = NULL;
+	unsigned int bytes_written;
+	struct win_dev *pdev;
 
 	if (!old_valid_dev(device_number))
 		return -EINVAL;
 
-	xid = GetXid();
-
 	cifs_sb = CIFS_SB(inode->i_sb);
-	pTcon = cifs_sb->tcon;
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+
+	pTcon = tlink_tcon(tlink);
+
+	xid = GetXid();
 
 	full_path = build_path_from_dentry(direntry);
-	if (full_path == NULL)
+	if (full_path == NULL) {
 		rc = -ENOMEM;
-	else if (pTcon->unix_ext) {
+		goto mknod_out;
+	}
+
+	if (pTcon->unix_ext) {
 		struct cifs_unix_set_info_args args = {
 			.mode	= mode & ~current_umask(),
 			.ctime	= NO_CHANGE_64,
@@ -533,88 +415,80 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
 					    cifs_sb->local_nls,
 					    cifs_sb->mnt_cifs_flags &
 						CIFS_MOUNT_MAP_SPECIAL_CHR);
+		if (rc)
+			goto mknod_out;
 
-		if (!rc) {
-			rc = cifs_get_inode_info_unix(&newinode, full_path,
+		rc = cifs_get_inode_info_unix(&newinode, full_path,
 						inode->i_sb, xid);
-			if (pTcon->nocase)
-				direntry->d_op = &cifs_ci_dentry_ops;
-			else
-				direntry->d_op = &cifs_dentry_ops;
-			if (rc == 0)
-				d_instantiate(direntry, newinode);
-		}
-	} else {
-		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
-			int oplock = 0;
-			u16 fileHandle;
-			FILE_ALL_INFO *buf;
+		if (pTcon->nocase)
+			direntry->d_op = &cifs_ci_dentry_ops;
+		else
+			direntry->d_op = &cifs_dentry_ops;
 
-			cFYI(1, "sfu compat create special file");
+		if (rc == 0)
+			d_instantiate(direntry, newinode);
+		goto mknod_out;
+	}
 
-			buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
-			if (buf == NULL) {
-				kfree(full_path);
-				rc = -ENOMEM;
-				FreeXid(xid);
-				return rc;
-			}
+	if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL))
+		goto mknod_out;
 
-			rc = CIFSSMBOpen(xid, pTcon, full_path,
-					 FILE_CREATE, /* fail if exists */
-					 GENERIC_WRITE /* BB would
-					  WRITE_OWNER | WRITE_DAC be better? */,
-					 /* Create a file and set the
-					    file attribute to SYSTEM */
-					 CREATE_NOT_DIR | CREATE_OPTION_SPECIAL,
-					 &fileHandle, &oplock, buf,
-					 cifs_sb->local_nls,
-					 cifs_sb->mnt_cifs_flags &
-					    CIFS_MOUNT_MAP_SPECIAL_CHR);
-
-			/* BB FIXME - add handling for backlevel servers
-			   which need legacy open and check for all
-			   calls to SMBOpen for fallback to SMBLeagcyOpen */
-			if (!rc) {
-				/* BB Do not bother to decode buf since no
-				   local inode yet to put timestamps in,
-				   but we can reuse it safely */
-				unsigned int bytes_written;
-				struct win_dev *pdev;
-				pdev = (struct win_dev *)buf;
-				if (S_ISCHR(mode)) {
-					memcpy(pdev->type, "IntxCHR", 8);
-					pdev->major =
-					      cpu_to_le64(MAJOR(device_number));
-					pdev->minor =
-					      cpu_to_le64(MINOR(device_number));
-					rc = CIFSSMBWrite(xid, pTcon,
-						fileHandle,
-						sizeof(struct win_dev),
-						0, &bytes_written, (char *)pdev,
-						NULL, 0);
-				} else if (S_ISBLK(mode)) {
-					memcpy(pdev->type, "IntxBLK", 8);
-					pdev->major =
-					      cpu_to_le64(MAJOR(device_number));
-					pdev->minor =
-					      cpu_to_le64(MINOR(device_number));
-					rc = CIFSSMBWrite(xid, pTcon,
-						fileHandle,
-						sizeof(struct win_dev),
-						0, &bytes_written, (char *)pdev,
-						NULL, 0);
-				} /* else if(S_ISFIFO */
-				CIFSSMBClose(xid, pTcon, fileHandle);
-				d_drop(direntry);
-			}
-			kfree(buf);
-			/* add code here to set EAs */
-		}
+
+	cFYI(1, "sfu compat create special file");
+
+	buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
+	if (buf == NULL) {
+		kfree(full_path);
+		rc = -ENOMEM;
+		FreeXid(xid);
+		return rc;
 	}
 
+	/* FIXME: would WRITE_OWNER | WRITE_DAC be better? */
+	rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_CREATE,
+			 GENERIC_WRITE, CREATE_NOT_DIR | CREATE_OPTION_SPECIAL,
+			 &fileHandle, &oplock, buf, cifs_sb->local_nls,
+			 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+	if (rc)
+		goto mknod_out;
+
+	/* BB Do not bother to decode buf since no local inode yet to put
+	 * timestamps in, but we can reuse it safely */
+
+	pdev = (struct win_dev *)buf;
+	if (S_ISCHR(mode)) {
+		memcpy(pdev->type, "IntxCHR", 8);
+		pdev->major =
+		      cpu_to_le64(MAJOR(device_number));
+		pdev->minor =
+		      cpu_to_le64(MINOR(device_number));
+		rc = CIFSSMBWrite(xid, pTcon,
+			fileHandle,
+			sizeof(struct win_dev),
+			0, &bytes_written, (char *)pdev,
+			NULL, 0);
+	} else if (S_ISBLK(mode)) {
+		memcpy(pdev->type, "IntxBLK", 8);
+		pdev->major =
+		      cpu_to_le64(MAJOR(device_number));
+		pdev->minor =
+		      cpu_to_le64(MINOR(device_number));
+		rc = CIFSSMBWrite(xid, pTcon,
+			fileHandle,
+			sizeof(struct win_dev),
+			0, &bytes_written, (char *)pdev,
+			NULL, 0);
+	} /* else if (S_ISFIFO) */
+	CIFSSMBClose(xid, pTcon, fileHandle);
+	d_drop(direntry);
+
+	/* FIXME: add code here to set EAs */
+
+mknod_out:
 	kfree(full_path);
+	kfree(buf);
 	FreeXid(xid);
+	cifs_put_tlink(tlink);
 	return rc;
 }
 
@@ -628,6 +502,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
 	__u16 fileHandle = 0;
 	bool posix_open = false;
 	struct cifs_sb_info *cifs_sb;
+	struct tcon_link *tlink;
 	struct cifsTconInfo *pTcon;
 	struct cifsFileInfo *cfile;
 	struct inode *newInode = NULL;
@@ -642,7 +517,12 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
 	/* check whether path exists */
 
 	cifs_sb = CIFS_SB(parent_dir_inode->i_sb);
-	pTcon = cifs_sb->tcon;
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink)) {
+		FreeXid(xid);
+		return (struct dentry *)tlink;
+	}
+	pTcon = tlink_tcon(tlink);
 
 	/*
 	 * Don't allow the separator character in a path component.
@@ -653,8 +533,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
 		for (i = 0; i < direntry->d_name.len; i++)
 			if (direntry->d_name.name[i] == '\\') {
 				cFYI(1, "Invalid file name");
-				FreeXid(xid);
-				return ERR_PTR(-EINVAL);
+				rc = -EINVAL;
+				goto lookup_out;
 			}
 	}
 
@@ -664,7 +544,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
 	 */
 	if (nd && (nd->flags & LOOKUP_EXCL)) {
 		d_instantiate(direntry, NULL);
-		return NULL;
+		rc = 0;
+		goto lookup_out;
 	}
 
 	/* can not grab the rename sem here since it would
@@ -672,8 +553,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
 	in which we already have the sb rename sem */
 	full_path = build_path_from_dentry(direntry);
 	if (full_path == NULL) {
-		FreeXid(xid);
-		return ERR_PTR(-ENOMEM);
+		rc = -ENOMEM;
+		goto lookup_out;
 	}
 
 	if (direntry->d_inode != NULL) {
@@ -696,11 +577,11 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
 	if (pTcon->unix_ext) {
 		if (nd && !(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY)) &&
 		     (nd->flags & LOOKUP_OPEN) && !pTcon->broken_posix_open &&
-		     (nd->intent.open.flags & O_CREAT)) {
+		     (nd->intent.open.file->f_flags & O_CREAT)) {
 			rc = cifs_posix_open(full_path, &newInode,
 					parent_dir_inode->i_sb,
 					nd->intent.open.create_mode,
-					nd->intent.open.flags, &oplock,
+					nd->intent.open.file->f_flags, &oplock,
 					&fileHandle, xid);
 			/*
 			 * The check below works around a bug in POSIX
@@ -736,9 +617,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
 				goto lookup_out;
 			}
 
-			cfile = cifs_new_fileinfo(newInode, fileHandle, filp,
-						  nd->path.mnt,
-						  nd->intent.open.flags);
+			cfile = cifs_new_fileinfo(fileHandle, filp, tlink,
+						  oplock);
 			if (cfile == NULL) {
 				fput(filp);
 				CIFSSMBClose(xid, pTcon, fileHandle);
@@ -768,6 +648,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
 
 lookup_out:
 	kfree(full_path);
+	cifs_put_tlink(tlink);
 	FreeXid(xid);
 	return ERR_PTR(rc);
 }
diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c
index 853a968e82d..0eb87026cad 100644
--- a/fs/cifs/dns_resolve.c
+++ b/fs/cifs/dns_resolve.c
@@ -4,6 +4,8 @@
  *   Copyright (c) 2007 Igor Mammedov
  *   Author(s): Igor Mammedov (niallain@gmail.com)
  *              Steve French (sfrench@us.ibm.com)
+ *              Wang Lei (wang840925@gmail.com)
+ *		David Howells (dhowells@redhat.com)
  *
  *   Contains the CIFS DFS upcall routines used for hostname to
  *   IP address translation.
@@ -24,214 +26,73 @@
  */
 
 #include <linux/slab.h>
-#include <linux/keyctl.h>
-#include <linux/key-type.h>
-#include <keys/user-type.h>
+#include <linux/dns_resolver.h>
 #include "dns_resolve.h"
 #include "cifsglob.h"
 #include "cifsproto.h"
 #include "cifs_debug.h"
 
-static const struct cred *dns_resolver_cache;
-
-/* Checks if supplied name is IP address
- * returns:
- * 		1 - name is IP
- * 		0 - name is not IP
- */
-static int
-is_ip(char *name)
-{
-	struct sockaddr_storage ss;
-
-	return cifs_convert_address(name, &ss);
-}
-
-static int
-dns_resolver_instantiate(struct key *key, const void *data,
-		size_t datalen)
-{
-	int rc = 0;
-	char *ip;
-
-	ip = kmalloc(datalen + 1, GFP_KERNEL);
-	if (!ip)
-		return -ENOMEM;
-
-	memcpy(ip, data, datalen);
-	ip[datalen] = '\0';
-
-	/* make sure this looks like an address */
-	if (!is_ip(ip)) {
-		kfree(ip);
-		return -EINVAL;
-	}
-
-	key->type_data.x[0] = datalen;
-	key->payload.data = ip;
-
-	return rc;
-}
-
-static void
-dns_resolver_destroy(struct key *key)
-{
-	kfree(key->payload.data);
-}
-
-struct key_type key_type_dns_resolver = {
-	.name        = "dns_resolver",
-	.def_datalen = sizeof(struct in_addr),
-	.describe    = user_describe,
-	.instantiate = dns_resolver_instantiate,
-	.destroy     = dns_resolver_destroy,
-	.match       = user_match,
-};
-
-/* Resolves server name to ip address.
- * input:
- * 	unc - server UNC
- * output:
- * 	*ip_addr - pointer to server ip, caller responcible for freeing it.
- * return 0 on success
+/**
+ * dns_resolve_server_name_to_ip - Resolve UNC server name to ip address.
+ * @unc: UNC path specifying the server
+ * @ip_addr: Where to return the IP address.
+ *
+ * The IP address will be returned in string form, and the caller is
+ * responsible for freeing it.
+ *
+ * Returns length of result on success, -ve on error.
  */
 int
 dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
 {
-	const struct cred *saved_cred;
-	int rc = -EAGAIN;
-	struct key *rkey = ERR_PTR(-EAGAIN);
+	struct sockaddr_storage ss;
+	const char *hostname, *sep;
 	char *name;
-	char *data = NULL;
-	int len;
+	int len, rc;
 
 	if (!ip_addr || !unc)
 		return -EINVAL;
 
-	/* search for server name delimiter */
 	len = strlen(unc);
 	if (len < 3) {
 		cFYI(1, "%s: unc is too short: %s", __func__, unc);
 		return -EINVAL;
 	}
-	len -= 2;
-	name = memchr(unc+2, '\\', len);
-	if (!name) {
-		cFYI(1, "%s: probably server name is whole unc: %s",
-					__func__, unc);
-	} else {
-		len = (name - unc) - 2/* leading // */;
-	}
-
-	name = kmalloc(len+1, GFP_KERNEL);
-	if (!name) {
-		rc = -ENOMEM;
-		return rc;
-	}
-	memcpy(name, unc+2, len);
-	name[len] = 0;
-
-	if (is_ip(name)) {
-		cFYI(1, "%s: it is IP, skipping dns upcall: %s",
-					__func__, name);
-		data = name;
-		goto skip_upcall;
-	}
 
-	saved_cred = override_creds(dns_resolver_cache);
-	rkey = request_key(&key_type_dns_resolver, name, "");
-	revert_creds(saved_cred);
-	if (!IS_ERR(rkey)) {
-		if (!(rkey->perm & KEY_USR_VIEW)) {
-			down_read(&rkey->sem);
-			rkey->perm |= KEY_USR_VIEW;
-			up_read(&rkey->sem);
-		}
-		len = rkey->type_data.x[0];
-		data = rkey->payload.data;
-	} else {
-		cERROR(1, "%s: unable to resolve: %s", __func__, name);
-		goto out;
-	}
-
-skip_upcall:
-	if (data) {
-		*ip_addr = kmalloc(len + 1, GFP_KERNEL);
-		if (*ip_addr) {
-			memcpy(*ip_addr, data, len + 1);
-			if (!IS_ERR(rkey))
-				cFYI(1, "%s: resolved: %s to %s", __func__,
-							name,
-							*ip_addr
-					);
-			rc = 0;
-		} else {
-			rc = -ENOMEM;
-		}
-		if (!IS_ERR(rkey))
-			key_put(rkey);
-	}
+	/* Discount leading slashes for cifs */
+	len -= 2;
+	hostname = unc + 2;
 
-out:
-	kfree(name);
+	/* Search for server name delimiter */
+	sep = memchr(hostname, '\\', len);
+	if (sep)
+		len = sep - unc;
+	else
+		cFYI(1, "%s: probably server name is whole unc: %s",
+		     __func__, unc);
+
+	/* Try to interpret hostname as an IPv4 or IPv6 address */
+	rc = cifs_convert_address((struct sockaddr *)&ss, hostname, len);
+	if (rc > 0)
+		goto name_is_IP_address;
+
+	/* Perform the upcall */
+	rc = dns_query(NULL, hostname, len, NULL, ip_addr, NULL);
+	if (rc < 0)
+		cERROR(1, "%s: unable to resolve: %*.*s",
+		       __func__, len, len, hostname);
+	else
+		cFYI(1, "%s: resolved: %*.*s to %s",
+		     __func__, len, len, hostname, *ip_addr);
 	return rc;
-}
 
-int __init cifs_init_dns_resolver(void)
-{
-	struct cred *cred;
-	struct key *keyring;
-	int ret;
-
-	printk(KERN_NOTICE "Registering the %s key type\n",
-	       key_type_dns_resolver.name);
-
-	/* create an override credential set with a special thread keyring in
-	 * which DNS requests are cached
-	 *
-	 * this is used to prevent malicious redirections from being installed
-	 * with add_key().
-	 */
-	cred = prepare_kernel_cred(NULL);
-	if (!cred)
+name_is_IP_address:
+	name = kmalloc(len + 1, GFP_KERNEL);
+	if (!name)
 		return -ENOMEM;
-
-	keyring = key_alloc(&key_type_keyring, ".dns_resolver", 0, 0, cred,
-			    (KEY_POS_ALL & ~KEY_POS_SETATTR) |
-			    KEY_USR_VIEW | KEY_USR_READ,
-			    KEY_ALLOC_NOT_IN_QUOTA);
-	if (IS_ERR(keyring)) {
-		ret = PTR_ERR(keyring);
-		goto failed_put_cred;
-	}
-
-	ret = key_instantiate_and_link(keyring, NULL, 0, NULL, NULL);
-	if (ret < 0)
-		goto failed_put_key;
-
-	ret = register_key_type(&key_type_dns_resolver);
-	if (ret < 0)
-		goto failed_put_key;
-
-	/* instruct request_key() to use this special keyring as a cache for
-	 * the results it looks up */
-	cred->thread_keyring = keyring;
-	cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
-	dns_resolver_cache = cred;
+	memcpy(name, hostname, len);
+	name[len] = 0;
+	cFYI(1, "%s: unc is IP, skipping dns upcall: %s", __func__, name);
+	*ip_addr = name;
 	return 0;
-
-failed_put_key:
-	key_put(keyring);
-failed_put_cred:
-	put_cred(cred);
-	return ret;
-}
-
-void cifs_exit_dns_resolver(void)
-{
-	key_revoke(dns_resolver_cache->thread_keyring);
-	unregister_key_type(&key_type_dns_resolver);
-	put_cred(dns_resolver_cache);
-	printk(KERN_NOTICE "Unregistered %s key type\n",
-	       key_type_dns_resolver.name);
 }
diff --git a/fs/cifs/dns_resolve.h b/fs/cifs/dns_resolve.h
index 5d7f291df16..d3f5d27f4d0 100644
--- a/fs/cifs/dns_resolve.h
+++ b/fs/cifs/dns_resolve.h
@@ -24,8 +24,6 @@
 #define _DNS_RESOLVE_H
 
 #ifdef __KERNEL__
-extern int __init cifs_init_dns_resolver(void);
-extern void cifs_exit_dns_resolver(void);
 extern int dns_resolve_server_name_to_ip(const char *unc, char **ip_addr);
 #endif /* KERNEL */
 
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 409e4f523e6..ae82159cf7f 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -40,6 +40,7 @@
 #include "cifs_unicode.h"
 #include "cifs_debug.h"
 #include "cifs_fs_sb.h"
+#include "fscache.h"
 
 static inline int cifs_convert_flags(unsigned int flags)
 {
@@ -59,34 +60,32 @@ static inline int cifs_convert_flags(unsigned int flags)
 		FILE_READ_DATA);
 }
 
-static inline fmode_t cifs_posix_convert_flags(unsigned int flags)
+static u32 cifs_posix_convert_flags(unsigned int flags)
 {
-	fmode_t posix_flags = 0;
+	u32 posix_flags = 0;
 
 	if ((flags & O_ACCMODE) == O_RDONLY)
-		posix_flags = FMODE_READ;
+		posix_flags = SMB_O_RDONLY;
 	else if ((flags & O_ACCMODE) == O_WRONLY)
-		posix_flags = FMODE_WRITE;
-	else if ((flags & O_ACCMODE) == O_RDWR) {
-		/* GENERIC_ALL is too much permission to request
-		   can cause unnecessary access denied on create */
-		/* return GENERIC_ALL; */
-		posix_flags = FMODE_READ | FMODE_WRITE;
-	}
-	/* can not map O_CREAT or O_EXCL or O_TRUNC flags when
-	   reopening a file.  They had their effect on the original open */
-	if (flags & O_APPEND)
-		posix_flags |= (fmode_t)O_APPEND;
+		posix_flags = SMB_O_WRONLY;
+	else if ((flags & O_ACCMODE) == O_RDWR)
+		posix_flags = SMB_O_RDWR;
+
+	if (flags & O_CREAT)
+		posix_flags |= SMB_O_CREAT;
+	if (flags & O_EXCL)
+		posix_flags |= SMB_O_EXCL;
+	if (flags & O_TRUNC)
+		posix_flags |= SMB_O_TRUNC;
+	/* be safe and imply O_SYNC for O_DSYNC */
 	if (flags & O_DSYNC)
-		posix_flags |= (fmode_t)O_DSYNC;
-	if (flags & __O_SYNC)
-		posix_flags |= (fmode_t)__O_SYNC;
+		posix_flags |= SMB_O_SYNC;
 	if (flags & O_DIRECTORY)
-		posix_flags |= (fmode_t)O_DIRECTORY;
+		posix_flags |= SMB_O_DIRECTORY;
 	if (flags & O_NOFOLLOW)
-		posix_flags |= (fmode_t)O_NOFOLLOW;
+		posix_flags |= SMB_O_NOFOLLOW;
 	if (flags & O_DIRECT)
-		posix_flags |= (fmode_t)O_DIRECT;
+		posix_flags |= SMB_O_DIRECT;
 
 	return posix_flags;
 }
@@ -105,66 +104,8 @@ static inline int cifs_get_disposition(unsigned int flags)
 		return FILE_OPEN;
 }
 
-/* all arguments to this function must be checked for validity in caller */
-static inline int
-cifs_posix_open_inode_helper(struct inode *inode, struct file *file,
-			     struct cifsInodeInfo *pCifsInode, __u32 oplock,
-			     u16 netfid)
-{
-
-	write_lock(&GlobalSMBSeslock);
-
-	pCifsInode = CIFS_I(file->f_path.dentry->d_inode);
-	if (pCifsInode == NULL) {
-		write_unlock(&GlobalSMBSeslock);
-		return -EINVAL;
-	}
-
-	if (pCifsInode->clientCanCacheRead) {
-		/* we have the inode open somewhere else
-		   no need to discard cache data */
-		goto psx_client_can_cache;
-	}
-
-	/* BB FIXME need to fix this check to move it earlier into posix_open
-	   BB  fIX following section BB FIXME */
-
-	/* if not oplocked, invalidate inode pages if mtime or file
-	   size changed */
-/*	temp = cifs_NTtimeToUnix(le64_to_cpu(buf->LastWriteTime));
-	if (timespec_equal(&file->f_path.dentry->d_inode->i_mtime, &temp) &&
-			   (file->f_path.dentry->d_inode->i_size ==
-			    (loff_t)le64_to_cpu(buf->EndOfFile))) {
-		cFYI(1, "inode unchanged on server");
-	} else {
-		if (file->f_path.dentry->d_inode->i_mapping) {
-			rc = filemap_write_and_wait(file->f_path.dentry->d_inode->i_mapping);
-			if (rc != 0)
-				CIFS_I(file->f_path.dentry->d_inode)->write_behind_rc = rc;
-		}
-		cFYI(1, "invalidating remote inode since open detected it "
-			 "changed");
-		invalidate_remote_inode(file->f_path.dentry->d_inode);
-	} */
-
-psx_client_can_cache:
-	if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
-		pCifsInode->clientCanCacheAll = true;
-		pCifsInode->clientCanCacheRead = true;
-		cFYI(1, "Exclusive Oplock granted on inode %p",
-			 file->f_path.dentry->d_inode);
-	} else if ((oplock & 0xF) == OPLOCK_READ)
-		pCifsInode->clientCanCacheRead = true;
-
-	/* will have to change the unlock if we reenable the
-	   filemap_fdatawrite (which does not seem necessary */
-	write_unlock(&GlobalSMBSeslock);
-	return 0;
-}
-
-/* all arguments to this function must be checked for validity in caller */
 static inline int cifs_open_inode_helper(struct inode *inode,
-	struct cifsTconInfo *pTcon, int *oplock, FILE_ALL_INFO *buf,
+	struct cifsTconInfo *pTcon, __u32 oplock, FILE_ALL_INFO *buf,
 	char *full_path, int xid)
 {
 	struct cifsInodeInfo *pCifsInode = CIFS_I(inode);
@@ -190,8 +131,7 @@ static inline int cifs_open_inode_helper(struct inode *inode,
 			/* BB no need to lock inode until after invalidate
 			since namei code should already have it locked? */
 			rc = filemap_write_and_wait(inode->i_mapping);
-			if (rc != 0)
-				pCifsInode->write_behind_rc = rc;
+			mapping_set_error(inode->i_mapping, rc);
 		}
 		cFYI(1, "invalidating remote inode since open detected it "
 			 "changed");
@@ -206,16 +146,176 @@ client_can_cache:
 		rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb,
 					 xid, NULL);
 
-	if ((*oplock & 0xF) == OPLOCK_EXCLUSIVE) {
+	if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
 		pCifsInode->clientCanCacheAll = true;
 		pCifsInode->clientCanCacheRead = true;
 		cFYI(1, "Exclusive Oplock granted on inode %p", inode);
-	} else if ((*oplock & 0xF) == OPLOCK_READ)
+	} else if ((oplock & 0xF) == OPLOCK_READ)
 		pCifsInode->clientCanCacheRead = true;
 
 	return rc;
 }
 
+int cifs_posix_open(char *full_path, struct inode **pinode,
+			struct super_block *sb, int mode, unsigned int f_flags,
+			__u32 *poplock, __u16 *pnetfid, int xid)
+{
+	int rc;
+	FILE_UNIX_BASIC_INFO *presp_data;
+	__u32 posix_flags = 0;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+	struct cifs_fattr fattr;
+	struct tcon_link *tlink;
+	struct cifsTconInfo *tcon;
+
+	cFYI(1, "posix open %s", full_path);
+
+	presp_data = kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL);
+	if (presp_data == NULL)
+		return -ENOMEM;
+
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink)) {
+		rc = PTR_ERR(tlink);
+		goto posix_open_ret;
+	}
+
+	tcon = tlink_tcon(tlink);
+	mode &= ~current_umask();
+
+	posix_flags = cifs_posix_convert_flags(f_flags);
+	rc = CIFSPOSIXCreate(xid, tcon, posix_flags, mode, pnetfid, presp_data,
+			     poplock, full_path, cifs_sb->local_nls,
+			     cifs_sb->mnt_cifs_flags &
+					CIFS_MOUNT_MAP_SPECIAL_CHR);
+	cifs_put_tlink(tlink);
+
+	if (rc)
+		goto posix_open_ret;
+
+	if (presp_data->Type == cpu_to_le32(-1))
+		goto posix_open_ret; /* open ok, caller does qpathinfo */
+
+	if (!pinode)
+		goto posix_open_ret; /* caller does not need info */
+
+	cifs_unix_basic_to_fattr(&fattr, presp_data, cifs_sb);
+
+	/* get new inode and set it up */
+	if (*pinode == NULL) {
+		cifs_fill_uniqueid(sb, &fattr);
+		*pinode = cifs_iget(sb, &fattr);
+		if (!*pinode) {
+			rc = -ENOMEM;
+			goto posix_open_ret;
+		}
+	} else {
+		cifs_fattr_to_inode(*pinode, &fattr);
+	}
+
+posix_open_ret:
+	kfree(presp_data);
+	return rc;
+}
+
+struct cifsFileInfo *
+cifs_new_fileinfo(__u16 fileHandle, struct file *file,
+		  struct tcon_link *tlink, __u32 oplock)
+{
+	struct dentry *dentry = file->f_path.dentry;
+	struct inode *inode = dentry->d_inode;
+	struct cifsInodeInfo *pCifsInode = CIFS_I(inode);
+	struct cifsFileInfo *pCifsFile;
+
+	pCifsFile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
+	if (pCifsFile == NULL)
+		return pCifsFile;
+
+	pCifsFile->count = 1;
+	pCifsFile->netfid = fileHandle;
+	pCifsFile->pid = current->tgid;
+	pCifsFile->uid = current_fsuid();
+	pCifsFile->dentry = dget(dentry);
+	pCifsFile->f_flags = file->f_flags;
+	pCifsFile->invalidHandle = false;
+	pCifsFile->tlink = cifs_get_tlink(tlink);
+	mutex_init(&pCifsFile->fh_mutex);
+	mutex_init(&pCifsFile->lock_mutex);
+	INIT_LIST_HEAD(&pCifsFile->llist);
+	INIT_WORK(&pCifsFile->oplock_break, cifs_oplock_break);
+
+	spin_lock(&cifs_file_list_lock);
+	list_add(&pCifsFile->tlist, &(tlink_tcon(tlink)->openFileList));
+	/* if readable file instance put first in list*/
+	if (file->f_mode & FMODE_READ)
+		list_add(&pCifsFile->flist, &pCifsInode->openFileList);
+	else
+		list_add_tail(&pCifsFile->flist, &pCifsInode->openFileList);
+	spin_unlock(&cifs_file_list_lock);
+
+	if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
+		pCifsInode->clientCanCacheAll = true;
+		pCifsInode->clientCanCacheRead = true;
+		cFYI(1, "Exclusive Oplock inode %p", inode);
+	} else if ((oplock & 0xF) == OPLOCK_READ)
+		pCifsInode->clientCanCacheRead = true;
+
+	file->private_data = pCifsFile;
+	return pCifsFile;
+}
+
+/*
+ * Release a reference on the file private data. This may involve closing
+ * the filehandle out on the server. Must be called without holding
+ * cifs_file_list_lock.
+ */
+void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
+{
+	struct cifsTconInfo *tcon = tlink_tcon(cifs_file->tlink);
+	struct cifsInodeInfo *cifsi = CIFS_I(cifs_file->dentry->d_inode);
+	struct cifsLockInfo *li, *tmp;
+
+	spin_lock(&cifs_file_list_lock);
+	if (--cifs_file->count > 0) {
+		spin_unlock(&cifs_file_list_lock);
+		return;
+	}
+
+	/* remove it from the lists */
+	list_del(&cifs_file->flist);
+	list_del(&cifs_file->tlist);
+
+	if (list_empty(&cifsi->openFileList)) {
+		cFYI(1, "closing last open instance for inode %p",
+			cifs_file->dentry->d_inode);
+		cifsi->clientCanCacheRead = false;
+		cifsi->clientCanCacheAll  = false;
+	}
+	spin_unlock(&cifs_file_list_lock);
+
+	if (!tcon->need_reconnect && !cifs_file->invalidHandle) {
+		int xid, rc;
+
+		xid = GetXid();
+		rc = CIFSSMBClose(xid, tcon, cifs_file->netfid);
+		FreeXid(xid);
+	}
+
+	/* Delete any outstanding lock records. We'll lose them when the file
+	 * is closed anyway.
+	 */
+	mutex_lock(&cifs_file->lock_mutex);
+	list_for_each_entry_safe(li, tmp, &cifs_file->llist, llist) {
+		list_del(&li->llist);
+		kfree(li);
+	}
+	mutex_unlock(&cifs_file->lock_mutex);
+
+	cifs_put_tlink(cifs_file->tlink);
+	dput(cifs_file->dentry);
+	kfree(cifs_file);
+}
+
 int cifs_open(struct inode *inode, struct file *file)
 {
 	int rc = -EACCES;
@@ -223,6 +323,7 @@ int cifs_open(struct inode *inode, struct file *file)
 	__u32 oplock;
 	struct cifs_sb_info *cifs_sb;
 	struct cifsTconInfo *tcon;
+	struct tcon_link *tlink;
 	struct cifsFileInfo *pCifsFile = NULL;
 	struct cifsInodeInfo *pCifsInode;
 	char *full_path = NULL;
@@ -234,15 +335,19 @@ int cifs_open(struct inode *inode, struct file *file)
 	xid = GetXid();
 
 	cifs_sb = CIFS_SB(inode->i_sb);
-	tcon = cifs_sb->tcon;
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink)) {
+		FreeXid(xid);
+		return PTR_ERR(tlink);
+	}
+	tcon = tlink_tcon(tlink);
 
 	pCifsInode = CIFS_I(file->f_path.dentry->d_inode);
 
 	full_path = build_path_from_dentry(file->f_path.dentry);
 	if (full_path == NULL) {
 		rc = -ENOMEM;
-		FreeXid(xid);
-		return rc;
+		goto out;
 	}
 
 	cFYI(1, "inode = 0x%p file flags are 0x%x for %s",
@@ -257,31 +362,22 @@ int cifs_open(struct inode *inode, struct file *file)
 	    (tcon->ses->capabilities & CAP_UNIX) &&
 	    (CIFS_UNIX_POSIX_PATH_OPS_CAP &
 			le64_to_cpu(tcon->fsUnixInfo.Capability))) {
-		int oflags = (int) cifs_posix_convert_flags(file->f_flags);
-		oflags |= SMB_O_CREAT;
 		/* can not refresh inode info since size could be stale */
 		rc = cifs_posix_open(full_path, &inode, inode->i_sb,
 				cifs_sb->mnt_file_mode /* ignored */,
-				oflags, &oplock, &netfid, xid);
+				file->f_flags, &oplock, &netfid, xid);
 		if (rc == 0) {
 			cFYI(1, "posix open succeeded");
-			/* no need for special case handling of setting mode
-			   on read only files needed here */
 
-			rc = cifs_posix_open_inode_helper(inode, file,
-					pCifsInode, oplock, netfid);
-			if (rc != 0) {
-				CIFSSMBClose(xid, tcon, netfid);
-				goto out;
-			}
-
-			pCifsFile = cifs_new_fileinfo(inode, netfid, file,
-							file->f_path.mnt,
-							oflags);
+			pCifsFile = cifs_new_fileinfo(netfid, file, tlink,
+						      oplock);
 			if (pCifsFile == NULL) {
 				CIFSSMBClose(xid, tcon, netfid);
 				rc = -ENOMEM;
 			}
+
+			cifs_fscache_set_inode_cookie(inode, file);
+
 			goto out;
 		} else if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
 			if (tcon->ses->serverNOS)
@@ -342,7 +438,7 @@ int cifs_open(struct inode *inode, struct file *file)
 		goto out;
 	}
 
-	if (cifs_sb->tcon->ses->capabilities & CAP_NT_SMBS)
+	if (tcon->ses->capabilities & CAP_NT_SMBS)
 		rc = CIFSSMBOpen(xid, tcon, full_path, disposition,
 			 desiredAccess, CREATE_NOT_DIR, &netfid, &oplock, buf,
 			 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags
@@ -362,17 +458,18 @@ int cifs_open(struct inode *inode, struct file *file)
 		goto out;
 	}
 
-	rc = cifs_open_inode_helper(inode, tcon, &oplock, buf, full_path, xid);
+	rc = cifs_open_inode_helper(inode, tcon, oplock, buf, full_path, xid);
 	if (rc != 0)
 		goto out;
 
-	pCifsFile = cifs_new_fileinfo(inode, netfid, file, file->f_path.mnt,
-					file->f_flags);
+	pCifsFile = cifs_new_fileinfo(netfid, file, tlink, oplock);
 	if (pCifsFile == NULL) {
 		rc = -ENOMEM;
 		goto out;
 	}
 
+	cifs_fscache_set_inode_cookie(inode, file);
+
 	if (oplock & CIFS_CREATE_ACTION) {
 		/* time to set mode which we can not set earlier due to
 		   problems creating new read-only files */
@@ -397,6 +494,7 @@ out:
 	kfree(buf);
 	kfree(full_path);
 	FreeXid(xid);
+	cifs_put_tlink(tlink);
 	return rc;
 }
 
@@ -411,14 +509,13 @@ static int cifs_relock_file(struct cifsFileInfo *cifsFile)
 	return rc;
 }
 
-static int cifs_reopen_file(struct file *file, bool can_flush)
+static int cifs_reopen_file(struct cifsFileInfo *pCifsFile, bool can_flush)
 {
 	int rc = -EACCES;
 	int xid;
 	__u32 oplock;
 	struct cifs_sb_info *cifs_sb;
 	struct cifsTconInfo *tcon;
-	struct cifsFileInfo *pCifsFile;
 	struct cifsInodeInfo *pCifsInode;
 	struct inode *inode;
 	char *full_path = NULL;
@@ -426,11 +523,6 @@ static int cifs_reopen_file(struct file *file, bool can_flush)
 	int disposition = FILE_OPEN;
 	__u16 netfid;
 
-	if (file->private_data)
-		pCifsFile = (struct cifsFileInfo *)file->private_data;
-	else
-		return -EBADF;
-
 	xid = GetXid();
 	mutex_lock(&pCifsFile->fh_mutex);
 	if (!pCifsFile->invalidHandle) {
@@ -440,39 +532,24 @@ static int cifs_reopen_file(struct file *file, bool can_flush)
 		return rc;
 	}
 
-	if (file->f_path.dentry == NULL) {
-		cERROR(1, "no valid name if dentry freed");
-		dump_stack();
-		rc = -EBADF;
-		goto reopen_error_exit;
-	}
-
-	inode = file->f_path.dentry->d_inode;
-	if (inode == NULL) {
-		cERROR(1, "inode not valid");
-		dump_stack();
-		rc = -EBADF;
-		goto reopen_error_exit;
-	}
-
+	inode = pCifsFile->dentry->d_inode;
 	cifs_sb = CIFS_SB(inode->i_sb);
-	tcon = cifs_sb->tcon;
+	tcon = tlink_tcon(pCifsFile->tlink);
 
 /* can not grab rename sem here because various ops, including
    those that already have the rename sem can end up causing writepage
    to get called and if the server was down that means we end up here,
    and we can never tell if the caller already has the rename_sem */
-	full_path = build_path_from_dentry(file->f_path.dentry);
+	full_path = build_path_from_dentry(pCifsFile->dentry);
 	if (full_path == NULL) {
 		rc = -ENOMEM;
-reopen_error_exit:
 		mutex_unlock(&pCifsFile->fh_mutex);
 		FreeXid(xid);
 		return rc;
 	}
 
 	cFYI(1, "inode = 0x%p file flags 0x%x for %s",
-		 inode, file->f_flags, full_path);
+		 inode, pCifsFile->f_flags, full_path);
 
 	if (oplockEnabled)
 		oplock = REQ_OPLOCK;
@@ -482,8 +559,14 @@ reopen_error_exit:
 	if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) &&
 	    (CIFS_UNIX_POSIX_PATH_OPS_CAP &
 			le64_to_cpu(tcon->fsUnixInfo.Capability))) {
-		int oflags = (int) cifs_posix_convert_flags(file->f_flags);
-		/* can not refresh inode info since size could be stale */
+
+		/*
+		 * O_CREAT, O_EXCL and O_TRUNC already had their effect on the
+		 * original open. Must mask them off for a reopen.
+		 */
+		unsigned int oflags = pCifsFile->f_flags &
+						~(O_CREAT | O_EXCL | O_TRUNC);
+
 		rc = cifs_posix_open(full_path, NULL, inode->i_sb,
 				cifs_sb->mnt_file_mode /* ignored */,
 				oflags, &oplock, &netfid, xid);
@@ -495,7 +578,7 @@ reopen_error_exit:
 		   in the reconnect path it is important to retry hard */
 	}
 
-	desiredAccess = cifs_convert_flags(file->f_flags);
+	desiredAccess = cifs_convert_flags(pCifsFile->f_flags);
 
 	/* Can not refresh inode by passing in file_info buf to be returned
 	   by SMBOpen and then calling get_inode_info with returned buf
@@ -511,49 +594,49 @@ reopen_error_exit:
 		mutex_unlock(&pCifsFile->fh_mutex);
 		cFYI(1, "cifs_open returned 0x%x", rc);
 		cFYI(1, "oplock: %d", oplock);
-	} else {
+		goto reopen_error_exit;
+	}
+
 reopen_success:
-		pCifsFile->netfid = netfid;
-		pCifsFile->invalidHandle = false;
-		mutex_unlock(&pCifsFile->fh_mutex);
-		pCifsInode = CIFS_I(inode);
-		if (pCifsInode) {
-			if (can_flush) {
-				rc = filemap_write_and_wait(inode->i_mapping);
-				if (rc != 0)
-					CIFS_I(inode)->write_behind_rc = rc;
-			/* temporarily disable caching while we
-			   go to server to get inode info */
-				pCifsInode->clientCanCacheAll = false;
-				pCifsInode->clientCanCacheRead = false;
-				if (tcon->unix_ext)
-					rc = cifs_get_inode_info_unix(&inode,
-						full_path, inode->i_sb, xid);
-				else
-					rc = cifs_get_inode_info(&inode,
-						full_path, NULL, inode->i_sb,
-						xid, NULL);
-			} /* else we are writing out data to server already
-			     and could deadlock if we tried to flush data, and
-			     since we do not know if we have data that would
-			     invalidate the current end of file on the server
-			     we can not go to the server to get the new inod
-			     info */
-			if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
-				pCifsInode->clientCanCacheAll = true;
-				pCifsInode->clientCanCacheRead = true;
-				cFYI(1, "Exclusive Oplock granted on inode %p",
-					 file->f_path.dentry->d_inode);
-			} else if ((oplock & 0xF) == OPLOCK_READ) {
-				pCifsInode->clientCanCacheRead = true;
-				pCifsInode->clientCanCacheAll = false;
-			} else {
-				pCifsInode->clientCanCacheRead = false;
-				pCifsInode->clientCanCacheAll = false;
-			}
-			cifs_relock_file(pCifsFile);
-		}
+	pCifsFile->netfid = netfid;
+	pCifsFile->invalidHandle = false;
+	mutex_unlock(&pCifsFile->fh_mutex);
+	pCifsInode = CIFS_I(inode);
+
+	if (can_flush) {
+		rc = filemap_write_and_wait(inode->i_mapping);
+		mapping_set_error(inode->i_mapping, rc);
+
+		pCifsInode->clientCanCacheAll = false;
+		pCifsInode->clientCanCacheRead = false;
+		if (tcon->unix_ext)
+			rc = cifs_get_inode_info_unix(&inode,
+				full_path, inode->i_sb, xid);
+		else
+			rc = cifs_get_inode_info(&inode,
+				full_path, NULL, inode->i_sb,
+				xid, NULL);
+	} /* else we are writing out data to server already
+	     and could deadlock if we tried to flush data, and
+	     since we do not know if we have data that would
+	     invalidate the current end of file on the server
+	     we can not go to the server to get the new inod
+	     info */
+	if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
+		pCifsInode->clientCanCacheAll = true;
+		pCifsInode->clientCanCacheRead = true;
+		cFYI(1, "Exclusive Oplock granted on inode %p",
+			 pCifsFile->dentry->d_inode);
+	} else if ((oplock & 0xF) == OPLOCK_READ) {
+		pCifsInode->clientCanCacheRead = true;
+		pCifsInode->clientCanCacheAll = false;
+	} else {
+		pCifsInode->clientCanCacheRead = false;
+		pCifsInode->clientCanCacheAll = false;
 	}
+	cifs_relock_file(pCifsFile);
+
+reopen_error_exit:
 	kfree(full_path);
 	FreeXid(xid);
 	return rc;
@@ -561,88 +644,18 @@ reopen_success:
 
 int cifs_close(struct inode *inode, struct file *file)
 {
-	int rc = 0;
-	int xid, timeout;
-	struct cifs_sb_info *cifs_sb;
-	struct cifsTconInfo *pTcon;
-	struct cifsFileInfo *pSMBFile =
-		(struct cifsFileInfo *)file->private_data;
-
-	xid = GetXid();
-
-	cifs_sb = CIFS_SB(inode->i_sb);
-	pTcon = cifs_sb->tcon;
-	if (pSMBFile) {
-		struct cifsLockInfo *li, *tmp;
-		write_lock(&GlobalSMBSeslock);
-		pSMBFile->closePend = true;
-		if (pTcon) {
-			/* no sense reconnecting to close a file that is
-			   already closed */
-			if (!pTcon->need_reconnect) {
-				write_unlock(&GlobalSMBSeslock);
-				timeout = 2;
-				while ((atomic_read(&pSMBFile->count) != 1)
-					&& (timeout <= 2048)) {
-					/* Give write a better chance to get to
-					server ahead of the close.  We do not
-					want to add a wait_q here as it would
-					increase the memory utilization as
-					the struct would be in each open file,
-					but this should give enough time to
-					clear the socket */
-					cFYI(DBG2, "close delay, write pending");
-					msleep(timeout);
-					timeout *= 4;
-				}
-				if (!pTcon->need_reconnect &&
-				    !pSMBFile->invalidHandle)
-					rc = CIFSSMBClose(xid, pTcon,
-						  pSMBFile->netfid);
-			} else
-				write_unlock(&GlobalSMBSeslock);
-		} else
-			write_unlock(&GlobalSMBSeslock);
-
-		/* Delete any outstanding lock records.
-		   We'll lose them when the file is closed anyway. */
-		mutex_lock(&pSMBFile->lock_mutex);
-		list_for_each_entry_safe(li, tmp, &pSMBFile->llist, llist) {
-			list_del(&li->llist);
-			kfree(li);
-		}
-		mutex_unlock(&pSMBFile->lock_mutex);
+	cifsFileInfo_put(file->private_data);
+	file->private_data = NULL;
 
-		write_lock(&GlobalSMBSeslock);
-		list_del(&pSMBFile->flist);
-		list_del(&pSMBFile->tlist);
-		write_unlock(&GlobalSMBSeslock);
-		cifsFileInfo_put(file->private_data);
-		file->private_data = NULL;
-	} else
-		rc = -EBADF;
-
-	read_lock(&GlobalSMBSeslock);
-	if (list_empty(&(CIFS_I(inode)->openFileList))) {
-		cFYI(1, "closing last open instance for inode %p", inode);
-		/* if the file is not open we do not know if we can cache info
-		   on this inode, much less write behind and read ahead */
-		CIFS_I(inode)->clientCanCacheRead = false;
-		CIFS_I(inode)->clientCanCacheAll  = false;
-	}
-	read_unlock(&GlobalSMBSeslock);
-	if ((rc == 0) && CIFS_I(inode)->write_behind_rc)
-		rc = CIFS_I(inode)->write_behind_rc;
-	FreeXid(xid);
-	return rc;
+	/* return code from the ->release op is always ignored */
+	return 0;
 }
 
 int cifs_closedir(struct inode *inode, struct file *file)
 {
 	int rc = 0;
 	int xid;
-	struct cifsFileInfo *pCFileStruct =
-	    (struct cifsFileInfo *)file->private_data;
+	struct cifsFileInfo *pCFileStruct = file->private_data;
 	char *ptmp;
 
 	cFYI(1, "Closedir inode = 0x%p", inode);
@@ -650,25 +663,21 @@ int cifs_closedir(struct inode *inode, struct file *file)
 	xid = GetXid();
 
 	if (pCFileStruct) {
-		struct cifsTconInfo *pTcon;
-		struct cifs_sb_info *cifs_sb =
-			CIFS_SB(file->f_path.dentry->d_sb);
-
-		pTcon = cifs_sb->tcon;
+		struct cifsTconInfo *pTcon = tlink_tcon(pCFileStruct->tlink);
 
 		cFYI(1, "Freeing private data in close dir");
-		write_lock(&GlobalSMBSeslock);
+		spin_lock(&cifs_file_list_lock);
 		if (!pCFileStruct->srch_inf.endOfSearch &&
 		    !pCFileStruct->invalidHandle) {
 			pCFileStruct->invalidHandle = true;
-			write_unlock(&GlobalSMBSeslock);
+			spin_unlock(&cifs_file_list_lock);
 			rc = CIFSFindClose(xid, pTcon, pCFileStruct->netfid);
 			cFYI(1, "Closing uncompleted readdir with rc %d",
 				 rc);
 			/* not much we can do if it fails anyway, ignore rc */
 			rc = 0;
 		} else
-			write_unlock(&GlobalSMBSeslock);
+			spin_unlock(&cifs_file_list_lock);
 		ptmp = pCFileStruct->srch_inf.ntwrk_buf_start;
 		if (ptmp) {
 			cFYI(1, "closedir free smb buf in srch struct");
@@ -678,6 +687,7 @@ int cifs_closedir(struct inode *inode, struct file *file)
 			else
 				cifs_buf_release(ptmp);
 		}
+		cifs_put_tlink(pCFileStruct->tlink);
 		kfree(file->private_data);
 		file->private_data = NULL;
 	}
@@ -764,7 +774,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
 		cFYI(1, "Unknown type of lock");
 
 	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
-	tcon = cifs_sb->tcon;
+	tcon = tlink_tcon(((struct cifsFileInfo *)file->private_data)->tlink);
 
 	if (file->private_data == NULL) {
 		rc = -EBADF;
@@ -863,8 +873,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
 				      length, pfLock,
 				      posix_lock_type, wait_flag);
 	} else {
-		struct cifsFileInfo *fid =
-			(struct cifsFileInfo *)file->private_data;
+		struct cifsFileInfo *fid = file->private_data;
 
 		if (numLock) {
 			rc = CIFSSMBLock(xid, tcon, netfid, length,
@@ -958,14 +967,14 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
 
 	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
 
-	pTcon = cifs_sb->tcon;
-
 	/* cFYI(1, " write %d bytes to offset %lld of %s", write_size,
 	   *poffset, file->f_path.dentry->d_name.name); */
 
 	if (file->private_data == NULL)
 		return -EBADF;
-	open_file = (struct cifsFileInfo *) file->private_data;
+
+	open_file = file->private_data;
+	pTcon = tlink_tcon(open_file->tlink);
 
 	rc = generic_write_checks(file, poffset, &write_size, 0);
 	if (rc)
@@ -986,19 +995,12 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
 			   we blocked so return what we managed to write */
 				return total_written;
 			}
-			if (open_file->closePend) {
-				FreeXid(xid);
-				if (total_written)
-					return total_written;
-				else
-					return -EBADF;
-			}
 			if (open_file->invalidHandle) {
 				/* we could deadlock if we called
 				   filemap_fdatawait from here so tell
 				   reopen_file not to flush data to server
 				   now */
-				rc = cifs_reopen_file(file, false);
+				rc = cifs_reopen_file(open_file, false);
 				if (rc != 0)
 					break;
 			}
@@ -1046,8 +1048,9 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
 	return total_written;
 }
 
-static ssize_t cifs_write(struct file *file, const char *write_data,
-			  size_t write_size, loff_t *poffset)
+static ssize_t cifs_write(struct cifsFileInfo *open_file,
+			  const char *write_data, size_t write_size,
+			  loff_t *poffset)
 {
 	int rc = 0;
 	unsigned int bytes_written = 0;
@@ -1055,19 +1058,15 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
 	struct cifs_sb_info *cifs_sb;
 	struct cifsTconInfo *pTcon;
 	int xid, long_op;
-	struct cifsFileInfo *open_file;
-	struct cifsInodeInfo *cifsi = CIFS_I(file->f_path.dentry->d_inode);
+	struct dentry *dentry = open_file->dentry;
+	struct cifsInodeInfo *cifsi = CIFS_I(dentry->d_inode);
 
-	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
-
-	pTcon = cifs_sb->tcon;
+	cifs_sb = CIFS_SB(dentry->d_sb);
 
 	cFYI(1, "write %zd bytes to offset %lld of %s", write_size,
-	   *poffset, file->f_path.dentry->d_name.name);
+	   *poffset, dentry->d_name.name);
 
-	if (file->private_data == NULL)
-		return -EBADF;
-	open_file = (struct cifsFileInfo *)file->private_data;
+	pTcon = tlink_tcon(open_file->tlink);
 
 	xid = GetXid();
 
@@ -1076,28 +1075,12 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
 	     total_written += bytes_written) {
 		rc = -EAGAIN;
 		while (rc == -EAGAIN) {
-			if (file->private_data == NULL) {
-				/* file has been closed on us */
-				FreeXid(xid);
-			/* if we have gotten here we have written some data
-			   and blocked, and the file has been freed on us
-			   while we blocked so return what we managed to
-			   write */
-				return total_written;
-			}
-			if (open_file->closePend) {
-				FreeXid(xid);
-				if (total_written)
-					return total_written;
-				else
-					return -EBADF;
-			}
 			if (open_file->invalidHandle) {
 				/* we could deadlock if we called
 				   filemap_fdatawait from here so tell
 				   reopen_file not to flush data to
 				   server now */
-				rc = cifs_reopen_file(file, false);
+				rc = cifs_reopen_file(open_file, false);
 				if (rc != 0)
 					break;
 			}
@@ -1144,43 +1127,41 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
 
 	cifs_stats_bytes_written(pTcon, total_written);
 
-	/* since the write may have blocked check these pointers again */
-	if ((file->f_path.dentry) && (file->f_path.dentry->d_inode)) {
-/*BB We could make this contingent on superblock ATIME flag too */
-/*		file->f_path.dentry->d_inode->i_ctime =
-		file->f_path.dentry->d_inode->i_mtime = CURRENT_TIME;*/
-		if (total_written > 0) {
-			spin_lock(&file->f_path.dentry->d_inode->i_lock);
-			if (*poffset > file->f_path.dentry->d_inode->i_size)
-				i_size_write(file->f_path.dentry->d_inode,
-					     *poffset);
-			spin_unlock(&file->f_path.dentry->d_inode->i_lock);
-		}
-		mark_inode_dirty_sync(file->f_path.dentry->d_inode);
+	if (total_written > 0) {
+		spin_lock(&dentry->d_inode->i_lock);
+		if (*poffset > dentry->d_inode->i_size)
+			i_size_write(dentry->d_inode, *poffset);
+		spin_unlock(&dentry->d_inode->i_lock);
 	}
+	mark_inode_dirty_sync(dentry->d_inode);
 	FreeXid(xid);
 	return total_written;
 }
 
 #ifdef CONFIG_CIFS_EXPERIMENTAL
-struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode)
+struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode,
+					bool fsuid_only)
 {
 	struct cifsFileInfo *open_file = NULL;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb);
+
+	/* only filter by fsuid on multiuser mounts */
+	if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER))
+		fsuid_only = false;
 
-	read_lock(&GlobalSMBSeslock);
+	spin_lock(&cifs_file_list_lock);
 	/* we could simply get the first_list_entry since write-only entries
 	   are always at the end of the list but since the first entry might
 	   have a close pending, we go through the whole list */
 	list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
-		if (open_file->closePend)
+		if (fsuid_only && open_file->uid != current_fsuid())
 			continue;
-		if (open_file->pfile && ((open_file->pfile->f_flags & O_RDWR) ||
-		    (open_file->pfile->f_flags & O_RDONLY))) {
+		if (OPEN_FMODE(open_file->f_flags) & FMODE_READ) {
 			if (!open_file->invalidHandle) {
 				/* found a good file */
 				/* lock it so it will not be closed on us */
 				cifsFileInfo_get(open_file);
-				read_unlock(&GlobalSMBSeslock);
+				spin_unlock(&cifs_file_list_lock);
 				return open_file;
 			} /* else might as well continue, and look for
 			     another, or simply have the caller reopen it
@@ -1188,14 +1169,16 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode)
 		} else /* write only file */
 			break; /* write only files are last so must be done */
 	}
-	read_unlock(&GlobalSMBSeslock);
+	spin_unlock(&cifs_file_list_lock);
 	return NULL;
 }
 #endif
 
-struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode)
+struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode,
+					bool fsuid_only)
 {
 	struct cifsFileInfo *open_file;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb);
 	bool any_available = false;
 	int rc;
 
@@ -1209,53 +1192,39 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode)
 		return NULL;
 	}
 
-	read_lock(&GlobalSMBSeslock);
+	/* only filter by fsuid on multiuser mounts */
+	if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER))
+		fsuid_only = false;
+
+	spin_lock(&cifs_file_list_lock);
 refind_writable:
 	list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
-		if (open_file->closePend ||
-		    (!any_available && open_file->pid != current->tgid))
+		if (!any_available && open_file->pid != current->tgid)
 			continue;
-
-		if (open_file->pfile &&
-		    ((open_file->pfile->f_flags & O_RDWR) ||
-		     (open_file->pfile->f_flags & O_WRONLY))) {
+		if (fsuid_only && open_file->uid != current_fsuid())
+			continue;
+		if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) {
 			cifsFileInfo_get(open_file);
 
 			if (!open_file->invalidHandle) {
 				/* found a good writable file */
-				read_unlock(&GlobalSMBSeslock);
+				spin_unlock(&cifs_file_list_lock);
 				return open_file;
 			}
 
-			read_unlock(&GlobalSMBSeslock);
+			spin_unlock(&cifs_file_list_lock);
+
 			/* Had to unlock since following call can block */
-			rc = cifs_reopen_file(open_file->pfile, false);
-			if (!rc) {
-				if (!open_file->closePend)
-					return open_file;
-				else { /* start over in case this was deleted */
-				       /* since the list could be modified */
-					read_lock(&GlobalSMBSeslock);
-					cifsFileInfo_put(open_file);
-					goto refind_writable;
-				}
-			}
+			rc = cifs_reopen_file(open_file, false);
+			if (!rc)
+				return open_file;
 
-			/* if it fails, try another handle if possible -
-			(we can not do this if closePending since
-			loop could be modified - in which case we
-			have to start at the beginning of the list
-			again. Note that it would be bad
-			to hold up writepages here (rather than
-			in caller) with continuous retries */
+			/* if it fails, try another handle if possible */
 			cFYI(1, "wp failed on reopen file");
-			read_lock(&GlobalSMBSeslock);
-			/* can not use this handle, no write
-			   pending on this one after all */
 			cifsFileInfo_put(open_file);
 
-			if (open_file->closePend) /* list could have changed */
-				goto refind_writable;
+			spin_lock(&cifs_file_list_lock);
+
 			/* else we simply continue to the next entry. Thus
 			   we do not loop on reopen errors.  If we
 			   can not reopen the file, for example if we
@@ -1270,7 +1239,7 @@ refind_writable:
 		any_available = true;
 		goto refind_writable;
 	}
-	read_unlock(&GlobalSMBSeslock);
+	spin_unlock(&cifs_file_list_lock);
 	return NULL;
 }
 
@@ -1282,7 +1251,6 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
 	int rc = -EFAULT;
 	int bytes_written = 0;
 	struct cifs_sb_info *cifs_sb;
-	struct cifsTconInfo *pTcon;
 	struct inode *inode;
 	struct cifsFileInfo *open_file;
 
@@ -1291,7 +1259,6 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
 
 	inode = page->mapping->host;
 	cifs_sb = CIFS_SB(inode->i_sb);
-	pTcon = cifs_sb->tcon;
 
 	offset += (loff_t)from;
 	write_data = kmap(page);
@@ -1312,10 +1279,10 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
 	if (mapping->host->i_size - offset < (loff_t)to)
 		to = (unsigned)(mapping->host->i_size - offset);
 
-	open_file = find_writable_file(CIFS_I(mapping->host));
+	open_file = find_writable_file(CIFS_I(mapping->host), false);
 	if (open_file) {
-		bytes_written = cifs_write(open_file->pfile, write_data,
-					   to-from, &offset);
+		bytes_written = cifs_write(open_file, write_data,
+					   to - from, &offset);
 		cifsFileInfo_put(open_file);
 		/* Does mm or vfs already set times? */
 		inode->i_atime = inode->i_mtime = current_fs_time(inode->i_sb);
@@ -1335,7 +1302,6 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
 static int cifs_writepages(struct address_space *mapping,
 			   struct writeback_control *wbc)
 {
-	struct backing_dev_info *bdi = mapping->backing_dev_info;
 	unsigned int bytes_to_write;
 	unsigned int bytes_written;
 	struct cifs_sb_info *cifs_sb;
@@ -1350,6 +1316,7 @@ static int cifs_writepages(struct address_space *mapping,
 	int nr_pages;
 	__u64 offset = 0;
 	struct cifsFileInfo *open_file;
+	struct cifsTconInfo *tcon;
 	struct cifsInodeInfo *cifsi = CIFS_I(mapping->host);
 	struct page *page;
 	struct pagevec pvec;
@@ -1366,26 +1333,29 @@ static int cifs_writepages(struct address_space *mapping,
 	if (cifs_sb->wsize < PAGE_CACHE_SIZE)
 		return generic_writepages(mapping, wbc);
 
-	if ((cifs_sb->tcon->ses) && (cifs_sb->tcon->ses->server))
-		if (cifs_sb->tcon->ses->server->secMode &
-				(SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
-			if (!experimEnabled)
-				return generic_writepages(mapping, wbc);
-
 	iov = kmalloc(32 * sizeof(struct kvec), GFP_KERNEL);
 	if (iov == NULL)
 		return generic_writepages(mapping, wbc);
 
-
 	/*
-	 * BB: Is this meaningful for a non-block-device file system?
-	 * If it is, we should test it again after we do I/O
+	 * if there's no open file, then this is likely to fail too,
+	 * but it'll at least handle the return. Maybe it should be
+	 * a BUG() instead?
 	 */
-	if (wbc->nonblocking && bdi_write_congested(bdi)) {
-		wbc->encountered_congestion = 1;
+	open_file = find_writable_file(CIFS_I(mapping->host), false);
+	if (!open_file) {
 		kfree(iov);
-		return 0;
+		return generic_writepages(mapping, wbc);
+	}
+
+	tcon = tlink_tcon(open_file->tlink);
+	if (!experimEnabled && tcon->ses->server->secMode &
+			(SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
+		cifsFileInfo_put(open_file);
+		kfree(iov);
+		return generic_writepages(mapping, wbc);
 	}
+	cifsFileInfo_put(open_file);
 
 	xid = GetXid();
 
@@ -1490,38 +1460,29 @@ retry:
 				break;
 		}
 		if (n_iov) {
-			/* Search for a writable handle every time we call
-			 * CIFSSMBWrite2.  We can't rely on the last handle
-			 * we used to still be valid
-			 */
-			open_file = find_writable_file(CIFS_I(mapping->host));
+			open_file = find_writable_file(CIFS_I(mapping->host),
+							false);
 			if (!open_file) {
 				cERROR(1, "No writable handles for inode");
 				rc = -EBADF;
 			} else {
 				long_op = cifs_write_timeout(cifsi, offset);
-				rc = CIFSSMBWrite2(xid, cifs_sb->tcon,
-						   open_file->netfid,
+				rc = CIFSSMBWrite2(xid, tcon, open_file->netfid,
 						   bytes_to_write, offset,
 						   &bytes_written, iov, n_iov,
 						   long_op);
 				cifsFileInfo_put(open_file);
 				cifs_update_eof(cifsi, offset, bytes_written);
+			}
 
-				if (rc || bytes_written < bytes_to_write) {
-					cERROR(1, "Write2 ret %d, wrote %d",
-						  rc, bytes_written);
-					/* BB what if continued retry is
-					   requested via mount flags? */
-					if (rc == -ENOSPC)
-						set_bit(AS_ENOSPC, &mapping->flags);
-					else
-						set_bit(AS_EIO, &mapping->flags);
-				} else {
-					cifs_stats_bytes_written(cifs_sb->tcon,
-								 bytes_written);
-				}
+			if (rc || bytes_written < bytes_to_write) {
+				cERROR(1, "Write2 ret %d, wrote %d",
+					  rc, bytes_written);
+				mapping_set_error(mapping, rc);
+			} else {
+				cifs_stats_bytes_written(tcon, bytes_written);
 			}
+
 			for (i = 0; i < n_iov; i++) {
 				page = pvec.pages[first + i];
 				/* Should we also set page error on
@@ -1622,7 +1583,8 @@ static int cifs_write_end(struct file *file, struct address_space *mapping,
 		/* BB check if anything else missing out of ppw
 		   such as updating last write time */
 		page_data = kmap(page);
-		rc = cifs_write(file, page_data + offset, copied, &pos);
+		rc = cifs_write(file->private_data, page_data + offset,
+				copied, &pos);
 		/* if (rc < 0) should we set writebehind rc? */
 		kunmap(page);
 
@@ -1651,8 +1613,7 @@ int cifs_fsync(struct file *file, int datasync)
 	int xid;
 	int rc = 0;
 	struct cifsTconInfo *tcon;
-	struct cifsFileInfo *smbfile =
-		(struct cifsFileInfo *)file->private_data;
+	struct cifsFileInfo *smbfile = file->private_data;
 	struct inode *inode = file->f_path.dentry->d_inode;
 
 	xid = GetXid();
@@ -1662,11 +1623,10 @@ int cifs_fsync(struct file *file, int datasync)
 
 	rc = filemap_write_and_wait(inode->i_mapping);
 	if (rc == 0) {
-		rc = CIFS_I(inode)->write_behind_rc;
-		CIFS_I(inode)->write_behind_rc = 0;
-		tcon = CIFS_SB(inode->i_sb)->tcon;
-		if (!rc && tcon && smbfile &&
-		   !(CIFS_SB(inode->i_sb)->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC))
+		struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+
+		tcon = tlink_tcon(smbfile->tlink);
+		if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC))
 			rc = CIFSSMBFlush(xid, tcon, smbfile->netfid);
 	}
 
@@ -1711,21 +1671,8 @@ int cifs_flush(struct file *file, fl_owner_t id)
 	struct inode *inode = file->f_path.dentry->d_inode;
 	int rc = 0;
 
-	/* Rather than do the steps manually:
-	   lock the inode for writing
-	   loop through pages looking for write behind data (dirty pages)
-	   coalesce into contiguous 16K (or smaller) chunks to write to server
-	   send to server (prefer in parallel)
-	   deal with writebehind errors
-	   unlock inode for writing
-	   filemapfdatawrite appears easier for the time being */
-
-	rc = filemap_fdatawrite(inode->i_mapping);
-	/* reset wb rc if we were able to write out dirty pages */
-	if (!rc) {
-		rc = CIFS_I(inode)->write_behind_rc;
-		CIFS_I(inode)->write_behind_rc = 0;
-	}
+	if (file->f_mode & FMODE_WRITE)
+		rc = filemap_write_and_wait(inode->i_mapping);
 
 	cFYI(1, "Flush inode %p file %p rc %d", inode, file, rc);
 
@@ -1749,14 +1696,14 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data,
 
 	xid = GetXid();
 	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
-	pTcon = cifs_sb->tcon;
 
 	if (file->private_data == NULL) {
 		rc = -EBADF;
 		FreeXid(xid);
 		return rc;
 	}
-	open_file = (struct cifsFileInfo *)file->private_data;
+	open_file = file->private_data;
+	pTcon = tlink_tcon(open_file->tlink);
 
 	if ((file->f_flags & O_ACCMODE) == O_WRONLY)
 		cFYI(1, "attempting read on write only file instance");
@@ -1770,9 +1717,8 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data,
 		smb_read_data = NULL;
 		while (rc == -EAGAIN) {
 			int buf_type = CIFS_NO_BUFFER;
-			if ((open_file->invalidHandle) &&
-			    (!open_file->closePend)) {
-				rc = cifs_reopen_file(file, true);
+			if (open_file->invalidHandle) {
+				rc = cifs_reopen_file(open_file, true);
 				if (rc != 0)
 					break;
 			}
@@ -1830,14 +1776,14 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
 
 	xid = GetXid();
 	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
-	pTcon = cifs_sb->tcon;
 
 	if (file->private_data == NULL) {
 		rc = -EBADF;
 		FreeXid(xid);
 		return rc;
 	}
-	open_file = (struct cifsFileInfo *)file->private_data;
+	open_file = file->private_data;
+	pTcon = tlink_tcon(open_file->tlink);
 
 	if ((file->f_flags & O_ACCMODE) == O_WRONLY)
 		cFYI(1, "attempting read on write only file instance");
@@ -1856,9 +1802,8 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
 		}
 		rc = -EAGAIN;
 		while (rc == -EAGAIN) {
-			if ((open_file->invalidHandle) &&
-			    (!open_file->closePend)) {
-				rc = cifs_reopen_file(file, true);
+			if (open_file->invalidHandle) {
+				rc = cifs_reopen_file(open_file, true);
 				if (rc != 0)
 					break;
 			}
@@ -1942,6 +1887,9 @@ static void cifs_copy_cache_pages(struct address_space *mapping,
 		SetPageUptodate(page);
 		unlock_page(page);
 		data += PAGE_CACHE_SIZE;
+
+		/* add page to FS-Cache */
+		cifs_readpage_to_fscache(mapping->host, page);
 	}
 	return;
 }
@@ -1968,9 +1916,18 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 		FreeXid(xid);
 		return rc;
 	}
-	open_file = (struct cifsFileInfo *)file->private_data;
+	open_file = file->private_data;
 	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
-	pTcon = cifs_sb->tcon;
+	pTcon = tlink_tcon(open_file->tlink);
+
+	/*
+	 * Reads as many pages as possible from fscache. Returns -ENOBUFS
+	 * immediately if the cookie is negative
+	 */
+	rc = cifs_readpages_from_fscache(mapping->host, mapping, page_list,
+					 &num_pages);
+	if (rc == 0)
+		goto read_complete;
 
 	cFYI(DBG2, "rpages: num pages %d", num_pages);
 	for (i = 0; i < num_pages; ) {
@@ -2009,9 +1966,8 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 				read_size, contig_pages);
 		rc = -EAGAIN;
 		while (rc == -EAGAIN) {
-			if ((open_file->invalidHandle) &&
-			    (!open_file->closePend)) {
-				rc = cifs_reopen_file(file, true);
+			if (open_file->invalidHandle) {
+				rc = cifs_reopen_file(open_file, true);
 				if (rc != 0)
 					break;
 			}
@@ -2082,6 +2038,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 		smb_read_data = NULL;
 	}
 
+read_complete:
 	FreeXid(xid);
 	return rc;
 }
@@ -2092,6 +2049,11 @@ static int cifs_readpage_worker(struct file *file, struct page *page,
 	char *read_data;
 	int rc;
 
+	/* Is the page cached? */
+	rc = cifs_readpage_from_fscache(file->f_path.dentry->d_inode, page);
+	if (rc == 0)
+		goto read_complete;
+
 	page_cache_get(page);
 	read_data = kmap(page);
 	/* for reads over a certain size could initiate async read ahead */
@@ -2111,11 +2073,17 @@ static int cifs_readpage_worker(struct file *file, struct page *page,
 
 	flush_dcache_page(page);
 	SetPageUptodate(page);
+
+	/* send this page to the cache */
+	cifs_readpage_to_fscache(file->f_path.dentry->d_inode, page);
+
 	rc = 0;
 
 io_error:
 	kunmap(page);
 	page_cache_release(page);
+
+read_complete:
 	return rc;
 }
 
@@ -2148,18 +2116,14 @@ static int is_inode_writable(struct cifsInodeInfo *cifs_inode)
 {
 	struct cifsFileInfo *open_file;
 
-	read_lock(&GlobalSMBSeslock);
+	spin_lock(&cifs_file_list_lock);
 	list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
-		if (open_file->closePend)
-			continue;
-		if (open_file->pfile &&
-		    ((open_file->pfile->f_flags & O_RDWR) ||
-		     (open_file->pfile->f_flags & O_WRONLY))) {
-			read_unlock(&GlobalSMBSeslock);
+		if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) {
+			spin_unlock(&cifs_file_list_lock);
 			return 1;
 		}
 	}
-	read_unlock(&GlobalSMBSeslock);
+	spin_unlock(&cifs_file_list_lock);
 	return 0;
 }
 
@@ -2265,15 +2229,29 @@ out:
 	return rc;
 }
 
-static void
-cifs_oplock_break(struct slow_work *work)
+static int cifs_release_page(struct page *page, gfp_t gfp)
+{
+	if (PagePrivate(page))
+		return 0;
+
+	return cifs_fscache_release_page(page, gfp);
+}
+
+static void cifs_invalidate_page(struct page *page, unsigned long offset)
+{
+	struct cifsInodeInfo *cifsi = CIFS_I(page->mapping->host);
+
+	if (offset == 0)
+		cifs_fscache_invalidate_page(page, &cifsi->vfs_inode);
+}
+
+void cifs_oplock_break(struct work_struct *work)
 {
 	struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo,
 						  oplock_break);
-	struct inode *inode = cfile->pInode;
+	struct inode *inode = cfile->dentry->d_inode;
 	struct cifsInodeInfo *cinode = CIFS_I(inode);
-	struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->mnt->mnt_sb);
-	int rc, waitrc = 0;
+	int rc = 0;
 
 	if (inode && S_ISREG(inode->i_mode)) {
 		if (cinode->clientCanCacheRead)
@@ -2282,13 +2260,10 @@ cifs_oplock_break(struct slow_work *work)
 			break_lease(inode, O_WRONLY);
 		rc = filemap_fdatawrite(inode->i_mapping);
 		if (cinode->clientCanCacheRead == 0) {
-			waitrc = filemap_fdatawait(inode->i_mapping);
+			rc = filemap_fdatawait(inode->i_mapping);
+			mapping_set_error(inode->i_mapping, rc);
 			invalidate_remote_inode(inode);
 		}
-		if (!rc)
-			rc = waitrc;
-		if (rc)
-			cinode->write_behind_rc = rc;
 		cFYI(1, "Oplock flush inode %p rc %d", inode, rc);
 	}
 
@@ -2298,38 +2273,36 @@ cifs_oplock_break(struct slow_work *work)
 	 * not bother sending an oplock release if session to server still is
 	 * disconnected since oplock already released by the server
 	 */
-	if (!cfile->closePend && !cfile->oplock_break_cancelled) {
-		rc = CIFSSMBLock(0, cifs_sb->tcon, cfile->netfid, 0, 0, 0, 0,
-				 LOCKING_ANDX_OPLOCK_RELEASE, false);
+	if (!cfile->oplock_break_cancelled) {
+		rc = CIFSSMBLock(0, tlink_tcon(cfile->tlink), cfile->netfid, 0,
+				 0, 0, 0, LOCKING_ANDX_OPLOCK_RELEASE, false);
 		cFYI(1, "Oplock release rc = %d", rc);
 	}
+
+	/*
+	 * We might have kicked in before is_valid_oplock_break()
+	 * finished grabbing reference for us.  Make sure it's done by
+	 * waiting for cifs_file_list_lock.
+	 */
+	spin_lock(&cifs_file_list_lock);
+	spin_unlock(&cifs_file_list_lock);
+
+	cifs_oplock_break_put(cfile);
 }
 
-static int
-cifs_oplock_break_get(struct slow_work *work)
+/* must be called while holding cifs_file_list_lock */
+void cifs_oplock_break_get(struct cifsFileInfo *cfile)
 {
-	struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo,
-						  oplock_break);
-	mntget(cfile->mnt);
+	cifs_sb_active(cfile->dentry->d_sb);
 	cifsFileInfo_get(cfile);
-	return 0;
 }
 
-static void
-cifs_oplock_break_put(struct slow_work *work)
+void cifs_oplock_break_put(struct cifsFileInfo *cfile)
 {
-	struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo,
-						  oplock_break);
-	mntput(cfile->mnt);
 	cifsFileInfo_put(cfile);
+	cifs_sb_deactive(cfile->dentry->d_sb);
 }
 
-const struct slow_work_ops cifs_oplock_break_ops = {
-	.get_ref	= cifs_oplock_break_get,
-	.put_ref	= cifs_oplock_break_put,
-	.execute	= cifs_oplock_break,
-};
-
 const struct address_space_operations cifs_addr_ops = {
 	.readpage = cifs_readpage,
 	.readpages = cifs_readpages,
@@ -2338,6 +2311,8 @@ const struct address_space_operations cifs_addr_ops = {
 	.write_begin = cifs_write_begin,
 	.write_end = cifs_write_end,
 	.set_page_dirty = __set_page_dirty_nobuffers,
+	.releasepage = cifs_release_page,
+	.invalidatepage = cifs_invalidate_page,
 	/* .sync_page = cifs_sync_page, */
 	/* .direct_IO = */
 };
@@ -2354,6 +2329,8 @@ const struct address_space_operations cifs_addr_ops_smallbuf = {
 	.write_begin = cifs_write_begin,
 	.write_end = cifs_write_end,
 	.set_page_dirty = __set_page_dirty_nobuffers,
+	.releasepage = cifs_release_page,
+	.invalidatepage = cifs_invalidate_page,
 	/* .sync_page = cifs_sync_page, */
 	/* .direct_IO = */
 };
diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c
new file mode 100644
index 00000000000..a2ad94efcfe
--- /dev/null
+++ b/fs/cifs/fscache.c
@@ -0,0 +1,237 @@
+/*
+ *   fs/cifs/fscache.c - CIFS filesystem cache interface
+ *
+ *   Copyright (c) 2010 Novell, Inc.
+ *   Author(s): Suresh Jayaraman (sjayaraman@suse.de>
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include "fscache.h"
+#include "cifsglob.h"
+#include "cifs_debug.h"
+#include "cifs_fs_sb.h"
+
+void cifs_fscache_get_client_cookie(struct TCP_Server_Info *server)
+{
+	server->fscache =
+		fscache_acquire_cookie(cifs_fscache_netfs.primary_index,
+				&cifs_fscache_server_index_def, server);
+	cFYI(1, "CIFS: get client cookie (0x%p/0x%p)", server,
+				server->fscache);
+}
+
+void cifs_fscache_release_client_cookie(struct TCP_Server_Info *server)
+{
+	cFYI(1, "CIFS: release client cookie (0x%p/0x%p)", server,
+				server->fscache);
+	fscache_relinquish_cookie(server->fscache, 0);
+	server->fscache = NULL;
+}
+
+void cifs_fscache_get_super_cookie(struct cifsTconInfo *tcon)
+{
+	struct TCP_Server_Info *server = tcon->ses->server;
+
+	tcon->fscache =
+		fscache_acquire_cookie(server->fscache,
+				&cifs_fscache_super_index_def, tcon);
+	cFYI(1, "CIFS: get superblock cookie (0x%p/0x%p)",
+				server->fscache, tcon->fscache);
+}
+
+void cifs_fscache_release_super_cookie(struct cifsTconInfo *tcon)
+{
+	cFYI(1, "CIFS: releasing superblock cookie (0x%p)", tcon->fscache);
+	fscache_relinquish_cookie(tcon->fscache, 0);
+	tcon->fscache = NULL;
+}
+
+static void cifs_fscache_enable_inode_cookie(struct inode *inode)
+{
+	struct cifsInodeInfo *cifsi = CIFS_I(inode);
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
+
+	if (cifsi->fscache)
+		return;
+
+	cifsi->fscache = fscache_acquire_cookie(tcon->fscache,
+				&cifs_fscache_inode_object_def, cifsi);
+	cFYI(1, "CIFS: got FH cookie (0x%p/0x%p)", tcon->fscache,
+				cifsi->fscache);
+}
+
+void cifs_fscache_release_inode_cookie(struct inode *inode)
+{
+	struct cifsInodeInfo *cifsi = CIFS_I(inode);
+
+	if (cifsi->fscache) {
+		cFYI(1, "CIFS releasing inode cookie (0x%p)",
+				cifsi->fscache);
+		fscache_relinquish_cookie(cifsi->fscache, 0);
+		cifsi->fscache = NULL;
+	}
+}
+
+static void cifs_fscache_disable_inode_cookie(struct inode *inode)
+{
+	struct cifsInodeInfo *cifsi = CIFS_I(inode);
+
+	if (cifsi->fscache) {
+		cFYI(1, "CIFS disabling inode cookie (0x%p)",
+				cifsi->fscache);
+		fscache_relinquish_cookie(cifsi->fscache, 1);
+		cifsi->fscache = NULL;
+	}
+}
+
+void cifs_fscache_set_inode_cookie(struct inode *inode, struct file *filp)
+{
+	if ((filp->f_flags & O_ACCMODE) != O_RDONLY)
+		cifs_fscache_disable_inode_cookie(inode);
+	else {
+		cifs_fscache_enable_inode_cookie(inode);
+		cFYI(1, "CIFS: fscache inode cookie set");
+	}
+}
+
+void cifs_fscache_reset_inode_cookie(struct inode *inode)
+{
+	struct cifsInodeInfo *cifsi = CIFS_I(inode);
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct fscache_cookie *old = cifsi->fscache;
+
+	if (cifsi->fscache) {
+		/* retire the current fscache cache and get a new one */
+		fscache_relinquish_cookie(cifsi->fscache, 1);
+
+		cifsi->fscache = fscache_acquire_cookie(
+					cifs_sb_master_tcon(cifs_sb)->fscache,
+					&cifs_fscache_inode_object_def,
+					cifsi);
+		cFYI(1, "CIFS: new cookie 0x%p oldcookie 0x%p",
+				cifsi->fscache, old);
+	}
+}
+
+int cifs_fscache_release_page(struct page *page, gfp_t gfp)
+{
+	if (PageFsCache(page)) {
+		struct inode *inode = page->mapping->host;
+		struct cifsInodeInfo *cifsi = CIFS_I(inode);
+
+		cFYI(1, "CIFS: fscache release page (0x%p/0x%p)",
+				page, cifsi->fscache);
+		if (!fscache_maybe_release_page(cifsi->fscache, page, gfp))
+			return 0;
+	}
+
+	return 1;
+}
+
+static void cifs_readpage_from_fscache_complete(struct page *page, void *ctx,
+						int error)
+{
+	cFYI(1, "CFS: readpage_from_fscache_complete (0x%p/%d)",
+			page, error);
+	if (!error)
+		SetPageUptodate(page);
+	unlock_page(page);
+}
+
+/*
+ * Retrieve a page from FS-Cache
+ */
+int __cifs_readpage_from_fscache(struct inode *inode, struct page *page)
+{
+	int ret;
+
+	cFYI(1, "CIFS: readpage_from_fscache(fsc:%p, p:%p, i:0x%p",
+			CIFS_I(inode)->fscache, page, inode);
+	ret = fscache_read_or_alloc_page(CIFS_I(inode)->fscache, page,
+					 cifs_readpage_from_fscache_complete,
+					 NULL,
+					 GFP_KERNEL);
+	switch (ret) {
+
+	case 0: /* page found in fscache, read submitted */
+		cFYI(1, "CIFS: readpage_from_fscache: submitted");
+		return ret;
+	case -ENOBUFS:	/* page won't be cached */
+	case -ENODATA:	/* page not in cache */
+		cFYI(1, "CIFS: readpage_from_fscache %d", ret);
+		return 1;
+
+	default:
+		cERROR(1, "unknown error ret = %d", ret);
+	}
+	return ret;
+}
+
+/*
+ * Retrieve a set of pages from FS-Cache
+ */
+int __cifs_readpages_from_fscache(struct inode *inode,
+				struct address_space *mapping,
+				struct list_head *pages,
+				unsigned *nr_pages)
+{
+	int ret;
+
+	cFYI(1, "CIFS: __cifs_readpages_from_fscache (0x%p/%u/0x%p)",
+			CIFS_I(inode)->fscache, *nr_pages, inode);
+	ret = fscache_read_or_alloc_pages(CIFS_I(inode)->fscache, mapping,
+					  pages, nr_pages,
+					  cifs_readpage_from_fscache_complete,
+					  NULL,
+					  mapping_gfp_mask(mapping));
+	switch (ret) {
+	case 0:	/* read submitted to the cache for all pages */
+		cFYI(1, "CIFS: readpages_from_fscache: submitted");
+		return ret;
+
+	case -ENOBUFS:	/* some pages are not cached and can't be */
+	case -ENODATA:	/* some pages are not cached */
+		cFYI(1, "CIFS: readpages_from_fscache: no page");
+		return 1;
+
+	default:
+		cFYI(1, "unknown error ret = %d", ret);
+	}
+
+	return ret;
+}
+
+void __cifs_readpage_to_fscache(struct inode *inode, struct page *page)
+{
+	int ret;
+
+	cFYI(1, "CIFS: readpage_to_fscache(fsc: %p, p: %p, i: %p",
+			CIFS_I(inode)->fscache, page, inode);
+	ret = fscache_write_page(CIFS_I(inode)->fscache, page, GFP_KERNEL);
+	if (ret != 0)
+		fscache_uncache_page(CIFS_I(inode)->fscache, page);
+}
+
+void __cifs_fscache_invalidate_page(struct page *page, struct inode *inode)
+{
+	struct cifsInodeInfo *cifsi = CIFS_I(inode);
+	struct fscache_cookie *cookie = cifsi->fscache;
+
+	cFYI(1, "CIFS: fscache invalidatepage (0x%p/0x%p)", page, cookie);
+	fscache_wait_on_page_write(cookie, page);
+	fscache_uncache_page(cookie, page);
+}
+
diff --git a/fs/cifs/fscache.h b/fs/cifs/fscache.h
new file mode 100644
index 00000000000..31b88ec2341
--- /dev/null
+++ b/fs/cifs/fscache.h
@@ -0,0 +1,136 @@
+/*
+ *   fs/cifs/fscache.h - CIFS filesystem cache interface definitions
+ *
+ *   Copyright (c) 2010 Novell, Inc.
+ *   Authors(s): Suresh Jayaraman (sjayaraman@suse.de>
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef _CIFS_FSCACHE_H
+#define _CIFS_FSCACHE_H
+
+#include <linux/fscache.h>
+
+#include "cifsglob.h"
+
+#ifdef CONFIG_CIFS_FSCACHE
+
+extern struct fscache_netfs cifs_fscache_netfs;
+extern const struct fscache_cookie_def cifs_fscache_server_index_def;
+extern const struct fscache_cookie_def cifs_fscache_super_index_def;
+extern const struct fscache_cookie_def cifs_fscache_inode_object_def;
+
+extern int cifs_fscache_register(void);
+extern void cifs_fscache_unregister(void);
+
+/*
+ * fscache.c
+ */
+extern void cifs_fscache_get_client_cookie(struct TCP_Server_Info *);
+extern void cifs_fscache_release_client_cookie(struct TCP_Server_Info *);
+extern void cifs_fscache_get_super_cookie(struct cifsTconInfo *);
+extern void cifs_fscache_release_super_cookie(struct cifsTconInfo *);
+
+extern void cifs_fscache_release_inode_cookie(struct inode *);
+extern void cifs_fscache_set_inode_cookie(struct inode *, struct file *);
+extern void cifs_fscache_reset_inode_cookie(struct inode *);
+
+extern void __cifs_fscache_invalidate_page(struct page *, struct inode *);
+extern int cifs_fscache_release_page(struct page *page, gfp_t gfp);
+extern int __cifs_readpage_from_fscache(struct inode *, struct page *);
+extern int __cifs_readpages_from_fscache(struct inode *,
+					 struct address_space *,
+					 struct list_head *,
+					 unsigned *);
+
+extern void __cifs_readpage_to_fscache(struct inode *, struct page *);
+
+static inline void cifs_fscache_invalidate_page(struct page *page,
+					       struct inode *inode)
+{
+	if (PageFsCache(page))
+		__cifs_fscache_invalidate_page(page, inode);
+}
+
+static inline int cifs_readpage_from_fscache(struct inode *inode,
+					     struct page *page)
+{
+	if (CIFS_I(inode)->fscache)
+		return __cifs_readpage_from_fscache(inode, page);
+
+	return -ENOBUFS;
+}
+
+static inline int cifs_readpages_from_fscache(struct inode *inode,
+					      struct address_space *mapping,
+					      struct list_head *pages,
+					      unsigned *nr_pages)
+{
+	if (CIFS_I(inode)->fscache)
+		return __cifs_readpages_from_fscache(inode, mapping, pages,
+						     nr_pages);
+	return -ENOBUFS;
+}
+
+static inline void cifs_readpage_to_fscache(struct inode *inode,
+					    struct page *page)
+{
+	if (PageFsCache(page))
+		__cifs_readpage_to_fscache(inode, page);
+}
+
+#else /* CONFIG_CIFS_FSCACHE */
+static inline int cifs_fscache_register(void) { return 0; }
+static inline void cifs_fscache_unregister(void) {}
+
+static inline void
+cifs_fscache_get_client_cookie(struct TCP_Server_Info *server) {}
+static inline void
+cifs_fscache_release_client_cookie(struct TCP_Server_Info *server) {}
+static inline void cifs_fscache_get_super_cookie(struct cifsTconInfo *tcon) {}
+static inline void
+cifs_fscache_release_super_cookie(struct cifsTconInfo *tcon) {}
+
+static inline void cifs_fscache_release_inode_cookie(struct inode *inode) {}
+static inline void cifs_fscache_set_inode_cookie(struct inode *inode,
+						 struct file *filp) {}
+static inline void cifs_fscache_reset_inode_cookie(struct inode *inode) {}
+static inline int cifs_fscache_release_page(struct page *page, gfp_t gfp)
+{
+	return 1; /* May release page */
+}
+
+static inline void cifs_fscache_invalidate_page(struct page *page,
+			struct inode *inode) {}
+static inline int
+cifs_readpage_from_fscache(struct inode *inode, struct page *page)
+{
+	return -ENOBUFS;
+}
+
+static inline int cifs_readpages_from_fscache(struct inode *inode,
+					      struct address_space *mapping,
+					      struct list_head *pages,
+					      unsigned *nr_pages)
+{
+	return -ENOBUFS;
+}
+
+static inline void cifs_readpage_to_fscache(struct inode *inode,
+			struct page *page) {}
+
+#endif /* CONFIG_CIFS_FSCACHE */
+
+#endif /* _CIFS_FSCACHE_H */
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 6f0683c6895..39869c3c3ef 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -29,6 +29,7 @@
 #include "cifsproto.h"
 #include "cifs_debug.h"
 #include "cifs_fs_sb.h"
+#include "fscache.h"
 
 
 static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral)
@@ -51,7 +52,7 @@ static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral)
 
 
 		/* check if server can support readpages */
-		if (cifs_sb->tcon->ses->server->maxBuf <
+		if (cifs_sb_master_tcon(cifs_sb)->ses->server->maxBuf <
 				PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE)
 			inode->i_data.a_ops = &cifs_addr_ops_smallbuf;
 		else
@@ -287,8 +288,8 @@ int cifs_get_file_info_unix(struct file *filp)
 	struct cifs_fattr fattr;
 	struct inode *inode = filp->f_path.dentry->d_inode;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
-	struct cifsTconInfo *tcon = cifs_sb->tcon;
-	struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data;
+	struct cifsFileInfo *cfile = filp->private_data;
+	struct cifsTconInfo *tcon = tlink_tcon(cfile->tlink);
 
 	xid = GetXid();
 	rc = CIFSSMBUnixQFileInfo(xid, tcon, cfile->netfid, &find_data);
@@ -312,15 +313,21 @@ int cifs_get_inode_info_unix(struct inode **pinode,
 	FILE_UNIX_BASIC_INFO find_data;
 	struct cifs_fattr fattr;
 	struct cifsTconInfo *tcon;
+	struct tcon_link *tlink;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
 
-	tcon = cifs_sb->tcon;
 	cFYI(1, "Getting info on %s", full_path);
 
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	tcon = tlink_tcon(tlink);
+
 	/* could have done a find first instead but this returns more info */
 	rc = CIFSSMBUnixQPathInfo(xid, tcon, full_path, &find_data,
 				  cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
 					CIFS_MOUNT_MAP_SPECIAL_CHR);
+	cifs_put_tlink(tlink);
 
 	if (!rc) {
 		cifs_unix_basic_to_fattr(&fattr, &find_data, cifs_sb);
@@ -331,6 +338,13 @@ int cifs_get_inode_info_unix(struct inode **pinode,
 		return rc;
 	}
 
+	/* check for Minshall+French symlinks */
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) {
+		int tmprc = CIFSCheckMFSymlink(&fattr, full_path, cifs_sb, xid);
+		if (tmprc)
+			cFYI(1, "CIFSCheckMFSymlink: %d", tmprc);
+	}
+
 	if (*pinode == NULL) {
 		/* get new inode */
 		cifs_fill_uniqueid(sb, &fattr);
@@ -352,7 +366,8 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
 	int rc;
 	int oplock = 0;
 	__u16 netfid;
-	struct cifsTconInfo *pTcon = cifs_sb->tcon;
+	struct tcon_link *tlink;
+	struct cifsTconInfo *tcon;
 	char buf[24];
 	unsigned int bytes_read;
 	char *pbuf;
@@ -371,7 +386,12 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
 		return -EINVAL;	 /* EOPNOTSUPP? */
 	}
 
-	rc = CIFSSMBOpen(xid, pTcon, path, FILE_OPEN, GENERIC_READ,
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	tcon = tlink_tcon(tlink);
+
+	rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, GENERIC_READ,
 			 CREATE_NOT_DIR, &netfid, &oplock, NULL,
 			 cifs_sb->local_nls,
 			 cifs_sb->mnt_cifs_flags &
@@ -379,7 +399,7 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
 	if (rc == 0) {
 		int buf_type = CIFS_NO_BUFFER;
 			/* Read header */
-		rc = CIFSSMBRead(xid, pTcon, netfid,
+		rc = CIFSSMBRead(xid, tcon, netfid,
 				 24 /* length */, 0 /* offset */,
 				 &bytes_read, &pbuf, &buf_type);
 		if ((rc == 0) && (bytes_read >= 8)) {
@@ -421,8 +441,9 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
 			fattr->cf_dtype = DT_REG;
 			rc = -EOPNOTSUPP; /* or some unknown SFU type */
 		}
-		CIFSSMBClose(xid, pTcon, netfid);
+		CIFSSMBClose(xid, tcon, netfid);
 	}
+	cifs_put_tlink(tlink);
 	return rc;
 }
 
@@ -440,11 +461,19 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path,
 	ssize_t rc;
 	char ea_value[4];
 	__u32 mode;
+	struct tcon_link *tlink;
+	struct cifsTconInfo *tcon;
+
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	tcon = tlink_tcon(tlink);
 
-	rc = CIFSSMBQAllEAs(xid, cifs_sb->tcon, path, "SETFILEBITS",
+	rc = CIFSSMBQAllEAs(xid, tcon, path, "SETFILEBITS",
 			    ea_value, 4 /* size of buf */, cifs_sb->local_nls,
 			    cifs_sb->mnt_cifs_flags &
 				CIFS_MOUNT_MAP_SPECIAL_CHR);
+	cifs_put_tlink(tlink);
 	if (rc < 0)
 		return (int)rc;
 	else if (rc > 3) {
@@ -467,6 +496,8 @@ static void
 cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
 		       struct cifs_sb_info *cifs_sb, bool adjust_tz)
 {
+	struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
+
 	memset(fattr, 0, sizeof(*fattr));
 	fattr->cf_cifsattrs = le32_to_cpu(info->Attributes);
 	if (info->DeletePending)
@@ -481,8 +512,8 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
 	fattr->cf_mtime = cifs_NTtimeToUnix(info->LastWriteTime);
 
 	if (adjust_tz) {
-		fattr->cf_ctime.tv_sec += cifs_sb->tcon->ses->server->timeAdj;
-		fattr->cf_mtime.tv_sec += cifs_sb->tcon->ses->server->timeAdj;
+		fattr->cf_ctime.tv_sec += tcon->ses->server->timeAdj;
+		fattr->cf_mtime.tv_sec += tcon->ses->server->timeAdj;
 	}
 
 	fattr->cf_eof = le64_to_cpu(info->EndOfFile);
@@ -514,8 +545,8 @@ int cifs_get_file_info(struct file *filp)
 	struct cifs_fattr fattr;
 	struct inode *inode = filp->f_path.dentry->d_inode;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
-	struct cifsTconInfo *tcon = cifs_sb->tcon;
-	struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data;
+	struct cifsFileInfo *cfile = filp->private_data;
+	struct cifsTconInfo *tcon = tlink_tcon(cfile->tlink);
 
 	xid = GetXid();
 	rc = CIFSSMBQFileInfo(xid, tcon, cfile->netfid, &find_data);
@@ -553,26 +584,33 @@ int cifs_get_inode_info(struct inode **pinode,
 {
 	int rc = 0, tmprc;
 	struct cifsTconInfo *pTcon;
+	struct tcon_link *tlink;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
 	char *buf = NULL;
 	bool adjustTZ = false;
 	struct cifs_fattr fattr;
 
-	pTcon = cifs_sb->tcon;
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	pTcon = tlink_tcon(tlink);
+
 	cFYI(1, "Getting info on %s", full_path);
 
 	if ((pfindData == NULL) && (*pinode != NULL)) {
 		if (CIFS_I(*pinode)->clientCanCacheRead) {
 			cFYI(1, "No need to revalidate cached inode sizes");
-			return rc;
+			goto cgii_exit;
 		}
 	}
 
 	/* if file info not passed in then get it from server */
 	if (pfindData == NULL) {
 		buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
-		if (buf == NULL)
-			return -ENOMEM;
+		if (buf == NULL) {
+			rc = -ENOMEM;
+			goto cgii_exit;
+		}
 		pfindData = (FILE_ALL_INFO *)buf;
 
 		/* could do find first instead but this returns more info */
@@ -660,6 +698,13 @@ int cifs_get_inode_info(struct inode **pinode,
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)
 		cifs_sfu_mode(&fattr, full_path, cifs_sb, xid);
 
+	/* check for Minshall+French symlinks */
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) {
+		tmprc = CIFSCheckMFSymlink(&fattr, full_path, cifs_sb, xid);
+		if (tmprc)
+			cFYI(1, "CIFSCheckMFSymlink: %d", tmprc);
+	}
+
 	if (!*pinode) {
 		*pinode = cifs_iget(sb, &fattr);
 		if (!*pinode)
@@ -670,6 +715,7 @@ int cifs_get_inode_info(struct inode **pinode,
 
 cgii_exit:
 	kfree(buf);
+	cifs_put_tlink(tlink);
 	return rc;
 }
 
@@ -682,6 +728,7 @@ char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb)
 	int pplen = cifs_sb->prepathlen;
 	int dfsplen;
 	char *full_path = NULL;
+	struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
 
 	/* if no prefix path, simply set path to the root of share to "" */
 	if (pplen == 0) {
@@ -691,8 +738,8 @@ char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb)
 		return full_path;
 	}
 
-	if (cifs_sb->tcon && (cifs_sb->tcon->Flags & SMB_SHARE_IS_IN_DFS))
-		dfsplen = strnlen(cifs_sb->tcon->treeName, MAX_TREE_SIZE + 1);
+	if (tcon->Flags & SMB_SHARE_IS_IN_DFS)
+		dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1);
 	else
 		dfsplen = 0;
 
@@ -701,7 +748,7 @@ char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb)
 		return full_path;
 
 	if (dfsplen) {
-		strncpy(full_path, cifs_sb->tcon->treeName, dfsplen);
+		strncpy(full_path, tcon->treeName, dfsplen);
 		/* switch slash direction in prepath depending on whether
 		 * windows or posix style path names
 		 */
@@ -723,18 +770,17 @@ cifs_find_inode(struct inode *inode, void *opaque)
 {
 	struct cifs_fattr *fattr = (struct cifs_fattr *) opaque;
 
+	/* don't match inode with different uniqueid */
 	if (CIFS_I(inode)->uniqueid != fattr->cf_uniqueid)
 		return 0;
 
-	/*
-	 * uh oh -- it's a directory. We can't use it since hardlinked dirs are
-	 * verboten. Disable serverino and return it as if it were found, the
-	 * caller can discard it, generate a uniqueid and retry the find
-	 */
-	if (S_ISDIR(inode->i_mode) && !list_empty(&inode->i_dentry)) {
+	/* don't match inode of different type */
+	if ((inode->i_mode & S_IFMT) != (fattr->cf_mode & S_IFMT))
+		return 0;
+
+	/* if it's not a directory or has no dentries, then flag it */
+	if (S_ISDIR(inode->i_mode) && !list_empty(&inode->i_dentry))
 		fattr->cf_flags |= CIFS_FATTR_INO_COLLISION;
-		cifs_autodisable_serverino(CIFS_SB(inode->i_sb));
-	}
 
 	return 1;
 }
@@ -748,6 +794,27 @@ cifs_init_inode(struct inode *inode, void *opaque)
 	return 0;
 }
 
+/*
+ * walk dentry list for an inode and report whether it has aliases that
+ * are hashed. We use this to determine if a directory inode can actually
+ * be used.
+ */
+static bool
+inode_has_hashed_dentries(struct inode *inode)
+{
+	struct dentry *dentry;
+
+	spin_lock(&dcache_lock);
+	list_for_each_entry(dentry, &inode->i_dentry, d_alias) {
+		if (!d_unhashed(dentry) || IS_ROOT(dentry)) {
+			spin_unlock(&dcache_lock);
+			return true;
+		}
+	}
+	spin_unlock(&dcache_lock);
+	return false;
+}
+
 /* Given fattrs, get a corresponding inode */
 struct inode *
 cifs_iget(struct super_block *sb, struct cifs_fattr *fattr)
@@ -763,12 +830,16 @@ retry_iget5_locked:
 
 	inode = iget5_locked(sb, hash, cifs_find_inode, cifs_init_inode, fattr);
 	if (inode) {
-		/* was there a problematic inode number collision? */
+		/* was there a potentially problematic inode collision? */
 		if (fattr->cf_flags & CIFS_FATTR_INO_COLLISION) {
-			iput(inode);
-			fattr->cf_uniqueid = iunique(sb, ROOT_I);
 			fattr->cf_flags &= ~CIFS_FATTR_INO_COLLISION;
-			goto retry_iget5_locked;
+
+			if (inode_has_hashed_dentries(inode)) {
+				cifs_autodisable_serverino(CIFS_SB(sb));
+				iput(inode);
+				fattr->cf_uniqueid = iunique(sb, ROOT_I);
+				goto retry_iget5_locked;
+			}
 		}
 
 		cifs_fattr_to_inode(inode, fattr);
@@ -776,6 +847,12 @@ retry_iget5_locked:
 			inode->i_flags |= S_NOATIME | S_NOCMTIME;
 		if (inode->i_state & I_NEW) {
 			inode->i_ino = hash;
+			if (S_ISREG(inode->i_mode))
+				inode->i_data.backing_dev_info = sb->s_bdi;
+#ifdef CONFIG_CIFS_FSCACHE
+			/* initialize per-inode cache cookie pointer */
+			CIFS_I(inode)->fscache = NULL;
+#endif
 			unlock_new_inode(inode);
 		}
 	}
@@ -787,27 +864,32 @@ retry_iget5_locked:
 struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino)
 {
 	int xid;
-	struct cifs_sb_info *cifs_sb;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
 	struct inode *inode = NULL;
 	long rc;
 	char *full_path;
+	struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
 
-	cifs_sb = CIFS_SB(sb);
 	full_path = cifs_build_path_to_root(cifs_sb);
 	if (full_path == NULL)
 		return ERR_PTR(-ENOMEM);
 
 	xid = GetXid();
-	if (cifs_sb->tcon->unix_ext)
+	if (tcon->unix_ext)
 		rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid);
 	else
 		rc = cifs_get_inode_info(&inode, full_path, NULL, sb,
 						xid, NULL);
 
 	if (!inode)
-		return ERR_PTR(-ENOMEM);
+		return ERR_PTR(rc);
+
+#ifdef CONFIG_CIFS_FSCACHE
+	/* populate tcon->resource_id */
+	tcon->resource_id = CIFS_I(inode)->uniqueid;
+#endif
 
-	if (rc && cifs_sb->tcon->ipc) {
+	if (rc && tcon->ipc) {
 		cFYI(1, "ipc connection - fake read inode");
 		inode->i_mode |= S_IFDIR;
 		inode->i_nlink = 2;
@@ -843,7 +925,8 @@ cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid,
 	struct cifsFileInfo *open_file;
 	struct cifsInodeInfo *cifsInode = CIFS_I(inode);
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
-	struct cifsTconInfo *pTcon = cifs_sb->tcon;
+	struct tcon_link *tlink = NULL;
+	struct cifsTconInfo *pTcon;
 	FILE_BASIC_INFO	info_buf;
 
 	if (attrs == NULL)
@@ -882,13 +965,22 @@ cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid,
 	/*
 	 * If the file is already open for write, just use that fileid
 	 */
-	open_file = find_writable_file(cifsInode);
+	open_file = find_writable_file(cifsInode, true);
 	if (open_file) {
 		netfid = open_file->netfid;
 		netpid = open_file->pid;
+		pTcon = tlink_tcon(open_file->tlink);
 		goto set_via_filehandle;
 	}
 
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink)) {
+		rc = PTR_ERR(tlink);
+		tlink = NULL;
+		goto out;
+	}
+	pTcon = tlink_tcon(tlink);
+
 	/*
 	 * NT4 apparently returns success on this call, but it doesn't
 	 * really work.
@@ -932,6 +1024,8 @@ set_via_filehandle:
 	else
 		cifsFileInfo_put(open_file);
 out:
+	if (tlink != NULL)
+		cifs_put_tlink(tlink);
 	return rc;
 }
 
@@ -949,10 +1043,16 @@ cifs_rename_pending_delete(char *full_path, struct dentry *dentry, int xid)
 	struct inode *inode = dentry->d_inode;
 	struct cifsInodeInfo *cifsInode = CIFS_I(inode);
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
-	struct cifsTconInfo *tcon = cifs_sb->tcon;
+	struct tcon_link *tlink;
+	struct cifsTconInfo *tcon;
 	__u32 dosattr, origattr;
 	FILE_BASIC_INFO *info_buf = NULL;
 
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	tcon = tlink_tcon(tlink);
+
 	rc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN,
 			 DELETE|FILE_WRITE_ATTRIBUTES, CREATE_NOT_DIR,
 			 &netfid, &oplock, NULL, cifs_sb->local_nls,
@@ -1021,6 +1121,7 @@ out_close:
 	CIFSSMBClose(xid, tcon, netfid);
 out:
 	kfree(info_buf);
+	cifs_put_tlink(tlink);
 	return rc;
 
 	/*
@@ -1060,12 +1161,18 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
 	struct cifsInodeInfo *cifs_inode;
 	struct super_block *sb = dir->i_sb;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
-	struct cifsTconInfo *tcon = cifs_sb->tcon;
+	struct tcon_link *tlink;
+	struct cifsTconInfo *tcon;
 	struct iattr *attrs = NULL;
 	__u32 dosattr = 0, origattr = 0;
 
 	cFYI(1, "cifs_unlink, dir=0x%p, dentry=0x%p", dir, dentry);
 
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	tcon = tlink_tcon(tlink);
+
 	xid = GetXid();
 
 	/* Unlink can be called from rename so we can not take the
@@ -1073,8 +1180,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
 	full_path = build_path_from_dentry(dentry);
 	if (full_path == NULL) {
 		rc = -ENOMEM;
-		FreeXid(xid);
-		return rc;
+		goto unlink_out;
 	}
 
 	if ((tcon->ses->capabilities & CAP_UNIX) &&
@@ -1140,10 +1246,11 @@ out_reval:
 	dir->i_ctime = dir->i_mtime = current_fs_time(sb);
 	cifs_inode = CIFS_I(dir);
 	CIFS_I(dir)->time = 0;	/* force revalidate of dir as well */
-
+unlink_out:
 	kfree(full_path);
 	kfree(attrs);
 	FreeXid(xid);
+	cifs_put_tlink(tlink);
 	return rc;
 }
 
@@ -1152,6 +1259,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 	int rc = 0, tmprc;
 	int xid;
 	struct cifs_sb_info *cifs_sb;
+	struct tcon_link *tlink;
 	struct cifsTconInfo *pTcon;
 	char *full_path = NULL;
 	struct inode *newinode = NULL;
@@ -1159,16 +1267,18 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 
 	cFYI(1, "In cifs_mkdir, mode = 0x%x inode = 0x%p", mode, inode);
 
-	xid = GetXid();
-
 	cifs_sb = CIFS_SB(inode->i_sb);
-	pTcon = cifs_sb->tcon;
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	pTcon = tlink_tcon(tlink);
+
+	xid = GetXid();
 
 	full_path = build_path_from_dentry(direntry);
 	if (full_path == NULL) {
 		rc = -ENOMEM;
-		FreeXid(xid);
-		return rc;
+		goto mkdir_out;
 	}
 
 	if ((pTcon->ses->capabilities & CAP_UNIX) &&
@@ -1326,6 +1436,7 @@ mkdir_get_info:
 mkdir_out:
 	kfree(full_path);
 	FreeXid(xid);
+	cifs_put_tlink(tlink);
 	return rc;
 }
 
@@ -1334,6 +1445,7 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
 	int rc = 0;
 	int xid;
 	struct cifs_sb_info *cifs_sb;
+	struct tcon_link *tlink;
 	struct cifsTconInfo *pTcon;
 	char *full_path = NULL;
 	struct cifsInodeInfo *cifsInode;
@@ -1342,18 +1454,23 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
 
 	xid = GetXid();
 
-	cifs_sb = CIFS_SB(inode->i_sb);
-	pTcon = cifs_sb->tcon;
-
 	full_path = build_path_from_dentry(direntry);
 	if (full_path == NULL) {
 		rc = -ENOMEM;
-		FreeXid(xid);
-		return rc;
+		goto rmdir_exit;
+	}
+
+	cifs_sb = CIFS_SB(inode->i_sb);
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink)) {
+		rc = PTR_ERR(tlink);
+		goto rmdir_exit;
 	}
+	pTcon = tlink_tcon(tlink);
 
 	rc = CIFSSMBRmDir(xid, pTcon, full_path, cifs_sb->local_nls,
 			  cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+	cifs_put_tlink(tlink);
 
 	if (!rc) {
 		drop_nlink(inode);
@@ -1374,6 +1491,7 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
 	direntry->d_inode->i_ctime = inode->i_ctime = inode->i_mtime =
 		current_fs_time(inode->i_sb);
 
+rmdir_exit:
 	kfree(full_path);
 	FreeXid(xid);
 	return rc;
@@ -1384,10 +1502,16 @@ cifs_do_rename(int xid, struct dentry *from_dentry, const char *fromPath,
 		struct dentry *to_dentry, const char *toPath)
 {
 	struct cifs_sb_info *cifs_sb = CIFS_SB(from_dentry->d_sb);
-	struct cifsTconInfo *pTcon = cifs_sb->tcon;
+	struct tcon_link *tlink;
+	struct cifsTconInfo *pTcon;
 	__u16 srcfid;
 	int oplock, rc;
 
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	pTcon = tlink_tcon(tlink);
+
 	/* try path-based rename first */
 	rc = CIFSSMBRename(xid, pTcon, fromPath, toPath, cifs_sb->local_nls,
 			   cifs_sb->mnt_cifs_flags &
@@ -1399,11 +1523,11 @@ cifs_do_rename(int xid, struct dentry *from_dentry, const char *fromPath,
 	 * rename by filehandle to various Windows servers.
 	 */
 	if (rc == 0 || rc != -ETXTBSY)
-		return rc;
+		goto do_rename_exit;
 
 	/* open-file renames don't work across directories */
 	if (to_dentry->d_parent != from_dentry->d_parent)
-		return rc;
+		goto do_rename_exit;
 
 	/* open the file to be renamed -- we need DELETE perms */
 	rc = CIFSSMBOpen(xid, pTcon, fromPath, FILE_OPEN, DELETE,
@@ -1419,7 +1543,8 @@ cifs_do_rename(int xid, struct dentry *from_dentry, const char *fromPath,
 
 		CIFSSMBClose(xid, pTcon, srcfid);
 	}
-
+do_rename_exit:
+	cifs_put_tlink(tlink);
 	return rc;
 }
 
@@ -1428,29 +1553,22 @@ int cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
 {
 	char *fromName = NULL;
 	char *toName = NULL;
-	struct cifs_sb_info *cifs_sb_source;
-	struct cifs_sb_info *cifs_sb_target;
+	struct cifs_sb_info *cifs_sb;
+	struct tcon_link *tlink;
 	struct cifsTconInfo *tcon;
 	FILE_UNIX_BASIC_INFO *info_buf_source = NULL;
 	FILE_UNIX_BASIC_INFO *info_buf_target;
 	int xid, rc, tmprc;
 
-	cifs_sb_target = CIFS_SB(target_dir->i_sb);
-	cifs_sb_source = CIFS_SB(source_dir->i_sb);
-	tcon = cifs_sb_source->tcon;
+	cifs_sb = CIFS_SB(source_dir->i_sb);
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	tcon = tlink_tcon(tlink);
 
 	xid = GetXid();
 
 	/*
-	 * BB: this might be allowed if same server, but different share.
-	 * Consider adding support for this
-	 */
-	if (tcon != cifs_sb_target->tcon) {
-		rc = -EXDEV;
-		goto cifs_rename_exit;
-	}
-
-	/*
 	 * we already have the rename sem so we do not need to
 	 * grab it again here to protect the path integrity
 	 */
@@ -1485,17 +1603,16 @@ int cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
 		info_buf_target = info_buf_source + 1;
 		tmprc = CIFSSMBUnixQPathInfo(xid, tcon, fromName,
 					info_buf_source,
-					cifs_sb_source->local_nls,
-					cifs_sb_source->mnt_cifs_flags &
+					cifs_sb->local_nls,
+					cifs_sb->mnt_cifs_flags &
 					CIFS_MOUNT_MAP_SPECIAL_CHR);
 		if (tmprc != 0)
 			goto unlink_target;
 
-		tmprc = CIFSSMBUnixQPathInfo(xid, tcon,
-					toName, info_buf_target,
-					cifs_sb_target->local_nls,
-					/* remap based on source sb */
-					cifs_sb_source->mnt_cifs_flags &
+		tmprc = CIFSSMBUnixQPathInfo(xid, tcon, toName,
+					info_buf_target,
+					cifs_sb->local_nls,
+					cifs_sb->mnt_cifs_flags &
 					CIFS_MOUNT_MAP_SPECIAL_CHR);
 
 		if (tmprc == 0 && (info_buf_source->UniqueId ==
@@ -1523,6 +1640,7 @@ cifs_rename_exit:
 	kfree(fromName);
 	kfree(toName);
 	FreeXid(xid);
+	cifs_put_tlink(tlink);
 	return rc;
 }
 
@@ -1564,21 +1682,22 @@ cifs_invalidate_mapping(struct inode *inode)
 	/* write back any cached data */
 	if (inode->i_mapping && inode->i_mapping->nrpages != 0) {
 		rc = filemap_write_and_wait(inode->i_mapping);
-		if (rc)
-			cifs_i->write_behind_rc = rc;
+		mapping_set_error(inode->i_mapping, rc);
 	}
 	invalidate_remote_inode(inode);
+	cifs_fscache_reset_inode_cookie(inode);
 }
 
 int cifs_revalidate_file(struct file *filp)
 {
 	int rc = 0;
 	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data;
 
 	if (!cifs_inode_needs_reval(inode))
 		goto check_inval;
 
-	if (CIFS_SB(inode->i_sb)->tcon->unix_ext)
+	if (tlink_tcon(cfile->tlink)->unix_ext)
 		rc = cifs_get_file_info_unix(filp);
 	else
 		rc = cifs_get_file_info(filp);
@@ -1619,7 +1738,7 @@ int cifs_revalidate_dentry(struct dentry *dentry)
 		 "jiffies %ld", full_path, inode, inode->i_count.counter,
 		 dentry, dentry->d_time, jiffies);
 
-	if (CIFS_SB(sb)->tcon->unix_ext)
+	if (cifs_sb_master_tcon(CIFS_SB(sb))->unix_ext)
 		rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid);
 	else
 		rc = cifs_get_inode_info(&inode, full_path, NULL, sb,
@@ -1635,13 +1754,29 @@ check_inval:
 }
 
 int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
-	struct kstat *stat)
+		 struct kstat *stat)
 {
+	struct cifs_sb_info *cifs_sb = CIFS_SB(dentry->d_sb);
+	struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
 	int err = cifs_revalidate_dentry(dentry);
+
 	if (!err) {
 		generic_fillattr(dentry->d_inode, stat);
 		stat->blksize = CIFS_MAX_MSGSIZE;
 		stat->ino = CIFS_I(dentry->d_inode)->uniqueid;
+
+		/*
+		 * If on a multiuser mount without unix extensions, and the
+		 * admin hasn't overridden them, set the ownership to the
+		 * fsuid/fsgid of the current process.
+		 */
+		if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) &&
+		    !tcon->unix_ext) {
+			if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID))
+				stat->uid = current_fsuid();
+			if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID))
+				stat->gid = current_fsgid();
+		}
 	}
 	return err;
 }
@@ -1663,26 +1798,16 @@ static int cifs_truncate_page(struct address_space *mapping, loff_t from)
 	return rc;
 }
 
-static int cifs_vmtruncate(struct inode *inode, loff_t offset)
+static void cifs_setsize(struct inode *inode, loff_t offset)
 {
 	loff_t oldsize;
-	int err;
 
 	spin_lock(&inode->i_lock);
-	err = inode_newsize_ok(inode, offset);
-	if (err) {
-		spin_unlock(&inode->i_lock);
-		goto out;
-	}
-
 	oldsize = inode->i_size;
 	i_size_write(inode, offset);
 	spin_unlock(&inode->i_lock);
+
 	truncate_pagecache(inode, oldsize, offset);
-	if (inode->i_op->truncate)
-		inode->i_op->truncate(inode);
-out:
-	return err;
 }
 
 static int
@@ -1693,7 +1818,8 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
 	struct cifsFileInfo *open_file;
 	struct cifsInodeInfo *cifsInode = CIFS_I(inode);
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
-	struct cifsTconInfo *pTcon = cifs_sb->tcon;
+	struct tcon_link *tlink = NULL;
+	struct cifsTconInfo *pTcon = NULL;
 
 	/*
 	 * To avoid spurious oplock breaks from server, in the case of
@@ -1704,10 +1830,11 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
 	 * writebehind data than the SMB timeout for the SetPathInfo
 	 * request would allow
 	 */
-	open_file = find_writable_file(cifsInode);
+	open_file = find_writable_file(cifsInode, true);
 	if (open_file) {
 		__u16 nfid = open_file->netfid;
 		__u32 npid = open_file->pid;
+		pTcon = tlink_tcon(open_file->tlink);
 		rc = CIFSSMBSetFileSize(xid, pTcon, attrs->ia_size, nfid,
 					npid, false);
 		cifsFileInfo_put(open_file);
@@ -1722,6 +1849,13 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
 		rc = -EINVAL;
 
 	if (rc != 0) {
+		if (pTcon == NULL) {
+			tlink = cifs_sb_tlink(cifs_sb);
+			if (IS_ERR(tlink))
+				return PTR_ERR(tlink);
+			pTcon = tlink_tcon(tlink);
+		}
+
 		/* Set file size by pathname rather than by handle
 		   either because no valid, writeable file handle for
 		   it was found or because there was an error setting
@@ -1751,11 +1885,13 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
 				CIFSSMBClose(xid, pTcon, netfid);
 			}
 		}
+		if (tlink)
+			cifs_put_tlink(tlink);
 	}
 
 	if (rc == 0) {
 		cifsInode->server_eof = attrs->ia_size;
-		rc = cifs_vmtruncate(inode, attrs->ia_size);
+		cifs_setsize(inode, attrs->ia_size);
 		cifs_truncate_page(inode->i_mapping, inode->i_size);
 	}
 
@@ -1771,7 +1907,8 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
 	struct inode *inode = direntry->d_inode;
 	struct cifsInodeInfo *cifsInode = CIFS_I(inode);
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
-	struct cifsTconInfo *pTcon = cifs_sb->tcon;
+	struct tcon_link *tlink;
+	struct cifsTconInfo *pTcon;
 	struct cifs_unix_set_info_args *args = NULL;
 	struct cifsFileInfo *open_file;
 
@@ -1780,14 +1917,12 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
 
 	xid = GetXid();
 
-	if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) == 0) {
-		/* check if we have permission to change attrs */
-		rc = inode_change_ok(inode, attrs);
-		if (rc < 0)
-			goto out;
-		else
-			rc = 0;
-	}
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM)
+		attrs->ia_valid |= ATTR_FORCE;
+
+	rc = inode_change_ok(inode, attrs);
+	if (rc < 0)
+		goto out;
 
 	full_path = build_path_from_dentry(direntry);
 	if (full_path == NULL) {
@@ -1807,10 +1942,8 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
 	 * the flush returns error?
 	 */
 	rc = filemap_write_and_wait(inode->i_mapping);
-	if (rc != 0) {
-		cifsInode->write_behind_rc = rc;
-		rc = 0;
-	}
+	mapping_set_error(inode->i_mapping, rc);
+	rc = 0;
 
 	if (attrs->ia_valid & ATTR_SIZE) {
 		rc = cifs_set_file_size(inode, attrs, xid, full_path);
@@ -1860,31 +1993,45 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
 		args->ctime = NO_CHANGE_64;
 
 	args->device = 0;
-	open_file = find_writable_file(cifsInode);
+	open_file = find_writable_file(cifsInode, true);
 	if (open_file) {
 		u16 nfid = open_file->netfid;
 		u32 npid = open_file->pid;
+		pTcon = tlink_tcon(open_file->tlink);
 		rc = CIFSSMBUnixSetFileInfo(xid, pTcon, args, nfid, npid);
 		cifsFileInfo_put(open_file);
 	} else {
+		tlink = cifs_sb_tlink(cifs_sb);
+		if (IS_ERR(tlink)) {
+			rc = PTR_ERR(tlink);
+			goto out;
+		}
+		pTcon = tlink_tcon(tlink);
 		rc = CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, args,
 				    cifs_sb->local_nls,
 				    cifs_sb->mnt_cifs_flags &
 					CIFS_MOUNT_MAP_SPECIAL_CHR);
+		cifs_put_tlink(tlink);
 	}
 
-	if (!rc) {
-		rc = inode_setattr(inode, attrs);
+	if (rc)
+		goto out;
 
-		/* force revalidate when any of these times are set since some
-		   of the fs types (eg ext3, fat) do not have fine enough
-		   time granularity to match protocol, and we do not have a
-		   a way (yet) to query the server fs's time granularity (and
-		   whether it rounds times down).
-		*/
-		if (!rc && (attrs->ia_valid & (ATTR_MTIME | ATTR_CTIME)))
-			cifsInode->time = 0;
-	}
+	if ((attrs->ia_valid & ATTR_SIZE) &&
+	    attrs->ia_size != i_size_read(inode))
+		truncate_setsize(inode, attrs->ia_size);
+
+	setattr_copy(inode, attrs);
+	mark_inode_dirty(inode);
+
+	/* force revalidate when any of these times are set since some
+	   of the fs types (eg ext3, fat) do not have fine enough
+	   time granularity to match protocol, and we do not have a
+	   a way (yet) to query the server fs's time granularity (and
+	   whether it rounds times down).
+	*/
+	if (attrs->ia_valid & (ATTR_MTIME | ATTR_CTIME))
+		cifsInode->time = 0;
 out:
 	kfree(args);
 	kfree(full_path);
@@ -1909,14 +2056,13 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
 	cFYI(1, "setattr on file %s attrs->iavalid 0x%x",
 		 direntry->d_name.name, attrs->ia_valid);
 
-	if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) == 0) {
-		/* check if we have permission to change attrs */
-		rc = inode_change_ok(inode, attrs);
-		if (rc < 0) {
-			FreeXid(xid);
-			return rc;
-		} else
-			rc = 0;
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM)
+		attrs->ia_valid |= ATTR_FORCE;
+
+	rc = inode_change_ok(inode, attrs);
+	if (rc < 0) {
+		FreeXid(xid);
+		return rc;
 	}
 
 	full_path = build_path_from_dentry(direntry);
@@ -1938,10 +2084,8 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
 	 * the flush returns error?
 	 */
 	rc = filemap_write_and_wait(inode->i_mapping);
-	if (rc != 0) {
-		cifsInode->write_behind_rc = rc;
-		rc = 0;
-	}
+	mapping_set_error(inode->i_mapping, rc);
+	rc = 0;
 
 	if (attrs->ia_valid & ATTR_SIZE) {
 		rc = cifs_set_file_size(inode, attrs, xid, full_path);
@@ -2024,8 +2168,17 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
 
 	/* do not need local check to inode_check_ok since the server does
 	   that */
-	if (!rc)
-		rc = inode_setattr(inode, attrs);
+	if (rc)
+		goto cifs_setattr_exit;
+
+	if ((attrs->ia_valid & ATTR_SIZE) &&
+	    attrs->ia_size != i_size_read(inode))
+		truncate_setsize(inode, attrs->ia_size);
+
+	setattr_copy(inode, attrs);
+	mark_inode_dirty(inode);
+	return 0;
+
 cifs_setattr_exit:
 	kfree(full_path);
 	FreeXid(xid);
@@ -2037,7 +2190,7 @@ cifs_setattr(struct dentry *direntry, struct iattr *attrs)
 {
 	struct inode *inode = direntry->d_inode;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
-	struct cifsTconInfo *pTcon = cifs_sb->tcon;
+	struct cifsTconInfo *pTcon = cifs_sb_master_tcon(cifs_sb);
 
 	if (pTcon->unix_ext)
 		return cifs_setattr_unix(direntry, attrs);
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index 505926f1ee6..077bf756f34 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -37,12 +37,11 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
 	int xid;
 	struct cifs_sb_info *cifs_sb;
 #ifdef CONFIG_CIFS_POSIX
+	struct cifsFileInfo *pSMBFile = filep->private_data;
+	struct cifsTconInfo *tcon = tlink_tcon(pSMBFile->tlink);
 	__u64	ExtAttrBits = 0;
 	__u64	ExtAttrMask = 0;
-	__u64   caps;
-	struct cifsTconInfo *tcon;
-	struct cifsFileInfo *pSMBFile =
-		(struct cifsFileInfo *)filep->private_data;
+	__u64   caps = le64_to_cpu(tcon->fsUnixInfo.Capability);
 #endif /* CONFIG_CIFS_POSIX */
 
 	xid = GetXid();
@@ -51,17 +50,6 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
 
 	cifs_sb = CIFS_SB(inode->i_sb);
 
-#ifdef CONFIG_CIFS_POSIX
-	tcon = cifs_sb->tcon;
-	if (tcon)
-		caps = le64_to_cpu(tcon->fsUnixInfo.Capability);
-	else {
-		rc = -EIO;
-		FreeXid(xid);
-		return -EIO;
-	}
-#endif /* CONFIG_CIFS_POSIX */
-
 	switch (command) {
 		case CIFS_IOC_CHECKUMOUNT:
 			cFYI(1, "User unmount attempted");
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 473ca803365..85cdbf831e7 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -28,6 +28,296 @@
 #include "cifsproto.h"
 #include "cifs_debug.h"
 #include "cifs_fs_sb.h"
+#include "md5.h"
+
+#define CIFS_MF_SYMLINK_LEN_OFFSET (4+1)
+#define CIFS_MF_SYMLINK_MD5_OFFSET (CIFS_MF_SYMLINK_LEN_OFFSET+(4+1))
+#define CIFS_MF_SYMLINK_LINK_OFFSET (CIFS_MF_SYMLINK_MD5_OFFSET+(32+1))
+#define CIFS_MF_SYMLINK_LINK_MAXLEN (1024)
+#define CIFS_MF_SYMLINK_FILE_SIZE \
+	(CIFS_MF_SYMLINK_LINK_OFFSET + CIFS_MF_SYMLINK_LINK_MAXLEN)
+
+#define CIFS_MF_SYMLINK_LEN_FORMAT "XSym\n%04u\n"
+#define CIFS_MF_SYMLINK_MD5_FORMAT \
+	"%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n"
+#define CIFS_MF_SYMLINK_MD5_ARGS(md5_hash) \
+	md5_hash[0],  md5_hash[1],  md5_hash[2],  md5_hash[3], \
+	md5_hash[4],  md5_hash[5],  md5_hash[6],  md5_hash[7], \
+	md5_hash[8],  md5_hash[9],  md5_hash[10], md5_hash[11],\
+	md5_hash[12], md5_hash[13], md5_hash[14], md5_hash[15]
+
+static int
+CIFSParseMFSymlink(const u8 *buf,
+		   unsigned int buf_len,
+		   unsigned int *_link_len,
+		   char **_link_str)
+{
+	int rc;
+	unsigned int link_len;
+	const char *md5_str1;
+	const char *link_str;
+	struct MD5Context md5_ctx;
+	u8 md5_hash[16];
+	char md5_str2[34];
+
+	if (buf_len != CIFS_MF_SYMLINK_FILE_SIZE)
+		return -EINVAL;
+
+	md5_str1 = (const char *)&buf[CIFS_MF_SYMLINK_MD5_OFFSET];
+	link_str = (const char *)&buf[CIFS_MF_SYMLINK_LINK_OFFSET];
+
+	rc = sscanf(buf, CIFS_MF_SYMLINK_LEN_FORMAT, &link_len);
+	if (rc != 1)
+		return -EINVAL;
+
+	cifs_MD5_init(&md5_ctx);
+	cifs_MD5_update(&md5_ctx, (const u8 *)link_str, link_len);
+	cifs_MD5_final(md5_hash, &md5_ctx);
+
+	snprintf(md5_str2, sizeof(md5_str2),
+		 CIFS_MF_SYMLINK_MD5_FORMAT,
+		 CIFS_MF_SYMLINK_MD5_ARGS(md5_hash));
+
+	if (strncmp(md5_str1, md5_str2, 17) != 0)
+		return -EINVAL;
+
+	if (_link_str) {
+		*_link_str = kstrndup(link_str, link_len, GFP_KERNEL);
+		if (!*_link_str)
+			return -ENOMEM;
+	}
+
+	*_link_len = link_len;
+	return 0;
+}
+
+static int
+CIFSFormatMFSymlink(u8 *buf, unsigned int buf_len, const char *link_str)
+{
+	unsigned int link_len;
+	unsigned int ofs;
+	struct MD5Context md5_ctx;
+	u8 md5_hash[16];
+
+	if (buf_len != CIFS_MF_SYMLINK_FILE_SIZE)
+		return -EINVAL;
+
+	link_len = strlen(link_str);
+
+	if (link_len > CIFS_MF_SYMLINK_LINK_MAXLEN)
+		return -ENAMETOOLONG;
+
+	cifs_MD5_init(&md5_ctx);
+	cifs_MD5_update(&md5_ctx, (const u8 *)link_str, link_len);
+	cifs_MD5_final(md5_hash, &md5_ctx);
+
+	snprintf(buf, buf_len,
+		 CIFS_MF_SYMLINK_LEN_FORMAT CIFS_MF_SYMLINK_MD5_FORMAT,
+		 link_len,
+		 CIFS_MF_SYMLINK_MD5_ARGS(md5_hash));
+
+	ofs = CIFS_MF_SYMLINK_LINK_OFFSET;
+	memcpy(buf + ofs, link_str, link_len);
+
+	ofs += link_len;
+	if (ofs < CIFS_MF_SYMLINK_FILE_SIZE) {
+		buf[ofs] = '\n';
+		ofs++;
+	}
+
+	while (ofs < CIFS_MF_SYMLINK_FILE_SIZE) {
+		buf[ofs] = ' ';
+		ofs++;
+	}
+
+	return 0;
+}
+
+static int
+CIFSCreateMFSymLink(const int xid, struct cifsTconInfo *tcon,
+		    const char *fromName, const char *toName,
+		    const struct nls_table *nls_codepage, int remap)
+{
+	int rc;
+	int oplock = 0;
+	__u16 netfid = 0;
+	u8 *buf;
+	unsigned int bytes_written = 0;
+
+	buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	rc = CIFSFormatMFSymlink(buf, CIFS_MF_SYMLINK_FILE_SIZE, toName);
+	if (rc != 0) {
+		kfree(buf);
+		return rc;
+	}
+
+	rc = CIFSSMBOpen(xid, tcon, fromName, FILE_CREATE, GENERIC_WRITE,
+			 CREATE_NOT_DIR, &netfid, &oplock, NULL,
+			 nls_codepage, remap);
+	if (rc != 0) {
+		kfree(buf);
+		return rc;
+	}
+
+	rc = CIFSSMBWrite(xid, tcon, netfid,
+			  CIFS_MF_SYMLINK_FILE_SIZE /* length */,
+			  0 /* offset */,
+			  &bytes_written, buf, NULL, 0);
+	CIFSSMBClose(xid, tcon, netfid);
+	kfree(buf);
+	if (rc != 0)
+		return rc;
+
+	if (bytes_written != CIFS_MF_SYMLINK_FILE_SIZE)
+		return -EIO;
+
+	return 0;
+}
+
+static int
+CIFSQueryMFSymLink(const int xid, struct cifsTconInfo *tcon,
+		   const unsigned char *searchName, char **symlinkinfo,
+		   const struct nls_table *nls_codepage, int remap)
+{
+	int rc;
+	int oplock = 0;
+	__u16 netfid = 0;
+	u8 *buf;
+	char *pbuf;
+	unsigned int bytes_read = 0;
+	int buf_type = CIFS_NO_BUFFER;
+	unsigned int link_len = 0;
+	FILE_ALL_INFO file_info;
+
+	rc = CIFSSMBOpen(xid, tcon, searchName, FILE_OPEN, GENERIC_READ,
+			 CREATE_NOT_DIR, &netfid, &oplock, &file_info,
+			 nls_codepage, remap);
+	if (rc != 0)
+		return rc;
+
+	if (file_info.EndOfFile != CIFS_MF_SYMLINK_FILE_SIZE) {
+		CIFSSMBClose(xid, tcon, netfid);
+		/* it's not a symlink */
+		return -EINVAL;
+	}
+
+	buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+	pbuf = buf;
+
+	rc = CIFSSMBRead(xid, tcon, netfid,
+			 CIFS_MF_SYMLINK_FILE_SIZE /* length */,
+			 0 /* offset */,
+			 &bytes_read, &pbuf, &buf_type);
+	CIFSSMBClose(xid, tcon, netfid);
+	if (rc != 0) {
+		kfree(buf);
+		return rc;
+	}
+
+	rc = CIFSParseMFSymlink(buf, bytes_read, &link_len, symlinkinfo);
+	kfree(buf);
+	if (rc != 0)
+		return rc;
+
+	return 0;
+}
+
+bool
+CIFSCouldBeMFSymlink(const struct cifs_fattr *fattr)
+{
+	if (!(fattr->cf_mode & S_IFREG))
+		/* it's not a symlink */
+		return false;
+
+	if (fattr->cf_eof != CIFS_MF_SYMLINK_FILE_SIZE)
+		/* it's not a symlink */
+		return false;
+
+	return true;
+}
+
+int
+CIFSCheckMFSymlink(struct cifs_fattr *fattr,
+		   const unsigned char *path,
+		   struct cifs_sb_info *cifs_sb, int xid)
+{
+	int rc;
+	int oplock = 0;
+	__u16 netfid = 0;
+	struct tcon_link *tlink;
+	struct cifsTconInfo *pTcon;
+	u8 *buf;
+	char *pbuf;
+	unsigned int bytes_read = 0;
+	int buf_type = CIFS_NO_BUFFER;
+	unsigned int link_len = 0;
+	FILE_ALL_INFO file_info;
+
+	if (!CIFSCouldBeMFSymlink(fattr))
+		/* it's not a symlink */
+		return 0;
+
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	pTcon = tlink_tcon(tlink);
+
+	rc = CIFSSMBOpen(xid, pTcon, path, FILE_OPEN, GENERIC_READ,
+			 CREATE_NOT_DIR, &netfid, &oplock, &file_info,
+			 cifs_sb->local_nls,
+			 cifs_sb->mnt_cifs_flags &
+				CIFS_MOUNT_MAP_SPECIAL_CHR);
+	if (rc != 0)
+		goto out;
+
+	if (file_info.EndOfFile != CIFS_MF_SYMLINK_FILE_SIZE) {
+		CIFSSMBClose(xid, pTcon, netfid);
+		/* it's not a symlink */
+		goto out;
+	}
+
+	buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL);
+	if (!buf) {
+		rc = -ENOMEM;
+		goto out;
+	}
+	pbuf = buf;
+
+	rc = CIFSSMBRead(xid, pTcon, netfid,
+			 CIFS_MF_SYMLINK_FILE_SIZE /* length */,
+			 0 /* offset */,
+			 &bytes_read, &pbuf, &buf_type);
+	CIFSSMBClose(xid, pTcon, netfid);
+	if (rc != 0) {
+		kfree(buf);
+		goto out;
+	}
+
+	rc = CIFSParseMFSymlink(buf, bytes_read, &link_len, NULL);
+	kfree(buf);
+	if (rc == -EINVAL) {
+		/* it's not a symlink */
+		rc = 0;
+		goto out;
+	}
+
+	if (rc != 0)
+		goto out;
+
+	/* it is a symlink */
+	fattr->cf_eof = link_len;
+	fattr->cf_mode &= ~S_IFMT;
+	fattr->cf_mode |= S_IFLNK | S_IRWXU | S_IRWXG | S_IRWXO;
+	fattr->cf_dtype = DT_LNK;
+out:
+	cifs_put_tlink(tlink);
+	return rc;
+}
 
 int
 cifs_hardlink(struct dentry *old_file, struct inode *inode,
@@ -37,17 +327,17 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode,
 	int xid;
 	char *fromName = NULL;
 	char *toName = NULL;
-	struct cifs_sb_info *cifs_sb_target;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct tcon_link *tlink;
 	struct cifsTconInfo *pTcon;
 	struct cifsInodeInfo *cifsInode;
 
-	xid = GetXid();
-
-	cifs_sb_target = CIFS_SB(inode->i_sb);
-	pTcon = cifs_sb_target->tcon;
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	pTcon = tlink_tcon(tlink);
 
-/* No need to check for cross device links since server will do that
-   BB note DFS case in future though (when we may have to check) */
+	xid = GetXid();
 
 	fromName = build_path_from_dentry(old_file);
 	toName = build_path_from_dentry(direntry);
@@ -56,16 +346,15 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode,
 		goto cifs_hl_exit;
 	}
 
-/*	if (cifs_sb_target->tcon->ses->capabilities & CAP_UNIX)*/
 	if (pTcon->unix_ext)
 		rc = CIFSUnixCreateHardLink(xid, pTcon, fromName, toName,
-					    cifs_sb_target->local_nls,
-					    cifs_sb_target->mnt_cifs_flags &
+					    cifs_sb->local_nls,
+					    cifs_sb->mnt_cifs_flags &
 						CIFS_MOUNT_MAP_SPECIAL_CHR);
 	else {
 		rc = CIFSCreateHardLink(xid, pTcon, fromName, toName,
-					cifs_sb_target->local_nls,
-					cifs_sb_target->mnt_cifs_flags &
+					cifs_sb->local_nls,
+					cifs_sb->mnt_cifs_flags &
 						CIFS_MOUNT_MAP_SPECIAL_CHR);
 		if ((rc == -EIO) || (rc == -EINVAL))
 			rc = -EOPNOTSUPP;
@@ -101,6 +390,7 @@ cifs_hl_exit:
 	kfree(fromName);
 	kfree(toName);
 	FreeXid(xid);
+	cifs_put_tlink(tlink);
 	return rc;
 }
 
@@ -113,10 +403,19 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
 	char *full_path = NULL;
 	char *target_path = NULL;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
-	struct cifsTconInfo *tcon = cifs_sb->tcon;
+	struct tcon_link *tlink = NULL;
+	struct cifsTconInfo *tcon;
 
 	xid = GetXid();
 
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink)) {
+		rc = PTR_ERR(tlink);
+		tlink = NULL;
+		goto out;
+	}
+	tcon = tlink_tcon(tlink);
+
 	/*
 	 * For now, we just handle symlinks with unix extensions enabled.
 	 * Eventually we should handle NTFS reparse points, and MacOS
@@ -130,7 +429,8 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
 	 * but there doesn't seem to be any harm in allowing the client to
 	 * read them.
 	 */
-	if (!(tcon->ses->capabilities & CAP_UNIX)) {
+	if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
+	    && !(tcon->ses->capabilities & CAP_UNIX)) {
 		rc = -EACCES;
 		goto out;
 	}
@@ -141,8 +441,21 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
 
 	cFYI(1, "Full path: %s inode = 0x%p", full_path, inode);
 
-	rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, &target_path,
-				     cifs_sb->local_nls);
+	rc = -EACCES;
+	/*
+	 * First try Minshall+French Symlinks, if configured
+	 * and fallback to UNIX Extensions Symlinks.
+	 */
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
+		rc = CIFSQueryMFSymLink(xid, tcon, full_path, &target_path,
+					cifs_sb->local_nls,
+					cifs_sb->mnt_cifs_flags &
+						CIFS_MOUNT_MAP_SPECIAL_CHR);
+
+	if ((rc != 0) && (tcon->ses->capabilities & CAP_UNIX))
+		rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, &target_path,
+					     cifs_sb->local_nls);
+
 	kfree(full_path);
 out:
 	if (rc != 0) {
@@ -151,6 +464,8 @@ out:
 	}
 
 	FreeXid(xid);
+	if (tlink)
+		cifs_put_tlink(tlink);
 	nd_set_link(nd, target_path);
 	return NULL;
 }
@@ -160,29 +475,37 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
 {
 	int rc = -EOPNOTSUPP;
 	int xid;
-	struct cifs_sb_info *cifs_sb;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct tcon_link *tlink;
 	struct cifsTconInfo *pTcon;
 	char *full_path = NULL;
 	struct inode *newinode = NULL;
 
 	xid = GetXid();
 
-	cifs_sb = CIFS_SB(inode->i_sb);
-	pTcon = cifs_sb->tcon;
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink)) {
+		rc = PTR_ERR(tlink);
+		goto symlink_exit;
+	}
+	pTcon = tlink_tcon(tlink);
 
 	full_path = build_path_from_dentry(direntry);
-
 	if (full_path == NULL) {
 		rc = -ENOMEM;
-		FreeXid(xid);
-		return rc;
+		goto symlink_exit;
 	}
 
 	cFYI(1, "Full path: %s", full_path);
 	cFYI(1, "symname is %s", symname);
 
 	/* BB what if DFS and this volume is on different share? BB */
-	if (pTcon->unix_ext)
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
+		rc = CIFSCreateMFSymLink(xid, pTcon, full_path, symname,
+					 cifs_sb->local_nls,
+					 cifs_sb->mnt_cifs_flags &
+						CIFS_MOUNT_MAP_SPECIAL_CHR);
+	else if (pTcon->unix_ext)
 		rc = CIFSUnixCreateSymLink(xid, pTcon, full_path, symname,
 					   cifs_sb->local_nls);
 	/* else
@@ -208,8 +531,9 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
 			d_instantiate(direntry, newinode);
 		}
 	}
-
+symlink_exit:
 	kfree(full_path);
+	cifs_put_tlink(tlink);
 	FreeXid(xid);
 	return rc;
 }
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 1394aa37f26..c4e296fe351 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -347,7 +347,7 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
 				if (current_fsuid() != treeCon->ses->linux_uid) {
 					cFYI(1, "Multiuser mode and UID "
 						 "did not match tcon uid");
-					read_lock(&cifs_tcp_ses_lock);
+					spin_lock(&cifs_tcp_ses_lock);
 					list_for_each(temp_item, &treeCon->ses->server->smb_ses_list) {
 						ses = list_entry(temp_item, struct cifsSesInfo, smb_ses_list);
 						if (ses->linux_uid == current_fsuid()) {
@@ -361,7 +361,7 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
 							}
 						}
 					}
-					read_unlock(&cifs_tcp_ses_lock);
+					spin_unlock(&cifs_tcp_ses_lock);
 				}
 			}
 		}
@@ -498,7 +498,6 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
 	struct cifsTconInfo *tcon;
 	struct cifsInodeInfo *pCifsInode;
 	struct cifsFileInfo *netfile;
-	int rc;
 
 	cFYI(1, "Checking for oplock break or dnotify response");
 	if ((pSMB->hdr.Command == SMB_COM_NT_TRANSACT) &&
@@ -552,7 +551,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
 		return false;
 
 	/* look up tcon based on tid & uid */
-	read_lock(&cifs_tcp_ses_lock);
+	spin_lock(&cifs_tcp_ses_lock);
 	list_for_each(tmp, &srv->smb_ses_list) {
 		ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list);
 		list_for_each(tmp1, &ses->tcon_list) {
@@ -561,46 +560,41 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
 				continue;
 
 			cifs_stats_inc(&tcon->num_oplock_brks);
-			read_lock(&GlobalSMBSeslock);
+			spin_lock(&cifs_file_list_lock);
 			list_for_each(tmp2, &tcon->openFileList) {
 				netfile = list_entry(tmp2, struct cifsFileInfo,
 						     tlist);
 				if (pSMB->Fid != netfile->netfid)
 					continue;
 
-				/*
-				 * don't do anything if file is about to be
-				 * closed anyway.
-				 */
-				if (netfile->closePend) {
-					read_unlock(&GlobalSMBSeslock);
-					read_unlock(&cifs_tcp_ses_lock);
-					return true;
-				}
-
 				cFYI(1, "file id match, oplock break");
-				pCifsInode = CIFS_I(netfile->pInode);
+				pCifsInode = CIFS_I(netfile->dentry->d_inode);
 				pCifsInode->clientCanCacheAll = false;
 				if (pSMB->OplockLevel == 0)
 					pCifsInode->clientCanCacheRead = false;
-				rc = slow_work_enqueue(&netfile->oplock_break);
-				if (rc) {
-					cERROR(1, "failed to enqueue oplock "
-						   "break: %d\n", rc);
-				} else {
-					netfile->oplock_break_cancelled = false;
-				}
-				read_unlock(&GlobalSMBSeslock);
-				read_unlock(&cifs_tcp_ses_lock);
+
+				/*
+				 * cifs_oplock_break_put() can't be called
+				 * from here.  Get reference after queueing
+				 * succeeded.  cifs_oplock_break() will
+				 * synchronize using cifs_file_list_lock.
+				 */
+				if (queue_work(system_nrt_wq,
+					       &netfile->oplock_break))
+					cifs_oplock_break_get(netfile);
+				netfile->oplock_break_cancelled = false;
+
+				spin_unlock(&cifs_file_list_lock);
+				spin_unlock(&cifs_tcp_ses_lock);
 				return true;
 			}
-			read_unlock(&GlobalSMBSeslock);
-			read_unlock(&cifs_tcp_ses_lock);
+			spin_unlock(&cifs_file_list_lock);
+			spin_unlock(&cifs_tcp_ses_lock);
 			cFYI(1, "No matching file for oplock break");
 			return true;
 		}
 	}
-	read_unlock(&cifs_tcp_ses_lock);
+	spin_unlock(&cifs_tcp_ses_lock);
 	cFYI(1, "Can not process oplock break for non-existent connection");
 	return true;
 }
@@ -725,6 +719,6 @@ cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb)
 			   "properly. Hardlinks will not be recognized on this "
 			   "mount. Consider mounting with the \"noserverino\" "
 			   "option to silence this message.",
-			   cifs_sb->tcon->treeName);
+			   cifs_sb_master_tcon(cifs_sb)->treeName);
 	}
 }
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index d35d52889cb..9aad47a2d62 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -61,6 +61,7 @@ static const struct smb_to_posix_error mapping_table_ERRDOS[] = {
 	{ERRremcd, -EACCES},
 	{ERRdiffdevice, -EXDEV},
 	{ERRnofiles, -ENOENT},
+	{ERRwriteprot, -EROFS},
 	{ERRbadshare, -ETXTBSY},
 	{ERRlock, -EACCES},
 	{ERRunsup, -EINVAL},
@@ -139,17 +140,18 @@ static const struct smb_to_posix_error mapping_table_ERRHRD[] = {
  * Returns 0 on failure.
  */
 static int
-cifs_inet_pton(const int address_family, const char *cp, void *dst)
+cifs_inet_pton(const int address_family, const char *cp, int len, void *dst)
 {
 	int ret = 0;
 
 	/* calculate length by finding first slash or NULL */
 	if (address_family == AF_INET)
-		ret = in4_pton(cp, -1 /* len */, dst, '\\', NULL);
+		ret = in4_pton(cp, len, dst, '\\', NULL);
 	else if (address_family == AF_INET6)
-		ret = in6_pton(cp, -1 /* len */, dst , '\\', NULL);
+		ret = in6_pton(cp, len, dst , '\\', NULL);
 
-	cFYI(DBG2, "address conversion returned %d for %s", ret, cp);
+	cFYI(DBG2, "address conversion returned %d for %*.*s",
+	     ret, len, len, cp);
 	if (ret > 0)
 		ret = 1;
 	return ret;
@@ -164,43 +166,70 @@ cifs_inet_pton(const int address_family, const char *cp, void *dst)
  * Returns 0 on failure.
  */
 int
-cifs_convert_address(char *src, void *dst)
+cifs_convert_address(struct sockaddr *dst, const char *src, int len)
 {
-	int rc;
-	char *pct, *endp;
+	int rc, alen, slen;
+	const char *pct;
+	char *endp, scope_id[13];
 	struct sockaddr_in *s4 = (struct sockaddr_in *) dst;
 	struct sockaddr_in6 *s6 = (struct sockaddr_in6 *) dst;
 
 	/* IPv4 address */
-	if (cifs_inet_pton(AF_INET, src, &s4->sin_addr.s_addr)) {
+	if (cifs_inet_pton(AF_INET, src, len, &s4->sin_addr.s_addr)) {
 		s4->sin_family = AF_INET;
 		return 1;
 	}
 
-	/* temporarily terminate string */
-	pct = strchr(src, '%');
-	if (pct)
-		*pct = '\0';
-
-	rc = cifs_inet_pton(AF_INET6, src, &s6->sin6_addr.s6_addr);
-
-	/* repair temp termination (if any) and make pct point to scopeid */
-	if (pct)
-		*pct++ = '%';
+	/* attempt to exclude the scope ID from the address part */
+	pct = memchr(src, '%', len);
+	alen = pct ? pct - src : len;
 
+	rc = cifs_inet_pton(AF_INET6, src, alen, &s6->sin6_addr.s6_addr);
 	if (!rc)
 		return rc;
 
 	s6->sin6_family = AF_INET6;
 	if (pct) {
+		/* grab the scope ID */
+		slen = len - (alen + 1);
+		if (slen <= 0 || slen > 12)
+			return 0;
+		memcpy(scope_id, pct + 1, slen);
+		scope_id[slen] = '\0';
+
 		s6->sin6_scope_id = (u32) simple_strtoul(pct, &endp, 0);
-		if (!*pct || *endp)
+		if (endp != scope_id + slen)
 			return 0;
 	}
 
 	return rc;
 }
 
+int
+cifs_set_port(struct sockaddr *addr, const unsigned short int port)
+{
+	switch (addr->sa_family) {
+	case AF_INET:
+		((struct sockaddr_in *)addr)->sin_port = htons(port);
+		break;
+	case AF_INET6:
+		((struct sockaddr_in6 *)addr)->sin6_port = htons(port);
+		break;
+	default:
+		return 0;
+	}
+	return 1;
+}
+
+int
+cifs_fill_sockaddr(struct sockaddr *dst, const char *src, int len,
+		   const unsigned short int port)
+{
+	if (!cifs_convert_address(dst, src, len))
+		return 0;
+	return cifs_set_port(dst, port);
+}
+
 /*****************************************************************************
 convert a NT status code to a dos class/code
  *****************************************************************************/
diff --git a/fs/cifs/ntlmssp.h b/fs/cifs/ntlmssp.h
index 49c9a4e7531..5d52e4a3b1e 100644
--- a/fs/cifs/ntlmssp.h
+++ b/fs/cifs/ntlmssp.h
@@ -61,6 +61,21 @@
 #define NTLMSSP_NEGOTIATE_KEY_XCH   0x40000000
 #define NTLMSSP_NEGOTIATE_56        0x80000000
 
+/* Define AV Pair Field IDs */
+enum av_field_type {
+	NTLMSSP_AV_EOL = 0,
+	NTLMSSP_AV_NB_COMPUTER_NAME,
+	NTLMSSP_AV_NB_DOMAIN_NAME,
+	NTLMSSP_AV_DNS_COMPUTER_NAME,
+	NTLMSSP_AV_DNS_DOMAIN_NAME,
+	NTLMSSP_AV_DNS_TREE_NAME,
+	NTLMSSP_AV_FLAGS,
+	NTLMSSP_AV_TIMESTAMP,
+	NTLMSSP_AV_RESTRICTION,
+	NTLMSSP_AV_TARGET_NAME,
+	NTLMSSP_AV_CHANNEL_BINDINGS
+};
+
 /* Although typedefs are not commonly used for structure definitions */
 /* in the Linux kernel, in this particular case they are useful      */
 /* to more closely match the standards document for NTLMSSP from     */
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index daf1753af67..ef7bb7b50f5 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -102,7 +102,7 @@ cifs_readdir_lookup(struct dentry *parent, struct qstr *name,
 		return NULL;
 	}
 
-	if (CIFS_SB(sb)->tcon->nocase)
+	if (cifs_sb_master_tcon(CIFS_SB(sb))->nocase)
 		dentry->d_op = &cifs_ci_dentry_ops;
 	else
 		dentry->d_op = &cifs_dentry_ops;
@@ -171,7 +171,7 @@ static void
 cifs_std_info_to_fattr(struct cifs_fattr *fattr, FIND_FILE_STANDARD_INFO *info,
 		       struct cifs_sb_info *cifs_sb)
 {
-	int offset = cifs_sb->tcon->ses->server->timeAdj;
+	int offset = cifs_sb_master_tcon(cifs_sb)->ses->server->timeAdj;
 
 	memset(fattr, 0, sizeof(*fattr));
 	fattr->cf_atime = cnvrtDosUnixTm(info->LastAccessDate,
@@ -199,7 +199,7 @@ int get_symlink_reparse_path(char *full_path, struct cifs_sb_info *cifs_sb,
 	int len;
 	int oplock = 0;
 	int rc;
-	struct cifsTconInfo *ptcon = cifs_sb->tcon;
+	struct cifsTconInfo *ptcon = cifs_sb_tcon(cifs_sb);
 	char *tmpbuffer;
 
 	rc = CIFSSMBOpen(xid, ptcon, full_path, FILE_OPEN, GENERIC_READ,
@@ -223,34 +223,35 @@ int get_symlink_reparse_path(char *full_path, struct cifs_sb_info *cifs_sb,
 static int initiate_cifs_search(const int xid, struct file *file)
 {
 	int rc = 0;
-	char *full_path;
+	char *full_path = NULL;
 	struct cifsFileInfo *cifsFile;
-	struct cifs_sb_info *cifs_sb;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
+	struct tcon_link *tlink;
 	struct cifsTconInfo *pTcon;
 
-	if (file->private_data == NULL) {
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	pTcon = tlink_tcon(tlink);
+
+	if (file->private_data == NULL)
 		file->private_data =
 			kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
+	if (file->private_data == NULL) {
+		rc = -ENOMEM;
+		goto error_exit;
 	}
 
-	if (file->private_data == NULL)
-		return -ENOMEM;
 	cifsFile = file->private_data;
 	cifsFile->invalidHandle = true;
 	cifsFile->srch_inf.endOfSearch = false;
-
-	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
-	if (cifs_sb == NULL)
-		return -EINVAL;
-
-	pTcon = cifs_sb->tcon;
-	if (pTcon == NULL)
-		return -EINVAL;
+	cifsFile->tlink = cifs_get_tlink(tlink);
 
 	full_path = build_path_from_dentry(file->f_path.dentry);
-
-	if (full_path == NULL)
-		return -ENOMEM;
+	if (full_path == NULL) {
+		rc = -ENOMEM;
+		goto error_exit;
+	}
 
 	cFYI(1, "Full path: %s start at: %lld", full_path, file->f_pos);
 
@@ -283,7 +284,9 @@ ffirst_retry:
 		cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM;
 		goto ffirst_retry;
 	}
+error_exit:
 	kfree(full_path);
+	cifs_put_tlink(tlink);
 	return rc;
 }
 
@@ -525,14 +528,14 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
 	   (index_to_find < first_entry_in_buffer)) {
 		/* close and restart search */
 		cFYI(1, "search backing up - close and restart search");
-		write_lock(&GlobalSMBSeslock);
+		spin_lock(&cifs_file_list_lock);
 		if (!cifsFile->srch_inf.endOfSearch &&
 		    !cifsFile->invalidHandle) {
 			cifsFile->invalidHandle = true;
-			write_unlock(&GlobalSMBSeslock);
+			spin_unlock(&cifs_file_list_lock);
 			CIFSFindClose(xid, pTcon, cifsFile->netfid);
 		} else
-			write_unlock(&GlobalSMBSeslock);
+			spin_unlock(&cifs_file_list_lock);
 		if (cifsFile->srch_inf.ntwrk_buf_start) {
 			cFYI(1, "freeing SMB ff cache buf on search rewind");
 			if (cifsFile->srch_inf.smallBuf)
@@ -738,6 +741,15 @@ static int cifs_filldir(char *pfindEntry, struct file *file, filldir_t filldir,
 		cifs_autodisable_serverino(cifs_sb);
 	}
 
+	if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) &&
+	    CIFSCouldBeMFSymlink(&fattr))
+		/*
+		 * trying to get the type and mode can be slow,
+		 * so just call those regular files for now, and mark
+		 * for reval
+		 */
+		fattr.cf_flags |= CIFS_FATTR_NEED_REVAL;
+
 	ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid);
 	tmp_dentry = cifs_readdir_lookup(file->f_dentry, &qstring, &fattr);
 
@@ -777,9 +789,17 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 	xid = GetXid();
 
 	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
-	pTcon = cifs_sb->tcon;
-	if (pTcon == NULL)
-		return -EINVAL;
+
+	/*
+	 * Ensure FindFirst doesn't fail before doing filldir() for '.' and
+	 * '..'. Otherwise we won't be able to notify VFS in case of failure.
+	 */
+	if (file->private_data == NULL) {
+		rc = initiate_cifs_search(xid, file);
+		cFYI(1, "initiate cifs search rc %d", rc);
+		if (rc)
+			goto rddir2_exit;
+	}
 
 	switch ((int) file->f_pos) {
 	case 0:
@@ -805,14 +825,6 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 			if after then keep searching till find it */
 
 		if (file->private_data == NULL) {
-			rc = initiate_cifs_search(xid, file);
-			cFYI(1, "initiate cifs search rc %d", rc);
-			if (rc) {
-				FreeXid(xid);
-				return rc;
-			}
-		}
-		if (file->private_data == NULL) {
 			rc = -EINVAL;
 			FreeXid(xid);
 			return rc;
@@ -829,6 +841,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 			CIFSFindClose(xid, pTcon, cifsFile->netfid);
 		} */
 
+		pTcon = tlink_tcon(cifsFile->tlink);
 		rc = find_cifs_entry(xid, pTcon, file,
 				&current_entry, &num_to_fill);
 		if (rc) {
@@ -847,6 +860,11 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 		end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len;
 
 		tmp_buf = kmalloc(UNICODE_NAME_MAX, GFP_KERNEL);
+		if (tmp_buf == NULL) {
+			rc = -ENOMEM;
+			break;
+		}
+
 		for (i = 0; (i < num_to_fill) && (rc == 0); i++) {
 			if (current_entry == NULL) {
 				/* evaluate whether this case is an error */
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 0a57cb7db5d..7b01d3f6eed 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -32,9 +32,6 @@
 #include <linux/slab.h>
 #include "cifs_spnego.h"
 
-extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8,
-			 unsigned char *p24);
-
 /*
  * Checks if this is the first smb session to be reconnected after
  * the socket has been reestablished (so we know whether to use vc 0).
@@ -80,7 +77,7 @@ static __le16 get_next_vcnum(struct cifsSesInfo *ses)
 	if (max_vcs < 2)
 		max_vcs = 0xFFFF;
 
-	write_lock(&cifs_tcp_ses_lock);
+	spin_lock(&cifs_tcp_ses_lock);
 	if ((ses->need_reconnect) && is_first_ses_reconnect(ses))
 			goto get_vc_num_exit;  /* vcnum will be zero */
 	for (i = ses->server->srv_count - 1; i < max_vcs; i++) {
@@ -112,7 +109,7 @@ static __le16 get_next_vcnum(struct cifsSesInfo *ses)
 		vcnum = i;
 	ses->vcnum = vcnum;
 get_vc_num_exit:
-	write_unlock(&cifs_tcp_ses_lock);
+	spin_unlock(&cifs_tcp_ses_lock);
 
 	return cpu_to_le16(vcnum);
 }
@@ -383,6 +380,9 @@ static int decode_ascii_ssetup(char **pbcc_area, int bleft,
 static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
 				    struct cifsSesInfo *ses)
 {
+	unsigned int tioffset; /* challenge message target info area */
+	unsigned int tilen; /* challenge message target info area length  */
+
 	CHALLENGE_MESSAGE *pblob = (CHALLENGE_MESSAGE *)bcc_ptr;
 
 	if (blob_len < sizeof(CHALLENGE_MESSAGE)) {
@@ -399,11 +399,23 @@ static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
 		return -EINVAL;
 	}
 
-	memcpy(ses->server->cryptKey, pblob->Challenge, CIFS_CRYPTO_KEY_SIZE);
+	memcpy(ses->ntlmssp->cryptkey, pblob->Challenge, CIFS_CRYPTO_KEY_SIZE);
 	/* BB we could decode pblob->NegotiateFlags; some may be useful */
 	/* In particular we can examine sign flags */
 	/* BB spec says that if AvId field of MsvAvTimestamp is populated then
 		we must set the MIC field of the AUTHENTICATE_MESSAGE */
+	ses->ntlmssp->server_flags = le32_to_cpu(pblob->NegotiateFlags);
+	tioffset = cpu_to_le16(pblob->TargetInfoArray.BufferOffset);
+	tilen = cpu_to_le16(pblob->TargetInfoArray.Length);
+	if (tilen) {
+		ses->auth_key.response = kmalloc(tilen, GFP_KERNEL);
+		if (!ses->auth_key.response) {
+			cERROR(1, "Challenge target info allocation failure");
+			return -ENOMEM;
+		}
+		memcpy(ses->auth_key.response, bcc_ptr + tioffset, tilen);
+		ses->auth_key.len = tilen;
+	}
 
 	return 0;
 }
@@ -425,12 +437,14 @@ static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
 	/* BB is NTLMV2 session security format easier to use here? */
 	flags = NTLMSSP_NEGOTIATE_56 |	NTLMSSP_REQUEST_TARGET |
 		NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
-		NTLMSSP_NEGOTIATE_NT_ONLY | NTLMSSP_NEGOTIATE_NTLM;
+		NTLMSSP_NEGOTIATE_NTLM;
 	if (ses->server->secMode &
-	   (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
+			(SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
 		flags |= NTLMSSP_NEGOTIATE_SIGN;
-	if (ses->server->secMode & SECMODE_SIGN_REQUIRED)
-		flags |= NTLMSSP_NEGOTIATE_ALWAYS_SIGN;
+		if (!ses->server->session_estab)
+			flags |= NTLMSSP_NEGOTIATE_KEY_XCH |
+				NTLMSSP_NEGOTIATE_EXTENDED_SEC;
+	}
 
 	sec_blob->NegotiateFlags |= cpu_to_le32(flags);
 
@@ -448,13 +462,14 @@ static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
    maximum possible size is fixed and small, making this approach cleaner.
    This function returns the length of the data in the blob */
 static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
+					u16 *buflen,
 				   struct cifsSesInfo *ses,
-				   const struct nls_table *nls_cp, bool first)
+				   const struct nls_table *nls_cp)
 {
+	int rc;
 	AUTHENTICATE_MESSAGE *sec_blob = (AUTHENTICATE_MESSAGE *)pbuffer;
 	__u32 flags;
 	unsigned char *tmp;
-	char ntlm_session_key[CIFS_SESS_KEY_SIZE];
 
 	memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8);
 	sec_blob->MessageType = NtLmAuthenticate;
@@ -462,7 +477,7 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
 	flags = NTLMSSP_NEGOTIATE_56 |
 		NTLMSSP_REQUEST_TARGET | NTLMSSP_NEGOTIATE_TARGET_INFO |
 		NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
-		NTLMSSP_NEGOTIATE_NT_ONLY | NTLMSSP_NEGOTIATE_NTLM;
+		NTLMSSP_NEGOTIATE_NTLM;
 	if (ses->server->secMode &
 	   (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
 		flags |= NTLMSSP_NEGOTIATE_SIGN;
@@ -477,19 +492,20 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
 	sec_blob->LmChallengeResponse.Length = 0;
 	sec_blob->LmChallengeResponse.MaximumLength = 0;
 
-	/* calculate session key,  BB what about adding similar ntlmv2 path? */
-	SMBNTencrypt(ses->password, ses->server->cryptKey, ntlm_session_key);
-	if (first)
-		cifs_calculate_mac_key(&ses->server->mac_signing_key,
-				       ntlm_session_key, ses->password);
-
-	memcpy(tmp, ntlm_session_key, CIFS_SESS_KEY_SIZE);
 	sec_blob->NtChallengeResponse.BufferOffset = cpu_to_le32(tmp - pbuffer);
-	sec_blob->NtChallengeResponse.Length = cpu_to_le16(CIFS_SESS_KEY_SIZE);
-	sec_blob->NtChallengeResponse.MaximumLength =
-				cpu_to_le16(CIFS_SESS_KEY_SIZE);
+	rc = setup_ntlmv2_rsp(ses, nls_cp);
+	if (rc) {
+		cERROR(1, "Error %d during NTLMSSP authentication", rc);
+		goto setup_ntlmv2_ret;
+	}
+	memcpy(tmp, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
+			ses->auth_key.len - CIFS_SESS_KEY_SIZE);
+	tmp += ses->auth_key.len - CIFS_SESS_KEY_SIZE;
 
-	tmp += CIFS_SESS_KEY_SIZE;
+	sec_blob->NtChallengeResponse.Length =
+			cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE);
+	sec_blob->NtChallengeResponse.MaximumLength =
+			cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE);
 
 	if (ses->domainName == NULL) {
 		sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer);
@@ -501,7 +517,6 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
 		len = cifs_strtoUCS((__le16 *)tmp, ses->domainName,
 				    MAX_USERNAME_SIZE, nls_cp);
 		len *= 2; /* unicode is 2 bytes each */
-		len += 2; /* trailing null */
 		sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer);
 		sec_blob->DomainName.Length = cpu_to_le16(len);
 		sec_blob->DomainName.MaximumLength = cpu_to_le16(len);
@@ -518,7 +533,6 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
 		len = cifs_strtoUCS((__le16 *)tmp, ses->userName,
 				    MAX_USERNAME_SIZE, nls_cp);
 		len *= 2; /* unicode is 2 bytes each */
-		len += 2; /* trailing null */
 		sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer);
 		sec_blob->UserName.Length = cpu_to_le16(len);
 		sec_blob->UserName.MaximumLength = cpu_to_le16(len);
@@ -530,10 +544,23 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
 	sec_blob->WorkstationName.MaximumLength = 0;
 	tmp += 2;
 
-	sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer);
-	sec_blob->SessionKey.Length = 0;
-	sec_blob->SessionKey.MaximumLength = 0;
-	return tmp - pbuffer;
+	if ((ses->ntlmssp->server_flags & NTLMSSP_NEGOTIATE_KEY_XCH) &&
+			!calc_seckey(ses)) {
+		memcpy(tmp, ses->ntlmssp->ciphertext, CIFS_CPHTXT_SIZE);
+		sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer);
+		sec_blob->SessionKey.Length = cpu_to_le16(CIFS_CPHTXT_SIZE);
+		sec_blob->SessionKey.MaximumLength =
+				cpu_to_le16(CIFS_CPHTXT_SIZE);
+		tmp += CIFS_CPHTXT_SIZE;
+	} else {
+		sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer);
+		sec_blob->SessionKey.Length = 0;
+		sec_blob->SessionKey.MaximumLength = 0;
+	}
+
+setup_ntlmv2_ret:
+	*buflen = tmp - pbuffer;
+	return rc;
 }
 
 
@@ -545,19 +572,6 @@ static void setup_ntlmssp_neg_req(SESSION_SETUP_ANDX *pSMB,
 
 	return;
 }
-
-static int setup_ntlmssp_auth_req(SESSION_SETUP_ANDX *pSMB,
-				  struct cifsSesInfo *ses,
-				  const struct nls_table *nls, bool first_time)
-{
-	int bloblen;
-
-	bloblen = build_ntlmssp_auth_blob(&pSMB->req.SecurityBlob[0], ses, nls,
-					  first_time);
-	pSMB->req.SecurityBlobLength = cpu_to_le16(bloblen);
-
-	return bloblen;
-}
 #endif
 
 int
@@ -579,18 +593,23 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses,
 	int bytes_remaining;
 	struct key *spnego_key = NULL;
 	__le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */
-	bool first_time;
+	u16 blob_len;
+	char *ntlmsspblob = NULL;
 
 	if (ses == NULL)
 		return -EINVAL;
 
-	read_lock(&cifs_tcp_ses_lock);
-	first_time = is_first_ses_reconnect(ses);
-	read_unlock(&cifs_tcp_ses_lock);
-
 	type = ses->server->secType;
-
 	cFYI(1, "sess setup type %d", type);
+	if (type == RawNTLMSSP) {
+		/* if memory allocation is successful, caller of this function
+		 * frees it.
+		 */
+		ses->ntlmssp = kmalloc(sizeof(struct ntlmssp_auth), GFP_KERNEL);
+		if (!ses->ntlmssp)
+			return -ENOMEM;
+	}
+
 ssetup_ntlmssp_authenticate:
 	if (phase == NtLmChallenge)
 		phase = NtLmAuthenticate; /* if ntlmssp, now final phase */
@@ -655,10 +674,14 @@ ssetup_ntlmssp_authenticate:
 		/* no capabilities flags in old lanman negotiation */
 
 		pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_SESS_KEY_SIZE);
-		/* BB calculate hash with password */
-		/* and copy into bcc */
 
-		calc_lanman_hash(ses->password, ses->server->cryptKey,
+		/* Calculate hash with password and copy into bcc_ptr.
+		 * Encryption Key (stored as in cryptkey) gets used if the
+		 * security mode bit in Negottiate Protocol response states
+		 * to use challenge/response method (i.e. Password bit is 1).
+		 */
+
+		calc_lanman_hash(ses->password, ses->server->cryptkey,
 				 ses->server->secMode & SECMODE_PW_ENCRYPT ?
 					true : false, lnm_session_key);
 
@@ -676,28 +699,27 @@ ssetup_ntlmssp_authenticate:
 		ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
 #endif
 	} else if (type == NTLM) {
-		char ntlm_session_key[CIFS_SESS_KEY_SIZE];
-
 		pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities);
 		pSMB->req_no_secext.CaseInsensitivePasswordLength =
-			cpu_to_le16(CIFS_SESS_KEY_SIZE);
+			cpu_to_le16(CIFS_AUTH_RESP_SIZE);
 		pSMB->req_no_secext.CaseSensitivePasswordLength =
-			cpu_to_le16(CIFS_SESS_KEY_SIZE);
+			cpu_to_le16(CIFS_AUTH_RESP_SIZE);
 
-		/* calculate session key */
-		SMBNTencrypt(ses->password, ses->server->cryptKey,
-			     ntlm_session_key);
+		/* calculate ntlm response and session key */
+		rc = setup_ntlm_response(ses);
+		if (rc) {
+			cERROR(1, "Error %d during NTLM authentication", rc);
+			goto ssetup_exit;
+		}
 
-		if (first_time) /* should this be moved into common code
-				  with similar ntlmv2 path? */
-			cifs_calculate_mac_key(&ses->server->mac_signing_key,
-				ntlm_session_key, ses->password);
-		/* copy session key */
+		/* copy ntlm response */
+		memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
+				CIFS_AUTH_RESP_SIZE);
+		bcc_ptr += CIFS_AUTH_RESP_SIZE;
+		memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
+				CIFS_AUTH_RESP_SIZE);
+		bcc_ptr += CIFS_AUTH_RESP_SIZE;
 
-		memcpy(bcc_ptr, (char *)ntlm_session_key, CIFS_SESS_KEY_SIZE);
-		bcc_ptr += CIFS_SESS_KEY_SIZE;
-		memcpy(bcc_ptr, (char *)ntlm_session_key, CIFS_SESS_KEY_SIZE);
-		bcc_ptr += CIFS_SESS_KEY_SIZE;
 		if (ses->capabilities & CAP_UNICODE) {
 			/* unicode strings must be word aligned */
 			if (iov[0].iov_len % 2) {
@@ -708,33 +730,27 @@ ssetup_ntlmssp_authenticate:
 		} else
 			ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
 	} else if (type == NTLMv2) {
-		char *v2_sess_key =
-			kmalloc(sizeof(struct ntlmv2_resp), GFP_KERNEL);
-
-		/* BB FIXME change all users of v2_sess_key to
-		   struct ntlmv2_resp */
-
-		if (v2_sess_key == NULL) {
-			rc = -ENOMEM;
-			goto ssetup_exit;
-		}
-
 		pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities);
 
 		/* LM2 password would be here if we supported it */
 		pSMB->req_no_secext.CaseInsensitivePasswordLength = 0;
-		/*	cpu_to_le16(LM2_SESS_KEY_SIZE); */
 
+		/* calculate nlmv2 response and session key */
+		rc = setup_ntlmv2_rsp(ses, nls_cp);
+		if (rc) {
+			cERROR(1, "Error %d during NTLMv2 authentication", rc);
+			goto ssetup_exit;
+		}
+		memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
+				ses->auth_key.len - CIFS_SESS_KEY_SIZE);
+		bcc_ptr += ses->auth_key.len - CIFS_SESS_KEY_SIZE;
+
+		/* set case sensitive password length after tilen may get
+		 * assigned, tilen is 0 otherwise.
+		 */
 		pSMB->req_no_secext.CaseSensitivePasswordLength =
-			cpu_to_le16(sizeof(struct ntlmv2_resp));
-
-		/* calculate session key */
-		setup_ntlmv2_rsp(ses, v2_sess_key, nls_cp);
-		/* FIXME: calculate MAC key */
-		memcpy(bcc_ptr, (char *)v2_sess_key,
-		       sizeof(struct ntlmv2_resp));
-		bcc_ptr += sizeof(struct ntlmv2_resp);
-		kfree(v2_sess_key);
+			cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE);
+
 		if (ses->capabilities & CAP_UNICODE) {
 			if (iov[0].iov_len % 2) {
 				*bcc_ptr = 0;
@@ -746,6 +762,7 @@ ssetup_ntlmssp_authenticate:
 	} else if (type == Kerberos) {
 #ifdef CONFIG_CIFS_UPCALL
 		struct cifs_spnego_msg *msg;
+
 		spnego_key = cifs_get_spnego_key(ses);
 		if (IS_ERR(spnego_key)) {
 			rc = PTR_ERR(spnego_key);
@@ -763,19 +780,17 @@ ssetup_ntlmssp_authenticate:
 			rc = -EKEYREJECTED;
 			goto ssetup_exit;
 		}
-		/* bail out if key is too long */
-		if (msg->sesskey_len >
-		    sizeof(ses->server->mac_signing_key.data.krb5)) {
-			cERROR(1, "Kerberos signing key too long (%u bytes)",
-				msg->sesskey_len);
-			rc = -EOVERFLOW;
+
+		ses->auth_key.response = kmalloc(msg->sesskey_len, GFP_KERNEL);
+		if (!ses->auth_key.response) {
+			cERROR(1, "Kerberos can't allocate (%u bytes) memory",
+					msg->sesskey_len);
+			rc = -ENOMEM;
 			goto ssetup_exit;
 		}
-		if (first_time) {
-			ses->server->mac_signing_key.len = msg->sesskey_len;
-			memcpy(ses->server->mac_signing_key.data.krb5,
-				msg->data, msg->sesskey_len);
-		}
+		memcpy(ses->auth_key.response, msg->data, msg->sesskey_len);
+		ses->auth_key.len = msg->sesskey_len;
+
 		pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
 		capabilities |= CAP_EXTENDED_SECURITY;
 		pSMB->req.Capabilities = cpu_to_le32(capabilities);
@@ -815,12 +830,30 @@ ssetup_ntlmssp_authenticate:
 			if (phase == NtLmNegotiate) {
 				setup_ntlmssp_neg_req(pSMB, ses);
 				iov[1].iov_len = sizeof(NEGOTIATE_MESSAGE);
+				iov[1].iov_base = &pSMB->req.SecurityBlob[0];
 			} else if (phase == NtLmAuthenticate) {
-				int blob_len;
-				blob_len = setup_ntlmssp_auth_req(pSMB, ses,
-								  nls_cp,
-								  first_time);
+				/* 5 is an empirical value, large enought to
+				 * hold authenticate message, max 10 of
+				 * av paris, doamin,user,workstation mames,
+				 * flags etc..
+				 */
+				ntlmsspblob = kmalloc(
+					5*sizeof(struct _AUTHENTICATE_MESSAGE),
+					GFP_KERNEL);
+				if (!ntlmsspblob) {
+					cERROR(1, "Can't allocate NTLMSSP");
+					rc = -ENOMEM;
+					goto ssetup_exit;
+				}
+
+				rc = build_ntlmssp_auth_blob(ntlmsspblob,
+							&blob_len, ses, nls_cp);
+				if (rc)
+					goto ssetup_exit;
 				iov[1].iov_len = blob_len;
+				iov[1].iov_base = ntlmsspblob;
+				pSMB->req.SecurityBlobLength =
+					cpu_to_le16(blob_len);
 				/* Make sure that we tell the server that we
 				   are using the uid that it just gave us back
 				   on the response (challenge) */
@@ -830,7 +863,6 @@ ssetup_ntlmssp_authenticate:
 				rc = -ENOSYS;
 				goto ssetup_exit;
 			}
-			iov[1].iov_base = &pSMB->req.SecurityBlob[0];
 			/* unicode strings must be word aligned */
 			if ((iov[0].iov_len + iov[1].iov_len) % 2) {
 				*bcc_ptr = 0;
@@ -861,8 +893,6 @@ ssetup_ntlmssp_authenticate:
 			  CIFS_STD_OP /* not long */ | CIFS_LOG_ERROR);
 	/* SMB request buf freed in SendReceive2 */
 
-	cFYI(1, "ssetup rc from sendrecv2 is %d", rc);
-
 	pSMB = (SESSION_SETUP_ANDX *)iov[0].iov_base;
 	smb_buf = (struct smb_hdr *)iov[0].iov_base;
 
@@ -895,7 +925,6 @@ ssetup_ntlmssp_authenticate:
 	bcc_ptr = pByteArea(smb_buf);
 
 	if (smb_buf->WordCount == 4) {
-		__u16 blob_len;
 		blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength);
 		if (blob_len > bytes_remaining) {
 			cERROR(1, "bad security blob length %d", blob_len);
@@ -931,6 +960,8 @@ ssetup_exit:
 		key_put(spnego_key);
 	}
 	kfree(str_area);
+	kfree(ntlmsspblob);
+	ntlmsspblob = NULL;
 	if (resp_buf_type == CIFS_SMALL_BUFFER) {
 		cFYI(1, "ssetup freeing small buf %p", iov[0].iov_base);
 		cifs_small_buf_release(iov[0].iov_base);
diff --git a/fs/cifs/smberr.h b/fs/cifs/smberr.h
index c5084d27db7..7f16cb825fe 100644
--- a/fs/cifs/smberr.h
+++ b/fs/cifs/smberr.h
@@ -76,6 +76,7 @@
 #define ERRnofiles		18	/* A File Search command can find no
 					   more files matching the specified
 					   criteria. */
+#define ERRwriteprot		19	/* media is write protected */
 #define ERRgeneral		31
 #define ERRbadshare		32	/* The sharing mode specified for an
 					   Open conflicts with existing FIDs on
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 82f78c4d697..e0588cdf4cc 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -543,7 +543,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
 		    (ses->server->secMode & (SECMODE_SIGN_REQUIRED |
 					     SECMODE_SIGN_ENABLED))) {
 			rc = cifs_verify_signature(midQ->resp_buf,
-						&ses->server->mac_signing_key,
+						ses->server,
 						midQ->sequence_number+1);
 			if (rc) {
 				cERROR(1, "Unexpected SMB signature");
@@ -731,7 +731,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
 		    (ses->server->secMode & (SECMODE_SIGN_REQUIRED |
 					     SECMODE_SIGN_ENABLED))) {
 			rc = cifs_verify_signature(out_buf,
-						&ses->server->mac_signing_key,
+						ses->server,
 						midQ->sequence_number+1);
 			if (rc) {
 				cERROR(1, "Unexpected SMB signature");
@@ -981,7 +981,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
 	    (ses->server->secMode & (SECMODE_SIGN_REQUIRED |
 				     SECMODE_SIGN_ENABLED))) {
 		rc = cifs_verify_signature(out_buf,
-					   &ses->server->mac_signing_key,
+					   ses->server,
 					   midQ->sequence_number+1);
 		if (rc) {
 			cERROR(1, "Unexpected SMB signature");
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index a1509207bfa..a264b744bb4 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -47,9 +47,10 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name)
 #ifdef CONFIG_CIFS_XATTR
 	int xid;
 	struct cifs_sb_info *cifs_sb;
+	struct tcon_link *tlink;
 	struct cifsTconInfo *pTcon;
 	struct super_block *sb;
-	char *full_path;
+	char *full_path = NULL;
 
 	if (direntry == NULL)
 		return -EIO;
@@ -58,16 +59,19 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name)
 	sb = direntry->d_inode->i_sb;
 	if (sb == NULL)
 		return -EIO;
-	xid = GetXid();
 
 	cifs_sb = CIFS_SB(sb);
-	pTcon = cifs_sb->tcon;
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	pTcon = tlink_tcon(tlink);
+
+	xid = GetXid();
 
 	full_path = build_path_from_dentry(direntry);
 	if (full_path == NULL) {
 		rc = -ENOMEM;
-		FreeXid(xid);
-		return rc;
+		goto remove_ea_exit;
 	}
 	if (ea_name == NULL) {
 		cFYI(1, "Null xattr names not supported");
@@ -91,6 +95,7 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name)
 remove_ea_exit:
 	kfree(full_path);
 	FreeXid(xid);
+	cifs_put_tlink(tlink);
 #endif
 	return rc;
 }
@@ -102,6 +107,7 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
 #ifdef CONFIG_CIFS_XATTR
 	int xid;
 	struct cifs_sb_info *cifs_sb;
+	struct tcon_link *tlink;
 	struct cifsTconInfo *pTcon;
 	struct super_block *sb;
 	char *full_path;
@@ -113,16 +119,19 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
 	sb = direntry->d_inode->i_sb;
 	if (sb == NULL)
 		return -EIO;
-	xid = GetXid();
 
 	cifs_sb = CIFS_SB(sb);
-	pTcon = cifs_sb->tcon;
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	pTcon = tlink_tcon(tlink);
+
+	xid = GetXid();
 
 	full_path = build_path_from_dentry(direntry);
 	if (full_path == NULL) {
 		rc = -ENOMEM;
-		FreeXid(xid);
-		return rc;
+		goto set_ea_exit;
 	}
 	/* return dos attributes as pseudo xattr */
 	/* return alt name if available as pseudo attr */
@@ -132,9 +141,8 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
 		returns as xattrs */
 	if (value_size > MAX_EA_VALUE_SIZE) {
 		cFYI(1, "size of EA value too large");
-		kfree(full_path);
-		FreeXid(xid);
-		return -EOPNOTSUPP;
+		rc = -EOPNOTSUPP;
+		goto set_ea_exit;
 	}
 
 	if (ea_name == NULL) {
@@ -198,6 +206,7 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
 set_ea_exit:
 	kfree(full_path);
 	FreeXid(xid);
+	cifs_put_tlink(tlink);
 #endif
 	return rc;
 }
@@ -209,6 +218,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
 #ifdef CONFIG_CIFS_XATTR
 	int xid;
 	struct cifs_sb_info *cifs_sb;
+	struct tcon_link *tlink;
 	struct cifsTconInfo *pTcon;
 	struct super_block *sb;
 	char *full_path;
@@ -221,16 +231,18 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
 	if (sb == NULL)
 		return -EIO;
 
-	xid = GetXid();
-
 	cifs_sb = CIFS_SB(sb);
-	pTcon = cifs_sb->tcon;
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	pTcon = tlink_tcon(tlink);
+
+	xid = GetXid();
 
 	full_path = build_path_from_dentry(direntry);
 	if (full_path == NULL) {
 		rc = -ENOMEM;
-		FreeXid(xid);
-		return rc;
+		goto get_ea_exit;
 	}
 	/* return dos attributes as pseudo xattr */
 	/* return alt name if available as pseudo attr */
@@ -323,6 +335,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
 get_ea_exit:
 	kfree(full_path);
 	FreeXid(xid);
+	cifs_put_tlink(tlink);
 #endif
 	return rc;
 }
@@ -333,6 +346,7 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size)
 #ifdef CONFIG_CIFS_XATTR
 	int xid;
 	struct cifs_sb_info *cifs_sb;
+	struct tcon_link *tlink;
 	struct cifsTconInfo *pTcon;
 	struct super_block *sb;
 	char *full_path;
@@ -346,18 +360,20 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size)
 		return -EIO;
 
 	cifs_sb = CIFS_SB(sb);
-	pTcon = cifs_sb->tcon;
-
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
 		return -EOPNOTSUPP;
 
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	pTcon = tlink_tcon(tlink);
+
 	xid = GetXid();
 
 	full_path = build_path_from_dentry(direntry);
 	if (full_path == NULL) {
 		rc = -ENOMEM;
-		FreeXid(xid);
-		return rc;
+		goto list_ea_exit;
 	}
 	/* return dos attributes as pseudo xattr */
 	/* return alt name if available as pseudo attr */
@@ -370,8 +386,10 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size)
 				cifs_sb->mnt_cifs_flags &
 					CIFS_MOUNT_MAP_SPECIAL_CHR);
 
+list_ea_exit:
 	kfree(full_path);
 	FreeXid(xid);
+	cifs_put_tlink(tlink);
 #endif
 	return rc;
 }
diff --git a/fs/coda/cache.c b/fs/coda/cache.c
index a5bf5771a22..9060f08e70c 100644
--- a/fs/coda/cache.c
+++ b/fs/coda/cache.c
@@ -17,6 +17,7 @@
 #include <linux/string.h>
 #include <linux/list.h>
 #include <linux/sched.h>
+#include <linux/spinlock.h>
 
 #include <linux/coda.h>
 #include <linux/coda_linux.h>
@@ -31,19 +32,23 @@ void coda_cache_enter(struct inode *inode, int mask)
 {
 	struct coda_inode_info *cii = ITOC(inode);
 
+	spin_lock(&cii->c_lock);
 	cii->c_cached_epoch = atomic_read(&permission_epoch);
 	if (cii->c_uid != current_fsuid()) {
 		cii->c_uid = current_fsuid();
                 cii->c_cached_perm = mask;
         } else
                 cii->c_cached_perm |= mask;
+	spin_unlock(&cii->c_lock);
 }
 
 /* remove cached acl from an inode */
 void coda_cache_clear_inode(struct inode *inode)
 {
 	struct coda_inode_info *cii = ITOC(inode);
+	spin_lock(&cii->c_lock);
 	cii->c_cached_epoch = atomic_read(&permission_epoch) - 1;
+	spin_unlock(&cii->c_lock);
 }
 
 /* remove all acl caches */
@@ -57,13 +62,15 @@ void coda_cache_clear_all(struct super_block *sb)
 int coda_cache_check(struct inode *inode, int mask)
 {
 	struct coda_inode_info *cii = ITOC(inode);
-        int hit;
+	int hit;
 	
-        hit = (mask & cii->c_cached_perm) == mask &&
-		cii->c_uid == current_fsuid() &&
-		cii->c_cached_epoch == atomic_read(&permission_epoch);
+	spin_lock(&cii->c_lock);
+	hit = (mask & cii->c_cached_perm) == mask &&
+	    cii->c_uid == current_fsuid() &&
+	    cii->c_cached_epoch == atomic_read(&permission_epoch);
+	spin_unlock(&cii->c_lock);
 
-        return hit;
+	return hit;
 }
 
 
diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c
index a7a780929ee..602240569c8 100644
--- a/fs/coda/cnode.c
+++ b/fs/coda/cnode.c
@@ -45,13 +45,15 @@ static void coda_fill_inode(struct inode *inode, struct coda_vattr *attr)
 static int coda_test_inode(struct inode *inode, void *data)
 {
 	struct CodaFid *fid = (struct CodaFid *)data;
-	return coda_fideq(&(ITOC(inode)->c_fid), fid);
+	struct coda_inode_info *cii = ITOC(inode);
+	return coda_fideq(&cii->c_fid, fid);
 }
 
 static int coda_set_inode(struct inode *inode, void *data)
 {
 	struct CodaFid *fid = (struct CodaFid *)data;
-	ITOC(inode)->c_fid = *fid;
+	struct coda_inode_info *cii = ITOC(inode);
+	cii->c_fid = *fid;
 	return 0;
 }
 
@@ -71,6 +73,7 @@ struct inode * coda_iget(struct super_block * sb, struct CodaFid * fid,
 		cii = ITOC(inode);
 		/* we still need to set i_ino for things like stat(2) */
 		inode->i_ino = hash;
+		/* inode is locked and unique, no need to grab cii->c_lock */
 		cii->c_mapcount = 0;
 		unlock_new_inode(inode);
 	}
@@ -107,14 +110,20 @@ int coda_cnode_make(struct inode **inode, struct CodaFid *fid, struct super_bloc
 }
 
 
+/* Although we treat Coda file identifiers as immutable, there is one
+ * special case for files created during a disconnection where they may
+ * not be globally unique. When an identifier collision is detected we
+ * first try to flush the cached inode from the kernel and finally
+ * resort to renaming/rehashing in-place. Userspace remembers both old
+ * and new values of the identifier to handle any in-flight upcalls.
+ * The real solution is to use globally unique UUIDs as identifiers, but
+ * retrofitting the existing userspace code for this is non-trivial. */
 void coda_replace_fid(struct inode *inode, struct CodaFid *oldfid, 
 		      struct CodaFid *newfid)
 {
-	struct coda_inode_info *cii;
+	struct coda_inode_info *cii = ITOC(inode);
 	unsigned long hash = coda_f2i(newfid);
 	
-	cii = ITOC(inode);
-
 	BUG_ON(!coda_fideq(&cii->c_fid, oldfid));
 
 	/* replace fid and rehash inode */
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index ccd98b0f2b0..5d8b3553960 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -17,7 +17,7 @@
 #include <linux/stat.h>
 #include <linux/errno.h>
 #include <linux/string.h>
-#include <linux/smp_lock.h>
+#include <linux/spinlock.h>
 
 #include <asm/uaccess.h>
 
@@ -116,15 +116,11 @@ static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, struc
 		goto exit;
 	}
 
-	lock_kernel();
-
 	error = venus_lookup(dir->i_sb, coda_i2f(dir), name, length,
 			     &type, &resfid);
 	if (!error)
 		error = coda_cnode_make(&inode, &resfid, dir->i_sb);
 
-	unlock_kernel();
-
 	if (error && error != -ENOENT)
 		return ERR_PTR(error);
 
@@ -140,28 +136,24 @@ exit:
 
 int coda_permission(struct inode *inode, int mask)
 {
-        int error = 0;
+	int error;
 
 	mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
  
 	if (!mask)
-		return 0; 
+		return 0;
 
 	if ((mask & MAY_EXEC) && !execute_ok(inode))
 		return -EACCES;
 
-	lock_kernel();
-
 	if (coda_cache_check(inode, mask))
-		goto out; 
+		return 0;
 
-        error = venus_access(inode->i_sb, coda_i2f(inode), mask);
+	error = venus_access(inode->i_sb, coda_i2f(inode), mask);
     
 	if (!error)
 		coda_cache_enter(inode, mask);
 
- out:
-	unlock_kernel();
 	return error;
 }
 
@@ -200,41 +192,34 @@ static inline void coda_dir_drop_nlink(struct inode *dir)
 /* creation routines: create, mknod, mkdir, link, symlink */
 static int coda_create(struct inode *dir, struct dentry *de, int mode, struct nameidata *nd)
 {
-        int error=0;
+	int error;
 	const char *name=de->d_name.name;
 	int length=de->d_name.len;
 	struct inode *inode;
 	struct CodaFid newfid;
 	struct coda_vattr attrs;
 
-	lock_kernel();
-
-	if (coda_isroot(dir) && coda_iscontrol(name, length)) {
-		unlock_kernel();
+	if (coda_isroot(dir) && coda_iscontrol(name, length))
 		return -EPERM;
-	}
 
 	error = venus_create(dir->i_sb, coda_i2f(dir), name, length, 
 				0, mode, &newfid, &attrs);
-
-        if ( error ) {
-		unlock_kernel();
-		d_drop(de);
-		return error;
-	}
+	if (error)
+		goto err_out;
 
 	inode = coda_iget(dir->i_sb, &newfid, &attrs);
-	if ( IS_ERR(inode) ) {
-		unlock_kernel();
-		d_drop(de);
-		return PTR_ERR(inode);
+	if (IS_ERR(inode)) {
+		error = PTR_ERR(inode);
+		goto err_out;
 	}
 
 	/* invalidate the directory cnode's attributes */
 	coda_dir_update_mtime(dir);
-	unlock_kernel();
 	d_instantiate(de, inode);
 	return 0;
+err_out:
+	d_drop(de);
+	return error;
 }
 
 static int coda_mkdir(struct inode *dir, struct dentry *de, int mode)
@@ -246,36 +231,29 @@ static int coda_mkdir(struct inode *dir, struct dentry *de, int mode)
 	int error;
 	struct CodaFid newfid;
 
-	lock_kernel();
-
-	if (coda_isroot(dir) && coda_iscontrol(name, len)) {
-		unlock_kernel();
+	if (coda_isroot(dir) && coda_iscontrol(name, len))
 		return -EPERM;
-	}
 
 	attrs.va_mode = mode;
 	error = venus_mkdir(dir->i_sb, coda_i2f(dir), 
 			       name, len, &newfid, &attrs);
-        
-        if ( error ) {
-		unlock_kernel();
-		d_drop(de);
-		return error;
-        }
+	if (error)
+		goto err_out;
          
 	inode = coda_iget(dir->i_sb, &newfid, &attrs);
-	if ( IS_ERR(inode) ) {
-		unlock_kernel();
-		d_drop(de);
-		return PTR_ERR(inode);
+	if (IS_ERR(inode)) {
+		error = PTR_ERR(inode);
+		goto err_out;
 	}
 
 	/* invalidate the directory cnode's attributes */
 	coda_dir_inc_nlink(dir);
 	coda_dir_update_mtime(dir);
-	unlock_kernel();
 	d_instantiate(de, inode);
 	return 0;
+err_out:
+	d_drop(de);
+	return error;
 }
 
 /* try to make de an entry in dir_inodde linked to source_de */ 
@@ -287,52 +265,38 @@ static int coda_link(struct dentry *source_de, struct inode *dir_inode,
 	int len = de->d_name.len;
 	int error;
 
-	lock_kernel();
-
-	if (coda_isroot(dir_inode) && coda_iscontrol(name, len)) {
-		unlock_kernel();
+	if (coda_isroot(dir_inode) && coda_iscontrol(name, len))
 		return -EPERM;
-	}
 
 	error = venus_link(dir_inode->i_sb, coda_i2f(inode),
 			   coda_i2f(dir_inode), (const char *)name, len);
-
 	if (error) {
 		d_drop(de);
-		goto out;
+		return error;
 	}
 
 	coda_dir_update_mtime(dir_inode);
-	atomic_inc(&inode->i_count);
+	ihold(inode);
 	d_instantiate(de, inode);
 	inc_nlink(inode);
-
-out:
-	unlock_kernel();
-	return(error);
+	return 0;
 }
 
 
 static int coda_symlink(struct inode *dir_inode, struct dentry *de,
 			const char *symname)
 {
-        const char *name = de->d_name.name;
+	const char *name = de->d_name.name;
 	int len = de->d_name.len;
 	int symlen;
-	int error = 0;
-
-	lock_kernel();
+	int error;
 
-	if (coda_isroot(dir_inode) && coda_iscontrol(name, len)) {
-		unlock_kernel();
+	if (coda_isroot(dir_inode) && coda_iscontrol(name, len))
 		return -EPERM;
-	}
 
 	symlen = strlen(symname);
-	if ( symlen > CODA_MAXPATHLEN ) {
-		unlock_kernel();
+	if (symlen > CODA_MAXPATHLEN)
 		return -ENAMETOOLONG;
-	}
 
 	/*
 	 * This entry is now negative. Since we do not create
@@ -343,10 +307,9 @@ static int coda_symlink(struct inode *dir_inode, struct dentry *de,
 			      symname, symlen);
 
 	/* mtime is no good anymore */
-	if ( !error )
+	if (!error)
 		coda_dir_update_mtime(dir_inode);
 
-	unlock_kernel();
 	return error;
 }
 
@@ -357,17 +320,12 @@ static int coda_unlink(struct inode *dir, struct dentry *de)
 	const char *name = de->d_name.name;
 	int len = de->d_name.len;
 
-	lock_kernel();
-
 	error = venus_remove(dir->i_sb, coda_i2f(dir), name, len);
-	if ( error ) {
-		unlock_kernel();
+	if (error)
 		return error;
-	}
 
 	coda_dir_update_mtime(dir);
 	drop_nlink(de->d_inode);
-	unlock_kernel();
 	return 0;
 }
 
@@ -377,8 +335,6 @@ static int coda_rmdir(struct inode *dir, struct dentry *de)
 	int len = de->d_name.len;
 	int error;
 
-	lock_kernel();
-
 	error = venus_rmdir(dir->i_sb, coda_i2f(dir), name, len);
 	if (!error) {
 		/* VFS may delete the child */
@@ -389,7 +345,6 @@ static int coda_rmdir(struct inode *dir, struct dentry *de)
 		coda_dir_drop_nlink(dir);
 		coda_dir_update_mtime(dir);
 	}
-	unlock_kernel();
 	return error;
 }
 
@@ -403,15 +358,12 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry,
 	int new_length = new_dentry->d_name.len;
 	int error;
 
-	lock_kernel();
-
 	error = venus_rename(old_dir->i_sb, coda_i2f(old_dir),
 			     coda_i2f(new_dir), old_length, new_length,
 			     (const char *) old_name, (const char *)new_name);
-
-	if ( !error ) {
-		if ( new_dentry->d_inode ) {
-			if ( S_ISDIR(new_dentry->d_inode->i_mode) ) {
+	if (!error) {
+		if (new_dentry->d_inode) {
+			if (S_ISDIR(new_dentry->d_inode->i_mode)) {
 				coda_dir_drop_nlink(old_dir);
 				coda_dir_inc_nlink(new_dir);
 			}
@@ -423,8 +375,6 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry,
 			coda_flag_inode(new_dir, C_VATTR);
 		}
 	}
-	unlock_kernel();
-
 	return error;
 }
 
@@ -594,10 +544,7 @@ static int coda_dentry_revalidate(struct dentry *de, struct nameidata *nd)
 	struct inode *inode = de->d_inode;
 	struct coda_inode_info *cii;
 
-	if (!inode)
-		return 1;
-	lock_kernel();
-	if (coda_isroot(inode))
+	if (!inode || coda_isroot(inode))
 		goto out;
 	if (is_bad_inode(inode))
 		goto bad;
@@ -617,13 +564,12 @@ static int coda_dentry_revalidate(struct dentry *de, struct nameidata *nd)
 		goto out;
 
 	/* clear the flags. */
+	spin_lock(&cii->c_lock);
 	cii->c_flags &= ~(C_VATTR | C_PURGE | C_FLUSH);
-
+	spin_unlock(&cii->c_lock);
 bad:
-	unlock_kernel();
 	return 0;
 out:
-	unlock_kernel();
 	return 1;
 }
 
@@ -656,20 +602,19 @@ static int coda_dentry_delete(struct dentry * dentry)
 int coda_revalidate_inode(struct dentry *dentry)
 {
 	struct coda_vattr attr;
-	int error = 0;
+	int error;
 	int old_mode;
 	ino_t old_ino;
 	struct inode *inode = dentry->d_inode;
 	struct coda_inode_info *cii = ITOC(inode);
 
-	lock_kernel();
-	if ( !cii->c_flags )
-		goto ok;
+	if (!cii->c_flags)
+		return 0;
 
 	if (cii->c_flags & (C_VATTR | C_PURGE | C_FLUSH)) {
 		error = venus_getattr(inode->i_sb, &(cii->c_fid), &attr);
-		if ( error )
-			goto return_bad;
+		if (error)
+			return -EIO;
 
 		/* this inode may be lost if:
 		   - it's ino changed 
@@ -688,17 +633,13 @@ int coda_revalidate_inode(struct dentry *dentry)
 		/* the following can happen when a local fid is replaced 
 		   with a global one, here we lose and declare the inode bad */
 		if (inode->i_ino != old_ino)
-			goto return_bad;
+			return -EIO;
 		
 		coda_flag_inode_children(inode, C_FLUSH);
+
+		spin_lock(&cii->c_lock);
 		cii->c_flags &= ~(C_VATTR | C_PURGE | C_FLUSH);
+		spin_unlock(&cii->c_lock);
 	}
-
-ok:
-	unlock_kernel();
 	return 0;
-
-return_bad:
-	unlock_kernel();
-	return -EIO;
 }
diff --git a/fs/coda/file.c b/fs/coda/file.c
index ad3cd2abeeb..c8b50ba4366 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -15,7 +15,7 @@
 #include <linux/stat.h>
 #include <linux/cred.h>
 #include <linux/errno.h>
-#include <linux/smp_lock.h>
+#include <linux/spinlock.h>
 #include <linux/string.h>
 #include <linux/slab.h>
 #include <asm/uaccess.h>
@@ -109,19 +109,24 @@ coda_file_mmap(struct file *coda_file, struct vm_area_struct *vma)
 
 	coda_inode = coda_file->f_path.dentry->d_inode;
 	host_inode = host_file->f_path.dentry->d_inode;
+
+	cii = ITOC(coda_inode);
+	spin_lock(&cii->c_lock);
 	coda_file->f_mapping = host_file->f_mapping;
 	if (coda_inode->i_mapping == &coda_inode->i_data)
 		coda_inode->i_mapping = host_inode->i_mapping;
 
 	/* only allow additional mmaps as long as userspace isn't changing
 	 * the container file on us! */
-	else if (coda_inode->i_mapping != host_inode->i_mapping)
+	else if (coda_inode->i_mapping != host_inode->i_mapping) {
+		spin_unlock(&cii->c_lock);
 		return -EBUSY;
+	}
 
 	/* keep track of how often the coda_inode/host_file has been mmapped */
-	cii = ITOC(coda_inode);
 	cii->c_mapcount++;
 	cfi->cfi_mapcount++;
+	spin_unlock(&cii->c_lock);
 
 	return host_file->f_op->mmap(host_file, vma);
 }
@@ -138,8 +143,6 @@ int coda_open(struct inode *coda_inode, struct file *coda_file)
 	if (!cfi)
 		return -ENOMEM;
 
-	lock_kernel();
-
 	error = venus_open(coda_inode->i_sb, coda_i2f(coda_inode), coda_flags,
 			   &host_file);
 	if (!host_file)
@@ -147,7 +150,6 @@ int coda_open(struct inode *coda_inode, struct file *coda_file)
 
 	if (error) {
 		kfree(cfi);
-		unlock_kernel();
 		return error;
 	}
 
@@ -159,8 +161,6 @@ int coda_open(struct inode *coda_inode, struct file *coda_file)
 
 	BUG_ON(coda_file->private_data != NULL);
 	coda_file->private_data = cfi;
-
-	unlock_kernel();
 	return 0;
 }
 
@@ -171,9 +171,7 @@ int coda_release(struct inode *coda_inode, struct file *coda_file)
 	struct coda_file_info *cfi;
 	struct coda_inode_info *cii;
 	struct inode *host_inode;
-	int err = 0;
-
-	lock_kernel();
+	int err;
 
 	cfi = CODA_FTOC(coda_file);
 	BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
@@ -185,18 +183,18 @@ int coda_release(struct inode *coda_inode, struct file *coda_file)
 	cii = ITOC(coda_inode);
 
 	/* did we mmap this file? */
+	spin_lock(&cii->c_lock);
 	if (coda_inode->i_mapping == &host_inode->i_data) {
 		cii->c_mapcount -= cfi->cfi_mapcount;
 		if (!cii->c_mapcount)
 			coda_inode->i_mapping = &coda_inode->i_data;
 	}
+	spin_unlock(&cii->c_lock);
 
 	fput(cfi->cfi_container);
 	kfree(coda_file->private_data);
 	coda_file->private_data = NULL;
 
-	unlock_kernel();
-
 	/* VFS fput ignores the return value from file_operations->release, so
 	 * there is no use returning an error here */
 	return 0;
@@ -207,7 +205,7 @@ int coda_fsync(struct file *coda_file, int datasync)
 	struct file *host_file;
 	struct inode *coda_inode = coda_file->f_path.dentry->d_inode;
 	struct coda_file_info *cfi;
-	int err = 0;
+	int err;
 
 	if (!(S_ISREG(coda_inode->i_mode) || S_ISDIR(coda_inode->i_mode) ||
 	      S_ISLNK(coda_inode->i_mode)))
@@ -218,11 +216,8 @@ int coda_fsync(struct file *coda_file, int datasync)
 	host_file = cfi->cfi_container;
 
 	err = vfs_fsync(host_file, datasync);
-	if ( !err && !datasync ) {
-		lock_kernel();
+	if (!err && !datasync)
 		err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode));
-		unlock_kernel();
-	}
 
 	return err;
 }
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index d97f9935a02..5ea57c8c7f9 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -15,7 +15,8 @@
 #include <linux/stat.h>
 #include <linux/errno.h>
 #include <linux/unistd.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
+#include <linux/spinlock.h>
 #include <linux/file.h>
 #include <linux/vfs.h>
 #include <linux/slab.h>
@@ -35,7 +36,7 @@
 #include "coda_int.h"
 
 /* VFS super_block ops */
-static void coda_clear_inode(struct inode *);
+static void coda_evict_inode(struct inode *);
 static void coda_put_super(struct super_block *);
 static int coda_statfs(struct dentry *dentry, struct kstatfs *buf);
 
@@ -51,6 +52,7 @@ static struct inode *coda_alloc_inode(struct super_block *sb)
 	ei->c_flags = 0;
 	ei->c_uid = 0;
 	ei->c_cached_perm = 0;
+	spin_lock_init(&ei->c_lock);
 	return &ei->vfs_inode;
 }
 
@@ -93,7 +95,7 @@ static const struct super_operations coda_super_operations =
 {
 	.alloc_inode	= coda_alloc_inode,
 	.destroy_inode	= coda_destroy_inode,
-	.clear_inode	= coda_clear_inode,
+	.evict_inode	= coda_evict_inode,
 	.put_super	= coda_put_super,
 	.statfs		= coda_statfs,
 	.remount_fs	= coda_remount,
@@ -143,7 +145,7 @@ static int get_device_index(struct coda_mount_data *data)
 static int coda_fill_super(struct super_block *sb, void *data, int silent)
 {
 	struct inode *root = NULL;
-	struct venus_comm *vc = NULL;
+	struct venus_comm *vc;
 	struct CodaFid fid;
 	int error;
 	int idx;
@@ -157,21 +159,26 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
 	printk(KERN_INFO "coda_read_super: device index: %i\n", idx);
 
 	vc = &coda_comms[idx];
+	mutex_lock(&vc->vc_mutex);
+
 	if (!vc->vc_inuse) {
 		printk("coda_read_super: No pseudo device\n");
-		return -EINVAL;
+		error = -EINVAL;
+		goto unlock_out;
 	}
 
-        if ( vc->vc_sb ) {
+	if (vc->vc_sb) {
 		printk("coda_read_super: Device already mounted\n");
-		return -EBUSY;
+		error = -EBUSY;
+		goto unlock_out;
 	}
 
 	error = bdi_setup_and_register(&vc->bdi, "coda", BDI_CAP_MAP_COPY);
 	if (error)
-		goto bdi_err;
+		goto unlock_out;
 
 	vc->vc_sb = sb;
+	mutex_unlock(&vc->vc_mutex);
 
 	sb->s_fs_info = vc;
 	sb->s_flags |= MS_NOATIME;
@@ -200,32 +207,41 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
 	printk("coda_read_super: rootinode is %ld dev %s\n", 
 	       root->i_ino, root->i_sb->s_id);
 	sb->s_root = d_alloc_root(root);
-	if (!sb->s_root)
+	if (!sb->s_root) {
+		error = -EINVAL;
 		goto error;
-        return 0;
+	}
+	return 0;
 
- error:
-	bdi_destroy(&vc->bdi);
- bdi_err:
+error:
 	if (root)
 		iput(root);
-	if (vc)
-		vc->vc_sb = NULL;
 
-	return -EINVAL;
+	mutex_lock(&vc->vc_mutex);
+	bdi_destroy(&vc->bdi);
+	vc->vc_sb = NULL;
+	sb->s_fs_info = NULL;
+unlock_out:
+	mutex_unlock(&vc->vc_mutex);
+	return error;
 }
 
 static void coda_put_super(struct super_block *sb)
 {
-	bdi_destroy(&coda_vcp(sb)->bdi);
-	coda_vcp(sb)->vc_sb = NULL;
+	struct venus_comm *vcp = coda_vcp(sb);
+	mutex_lock(&vcp->vc_mutex);
+	bdi_destroy(&vcp->bdi);
+	vcp->vc_sb = NULL;
 	sb->s_fs_info = NULL;
+	mutex_unlock(&vcp->vc_mutex);
 
 	printk("Coda: Bye bye.\n");
 }
 
-static void coda_clear_inode(struct inode *inode)
+static void coda_evict_inode(struct inode *inode)
 {
+	truncate_inode_pages(&inode->i_data, 0);
+	end_writeback(inode);
 	coda_cache_clear_inode(inode);
 }
 
@@ -243,8 +259,6 @@ int coda_setattr(struct dentry *de, struct iattr *iattr)
 	struct coda_vattr vattr;
 	int error;
 
-	lock_kernel();
-	
 	memset(&vattr, 0, sizeof(vattr)); 
 
 	inode->i_ctime = CURRENT_TIME_SEC;
@@ -254,13 +268,10 @@ int coda_setattr(struct dentry *de, struct iattr *iattr)
 	/* Venus is responsible for truncating the container-file!!! */
 	error = venus_setattr(inode->i_sb, coda_i2f(inode), &vattr);
 
-	if ( !error ) {
+	if (!error) {
 	        coda_vattr_to_iattr(inode, &vattr); 
 		coda_cache_clear_inode(inode);
 	}
-
-	unlock_kernel();
-
 	return error;
 }
 
@@ -274,12 +285,8 @@ static int coda_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	int error;
 	
-	lock_kernel();
-
 	error = venus_statfs(dentry, buf);
 
-	unlock_kernel();
-
 	if (error) {
 		/* fake something like AFS does */
 		buf->f_blocks = 9000000;
@@ -299,16 +306,16 @@ static int coda_statfs(struct dentry *dentry, struct kstatfs *buf)
 
 /* init_coda: used by filesystems.c to register coda */
 
-static int coda_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *coda_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_nodev(fs_type, flags, data, coda_fill_super, mnt);
+	return mount_nodev(fs_type, flags, data, coda_fill_super);
 }
 
 struct file_system_type coda_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "coda",
-	.get_sb		= coda_get_sb,
+	.mount		= coda_mount,
 	.kill_sb	= kill_anon_super,
 	.fs_flags	= FS_BINARY_MOUNTDATA,
 };
diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c
index ca25d96d45c..2fd89b5c5c7 100644
--- a/fs/coda/pioctl.c
+++ b/fs/coda/pioctl.c
@@ -23,8 +23,6 @@
 #include <linux/coda_fs_i.h>
 #include <linux/coda_psdev.h>
 
-#include <linux/smp_lock.h>
-
 /* pioctl ops */
 static int coda_ioctl_permission(struct inode *inode, int mask);
 static long coda_pioctl(struct file *filp, unsigned int cmd,
@@ -39,6 +37,7 @@ const struct inode_operations coda_ioctl_inode_operations = {
 const struct file_operations coda_ioctl_operations = {
 	.owner		= THIS_MODULE,
 	.unlocked_ioctl	= coda_pioctl,
+	.llseek		= noop_llseek,
 };
 
 /* the coda pioctl inode ops */
@@ -57,13 +56,9 @@ static long coda_pioctl(struct file *filp, unsigned int cmd,
 	struct inode *target_inode = NULL;
 	struct coda_inode_info *cnp;
 
-	lock_kernel();
-
 	/* get the Pioctl data arguments from user space */
-	if (copy_from_user(&data, (void __user *)user_data, sizeof(data))) {
-		error = -EINVAL;
-		goto out;
-	}
+	if (copy_from_user(&data, (void __user *)user_data, sizeof(data)))
+		return -EINVAL;
 
 	/*
 	 * Look up the pathname. Note that the pathname is in
@@ -75,13 +70,12 @@ static long coda_pioctl(struct file *filp, unsigned int cmd,
 		error = user_lpath(data.path, &path);
 
 	if (error)
-		goto out;
-	else
-		target_inode = path.dentry->d_inode;
+		return error;
+
+	target_inode = path.dentry->d_inode;
 
 	/* return if it is not a Coda inode */
 	if (target_inode->i_sb != inode->i_sb) {
-		path_put(&path);
 		error = -EINVAL;
 		goto out;
 	}
@@ -90,10 +84,7 @@ static long coda_pioctl(struct file *filp, unsigned int cmd,
 	cnp = ITOC(target_inode);
 
 	error = venus_pioctl(inode->i_sb, &(cnp->c_fid), cmd, &data);
-
-	path_put(&path);
-
 out:
-	unlock_kernel();
+	path_put(&path);
 	return error;
 }
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index 66b9cf79c5b..62647a8595e 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -35,7 +35,7 @@
 #include <linux/poll.h>
 #include <linux/init.h>
 #include <linux/list.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
 #include <linux/device.h>
 #include <asm/io.h>
 #include <asm/system.h>
@@ -67,8 +67,10 @@ static unsigned int coda_psdev_poll(struct file *file, poll_table * wait)
 	unsigned int mask = POLLOUT | POLLWRNORM;
 
 	poll_wait(file, &vcp->vc_waitq, wait);
+	mutex_lock(&vcp->vc_mutex);
 	if (!list_empty(&vcp->vc_pending))
                 mask |= POLLIN | POLLRDNORM;
+	mutex_unlock(&vcp->vc_mutex);
 
 	return mask;
 }
@@ -108,16 +110,9 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf,
 	        return -EFAULT;
 
         if (DOWNCALL(hdr.opcode)) {
-		struct super_block *sb = NULL;
-                union outputArgs *dcbuf;
+		union outputArgs *dcbuf;
 		int size = sizeof(*dcbuf);
 
-		sb = vcp->vc_sb;
-		if ( !sb ) {
-                        count = nbytes;
-                        goto out;
-		}
-
 		if  ( nbytes < sizeof(struct coda_out_hdr) ) {
 		        printk("coda_downcall opc %d uniq %d, not enough!\n",
 			       hdr.opcode, hdr.unique);
@@ -137,9 +132,7 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf,
 		}
 
 		/* what downcall errors does Venus handle ? */
-		lock_kernel();
-		error = coda_downcall(hdr.opcode, dcbuf, sb);
-		unlock_kernel();
+		error = coda_downcall(vcp, hdr.opcode, dcbuf);
 
 		CODA_FREE(dcbuf, nbytes);
 		if (error) {
@@ -152,7 +145,7 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf,
 	}
         
 	/* Look for the message on the processing queue. */
-	lock_kernel();
+	mutex_lock(&vcp->vc_mutex);
 	list_for_each(lh, &vcp->vc_processing) {
 		tmp = list_entry(lh, struct upc_req , uc_chain);
 		if (tmp->uc_unique == hdr.unique) {
@@ -161,7 +154,7 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf,
 			break;
 		}
 	}
-	unlock_kernel();
+	mutex_unlock(&vcp->vc_mutex);
 
 	if (!req) {
 		printk("psdev_write: msg (%d, %d) not found\n", 
@@ -177,15 +170,15 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf,
 		nbytes = req->uc_outSize; /* don't have more space! */
 	}
         if (copy_from_user(req->uc_data, buf, nbytes)) {
-		req->uc_flags |= REQ_ABORT;
+		req->uc_flags |= CODA_REQ_ABORT;
 		wake_up(&req->uc_sleep);
 		retval = -EFAULT;
 		goto out;
 	}
 
 	/* adjust outsize. is this useful ?? */
-        req->uc_outSize = nbytes;	
-        req->uc_flags |= REQ_WRITE;
+	req->uc_outSize = nbytes;
+	req->uc_flags |= CODA_REQ_WRITE;
 	count = nbytes;
 
 	/* Convert filedescriptor into a file handle */
@@ -216,7 +209,7 @@ static ssize_t coda_psdev_read(struct file * file, char __user * buf,
 	if (nbytes == 0)
 		return 0;
 
-	lock_kernel();
+	mutex_lock(&vcp->vc_mutex);
 
 	add_wait_queue(&vcp->vc_waitq, &wait);
 	set_current_state(TASK_INTERRUPTIBLE);
@@ -230,7 +223,9 @@ static ssize_t coda_psdev_read(struct file * file, char __user * buf,
 			retval = -ERESTARTSYS;
 			break;
 		}
+		mutex_unlock(&vcp->vc_mutex);
 		schedule();
+		mutex_lock(&vcp->vc_mutex);
 	}
 
 	set_current_state(TASK_RUNNING);
@@ -254,8 +249,8 @@ static ssize_t coda_psdev_read(struct file * file, char __user * buf,
 	        retval = -EFAULT;
         
 	/* If request was not a signal, enqueue and don't free */
-	if (!(req->uc_flags & REQ_ASYNC)) {
-		req->uc_flags |= REQ_READ;
+	if (!(req->uc_flags & CODA_REQ_ASYNC)) {
+		req->uc_flags |= CODA_REQ_READ;
 		list_add_tail(&(req->uc_chain), &vcp->vc_processing);
 		goto out;
 	}
@@ -263,7 +258,7 @@ static ssize_t coda_psdev_read(struct file * file, char __user * buf,
 	CODA_FREE(req->uc_data, sizeof(struct coda_in_hdr));
 	kfree(req);
 out:
-	unlock_kernel();
+	mutex_unlock(&vcp->vc_mutex);
 	return (count ? count : retval);
 }
 
@@ -276,10 +271,10 @@ static int coda_psdev_open(struct inode * inode, struct file * file)
 	if (idx < 0 || idx >= MAX_CODADEVS)
 		return -ENODEV;
 
-	lock_kernel();
-
 	err = -EBUSY;
 	vcp = &coda_comms[idx];
+	mutex_lock(&vcp->vc_mutex);
+
 	if (!vcp->vc_inuse) {
 		vcp->vc_inuse++;
 
@@ -293,7 +288,7 @@ static int coda_psdev_open(struct inode * inode, struct file * file)
 		err = 0;
 	}
 
-	unlock_kernel();
+	mutex_unlock(&vcp->vc_mutex);
 	return err;
 }
 
@@ -308,32 +303,32 @@ static int coda_psdev_release(struct inode * inode, struct file * file)
 		return -1;
 	}
 
-	lock_kernel();
+	mutex_lock(&vcp->vc_mutex);
 
 	/* Wakeup clients so they can return. */
 	list_for_each_entry_safe(req, tmp, &vcp->vc_pending, uc_chain) {
 		list_del(&req->uc_chain);
 
 		/* Async requests need to be freed here */
-		if (req->uc_flags & REQ_ASYNC) {
+		if (req->uc_flags & CODA_REQ_ASYNC) {
 			CODA_FREE(req->uc_data, sizeof(struct coda_in_hdr));
 			kfree(req);
 			continue;
 		}
-		req->uc_flags |= REQ_ABORT;
+		req->uc_flags |= CODA_REQ_ABORT;
 		wake_up(&req->uc_sleep);
 	}
 
 	list_for_each_entry_safe(req, tmp, &vcp->vc_processing, uc_chain) {
 		list_del(&req->uc_chain);
 
-		req->uc_flags |= REQ_ABORT;
+		req->uc_flags |= CODA_REQ_ABORT;
 		wake_up(&req->uc_sleep);
 	}
 
 	file->private_data = NULL;
 	vcp->vc_inuse--;
-	unlock_kernel();
+	mutex_unlock(&vcp->vc_mutex);
 	return 0;
 }
 
@@ -346,6 +341,7 @@ static const struct file_operations coda_psdev_fops = {
 	.unlocked_ioctl	= coda_psdev_ioctl,
 	.open		= coda_psdev_open,
 	.release	= coda_psdev_release,
+	.llseek		= noop_llseek,
 };
 
 static int init_coda_psdev(void)
@@ -361,9 +357,11 @@ static int init_coda_psdev(void)
 		err = PTR_ERR(coda_psdev_class);
 		goto out_chrdev;
 	}		
-	for (i = 0; i < MAX_CODADEVS; i++)
+	for (i = 0; i < MAX_CODADEVS; i++) {
+		mutex_init(&(&coda_comms[i])->vc_mutex);
 		device_create(coda_psdev_class, NULL,
 			      MKDEV(CODA_PSDEV_MAJOR, i), NULL, "cfs%d", i);
+	}
 	coda_sysctl_init();
 	goto out;
 
diff --git a/fs/coda/symlink.c b/fs/coda/symlink.c
index 4513b725845..af78f007a2b 100644
--- a/fs/coda/symlink.c
+++ b/fs/coda/symlink.c
@@ -14,7 +14,6 @@
 #include <linux/stat.h>
 #include <linux/errno.h>
 #include <linux/pagemap.h>
-#include <linux/smp_lock.h>
 
 #include <linux/coda.h>
 #include <linux/coda_linux.h>
@@ -29,11 +28,9 @@ static int coda_symlink_filler(struct file *file, struct page *page)
 	unsigned int len = PAGE_SIZE;
 	char *p = kmap(page);
 
-	lock_kernel();
 	cii = ITOC(inode);
 
 	error = venus_readlink(inode->i_sb, &cii->c_fid, p, &len);
-	unlock_kernel();
 	if (error)
 		goto fail;
 	SetPageUptodate(page);
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index f09c5ed76f6..c3563cab975 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -27,6 +27,7 @@
 #include <linux/errno.h>
 #include <linux/string.h>
 #include <linux/slab.h>
+#include <linux/mutex.h>
 #include <asm/uaccess.h>
 #include <linux/vmalloc.h>
 #include <linux/vfs.h>
@@ -604,9 +605,10 @@ static void coda_unblock_signals(sigset_t *old)
 			       (((r)->uc_opcode != CODA_CLOSE && \
 				 (r)->uc_opcode != CODA_STORE && \
 				 (r)->uc_opcode != CODA_RELEASE) || \
-				(r)->uc_flags & REQ_READ))
+				(r)->uc_flags & CODA_REQ_READ))
 
-static inline void coda_waitfor_upcall(struct upc_req *req)
+static inline void coda_waitfor_upcall(struct venus_comm *vcp,
+				       struct upc_req *req)
 {
 	DECLARE_WAITQUEUE(wait, current);
 	unsigned long timeout = jiffies + coda_timeout * HZ;
@@ -624,7 +626,7 @@ static inline void coda_waitfor_upcall(struct upc_req *req)
 			set_current_state(TASK_UNINTERRUPTIBLE);
 
 		/* got a reply */
-		if (req->uc_flags & (REQ_WRITE | REQ_ABORT))
+		if (req->uc_flags & (CODA_REQ_WRITE | CODA_REQ_ABORT))
 			break;
 
 		if (blocked && time_after(jiffies, timeout) &&
@@ -639,10 +641,12 @@ static inline void coda_waitfor_upcall(struct upc_req *req)
 			break;
 		}
 
+		mutex_unlock(&vcp->vc_mutex);
 		if (blocked)
 			schedule_timeout(HZ);
 		else
 			schedule();
+		mutex_lock(&vcp->vc_mutex);
 	}
 	if (blocked)
 		coda_unblock_signals(&old);
@@ -667,18 +671,23 @@ static int coda_upcall(struct venus_comm *vcp,
 {
 	union outputArgs *out;
 	union inputArgs *sig_inputArgs;
-	struct upc_req *req, *sig_req;
-	int error = 0;
+	struct upc_req *req = NULL, *sig_req;
+	int error;
+
+	mutex_lock(&vcp->vc_mutex);
 
 	if (!vcp->vc_inuse) {
 		printk(KERN_NOTICE "coda: Venus dead, not sending upcall\n");
-		return -ENXIO;
+		error = -ENXIO;
+		goto exit;
 	}
 
 	/* Format the request message. */
 	req = kmalloc(sizeof(struct upc_req), GFP_KERNEL);
-	if (!req)
-		return -ENOMEM;
+	if (!req) {
+		error = -ENOMEM;
+		goto exit;
+	}
 
 	req->uc_data = (void *)buffer;
 	req->uc_flags = 0;
@@ -705,10 +714,10 @@ static int coda_upcall(struct venus_comm *vcp,
 	 * ENODEV.  */
 
 	/* Go to sleep.  Wake up on signals only after the timeout. */
-	coda_waitfor_upcall(req);
+	coda_waitfor_upcall(vcp, req);
 
 	/* Op went through, interrupt or not... */
-	if (req->uc_flags & REQ_WRITE) {
+	if (req->uc_flags & CODA_REQ_WRITE) {
 		out = (union outputArgs *)req->uc_data;
 		/* here we map positive Venus errors to kernel errors */
 		error = -out->oh.result;
@@ -717,13 +726,13 @@ static int coda_upcall(struct venus_comm *vcp,
 	}
 
 	error = -EINTR;
-	if ((req->uc_flags & REQ_ABORT) || !signal_pending(current)) {
+	if ((req->uc_flags & CODA_REQ_ABORT) || !signal_pending(current)) {
 		printk(KERN_WARNING "coda: Unexpected interruption.\n");
 		goto exit;
 	}
 
 	/* Interrupted before venus read it. */
-	if (!(req->uc_flags & REQ_READ))
+	if (!(req->uc_flags & CODA_REQ_READ))
 		goto exit;
 
 	/* Venus saw the upcall, make sure we can send interrupt signal */
@@ -747,7 +756,7 @@ static int coda_upcall(struct venus_comm *vcp,
 	sig_inputArgs->ih.opcode = CODA_SIGNAL;
 	sig_inputArgs->ih.unique = req->uc_unique;
 
-	sig_req->uc_flags = REQ_ASYNC;
+	sig_req->uc_flags = CODA_REQ_ASYNC;
 	sig_req->uc_opcode = sig_inputArgs->ih.opcode;
 	sig_req->uc_unique = sig_inputArgs->ih.unique;
 	sig_req->uc_inSize = sizeof(struct coda_in_hdr);
@@ -759,6 +768,7 @@ static int coda_upcall(struct venus_comm *vcp,
 
 exit:
 	kfree(req);
+	mutex_unlock(&vcp->vc_mutex);
 	return error;
 }
 
@@ -796,21 +806,24 @@ exit:
  *
  * CODA_REPLACE -- replace one CodaFid with another throughout the name cache */
 
-int coda_downcall(int opcode, union outputArgs * out, struct super_block *sb)
+int coda_downcall(struct venus_comm *vcp, int opcode, union outputArgs *out)
 {
 	struct inode *inode = NULL;
-	struct CodaFid *fid, *newfid;
+	struct CodaFid *fid = NULL, *newfid;
+	struct super_block *sb;
 
 	/* Handle invalidation requests. */
-	if ( !sb || !sb->s_root)
-		return 0;
+	mutex_lock(&vcp->vc_mutex);
+	sb = vcp->vc_sb;
+	if (!sb || !sb->s_root)
+		goto unlock_out;
 
 	switch (opcode) {
 	case CODA_FLUSH:
 		coda_cache_clear_all(sb);
 		shrink_dcache_sb(sb);
 		if (sb->s_root->d_inode)
-		    coda_flag_inode(sb->s_root->d_inode, C_FLUSH);
+			coda_flag_inode(sb->s_root->d_inode, C_FLUSH);
 		break;
 
 	case CODA_PURGEUSER:
@@ -819,45 +832,53 @@ int coda_downcall(int opcode, union outputArgs * out, struct super_block *sb)
 
 	case CODA_ZAPDIR:
 		fid = &out->coda_zapdir.CodaFid;
-		inode = coda_fid_to_inode(fid, sb);
-		if (inode) {
-			coda_flag_inode_children(inode, C_PURGE);
-			coda_flag_inode(inode, C_VATTR);
-		}
 		break;
 
 	case CODA_ZAPFILE:
 		fid = &out->coda_zapfile.CodaFid;
-		inode = coda_fid_to_inode(fid, sb);
-		if (inode)
-			coda_flag_inode(inode, C_VATTR);
 		break;
 
 	case CODA_PURGEFID:
 		fid = &out->coda_purgefid.CodaFid;
+		break;
+
+	case CODA_REPLACE:
+		fid = &out->coda_replace.OldFid;
+		break;
+	}
+	if (fid)
 		inode = coda_fid_to_inode(fid, sb);
-		if (inode) {
-			coda_flag_inode_children(inode, C_PURGE);
 
-			/* catch the dentries later if some are still busy */
-			coda_flag_inode(inode, C_PURGE);
-			d_prune_aliases(inode);
+unlock_out:
+	mutex_unlock(&vcp->vc_mutex);
 
-		}
+	if (!inode)
+		return 0;
+
+	switch (opcode) {
+	case CODA_ZAPDIR:
+		coda_flag_inode_children(inode, C_PURGE);
+		coda_flag_inode(inode, C_VATTR);
+		break;
+
+	case CODA_ZAPFILE:
+		coda_flag_inode(inode, C_VATTR);
+		break;
+
+	case CODA_PURGEFID:
+		coda_flag_inode_children(inode, C_PURGE);
+
+		/* catch the dentries later if some are still busy */
+		coda_flag_inode(inode, C_PURGE);
+		d_prune_aliases(inode);
 		break;
 
 	case CODA_REPLACE:
-		fid = &out->coda_replace.OldFid;
 		newfid = &out->coda_replace.NewFid;
-		inode = coda_fid_to_inode(fid, sb);
-		if (inode)
-			coda_replace_fid(inode, fid, newfid);
+		coda_replace_fid(inode, fid, newfid);
 		break;
 	}
-
-	if (inode)
-		iput(inode);
-
+	iput(inode);
 	return 0;
 }
 
diff --git a/fs/compat.c b/fs/compat.c
index 6490d2134ff..c580c322fa6 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -8,13 +8,14 @@
  *  Copyright (C) 1997-2000  Jakub Jelinek  (jakub@redhat.com)
  *  Copyright (C) 1998       Eddie C. Dost  (ecd@skynet.be)
  *  Copyright (C) 2001,2002  Andi Kleen, SuSE Labs 
- *  Copyright (C) 2003       Pavel Machek (pavel@suse.cz)
+ *  Copyright (C) 2003       Pavel Machek (pavel@ucw.cz)
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License version 2 as
  *  published by the Free Software Foundation.
  */
 
+#include <linux/stddef.h>
 #include <linux/kernel.h>
 #include <linux/linkage.h>
 #include <linux/compat.h>
@@ -28,8 +29,6 @@
 #include <linux/vfs.h>
 #include <linux/ioctl.h>
 #include <linux/init.h>
-#include <linux/smb.h>
-#include <linux/smb_mount.h>
 #include <linux/ncp_mount.h>
 #include <linux/nfs4_mount.h>
 #include <linux/syscalls.h>
@@ -50,6 +49,7 @@
 #include <linux/eventpoll.h>
 #include <linux/fs_struct.h>
 #include <linux/slab.h>
+#include <linux/pagemap.h>
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
@@ -76,7 +76,8 @@ int compat_printk(const char *fmt, ...)
  * Not all architectures have sys_utime, so implement this in terms
  * of sys_utimes.
  */
-asmlinkage long compat_sys_utime(char __user *filename, struct compat_utimbuf __user *t)
+asmlinkage long compat_sys_utime(const char __user *filename,
+				 struct compat_utimbuf __user *t)
 {
 	struct timespec tv[2];
 
@@ -90,7 +91,7 @@ asmlinkage long compat_sys_utime(char __user *filename, struct compat_utimbuf __
 	return do_utimes(AT_FDCWD, filename, t ? tv : NULL, 0);
 }
 
-asmlinkage long compat_sys_utimensat(unsigned int dfd, char __user *filename, struct compat_timespec __user *t, int flags)
+asmlinkage long compat_sys_utimensat(unsigned int dfd, const char __user *filename, struct compat_timespec __user *t, int flags)
 {
 	struct timespec tv[2];
 
@@ -105,7 +106,7 @@ asmlinkage long compat_sys_utimensat(unsigned int dfd, char __user *filename, st
 	return do_utimes(dfd, filename, t ? tv : NULL, flags);
 }
 
-asmlinkage long compat_sys_futimesat(unsigned int dfd, char __user *filename, struct compat_timeval __user *t)
+asmlinkage long compat_sys_futimesat(unsigned int dfd, const char __user *filename, struct compat_timeval __user *t)
 {
 	struct timespec tv[2];
 
@@ -124,7 +125,7 @@ asmlinkage long compat_sys_futimesat(unsigned int dfd, char __user *filename, st
 	return do_utimes(dfd, filename, t ? tv : NULL, 0);
 }
 
-asmlinkage long compat_sys_utimes(char __user *filename, struct compat_timeval __user *t)
+asmlinkage long compat_sys_utimes(const char __user *filename, struct compat_timeval __user *t)
 {
 	return compat_sys_futimesat(AT_FDCWD, filename, t);
 }
@@ -168,7 +169,7 @@ static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf)
 	return err;
 }
 
-asmlinkage long compat_sys_newstat(char __user * filename,
+asmlinkage long compat_sys_newstat(const char __user * filename,
 		struct compat_stat __user *statbuf)
 {
 	struct kstat stat;
@@ -180,7 +181,7 @@ asmlinkage long compat_sys_newstat(char __user * filename,
 	return cp_compat_stat(&stat, statbuf);
 }
 
-asmlinkage long compat_sys_newlstat(char __user * filename,
+asmlinkage long compat_sys_newlstat(const char __user * filename,
 		struct compat_stat __user *statbuf)
 {
 	struct kstat stat;
@@ -193,7 +194,8 @@ asmlinkage long compat_sys_newlstat(char __user * filename,
 }
 
 #ifndef __ARCH_WANT_STAT64
-asmlinkage long compat_sys_newfstatat(unsigned int dfd, char __user *filename,
+asmlinkage long compat_sys_newfstatat(unsigned int dfd,
+		const char __user *filename,
 		struct compat_stat __user *statbuf, int flag)
 {
 	struct kstat stat;
@@ -266,7 +268,7 @@ asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_sta
 	error = user_path(pathname, &path);
 	if (!error) {
 		struct kstatfs tmp;
-		error = vfs_statfs(path.dentry, &tmp);
+		error = vfs_statfs(&path, &tmp);
 		if (!error)
 			error = put_compat_statfs(buf, &tmp);
 		path_put(&path);
@@ -284,7 +286,7 @@ asmlinkage long compat_sys_fstatfs(unsigned int fd, struct compat_statfs __user
 	file = fget(fd);
 	if (!file)
 		goto out;
-	error = vfs_statfs(file->f_path.dentry, &tmp);
+	error = vfs_statfs(&file->f_path, &tmp);
 	if (!error)
 		error = put_compat_statfs(buf, &tmp);
 	fput(file);
@@ -334,7 +336,7 @@ asmlinkage long compat_sys_statfs64(const char __user *pathname, compat_size_t s
 	error = user_path(pathname, &path);
 	if (!error) {
 		struct kstatfs tmp;
-		error = vfs_statfs(path.dentry, &tmp);
+		error = vfs_statfs(&path, &tmp);
 		if (!error)
 			error = put_compat_statfs64(buf, &tmp);
 		path_put(&path);
@@ -355,7 +357,7 @@ asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct c
 	file = fget(fd);
 	if (!file)
 		goto out;
-	error = vfs_statfs(file->f_path.dentry, &tmp);
+	error = vfs_statfs(&file->f_path, &tmp);
 	if (!error)
 		error = put_compat_statfs64(buf, &tmp);
 	fput(file);
@@ -378,7 +380,7 @@ asmlinkage long compat_sys_ustat(unsigned dev, struct compat_ustat __user *u)
 	sb = user_get_super(new_decode_dev(dev));
 	if (!sb)
 		return -EINVAL;
-	err = vfs_statfs(sb->s_root, &sbuf);
+	err = statfs_by_dentry(sb->s_root, &sbuf);
 	drop_super(sb);
 	if (err)
 		return err;
@@ -605,14 +607,14 @@ ssize_t compat_rw_copy_check_uvector(int type,
 	/*
 	 * Single unix specification:
 	 * We should -EINVAL if an element length is not >= 0 and fitting an
-	 * ssize_t.  The total length is fitting an ssize_t
+	 * ssize_t.
 	 *
-	 * Be careful here because iov_len is a size_t not an ssize_t
+	 * In Linux, the total length is limited to MAX_RW_COUNT, there is
+	 * no overflow possibility.
 	 */
 	tot_len = 0;
 	ret = -EINVAL;
 	for (seg = 0; seg < nr_segs; seg++) {
-		compat_ssize_t tmp = tot_len;
 		compat_uptr_t buf;
 		compat_ssize_t len;
 
@@ -623,13 +625,13 @@ ssize_t compat_rw_copy_check_uvector(int type,
 		}
 		if (len < 0)	/* size_t not fitting in compat_ssize_t .. */
 			goto out;
-		tot_len += len;
-		if (tot_len < tmp) /* maths overflow on the compat_ssize_t */
-			goto out;
 		if (!access_ok(vrfy_dir(type), compat_ptr(buf), len)) {
 			ret = -EFAULT;
 			goto out;
 		}
+		if (len > MAX_RW_COUNT - tot_len)
+			len = MAX_RW_COUNT - tot_len;
+		tot_len += len;
 		iov->iov_base = compat_ptr(buf);
 		iov->iov_len = (compat_size_t) len;
 		uvector++;
@@ -742,30 +744,6 @@ static void *do_ncp_super_data_conv(void *raw_data)
 	return raw_data;
 }
 
-struct compat_smb_mount_data {
-	compat_int_t version;
-	__compat_uid_t mounted_uid;
-	__compat_uid_t uid;
-	__compat_gid_t gid;
-	compat_mode_t file_mode;
-	compat_mode_t dir_mode;
-};
-
-static void *do_smb_super_data_conv(void *raw_data)
-{
-	struct smb_mount_data *s = raw_data;
-	struct compat_smb_mount_data *c_s = raw_data;
-
-	if (c_s->version != SMB_MOUNT_OLDVERSION)
-		goto out;
-	s->dir_mode = c_s->dir_mode;
-	s->file_mode = c_s->file_mode;
-	s->gid = c_s->gid;
-	s->uid = c_s->uid;
-	s->mounted_uid = c_s->mounted_uid;
- out:
-	return raw_data;
-}
 
 struct compat_nfs_string {
 	compat_uint_t len;
@@ -832,13 +810,13 @@ static int do_nfs4_super_data_conv(void *raw_data)
 	return 0;
 }
 
-#define SMBFS_NAME      "smbfs"
 #define NCPFS_NAME      "ncpfs"
 #define NFS4_NAME	"nfs4"
 
-asmlinkage long compat_sys_mount(char __user * dev_name, char __user * dir_name,
-				 char __user * type, unsigned long flags,
-				 void __user * data)
+asmlinkage long compat_sys_mount(const char __user * dev_name,
+				 const char __user * dir_name,
+				 const char __user * type, unsigned long flags,
+				 const void __user * data)
 {
 	char *kernel_type;
 	unsigned long data_page;
@@ -866,9 +844,7 @@ asmlinkage long compat_sys_mount(char __user * dev_name, char __user * dir_name,
 	retval = -EINVAL;
 
 	if (kernel_type && data_page) {
-		if (!strcmp(kernel_type, SMBFS_NAME)) {
-			do_smb_super_data_conv((void *)data_page);
-		} else if (!strcmp(kernel_type, NCPFS_NAME)) {
+		if (!strcmp(kernel_type, NCPFS_NAME)) {
 			do_ncp_super_data_conv((void *)data_page);
 		} else if (!strcmp(kernel_type, NFS4_NAME)) {
 			if (do_nfs4_super_data_conv((void *) data_page))
@@ -891,8 +867,6 @@ asmlinkage long compat_sys_mount(char __user * dev_name, char __user * dir_name,
 	return retval;
 }
 
-#define NAME_OFFSET(de) ((int) ((de)->d_name - (char __user *) (de)))
-
 struct compat_old_linux_dirent {
 	compat_ulong_t	d_ino;
 	compat_ulong_t	d_offset;
@@ -981,7 +955,8 @@ static int compat_filldir(void *__buf, const char *name, int namlen,
 	struct compat_linux_dirent __user * dirent;
 	struct compat_getdents_callback *buf = __buf;
 	compat_ulong_t d_ino;
-	int reclen = ALIGN(NAME_OFFSET(dirent) + namlen + 2, sizeof(compat_long_t));
+	int reclen = ALIGN(offsetof(struct compat_linux_dirent, d_name) +
+		namlen + 2, sizeof(compat_long_t));
 
 	buf->error = -EINVAL;	/* only used if we fail.. */
 	if (reclen > buf->count)
@@ -1068,8 +1043,8 @@ static int compat_filldir64(void * __buf, const char * name, int namlen, loff_t
 {
 	struct linux_dirent64 __user *dirent;
 	struct compat_getdents_callback64 *buf = __buf;
-	int jj = NAME_OFFSET(dirent);
-	int reclen = ALIGN(jj + namlen + 1, sizeof(u64));
+	int reclen = ALIGN(offsetof(struct linux_dirent64, d_name) + namlen + 1,
+		sizeof(u64));
 	u64 off;
 
 	buf->error = -EINVAL;	/* only used if we fail.. */
@@ -1150,7 +1125,7 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
 {
 	compat_ssize_t tot_len;
 	struct iovec iovstack[UIO_FASTIOV];
-	struct iovec *iov;
+	struct iovec *iov = iovstack;
 	ssize_t ret;
 	io_fn_t fn;
 	iov_fn_t fnv;
@@ -1193,11 +1168,10 @@ out:
 	if (iov != iovstack)
 		kfree(iov);
 	if ((ret + (type == READ)) > 0) {
-		struct dentry *dentry = file->f_path.dentry;
 		if (type == READ)
-			fsnotify_access(dentry);
+			fsnotify_access(file);
 		else
-			fsnotify_modify(dentry);
+			fsnotify_modify(file);
 	}
 	return ret;
 }
@@ -1961,7 +1935,7 @@ asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds,
 }
 #endif /* HAVE_SET_RESTORE_SIGMASK */
 
-#if defined(CONFIG_NFSD) || defined(CONFIG_NFSD_MODULE)
+#if (defined(CONFIG_NFSD) || defined(CONFIG_NFSD_MODULE)) && !defined(CONFIG_NFSD_DEPRECATED)
 /* Stuff for NFS server syscalls... */
 struct compat_nfsctl_svc {
 	u16			svc32_port;
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 641640dc7ae..410ed188faa 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -4,7 +4,7 @@
  * Copyright (C) 1997-2000  Jakub Jelinek  (jakub@redhat.com)
  * Copyright (C) 1998  Eddie C. Dost  (ecd@skynet.be)
  * Copyright (C) 2001,2002  Andi Kleen, SuSE Labs 
- * Copyright (C) 2003       Pavel Machek (pavel@suse.cz)
+ * Copyright (C) 2003       Pavel Machek (pavel@ucw.cz)
  *
  * These routines maintain argument size conversion between 32bit and 64bit
  * ioctls.
@@ -46,7 +46,6 @@
 #include <linux/videodev.h>
 #include <linux/netdevice.h>
 #include <linux/raw.h>
-#include <linux/smb_fs.h>
 #include <linux/blkdev.h>
 #include <linux/elevator.h>
 #include <linux/rtc.h>
@@ -131,23 +130,6 @@ static int w_long(unsigned int fd, unsigned int cmd,
 	return err;
 }
 
-static int rw_long(unsigned int fd, unsigned int cmd,
-		compat_ulong_t __user *argp)
-{
-	mm_segment_t old_fs = get_fs();
-	int err;
-	unsigned long val;
-
-	if(get_user(val, argp))
-		return -EFAULT;
-	set_fs (KERNEL_DS);
-	err = sys_ioctl(fd, cmd, (unsigned long)&val);
-	set_fs (old_fs);
-	if (!err && put_user(val, argp))
-		return -EFAULT;
-	return err;
-}
-
 struct compat_video_event {
 	int32_t		type;
 	compat_time_t	timestamp;
@@ -575,34 +557,12 @@ static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, void __user *argp)
 
 #endif /* CONFIG_BLOCK */
 
-static int do_smb_getmountuid(unsigned int fd, unsigned int cmd,
-			compat_uid_t __user *argp)
-{
-	mm_segment_t old_fs = get_fs();
-	__kernel_uid_t kuid;
-	int err;
-
-	cmd = SMB_IOC_GETMOUNTUID;
-
-	set_fs(KERNEL_DS);
-	err = sys_ioctl(fd, cmd, (unsigned long)&kuid);
-	set_fs(old_fs);
-
-	if (err >= 0)
-		err = put_user(kuid, argp);
-
-	return err;
-}
-
-static int ioc_settimeout(unsigned int fd, unsigned int cmd,
-		compat_ulong_t __user *argp)
-{
-	return rw_long(fd, AUTOFS_IOC_SETTIMEOUT, argp);
-}
-
 /* Bluetooth ioctls */
-#define HCIUARTSETPROTO	_IOW('U', 200, int)
-#define HCIUARTGETPROTO	_IOR('U', 201, int)
+#define HCIUARTSETPROTO		_IOW('U', 200, int)
+#define HCIUARTGETPROTO		_IOR('U', 201, int)
+#define HCIUARTGETDEVICE	_IOR('U', 202, int)
+#define HCIUARTSETFLAGS		_IOW('U', 203, int)
+#define HCIUARTGETFLAGS		_IOR('U', 204, int)
 
 #define BNEPCONNADD	_IOW('B', 200, int)
 #define BNEPCONNDEL	_IOW('B', 201, int)
@@ -619,69 +579,6 @@ static int ioc_settimeout(unsigned int fd, unsigned int cmd,
 #define HIDPGETCONNLIST	_IOR('H', 210, int)
 #define HIDPGETCONNINFO	_IOR('H', 211, int)
 
-#ifdef CONFIG_BLOCK
-struct raw32_config_request
-{
-        compat_int_t    raw_minor;
-        __u64   block_major;
-        __u64   block_minor;
-} __attribute__((packed));
-
-static int get_raw32_request(struct raw_config_request *req, struct raw32_config_request __user *user_req)
-{
-        int ret;
-
-        if (!access_ok(VERIFY_READ, user_req, sizeof(struct raw32_config_request)))
-                return -EFAULT;
-
-        ret = __get_user(req->raw_minor, &user_req->raw_minor);
-        ret |= __get_user(req->block_major, &user_req->block_major);
-        ret |= __get_user(req->block_minor, &user_req->block_minor);
-
-        return ret ? -EFAULT : 0;
-}
-
-static int set_raw32_request(struct raw_config_request *req, struct raw32_config_request __user *user_req)
-{
-	int ret;
-
-        if (!access_ok(VERIFY_WRITE, user_req, sizeof(struct raw32_config_request)))
-                return -EFAULT;
-
-        ret = __put_user(req->raw_minor, &user_req->raw_minor);
-        ret |= __put_user(req->block_major, &user_req->block_major);
-        ret |= __put_user(req->block_minor, &user_req->block_minor);
-
-        return ret ? -EFAULT : 0;
-}
-
-static int raw_ioctl(unsigned fd, unsigned cmd,
-		struct raw32_config_request __user *user_req)
-{
-        int ret;
-
-        switch (cmd) {
-        case RAW_SETBIND:
-	default: {	/* RAW_GETBIND */
-                struct raw_config_request req;
-                mm_segment_t oldfs = get_fs();
-
-                if ((ret = get_raw32_request(&req, user_req)))
-                        return ret;
-
-                set_fs(KERNEL_DS);
-                ret = sys_ioctl(fd,cmd,(unsigned long)&req);
-                set_fs(oldfs);
-
-                if ((!ret) && (cmd == RAW_GETBIND)) {
-                        ret = set_raw32_request(&req, user_req);
-                }
-                break;
-        }
-        }
-        return ret;
-}
-#endif /* CONFIG_BLOCK */
 
 struct serial_struct32 {
         compat_int_t    type;
@@ -966,6 +863,7 @@ COMPATIBLE_IOCTL(TIOCGPGRP)
 COMPATIBLE_IOCTL(TIOCGPTN)
 COMPATIBLE_IOCTL(TIOCSPTLCK)
 COMPATIBLE_IOCTL(TIOCSERGETLSR)
+COMPATIBLE_IOCTL(TIOCSIG)
 #ifdef TCGETS2
 COMPATIBLE_IOCTL(TCGETS2)
 COMPATIBLE_IOCTL(TCSETS2)
@@ -1281,18 +1179,9 @@ COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE5)
 COMPATIBLE_IOCTL(SOUND_MIXER_GETLEVELS)
 COMPATIBLE_IOCTL(SOUND_MIXER_SETLEVELS)
 COMPATIBLE_IOCTL(OSS_GETVERSION)
-/* AUTOFS */
-COMPATIBLE_IOCTL(AUTOFS_IOC_CATATONIC)
-COMPATIBLE_IOCTL(AUTOFS_IOC_PROTOVER)
-COMPATIBLE_IOCTL(AUTOFS_IOC_EXPIRE)
-COMPATIBLE_IOCTL(AUTOFS_IOC_EXPIRE_MULTI)
-COMPATIBLE_IOCTL(AUTOFS_IOC_PROTOSUBVER)
-COMPATIBLE_IOCTL(AUTOFS_IOC_ASKUMOUNT)
 /* Raw devices */
 COMPATIBLE_IOCTL(RAW_SETBIND)
 COMPATIBLE_IOCTL(RAW_GETBIND)
-/* SMB ioctls which do not need any translations */
-COMPATIBLE_IOCTL(SMB_IOC_NEWCONN)
 /* Watchdog */
 COMPATIBLE_IOCTL(WDIOC_GETSUPPORT)
 COMPATIBLE_IOCTL(WDIOC_GETSTATUS)
@@ -1328,6 +1217,8 @@ COMPATIBLE_IOCTL(HCISETLINKPOL)
 COMPATIBLE_IOCTL(HCISETLINKMODE)
 COMPATIBLE_IOCTL(HCISETACLMTU)
 COMPATIBLE_IOCTL(HCISETSCOMTU)
+COMPATIBLE_IOCTL(HCIBLOCKADDR)
+COMPATIBLE_IOCTL(HCIUNBLOCKADDR)
 COMPATIBLE_IOCTL(HCIINQUIRY)
 COMPATIBLE_IOCTL(HCIUARTSETPROTO)
 COMPATIBLE_IOCTL(HCIUARTGETPROTO)
@@ -1547,18 +1438,7 @@ static long do_ioctl_trans(int fd, unsigned int cmd,
 	case MTIOCGET32:
 	case MTIOCPOS32:
 		return mt_ioctl_trans(fd, cmd, argp);
-	/* Raw devices */
-	case RAW_SETBIND:
-	case RAW_GETBIND:
-		return raw_ioctl(fd, cmd, argp);
 #endif
-#define AUTOFS_IOC_SETTIMEOUT32 _IOWR(0x93,0x64,unsigned int)
-	case AUTOFS_IOC_SETTIMEOUT32:
-		return ioc_settimeout(fd, cmd, argp);
-	/* One SMB ioctl needs translations. */
-#define SMB_IOC_GETMOUNTUID_32 _IOR('u', 1, compat_uid_t)
-	case SMB_IOC_GETMOUNTUID_32:
-		return do_smb_getmountuid(fd, cmd, argp);
 	/* Serial */
 	case TIOCGSERIAL:
 	case TIOCSSERIAL:
@@ -1609,9 +1489,6 @@ static long do_ioctl_trans(int fd, unsigned int cmd,
 	case KDSKBMETA:
 	case KDSKBLED:
 	case KDSETLED:
-	/* AUTOFS */
-	case AUTOFS_IOC_READY:
-	case AUTOFS_IOC_FAIL:
 	/* NBD */
 	case NBD_SET_SOCK:
 	case NBD_SET_BLKSIZE:
@@ -1729,8 +1606,7 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,
 				goto out_fput;
 		}
 
-		if (!filp->f_op ||
-		    (!filp->f_op->ioctl && !filp->f_op->unlocked_ioctl))
+		if (!filp->f_op || !filp->f_op->unlocked_ioctl)
 			goto do_ioctl;
 		break;
 	}
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index cf78d44a8d6..253476d78ed 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -135,6 +135,7 @@ struct inode * configfs_new_inode(mode_t mode, struct configfs_dirent * sd)
 {
 	struct inode * inode = new_inode(configfs_sb);
 	if (inode) {
+		inode->i_ino = get_next_ino();
 		inode->i_mapping->a_ops = &configfs_aops;
 		inode->i_mapping->backing_dev_info = &configfs_backing_dev_info;
 		inode->i_op = &configfs_inode_operations;
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index 8c8d64230c2..7d3607febe1 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -104,16 +104,16 @@ static int configfs_fill_super(struct super_block *sb, void *data, int silent)
 	return 0;
 }
 
-static int configfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *configfs_do_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_single(fs_type, flags, data, configfs_fill_super, mnt);
+	return mount_single(fs_type, flags, data, configfs_fill_super);
 }
 
 static struct file_system_type configfs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "configfs",
-	.get_sb		= configfs_get_sb,
+	.mount		= configfs_do_mount,
 	.kill_sb	= kill_litter_super,
 };
 
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index dd3634e4c96..32fd5fe9ca0 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -39,66 +39,55 @@ static DEFINE_MUTEX(read_mutex);
 #define CRAMINO(x)	(((x)->offset && (x)->size)?(x)->offset<<2:1)
 #define OFFSET(x)	((x)->i_ino)
 
-
-static int cramfs_iget5_test(struct inode *inode, void *opaque)
-{
-	struct cramfs_inode *cramfs_inode = opaque;
-	return inode->i_ino == CRAMINO(cramfs_inode) && inode->i_ino != 1;
-}
-
-static int cramfs_iget5_set(struct inode *inode, void *opaque)
+static void setup_inode(struct inode *inode, struct cramfs_inode * cramfs_inode)
 {
-	struct cramfs_inode *cramfs_inode = opaque;
-	inode->i_ino = CRAMINO(cramfs_inode);
-	return 0;
+	static struct timespec zerotime;
+	inode->i_mode = cramfs_inode->mode;
+	inode->i_uid = cramfs_inode->uid;
+	inode->i_size = cramfs_inode->size;
+	inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1;
+	inode->i_gid = cramfs_inode->gid;
+	/* Struct copy intentional */
+	inode->i_mtime = inode->i_atime = inode->i_ctime = zerotime;
+	/* inode->i_nlink is left 1 - arguably wrong for directories,
+	   but it's the best we can do without reading the directory
+	   contents.  1 yields the right result in GNU find, even
+	   without -noleaf option. */
+	if (S_ISREG(inode->i_mode)) {
+		inode->i_fop = &generic_ro_fops;
+		inode->i_data.a_ops = &cramfs_aops;
+	} else if (S_ISDIR(inode->i_mode)) {
+		inode->i_op = &cramfs_dir_inode_operations;
+		inode->i_fop = &cramfs_directory_operations;
+	} else if (S_ISLNK(inode->i_mode)) {
+		inode->i_op = &page_symlink_inode_operations;
+		inode->i_data.a_ops = &cramfs_aops;
+	} else {
+		init_special_inode(inode, inode->i_mode,
+			old_decode_dev(cramfs_inode->size));
+	}
 }
 
 static struct inode *get_cramfs_inode(struct super_block *sb,
 				struct cramfs_inode * cramfs_inode)
 {
-	struct inode *inode = iget5_locked(sb, CRAMINO(cramfs_inode),
-					    cramfs_iget5_test, cramfs_iget5_set,
-					    cramfs_inode);
-	static struct timespec zerotime;
-
-	if (inode && (inode->i_state & I_NEW)) {
-		inode->i_mode = cramfs_inode->mode;
-		inode->i_uid = cramfs_inode->uid;
-		inode->i_size = cramfs_inode->size;
-		inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1;
-		inode->i_gid = cramfs_inode->gid;
-		/* Struct copy intentional */
-		inode->i_mtime = inode->i_atime = inode->i_ctime = zerotime;
-		/* inode->i_nlink is left 1 - arguably wrong for directories,
-		   but it's the best we can do without reading the directory
-		   contents.  1 yields the right result in GNU find, even
-		   without -noleaf option. */
-		if (S_ISREG(inode->i_mode)) {
-			inode->i_fop = &generic_ro_fops;
-			inode->i_data.a_ops = &cramfs_aops;
-		} else if (S_ISDIR(inode->i_mode)) {
-			inode->i_op = &cramfs_dir_inode_operations;
-			inode->i_fop = &cramfs_directory_operations;
-		} else if (S_ISLNK(inode->i_mode)) {
-			inode->i_op = &page_symlink_inode_operations;
-			inode->i_data.a_ops = &cramfs_aops;
-		} else {
-			init_special_inode(inode, inode->i_mode,
-				old_decode_dev(cramfs_inode->size));
+	struct inode *inode;
+	if (CRAMINO(cramfs_inode) == 1) {
+		inode = new_inode(sb);
+		if (inode) {
+			inode->i_ino = 1;
+			setup_inode(inode, cramfs_inode);
+		}
+	} else {
+		inode = iget_locked(sb, CRAMINO(cramfs_inode));
+		if (inode && (inode->i_state & I_NEW)) {
+			setup_inode(inode, cramfs_inode);
+			unlock_new_inode(inode);
 		}
-		unlock_new_inode(inode);
 	}
 	return inode;
 }
 
-static void cramfs_drop_inode(struct inode *inode)
-{
-	if (inode->i_ino == 1)
-		generic_delete_inode(inode);
-	else
-		generic_drop_inode(inode);
-}
-
 /*
  * We have our own block cache: don't fill up the buffer cache
  * with the rom-image, because the way the filesystem is set
@@ -542,20 +531,18 @@ static const struct super_operations cramfs_ops = {
 	.put_super	= cramfs_put_super,
 	.remount_fs	= cramfs_remount,
 	.statfs		= cramfs_statfs,
-	.drop_inode	= cramfs_drop_inode,
 };
 
-static int cramfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *cramfs_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, cramfs_fill_super,
-			   mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, cramfs_fill_super);
 }
 
 static struct file_system_type cramfs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "cramfs",
-	.get_sb		= cramfs_get_sb,
+	.mount		= cramfs_mount,
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
diff --git a/fs/dcache.c b/fs/dcache.c
index 86d4db15473..23702a9d4e6 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -67,33 +67,43 @@ struct dentry_stat_t dentry_stat = {
 	.age_limit = 45,
 };
 
-static void __d_free(struct dentry *dentry)
+static struct percpu_counter nr_dentry __cacheline_aligned_in_smp;
+static struct percpu_counter nr_dentry_unused __cacheline_aligned_in_smp;
+
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
+int proc_nr_dentry(ctl_table *table, int write, void __user *buffer,
+		   size_t *lenp, loff_t *ppos)
+{
+	dentry_stat.nr_dentry = percpu_counter_sum_positive(&nr_dentry);
+	dentry_stat.nr_unused = percpu_counter_sum_positive(&nr_dentry_unused);
+	return proc_dointvec(table, write, buffer, lenp, ppos);
+}
+#endif
+
+static void __d_free(struct rcu_head *head)
 {
+	struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu);
+
 	WARN_ON(!list_empty(&dentry->d_alias));
 	if (dname_external(dentry))
 		kfree(dentry->d_name.name);
 	kmem_cache_free(dentry_cache, dentry); 
 }
 
-static void d_callback(struct rcu_head *head)
-{
-	struct dentry * dentry = container_of(head, struct dentry, d_u.d_rcu);
-	__d_free(dentry);
-}
-
 /*
- * no dcache_lock, please.  The caller must decrement dentry_stat.nr_dentry
- * inside dcache_lock.
+ * no dcache_lock, please.
  */
 static void d_free(struct dentry *dentry)
 {
+	percpu_counter_dec(&nr_dentry);
 	if (dentry->d_op && dentry->d_op->d_release)
 		dentry->d_op->d_release(dentry);
+
 	/* if dentry was never inserted into hash, immediate free is OK */
 	if (hlist_unhashed(&dentry->d_hash))
-		__d_free(dentry);
+		__d_free(&dentry->d_u.d_rcu);
 	else
-		call_rcu(&dentry->d_u.d_rcu, d_callback);
+		call_rcu(&dentry->d_u.d_rcu, __d_free);
 }
 
 /*
@@ -123,37 +133,34 @@ static void dentry_iput(struct dentry * dentry)
 }
 
 /*
- * dentry_lru_(add|add_tail|del|del_init) must be called with dcache_lock held.
+ * dentry_lru_(add|del|move_tail) must be called with dcache_lock held.
  */
 static void dentry_lru_add(struct dentry *dentry)
 {
-	list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
-	dentry->d_sb->s_nr_dentry_unused++;
-	dentry_stat.nr_unused++;
-}
-
-static void dentry_lru_add_tail(struct dentry *dentry)
-{
-	list_add_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
-	dentry->d_sb->s_nr_dentry_unused++;
-	dentry_stat.nr_unused++;
+	if (list_empty(&dentry->d_lru)) {
+		list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
+		dentry->d_sb->s_nr_dentry_unused++;
+		percpu_counter_inc(&nr_dentry_unused);
+	}
 }
 
 static void dentry_lru_del(struct dentry *dentry)
 {
 	if (!list_empty(&dentry->d_lru)) {
-		list_del(&dentry->d_lru);
+		list_del_init(&dentry->d_lru);
 		dentry->d_sb->s_nr_dentry_unused--;
-		dentry_stat.nr_unused--;
+		percpu_counter_dec(&nr_dentry_unused);
 	}
 }
 
-static void dentry_lru_del_init(struct dentry *dentry)
+static void dentry_lru_move_tail(struct dentry *dentry)
 {
-	if (likely(!list_empty(&dentry->d_lru))) {
-		list_del_init(&dentry->d_lru);
-		dentry->d_sb->s_nr_dentry_unused--;
-		dentry_stat.nr_unused--;
+	if (list_empty(&dentry->d_lru)) {
+		list_add_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
+		dentry->d_sb->s_nr_dentry_unused++;
+		percpu_counter_inc(&nr_dentry_unused);
+	} else {
+		list_move_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
 	}
 }
 
@@ -172,7 +179,6 @@ static struct dentry *d_kill(struct dentry *dentry)
 	struct dentry *parent;
 
 	list_del(&dentry->d_u.d_child);
-	dentry_stat.nr_dentry--;	/* For d_free, below */
 	/*drops the locks, at that point nobody can reach this dentry */
 	dentry_iput(dentry);
 	if (IS_ROOT(dentry))
@@ -237,13 +243,15 @@ repeat:
 		if (dentry->d_op->d_delete(dentry))
 			goto unhash_it;
 	}
+
 	/* Unreachable? Get rid of it */
  	if (d_unhashed(dentry))
 		goto kill_it;
-  	if (list_empty(&dentry->d_lru)) {
-  		dentry->d_flags |= DCACHE_REFERENCED;
-		dentry_lru_add(dentry);
-  	}
+
+	/* Otherwise leave it cached and ensure it's on the LRU */
+	dentry->d_flags |= DCACHE_REFERENCED;
+	dentry_lru_add(dentry);
+
  	spin_unlock(&dentry->d_lock);
 	spin_unlock(&dcache_lock);
 	return;
@@ -318,11 +326,10 @@ int d_invalidate(struct dentry * dentry)
 EXPORT_SYMBOL(d_invalidate);
 
 /* This should be called _only_ with dcache_lock held */
-
 static inline struct dentry * __dget_locked(struct dentry *dentry)
 {
 	atomic_inc(&dentry->d_count);
-	dentry_lru_del_init(dentry);
+	dentry_lru_del(dentry);
 	return dentry;
 }
 
@@ -441,73 +448,27 @@ static void prune_one_dentry(struct dentry * dentry)
 
 		if (dentry->d_op && dentry->d_op->d_delete)
 			dentry->d_op->d_delete(dentry);
-		dentry_lru_del_init(dentry);
+		dentry_lru_del(dentry);
 		__d_drop(dentry);
 		dentry = d_kill(dentry);
 		spin_lock(&dcache_lock);
 	}
 }
 
-/*
- * Shrink the dentry LRU on a given superblock.
- * @sb   : superblock to shrink dentry LRU.
- * @count: If count is NULL, we prune all dentries on superblock.
- * @flags: If flags is non-zero, we need to do special processing based on
- * which flags are set. This means we don't need to maintain multiple
- * similar copies of this loop.
- */
-static void __shrink_dcache_sb(struct super_block *sb, int *count, int flags)
+static void shrink_dentry_list(struct list_head *list)
 {
-	LIST_HEAD(referenced);
-	LIST_HEAD(tmp);
 	struct dentry *dentry;
-	int cnt = 0;
 
-	BUG_ON(!sb);
-	BUG_ON((flags & DCACHE_REFERENCED) && count == NULL);
-	spin_lock(&dcache_lock);
-	if (count != NULL)
-		/* called from prune_dcache() and shrink_dcache_parent() */
-		cnt = *count;
-restart:
-	if (count == NULL)
-		list_splice_init(&sb->s_dentry_lru, &tmp);
-	else {
-		while (!list_empty(&sb->s_dentry_lru)) {
-			dentry = list_entry(sb->s_dentry_lru.prev,
-					struct dentry, d_lru);
-			BUG_ON(dentry->d_sb != sb);
+	while (!list_empty(list)) {
+		dentry = list_entry(list->prev, struct dentry, d_lru);
+		dentry_lru_del(dentry);
 
-			spin_lock(&dentry->d_lock);
-			/*
-			 * If we are honouring the DCACHE_REFERENCED flag and
-			 * the dentry has this flag set, don't free it. Clear
-			 * the flag and put it back on the LRU.
-			 */
-			if ((flags & DCACHE_REFERENCED)
-				&& (dentry->d_flags & DCACHE_REFERENCED)) {
-				dentry->d_flags &= ~DCACHE_REFERENCED;
-				list_move(&dentry->d_lru, &referenced);
-				spin_unlock(&dentry->d_lock);
-			} else {
-				list_move_tail(&dentry->d_lru, &tmp);
-				spin_unlock(&dentry->d_lock);
-				cnt--;
-				if (!cnt)
-					break;
-			}
-			cond_resched_lock(&dcache_lock);
-		}
-	}
-	while (!list_empty(&tmp)) {
-		dentry = list_entry(tmp.prev, struct dentry, d_lru);
-		dentry_lru_del_init(dentry);
-		spin_lock(&dentry->d_lock);
 		/*
 		 * We found an inuse dentry which was not removed from
 		 * the LRU because of laziness during lookup.  Do not free
 		 * it - just keep it off the LRU list.
 		 */
+		spin_lock(&dentry->d_lock);
 		if (atomic_read(&dentry->d_count)) {
 			spin_unlock(&dentry->d_lock);
 			continue;
@@ -516,13 +477,60 @@ restart:
 		/* dentry->d_lock was dropped in prune_one_dentry() */
 		cond_resched_lock(&dcache_lock);
 	}
-	if (count == NULL && !list_empty(&sb->s_dentry_lru))
-		goto restart;
-	if (count != NULL)
-		*count = cnt;
+}
+
+/**
+ * __shrink_dcache_sb - shrink the dentry LRU on a given superblock
+ * @sb:		superblock to shrink dentry LRU.
+ * @count:	number of entries to prune
+ * @flags:	flags to control the dentry processing
+ *
+ * If flags contains DCACHE_REFERENCED reference dentries will not be pruned.
+ */
+static void __shrink_dcache_sb(struct super_block *sb, int *count, int flags)
+{
+	/* called from prune_dcache() and shrink_dcache_parent() */
+	struct dentry *dentry;
+	LIST_HEAD(referenced);
+	LIST_HEAD(tmp);
+	int cnt = *count;
+
+	spin_lock(&dcache_lock);
+	while (!list_empty(&sb->s_dentry_lru)) {
+		dentry = list_entry(sb->s_dentry_lru.prev,
+				struct dentry, d_lru);
+		BUG_ON(dentry->d_sb != sb);
+
+		/*
+		 * If we are honouring the DCACHE_REFERENCED flag and the
+		 * dentry has this flag set, don't free it.  Clear the flag
+		 * and put it back on the LRU.
+		 */
+		if (flags & DCACHE_REFERENCED) {
+			spin_lock(&dentry->d_lock);
+			if (dentry->d_flags & DCACHE_REFERENCED) {
+				dentry->d_flags &= ~DCACHE_REFERENCED;
+				list_move(&dentry->d_lru, &referenced);
+				spin_unlock(&dentry->d_lock);
+				cond_resched_lock(&dcache_lock);
+				continue;
+			}
+			spin_unlock(&dentry->d_lock);
+		}
+
+		list_move_tail(&dentry->d_lru, &tmp);
+		if (!--cnt)
+			break;
+		cond_resched_lock(&dcache_lock);
+	}
+
+	*count = cnt;
+	shrink_dentry_list(&tmp);
+
 	if (!list_empty(&referenced))
 		list_splice(&referenced, &sb->s_dentry_lru);
 	spin_unlock(&dcache_lock);
+
 }
 
 /**
@@ -536,9 +544,9 @@ restart:
  */
 static void prune_dcache(int count)
 {
-	struct super_block *sb, *n;
+	struct super_block *sb, *p = NULL;
 	int w_count;
-	int unused = dentry_stat.nr_unused;
+	int unused = percpu_counter_sum_positive(&nr_dentry_unused);
 	int prune_ratio;
 	int pruned;
 
@@ -550,7 +558,7 @@ static void prune_dcache(int count)
 	else
 		prune_ratio = unused / count;
 	spin_lock(&sb_lock);
-	list_for_each_entry_safe(sb, n, &super_blocks, s_list) {
+	list_for_each_entry(sb, &super_blocks, s_list) {
 		if (list_empty(&sb->s_instances))
 			continue;
 		if (sb->s_nr_dentry_unused == 0)
@@ -590,14 +598,16 @@ static void prune_dcache(int count)
 			up_read(&sb->s_umount);
 		}
 		spin_lock(&sb_lock);
-		/* lock was dropped, must reset next */
-		list_safe_reset_next(sb, n, s_list);
+		if (p)
+			__put_super(p);
 		count -= pruned;
-		__put_super(sb);
+		p = sb;
 		/* more work left to do? */
 		if (count <= 0)
 			break;
 	}
+	if (p)
+		__put_super(p);
 	spin_unlock(&sb_lock);
 	spin_unlock(&dcache_lock);
 }
@@ -606,13 +616,19 @@ static void prune_dcache(int count)
  * shrink_dcache_sb - shrink dcache for a superblock
  * @sb: superblock
  *
- * Shrink the dcache for the specified super block. This
- * is used to free the dcache before unmounting a file
- * system
+ * Shrink the dcache for the specified super block. This is used to free
+ * the dcache before unmounting a file system.
  */
-void shrink_dcache_sb(struct super_block * sb)
+void shrink_dcache_sb(struct super_block *sb)
 {
-	__shrink_dcache_sb(sb, NULL, 0);
+	LIST_HEAD(tmp);
+
+	spin_lock(&dcache_lock);
+	while (!list_empty(&sb->s_dentry_lru)) {
+		list_splice_init(&sb->s_dentry_lru, &tmp);
+		shrink_dentry_list(&tmp);
+	}
+	spin_unlock(&dcache_lock);
 }
 EXPORT_SYMBOL(shrink_dcache_sb);
 
@@ -630,7 +646,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
 
 	/* detach this root from the system */
 	spin_lock(&dcache_lock);
-	dentry_lru_del_init(dentry);
+	dentry_lru_del(dentry);
 	__d_drop(dentry);
 	spin_unlock(&dcache_lock);
 
@@ -644,7 +660,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
 			spin_lock(&dcache_lock);
 			list_for_each_entry(loop, &dentry->d_subdirs,
 					    d_u.d_child) {
-				dentry_lru_del_init(loop);
+				dentry_lru_del(loop);
 				__d_drop(loop);
 				cond_resched_lock(&dcache_lock);
 			}
@@ -701,20 +717,13 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
 			 * otherwise we ascend to the parent and move to the
 			 * next sibling if there is one */
 			if (!parent)
-				goto out;
-
+				return;
 			dentry = parent;
-
 		} while (list_empty(&dentry->d_subdirs));
 
 		dentry = list_entry(dentry->d_subdirs.next,
 				    struct dentry, d_u.d_child);
 	}
-out:
-	/* several dentries were freed, need to correct nr_dentry */
-	spin_lock(&dcache_lock);
-	dentry_stat.nr_dentry -= detached;
-	spin_unlock(&dcache_lock);
 }
 
 /*
@@ -828,14 +837,15 @@ resume:
 		struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
 		next = tmp->next;
 
-		dentry_lru_del_init(dentry);
 		/* 
 		 * move only zero ref count dentries to the end 
 		 * of the unused list for prune_dcache
 		 */
 		if (!atomic_read(&dentry->d_count)) {
-			dentry_lru_add_tail(dentry);
+			dentry_lru_move_tail(dentry);
 			found++;
+		} else {
+			dentry_lru_del(dentry);
 		}
 
 		/*
@@ -898,12 +908,16 @@ EXPORT_SYMBOL(shrink_dcache_parent);
  */
 static int shrink_dcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
 {
+	int nr_unused;
+
 	if (nr) {
 		if (!(gfp_mask & __GFP_FS))
 			return -1;
 		prune_dcache(nr);
 	}
-	return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
+
+	nr_unused = percpu_counter_sum_positive(&nr_dentry_unused);
+	return (nr_unused / 100) * sysctl_vfs_cache_pressure;
 }
 
 static struct shrinker dcache_shrinker = {
@@ -970,9 +984,10 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
 	spin_lock(&dcache_lock);
 	if (parent)
 		list_add(&dentry->d_u.d_child, &parent->d_subdirs);
-	dentry_stat.nr_dentry++;
 	spin_unlock(&dcache_lock);
 
+	percpu_counter_inc(&nr_dentry);
+
 	return dentry;
 }
 EXPORT_SYMBOL(d_alloc);
@@ -1330,31 +1345,13 @@ EXPORT_SYMBOL(d_add_ci);
  * d_lookup - search for a dentry
  * @parent: parent dentry
  * @name: qstr of name we wish to find
+ * Returns: dentry, or NULL
  *
- * Searches the children of the parent dentry for the name in question. If
- * the dentry is found its reference count is incremented and the dentry
- * is returned. The caller must use dput to free the entry when it has
- * finished using it. %NULL is returned on failure.
- *
- * __d_lookup is dcache_lock free. The hash list is protected using RCU.
- * Memory barriers are used while updating and doing lockless traversal. 
- * To avoid races with d_move while rename is happening, d_lock is used.
- *
- * Overflows in memcmp(), while d_move, are avoided by keeping the length
- * and name pointer in one structure pointed by d_qstr.
- *
- * rcu_read_lock() and rcu_read_unlock() are used to disable preemption while
- * lookup is going on.
- *
- * The dentry unused LRU is not updated even if lookup finds the required dentry
- * in there. It is updated in places such as prune_dcache, shrink_dcache_sb,
- * select_parent and __dget_locked. This laziness saves lookup from dcache_lock
- * acquisition.
- *
- * d_lookup() is protected against the concurrent renames in some unrelated
- * directory using the seqlockt_t rename_lock.
+ * d_lookup searches the children of the parent dentry for the name in
+ * question. If the dentry is found its reference count is incremented and the
+ * dentry is returned. The caller must use dput to free the entry when it has
+ * finished using it. %NULL is returned if the dentry does not exist.
  */
-
 struct dentry * d_lookup(struct dentry * parent, struct qstr * name)
 {
 	struct dentry * dentry = NULL;
@@ -1370,6 +1367,21 @@ struct dentry * d_lookup(struct dentry * parent, struct qstr * name)
 }
 EXPORT_SYMBOL(d_lookup);
 
+/*
+ * __d_lookup - search for a dentry (racy)
+ * @parent: parent dentry
+ * @name: qstr of name we wish to find
+ * Returns: dentry, or NULL
+ *
+ * __d_lookup is like d_lookup, however it may (rarely) return a
+ * false-negative result due to unrelated rename activity.
+ *
+ * __d_lookup is slightly faster by avoiding rename_lock read seqlock,
+ * however it must be used carefully, eg. with a following d_lookup in
+ * the case of failure.
+ *
+ * __d_lookup callers must be commented.
+ */
 struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
 {
 	unsigned int len = name->len;
@@ -1380,6 +1392,19 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
 	struct hlist_node *node;
 	struct dentry *dentry;
 
+	/*
+	 * The hash list is protected using RCU.
+	 *
+	 * Take d_lock when comparing a candidate dentry, to avoid races
+	 * with d_move().
+	 *
+	 * It is possible that concurrent renames can mess up our list
+	 * walk here and result in missing our dentry, resulting in the
+	 * false-negative result. d_lookup() protects against concurrent
+	 * renames using rename_lock seqlock.
+	 *
+	 * See Documentation/vfs/dcache-locking.txt for more details.
+	 */
 	rcu_read_lock();
 	
 	hlist_for_each_entry_rcu(dentry, node, head, d_hash) {
@@ -1394,8 +1419,8 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
 
 		/*
 		 * Recheck the dentry after taking the lock - d_move may have
-		 * changed things.  Don't bother checking the hash because we're
-		 * about to compare the whole name anyway.
+		 * changed things. Don't bother checking the hash because
+		 * we're about to compare the whole name anyway.
 		 */
 		if (dentry->d_parent != parent)
 			goto next;
@@ -1466,33 +1491,26 @@ out:
  * This is used by ncpfs in its readdir implementation.
  * Zero is returned in the dentry is invalid.
  */
- 
-int d_validate(struct dentry *dentry, struct dentry *dparent)
+int d_validate(struct dentry *dentry, struct dentry *parent)
 {
-	struct hlist_head *base;
-	struct hlist_node *lhp;
+	struct hlist_head *head = d_hash(parent, dentry->d_name.hash);
+	struct hlist_node *node;
+	struct dentry *d;
 
 	/* Check whether the ptr might be valid at all.. */
 	if (!kmem_ptr_validate(dentry_cache, dentry))
-		goto out;
-
-	if (dentry->d_parent != dparent)
-		goto out;
+		return 0;
+	if (dentry->d_parent != parent)
+		return 0;
 
-	spin_lock(&dcache_lock);
-	base = d_hash(dparent, dentry->d_name.hash);
-	hlist_for_each(lhp,base) { 
-		/* hlist_for_each_entry_rcu() not required for d_hash list
-		 * as it is parsed under dcache_lock
-		 */
-		if (dentry == hlist_entry(lhp, struct dentry, d_hash)) {
-			__dget_locked(dentry);
-			spin_unlock(&dcache_lock);
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(d, node, head, d_hash) {
+		if (d == dentry) {
+			dget(dentry);
 			return 1;
 		}
 	}
-	spin_unlock(&dcache_lock);
-out:
+	rcu_read_unlock();
 	return 0;
 }
 EXPORT_SYMBOL(d_validate);
@@ -1903,48 +1921,30 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name)
 }
 
 /**
- * __d_path - return the path of a dentry
+ * Prepend path string to a buffer
+ *
  * @path: the dentry/vfsmount to report
  * @root: root vfsmnt/dentry (may be modified by this function)
- * @buffer: buffer to return value in
- * @buflen: buffer length
+ * @buffer: pointer to the end of the buffer
+ * @buflen: pointer to buffer length
  *
- * Convert a dentry into an ASCII path name. If the entry has been deleted
- * the string " (deleted)" is appended. Note that this is ambiguous.
- *
- * Returns a pointer into the buffer or an error code if the
- * path was too long.
- *
- * "buflen" should be positive. Caller holds the dcache_lock.
+ * Caller holds the dcache_lock.
  *
  * If path is not reachable from the supplied root, then the value of
  * root is changed (without modifying refcounts).
  */
-char *__d_path(const struct path *path, struct path *root,
-	       char *buffer, int buflen)
+static int prepend_path(const struct path *path, struct path *root,
+			char **buffer, int *buflen)
 {
 	struct dentry *dentry = path->dentry;
 	struct vfsmount *vfsmnt = path->mnt;
-	char *end = buffer + buflen;
-	char *retval;
-
-	spin_lock(&vfsmount_lock);
-	prepend(&end, &buflen, "\0", 1);
-	if (d_unlinked(dentry) &&
-		(prepend(&end, &buflen, " (deleted)", 10) != 0))
-			goto Elong;
-
-	if (buflen < 1)
-		goto Elong;
-	/* Get '/' right */
-	retval = end-1;
-	*retval = '/';
+	bool slash = false;
+	int error = 0;
 
-	for (;;) {
+	br_read_lock(vfsmount_lock);
+	while (dentry != root->dentry || vfsmnt != root->mnt) {
 		struct dentry * parent;
 
-		if (dentry == root->dentry && vfsmnt == root->mnt)
-			break;
 		if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) {
 			/* Global root? */
 			if (vfsmnt->mnt_parent == vfsmnt) {
@@ -1956,28 +1956,90 @@ char *__d_path(const struct path *path, struct path *root,
 		}
 		parent = dentry->d_parent;
 		prefetch(parent);
-		if ((prepend_name(&end, &buflen, &dentry->d_name) != 0) ||
-		    (prepend(&end, &buflen, "/", 1) != 0))
-			goto Elong;
-		retval = end;
+		error = prepend_name(buffer, buflen, &dentry->d_name);
+		if (!error)
+			error = prepend(buffer, buflen, "/", 1);
+		if (error)
+			break;
+
+		slash = true;
 		dentry = parent;
 	}
 
 out:
-	spin_unlock(&vfsmount_lock);
-	return retval;
+	if (!error && !slash)
+		error = prepend(buffer, buflen, "/", 1);
+
+	br_read_unlock(vfsmount_lock);
+	return error;
 
 global_root:
-	retval += 1;	/* hit the slash */
-	if (prepend_name(&retval, &buflen, &dentry->d_name) != 0)
-		goto Elong;
+	/*
+	 * Filesystems needing to implement special "root names"
+	 * should do so with ->d_dname()
+	 */
+	if (IS_ROOT(dentry) &&
+	    (dentry->d_name.len != 1 || dentry->d_name.name[0] != '/')) {
+		WARN(1, "Root dentry has weird name <%.*s>\n",
+		     (int) dentry->d_name.len, dentry->d_name.name);
+	}
 	root->mnt = vfsmnt;
 	root->dentry = dentry;
 	goto out;
+}
 
-Elong:
-	retval = ERR_PTR(-ENAMETOOLONG);
-	goto out;
+/**
+ * __d_path - return the path of a dentry
+ * @path: the dentry/vfsmount to report
+ * @root: root vfsmnt/dentry (may be modified by this function)
+ * @buf: buffer to return value in
+ * @buflen: buffer length
+ *
+ * Convert a dentry into an ASCII path name.
+ *
+ * Returns a pointer into the buffer or an error code if the
+ * path was too long.
+ *
+ * "buflen" should be positive.
+ *
+ * If path is not reachable from the supplied root, then the value of
+ * root is changed (without modifying refcounts).
+ */
+char *__d_path(const struct path *path, struct path *root,
+	       char *buf, int buflen)
+{
+	char *res = buf + buflen;
+	int error;
+
+	prepend(&res, &buflen, "\0", 1);
+	spin_lock(&dcache_lock);
+	error = prepend_path(path, root, &res, &buflen);
+	spin_unlock(&dcache_lock);
+
+	if (error)
+		return ERR_PTR(error);
+	return res;
+}
+
+/*
+ * same as __d_path but appends "(deleted)" for unlinked files.
+ */
+static int path_with_deleted(const struct path *path, struct path *root,
+				 char **buf, int *buflen)
+{
+	prepend(buf, buflen, "\0", 1);
+	if (d_unlinked(path->dentry)) {
+		int error = prepend(buf, buflen, " (deleted)", 10);
+		if (error)
+			return error;
+	}
+
+	return prepend_path(path, root, buf, buflen);
+}
+
+static int prepend_unreachable(char **buffer, int *buflen)
+{
+	return prepend(buffer, buflen, "(unreachable)", 13);
 }
 
 /**
@@ -1998,9 +2060,10 @@ Elong:
  */
 char *d_path(const struct path *path, char *buf, int buflen)
 {
-	char *res;
+	char *res = buf + buflen;
 	struct path root;
 	struct path tmp;
+	int error;
 
 	/*
 	 * We have various synthetic filesystems that never get mounted.  On
@@ -2012,19 +2075,51 @@ char *d_path(const struct path *path, char *buf, int buflen)
 	if (path->dentry->d_op && path->dentry->d_op->d_dname)
 		return path->dentry->d_op->d_dname(path->dentry, buf, buflen);
 
-	read_lock(&current->fs->lock);
-	root = current->fs->root;
-	path_get(&root);
-	read_unlock(&current->fs->lock);
+	get_fs_root(current->fs, &root);
 	spin_lock(&dcache_lock);
 	tmp = root;
-	res = __d_path(path, &tmp, buf, buflen);
+	error = path_with_deleted(path, &tmp, &res, &buflen);
+	if (error)
+		res = ERR_PTR(error);
 	spin_unlock(&dcache_lock);
 	path_put(&root);
 	return res;
 }
 EXPORT_SYMBOL(d_path);
 
+/**
+ * d_path_with_unreachable - return the path of a dentry
+ * @path: path to report
+ * @buf: buffer to return value in
+ * @buflen: buffer length
+ *
+ * The difference from d_path() is that this prepends "(unreachable)"
+ * to paths which are unreachable from the current process' root.
+ */
+char *d_path_with_unreachable(const struct path *path, char *buf, int buflen)
+{
+	char *res = buf + buflen;
+	struct path root;
+	struct path tmp;
+	int error;
+
+	if (path->dentry->d_op && path->dentry->d_op->d_dname)
+		return path->dentry->d_op->d_dname(path->dentry, buf, buflen);
+
+	get_fs_root(current->fs, &root);
+	spin_lock(&dcache_lock);
+	tmp = root;
+	error = path_with_deleted(path, &tmp, &res, &buflen);
+	if (!error && !path_equal(&tmp, &root))
+		error = prepend_unreachable(&res, &buflen);
+	spin_unlock(&dcache_lock);
+	path_put(&root);
+	if (error)
+		res =  ERR_PTR(error);
+
+	return res;
+}
+
 /*
  * Helper function for dentry_operations.d_dname() members
  */
@@ -2049,16 +2144,12 @@ char *dynamic_dname(struct dentry *dentry, char *buffer, int buflen,
 /*
  * Write full pathname from the root of the filesystem into the buffer.
  */
-char *dentry_path(struct dentry *dentry, char *buf, int buflen)
+char *__dentry_path(struct dentry *dentry, char *buf, int buflen)
 {
 	char *end = buf + buflen;
 	char *retval;
 
-	spin_lock(&dcache_lock);
 	prepend(&end, &buflen, "\0", 1);
-	if (d_unlinked(dentry) &&
-		(prepend(&end, &buflen, "//deleted", 9) != 0))
-			goto Elong;
 	if (buflen < 1)
 		goto Elong;
 	/* Get '/' right */
@@ -2076,7 +2167,28 @@ char *dentry_path(struct dentry *dentry, char *buf, int buflen)
 		retval = end;
 		dentry = parent;
 	}
+	return retval;
+Elong:
+	return ERR_PTR(-ENAMETOOLONG);
+}
+EXPORT_SYMBOL(__dentry_path);
+
+char *dentry_path(struct dentry *dentry, char *buf, int buflen)
+{
+	char *p = NULL;
+	char *retval;
+
+	spin_lock(&dcache_lock);
+	if (d_unlinked(dentry)) {
+		p = buf + buflen;
+		if (prepend(&p, &buflen, "//deleted", 10) != 0)
+			goto Elong;
+		buflen++;
+	}
+	retval = __dentry_path(dentry, buf, buflen);
 	spin_unlock(&dcache_lock);
+	if (!IS_ERR(retval) && p)
+		*p = '/';	/* restore '/' overriden with '\0' */
 	return retval;
 Elong:
 	spin_unlock(&dcache_lock);
@@ -2110,27 +2222,30 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
 	if (!page)
 		return -ENOMEM;
 
-	read_lock(&current->fs->lock);
-	pwd = current->fs->pwd;
-	path_get(&pwd);
-	root = current->fs->root;
-	path_get(&root);
-	read_unlock(&current->fs->lock);
+	get_fs_root_and_pwd(current->fs, &root, &pwd);
 
 	error = -ENOENT;
 	spin_lock(&dcache_lock);
 	if (!d_unlinked(pwd.dentry)) {
 		unsigned long len;
 		struct path tmp = root;
-		char * cwd;
+		char *cwd = page + PAGE_SIZE;
+		int buflen = PAGE_SIZE;
 
-		cwd = __d_path(&pwd, &tmp, page, PAGE_SIZE);
+		prepend(&cwd, &buflen, "\0", 1);
+		error = prepend_path(&pwd, &tmp, &cwd, &buflen);
 		spin_unlock(&dcache_lock);
 
-		error = PTR_ERR(cwd);
-		if (IS_ERR(cwd))
+		if (error)
 			goto out;
 
+		/* Unreachable from current root */
+		if (!path_equal(&tmp, &root)) {
+			error = prepend_unreachable(&cwd, &buflen);
+			if (error)
+				goto out;
+		}
+
 		error = -ERANGE;
 		len = PAGE_SIZE + page - cwd;
 		if (len <= size) {
@@ -2195,11 +2310,12 @@ int path_is_under(struct path *path1, struct path *path2)
 	struct vfsmount *mnt = path1->mnt;
 	struct dentry *dentry = path1->dentry;
 	int res;
-	spin_lock(&vfsmount_lock);
+
+	br_read_lock(vfsmount_lock);
 	if (mnt != path2->mnt) {
 		for (;;) {
 			if (mnt->mnt_parent == mnt) {
-				spin_unlock(&vfsmount_lock);
+				br_read_unlock(vfsmount_lock);
 				return 0;
 			}
 			if (mnt->mnt_parent == path2->mnt)
@@ -2209,7 +2325,7 @@ int path_is_under(struct path *path1, struct path *path2)
 		dentry = mnt->mnt_mountpoint;
 	}
 	res = is_subdir(dentry, path2->dentry);
-	spin_unlock(&vfsmount_lock);
+	br_read_unlock(vfsmount_lock);
 	return res;
 }
 EXPORT_SYMBOL(path_is_under);
@@ -2311,6 +2427,9 @@ static void __init dcache_init(void)
 {
 	int loop;
 
+	percpu_counter_init(&nr_dentry, 0);
+	percpu_counter_init(&nr_dentry_unused, 0);
+
 	/* 
 	 * A constructor could be added for stable state like the lists,
 	 * but it is probably not worth it because of the cache nature
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 0210898458b..89d394d8fe2 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -43,6 +43,7 @@ const struct file_operations debugfs_file_operations = {
 	.read =		default_read_file,
 	.write =	default_write_file,
 	.open =		default_open,
+	.llseek =	noop_llseek,
 };
 
 static void *debugfs_follow_link(struct dentry *dentry, struct nameidata *nd)
@@ -454,6 +455,7 @@ static const struct file_operations fops_bool = {
 	.read =		read_file_bool,
 	.write =	write_file_bool,
 	.open =		default_open,
+	.llseek =	default_llseek,
 };
 
 /**
@@ -498,6 +500,7 @@ static ssize_t read_file_blob(struct file *file, char __user *user_buf,
 static const struct file_operations fops_blob = {
 	.read =		read_file_blob,
 	.open =		default_open,
+	.llseek =	default_llseek,
 };
 
 /**
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 30a87b3dbca..37a8ca7c122 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -40,6 +40,7 @@ static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t d
 	struct inode *inode = new_inode(sb);
 
 	if (inode) {
+		inode->i_ino = get_next_ino();
 		inode->i_mode = mode;
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 		switch (mode & S_IFMT) {
@@ -134,17 +135,17 @@ static int debug_fill_super(struct super_block *sb, void *data, int silent)
 	return simple_fill_super(sb, DEBUGFS_MAGIC, debug_files);
 }
 
-static int debug_get_sb(struct file_system_type *fs_type,
+static struct dentry *debug_mount(struct file_system_type *fs_type,
 			int flags, const char *dev_name,
-			void *data, struct vfsmount *mnt)
+			void *data)
 {
-	return get_sb_single(fs_type, flags, data, debug_fill_super, mnt);
+	return mount_single(fs_type, flags, data, debug_fill_super);
 }
 
 static struct file_system_type debug_fs_type = {
 	.owner =	THIS_MODULE,
 	.name =		"debugfs",
-	.get_sb =	debug_get_sb,
+	.mount =	debug_mount,
 	.kill_sb =	kill_litter_super,
 };
 
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 8b3ffd5b523..1bb547c9cad 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -331,7 +331,7 @@ static int compare_init_pts_sb(struct super_block *s, void *p)
 }
 
 /*
- * devpts_get_sb()
+ * devpts_mount()
  *
  *     If the '-o newinstance' mount option was specified, mount a new
  *     (private) instance of devpts.  PTYs created in this instance are
@@ -345,20 +345,20 @@ static int compare_init_pts_sb(struct super_block *s, void *p)
  *     semantics in devpts while preserving backward compatibility of the
  *     current 'single-namespace' semantics. i.e all mounts of devpts
  *     without the 'newinstance' mount option should bind to the initial
- *     kernel mount, like get_sb_single().
+ *     kernel mount, like mount_single().
  *
  *     Mounts with 'newinstance' option create a new, private namespace.
  *
  *     NOTE:
  *
- *     For single-mount semantics, devpts cannot use get_sb_single(),
- *     because get_sb_single()/sget() find and use the super-block from
+ *     For single-mount semantics, devpts cannot use mount_single(),
+ *     because mount_single()/sget() find and use the super-block from
  *     the most recent mount of devpts. But that recent mount may be a
- *     'newinstance' mount and get_sb_single() would pick the newinstance
+ *     'newinstance' mount and mount_single() would pick the newinstance
  *     super-block instead of the initial super-block.
  */
-static int devpts_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *devpts_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
 	int error;
 	struct pts_mount_opts opts;
@@ -366,7 +366,7 @@ static int devpts_get_sb(struct file_system_type *fs_type,
 
 	error = parse_mount_options(data, PARSE_MOUNT, &opts);
 	if (error)
-		return error;
+		return ERR_PTR(error);
 
 	if (opts.newinstance)
 		s = sget(fs_type, NULL, set_anon_super, NULL);
@@ -374,7 +374,7 @@ static int devpts_get_sb(struct file_system_type *fs_type,
 		s = sget(fs_type, compare_init_pts_sb, set_anon_super, NULL);
 
 	if (IS_ERR(s))
-		return PTR_ERR(s);
+		return ERR_CAST(s);
 
 	if (!s->s_root) {
 		s->s_flags = flags;
@@ -390,13 +390,11 @@ static int devpts_get_sb(struct file_system_type *fs_type,
 	if (error)
 		goto out_undo_sget;
 
-	simple_set_mnt(mnt, s);
-
-	return 0;
+	return dget(s->s_root);
 
 out_undo_sget:
 	deactivate_locked_super(s);
-	return error;
+	return ERR_PTR(error);
 }
 
 #else
@@ -404,10 +402,10 @@ out_undo_sget:
  * This supports only the legacy single-instance semantics (no
  * multiple-instance semantics)
  */
-static int devpts_get_sb(struct file_system_type *fs_type, int flags,
-		const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *devpts_mount(struct file_system_type *fs_type, int flags,
+		const char *dev_name, void *data)
 {
-	return get_sb_single(fs_type, flags, data, devpts_fill_super, mnt);
+	return mount_single(fs_type, flags, data, devpts_fill_super);
 }
 #endif
 
@@ -421,7 +419,7 @@ static void devpts_kill_sb(struct super_block *sb)
 
 static struct file_system_type devpts_fs_type = {
 	.name		= "devpts",
-	.get_sb		= devpts_get_sb,
+	.mount		= devpts_mount,
 	.kill_sb	= devpts_kill_sb,
 };
 
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 7600aacf531..85882f6ba5f 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -218,7 +218,7 @@ static struct page *dio_get_page(struct dio *dio)
  * filesystems can use it to hold additional state between get_block calls and
  * dio_complete.
  */
-static int dio_complete(struct dio *dio, loff_t offset, int ret)
+static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is_async)
 {
 	ssize_t transferred = 0;
 
@@ -239,14 +239,6 @@ static int dio_complete(struct dio *dio, loff_t offset, int ret)
 			transferred = dio->i_size - offset;
 	}
 
-	if (dio->end_io && dio->result)
-		dio->end_io(dio->iocb, offset, transferred,
-			    dio->map_bh.b_private);
-
-	if (dio->flags & DIO_LOCKING)
-		/* lockdep: non-owner release */
-		up_read_non_owner(&dio->inode->i_alloc_sem);
-
 	if (ret == 0)
 		ret = dio->page_errors;
 	if (ret == 0)
@@ -254,6 +246,17 @@ static int dio_complete(struct dio *dio, loff_t offset, int ret)
 	if (ret == 0)
 		ret = transferred;
 
+	if (dio->end_io && dio->result) {
+		dio->end_io(dio->iocb, offset, transferred,
+			    dio->map_bh.b_private, ret, is_async);
+	} else if (is_async) {
+		aio_complete(dio->iocb, ret, 0);
+	}
+
+	if (dio->flags & DIO_LOCKING)
+		/* lockdep: non-owner release */
+		up_read_non_owner(&dio->inode->i_alloc_sem);
+
 	return ret;
 }
 
@@ -277,8 +280,7 @@ static void dio_bio_end_aio(struct bio *bio, int error)
 	spin_unlock_irqrestore(&dio->bio_lock, flags);
 
 	if (remaining == 0) {
-		int ret = dio_complete(dio, dio->iocb->ki_pos, 0);
-		aio_complete(dio->iocb, ret, 0);
+		dio_complete(dio, dio->iocb->ki_pos, 0, true);
 		kfree(dio);
 	}
 }
@@ -632,7 +634,7 @@ static int dio_send_cur_page(struct dio *dio)
 	int ret = 0;
 
 	if (dio->bio) {
-		loff_t cur_offset = dio->block_in_file << dio->blkbits;
+		loff_t cur_offset = dio->cur_page_fs_offset;
 		loff_t bio_next_offset = dio->logical_offset_in_bio +
 			dio->bio->bi_size;
 
@@ -657,7 +659,7 @@ static int dio_send_cur_page(struct dio *dio)
 		 * Submit now if the underlying fs is about to perform a
 		 * metadata read
 		 */
-		if (dio->boundary)
+		else if (dio->boundary)
 			dio_bio_submit(dio);
 	}
 
@@ -1126,7 +1128,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 	spin_unlock_irqrestore(&dio->bio_lock, flags);
 
 	if (ret2 == 0) {
-		ret = dio_complete(dio, offset, ret);
+		ret = dio_complete(dio, offset, ret, false);
 		kfree(dio);
 	} else
 		BUG_ON(ret != -EIOCBQUEUED);
@@ -1134,8 +1136,27 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 	return ret;
 }
 
+/*
+ * This is a library function for use by filesystem drivers.
+ *
+ * The locking rules are governed by the flags parameter:
+ *  - if the flags value contains DIO_LOCKING we use a fancy locking
+ *    scheme for dumb filesystems.
+ *    For writes this function is called under i_mutex and returns with
+ *    i_mutex held, for reads, i_mutex is not held on entry, but it is
+ *    taken and dropped again before returning.
+ *    For reads and writes i_alloc_sem is taken in shared mode and released
+ *    on I/O completion (which may happen asynchronously after returning to
+ *    the caller).
+ *
+ *  - if the flags value does NOT contain DIO_LOCKING we don't use any
+ *    internal locking but rather rely on the filesystem to synchronize
+ *    direct I/O reads/writes versus each other and truncate.
+ *    For reads and writes both i_mutex and i_alloc_sem are not held on
+ *    entry and are never taken.
+ */
 ssize_t
-__blockdev_direct_IO_newtrunc(int rw, struct kiocb *iocb, struct inode *inode,
+__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	struct block_device *bdev, const struct iovec *iov, loff_t offset, 
 	unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
 	dio_submit_t submit_io,	int flags)
@@ -1231,57 +1252,4 @@ __blockdev_direct_IO_newtrunc(int rw, struct kiocb *iocb, struct inode *inode,
 out:
 	return retval;
 }
-EXPORT_SYMBOL(__blockdev_direct_IO_newtrunc);
-
-/*
- * This is a library function for use by filesystem drivers.
- *
- * The locking rules are governed by the flags parameter:
- *  - if the flags value contains DIO_LOCKING we use a fancy locking
- *    scheme for dumb filesystems.
- *    For writes this function is called under i_mutex and returns with
- *    i_mutex held, for reads, i_mutex is not held on entry, but it is
- *    taken and dropped again before returning.
- *    For reads and writes i_alloc_sem is taken in shared mode and released
- *    on I/O completion (which may happen asynchronously after returning to
- *    the caller).
- *
- *  - if the flags value does NOT contain DIO_LOCKING we don't use any
- *    internal locking but rather rely on the filesystem to synchronize
- *    direct I/O reads/writes versus each other and truncate.
- *    For reads and writes both i_mutex and i_alloc_sem are not held on
- *    entry and are never taken.
- */
-ssize_t
-__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
-	struct block_device *bdev, const struct iovec *iov, loff_t offset,
-	unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
-	dio_submit_t submit_io,	int flags)
-{
-	ssize_t retval;
-
-	retval = __blockdev_direct_IO_newtrunc(rw, iocb, inode, bdev, iov,
-			offset, nr_segs, get_block, end_io, submit_io, flags);
-	/*
-	 * In case of error extending write may have instantiated a few
-	 * blocks outside i_size. Trim these off again for DIO_LOCKING.
-	 * NOTE: DIO_NO_LOCK/DIO_OWN_LOCK callers have to handle this in
-	 * their own manner. This is a further example of where the old
-	 * truncate sequence is inadequate.
-	 *
-	 * NOTE: filesystems with their own locking have to handle this
-	 * on their own.
-	 */
-	if (flags & DIO_LOCKING) {
-		if (unlikely((rw & WRITE) && retval < 0)) {
-			loff_t isize = i_size_read(inode);
-			loff_t end = offset + iov_length(iov, nr_segs);
-
-			if (end > isize)
-				vmtruncate(inode, isize);
-		}
-	}
-
-	return retval;
-}
 EXPORT_SYMBOL(__blockdev_direct_IO);
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index c6cf2515874..6b42ba807df 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -643,7 +643,8 @@ static ssize_t waiters_read(struct file *file, char __user *userbuf,
 static const struct file_operations waiters_fops = {
 	.owner   = THIS_MODULE,
 	.open    = waiters_open,
-	.read    = waiters_read
+	.read    = waiters_read,
+	.llseek  = default_llseek,
 };
 
 void dlm_delete_debug_file(struct dlm_ls *ls)
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 031dbe3a15c..64e5f3efdd8 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -1846,6 +1846,9 @@ static void send_bast_queue(struct dlm_rsb *r, struct list_head *head,
 	struct dlm_lkb *gr;
 
 	list_for_each_entry(gr, head, lkb_statequeue) {
+		/* skip self when sending basts to convertqueue */
+		if (gr == lkb)
+			continue;
 		if (gr->lkb_bastfn && modes_require_bast(gr, lkb)) {
 			queue_bast(r, gr, lkb->lkb_rqmode);
 			gr->lkb_highbast = lkb->lkb_rqmode;
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index c0d35c62052..37a34c2c622 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -248,7 +248,7 @@ static struct connection *assoc2con(int assoc_id)
 
 	for (i = 0 ; i < CONN_HASH_SIZE; i++) {
 		hlist_for_each_entry(con, h, &connection_hash[i], list) {
-			if (con && con->sctp_assoc == assoc_id) {
+			if (con->sctp_assoc == assoc_id) {
 				mutex_unlock(&connections_lock);
 				return con;
 			}
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
index 2c6ad518100..ef17e0169da 100644
--- a/fs/dlm/netlink.c
+++ b/fs/dlm/netlink.c
@@ -81,24 +81,11 @@ static struct genl_ops dlm_nl_ops = {
 
 int __init dlm_netlink_init(void)
 {
-	int rv;
-
-	rv = genl_register_family(&family);
-	if (rv)
-		return rv;
-
-	rv = genl_register_ops(&family, &dlm_nl_ops);
-	if (rv < 0)
-		goto err;
-	return 0;
- err:
-	genl_unregister_family(&family);
-	return rv;
+	return genl_register_family_with_ops(&family, &dlm_nl_ops, 1);
 }
 
 void dlm_netlink_exit(void)
 {
-	genl_unregister_ops(&family, &dlm_nl_ops);
 	genl_unregister_family(&family);
 }
 
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index d45c02db694..30d8b85febb 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -412,7 +412,8 @@ static const struct file_operations dev_fops = {
 	.read    = dev_read,
 	.write   = dev_write,
 	.poll    = dev_poll,
-	.owner   = THIS_MODULE
+	.owner   = THIS_MODULE,
+	.llseek  = noop_llseek,
 };
 
 static struct miscdevice plock_dev_misc = {
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index b6272853130..66d6c16bf44 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -1009,6 +1009,7 @@ static const struct file_operations device_fops = {
 	.write   = device_write,
 	.poll    = device_poll,
 	.owner   = THIS_MODULE,
+	.llseek  = noop_llseek,
 };
 
 static const struct file_operations ctl_device_fops = {
@@ -1017,6 +1018,7 @@ static const struct file_operations ctl_device_fops = {
 	.read    = device_read,
 	.write   = device_write,
 	.owner   = THIS_MODULE,
+	.llseek  = noop_llseek,
 };
 
 static struct miscdevice ctl_device = {
@@ -1029,6 +1031,7 @@ static const struct file_operations monitor_device_fops = {
 	.open    = monitor_device_open,
 	.release = monitor_device_close,
 	.owner   = THIS_MODULE,
+	.llseek  = noop_llseek,
 };
 
 static struct miscdevice monitor_device = {
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 83c4f600786..2195c213ab2 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -18,7 +18,7 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
 
 	spin_lock(&inode_lock);
 	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
-		if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
+		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW))
 			continue;
 		if (inode->i_mapping->nrpages == 0)
 			continue;
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 1cc087635a5..cbadc1bee6e 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -762,7 +762,7 @@ ecryptfs_decrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat,
 
 /**
  * ecryptfs_init_crypt_ctx
- * @crypt_stat: Uninitilized crypt stats structure
+ * @crypt_stat: Uninitialized crypt stats structure
  *
  * Initialize the crypto context.
  *
@@ -1793,7 +1793,7 @@ struct kmem_cache *ecryptfs_key_tfm_cache;
 static struct list_head key_tfm_list;
 struct mutex key_tfm_list_mutex;
 
-int ecryptfs_init_crypto(void)
+int __init ecryptfs_init_crypto(void)
 {
 	mutex_init(&key_tfm_list_mutex);
 	INIT_LIST_HEAD(&key_tfm_list);
@@ -2169,7 +2169,6 @@ int ecryptfs_encrypt_and_encode_filename(
 				(ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE
 				 + encoded_name_no_prefix_size);
 			(*encoded_name)[(*encoded_name_size)] = '\0';
-			(*encoded_name_size)++;
 		} else {
 			rc = -EOPNOTSUPP;
 		}
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 0032a9f5a3a..413a3c48f0b 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -377,6 +377,7 @@ struct ecryptfs_mount_crypt_stat {
 #define ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES      0x00000010
 #define ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK   0x00000020
 #define ECRYPTFS_GLOBAL_ENCFN_USE_FEK          0x00000040
+#define ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY    0x00000080
 	u32 flags;
 	struct list_head global_auth_tok_list;
 	struct mutex global_auth_tok_list_mutex;
@@ -477,7 +478,7 @@ ecryptfs_lower_header_size(struct ecryptfs_crypt_stat *crypt_stat)
 static inline struct ecryptfs_file_info *
 ecryptfs_file_to_private(struct file *file)
 {
-	return (struct ecryptfs_file_info *)file->private_data;
+	return file->private_data;
 }
 
 static inline void
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index e8fcf4e2ed7..91da02987bf 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -31,7 +31,6 @@
 #include <linux/security.h>
 #include <linux/compat.h>
 #include <linux/fs_stack.h>
-#include <linux/smp_lock.h>
 #include "ecryptfs_kernel.h"
 
 /**
@@ -199,7 +198,7 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
 			       "the persistent file for the dentry with name "
 			       "[%s]; rc = [%d]\n", __func__,
 			       ecryptfs_dentry->d_name.name, rc);
-			goto out;
+			goto out_free;
 		}
 	}
 	if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_RDONLY)
@@ -207,7 +206,7 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
 		rc = -EPERM;
 		printk(KERN_WARNING "%s: Lower persistent file is RO; eCryptfs "
 		       "file must hence be opened RO\n", __func__);
-		goto out;
+		goto out_free;
 	}
 	ecryptfs_set_file_lower(
 		file, ecryptfs_inode_to_private(inode)->lower_file);
@@ -284,26 +283,53 @@ static int ecryptfs_fasync(int fd, struct file *file, int flag)
 	int rc = 0;
 	struct file *lower_file = NULL;
 
-	lock_kernel();
 	lower_file = ecryptfs_file_to_lower(file);
 	if (lower_file->f_op && lower_file->f_op->fasync)
 		rc = lower_file->f_op->fasync(fd, lower_file, flag);
-	unlock_kernel();
 	return rc;
 }
 
-static int ecryptfs_ioctl(struct inode *inode, struct file *file,
-			  unsigned int cmd, unsigned long arg);
+static long
+ecryptfs_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct file *lower_file = NULL;
+	long rc = -ENOTTY;
+
+	if (ecryptfs_file_to_private(file))
+		lower_file = ecryptfs_file_to_lower(file);
+	if (lower_file && lower_file->f_op && lower_file->f_op->unlocked_ioctl)
+		rc = lower_file->f_op->unlocked_ioctl(lower_file, cmd, arg);
+	return rc;
+}
+
+#ifdef CONFIG_COMPAT
+static long
+ecryptfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct file *lower_file = NULL;
+	long rc = -ENOIOCTLCMD;
+
+	if (ecryptfs_file_to_private(file))
+		lower_file = ecryptfs_file_to_lower(file);
+	if (lower_file && lower_file->f_op && lower_file->f_op->compat_ioctl)
+		rc = lower_file->f_op->compat_ioctl(lower_file, cmd, arg);
+	return rc;
+}
+#endif
 
 const struct file_operations ecryptfs_dir_fops = {
 	.readdir = ecryptfs_readdir,
-	.ioctl = ecryptfs_ioctl,
+	.unlocked_ioctl = ecryptfs_unlocked_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl = ecryptfs_compat_ioctl,
+#endif
 	.open = ecryptfs_open,
 	.flush = ecryptfs_flush,
 	.release = ecryptfs_release,
 	.fsync = ecryptfs_fsync,
 	.fasync = ecryptfs_fasync,
 	.splice_read = generic_file_splice_read,
+	.llseek = default_llseek,
 };
 
 const struct file_operations ecryptfs_main_fops = {
@@ -313,7 +339,10 @@ const struct file_operations ecryptfs_main_fops = {
 	.write = do_sync_write,
 	.aio_write = generic_file_aio_write,
 	.readdir = ecryptfs_readdir,
-	.ioctl = ecryptfs_ioctl,
+	.unlocked_ioctl = ecryptfs_unlocked_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl = ecryptfs_compat_ioctl,
+#endif
 	.mmap = generic_file_mmap,
 	.open = ecryptfs_open,
 	.flush = ecryptfs_flush,
@@ -322,20 +351,3 @@ const struct file_operations ecryptfs_main_fops = {
 	.fasync = ecryptfs_fasync,
 	.splice_read = generic_file_splice_read,
 };
-
-static int
-ecryptfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
-	       unsigned long arg)
-{
-	int rc = 0;
-	struct file *lower_file = NULL;
-
-	if (ecryptfs_file_to_private(file))
-		lower_file = ecryptfs_file_to_lower(file);
-	if (lower_file && lower_file->f_op && lower_file->f_op->ioctl)
-		rc = lower_file->f_op->ioctl(ecryptfs_inode_to_lower(inode),
-					     lower_file, cmd, arg);
-	else
-		rc = -ENOTTY;
-	return rc;
-}
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 31ef5252f0f..9d1a22d6276 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -32,6 +32,7 @@
 #include <linux/crypto.h>
 #include <linux/fs_stack.h>
 #include <linux/slab.h>
+#include <linux/xattr.h>
 #include <asm/unaligned.h>
 #include "ecryptfs_kernel.h"
 
@@ -70,15 +71,19 @@ ecryptfs_create_underlying_file(struct inode *lower_dir_inode,
 	struct vfsmount *lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry);
 	struct dentry *dentry_save;
 	struct vfsmount *vfsmount_save;
+	unsigned int flags_save;
 	int rc;
 
 	dentry_save = nd->path.dentry;
 	vfsmount_save = nd->path.mnt;
+	flags_save = nd->flags;
 	nd->path.dentry = lower_dentry;
 	nd->path.mnt = lower_mnt;
+	nd->flags &= ~LOOKUP_OPEN;
 	rc = vfs_create(lower_dir_inode, lower_dentry, mode, nd);
 	nd->path.dentry = dentry_save;
 	nd->path.mnt = vfsmount_save;
+	nd->flags = flags_save;
 	return rc;
 }
 
@@ -264,7 +269,7 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
 		printk(KERN_ERR "%s: Out of memory whilst attempting "
 		       "to allocate ecryptfs_dentry_info struct\n",
 			__func__);
-		goto out_dput;
+		goto out_put;
 	}
 	ecryptfs_set_dentry_lower(ecryptfs_dentry, lower_dentry);
 	ecryptfs_set_dentry_lower_mnt(ecryptfs_dentry, lower_mnt);
@@ -339,14 +344,84 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
 out_free_kmem:
 	kmem_cache_free(ecryptfs_header_cache_2, page_virt);
 	goto out;
-out_dput:
+out_put:
 	dput(lower_dentry);
+	mntput(lower_mnt);
 	d_drop(ecryptfs_dentry);
 out:
 	return rc;
 }
 
 /**
+ * ecryptfs_new_lower_dentry
+ * @name: The name of the new dentry.
+ * @lower_dir_dentry: Parent directory of the new dentry.
+ * @nd: nameidata from last lookup.
+ *
+ * Create a new dentry or get it from lower parent dir.
+ */
+static struct dentry *
+ecryptfs_new_lower_dentry(struct qstr *name, struct dentry *lower_dir_dentry,
+			  struct nameidata *nd)
+{
+	struct dentry *new_dentry;
+	struct dentry *tmp;
+	struct inode *lower_dir_inode;
+
+	lower_dir_inode = lower_dir_dentry->d_inode;
+
+	tmp = d_alloc(lower_dir_dentry, name);
+	if (!tmp)
+		return ERR_PTR(-ENOMEM);
+
+	mutex_lock(&lower_dir_inode->i_mutex);
+	new_dentry = lower_dir_inode->i_op->lookup(lower_dir_inode, tmp, nd);
+	mutex_unlock(&lower_dir_inode->i_mutex);
+
+	if (!new_dentry)
+		new_dentry = tmp;
+	else
+		dput(tmp);
+
+	return new_dentry;
+}
+
+
+/**
+ * ecryptfs_lookup_one_lower
+ * @ecryptfs_dentry: The eCryptfs dentry that we are looking up
+ * @lower_dir_dentry: lower parent directory
+ * @name: lower file name
+ *
+ * Get the lower dentry from vfs. If lower dentry does not exist yet,
+ * create it.
+ */
+static struct dentry *
+ecryptfs_lookup_one_lower(struct dentry *ecryptfs_dentry,
+			  struct dentry *lower_dir_dentry, struct qstr *name)
+{
+	struct nameidata nd;
+	struct vfsmount *lower_mnt;
+	int err;
+
+	lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(
+				    ecryptfs_dentry->d_parent));
+	err = vfs_path_lookup(lower_dir_dentry, lower_mnt, name->name , 0, &nd);
+	mntput(lower_mnt);
+
+	if (!err) {
+		/* we dont need the mount */
+		mntput(nd.path.mnt);
+		return nd.path.dentry;
+	}
+	if (err != -ENOENT)
+		return ERR_PTR(err);
+
+	/* create a new lower dentry */
+	return ecryptfs_new_lower_dentry(name, lower_dir_dentry, &nd);
+}
+
+/**
  * ecryptfs_lookup
  * @ecryptfs_dir_inode: The eCryptfs directory inode
  * @ecryptfs_dentry: The eCryptfs dentry that we are looking up
@@ -363,6 +438,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
 	size_t encrypted_and_encoded_name_size;
 	struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL;
 	struct dentry *lower_dir_dentry, *lower_dentry;
+	struct qstr lower_name;
 	int rc = 0;
 
 	ecryptfs_dentry->d_op = &ecryptfs_dops;
@@ -373,14 +449,20 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
 		goto out_d_drop;
 	}
 	lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent);
-	mutex_lock(&lower_dir_dentry->d_inode->i_mutex);
-	lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name,
-				      lower_dir_dentry,
-				      ecryptfs_dentry->d_name.len);
-	mutex_unlock(&lower_dir_dentry->d_inode->i_mutex);
+	lower_name.name = ecryptfs_dentry->d_name.name;
+	lower_name.len = ecryptfs_dentry->d_name.len;
+	lower_name.hash = ecryptfs_dentry->d_name.hash;
+	if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) {
+		rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry,
+						    &lower_name);
+		if (rc < 0)
+			goto out_d_drop;
+	}
+	lower_dentry = ecryptfs_lookup_one_lower(ecryptfs_dentry,
+						 lower_dir_dentry, &lower_name);
 	if (IS_ERR(lower_dentry)) {
 		rc = PTR_ERR(lower_dentry);
-		ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
+		ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_lower() returned "
 				"[%d] on lower_dentry = [%s]\n", __func__, rc,
 				encrypted_and_encoded_name);
 		goto out_d_drop;
@@ -402,14 +484,20 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
 		       "filename; rc = [%d]\n", __func__, rc);
 		goto out_d_drop;
 	}
-	mutex_lock(&lower_dir_dentry->d_inode->i_mutex);
-	lower_dentry = lookup_one_len(encrypted_and_encoded_name,
-				      lower_dir_dentry,
-				      encrypted_and_encoded_name_size - 1);
-	mutex_unlock(&lower_dir_dentry->d_inode->i_mutex);
+	lower_name.name = encrypted_and_encoded_name;
+	lower_name.len = encrypted_and_encoded_name_size;
+	lower_name.hash = full_name_hash(lower_name.name, lower_name.len);
+	if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) {
+		rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry,
+						    &lower_name);
+		if (rc < 0)
+			goto out_d_drop;
+	}
+	lower_dentry = ecryptfs_lookup_one_lower(ecryptfs_dentry,
+						 lower_dir_dentry, &lower_name);
 	if (IS_ERR(lower_dentry)) {
 		rc = PTR_ERR(lower_dentry);
-		ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
+		ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_lower() returned "
 				"[%d] on lower_dentry = [%s]\n", __func__, rc,
 				encrypted_and_encoded_name);
 		goto out_d_drop;
@@ -804,10 +892,20 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
 		size_t num_zeros = (PAGE_CACHE_SIZE
 				    - (ia->ia_size & ~PAGE_CACHE_MASK));
 
+
+		/*
+		 * XXX(truncate) this should really happen at the begginning
+		 * of ->setattr.  But the code is too messy to that as part
+		 * of a larger patch.  ecryptfs is also totally missing out
+		 * on the inode_change_ok check at the beginning of
+		 * ->setattr while would include this.
+		 */
+		rc = inode_newsize_ok(inode, ia->ia_size);
+		if (rc)
+			goto out;
+
 		if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
-			rc = simple_setsize(inode, ia->ia_size);
-			if (rc)
-				goto out;
+			truncate_setsize(inode, ia->ia_size);
 			lower_ia->ia_size = ia->ia_size;
 			lower_ia->ia_valid |= ATTR_SIZE;
 			goto out;
@@ -830,7 +928,7 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
 				goto out;
 			}
 		}
-		simple_setsize(inode, ia->ia_size);
+		truncate_setsize(inode, ia->ia_size);
 		rc = ecryptfs_write_inode_size_to_metadata(inode);
 		if (rc) {
 			printk(KERN_ERR	"Problem with "
@@ -1015,10 +1113,8 @@ ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value,
 		rc = -EOPNOTSUPP;
 		goto out;
 	}
-	mutex_lock(&lower_dentry->d_inode->i_mutex);
-	rc = lower_dentry->d_inode->i_op->setxattr(lower_dentry, name, value,
-						   size, flags);
-	mutex_unlock(&lower_dentry->d_inode->i_mutex);
+
+	rc = vfs_setxattr(lower_dentry, name, value, size, flags);
 out:
 	return rc;
 }
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 89c5476506e..b1f6858a522 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -446,6 +446,7 @@ out:
  */
 static int
 ecryptfs_find_auth_tok_for_sig(
+	struct key **auth_tok_key,
 	struct ecryptfs_auth_tok **auth_tok,
 	struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
 	char *sig)
@@ -453,12 +454,21 @@ ecryptfs_find_auth_tok_for_sig(
 	struct ecryptfs_global_auth_tok *global_auth_tok;
 	int rc = 0;
 
+	(*auth_tok_key) = NULL;
 	(*auth_tok) = NULL;
 	if (ecryptfs_find_global_auth_tok_for_sig(&global_auth_tok,
 						  mount_crypt_stat, sig)) {
-		struct key *auth_tok_key;
 
-		rc = ecryptfs_keyring_auth_tok_for_sig(&auth_tok_key, auth_tok,
+		/* if the flag ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY is set in the
+		 * mount_crypt_stat structure, we prevent to use auth toks that
+		 * are not inserted through the ecryptfs_add_global_auth_tok
+		 * function.
+		 */
+		if (mount_crypt_stat->flags
+				& ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY)
+			return -EINVAL;
+
+		rc = ecryptfs_keyring_auth_tok_for_sig(auth_tok_key, auth_tok,
 						       sig);
 	} else
 		(*auth_tok) = global_auth_tok->global_auth_tok;
@@ -509,12 +519,14 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
 			     char *filename, size_t filename_size)
 {
 	struct ecryptfs_write_tag_70_packet_silly_stack *s;
+	struct key *auth_tok_key = NULL;
 	int rc = 0;
 
 	s = kmalloc(sizeof(*s), GFP_KERNEL);
 	if (!s) {
 		printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc "
 		       "[%zd] bytes of kernel memory\n", __func__, sizeof(*s));
+		rc = -ENOMEM;
 		goto out;
 	}
 	s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
@@ -605,6 +617,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
 	}
 	dest[s->i++] = s->cipher_code;
 	rc = ecryptfs_find_auth_tok_for_sig(
+		&auth_tok_key,
 		&s->auth_tok, mount_crypt_stat,
 		mount_crypt_stat->global_default_fnek_sig);
 	if (rc) {
@@ -752,6 +765,8 @@ out_free_unlock:
 out_unlock:
 	mutex_unlock(s->tfm_mutex);
 out:
+	if (auth_tok_key)
+		key_put(auth_tok_key);
 	kfree(s);
 	return rc;
 }
@@ -797,6 +812,7 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
 			     char *data, size_t max_packet_size)
 {
 	struct ecryptfs_parse_tag_70_packet_silly_stack *s;
+	struct key *auth_tok_key = NULL;
 	int rc = 0;
 
 	(*packet_size) = 0;
@@ -806,6 +822,7 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
 	if (!s) {
 		printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc "
 		       "[%zd] bytes of kernel memory\n", __func__, sizeof(*s));
+		rc = -ENOMEM;
 		goto out;
 	}
 	s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
@@ -908,7 +925,8 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
 	 * >= ECRYPTFS_MAX_IV_BYTES. */
 	memset(s->iv, 0, ECRYPTFS_MAX_IV_BYTES);
 	s->desc.info = s->iv;
-	rc = ecryptfs_find_auth_tok_for_sig(&s->auth_tok, mount_crypt_stat,
+	rc = ecryptfs_find_auth_tok_for_sig(&auth_tok_key,
+					    &s->auth_tok, mount_crypt_stat,
 					    s->fnek_sig_hex);
 	if (rc) {
 		printk(KERN_ERR "%s: Error attempting to find auth tok for "
@@ -984,6 +1002,8 @@ out:
 		(*filename_size) = 0;
 		(*filename) = NULL;
 	}
+	if (auth_tok_key)
+		key_put(auth_tok_key);
 	kfree(s);
 	return rc;
 }
@@ -1555,14 +1575,19 @@ int ecryptfs_keyring_auth_tok_for_sig(struct key **auth_tok_key,
 		       ECRYPTFS_VERSION_MAJOR,
 		       ECRYPTFS_VERSION_MINOR);
 		rc = -EINVAL;
-		goto out;
+		goto out_release_key;
 	}
 	if ((*auth_tok)->token_type != ECRYPTFS_PASSWORD
 	    && (*auth_tok)->token_type != ECRYPTFS_PRIVATE_KEY) {
 		printk(KERN_ERR "Invalid auth_tok structure "
 		       "returned from key query\n");
 		rc = -EINVAL;
-		goto out;
+		goto out_release_key;
+	}
+out_release_key:
+	if (rc) {
+		key_put(*auth_tok_key);
+		(*auth_tok_key) = NULL;
 	}
 out:
 	return rc;
@@ -1686,6 +1711,7 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat,
 	struct ecryptfs_auth_tok_list_item *auth_tok_list_item;
 	size_t tag_11_contents_size;
 	size_t tag_11_packet_size;
+	struct key *auth_tok_key = NULL;
 	int rc = 0;
 
 	INIT_LIST_HEAD(&auth_tok_list);
@@ -1782,6 +1808,10 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat,
 	 * just one will be sufficient to decrypt to get the FEK. */
 find_next_matching_auth_tok:
 	found_auth_tok = 0;
+	if (auth_tok_key) {
+		key_put(auth_tok_key);
+		auth_tok_key = NULL;
+	}
 	list_for_each_entry(auth_tok_list_item, &auth_tok_list, list) {
 		candidate_auth_tok = &auth_tok_list_item->auth_tok;
 		if (unlikely(ecryptfs_verbosity > 0)) {
@@ -1798,10 +1828,11 @@ find_next_matching_auth_tok:
 			rc = -EINVAL;
 			goto out_wipe_list;
 		}
-		ecryptfs_find_auth_tok_for_sig(&matching_auth_tok,
+		rc = ecryptfs_find_auth_tok_for_sig(&auth_tok_key,
+					       &matching_auth_tok,
 					       crypt_stat->mount_crypt_stat,
 					       candidate_auth_tok_sig);
-		if (matching_auth_tok) {
+		if (!rc) {
 			found_auth_tok = 1;
 			goto found_matching_auth_tok;
 		}
@@ -1864,6 +1895,8 @@ found_matching_auth_tok:
 out_wipe_list:
 	wipe_auth_tok_list(&auth_tok_list);
 out:
+	if (auth_tok_key)
+		key_put(auth_tok_key);
 	return rc;
 }
 
diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c
index d8c3a373aaf..0851ab6980f 100644
--- a/fs/ecryptfs/kthread.c
+++ b/fs/ecryptfs/kthread.c
@@ -86,7 +86,7 @@ out:
 	return 0;
 }
 
-int ecryptfs_init_kthread(void)
+int __init ecryptfs_init_kthread(void)
 {
 	int rc = 0;
 
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index cbd4e18adb2..a9dbd62518e 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -208,7 +208,8 @@ enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig,
        ecryptfs_opt_passthrough, ecryptfs_opt_xattr_metadata,
        ecryptfs_opt_encrypted_view, ecryptfs_opt_fnek_sig,
        ecryptfs_opt_fn_cipher, ecryptfs_opt_fn_cipher_key_bytes,
-       ecryptfs_opt_unlink_sigs, ecryptfs_opt_err };
+       ecryptfs_opt_unlink_sigs, ecryptfs_opt_mount_auth_tok_only,
+       ecryptfs_opt_err };
 
 static const match_table_t tokens = {
 	{ecryptfs_opt_sig, "sig=%s"},
@@ -223,6 +224,7 @@ static const match_table_t tokens = {
 	{ecryptfs_opt_fn_cipher, "ecryptfs_fn_cipher=%s"},
 	{ecryptfs_opt_fn_cipher_key_bytes, "ecryptfs_fn_key_bytes=%u"},
 	{ecryptfs_opt_unlink_sigs, "ecryptfs_unlink_sigs"},
+	{ecryptfs_opt_mount_auth_tok_only, "ecryptfs_mount_auth_tok_only"},
 	{ecryptfs_opt_err, NULL}
 };
 
@@ -406,6 +408,10 @@ static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options)
 		case ecryptfs_opt_unlink_sigs:
 			mount_crypt_stat->flags |= ECRYPTFS_UNLINK_SIGS;
 			break;
+		case ecryptfs_opt_mount_auth_tok_only:
+			mount_crypt_stat->flags |=
+				ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY;
+			break;
 		case ecryptfs_opt_err:
 		default:
 			printk(KERN_WARNING
@@ -540,9 +546,8 @@ out:
  *                        ecryptfs_interpose to perform most of the linking
  * ecryptfs_interpose(): links the lower filesystem into ecryptfs (inode.c)
  */
-static int ecryptfs_get_sb(struct file_system_type *fs_type, int flags,
-			const char *dev_name, void *raw_data,
-			struct vfsmount *mnt)
+static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags,
+			const char *dev_name, void *raw_data)
 {
 	struct super_block *s;
 	struct ecryptfs_sb_info *sbi;
@@ -607,8 +612,7 @@ static int ecryptfs_get_sb(struct file_system_type *fs_type, int flags,
 		err = "Reading sb failed";
 		goto out;
 	}
-	simple_set_mnt(mnt, s);
-	return 0;
+	return dget(s->s_root);
 
 out:
 	if (sbi) {
@@ -616,7 +620,7 @@ out:
 		kmem_cache_free(ecryptfs_sb_info_cache, sbi);
 	}
 	printk(KERN_ERR "%s; rc = [%d]\n", err, rc);
-	return rc;
+	return ERR_PTR(rc);
 }
 
 /**
@@ -639,7 +643,7 @@ static void ecryptfs_kill_block_super(struct super_block *sb)
 static struct file_system_type ecryptfs_fs_type = {
 	.owner = THIS_MODULE,
 	.name = "ecryptfs",
-	.get_sb = ecryptfs_get_sb,
+	.mount = ecryptfs_mount,
 	.kill_sb = ecryptfs_kill_block_super,
 	.fs_flags = 0
 };
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index 46c4dd8dfcc..ab224809051 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -274,7 +274,7 @@ int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t euid,
 			      struct user_namespace *user_ns, struct pid *pid,
 			      u32 seq)
 {
-	struct ecryptfs_daemon *daemon;
+	struct ecryptfs_daemon *uninitialized_var(daemon);
 	struct ecryptfs_msg_ctx *msg_ctx;
 	size_t msg_size;
 	struct nsproxy *nsproxy;
@@ -473,7 +473,7 @@ sleep:
 	return rc;
 }
 
-int ecryptfs_init_messaging(void)
+int __init ecryptfs_init_messaging(void)
 {
 	int i;
 	int rc = 0;
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index 3745f612bcd..940a82e63dc 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -482,6 +482,7 @@ static const struct file_operations ecryptfs_miscdev_fops = {
 	.read    = ecryptfs_miscdev_read,
 	.write   = ecryptfs_miscdev_write,
 	.release = ecryptfs_miscdev_release,
+	.llseek  = noop_llseek,
 };
 
 static struct miscdevice ecryptfs_miscdev = {
@@ -500,7 +501,7 @@ static struct miscdevice ecryptfs_miscdev = {
  *
  * Returns zero on success; non-zero otherwise
  */
-int ecryptfs_init_ecryptfs_miscdev(void)
+int __init ecryptfs_init_ecryptfs_miscdev(void)
 {
 	int rc;
 
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index 0435886e4a9..253732382d3 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -118,11 +118,15 @@ void ecryptfs_init_inode(struct inode *inode, struct inode *lower_inode)
  */
 static int ecryptfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-	return vfs_statfs(ecryptfs_dentry_to_lower(dentry), buf);
+	struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
+
+	if (!lower_dentry->d_sb->s_op->statfs)
+		return -ENOSYS;
+	return lower_dentry->d_sb->s_op->statfs(lower_dentry, buf);
 }
 
 /**
- * ecryptfs_clear_inode
+ * ecryptfs_evict_inode
  * @inode - The ecryptfs inode
  *
  * Called by iput() when the inode reference count reached zero
@@ -131,8 +135,10 @@ static int ecryptfs_statfs(struct dentry *dentry, struct kstatfs *buf)
  * on the inode free list. We use this to drop out reference to the
  * lower inode.
  */
-static void ecryptfs_clear_inode(struct inode *inode)
+static void ecryptfs_evict_inode(struct inode *inode)
 {
+	truncate_inode_pages(&inode->i_data, 0);
+	end_writeback(inode);
 	iput(ecryptfs_inode_to_lower(inode));
 }
 
@@ -174,6 +180,8 @@ static int ecryptfs_show_options(struct seq_file *m, struct vfsmount *mnt)
 		seq_printf(m, ",ecryptfs_encrypted_view");
 	if (mount_crypt_stat->flags & ECRYPTFS_UNLINK_SIGS)
 		seq_printf(m, ",ecryptfs_unlink_sigs");
+	if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY)
+		seq_printf(m, ",ecryptfs_mount_auth_tok_only");
 
 	return 0;
 }
@@ -184,6 +192,6 @@ const struct super_operations ecryptfs_sops = {
 	.drop_inode = generic_delete_inode,
 	.statfs = ecryptfs_statfs,
 	.remount_fs = NULL,
-	.clear_inode = ecryptfs_clear_inode,
+	.evict_inode = ecryptfs_evict_inode,
 	.show_options = ecryptfs_show_options
 };
diff --git a/fs/efs/super.c b/fs/efs/super.c
index f0494281081..5073a07652c 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -20,16 +20,16 @@
 static int efs_statfs(struct dentry *dentry, struct kstatfs *buf);
 static int efs_fill_super(struct super_block *s, void *d, int silent);
 
-static int efs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *efs_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, efs_fill_super, mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, efs_fill_super);
 }
 
 static struct file_system_type efs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "efs",
-	.get_sb		= efs_get_sb,
+	.mount		= efs_mount,
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 6bd3f76fdf8..e0194b3e14d 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -293,6 +293,7 @@ static const struct file_operations eventfd_fops = {
 	.poll		= eventfd_poll,
 	.read		= eventfd_read,
 	.write		= eventfd_write,
+	.llseek		= noop_llseek,
 };
 
 /**
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 3817149919c..8cf07242067 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -77,9 +77,6 @@
 /* Maximum number of nesting allowed inside epoll sets */
 #define EP_MAX_NESTS 4
 
-/* Maximum msec timeout value storeable in a long int */
-#define EP_MAX_MSTIMEO min(1000ULL * MAX_SCHEDULE_TIMEOUT / HZ, (LONG_MAX - 999ULL) / HZ)
-
 #define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))
 
 #define EP_UNACTIVE_PTR ((void *) -1L)
@@ -674,7 +671,8 @@ static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
 /* File callbacks that implement the eventpoll file behaviour */
 static const struct file_operations eventpoll_fops = {
 	.release	= ep_eventpoll_release,
-	.poll		= ep_eventpoll_poll
+	.poll		= ep_eventpoll_poll,
+	.llseek		= noop_llseek,
 };
 
 /* Fast test to see if the file is an evenpoll file */
@@ -1116,18 +1114,22 @@ static int ep_send_events(struct eventpoll *ep,
 static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 		   int maxevents, long timeout)
 {
-	int res, eavail;
+	int res, eavail, timed_out = 0;
 	unsigned long flags;
-	long jtimeout;
+	long slack;
 	wait_queue_t wait;
-
-	/*
-	 * Calculate the timeout by checking for the "infinite" value (-1)
-	 * and the overflow condition. The passed timeout is in milliseconds,
-	 * that why (t * HZ) / 1000.
-	 */
-	jtimeout = (timeout < 0 || timeout >= EP_MAX_MSTIMEO) ?
-		MAX_SCHEDULE_TIMEOUT : (timeout * HZ + 999) / 1000;
+	struct timespec end_time;
+	ktime_t expires, *to = NULL;
+
+	if (timeout > 0) {
+		ktime_get_ts(&end_time);
+		timespec_add_ns(&end_time, (u64)timeout * NSEC_PER_MSEC);
+		slack = select_estimate_accuracy(&end_time);
+		to = &expires;
+		*to = timespec_to_ktime(end_time);
+	} else if (timeout == 0) {
+		timed_out = 1;
+	}
 
 retry:
 	spin_lock_irqsave(&ep->lock, flags);
@@ -1149,7 +1151,7 @@ retry:
 			 * to TASK_INTERRUPTIBLE before doing the checks.
 			 */
 			set_current_state(TASK_INTERRUPTIBLE);
-			if (!list_empty(&ep->rdllist) || !jtimeout)
+			if (!list_empty(&ep->rdllist) || timed_out)
 				break;
 			if (signal_pending(current)) {
 				res = -EINTR;
@@ -1157,7 +1159,9 @@ retry:
 			}
 
 			spin_unlock_irqrestore(&ep->lock, flags);
-			jtimeout = schedule_timeout(jtimeout);
+			if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
+				timed_out = 1;
+
 			spin_lock_irqsave(&ep->lock, flags);
 		}
 		__remove_wait_queue(&ep->wq, &wait);
@@ -1175,7 +1179,7 @@ retry:
 	 * more luck.
 	 */
 	if (!res && eavail &&
-	    !(res = ep_send_events(ep, events, maxevents)) && jtimeout)
+	    !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
 		goto retry;
 
 	return res;
diff --git a/fs/exec.c b/fs/exec.c
index e19de6a8033..99d33a1371e 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -28,7 +28,6 @@
 #include <linux/mm.h>
 #include <linux/stat.h>
 #include <linux/fcntl.h>
-#include <linux/smp_lock.h>
 #include <linux/swap.h>
 #include <linux/string.h>
 #include <linux/init.h>
@@ -55,6 +54,7 @@
 #include <linux/fsnotify.h>
 #include <linux/fs_struct.h>
 #include <linux/pipe_fs_i.h>
+#include <linux/oom.h>
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
@@ -66,6 +66,12 @@ char core_pattern[CORENAME_MAX_SIZE] = "core";
 unsigned int core_pipe_limit;
 int suid_dumpable = 0;
 
+struct core_name {
+	char *corename;
+	int used, size;
+};
+static atomic_t call_count = ATOMIC_INIT(1);
+
 /* The maximal length of core_pattern is also specified in sysctl.c */
 
 static LIST_HEAD(formats);
@@ -129,7 +135,7 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)
 	if (file->f_path.mnt->mnt_flags & MNT_NOEXEC)
 		goto exit;
 
-	fsnotify_open(file->f_path.dentry);
+	fsnotify_open(file);
 
 	error = -ENOEXEC;
 	if(file->f_op) {
@@ -362,13 +368,13 @@ err:
 /*
  * count() counts the number of strings in array ARGV.
  */
-static int count(char __user * __user * argv, int max)
+static int count(const char __user * const __user * argv, int max)
 {
 	int i = 0;
 
 	if (argv != NULL) {
 		for (;;) {
-			char __user * p;
+			const char __user * p;
 
 			if (get_user(p, argv))
 				return -EFAULT;
@@ -377,6 +383,9 @@ static int count(char __user * __user * argv, int max)
 			argv++;
 			if (i++ >= max)
 				return -E2BIG;
+
+			if (fatal_signal_pending(current))
+				return -ERESTARTNOHAND;
 			cond_resched();
 		}
 	}
@@ -388,7 +397,7 @@ static int count(char __user * __user * argv, int max)
  * processes's memory to the new process's stack.  The call to get_user_pages()
  * ensures the destination page is created and not swapped out.
  */
-static int copy_strings(int argc, char __user * __user * argv,
+static int copy_strings(int argc, const char __user *const __user *argv,
 			struct linux_binprm *bprm)
 {
 	struct page *kmapped_page = NULL;
@@ -397,7 +406,7 @@ static int copy_strings(int argc, char __user * __user * argv,
 	int ret;
 
 	while (argc-- > 0) {
-		char __user *str;
+		const char __user *str;
 		int len;
 		unsigned long pos;
 
@@ -420,6 +429,12 @@ static int copy_strings(int argc, char __user * __user * argv,
 		while (len > 0) {
 			int offset, bytes_to_copy;
 
+			if (fatal_signal_pending(current)) {
+				ret = -ERESTARTNOHAND;
+				goto out;
+			}
+			cond_resched();
+
 			offset = pos % PAGE_SIZE;
 			if (offset == 0)
 				offset = PAGE_SIZE;
@@ -471,12 +486,13 @@ out:
 /*
  * Like copy_strings, but get argv and its values from kernel memory.
  */
-int copy_strings_kernel(int argc,char ** argv, struct linux_binprm *bprm)
+int copy_strings_kernel(int argc, const char *const *argv,
+			struct linux_binprm *bprm)
 {
 	int r;
 	mm_segment_t oldfs = get_fs();
 	set_fs(KERNEL_DS);
-	r = copy_strings(argc, (char __user * __user *)argv, bprm);
+	r = copy_strings(argc, (const char __user *const  __user *)argv, bprm);
 	set_fs(oldfs);
 	return r;
 }
@@ -594,6 +610,11 @@ int setup_arg_pages(struct linux_binprm *bprm,
 #else
 	stack_top = arch_align_stack(stack_top);
 	stack_top = PAGE_ALIGN(stack_top);
+
+	if (unlikely(stack_top < mmap_min_addr) ||
+	    unlikely(vma->vm_end - vma->vm_start >= stack_top - mmap_min_addr))
+		return -ENOMEM;
+
 	stack_shift = vma->vm_end - stack_top;
 
 	bprm->p -= stack_shift;
@@ -653,6 +674,7 @@ int setup_arg_pages(struct linux_binprm *bprm,
 	else
 		stack_base = vma->vm_start - stack_expand;
 #endif
+	current->mm->start_stack = bprm->p;
 	ret = expand_stack(vma, stack_base);
 	if (ret)
 		ret = -EFAULT;
@@ -683,7 +705,7 @@ struct file *open_exec(const char *name)
 	if (file->f_path.mnt->mnt_flags & MNT_NOEXEC)
 		goto exit;
 
-	fsnotify_open(file->f_path.dentry);
+	fsnotify_open(file);
 
 	err = deny_write_access(file);
 	if (err)
@@ -744,6 +766,10 @@ static int exec_mmap(struct mm_struct *mm)
 	tsk->mm = mm;
 	tsk->active_mm = mm;
 	activate_mm(active_mm, mm);
+	if (old_mm && tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
+		atomic_dec(&old_mm->oom_disable_count);
+		atomic_inc(&tsk->mm->oom_disable_count);
+	}
 	task_unlock(tsk);
 	arch_pick_mmap_layout(mm);
 	if (old_mm) {
@@ -983,7 +1009,7 @@ int flush_old_exec(struct linux_binprm * bprm)
 
 	bprm->mm = NULL;		/* We're using it now */
 
-	current->flags &= ~PF_RANDOMIZE;
+	current->flags &= ~(PF_RANDOMIZE | PF_KTHREAD);
 	flush_thread();
 	current->personality &= ~bprm->per_clear;
 
@@ -997,7 +1023,7 @@ EXPORT_SYMBOL(flush_old_exec);
 void setup_new_exec(struct linux_binprm * bprm)
 {
 	int i, ch;
-	char * name;
+	const char *name;
 	char tcomm[sizeof(current->comm)];
 
 	arch_pick_mmap_layout(current->mm);
@@ -1063,14 +1089,14 @@ EXPORT_SYMBOL(setup_new_exec);
  */
 int prepare_bprm_creds(struct linux_binprm *bprm)
 {
-	if (mutex_lock_interruptible(&current->cred_guard_mutex))
+	if (mutex_lock_interruptible(&current->signal->cred_guard_mutex))
 		return -ERESTARTNOINTR;
 
 	bprm->cred = prepare_exec_creds();
 	if (likely(bprm->cred))
 		return 0;
 
-	mutex_unlock(&current->cred_guard_mutex);
+	mutex_unlock(&current->signal->cred_guard_mutex);
 	return -ENOMEM;
 }
 
@@ -1078,7 +1104,7 @@ void free_bprm(struct linux_binprm *bprm)
 {
 	free_arg_pages(bprm);
 	if (bprm->cred) {
-		mutex_unlock(&current->cred_guard_mutex);
+		mutex_unlock(&current->signal->cred_guard_mutex);
 		abort_creds(bprm->cred);
 	}
 	kfree(bprm);
@@ -1099,13 +1125,13 @@ void install_exec_creds(struct linux_binprm *bprm)
 	 * credentials; any time after this it may be unlocked.
 	 */
 	security_bprm_committed_creds(bprm);
-	mutex_unlock(&current->cred_guard_mutex);
+	mutex_unlock(&current->signal->cred_guard_mutex);
 }
 EXPORT_SYMBOL(install_exec_creds);
 
 /*
  * determine how safe it is to execute the proposed program
- * - the caller must hold current->cred_guard_mutex to protect against
+ * - the caller must hold ->cred_guard_mutex to protect against
  *   PTRACE_ATTACH
  */
 int check_unsafe_exec(struct linux_binprm *bprm)
@@ -1117,7 +1143,7 @@ int check_unsafe_exec(struct linux_binprm *bprm)
 	bprm->unsafe = tracehook_unsafe_exec(p);
 
 	n_fs = 1;
-	write_lock(&p->fs->lock);
+	spin_lock(&p->fs->lock);
 	rcu_read_lock();
 	for (t = next_thread(p); t != p; t = next_thread(t)) {
 		if (t->fs == p->fs)
@@ -1134,7 +1160,7 @@ int check_unsafe_exec(struct linux_binprm *bprm)
 			res = 1;
 		}
 	}
-	write_unlock(&p->fs->lock);
+	spin_unlock(&p->fs->lock);
 
 	return res;
 }
@@ -1316,9 +1342,9 @@ EXPORT_SYMBOL(search_binary_handler);
 /*
  * sys_execve() executes a new program.
  */
-int do_execve(char * filename,
-	char __user *__user *argv,
-	char __user *__user *envp,
+int do_execve(const char * filename,
+	const char __user *const __user *argv,
+	const char __user *const __user *envp,
 	struct pt_regs * regs)
 {
 	struct linux_binprm *bprm;
@@ -1386,7 +1412,6 @@ int do_execve(char * filename,
 	if (retval < 0)
 		goto out;
 
-	current->flags &= ~PF_KTHREAD;
 	retval = search_binary_handler(bprm,regs);
 	if (retval < 0)
 		goto out;
@@ -1439,127 +1464,148 @@ void set_binfmt(struct linux_binfmt *new)
 
 EXPORT_SYMBOL(set_binfmt);
 
+static int expand_corename(struct core_name *cn)
+{
+	char *old_corename = cn->corename;
+
+	cn->size = CORENAME_MAX_SIZE * atomic_inc_return(&call_count);
+	cn->corename = krealloc(old_corename, cn->size, GFP_KERNEL);
+
+	if (!cn->corename) {
+		kfree(old_corename);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static int cn_printf(struct core_name *cn, const char *fmt, ...)
+{
+	char *cur;
+	int need;
+	int ret;
+	va_list arg;
+
+	va_start(arg, fmt);
+	need = vsnprintf(NULL, 0, fmt, arg);
+	va_end(arg);
+
+	if (likely(need < cn->size - cn->used - 1))
+		goto out_printf;
+
+	ret = expand_corename(cn);
+	if (ret)
+		goto expand_fail;
+
+out_printf:
+	cur = cn->corename + cn->used;
+	va_start(arg, fmt);
+	vsnprintf(cur, need + 1, fmt, arg);
+	va_end(arg);
+	cn->used += need;
+	return 0;
+
+expand_fail:
+	return ret;
+}
+
 /* format_corename will inspect the pattern parameter, and output a
  * name into corename, which must have space for at least
  * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator.
  */
-static int format_corename(char *corename, long signr)
+static int format_corename(struct core_name *cn, long signr)
 {
 	const struct cred *cred = current_cred();
 	const char *pat_ptr = core_pattern;
 	int ispipe = (*pat_ptr == '|');
-	char *out_ptr = corename;
-	char *const out_end = corename + CORENAME_MAX_SIZE;
-	int rc;
 	int pid_in_pattern = 0;
+	int err = 0;
+
+	cn->size = CORENAME_MAX_SIZE * atomic_read(&call_count);
+	cn->corename = kmalloc(cn->size, GFP_KERNEL);
+	cn->used = 0;
+
+	if (!cn->corename)
+		return -ENOMEM;
 
 	/* Repeat as long as we have more pattern to process and more output
 	   space */
 	while (*pat_ptr) {
 		if (*pat_ptr != '%') {
-			if (out_ptr == out_end)
+			if (*pat_ptr == 0)
 				goto out;
-			*out_ptr++ = *pat_ptr++;
+			err = cn_printf(cn, "%c", *pat_ptr++);
 		} else {
 			switch (*++pat_ptr) {
+			/* single % at the end, drop that */
 			case 0:
 				goto out;
 			/* Double percent, output one percent */
 			case '%':
-				if (out_ptr == out_end)
-					goto out;
-				*out_ptr++ = '%';
+				err = cn_printf(cn, "%c", '%');
 				break;
 			/* pid */
 			case 'p':
 				pid_in_pattern = 1;
-				rc = snprintf(out_ptr, out_end - out_ptr,
-					      "%d", task_tgid_vnr(current));
-				if (rc > out_end - out_ptr)
-					goto out;
-				out_ptr += rc;
+				err = cn_printf(cn, "%d",
+					      task_tgid_vnr(current));
 				break;
 			/* uid */
 			case 'u':
-				rc = snprintf(out_ptr, out_end - out_ptr,
-					      "%d", cred->uid);
-				if (rc > out_end - out_ptr)
-					goto out;
-				out_ptr += rc;
+				err = cn_printf(cn, "%d", cred->uid);
 				break;
 			/* gid */
 			case 'g':
-				rc = snprintf(out_ptr, out_end - out_ptr,
-					      "%d", cred->gid);
-				if (rc > out_end - out_ptr)
-					goto out;
-				out_ptr += rc;
+				err = cn_printf(cn, "%d", cred->gid);
 				break;
 			/* signal that caused the coredump */
 			case 's':
-				rc = snprintf(out_ptr, out_end - out_ptr,
-					      "%ld", signr);
-				if (rc > out_end - out_ptr)
-					goto out;
-				out_ptr += rc;
+				err = cn_printf(cn, "%ld", signr);
 				break;
 			/* UNIX time of coredump */
 			case 't': {
 				struct timeval tv;
 				do_gettimeofday(&tv);
-				rc = snprintf(out_ptr, out_end - out_ptr,
-					      "%lu", tv.tv_sec);
-				if (rc > out_end - out_ptr)
-					goto out;
-				out_ptr += rc;
+				err = cn_printf(cn, "%lu", tv.tv_sec);
 				break;
 			}
 			/* hostname */
 			case 'h':
 				down_read(&uts_sem);
-				rc = snprintf(out_ptr, out_end - out_ptr,
-					      "%s", utsname()->nodename);
+				err = cn_printf(cn, "%s",
+					      utsname()->nodename);
 				up_read(&uts_sem);
-				if (rc > out_end - out_ptr)
-					goto out;
-				out_ptr += rc;
 				break;
 			/* executable */
 			case 'e':
-				rc = snprintf(out_ptr, out_end - out_ptr,
-					      "%s", current->comm);
-				if (rc > out_end - out_ptr)
-					goto out;
-				out_ptr += rc;
+				err = cn_printf(cn, "%s", current->comm);
 				break;
 			/* core limit size */
 			case 'c':
-				rc = snprintf(out_ptr, out_end - out_ptr,
-					      "%lu", rlimit(RLIMIT_CORE));
-				if (rc > out_end - out_ptr)
-					goto out;
-				out_ptr += rc;
+				err = cn_printf(cn, "%lu",
+					      rlimit(RLIMIT_CORE));
 				break;
 			default:
 				break;
 			}
 			++pat_ptr;
 		}
+
+		if (err)
+			return err;
 	}
+
 	/* Backward compatibility with core_uses_pid:
 	 *
 	 * If core_pattern does not include a %p (as is the default)
 	 * and core_uses_pid is set, then .%pid will be appended to
 	 * the filename. Do not do this for piped commands. */
 	if (!ispipe && !pid_in_pattern && core_uses_pid) {
-		rc = snprintf(out_ptr, out_end - out_ptr,
-			      ".%d", task_tgid_vnr(current));
-		if (rc > out_end - out_ptr)
-			goto out;
-		out_ptr += rc;
+		err = cn_printf(cn, ".%d", task_tgid_vnr(current));
+		if (err)
+			return err;
 	}
 out:
-	*out_ptr = 0;
 	return ispipe;
 }
 
@@ -1836,7 +1882,7 @@ static int umh_pipe_setup(struct subprocess_info *info)
 void do_coredump(long signr, int exit_code, struct pt_regs *regs)
 {
 	struct core_state core_state;
-	char corename[CORENAME_MAX_SIZE + 1];
+	struct core_name cn;
 	struct mm_struct *mm = current->mm;
 	struct linux_binfmt * binfmt;
 	const struct cred *old_cred;
@@ -1891,13 +1937,13 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
 	 */
 	clear_thread_flag(TIF_SIGPENDING);
 
-	/*
-	 * lock_kernel() because format_corename() is controlled by sysctl, which
-	 * uses lock_kernel()
-	 */
- 	lock_kernel();
-	ispipe = format_corename(corename, signr);
-	unlock_kernel();
+	ispipe = format_corename(&cn, signr);
+
+	if (ispipe == -ENOMEM) {
+		printk(KERN_WARNING "format_corename failed\n");
+		printk(KERN_WARNING "Aborting core\n");
+		goto fail_corename;
+	}
 
  	if (ispipe) {
 		int dump_count;
@@ -1934,7 +1980,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
 			goto fail_dropcount;
 		}
 
-		helper_argv = argv_split(GFP_KERNEL, corename+1, NULL);
+		helper_argv = argv_split(GFP_KERNEL, cn.corename+1, NULL);
 		if (!helper_argv) {
 			printk(KERN_WARNING "%s failed to allocate memory\n",
 			       __func__);
@@ -1947,7 +1993,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
 		argv_free(helper_argv);
 		if (retval) {
  			printk(KERN_INFO "Core dump to %s pipe failed\n",
-			       corename);
+			       cn.corename);
 			goto close_fail;
  		}
 	} else {
@@ -1956,7 +2002,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
 		if (cprm.limit < binfmt->min_coredump)
 			goto fail_unlock;
 
-		cprm.file = filp_open(corename,
+		cprm.file = filp_open(cn.corename,
 				 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
 				 0600);
 		if (IS_ERR(cprm.file))
@@ -1998,6 +2044,8 @@ fail_dropcount:
 	if (ispipe)
 		atomic_dec(&core_dump_count);
 fail_unlock:
+	kfree(cn.corename);
+fail_corename:
 	coredump_finish(mm);
 	revert_creds(old_cred);
 fail_creds:
@@ -2005,3 +2053,43 @@ fail_creds:
 fail:
 	return;
 }
+
+/*
+ * Core dumping helper functions.  These are the only things you should
+ * do on a core-file: use only these functions to write out all the
+ * necessary info.
+ */
+int dump_write(struct file *file, const void *addr, int nr)
+{
+	return access_ok(VERIFY_READ, addr, nr) && file->f_op->write(file, addr, nr, &file->f_pos) == nr;
+}
+EXPORT_SYMBOL(dump_write);
+
+int dump_seek(struct file *file, loff_t off)
+{
+	int ret = 1;
+
+	if (file->f_op->llseek && file->f_op->llseek != no_llseek) {
+		if (file->f_op->llseek(file, off, SEEK_CUR) < 0)
+			return 0;
+	} else {
+		char *buf = (char *)get_zeroed_page(GFP_KERNEL);
+
+		if (!buf)
+			return 0;
+		while (off > 0) {
+			unsigned long n = off;
+
+			if (n > PAGE_SIZE)
+				n = PAGE_SIZE;
+			if (!dump_write(file, buf, n)) {
+				ret = 0;
+				break;
+			}
+			off -= n;
+		}
+		free_page((unsigned long)buf);
+	}
+	return ret;
+}
+EXPORT_SYMBOL(dump_seek);
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
index d91e9d829bc..dcc941d82d6 100644
--- a/fs/exofs/dir.c
+++ b/fs/exofs/dir.c
@@ -420,7 +420,7 @@ int exofs_set_link(struct inode *dir, struct exofs_dir_entry *de,
 	err = exofs_write_begin(NULL, page->mapping, pos, len,
 				AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
 	if (err)
-		EXOFS_ERR("exofs_set_link: exofs_write_begin FAILD => %d\n",
+		EXOFS_ERR("exofs_set_link: exofs_write_begin FAILED => %d\n",
 			  err);
 
 	de->inode_no = cpu_to_le64(inode->i_ino);
@@ -556,7 +556,7 @@ int exofs_delete_entry(struct exofs_dir_entry *dir, struct page *page)
 	err = exofs_write_begin(NULL, page->mapping, pos, to - from, 0,
 							&page, NULL);
 	if (err)
-		EXOFS_ERR("exofs_delete_entry: exofs_write_begin FAILD => %d\n",
+		EXOFS_ERR("exofs_delete_entry: exofs_write_begin FAILED => %d\n",
 			  err);
 	if (pde)
 		pde->rec_len = cpu_to_le16(to - from);
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index 22721b2fd89..2dc925fa101 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -256,7 +256,6 @@ static inline int exofs_oi_read(struct exofs_i_info *oi,
 }
 
 /* inode.c               */
-void exofs_truncate(struct inode *inode);
 int exofs_setattr(struct dentry *, struct iattr *);
 int exofs_write_begin(struct file *file, struct address_space *mapping,
 		loff_t pos, unsigned len, unsigned flags,
@@ -264,7 +263,7 @@ int exofs_write_begin(struct file *file, struct address_space *mapping,
 extern struct inode *exofs_iget(struct super_block *, unsigned long);
 struct inode *exofs_new_inode(struct inode *, int);
 extern int exofs_write_inode(struct inode *, struct writeback_control *wbc);
-extern void exofs_delete_inode(struct inode *);
+extern void exofs_evict_inode(struct inode *);
 
 /* dir.c:                */
 int exofs_add_link(struct dentry *, struct inode *);
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
index fef6899be39..b905c79b4f0 100644
--- a/fs/exofs/file.c
+++ b/fs/exofs/file.c
@@ -30,9 +30,6 @@
  * along with exofs; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
-
-#include <linux/buffer_head.h>
-
 #include "exofs.h"
 
 static int exofs_release_file(struct inode *inode, struct file *filp)
@@ -40,19 +37,23 @@ static int exofs_release_file(struct inode *inode, struct file *filp)
 	return 0;
 }
 
+/* exofs_file_fsync - flush the inode to disk
+ *
+ *   Note, in exofs all metadata is written as part of inode, regardless.
+ *   The writeout is synchronous
+ */
 static int exofs_file_fsync(struct file *filp, int datasync)
 {
 	int ret;
-	struct address_space *mapping = filp->f_mapping;
-	struct inode *inode = mapping->host;
+	struct inode *inode = filp->f_mapping->host;
 	struct super_block *sb;
 
-	ret = filemap_write_and_wait(mapping);
-	if (ret)
-		return ret;
+	if (!(inode->i_state & I_DIRTY))
+		return 0;
+	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
+		return 0;
 
-	/* sync the inode attributes */
-	ret = write_inode_now(inode, 1);
+	ret = sync_inode_metadata(inode, 1);
 
 	/* This is a good place to write the sb */
 	/* TODO: Sechedule an sb-sync on create */
@@ -65,9 +66,9 @@ static int exofs_file_fsync(struct file *filp, int datasync)
 
 static int exofs_flush(struct file *file, fl_owner_t id)
 {
-	exofs_file_fsync(file, 1);
+	int ret = vfs_fsync(file, 0);
 	/* TODO: Flush the OSD target */
-	return 0;
+	return ret;
 }
 
 const struct file_operations exofs_file_operations = {
@@ -86,6 +87,5 @@ const struct file_operations exofs_file_operations = {
 };
 
 const struct inode_operations exofs_file_inode_operations = {
-	.truncate	= exofs_truncate,
 	.setattr	= exofs_setattr,
 };
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 4bb6ef822e4..42685424817 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -32,9 +32,6 @@
  */
 
 #include <linux/slab.h>
-#include <linux/writeback.h>
-#include <linux/buffer_head.h>
-#include <scsi/scsi_device.h>
 
 #include "exofs.h"
 
@@ -57,6 +54,9 @@ struct page_collect {
 	unsigned nr_pages;
 	unsigned long length;
 	loff_t pg_first; /* keep 64bit also in 32-arches */
+	bool read_4_write; /* This means two things: that the read is sync
+			    * And the pages should not be unlocked.
+			    */
 };
 
 static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
@@ -74,6 +74,7 @@ static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
 	pcol->nr_pages = 0;
 	pcol->length = 0;
 	pcol->pg_first = -1;
+	pcol->read_4_write = false;
 }
 
 static void _pcol_reset(struct page_collect *pcol)
@@ -184,7 +185,7 @@ static void update_write_page(struct page *page, int ret)
 /* Called at the end of reads, to optionally unlock pages and update their
  * status.
  */
-static int __readpages_done(struct page_collect *pcol, bool do_unlock)
+static int __readpages_done(struct page_collect *pcol)
 {
 	int i;
 	u64 resid;
@@ -220,7 +221,7 @@ static int __readpages_done(struct page_collect *pcol, bool do_unlock)
 			  page_stat ? "bad_bytes" : "good_bytes");
 
 		ret = update_read_page(page, page_stat);
-		if (do_unlock)
+		if (!pcol->read_4_write)
 			unlock_page(page);
 		length += PAGE_SIZE;
 	}
@@ -235,7 +236,7 @@ static void readpages_done(struct exofs_io_state *ios, void *p)
 {
 	struct page_collect *pcol = p;
 
-	__readpages_done(pcol, true);
+	__readpages_done(pcol);
 	atomic_dec(&pcol->sbi->s_curr_pending);
 	kfree(pcol);
 }
@@ -256,7 +257,7 @@ static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw)
 	}
 }
 
-static int read_exec(struct page_collect *pcol, bool is_sync)
+static int read_exec(struct page_collect *pcol)
 {
 	struct exofs_i_info *oi = exofs_i(pcol->inode);
 	struct exofs_io_state *ios = pcol->ios;
@@ -266,17 +267,14 @@ static int read_exec(struct page_collect *pcol, bool is_sync)
 	if (!pcol->pages)
 		return 0;
 
-	/* see comment in _readpage() about sync reads */
-	WARN_ON(is_sync && (pcol->nr_pages != 1));
-
 	ios->pages = pcol->pages;
 	ios->nr_pages = pcol->nr_pages;
 	ios->length = pcol->length;
 	ios->offset = pcol->pg_first << PAGE_CACHE_SHIFT;
 
-	if (is_sync) {
+	if (pcol->read_4_write) {
 		exofs_oi_read(oi, pcol->ios);
-		return __readpages_done(pcol, false);
+		return __readpages_done(pcol);
 	}
 
 	pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
@@ -302,7 +300,7 @@ static int read_exec(struct page_collect *pcol, bool is_sync)
 	return 0;
 
 err:
-	if (!is_sync)
+	if (!pcol->read_4_write)
 		_unlock_pcol_pages(pcol, ret, READ);
 
 	pcol_free(pcol);
@@ -350,11 +348,12 @@ static int readpage_strip(void *data, struct page *page)
 		if (PageError(page))
 			ClearPageError(page);
 
-		unlock_page(page);
+		if (!pcol->read_4_write)
+			unlock_page(page);
 		EXOFS_DBGMSG("readpage_strip(0x%lx, 0x%lx) empty page,"
 			     " splitting\n", inode->i_ino, page->index);
 
-		return read_exec(pcol, false);
+		return read_exec(pcol);
 	}
 
 try_again:
@@ -364,7 +363,7 @@ try_again:
 	} else if (unlikely((pcol->pg_first + pcol->nr_pages) !=
 		   page->index)) {
 		/* Discontinuity detected, split the request */
-		ret = read_exec(pcol, false);
+		ret = read_exec(pcol);
 		if (unlikely(ret))
 			goto fail;
 		goto try_again;
@@ -389,7 +388,7 @@ try_again:
 			  page, len, pcol->nr_pages, pcol->length);
 
 		/* split the request, and start again with current page */
-		ret = read_exec(pcol, false);
+		ret = read_exec(pcol);
 		if (unlikely(ret))
 			goto fail;
 
@@ -418,26 +417,24 @@ static int exofs_readpages(struct file *file, struct address_space *mapping,
 		return ret;
 	}
 
-	return read_exec(&pcol, false);
+	return read_exec(&pcol);
 }
 
-static int _readpage(struct page *page, bool is_sync)
+static int _readpage(struct page *page, bool read_4_write)
 {
 	struct page_collect pcol;
 	int ret;
 
 	_pcol_init(&pcol, 1, page->mapping->host);
 
-	/* readpage_strip might call read_exec(,is_sync==false) at several
-	 * places but not if we have a single page.
-	 */
+	pcol.read_4_write = read_4_write;
 	ret = readpage_strip(&pcol, page);
 	if (ret) {
 		EXOFS_ERR("_readpage => %d\n", ret);
 		return ret;
 	}
 
-	return read_exec(&pcol, is_sync);
+	return read_exec(&pcol);
 }
 
 /*
@@ -508,7 +505,7 @@ static int write_exec(struct page_collect *pcol)
 
 	pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
 	if (!pcol_copy) {
-		EXOFS_ERR("write_exec: Faild to kmalloc(pcol)\n");
+		EXOFS_ERR("write_exec: Failed to kmalloc(pcol)\n");
 		ret = -ENOMEM;
 		goto err;
 	}
@@ -524,7 +521,7 @@ static int write_exec(struct page_collect *pcol)
 
 	ret = exofs_oi_write(oi, ios);
 	if (unlikely(ret)) {
-		EXOFS_ERR("write_exec: exofs_oi_write() Faild\n");
+		EXOFS_ERR("write_exec: exofs_oi_write() Failed\n");
 		goto err;
 	}
 
@@ -625,7 +622,7 @@ try_again:
 		/* split the request, next loop will start again */
 		ret = write_exec(pcol);
 		if (unlikely(ret)) {
-			EXOFS_DBGMSG("write_exec faild => %d", ret);
+			EXOFS_DBGMSG("write_exec failed => %d", ret);
 			goto fail;
 		}
 
@@ -697,6 +694,13 @@ static int exofs_writepage(struct page *page, struct writeback_control *wbc)
 	return write_exec(&pcol);
 }
 
+/* i_mutex held using inode->i_size directly */
+static void _write_failed(struct inode *inode, loff_t to)
+{
+	if (to > inode->i_size)
+		truncate_pagecache(inode, to, inode->i_size);
+}
+
 int exofs_write_begin(struct file *file, struct address_space *mapping,
 		loff_t pos, unsigned len, unsigned flags,
 		struct page **pagep, void **fsdata)
@@ -709,8 +713,8 @@ int exofs_write_begin(struct file *file, struct address_space *mapping,
 		ret = simple_write_begin(file, mapping, pos, len, flags, pagep,
 					 fsdata);
 		if (ret) {
-			EXOFS_DBGMSG("simple_write_begin faild\n");
-			return ret;
+			EXOFS_DBGMSG("simple_write_begin failed\n");
+			goto out;
 		}
 
 		page = *pagep;
@@ -722,9 +726,12 @@ int exofs_write_begin(struct file *file, struct address_space *mapping,
 		if (ret) {
 			/*SetPageError was done by _readpage. Is it ok?*/
 			unlock_page(page);
-			EXOFS_DBGMSG("__readpage_filler faild\n");
+			EXOFS_DBGMSG("__readpage_filler failed\n");
 		}
 	}
+out:
+	if (unlikely(ret))
+		_write_failed(mapping->host, pos + len);
 
 	return ret;
 }
@@ -750,6 +757,10 @@ static int exofs_write_end(struct file *file, struct address_space *mapping,
 	int ret;
 
 	ret = simple_write_end(file, mapping,pos, len, copied, page, fsdata);
+	if (unlikely(ret))
+		_write_failed(inode, pos + len);
+
+	/* TODO: once simple_write_end marks inode dirty remove */
 	if (i_size != inode->i_size)
 		mark_inode_dirty(inode);
 	return ret;
@@ -759,15 +770,13 @@ static int exofs_releasepage(struct page *page, gfp_t gfp)
 {
 	EXOFS_DBGMSG("page 0x%lx\n", page->index);
 	WARN_ON(1);
-	return try_to_free_buffers(page);
+	return 0;
 }
 
 static void exofs_invalidatepage(struct page *page, unsigned long offset)
 {
-	EXOFS_DBGMSG("page_has_buffers=>%d\n", page_has_buffers(page));
+	EXOFS_DBGMSG("page 0x%lx offset 0x%lx\n", page->index, offset);
 	WARN_ON(1);
-
-	block_invalidatepage(page, offset);
 }
 
 const struct address_space_operations exofs_aops = {
@@ -808,87 +817,55 @@ static inline int exofs_inode_is_fast_symlink(struct inode *inode)
 	return S_ISLNK(inode->i_mode) && (oi->i_data[0] != 0);
 }
 
-/*
- * get_block_t - Fill in a buffer_head
- * An OSD takes care of block allocation so we just fake an allocation by
- * putting in the inode's sector_t in the buffer_head.
- * TODO: What about the case of create==0 and @iblock does not exist in the
- * object?
- */
-static int exofs_get_block(struct inode *inode, sector_t iblock,
-		    struct buffer_head *bh_result, int create)
-{
-	map_bh(bh_result, inode->i_sb, iblock);
-	return 0;
-}
-
 const struct osd_attr g_attr_logical_length = ATTR_DEF(
 	OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8);
 
-static int _do_truncate(struct inode *inode)
+static int _do_truncate(struct inode *inode, loff_t newsize)
 {
 	struct exofs_i_info *oi = exofs_i(inode);
-	loff_t isize = i_size_read(inode);
 	int ret;
 
 	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 
-	nobh_truncate_page(inode->i_mapping, isize, exofs_get_block);
+	ret = exofs_oi_truncate(oi, (u64)newsize);
+	if (likely(!ret))
+		truncate_setsize(inode, newsize);
 
-	ret = exofs_oi_truncate(oi, (u64)isize);
-	EXOFS_DBGMSG("(0x%lx) size=0x%llx\n", inode->i_ino, isize);
+	EXOFS_DBGMSG("(0x%lx) size=0x%llx ret=>%d\n",
+		     inode->i_ino, newsize, ret);
 	return ret;
 }
 
 /*
- * Truncate a file to the specified size - all we have to do is set the size
- * attribute.  We make sure the object exists first.
- */
-void exofs_truncate(struct inode *inode)
-{
-	struct exofs_i_info *oi = exofs_i(inode);
-	int ret;
-
-	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)
-	     || S_ISLNK(inode->i_mode)))
-		return;
-	if (exofs_inode_is_fast_symlink(inode))
-		return;
-	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
-		return;
-
-	/* if we are about to truncate an object, and it hasn't been
-	 * created yet, wait
-	 */
-	if (unlikely(wait_obj_created(oi)))
-		goto fail;
-
-	ret = _do_truncate(inode);
-	if (ret)
-		goto fail;
-
-out:
-	mark_inode_dirty(inode);
-	return;
-fail:
-	make_bad_inode(inode);
-	goto out;
-}
-
-/*
- * Set inode attributes - just call generic functions.
+ * Set inode attributes - update size attribute on OSD if needed,
+ *                        otherwise just call generic functions.
  */
 int exofs_setattr(struct dentry *dentry, struct iattr *iattr)
 {
 	struct inode *inode = dentry->d_inode;
 	int error;
 
+	/* if we are about to modify an object, and it hasn't been
+	 * created yet, wait
+	 */
+	error = wait_obj_created(exofs_i(inode));
+	if (unlikely(error))
+		return error;
+
 	error = inode_change_ok(inode, iattr);
-	if (error)
+	if (unlikely(error))
 		return error;
 
-	error = inode_setattr(inode, iattr);
-	return error;
+	if ((iattr->ia_valid & ATTR_SIZE) &&
+	    iattr->ia_size != i_size_read(inode)) {
+		error = _do_truncate(inode, iattr->ia_size);
+		if (unlikely(error))
+			return error;
+	}
+
+	setattr_copy(inode, iattr);
+	mark_inode_dirty(inode);
+	return 0;
 }
 
 static const struct osd_attr g_attr_inode_file_layout = ATTR_DEF(
@@ -1053,6 +1030,7 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
 		memcpy(oi->i_data, fcb.i_data, sizeof(fcb.i_data));
 	}
 
+	inode->i_mapping->backing_dev_info = sb->s_bdi;
 	if (S_ISREG(inode->i_mode)) {
 		inode->i_op = &exofs_file_inode_operations;
 		inode->i_fop = &exofs_file_operations;
@@ -1089,8 +1067,10 @@ bad_inode:
 int __exofs_wait_obj_created(struct exofs_i_info *oi)
 {
 	if (!obj_created(oi)) {
+		EXOFS_DBGMSG("!obj_created\n");
 		BUG_ON(!obj_2bcreated(oi));
 		wait_event(oi->i_wq, obj_created(oi));
+		EXOFS_DBGMSG("wait_event done\n");
 	}
 	return unlikely(is_bad_inode(&oi->vfs_inode)) ? -EIO : 0;
 }
@@ -1112,7 +1092,7 @@ static void create_done(struct exofs_io_state *ios, void *p)
 	atomic_dec(&sbi->s_curr_pending);
 
 	if (unlikely(ret)) {
-		EXOFS_ERR("object=0x%llx creation faild in pid=0x%llx",
+		EXOFS_ERR("object=0x%llx creation failed in pid=0x%llx",
 			  _LLU(exofs_oi_objno(oi)), _LLU(sbi->layout.s_pid));
 		/*TODO: When FS is corrupted creation can fail, object already
 		 * exist. Get rid of this asynchronous creation, if exist
@@ -1124,7 +1104,6 @@ static void create_done(struct exofs_io_state *ios, void *p)
 
 	set_obj_created(oi);
 
-	atomic_dec(&inode->i_count);
 	wake_up(&oi->i_wq);
 }
 
@@ -1152,6 +1131,7 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
 
 	sbi = sb->s_fs_info;
 
+	inode->i_mapping->backing_dev_info = sb->s_bdi;
 	sb->s_dirt = 1;
 	inode_init_owner(inode, dir, mode);
 	inode->i_ino = sbi->s_nextid++;
@@ -1174,17 +1154,11 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
 	ios->obj.id = exofs_oi_objno(oi);
 	exofs_make_credential(oi->i_cred, &ios->obj);
 
-	/* increment the refcount so that the inode will still be around when we
-	 * reach the callback
-	 */
-	atomic_inc(&inode->i_count);
-
 	ios->done = create_done;
 	ios->private = inode;
 	ios->cred = oi->i_cred;
 	ret = exofs_sbi_create(ios);
 	if (ret) {
-		atomic_dec(&inode->i_count);
 		exofs_put_io_state(ios);
 		return ERR_PTR(ret);
 	}
@@ -1232,7 +1206,7 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
 
 	args = kzalloc(sizeof(*args), GFP_KERNEL);
 	if (!args) {
-		EXOFS_DBGMSG("Faild kzalloc of args\n");
+		EXOFS_DBGMSG("Failed kzalloc of args\n");
 		return -ENOMEM;
 	}
 
@@ -1274,12 +1248,7 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
 	ios->out_attr_len = 1;
 	ios->out_attr = &attr;
 
-	if (!obj_created(oi)) {
-		EXOFS_DBGMSG("!obj_created\n");
-		BUG_ON(!obj_2bcreated(oi));
-		wait_event(oi->i_wq, obj_created(oi));
-		EXOFS_DBGMSG("wait_event done\n");
-	}
+	wait_obj_created(oi);
 
 	if (!do_sync) {
 		args->sbi = sbi;
@@ -1325,7 +1294,7 @@ static void delete_done(struct exofs_io_state *ios, void *p)
  * from the OSD here.  We make sure the object was created before we try and
  * delete it.
  */
-void exofs_delete_inode(struct inode *inode)
+void exofs_evict_inode(struct inode *inode)
 {
 	struct exofs_i_info *oi = exofs_i(inode);
 	struct super_block *sb = inode->i_sb;
@@ -1335,30 +1304,27 @@ void exofs_delete_inode(struct inode *inode)
 
 	truncate_inode_pages(&inode->i_data, 0);
 
-	if (is_bad_inode(inode))
+	/* TODO: should do better here */
+	if (inode->i_nlink || is_bad_inode(inode))
 		goto no_delete;
 
-	mark_inode_dirty(inode);
-	exofs_update_inode(inode, inode_needs_sync(inode));
-
 	inode->i_size = 0;
-	if (inode->i_blocks)
-		exofs_truncate(inode);
+	end_writeback(inode);
 
-	clear_inode(inode);
+	/* if we are deleting an obj that hasn't been created yet, wait.
+	 * This also makes sure that create_done cannot be called with an
+	 * already evicted inode.
+	 */
+	wait_obj_created(oi);
+	/* ignore the error, attempt a remove anyway */
 
+	/* Now Remove the OSD objects */
 	ret = exofs_get_io_state(&sbi->layout, &ios);
 	if (unlikely(ret)) {
 		EXOFS_ERR("%s: exofs_get_io_state failed\n", __func__);
 		return;
 	}
 
-	/* if we are deleting an obj that hasn't been created yet, wait */
-	if (!obj_created(oi)) {
-		BUG_ON(!obj_2bcreated(oi));
-		wait_event(oi->i_wq, obj_created(oi));
-	}
-
 	ios->obj.id = exofs_oi_objno(oi);
 	ios->done = delete_done;
 	ios->private = sbi;
@@ -1374,5 +1340,5 @@ void exofs_delete_inode(struct inode *inode)
 	return;
 
 no_delete:
-	clear_inode(inode);
+	end_writeback(inode);
 }
diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c
index 4337cad7777..f74a2ec027a 100644
--- a/fs/exofs/ios.c
+++ b/fs/exofs/ios.c
@@ -55,7 +55,7 @@ int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj,
 
 	ret = osd_finalize_request(or, 0, cred, NULL);
 	if (unlikely(ret)) {
-		EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", ret);
+		EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n", ret);
 		goto out;
 	}
 
@@ -79,7 +79,7 @@ int exofs_get_io_state(struct exofs_layout *layout,
 	 */
 	ios = kzalloc(exofs_io_state_size(layout->s_numdevs), GFP_KERNEL);
 	if (unlikely(!ios)) {
-		EXOFS_DBGMSG("Faild kzalloc bytes=%d\n",
+		EXOFS_DBGMSG("Failed kzalloc bytes=%d\n",
 			     exofs_io_state_size(layout->s_numdevs));
 		*pios = NULL;
 		return -ENOMEM;
@@ -172,7 +172,7 @@ static int exofs_io_execute(struct exofs_io_state *ios)
 
 		ret = osd_finalize_request(or, 0, ios->cred, NULL);
 		if (unlikely(ret)) {
-			EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n",
+			EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n",
 				     ret);
 			return ret;
 		}
@@ -305,8 +305,6 @@ int exofs_check_io(struct exofs_io_state *ios, u64 *resid)
 struct _striping_info {
 	u64 obj_offset;
 	u64 group_length;
-	u64 total_group_length;
-	u64 Major;
 	unsigned dev;
 	unsigned unit_off;
 };
@@ -343,8 +341,6 @@ static void _calc_stripe_info(struct exofs_io_state *ios, u64 file_offset,
 				  (M * group_depth * stripe_unit);
 
 	si->group_length = T - H;
-	si->total_group_length = T;
-	si->Major = M;
 }
 
 static int _add_stripe_unit(struct exofs_io_state *ios,  unsigned *cur_pg,
@@ -365,7 +361,7 @@ static int _add_stripe_unit(struct exofs_io_state *ios,  unsigned *cur_pg,
 
 		per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size);
 		if (unlikely(!per_dev->bio)) {
-			EXOFS_DBGMSG("Faild to allocate BIO size=%u\n",
+			EXOFS_DBGMSG("Failed to allocate BIO size=%u\n",
 				     bio_size);
 			return -ENOMEM;
 		}
@@ -392,20 +388,19 @@ static int _add_stripe_unit(struct exofs_io_state *ios,  unsigned *cur_pg,
 }
 
 static int _prepare_one_group(struct exofs_io_state *ios, u64 length,
-			      struct _striping_info *si, unsigned first_comp)
+			      struct _striping_info *si)
 {
 	unsigned stripe_unit = ios->layout->stripe_unit;
 	unsigned mirrors_p1 = ios->layout->mirrors_p1;
 	unsigned devs_in_group = ios->layout->group_width * mirrors_p1;
 	unsigned dev = si->dev;
 	unsigned first_dev = dev - (dev % devs_in_group);
-	unsigned comp = first_comp + (dev - first_dev);
 	unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0;
 	unsigned cur_pg = ios->pages_consumed;
 	int ret = 0;
 
 	while (length) {
-		struct exofs_per_dev_state *per_dev = &ios->per_dev[comp];
+		struct exofs_per_dev_state *per_dev = &ios->per_dev[dev];
 		unsigned cur_len, page_off = 0;
 
 		if (!per_dev->length) {
@@ -424,11 +419,8 @@ static int _prepare_one_group(struct exofs_io_state *ios, u64 length,
 				cur_len = stripe_unit;
 			}
 
-			if (max_comp < comp)
-				max_comp = comp;
-
-			dev += mirrors_p1;
-			dev = (dev % devs_in_group) + first_dev;
+			if (max_comp < dev)
+				max_comp = dev;
 		} else {
 			cur_len = stripe_unit;
 		}
@@ -440,8 +432,8 @@ static int _prepare_one_group(struct exofs_io_state *ios, u64 length,
 		if (unlikely(ret))
 			goto out;
 
-		comp += mirrors_p1;
-		comp = (comp % devs_in_group) + first_comp;
+		dev += mirrors_p1;
+		dev = (dev % devs_in_group) + first_dev;
 
 		length -= cur_len;
 	}
@@ -454,18 +446,15 @@ out:
 static int _prepare_for_striping(struct exofs_io_state *ios)
 {
 	u64 length = ios->length;
+	u64 offset = ios->offset;
 	struct _striping_info si;
-	unsigned devs_in_group = ios->layout->group_width *
-				 ios->layout->mirrors_p1;
-	unsigned first_comp = 0;
 	int ret = 0;
 
-	_calc_stripe_info(ios, ios->offset, &si);
-
 	if (!ios->pages) {
 		if (ios->kern_buff) {
 			struct exofs_per_dev_state *per_dev = &ios->per_dev[0];
 
+			_calc_stripe_info(ios, ios->offset, &si);
 			per_dev->offset = si.obj_offset;
 			per_dev->dev = si.dev;
 
@@ -479,26 +468,17 @@ static int _prepare_for_striping(struct exofs_io_state *ios)
 	}
 
 	while (length) {
+		_calc_stripe_info(ios, offset, &si);
+
 		if (length < si.group_length)
 			si.group_length = length;
 
-		ret = _prepare_one_group(ios, si.group_length, &si, first_comp);
+		ret = _prepare_one_group(ios, si.group_length, &si);
 		if (unlikely(ret))
 			goto out;
 
+		offset += si.group_length;
 		length -= si.group_length;
-
-		si.group_length = si.total_group_length;
-		si.unit_off = 0;
-		++si.Major;
-		si.obj_offset = si.Major * ios->layout->stripe_unit *
-						ios->layout->group_depth;
-
-		si.dev = (si.dev - (si.dev % devs_in_group)) + devs_in_group;
-		si.dev %= ios->layout->s_numdevs;
-
-		first_comp += devs_in_group;
-		first_comp %= ios->layout->s_numdevs;
 	}
 
 out:
@@ -584,7 +564,7 @@ static int _sbi_write_mirror(struct exofs_io_state *ios, int cur_comp)
 						  master_dev->bio->bi_max_vecs);
 				if (unlikely(!bio)) {
 					EXOFS_DBGMSG(
-					      "Faild to allocate BIO size=%u\n",
+					      "Failed to allocate BIO size=%u\n",
 					      master_dev->bio->bi_max_vecs);
 					ret = -ENOMEM;
 					goto out;
@@ -599,7 +579,7 @@ static int _sbi_write_mirror(struct exofs_io_state *ios, int cur_comp)
 			} else {
 				bio = master_dev->bio;
 				/* FIXME: bio_set_dir() */
-				bio->bi_rw |= (1 << BIO_RW);
+				bio->bi_rw |= REQ_WRITE;
 			}
 
 			osd_req_write(or, &ios->obj, per_dev->offset, bio,
diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c
index b7dd0c23686..264e95d0283 100644
--- a/fs/exofs/namei.c
+++ b/fs/exofs/namei.c
@@ -153,7 +153,7 @@ static int exofs_link(struct dentry *old_dentry, struct inode *dir,
 
 	inode->i_ctime = CURRENT_TIME;
 	inode_inc_link_count(inode);
-	atomic_inc(&inode->i_count);
+	ihold(inode);
 
 	return exofs_add_nondir(dentry, inode);
 }
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 03149b9a517..79c3ae6e045 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -31,7 +31,6 @@
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 
-#include <linux/smp_lock.h>
 #include <linux/string.h>
 #include <linux/parser.h>
 #include <linux/vfs.h>
@@ -660,19 +659,19 @@ free_bdi:
 /*
  * Set up the superblock (calls exofs_fill_super eventually)
  */
-static int exofs_get_sb(struct file_system_type *type,
+static struct dentry *exofs_mount(struct file_system_type *type,
 			  int flags, const char *dev_name,
-			  void *data, struct vfsmount *mnt)
+			  void *data)
 {
 	struct exofs_mountopt opts;
 	int ret;
 
 	ret = parse_options(data, &opts);
 	if (ret)
-		return ret;
+		return ERR_PTR(ret);
 
 	opts.dev_name = dev_name;
-	return get_sb_nodev(type, flags, &opts, exofs_fill_super, mnt);
+	return mount_nodev(type, flags, &opts, exofs_fill_super);
 }
 
 /*
@@ -743,7 +742,7 @@ static const struct super_operations exofs_sops = {
 	.alloc_inode    = exofs_alloc_inode,
 	.destroy_inode  = exofs_destroy_inode,
 	.write_inode    = exofs_write_inode,
-	.delete_inode   = exofs_delete_inode,
+	.evict_inode    = exofs_evict_inode,
 	.put_super      = exofs_put_super,
 	.write_super    = exofs_write_super,
 	.sync_fs	= exofs_sync_fs,
@@ -810,7 +809,7 @@ static const struct export_operations exofs_export_ops = {
 static struct file_system_type exofs_type = {
 	.owner          = THIS_MODULE,
 	.name           = "exofs",
-	.get_sb         = exofs_get_sb,
+	.mount          = exofs_mount,
 	.kill_sb        = generic_shutdown_super,
 };
 
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index e9e175949a6..51b304056f1 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -74,21 +74,20 @@ static struct dentry *
 find_disconnected_root(struct dentry *dentry)
 {
 	dget(dentry);
-	spin_lock(&dentry->d_lock);
-	while (!IS_ROOT(dentry) &&
-	       (dentry->d_parent->d_flags & DCACHE_DISCONNECTED)) {
-		struct dentry *parent = dentry->d_parent;
-		dget(parent);
-		spin_unlock(&dentry->d_lock);
+	while (!IS_ROOT(dentry)) {
+		struct dentry *parent = dget_parent(dentry);
+
+		if (!(parent->d_flags & DCACHE_DISCONNECTED)) {
+			dput(parent);
+			break;
+		}
+
 		dput(dentry);
 		dentry = parent;
-		spin_lock(&dentry->d_lock);
 	}
-	spin_unlock(&dentry->d_lock);
 	return dentry;
 }
 
-
 /*
  * Make sure target_dir is fully connected to the dentry tree.
  *
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index e8766a39677..0d06f4e7569 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -571,7 +571,7 @@ do_more:
 error_return:
 	brelse(bitmap_bh);
 	release_blocks(sb, freed);
-	dquot_free_block(inode, freed);
+	dquot_free_block_nodirty(inode, freed);
 }
 
 /**
@@ -646,10 +646,9 @@ find_next_usable_block(int start, struct buffer_head *bh, int maxblocks)
 	return here;
 }
 
-/*
+/**
  * ext2_try_to_allocate()
  * @sb:			superblock
- * @handle:		handle to this transaction
  * @group:		given allocation block group
  * @bitmap_bh:		bufferhead holds the block bitmap
  * @grp_goal:		given target block within the group
@@ -1418,7 +1417,8 @@ allocated:
 
 	*errp = 0;
 	brelse(bitmap_bh);
-	dquot_free_block(inode, *count-num);
+	dquot_free_block_nodirty(inode, *count-num);
+	mark_inode_dirty(inode);
 	*count = num;
 	return ret_block;
 
@@ -1428,8 +1428,10 @@ out:
 	/*
 	 * Undo the block allocation
 	 */
-	if (!performed_allocation)
-		dquot_free_block(inode, *count);
+	if (!performed_allocation) {
+		dquot_free_block_nodirty(inode, *count);
+		mark_inode_dirty(inode);
+	}
 	brelse(bitmap_bh);
 	return 0;
 }
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 7516957273e..2709b34206a 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -98,7 +98,7 @@ static int ext2_commit_chunk(struct page *page, loff_t pos, unsigned len)
 	if (IS_DIRSYNC(dir)) {
 		err = write_one_page(page, 1);
 		if (!err)
-			err = ext2_sync_inode(dir);
+			err = sync_inode_metadata(dir, 1);
 	} else {
 		unlock_page(page);
 	}
@@ -448,6 +448,11 @@ ino_t ext2_inode_by_name(struct inode *dir, struct qstr *child)
 	return res;
 }
 
+static int ext2_prepare_chunk(struct page *page, loff_t pos, unsigned len)
+{
+	return __block_write_begin(page, pos, len, ext2_get_block);
+}
+
 /* Releases the page */
 void ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de,
 		   struct page *page, struct inode *inode, int update_times)
@@ -458,8 +463,7 @@ void ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de,
 	int err;
 
 	lock_page(page);
-	err = __ext2_write_begin(NULL, page->mapping, pos, len,
-				AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
+	err = ext2_prepare_chunk(page, pos, len);
 	BUG_ON(err);
 	de->inode = cpu_to_le32(inode->i_ino);
 	ext2_set_de_type(de, inode);
@@ -542,8 +546,7 @@ int ext2_add_link (struct dentry *dentry, struct inode *inode)
 got_it:
 	pos = page_offset(page) +
 		(char*)de - (char*)page_address(page);
-	err = __ext2_write_begin(NULL, page->mapping, pos, rec_len, 0,
-							&page, NULL);
+	err = ext2_prepare_chunk(page, pos, rec_len);
 	if (err)
 		goto out_unlock;
 	if (de->inode) {
@@ -576,8 +579,7 @@ out_unlock:
  */
 int ext2_delete_entry (struct ext2_dir_entry_2 * dir, struct page * page )
 {
-	struct address_space *mapping = page->mapping;
-	struct inode *inode = mapping->host;
+	struct inode *inode = page->mapping->host;
 	char *kaddr = page_address(page);
 	unsigned from = ((char*)dir - kaddr) & ~(ext2_chunk_size(inode)-1);
 	unsigned to = ((char *)dir - kaddr) +
@@ -601,8 +603,7 @@ int ext2_delete_entry (struct ext2_dir_entry_2 * dir, struct page * page )
 		from = (char*)pde - (char*)page_address(page);
 	pos = page_offset(page) + from;
 	lock_page(page);
-	err = __ext2_write_begin(NULL, page->mapping, pos, to - from, 0,
-							&page, NULL);
+	err = ext2_prepare_chunk(page, pos, to - from);
 	BUG_ON(err);
 	if (pde)
 		pde->rec_len = ext2_rec_len_to_disk(to - from);
@@ -621,8 +622,7 @@ out:
  */
 int ext2_make_empty(struct inode *inode, struct inode *parent)
 {
-	struct address_space *mapping = inode->i_mapping;
-	struct page *page = grab_cache_page(mapping, 0);
+	struct page *page = grab_cache_page(inode->i_mapping, 0);
 	unsigned chunk_size = ext2_chunk_size(inode);
 	struct ext2_dir_entry_2 * de;
 	int err;
@@ -631,8 +631,7 @@ int ext2_make_empty(struct inode *inode, struct inode *parent)
 	if (!page)
 		return -ENOMEM;
 
-	err = __ext2_write_begin(NULL, page->mapping, 0, chunk_size, 0,
-							&page, NULL);
+	err = ext2_prepare_chunk(page, 0, chunk_size);
 	if (err) {
 		unlock_page(page);
 		goto fail;
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 52b34f1d273..6346a2acf32 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -119,17 +119,13 @@ extern unsigned long ext2_count_free (struct buffer_head *, unsigned);
 /* inode.c */
 extern struct inode *ext2_iget (struct super_block *, unsigned long);
 extern int ext2_write_inode (struct inode *, struct writeback_control *);
-extern void ext2_delete_inode (struct inode *);
-extern int ext2_sync_inode (struct inode *);
+extern void ext2_evict_inode(struct inode *);
 extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int);
 extern int ext2_setattr (struct dentry *, struct iattr *);
 extern void ext2_set_inode_flags(struct inode *inode);
 extern void ext2_get_inode_flags(struct ext2_inode_info *);
 extern int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		       u64 start, u64 len);
-int __ext2_write_begin(struct file *file, struct address_space *mapping,
-		loff_t pos, unsigned len, unsigned flags,
-		struct page **pagep, void **fsdata);
 
 /* ioctl.c */
 extern long ext2_ioctl(struct file *, unsigned int, unsigned long);
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index 938dbc739d0..ad70479aabf 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -118,19 +118,14 @@ void ext2_free_inode (struct inode * inode)
 	 * Note: we must free any quota before locking the superblock,
 	 * as writing the quota to disk may need the lock as well.
 	 */
-	if (!is_bad_inode(inode)) {
-		/* Quota is already initialized in iput() */
-		ext2_xattr_delete_inode(inode);
-		dquot_free_inode(inode);
-		dquot_drop(inode);
-	}
+	/* Quota is already initialized in iput() */
+	ext2_xattr_delete_inode(inode);
+	dquot_free_inode(inode);
+	dquot_drop(inode);
 
 	es = EXT2_SB(sb)->s_es;
 	is_directory = S_ISDIR(inode->i_mode);
 
-	/* Do this BEFORE marking the inode not in use or returning an error */
-	clear_inode (inode);
-
 	if (ino < EXT2_FIRST_INO(sb) ||
 	    ino > le32_to_cpu(es->s_inodes_count)) {
 		ext2_error (sb, "ext2_free_inode",
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 3675088cb88..40ad210a504 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -69,26 +69,42 @@ static void ext2_write_failed(struct address_space *mapping, loff_t to)
 /*
  * Called at the last iput() if i_nlink is zero.
  */
-void ext2_delete_inode (struct inode * inode)
+void ext2_evict_inode(struct inode * inode)
 {
-	if (!is_bad_inode(inode))
+	struct ext2_block_alloc_info *rsv;
+	int want_delete = 0;
+
+	if (!inode->i_nlink && !is_bad_inode(inode)) {
+		want_delete = 1;
 		dquot_initialize(inode);
+	} else {
+		dquot_drop(inode);
+	}
+
 	truncate_inode_pages(&inode->i_data, 0);
 
-	if (is_bad_inode(inode))
-		goto no_delete;
-	EXT2_I(inode)->i_dtime	= get_seconds();
-	mark_inode_dirty(inode);
-	__ext2_write_inode(inode, inode_needs_sync(inode));
+	if (want_delete) {
+		/* set dtime */
+		EXT2_I(inode)->i_dtime	= get_seconds();
+		mark_inode_dirty(inode);
+		__ext2_write_inode(inode, inode_needs_sync(inode));
+		/* truncate to 0 */
+		inode->i_size = 0;
+		if (inode->i_blocks)
+			ext2_truncate_blocks(inode, 0);
+	}
 
-	inode->i_size = 0;
-	if (inode->i_blocks)
-		ext2_truncate_blocks(inode, 0);
-	ext2_free_inode (inode);
+	invalidate_inode_buffers(inode);
+	end_writeback(inode);
 
-	return;
-no_delete:
-	clear_inode(inode);	/* We must guarantee clearing of inode... */
+	ext2_discard_reservation(inode);
+	rsv = EXT2_I(inode)->i_block_alloc_info;
+	EXT2_I(inode)->i_block_alloc_info = NULL;
+	if (unlikely(rsv))
+		kfree(rsv);
+
+	if (want_delete)
+		ext2_free_inode(inode);
 }
 
 typedef struct {
@@ -423,6 +439,8 @@ static int ext2_alloc_blocks(struct inode *inode,
 failed_out:
 	for (i = 0; i <index; i++)
 		ext2_free_blocks(inode, new_blocks[i], 1);
+	if (index)
+		mark_inode_dirty(inode);
 	return ret;
 }
 
@@ -440,7 +458,7 @@ failed_out:
  *	the same format as ext2_get_branch() would do. We are calling it after
  *	we had read the existing part of chain and partial points to the last
  *	triple of that (one with zero ->key). Upon the exit we have the same
- *	picture as after the successful ext2_get_block(), excpet that in one
+ *	picture as after the successful ext2_get_block(), except that in one
  *	place chain is disconnected - *branch->p is still zero (we did not
  *	set the last link), but branch->key contains the number that should
  *	be placed into *branch->p to fill that gap.
@@ -644,7 +662,7 @@ static int ext2_get_blocks(struct inode *inode,
 	mutex_lock(&ei->truncate_mutex);
 	/*
 	 * If the indirect block is missing while we are reading
-	 * the chain(ext3_get_branch() returns -EAGAIN err), or
+	 * the chain(ext2_get_branch() returns -EAGAIN err), or
 	 * if the chain has been changed after we grab the semaphore,
 	 * (either because another process truncated this branch, or
 	 * another get_block allocated this branch) re-grab the chain to see if
@@ -765,14 +783,6 @@ ext2_readpages(struct file *file, struct address_space *mapping,
 	return mpage_readpages(mapping, pages, nr_pages, ext2_get_block);
 }
 
-int __ext2_write_begin(struct file *file, struct address_space *mapping,
-		loff_t pos, unsigned len, unsigned flags,
-		struct page **pagep, void **fsdata)
-{
-	return block_write_begin_newtrunc(file, mapping, pos, len, flags,
-					pagep, fsdata, ext2_get_block);
-}
-
 static int
 ext2_write_begin(struct file *file, struct address_space *mapping,
 		loff_t pos, unsigned len, unsigned flags,
@@ -780,8 +790,8 @@ ext2_write_begin(struct file *file, struct address_space *mapping,
 {
 	int ret;
 
-	*pagep = NULL;
-	ret = __ext2_write_begin(file, mapping, pos, len, flags, pagep, fsdata);
+	ret = block_write_begin(mapping, pos, len, flags, pagep,
+				ext2_get_block);
 	if (ret < 0)
 		ext2_write_failed(mapping, pos + len);
 	return ret;
@@ -806,13 +816,8 @@ ext2_nobh_write_begin(struct file *file, struct address_space *mapping,
 {
 	int ret;
 
-	/*
-	 * Dir-in-pagecache still uses ext2_write_begin. Would have to rework
-	 * directory handling code to pass around offsets rather than struct
-	 * pages in order to make this work easily.
-	 */
-	ret = nobh_write_begin_newtrunc(file, mapping, pos, len, flags, pagep,
-						fsdata, ext2_get_block);
+	ret = nobh_write_begin(mapping, pos, len, flags, pagep, fsdata,
+			       ext2_get_block);
 	if (ret < 0)
 		ext2_write_failed(mapping, pos + len);
 	return ret;
@@ -838,7 +843,7 @@ ext2_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 	struct inode *inode = mapping->host;
 	ssize_t ret;
 
-	ret = blockdev_direct_IO_newtrunc(rw, iocb, inode, inode->i_sb->s_bdev,
+	ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
 				iov, offset, nr_segs, ext2_get_block, NULL);
 	if (ret < 0 && (rw & WRITE))
 		ext2_write_failed(mapping, offset + iov_length(iov, nr_segs));
@@ -1006,8 +1011,8 @@ static inline void ext2_free_data(struct inode *inode, __le32 *p, __le32 *q)
 			else if (block_to_free == nr - count)
 				count++;
 			else {
-				mark_inode_dirty(inode);
 				ext2_free_blocks (inode, block_to_free, count);
+				mark_inode_dirty(inode);
 			free_this:
 				block_to_free = nr;
 				count = 1;
@@ -1015,8 +1020,8 @@ static inline void ext2_free_data(struct inode *inode, __le32 *p, __le32 *q)
 		}
 	}
 	if (count > 0) {
-		mark_inode_dirty(inode);
 		ext2_free_blocks (inode, block_to_free, count);
+		mark_inode_dirty(inode);
 	}
 }
 
@@ -1169,15 +1174,10 @@ static void ext2_truncate_blocks(struct inode *inode, loff_t offset)
 	__ext2_truncate_blocks(inode, offset);
 }
 
-int ext2_setsize(struct inode *inode, loff_t newsize)
+static int ext2_setsize(struct inode *inode, loff_t newsize)
 {
-	loff_t oldsize;
 	int error;
 
-	error = inode_newsize_ok(inode, newsize);
-	if (error)
-		return error;
-
 	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
 	    S_ISLNK(inode->i_mode)))
 		return -EINVAL;
@@ -1197,16 +1197,13 @@ int ext2_setsize(struct inode *inode, loff_t newsize)
 	if (error)
 		return error;
 
-	oldsize = inode->i_size;
-	i_size_write(inode, newsize);
-	truncate_pagecache(inode, oldsize, newsize);
-
+	truncate_setsize(inode, newsize);
 	__ext2_truncate_blocks(inode, newsize);
 
 	inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
 	if (inode_needs_sync(inode)) {
 		sync_mapping_buffers(inode->i_mapping);
-		ext2_sync_inode (inode);
+		sync_inode_metadata(inode, 1);
 	} else {
 		mark_inode_dirty(inode);
 	}
@@ -1526,15 +1523,6 @@ int ext2_write_inode(struct inode *inode, struct writeback_control *wbc)
 	return __ext2_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
 }
 
-int ext2_sync_inode(struct inode *inode)
-{
-	struct writeback_control wbc = {
-		.sync_mode = WB_SYNC_ALL,
-		.nr_to_write = 0,	/* sys_fsync did this */
-	};
-	return sync_inode(inode, &wbc);
-}
-
 int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
 {
 	struct inode *inode = dentry->d_inode;
@@ -1557,7 +1545,7 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
 		if (error)
 			return error;
 	}
-	generic_setattr(inode, iattr);
+	setattr_copy(inode, iattr);
 	if (iattr->ia_valid & ATTR_MODE)
 		error = ext2_acl_chmod(inode);
 	mark_inode_dirty(inode);
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 71efb0e9a3f..f8aecd2e329 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -206,7 +206,7 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir,
 
 	inode->i_ctime = CURRENT_TIME_SEC;
 	inode_inc_link_count(inode);
-	atomic_inc(&inode->i_count);
+	ihold(inode);
 
 	err = ext2_add_link(dentry, inode);
 	if (!err) {
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 7ff43f4a59c..d89e0b6a2d7 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -195,17 +195,6 @@ static void destroy_inodecache(void)
 	kmem_cache_destroy(ext2_inode_cachep);
 }
 
-static void ext2_clear_inode(struct inode *inode)
-{
-	struct ext2_block_alloc_info *rsv = EXT2_I(inode)->i_block_alloc_info;
-
-	dquot_drop(inode);
-	ext2_discard_reservation(inode);
-	EXT2_I(inode)->i_block_alloc_info = NULL;
-	if (unlikely(rsv))
-		kfree(rsv);
-}
-
 static int ext2_show_options(struct seq_file *seq, struct vfsmount *vfs)
 {
 	struct super_block *sb = vfs->mnt_sb;
@@ -299,13 +288,12 @@ static const struct super_operations ext2_sops = {
 	.alloc_inode	= ext2_alloc_inode,
 	.destroy_inode	= ext2_destroy_inode,
 	.write_inode	= ext2_write_inode,
-	.delete_inode	= ext2_delete_inode,
+	.evict_inode	= ext2_evict_inode,
 	.put_super	= ext2_put_super,
 	.write_super	= ext2_write_super,
 	.sync_fs	= ext2_sync_fs,
 	.statfs		= ext2_statfs,
 	.remount_fs	= ext2_remount,
-	.clear_inode	= ext2_clear_inode,
 	.show_options	= ext2_show_options,
 #ifdef CONFIG_QUOTA
 	.quota_read	= ext2_quota_read,
@@ -759,15 +747,16 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 	__le32 features;
 	int err;
 
+	err = -ENOMEM;
 	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
 	if (!sbi)
-		return -ENOMEM;
+		goto failed_unlock;
 
 	sbi->s_blockgroup_lock =
 		kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
 	if (!sbi->s_blockgroup_lock) {
 		kfree(sbi);
-		return -ENOMEM;
+		goto failed_unlock;
 	}
 	sb->s_fs_info = sbi;
 	sbi->s_sb_block = sb_block;
@@ -1119,6 +1108,7 @@ failed_sbi:
 	sb->s_fs_info = NULL;
 	kfree(sbi->s_blockgroup_lock);
 	kfree(sbi);
+failed_unlock:
 	return ret;
 }
 
@@ -1231,9 +1221,7 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
 	}
 
 	es = sbi->s_es;
-	if (((sbi->s_mount_opt & EXT2_MOUNT_XIP) !=
-	    (old_mount_opt & EXT2_MOUNT_XIP)) &&
-	    invalidate_inodes(sb)) {
+	if ((sbi->s_mount_opt ^ old_mount_opt) & EXT2_MOUNT_XIP) {
 		ext2_msg(sb, KERN_WARNING, "warning: refusing change of "
 			 "xip flag with busy inodes while remounting");
 		sbi->s_mount_opt &= ~EXT2_MOUNT_XIP;
@@ -1368,10 +1356,10 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf)
 	return 0;
 }
 
-static int ext2_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *ext2_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, ext2_fill_super, mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, ext2_fill_super);
 }
 
 #ifdef CONFIG_QUOTA
@@ -1485,7 +1473,7 @@ out:
 static struct file_system_type ext2_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "ext2",
-	.get_sb		= ext2_get_sb,
+	.mount		= ext2_mount,
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 7c3915780b1..f84700be327 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -674,6 +674,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
 			new_bh = sb_getblk(sb, block);
 			if (!new_bh) {
 				ext2_free_blocks(inode, block, 1);
+				mark_inode_dirty(inode);
 				error = -EIO;
 				goto cleanup;
 			}
@@ -698,13 +699,15 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
 	EXT2_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;
 	inode->i_ctime = CURRENT_TIME_SEC;
 	if (IS_SYNC(inode)) {
-		error = ext2_sync_inode (inode);
+		error = sync_inode_metadata(inode, 1);
 		/* In case sync failed due to ENOSPC the inode was actually
 		 * written (only some dirty data were not) so we just proceed
 		 * as if nothing happened and cleanup the unused block */
 		if (error && error != -ENOSPC) {
-			if (new_bh && new_bh != old_bh)
-				dquot_free_block(inode, 1);
+			if (new_bh && new_bh != old_bh) {
+				dquot_free_block_nodirty(inode, 1);
+				mark_inode_dirty(inode);
+			}
 			goto cleanup;
 		}
 	} else
@@ -727,6 +730,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
 				mb_cache_entry_free(ce);
 			ea_bdebug(old_bh, "freeing");
 			ext2_free_blocks(inode, old_bh->b_blocknr, 1);
+			mark_inode_dirty(inode);
 			/* We let our caller release old_bh, so we
 			 * need to duplicate the buffer before. */
 			get_bh(old_bh);
@@ -736,7 +740,8 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
 			le32_add_cpu(&HDR(old_bh)->h_refcount, -1);
 			if (ce)
 				mb_cache_entry_release(ce);
-			dquot_free_block(inode, 1);
+			dquot_free_block_nodirty(inode, 1);
+			mark_inode_dirty(inode);
 			mark_buffer_dirty(old_bh);
 			ea_bdebug(old_bh, "refcount now=%d",
 				le32_to_cpu(HDR(old_bh)->h_refcount));
@@ -799,7 +804,7 @@ ext2_xattr_delete_inode(struct inode *inode)
 		mark_buffer_dirty(bh);
 		if (IS_SYNC(inode))
 			sync_dirty_buffer(bh);
-		dquot_free_block(inode, 1);
+		dquot_free_block_nodirty(inode, 1);
 	}
 	EXT2_I(inode)->i_file_acl = 0;
 
@@ -838,7 +843,7 @@ ext2_xattr_cache_insert(struct buffer_head *bh)
 	ce = mb_cache_entry_alloc(ext2_xattr_cache, GFP_NOFS);
 	if (!ce)
 		return -ENOMEM;
-	error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, &hash);
+	error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash);
 	if (error) {
 		mb_cache_entry_free(ce);
 		if (error == -EBUSY) {
@@ -912,8 +917,8 @@ ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header)
 		return NULL;  /* never share */
 	ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
 again:
-	ce = mb_cache_entry_find_first(ext2_xattr_cache, 0,
-				       inode->i_sb->s_bdev, hash);
+	ce = mb_cache_entry_find_first(ext2_xattr_cache, inode->i_sb->s_bdev,
+				       hash);
 	while (ce) {
 		struct buffer_head *bh;
 
@@ -945,7 +950,7 @@ again:
 			unlock_buffer(bh);
 			brelse(bh);
 		}
-		ce = mb_cache_entry_find_next(ce, 0, inode->i_sb->s_bdev, hash);
+		ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash);
 	}
 	return NULL;
 }
@@ -1021,9 +1026,7 @@ static void ext2_xattr_rehash(struct ext2_xattr_header *header,
 int __init
 init_ext2_xattr(void)
 {
-	ext2_xattr_cache = mb_cache_create("ext2_xattr", NULL,
-		sizeof(struct mb_cache_entry) +
-		sizeof(((struct mb_cache_entry *) 0)->e_indexes[0]), 1, 6);
+	ext2_xattr_cache = mb_cache_create("ext2_xattr", 6);
 	if (!ext2_xattr_cache)
 		return -ENOMEM;
 	return 0;
diff --git a/fs/ext3/Kconfig b/fs/ext3/Kconfig
index 522b15498f4..e8c6ba0e4a3 100644
--- a/fs/ext3/Kconfig
+++ b/fs/ext3/Kconfig
@@ -31,6 +31,7 @@ config EXT3_FS
 config EXT3_DEFAULTS_TO_ORDERED
 	bool "Default to 'data=ordered' in ext3"
 	depends on EXT3_FS
+	default y
 	help
 	  The journal mode options for ext3 have different tradeoffs
 	  between when data is guaranteed to be on disk and
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 4a32511f4de..b3db2264942 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -792,9 +792,9 @@ find_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh,
 	if (here < 0)
 		here = 0;
 
-	p = ((char *)bh->b_data) + (here >> 3);
+	p = bh->b_data + (here >> 3);
 	r = memscan(p, 0, ((maxblocks + 7) >> 3) - (here >> 3));
-	next = (r - ((char *)bh->b_data)) << 3;
+	next = (r - bh->b_data) << 3;
 
 	if (next < maxblocks && next >= start && ext3_test_allocatable(next, bh))
 		return next;
@@ -810,8 +810,9 @@ find_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh,
 
 /**
  * claim_block()
+ * @lock:		the spin lock for this block group
  * @block:		the free block (group relative) to allocate
- * @bh:			the bufferhead containts the block group bitmap
+ * @bh:			the buffer_head contains the block group bitmap
  *
  * We think we can allocate this block in this bitmap.  Try to set the bit.
  * If that succeeds then check that nobody has allocated and then freed the
@@ -956,9 +957,11 @@ fail_access:
  *		but we will shift to the place where start_block is,
  *		then start from there, when looking for a reservable space.
  *
- *	@size: the target new reservation window size
+ *	@my_rsv: the reservation window
  *
- *	@group_first_block: the first block we consider to start
+ *	@sb: the super block
+ *
+ *	@start_block: the first block we consider to start
  *			the real search from
  *
  *	@last_block:
@@ -1084,7 +1087,7 @@ static int find_next_reservable_window(
  *
  *	failed: we failed to find a reservation window in this group
  *
- *	@rsv: the reservation
+ *	@my_rsv: the reservation window
  *
  *	@grp_goal: The goal (group-relative).  It is where the search for a
  *		free reservable space should start from.
@@ -1273,8 +1276,8 @@ static void try_to_extend_reservation(struct ext3_reserve_window_node *my_rsv,
  * @group:		given allocation block group
  * @bitmap_bh:		bufferhead holds the block bitmap
  * @grp_goal:		given target block within the group
- * @count:		target number of blocks to allocate
  * @my_rsv:		reservation window
+ * @count:		target number of blocks to allocate
  * @errp:		pointer to store the error code
  *
  * This is the main function used to allocate a new block and its reservation
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
index d7e9f74dc3a..09b13bb34c9 100644
--- a/fs/ext3/fsync.c
+++ b/fs/ext3/fsync.c
@@ -90,7 +90,6 @@ int ext3_sync_file(struct file *file, int datasync)
 	 * storage
 	 */
 	if (needs_barrier)
-		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
-				BLKDEV_IFL_WAIT);
+		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
 	return ret;
 }
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 498021eb88f..9724aef2246 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -119,20 +119,8 @@ void ext3_free_inode (handle_t *handle, struct inode * inode)
 	ino = inode->i_ino;
 	ext3_debug ("freeing inode %lu\n", ino);
 
-	/*
-	 * Note: we must free any quota before locking the superblock,
-	 * as writing the quota to disk may need the lock as well.
-	 */
-	dquot_initialize(inode);
-	ext3_xattr_delete_inode(handle, inode);
-	dquot_free_inode(inode);
-	dquot_drop(inode);
-
 	is_directory = S_ISDIR(inode->i_mode);
 
-	/* Do this BEFORE marking the inode not in use or returning an error */
-	clear_inode (inode);
-
 	es = EXT3_SB(sb)->s_es;
 	if (ino < EXT3_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
 		ext3_error (sb, "ext3_free_inode",
@@ -582,9 +570,14 @@ got:
 	ei->i_state_flags = 0;
 	ext3_set_inode_state(inode, EXT3_STATE_NEW);
 
-	ei->i_extra_isize =
-		(EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) ?
-		sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE : 0;
+	/* See comment in ext3_iget for explanation */
+	if (ino >= EXT3_FIRST_INO(sb) + 1 &&
+	    EXT3_INODE_SIZE(sb) > EXT3_GOOD_OLD_INODE_SIZE) {
+		ei->i_extra_isize =
+			sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE;
+	} else {
+		ei->i_extra_isize = 0;
+	}
 
 	ret = inode;
 	dquot_initialize(inode);
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 735f0190ec2..a9580617edd 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -190,18 +190,28 @@ static int truncate_restart_transaction(handle_t *handle, struct inode *inode)
 }
 
 /*
- * Called at the last iput() if i_nlink is zero.
+ * Called at inode eviction from icache
  */
-void ext3_delete_inode (struct inode * inode)
+void ext3_evict_inode (struct inode *inode)
 {
+	struct ext3_block_alloc_info *rsv;
 	handle_t *handle;
+	int want_delete = 0;
 
-	if (!is_bad_inode(inode))
+	if (!inode->i_nlink && !is_bad_inode(inode)) {
 		dquot_initialize(inode);
+		want_delete = 1;
+	}
 
 	truncate_inode_pages(&inode->i_data, 0);
 
-	if (is_bad_inode(inode))
+	ext3_discard_reservation(inode);
+	rsv = EXT3_I(inode)->i_block_alloc_info;
+	EXT3_I(inode)->i_block_alloc_info = NULL;
+	if (unlikely(rsv))
+		kfree(rsv);
+
+	if (!want_delete)
 		goto no_delete;
 
 	handle = start_transaction(inode);
@@ -238,15 +248,22 @@ void ext3_delete_inode (struct inode * inode)
 	 * having errors), but we can't free the inode if the mark_dirty
 	 * fails.
 	 */
-	if (ext3_mark_inode_dirty(handle, inode))
-		/* If that failed, just do the required in-core inode clear. */
-		clear_inode(inode);
-	else
+	if (ext3_mark_inode_dirty(handle, inode)) {
+		/* If that failed, just dquot_drop() and be done with that */
+		dquot_drop(inode);
+		end_writeback(inode);
+	} else {
+		ext3_xattr_delete_inode(handle, inode);
+		dquot_free_inode(inode);
+		dquot_drop(inode);
+		end_writeback(inode);
 		ext3_free_inode(handle, inode);
+	}
 	ext3_journal_stop(handle);
 	return;
 no_delete:
-	clear_inode(inode);	/* We must guarantee clearing of inode... */
+	end_writeback(inode);
+	dquot_drop(inode);
 }
 
 typedef struct {
@@ -481,7 +498,7 @@ static ext3_fsblk_t ext3_find_goal(struct inode *inode, long block,
 }
 
 /**
- *	ext3_blks_to_allocate: Look up the block map and count the number
+ *	ext3_blks_to_allocate - Look up the block map and count the number
  *	of direct blocks need to be allocated for the given branch.
  *
  *	@branch: chain of indirect blocks
@@ -519,14 +536,18 @@ static int ext3_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
 }
 
 /**
- *	ext3_alloc_blocks: multiple allocate blocks needed for a branch
+ *	ext3_alloc_blocks - multiple allocate blocks needed for a branch
+ *	@handle: handle for this transaction
+ *	@inode: owner
+ *	@goal: preferred place for allocation
  *	@indirect_blks: the number of blocks need to allocate for indirect
  *			blocks
- *
+ *	@blks:	number of blocks need to allocated for direct blocks
  *	@new_blocks: on return it will store the new block numbers for
  *	the indirect blocks(if needed) and the first direct block,
- *	@blks:	on return it will store the total number of allocated
- *		direct blocks
+ *	@err: here we store the error value
+ *
+ *	return the number of direct blocks allocated
  */
 static int ext3_alloc_blocks(handle_t *handle, struct inode *inode,
 			ext3_fsblk_t goal, int indirect_blks, int blks,
@@ -581,9 +602,11 @@ failed_out:
 
 /**
  *	ext3_alloc_branch - allocate and set up a chain of blocks.
+ *	@handle: handle for this transaction
  *	@inode: owner
  *	@indirect_blks: number of allocated indirect blocks
  *	@blks: number of allocated direct blocks
+ *	@goal: preferred place for allocation
  *	@offsets: offsets (in the blocks) to store the pointers to next.
  *	@branch: place to store the chain in.
  *
@@ -683,10 +706,9 @@ failed:
 
 /**
  * ext3_splice_branch - splice the allocated branch onto inode.
+ * @handle: handle for this transaction
  * @inode: owner
  * @block: (logical) number of block we are adding
- * @chain: chain of indirect blocks (with a missing link - see
- *	ext3_alloc_branch)
  * @where: location of missing link
  * @num:   number of indirect blocks we are adding
  * @blks:  number of direct blocks we are adding
@@ -1149,9 +1171,25 @@ static int walk_page_buffers(	handle_t *handle,
 static int do_journal_get_write_access(handle_t *handle,
 					struct buffer_head *bh)
 {
+	int dirty = buffer_dirty(bh);
+	int ret;
+
 	if (!buffer_mapped(bh) || buffer_freed(bh))
 		return 0;
-	return ext3_journal_get_write_access(handle, bh);
+	/*
+	 * __block_prepare_write() could have dirtied some buffers. Clean
+	 * the dirty bit as jbd2_journal_get_write_access() could complain
+	 * otherwise about fs integrity issues. Setting of the dirty bit
+	 * by __block_prepare_write() isn't a real problem here as we clear
+	 * the bit before releasing a page lock and thus writeback cannot
+	 * ever write the buffer.
+	 */
+	if (dirty)
+		clear_buffer_dirty(bh);
+	ret = ext3_journal_get_write_access(handle, bh);
+	if (!ret && dirty)
+		ret = ext3_journal_dirty_metadata(handle, bh);
+	return ret;
 }
 
 /*
@@ -1196,8 +1234,7 @@ retry:
 		ret = PTR_ERR(handle);
 		goto out;
 	}
-	ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
-							ext3_get_block);
+	ret = __block_write_begin(page, pos, len, ext3_get_block);
 	if (ret)
 		goto write_begin_failed;
 
@@ -1625,10 +1662,7 @@ static int ext3_writeback_writepage(struct page *page,
 		goto out_fail;
 	}
 
-	if (test_opt(inode->i_sb, NOBH) && ext3_should_writeback_data(inode))
-		ret = nobh_writepage(page, ext3_get_block, wbc);
-	else
-		ret = block_write_full_page(page, ext3_get_block, wbc);
+	ret = block_write_full_page(page, ext3_get_block, wbc);
 
 	err = ext3_journal_stop(handle);
 	if (!ret)
@@ -1667,8 +1701,8 @@ static int ext3_journalled_writepage(struct page *page,
 		 * doesn't seem much point in redirtying the page here.
 		 */
 		ClearPageChecked(page);
-		ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
-					ext3_get_block);
+		ret = __block_write_begin(page, 0, PAGE_CACHE_SIZE,
+					  ext3_get_block);
 		if (ret != 0) {
 			ext3_journal_stop(handle);
 			goto out_unlock;
@@ -1785,6 +1819,17 @@ retry:
 	ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
 				 offset, nr_segs,
 				 ext3_get_block, NULL);
+	/*
+	 * In case of error extending write may have instantiated a few
+	 * blocks outside i_size. Trim these off again.
+	 */
+	if (unlikely((rw & WRITE) && ret < 0)) {
+		loff_t isize = i_size_read(inode);
+		loff_t end = offset + iov_length(iov, nr_segs);
+
+		if (end > isize)
+			vmtruncate(inode, isize);
+	}
 	if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
 		goto retry;
 
@@ -1922,17 +1967,6 @@ static int ext3_block_truncate_page(handle_t *handle, struct page *page,
 	length = blocksize - (offset & (blocksize - 1));
 	iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
 
-	/*
-	 * For "nobh" option,  we can only work if we don't need to
-	 * read-in the page - otherwise we create buffers to do the IO.
-	 */
-	if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) &&
-	     ext3_should_writeback_data(inode) && PageUptodate(page)) {
-		zero_user(page, offset, length);
-		set_page_dirty(page);
-		goto unlock;
-	}
-
 	if (!page_has_buffers(page))
 		create_empty_buffers(page, blocksize, 0);
 
@@ -2284,27 +2318,6 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode,
 					   depth);
 
 			/*
-			 * We've probably journalled the indirect block several
-			 * times during the truncate.  But it's no longer
-			 * needed and we now drop it from the transaction via
-			 * journal_revoke().
-			 *
-			 * That's easy if it's exclusively part of this
-			 * transaction.  But if it's part of the committing
-			 * transaction then journal_forget() will simply
-			 * brelse() it.  That means that if the underlying
-			 * block is reallocated in ext3_get_block(),
-			 * unmap_underlying_metadata() will find this block
-			 * and will try to get rid of it.  damn, damn.
-			 *
-			 * If this block has already been committed to the
-			 * journal, a revoke record will be written.  And
-			 * revoke records must be emitted *before* clearing
-			 * this block's bit in the bitmaps.
-			 */
-			ext3_forget(handle, 1, inode, bh, bh->b_blocknr);
-
-			/*
 			 * Everything below this this pointer has been
 			 * released.  Now let this top-of-subtree go.
 			 *
@@ -2327,6 +2340,31 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode,
 				truncate_restart_transaction(handle, inode);
 			}
 
+			/*
+			 * We've probably journalled the indirect block several
+			 * times during the truncate.  But it's no longer
+			 * needed and we now drop it from the transaction via
+			 * journal_revoke().
+			 *
+			 * That's easy if it's exclusively part of this
+			 * transaction.  But if it's part of the committing
+			 * transaction then journal_forget() will simply
+			 * brelse() it.  That means that if the underlying
+			 * block is reallocated in ext3_get_block(),
+			 * unmap_underlying_metadata() will find this block
+			 * and will try to get rid of it.  damn, damn. Thus
+			 * we don't allow a block to be reallocated until
+			 * a transaction freeing it has fully committed.
+			 *
+			 * We also have to make sure journal replay after a
+			 * crash does not overwrite non-journaled data blocks
+			 * with old metadata when the block got reallocated for
+			 * data.  Thus we have to store a revoke record for a
+			 * block in the same transaction in which we free the
+			 * block.
+			 */
+			ext3_forget(handle, 1, inode, bh, bh->b_blocknr);
+
 			ext3_free_blocks(handle, inode, nr, 1);
 
 			if (parent_bh) {
@@ -2497,7 +2535,6 @@ void ext3_truncate(struct inode *inode)
 			 */
 		} else {
 			/* Shared branch grows from an indirect block */
-			BUFFER_TRACE(partial->bh, "get_write_access");
 			ext3_free_branches(handle, inode, partial->bh,
 					partial->p,
 					partial->p+1, (chain+n-1) - partial);
@@ -2554,7 +2591,7 @@ out_stop:
 	 * If this was a simple ftruncate(), and the file will remain alive
 	 * then we need to clear up the orphan record which we created above.
 	 * However, if this was a real unlink then we were called by
-	 * ext3_delete_inode(), and we allow that function to clean up the
+	 * ext3_evict_inode(), and we allow that function to clean up the
 	 * orphan info for us.
 	 */
 	if (inode->i_nlink)
@@ -3198,9 +3235,17 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr)
 		ext3_journal_stop(handle);
 	}
 
-	rc = inode_setattr(inode, attr);
+	if ((attr->ia_valid & ATTR_SIZE) &&
+	    attr->ia_size != i_size_read(inode)) {
+		rc = vmtruncate(inode, attr->ia_size);
+		if (rc)
+			goto err_out;
+	}
+
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
 
-	if (!rc && (ia_valid & ATTR_MODE))
+	if (ia_valid & ATTR_MODE)
 		rc = ext3_acl_chmod(inode);
 
 err_out:
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index ee184084ca4..bce9dce639b 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1447,7 +1447,6 @@ static int ext3_add_entry (handle_t *handle, struct dentry *dentry,
 	struct inode *inode)
 {
 	struct inode *dir = dentry->d_parent->d_inode;
-	unsigned long offset;
 	struct buffer_head * bh;
 	struct ext3_dir_entry_2 *de;
 	struct super_block * sb;
@@ -1469,7 +1468,7 @@ static int ext3_add_entry (handle_t *handle, struct dentry *dentry,
 		ext3_mark_inode_dirty(handle, dir);
 	}
 	blocks = dir->i_size >> sb->s_blocksize_bits;
-	for (block = 0, offset = 0; block < blocks; block++) {
+	for (block = 0; block < blocks; block++) {
 		bh = ext3_bread(handle, dir, block, 0, &retval);
 		if(!bh)
 			return retval;
@@ -2261,7 +2260,7 @@ retry:
 
 	inode->i_ctime = CURRENT_TIME_SEC;
 	inc_nlink(inode);
-	atomic_inc(&inode->i_count);
+	ihold(inode);
 
 	err = ext3_add_entry(handle, dentry, inode);
 	if (!err) {
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index 54351ac7cef..e746d30b123 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -964,7 +964,6 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
 		      ext3_fsblk_t n_blocks_count)
 {
 	ext3_fsblk_t o_blocks_count;
-	unsigned long o_groups_count;
 	ext3_grpblk_t last;
 	ext3_grpblk_t add;
 	struct buffer_head * bh;
@@ -976,10 +975,10 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
 	 * yet: we're going to revalidate es->s_blocks_count after
 	 * taking the s_resize_lock below. */
 	o_blocks_count = le32_to_cpu(es->s_blocks_count);
-	o_groups_count = EXT3_SB(sb)->s_groups_count;
 
 	if (test_opt(sb, DEBUG))
-		printk(KERN_DEBUG "EXT3-fs: extending last group from "E3FSBLK" uto "E3FSBLK" blocks\n",
+		printk(KERN_DEBUG "EXT3-fs: extending last group from "E3FSBLK
+		       " upto "E3FSBLK" blocks\n",
 		       o_blocks_count, n_blocks_count);
 
 	if (n_blocks_count == 0 || n_blocks_count == o_blocks_count)
@@ -987,7 +986,7 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
 
 	if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
 		printk(KERN_ERR "EXT3-fs: filesystem on %s:"
-			" too large to resize to %lu blocks safely\n",
+			" too large to resize to "E3FSBLK" blocks safely\n",
 			sb->s_id, n_blocks_count);
 		if (sizeof(sector_t) < 8)
 			ext3_warning(sb, __func__,
@@ -1067,11 +1066,11 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
 	es->s_blocks_count = cpu_to_le32(o_blocks_count + add);
 	ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
 	mutex_unlock(&EXT3_SB(sb)->s_resize_lock);
-	ext3_debug("freeing blocks %lu through "E3FSBLK"\n", o_blocks_count,
-		   o_blocks_count + add);
+	ext3_debug("freeing blocks "E3FSBLK" through "E3FSBLK"\n",
+		   o_blocks_count, o_blocks_count + add);
 	ext3_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks);
-	ext3_debug("freed blocks "E3FSBLK" through "E3FSBLK"\n", o_blocks_count,
-		   o_blocks_count + add);
+	ext3_debug("freed blocks "E3FSBLK" through "E3FSBLK"\n",
+		   o_blocks_count, o_blocks_count + add);
 	if ((err = ext3_journal_stop(handle)))
 		goto exit_put;
 	if (test_opt(sb, DEBUG))
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 6c953bb255e..2fedaf8b501 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -411,9 +411,6 @@ static void ext3_put_super (struct super_block * sb)
 	int i, err;
 
 	dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
-
-	lock_kernel();
-
 	ext3_xattr_put_super(sb);
 	err = journal_destroy(sbi->s_journal);
 	sbi->s_journal = NULL;
@@ -462,8 +459,6 @@ static void ext3_put_super (struct super_block * sb)
 	sb->s_fs_info = NULL;
 	kfree(sbi->s_blockgroup_lock);
 	kfree(sbi);
-
-	unlock_kernel();
 }
 
 static struct kmem_cache *ext3_inode_cachep;
@@ -527,17 +522,6 @@ static void destroy_inodecache(void)
 	kmem_cache_destroy(ext3_inode_cachep);
 }
 
-static void ext3_clear_inode(struct inode *inode)
-{
-	struct ext3_block_alloc_info *rsv = EXT3_I(inode)->i_block_alloc_info;
-
-	dquot_drop(inode);
-	ext3_discard_reservation(inode);
-	EXT3_I(inode)->i_block_alloc_info = NULL;
-	if (unlikely(rsv))
-		kfree(rsv);
-}
-
 static inline void ext3_show_quota_options(struct seq_file *seq, struct super_block *sb)
 {
 #if defined(CONFIG_QUOTA)
@@ -661,9 +645,6 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	 */
 	seq_puts(seq, ",barrier=");
 	seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0");
-	if (test_opt(sb, NOBH))
-		seq_puts(seq, ",nobh");
-
 	seq_printf(seq, ",data=%s", data_mode_string(test_opt(sb, DATA_FLAGS)));
 	if (test_opt(sb, DATA_ERR_ABORT))
 		seq_puts(seq, ",data_err=abort");
@@ -783,14 +764,13 @@ static const struct super_operations ext3_sops = {
 	.destroy_inode	= ext3_destroy_inode,
 	.write_inode	= ext3_write_inode,
 	.dirty_inode	= ext3_dirty_inode,
-	.delete_inode	= ext3_delete_inode,
+	.evict_inode	= ext3_evict_inode,
 	.put_super	= ext3_put_super,
 	.sync_fs	= ext3_sync_fs,
 	.freeze_fs	= ext3_freeze,
 	.unfreeze_fs	= ext3_unfreeze,
 	.statfs		= ext3_statfs,
 	.remount_fs	= ext3_remount,
-	.clear_inode	= ext3_clear_inode,
 	.show_options	= ext3_show_options,
 #ifdef CONFIG_QUOTA
 	.quota_read	= ext3_quota_read,
@@ -1255,10 +1235,12 @@ set_qf_format:
 			*n_blocks_count = option;
 			break;
 		case Opt_nobh:
-			set_opt(sbi->s_mount_opt, NOBH);
+			ext3_msg(sb, KERN_WARNING,
+				"warning: ignoring deprecated nobh option");
 			break;
 		case Opt_bh:
-			clear_opt(sbi->s_mount_opt, NOBH);
+			ext3_msg(sb, KERN_WARNING,
+				"warning: ignoring deprecated bh option");
 			break;
 		default:
 			ext3_msg(sb, KERN_ERR,
@@ -1319,9 +1301,9 @@ static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es,
 		ext3_msg(sb, KERN_WARNING,
 			"warning: mounting fs with errors, "
 			"running e2fsck is recommended");
-	else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 &&
+	else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 &&
 		 le16_to_cpu(es->s_mnt_count) >=
-		 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
+			le16_to_cpu(es->s_max_mnt_count))
 		ext3_msg(sb, KERN_WARNING,
 			"warning: maximal mount count reached, "
 			"running e2fsck is recommended");
@@ -1338,7 +1320,7 @@ static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es,
                    valid forever! :) */
 	es->s_state &= cpu_to_le16(~EXT3_VALID_FS);
 #endif
-	if (!(__s16) le16_to_cpu(es->s_max_mnt_count))
+	if (!le16_to_cpu(es->s_max_mnt_count))
 		es->s_max_mnt_count = cpu_to_le16(EXT3_DFL_MAX_MNT_COUNT);
 	le16_add_cpu(&es->s_mnt_count, 1);
 	es->s_mtime = cpu_to_le32(get_seconds());
@@ -1640,8 +1622,6 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 	sbi->s_resgid = EXT3_DEF_RESGID;
 	sbi->s_sb_block = sb_block;
 
-	unlock_kernel();
-
 	blocksize = sb_min_blocksize(sb, EXT3_MIN_BLOCK_SIZE);
 	if (!blocksize) {
 		ext3_msg(sb, KERN_ERR, "error: unable to set blocksize");
@@ -1667,7 +1647,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 	 * Note: s_es must be initialized as soon as possible because
 	 *       some ext3 macro-instructions depend on its value
 	 */
-	es = (struct ext3_super_block *) (((char *)bh->b_data) + offset);
+	es = (struct ext3_super_block *) (bh->b_data + offset);
 	sbi->s_es = es;
 	sb->s_magic = le16_to_cpu(es->s_magic);
 	if (sb->s_magic != EXT3_SUPER_MAGIC)
@@ -1778,7 +1758,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 			       "error: can't read superblock on 2nd try");
 			goto failed_mount;
 		}
-		es = (struct ext3_super_block *)(((char *)bh->b_data) + offset);
+		es = (struct ext3_super_block *)(bh->b_data + offset);
 		sbi->s_es = es;
 		if (es->s_magic != cpu_to_le16(EXT3_SUPER_MAGIC)) {
 			ext3_msg(sb, KERN_ERR,
@@ -1862,8 +1842,8 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 		goto failed_mount;
 	}
 
-	if (le32_to_cpu(es->s_blocks_count) >
-		    (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
+	if (generic_check_addressable(sb->s_blocksize_bits,
+				      le32_to_cpu(es->s_blocks_count))) {
 		ext3_msg(sb, KERN_ERR,
 			"error: filesystem is too large to mount safely");
 		if (sizeof(sector_t) < 8)
@@ -1877,13 +1857,13 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 	sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) -
 			       le32_to_cpu(es->s_first_data_block) - 1)
 				       / EXT3_BLOCKS_PER_GROUP(sb)) + 1;
-	db_count = (sbi->s_groups_count + EXT3_DESC_PER_BLOCK(sb) - 1) /
-		   EXT3_DESC_PER_BLOCK(sb);
+	db_count = DIV_ROUND_UP(sbi->s_groups_count, EXT3_DESC_PER_BLOCK(sb));
 	sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *),
 				    GFP_KERNEL);
 	if (sbi->s_group_desc == NULL) {
 		ext3_msg(sb, KERN_ERR,
 			"error: not enough memory");
+		ret = -ENOMEM;
 		goto failed_mount;
 	}
 
@@ -1971,6 +1951,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 	}
 	if (err) {
 		ext3_msg(sb, KERN_ERR, "error: insufficient memory");
+		ret = err;
 		goto failed_mount3;
 	}
 
@@ -2001,14 +1982,6 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 		break;
 	}
 
-	if (test_opt(sb, NOBH)) {
-		if (!(test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA)) {
-			ext3_msg(sb, KERN_WARNING,
-				"warning: ignoring nobh option - "
-				"it is supported only with writeback mode");
-			clear_opt(sbi->s_mount_opt, NOBH);
-		}
-	}
 	/*
 	 * The journal_load will have done any necessary log recovery,
 	 * so we can safely mount the rest of the filesystem now.
@@ -2046,7 +2019,6 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 		test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered":
 		"writeback");
 
-	lock_kernel();
 	return 0;
 
 cantfind_ext3:
@@ -2076,7 +2048,6 @@ out_fail:
 	sb->s_fs_info = NULL;
 	kfree(sbi->s_blockgroup_lock);
 	kfree(sbi);
-	lock_kernel();
 	return ret;
 }
 
@@ -2189,7 +2160,7 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb,
 		goto out_bdev;
 	}
 
-	es = (struct ext3_super_block *) (((char *)bh->b_data) + offset);
+	es = (struct ext3_super_block *) (bh->b_data + offset);
 	if ((le16_to_cpu(es->s_magic) != EXT3_SUPER_MAGIC) ||
 	    !(le32_to_cpu(es->s_feature_incompat) &
 	      EXT3_FEATURE_INCOMPAT_JOURNAL_DEV)) {
@@ -2382,6 +2353,21 @@ static int ext3_commit_super(struct super_block *sb,
 
 	if (!sbh)
 		return error;
+
+	if (buffer_write_io_error(sbh)) {
+		/*
+		 * Oh, dear.  A previous attempt to write the
+		 * superblock failed.  This could happen because the
+		 * USB device was yanked out.  Or it could happen to
+		 * be a transient write error and maybe the block will
+		 * be remapped.  Nothing we can do but to retry the
+		 * write and hope for the best.
+		 */
+		ext3_msg(sb, KERN_ERR, "previous I/O error to "
+		       "superblock detected");
+		clear_buffer_write_io_error(sbh);
+		set_buffer_uptodate(sbh);
+	}
 	/*
 	 * If the file system is mounted read-only, don't update the
 	 * superblock write time.  This avoids updating the superblock
@@ -2398,8 +2384,15 @@ static int ext3_commit_super(struct super_block *sb,
 	es->s_free_inodes_count = cpu_to_le32(ext3_count_free_inodes(sb));
 	BUFFER_TRACE(sbh, "marking dirty");
 	mark_buffer_dirty(sbh);
-	if (sync)
+	if (sync) {
 		error = sync_dirty_buffer(sbh);
+		if (buffer_write_io_error(sbh)) {
+			ext3_msg(sb, KERN_ERR, "I/O error while writing "
+			       "superblock");
+			clear_buffer_write_io_error(sbh);
+			set_buffer_uptodate(sbh);
+		}
+	}
 	return error;
 }
 
@@ -2559,8 +2552,6 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
 	int i;
 #endif
 
-	lock_kernel();
-
 	/* Store the original options */
 	lock_super(sb);
 	old_sb_flags = sb->s_flags;
@@ -2669,7 +2660,6 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
 			kfree(old_opts.s_qf_names[i]);
 #endif
 	unlock_super(sb);
-	unlock_kernel();
 
 	if (enable_quota)
 		dquot_resume(sb, -1);
@@ -2690,7 +2680,6 @@ restore_opts:
 	}
 #endif
 	unlock_super(sb);
-	unlock_kernel();
 	return err;
 }
 
@@ -3031,16 +3020,16 @@ out:
 
 #endif
 
-static int ext3_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *ext3_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, ext3_fill_super, mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, ext3_fill_super);
 }
 
 static struct file_system_type ext3_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "ext3",
-	.get_sb		= ext3_get_sb,
+	.mount		= ext3_mount,
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index 71fb8d65e54..e69dc6dfaa8 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -1139,7 +1139,7 @@ ext3_xattr_cache_insert(struct buffer_head *bh)
 		ea_bdebug(bh, "out of memory");
 		return;
 	}
-	error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, &hash);
+	error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash);
 	if (error) {
 		mb_cache_entry_free(ce);
 		if (error == -EBUSY) {
@@ -1211,8 +1211,8 @@ ext3_xattr_cache_find(struct inode *inode, struct ext3_xattr_header *header,
 		return NULL;  /* never share */
 	ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
 again:
-	ce = mb_cache_entry_find_first(ext3_xattr_cache, 0,
-				       inode->i_sb->s_bdev, hash);
+	ce = mb_cache_entry_find_first(ext3_xattr_cache, inode->i_sb->s_bdev,
+				       hash);
 	while (ce) {
 		struct buffer_head *bh;
 
@@ -1237,7 +1237,7 @@ again:
 			return bh;
 		}
 		brelse(bh);
-		ce = mb_cache_entry_find_next(ce, 0, inode->i_sb->s_bdev, hash);
+		ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash);
 	}
 	return NULL;
 }
@@ -1313,9 +1313,7 @@ static void ext3_xattr_rehash(struct ext3_xattr_header *header,
 int __init
 init_ext3_xattr(void)
 {
-	ext3_xattr_cache = mb_cache_create("ext3_xattr", NULL,
-		sizeof(struct mb_cache_entry) +
-		sizeof(((struct mb_cache_entry *) 0)->e_indexes[0]), 1, 6);
+	ext3_xattr_cache = mb_cache_create("ext3_xattr", 6);
 	if (!ext3_xattr_cache)
 		return -ENOMEM;
 	return 0;
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index 8867b2a1e5f..c947e36eda6 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -4,7 +4,7 @@
 
 obj-$(CONFIG_EXT4_FS) += ext4.o
 
-ext4-y	:= balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
+ext4-y	:= balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
 		ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
 		ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o
 
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index feaf498feaa..5e2ed4504ea 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -204,6 +204,7 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,
 				return error;
 			else {
 				inode->i_mode = mode;
+				inode->i_ctime = ext4_current_time(inode);
 				ext4_mark_inode_dirty(handle, inode);
 				if (error == 0)
 					acl = NULL;
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 95b7594c76f..14c3af26c67 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -171,7 +171,8 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
 		 * less than the blocksize * 8 ( which is the size
 		 * of bitmap ), set rest of the block bitmap to 1
 		 */
-		mark_bitmap_end(group_blocks, sb->s_blocksize * 8, bh->b_data);
+		ext4_mark_bitmap_end(group_blocks, sb->s_blocksize * 8,
+				     bh->b_data);
 	}
 	return free_blocks - ext4_group_used_meta_blocks(sb, block_group, gdp);
 }
@@ -377,14 +378,11 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
 	ext4_grpblk_t bit;
 	unsigned int i;
 	struct ext4_group_desc *desc;
-	struct ext4_super_block *es;
-	struct ext4_sb_info *sbi;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	int err = 0, ret, blk_free_count;
 	ext4_grpblk_t blocks_freed;
 	struct ext4_group_info *grp;
 
-	sbi = EXT4_SB(sb);
-	es = sbi->s_es;
 	ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
 
 	ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
@@ -477,7 +475,6 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
 	ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
 	if (!err)
 		err = ret;
-	sb->s_dirt = 1;
 
 error_return:
 	brelse(bitmap_bh);
@@ -493,7 +490,7 @@ error_return:
  * Check if filesystem has nblocks free & available for allocation.
  * On success return 1, return 0 on failure.
  */
-int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
+static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
 {
 	s64 free_blocks, dirty_blocks, root_blocks;
 	struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 5b6973fbf1b..fac90f3fba8 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -29,16 +29,15 @@ struct ext4_system_zone {
 
 static struct kmem_cache *ext4_system_zone_cachep;
 
-int __init init_ext4_system_zone(void)
+int __init ext4_init_system_zone(void)
 {
-	ext4_system_zone_cachep = KMEM_CACHE(ext4_system_zone,
-					     SLAB_RECLAIM_ACCOUNT);
+	ext4_system_zone_cachep = KMEM_CACHE(ext4_system_zone, 0);
 	if (ext4_system_zone_cachep == NULL)
 		return -ENOMEM;
 	return 0;
 }
 
-void exit_ext4_system_zone(void)
+void ext4_exit_system_zone(void)
 {
 	kmem_cache_destroy(ext4_system_zone_cachep);
 }
@@ -229,16 +228,20 @@ int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk,
 
 	if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) ||
 	    (start_blk + count < start_blk) ||
-	    (start_blk + count > ext4_blocks_count(sbi->s_es)))
+	    (start_blk + count > ext4_blocks_count(sbi->s_es))) {
+		sbi->s_es->s_last_error_block = cpu_to_le64(start_blk);
 		return 0;
+	}
 	while (n) {
 		entry = rb_entry(n, struct ext4_system_zone, node);
 		if (start_blk + count - 1 < entry->start_blk)
 			n = n->rb_left;
 		else if (start_blk >= (entry->start_blk + entry->count))
 			n = n->rb_right;
-		else
+		else {
+			sbi->s_es->s_last_error_block = cpu_to_le64(start_blk);
 			return 0;
+		}
 	}
 	return 1;
 }
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index ea5e6cb7e2a..ece76fb6a40 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -39,7 +39,7 @@ static int ext4_release_dir(struct inode *inode,
 				struct file *filp);
 
 const struct file_operations ext4_dir_operations = {
-	.llseek		= generic_file_llseek,
+	.llseek		= ext4_llseek,
 	.read		= generic_read_dir,
 	.readdir	= ext4_readdir,		/* we take BKL. needed?*/
 	.unlocked_ioctl = ext4_ioctl,
@@ -61,10 +61,11 @@ static unsigned char get_dtype(struct super_block *sb, int filetype)
 }
 
 
-int ext4_check_dir_entry(const char *function, struct inode *dir,
-			 struct ext4_dir_entry_2 *de,
-			 struct buffer_head *bh,
-			 unsigned int offset)
+int __ext4_check_dir_entry(const char *function, unsigned int line,
+			   struct inode *dir,
+			   struct ext4_dir_entry_2 *de,
+			   struct buffer_head *bh,
+			   unsigned int offset)
 {
 	const char *error_msg = NULL;
 	const int rlen = ext4_rec_len_from_disk(de->rec_len,
@@ -83,11 +84,10 @@ int ext4_check_dir_entry(const char *function, struct inode *dir,
 		error_msg = "inode out of bounds";
 
 	if (error_msg != NULL)
-		ext4_error_inode(function, dir,
-			"bad entry in directory: %s - block=%llu"
+		ext4_error_inode(dir, function, line, bh->b_blocknr,
+			"bad entry in directory: %s - "
 			"offset=%u(%u), inode=%u, rec_len=%d, name_len=%d",
-			error_msg, (unsigned long long) bh->b_blocknr,
-			(unsigned) (offset%bh->b_size), offset,
+			error_msg, (unsigned) (offset%bh->b_size), offset,
 			le32_to_cpu(de->inode),
 			rlen, de->name_len);
 	return error_msg == NULL ? 1 : 0;
@@ -121,7 +121,8 @@ static int ext4_readdir(struct file *filp,
 		 * We don't set the inode dirty flag since it's not
 		 * critical that it get flushed back to the disk.
 		 */
-		ext4_clear_inode_flag(filp->f_path.dentry->d_inode, EXT4_INODE_INDEX);
+		ext4_clear_inode_flag(filp->f_path.dentry->d_inode,
+				      EXT4_INODE_INDEX);
 	}
 	stored = 0;
 	offset = filp->f_pos & (sb->s_blocksize - 1);
@@ -193,7 +194,7 @@ revalidate:
 		while (!error && filp->f_pos < inode->i_size
 		       && offset < sb->s_blocksize) {
 			de = (struct ext4_dir_entry_2 *) (bh->b_data + offset);
-			if (!ext4_check_dir_entry("ext4_readdir", inode, de,
+			if (!ext4_check_dir_entry(inode, de,
 						  bh, offset)) {
 				/*
 				 * On error, skip the f_pos to the next block
@@ -343,7 +344,7 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
 	struct dir_private_info *info;
 	int len;
 
-	info = (struct dir_private_info *) dir_file->private_data;
+	info = dir_file->private_data;
 	p = &info->root.rb_node;
 
 	/* Create and allocate the fname structure */
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 19a4de57128..8b5dd6369f8 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -57,10 +57,13 @@
 #endif
 
 #define EXT4_ERROR_INODE(inode, fmt, a...) \
-	ext4_error_inode(__func__, (inode), (fmt), ## a)
+	ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a)
+
+#define EXT4_ERROR_INODE_BLOCK(inode, block, fmt, a...)			\
+	ext4_error_inode((inode), __func__, __LINE__, (block), (fmt), ## a)
 
 #define EXT4_ERROR_FILE(file, fmt, a...)	\
-	ext4_error_file(__func__, (file), (fmt), ## a)
+	ext4_error_file(__func__, __LINE__, (file), (fmt), ## a)
 
 /* data type for block offset of block group */
 typedef int ext4_grpblk_t;
@@ -165,17 +168,42 @@ struct mpage_da_data {
 	int pages_written;
 	int retval;
 };
-#define	EXT4_IO_UNWRITTEN	0x1
+
+/*
+ * Flags for ext4_io_end->flags
+ */
+#define	EXT4_IO_END_UNWRITTEN	0x0001
+#define EXT4_IO_END_ERROR	0x0002
+
+struct ext4_io_page {
+	struct page	*p_page;
+	int		p_count;
+};
+
+#define MAX_IO_PAGES 128
+
 typedef struct ext4_io_end {
-	struct list_head	list;		/* per-file finished AIO list */
+	struct list_head	list;		/* per-file finished IO list */
 	struct inode		*inode;		/* file being written to */
 	unsigned int		flag;		/* unwritten or not */
 	struct page		*page;		/* page struct for buffer write */
 	loff_t			offset;		/* offset in the file */
 	ssize_t			size;		/* size of the extent */
 	struct work_struct	work;		/* data work queue */
+	struct kiocb		*iocb;		/* iocb struct for AIO */
+	int			result;		/* error value for AIO */
+	int			num_io_pages;
+	struct ext4_io_page	*pages[MAX_IO_PAGES];
 } ext4_io_end_t;
 
+struct ext4_io_submit {
+	int			io_op;
+	struct bio		*io_bio;
+	ext4_io_end_t		*io_end;
+	struct ext4_io_page	*io_page;
+	sector_t		io_next_block;
+};
+
 /*
  * Special inodes numbers
  */
@@ -200,6 +228,7 @@ typedef struct ext4_io_end {
 #define EXT4_MIN_BLOCK_SIZE		1024
 #define	EXT4_MAX_BLOCK_SIZE		65536
 #define EXT4_MIN_BLOCK_LOG_SIZE		10
+#define EXT4_MAX_BLOCK_LOG_SIZE		16
 #ifdef __KERNEL__
 # define EXT4_BLOCK_SIZE(s)		((s)->s_blocksize)
 #else
@@ -460,7 +489,7 @@ struct ext4_new_group_data {
 };
 
 /*
- * Flags used by ext4_get_blocks()
+ * Flags used by ext4_map_blocks()
  */
 	/* Allocate any needed blocks and/or convert an unitialized
 	   extent to be an initialized ext4 */
@@ -873,7 +902,6 @@ struct ext4_inode_info {
 #define EXT4_MOUNT_POSIX_ACL		0x08000	/* POSIX Access Control Lists */
 #define EXT4_MOUNT_NO_AUTO_DA_ALLOC	0x10000	/* No auto delalloc mapping */
 #define EXT4_MOUNT_BARRIER		0x20000 /* Use block barriers */
-#define EXT4_MOUNT_NOBH			0x40000 /* No bufferheads */
 #define EXT4_MOUNT_QUOTA		0x80000 /* Some quota option set */
 #define EXT4_MOUNT_USRQUOTA		0x100000 /* "old" user quota */
 #define EXT4_MOUNT_GRPQUOTA		0x200000 /* "old" group quota */
@@ -885,6 +913,7 @@ struct ext4_inode_info {
 #define EXT4_MOUNT_DATA_ERR_ABORT	0x10000000 /* Abort on file data write */
 #define EXT4_MOUNT_BLOCK_VALIDITY	0x20000000 /* Block validity checking */
 #define EXT4_MOUNT_DISCARD		0x40000000 /* Issue DISCARD requests */
+#define EXT4_MOUNT_INIT_INODE_TABLE	0x80000000 /* Initialize uninitialized itables */
 
 #define clear_opt(o, opt)		o &= ~EXT4_MOUNT_##opt
 #define set_opt(o, opt)			o |= EXT4_MOUNT_##opt
@@ -982,7 +1011,7 @@ struct ext4_super_block {
 	__le32	s_last_orphan;		/* start of list of inodes to delete */
 	__le32	s_hash_seed[4];		/* HTREE hash seed */
 	__u8	s_def_hash_version;	/* Default hash version to use */
-	__u8	s_reserved_char_pad;
+	__u8	s_jnl_backup_type;
 	__le16  s_desc_size;		/* size of group descriptor */
 /*100*/	__le32	s_default_mount_opts;
 	__le32	s_first_meta_bg;	/* First metablock block group */
@@ -1000,12 +1029,34 @@ struct ext4_super_block {
 	__le64  s_mmp_block;            /* Block for multi-mount protection */
 	__le32  s_raid_stripe_width;    /* blocks on all data disks (N*stride)*/
 	__u8	s_log_groups_per_flex;  /* FLEX_BG group size */
-	__u8	s_reserved_char_pad2;
+	__u8	s_reserved_char_pad;
 	__le16  s_reserved_pad;
 	__le64	s_kbytes_written;	/* nr of lifetime kilobytes written */
-	__u32   s_reserved[160];        /* Padding to the end of the block */
+	__le32	s_snapshot_inum;	/* Inode number of active snapshot */
+	__le32	s_snapshot_id;		/* sequential ID of active snapshot */
+	__le64	s_snapshot_r_blocks_count; /* reserved blocks for active
+					      snapshot's future use */
+	__le32	s_snapshot_list;	/* inode number of the head of the
+					   on-disk snapshot list */
+#define EXT4_S_ERR_START offsetof(struct ext4_super_block, s_error_count)
+	__le32	s_error_count;		/* number of fs errors */
+	__le32	s_first_error_time;	/* first time an error happened */
+	__le32	s_first_error_ino;	/* inode involved in first error */
+	__le64	s_first_error_block;	/* block involved of first error */
+	__u8	s_first_error_func[32];	/* function where the error happened */
+	__le32	s_first_error_line;	/* line number where error happened */
+	__le32	s_last_error_time;	/* most recent time of an error */
+	__le32	s_last_error_ino;	/* inode involved in last error */
+	__le32	s_last_error_line;	/* line number where error happened */
+	__le64	s_last_error_block;	/* block involved of last error */
+	__u8	s_last_error_func[32];	/* function where the error happened */
+#define EXT4_S_ERR_END offsetof(struct ext4_super_block, s_mount_opts)
+	__u8	s_mount_opts[64];
+	__le32	s_reserved[112];        /* Padding to the end of the block */
 };
 
+#define EXT4_S_ERR_LEN (EXT4_S_ERR_END - EXT4_S_ERR_START)
+
 #ifdef __KERNEL__
 
 /*
@@ -1061,7 +1112,6 @@ struct ext4_sb_info {
 	struct completion s_kobj_unregister;
 
 	/* Journaling */
-	struct inode *s_journal_inode;
 	struct journal_s *s_journal;
 	struct list_head s_orphan;
 	struct mutex s_orphan_lock;
@@ -1094,10 +1144,7 @@ struct ext4_sb_info {
 	/* for buddy allocator */
 	struct ext4_group_info ***s_group_info;
 	struct inode *s_buddy_cache;
-	long s_blocks_reserved;
-	spinlock_t s_reserve_lock;
 	spinlock_t s_md_lock;
-	tid_t s_last_transaction;
 	unsigned short *s_mb_offsets;
 	unsigned int *s_mb_maxs;
 
@@ -1115,7 +1162,6 @@ struct ext4_sb_info {
 	unsigned long s_mb_last_start;
 
 	/* stats for buddy allocator */
-	spinlock_t s_mb_pa_lock;
 	atomic_t s_bal_reqs;	/* number of reqs with len > 1 */
 	atomic_t s_bal_success;	/* we found long enough chunks */
 	atomic_t s_bal_allocated;	/* in blocks */
@@ -1143,6 +1189,14 @@ struct ext4_sb_info {
 
 	/* workqueue for dio unwritten */
 	struct workqueue_struct *dio_unwritten_wq;
+
+	/* timer for periodic error stats printing */
+	struct timer_list s_err_report;
+
+	/* Lazy inode table initialization info */
+	struct ext4_li_request *s_li_request;
+	/* Wait multiplier for lazy initialization thread */
+	unsigned int s_li_wait_mult;
 };
 
 static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1313,6 +1367,10 @@ EXT4_INODE_BIT_FNS(state, state_flags)
 #define EXT4_DEFM_JMODE_DATA	0x0020
 #define EXT4_DEFM_JMODE_ORDERED	0x0040
 #define EXT4_DEFM_JMODE_WBACK	0x0060
+#define EXT4_DEFM_NOBARRIER	0x0100
+#define EXT4_DEFM_BLOCK_VALIDITY 0x0200
+#define EXT4_DEFM_DISCARD	0x0400
+#define EXT4_DEFM_NODELALLOC	0x0800
 
 /*
  * Default journal batch times
@@ -1379,6 +1437,43 @@ struct ext4_dir_entry_2 {
 #define EXT4_MAX_REC_LEN		((1<<16)-1)
 
 /*
+ * If we ever get support for fs block sizes > page_size, we'll need
+ * to remove the #if statements in the next two functions...
+ */
+static inline unsigned int
+ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize)
+{
+	unsigned len = le16_to_cpu(dlen);
+
+#if (PAGE_CACHE_SIZE >= 65536)
+	if (len == EXT4_MAX_REC_LEN || len == 0)
+		return blocksize;
+	return (len & 65532) | ((len & 3) << 16);
+#else
+	return len;
+#endif
+}
+
+static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
+{
+	if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3))
+		BUG();
+#if (PAGE_CACHE_SIZE >= 65536)
+	if (len < 65536)
+		return cpu_to_le16(len);
+	if (len == blocksize) {
+		if (blocksize == 65536)
+			return cpu_to_le16(EXT4_MAX_REC_LEN);
+		else
+			return cpu_to_le16(0);
+	}
+	return cpu_to_le16((len & 65532) | ((len >> 16) & 3));
+#else
+	return cpu_to_le16(len);
+#endif
+}
+
+/*
  * Hash Tree Directory indexing
  * (c) Daniel Phillips, 2001
  */
@@ -1463,7 +1558,42 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no)
 void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
 			ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp);
 
-extern struct proc_dir_entry *ext4_proc_root;
+/*
+ * Timeout and state flag for lazy initialization inode thread.
+ */
+#define EXT4_DEF_LI_WAIT_MULT			10
+#define EXT4_DEF_LI_MAX_START_DELAY		5
+#define EXT4_LAZYINIT_QUIT			0x0001
+#define EXT4_LAZYINIT_RUNNING			0x0002
+
+/*
+ * Lazy inode table initialization info
+ */
+struct ext4_lazy_init {
+	unsigned long		li_state;
+
+	wait_queue_head_t	li_wait_daemon;
+	wait_queue_head_t	li_wait_task;
+	struct timer_list	li_timer;
+	struct task_struct	*li_task;
+
+	struct list_head	li_request_list;
+	struct mutex		li_list_mtx;
+};
+
+struct ext4_li_request {
+	struct super_block	*lr_super;
+	struct ext4_sb_info	*lr_sbi;
+	ext4_group_t		lr_next_group;
+	struct list_head	lr_request;
+	unsigned long		lr_next_sched;
+	unsigned long		lr_timeout;
+};
+
+struct ext4_features {
+	struct kobject f_kobj;
+	struct completion f_kobj_unregister;
+};
 
 /*
  * Function prototypes
@@ -1491,7 +1621,6 @@ extern unsigned long ext4_bg_num_gdb(struct super_block *sb,
 extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
 			ext4_fsblk_t goal, unsigned long *count, int *errp);
 extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
-extern int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
 extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
 				ext4_fsblk_t block, unsigned long count);
 extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *);
@@ -1510,9 +1639,11 @@ extern unsigned ext4_init_block_bitmap(struct super_block *sb,
 		ext4_init_block_bitmap(sb, NULL, group, desc)
 
 /* dir.c */
-extern int ext4_check_dir_entry(const char *, struct inode *,
-				struct ext4_dir_entry_2 *,
-				struct buffer_head *, unsigned int);
+extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *,
+				  struct ext4_dir_entry_2 *,
+				  struct buffer_head *, unsigned int);
+#define ext4_check_dir_entry(dir, de, bh, offset) \
+	__ext4_check_dir_entry(__func__, __LINE__, (dir), (de), (bh), (offset))
 extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
 				    __u32 minor_hash,
 				    struct ext4_dir_entry_2 *dirent);
@@ -1533,11 +1664,9 @@ extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
 extern unsigned long ext4_count_free_inodes(struct super_block *);
 extern unsigned long ext4_count_dirs(struct super_block *);
 extern void ext4_check_inodes_bitmap(struct super_block *);
-extern unsigned ext4_init_inode_bitmap(struct super_block *sb,
-				       struct buffer_head *bh,
-				       ext4_group_t group,
-				       struct ext4_group_desc *desc);
-extern void mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
+extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
+extern int ext4_init_inode_table(struct super_block *sb,
+				 ext4_group_t group, int barrier);
 
 /* mballoc.c */
 extern long ext4_mb_stats;
@@ -1548,16 +1677,15 @@ extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *,
 				struct ext4_allocation_request *, int *);
 extern int ext4_mb_reserve_blocks(struct super_block *, int);
 extern void ext4_discard_preallocations(struct inode *);
-extern int __init init_ext4_mballoc(void);
-extern void exit_ext4_mballoc(void);
+extern int __init ext4_init_mballoc(void);
+extern void ext4_exit_mballoc(void);
 extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
 			     struct buffer_head *bh, ext4_fsblk_t block,
 			     unsigned long count, int flags);
 extern int ext4_mb_add_groupinfo(struct super_block *sb,
 		ext4_group_t i, struct ext4_group_desc *desc);
-extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t);
-extern void ext4_mb_put_buddy_cache_lock(struct super_block *,
-						ext4_group_t, int);
+extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
+
 /* inode.c */
 struct buffer_head *ext4_getblk(handle_t *, struct inode *,
 						ext4_lblk_t, int, int *);
@@ -1571,7 +1699,8 @@ extern int  ext4_write_inode(struct inode *, struct writeback_control *);
 extern int  ext4_setattr(struct dentry *, struct iattr *);
 extern int  ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
 				struct kstat *stat);
-extern void ext4_delete_inode(struct inode *);
+extern void ext4_evict_inode(struct inode *);
+extern void ext4_clear_inode(struct inode *);
 extern int  ext4_sync_inode(handle_t *, struct inode *);
 extern void ext4_dirty_inode(struct inode *);
 extern int ext4_change_inode_journal_flag(struct inode *, int);
@@ -1584,13 +1713,11 @@ extern void ext4_get_inode_flags(struct ext4_inode_info *);
 extern int ext4_alloc_da_blocks(struct inode *inode);
 extern void ext4_set_aops(struct inode *inode);
 extern int ext4_writepage_trans_blocks(struct inode *);
-extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int idxblocks);
 extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
 extern int ext4_block_truncate_page(handle_t *handle,
 		struct address_space *mapping, loff_t from);
 extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 extern qsize_t *ext4_get_reserved_space(struct inode *inode);
-extern int flush_completed_IO(struct inode *inode);
 extern void ext4_da_update_reserve_space(struct inode *inode,
 					int used, int quota_claim);
 /* ioctl.c */
@@ -1601,8 +1728,6 @@ extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
 extern int ext4_ext_migrate(struct inode *);
 
 /* namei.c */
-extern unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize);
-extern __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize);
 extern int ext4_orphan_add(handle_t *, struct inode *);
 extern int ext4_orphan_del(handle_t *, struct inode *);
 extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
@@ -1616,25 +1741,38 @@ extern int ext4_group_extend(struct super_block *sb,
 				ext4_fsblk_t n_blocks_count);
 
 /* super.c */
-extern void __ext4_error(struct super_block *, const char *, const char *, ...)
-	__attribute__ ((format (printf, 3, 4)));
-#define ext4_error(sb, message...)	__ext4_error(sb, __func__, ## message)
-extern void ext4_error_inode(const char *, struct inode *, const char *, ...)
-	__attribute__ ((format (printf, 3, 4)));
-extern void ext4_error_file(const char *, struct file *, const char *, ...)
-	__attribute__ ((format (printf, 3, 4)));
-extern void __ext4_std_error(struct super_block *, const char *, int);
-extern void ext4_abort(struct super_block *, const char *, const char *, ...)
-	__attribute__ ((format (printf, 3, 4)));
-extern void __ext4_warning(struct super_block *, const char *,
+extern void __ext4_error(struct super_block *, const char *, unsigned int,
+			 const char *, ...)
+	__attribute__ ((format (printf, 4, 5)));
+#define ext4_error(sb, message...)	__ext4_error(sb, __func__,	\
+						     __LINE__, ## message)
+extern void ext4_error_inode(struct inode *, const char *, unsigned int,
+			     ext4_fsblk_t, const char *, ...)
+	__attribute__ ((format (printf, 5, 6)));
+extern void ext4_error_file(struct file *, const char *, unsigned int,
+			    const char *, ...)
+	__attribute__ ((format (printf, 4, 5)));
+extern void __ext4_std_error(struct super_block *, const char *,
+			     unsigned int, int);
+extern void __ext4_abort(struct super_block *, const char *, unsigned int,
+		       const char *, ...)
+	__attribute__ ((format (printf, 4, 5)));
+#define ext4_abort(sb, message...)	__ext4_abort(sb, __func__, \
+						       __LINE__, ## message)
+extern void __ext4_warning(struct super_block *, const char *, unsigned int,
 			  const char *, ...)
-	__attribute__ ((format (printf, 3, 4)));
-#define ext4_warning(sb, message...)	__ext4_warning(sb, __func__, ## message)
+	__attribute__ ((format (printf, 4, 5)));
+#define ext4_warning(sb, message...)	__ext4_warning(sb, __func__, \
+						       __LINE__, ## message)
 extern void ext4_msg(struct super_block *, const char *, const char *, ...)
 	__attribute__ ((format (printf, 3, 4)));
-extern void ext4_grp_locked_error(struct super_block *, ext4_group_t,
-				const char *, const char *, ...)
-	__attribute__ ((format (printf, 4, 5)));
+extern void __ext4_grp_locked_error(const char *, unsigned int, \
+				    struct super_block *, ext4_group_t, \
+				    unsigned long, ext4_fsblk_t, \
+				    const char *, ...)
+	__attribute__ ((format (printf, 7, 8)));
+#define ext4_grp_locked_error(sb, grp, message...) \
+	__ext4_grp_locked_error(__func__, __LINE__, (sb), (grp), ## message)
 extern void ext4_update_dynamic_rev(struct super_block *sb);
 extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb,
 					__u32 compat);
@@ -1768,7 +1906,7 @@ static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi)
 #define ext4_std_error(sb, errno)				\
 do {								\
 	if ((errno))						\
-		__ext4_std_error((sb), __func__, (errno));	\
+		__ext4_std_error((sb), __func__, __LINE__, (errno));	\
 } while (0)
 
 #ifdef CONFIG_SMP
@@ -1860,6 +1998,12 @@ static inline void ext4_unlock_group(struct super_block *sb,
 	spin_unlock(ext4_group_lock_ptr(sb, group));
 }
 
+static inline void ext4_mark_super_dirty(struct super_block *sb)
+{
+	if (EXT4_SB(sb)->s_journal == NULL)
+		sb->s_dirt =1;
+}
+
 /*
  * Inodes and files operations
  */
@@ -1870,6 +2014,7 @@ extern const struct file_operations ext4_dir_operations;
 /* file.c */
 extern const struct inode_operations ext4_file_inode_operations;
 extern const struct file_operations ext4_file_operations;
+extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin);
 
 /* namei.c */
 extern const struct inode_operations ext4_dir_inode_operations;
@@ -1883,8 +2028,8 @@ extern const struct inode_operations ext4_fast_symlink_inode_operations;
 /* block_validity */
 extern void ext4_release_system_zone(struct super_block *sb);
 extern int ext4_setup_system_zone(struct super_block *sb);
-extern int __init init_ext4_system_zone(void);
-extern void exit_ext4_system_zone(void);
+extern int __init ext4_init_system_zone(void);
+extern void ext4_exit_system_zone(void);
 extern int ext4_data_block_valid(struct ext4_sb_info *sbi,
 				 ext4_fsblk_t start_blk,
 				 unsigned int count);
@@ -1905,9 +2050,6 @@ extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
 			  ssize_t len);
 extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
 			   struct ext4_map_blocks *map, int flags);
-extern int ext4_get_blocks(handle_t *handle, struct inode *inode,
-			   sector_t block, unsigned int max_blocks,
-			   struct buffer_head *bh, int flags);
 extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 			__u64 start, __u64 len);
 /* move_extent.c */
@@ -1915,6 +2057,17 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
 			     __u64 start_orig, __u64 start_donor,
 			     __u64 len, __u64 *moved_len);
 
+/* page-io.c */
+extern int __init ext4_init_pageio(void);
+extern void ext4_exit_pageio(void);
+extern void ext4_free_io_end(ext4_io_end_t *io);
+extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
+extern int ext4_end_io_nolock(ext4_io_end_t *io);
+extern void ext4_io_submit(struct ext4_io_submit *io);
+extern int ext4_bio_write_page(struct ext4_io_submit *io,
+			       struct page *page,
+			       int len,
+			       struct writeback_control *wbc);
 
 /* BH_Uninit flag: blocks are allocated but uninitialized on disk */
 enum ext4_state_bits {
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index bdb6ce7e2eb..28ce70fd9cd 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -225,11 +225,60 @@ static inline void ext4_ext_mark_initialized(struct ext4_extent *ext)
 	ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext));
 }
 
+/*
+ * ext4_ext_pblock:
+ * combine low and high parts of physical block number into ext4_fsblk_t
+ */
+static inline ext4_fsblk_t ext4_ext_pblock(struct ext4_extent *ex)
+{
+	ext4_fsblk_t block;
+
+	block = le32_to_cpu(ex->ee_start_lo);
+	block |= ((ext4_fsblk_t) le16_to_cpu(ex->ee_start_hi) << 31) << 1;
+	return block;
+}
+
+/*
+ * ext4_idx_pblock:
+ * combine low and high parts of a leaf physical block number into ext4_fsblk_t
+ */
+static inline ext4_fsblk_t ext4_idx_pblock(struct ext4_extent_idx *ix)
+{
+	ext4_fsblk_t block;
+
+	block = le32_to_cpu(ix->ei_leaf_lo);
+	block |= ((ext4_fsblk_t) le16_to_cpu(ix->ei_leaf_hi) << 31) << 1;
+	return block;
+}
+
+/*
+ * ext4_ext_store_pblock:
+ * stores a large physical block number into an extent struct,
+ * breaking it into parts
+ */
+static inline void ext4_ext_store_pblock(struct ext4_extent *ex,
+					 ext4_fsblk_t pb)
+{
+	ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
+	ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) &
+				      0xffff);
+}
+
+/*
+ * ext4_idx_store_pblock:
+ * stores a large physical block number into an index struct,
+ * breaking it into parts
+ */
+static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix,
+					 ext4_fsblk_t pb)
+{
+	ix->ei_leaf_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
+	ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) &
+				     0xffff);
+}
+
 extern int ext4_ext_calc_metadata_amount(struct inode *inode,
 					 sector_t lblocks);
-extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex);
-extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
-extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t);
 extern int ext4_extent_tree_init(handle_t *, struct inode *);
 extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
 						   int num,
@@ -237,19 +286,9 @@ extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
 extern int ext4_can_extents_be_merged(struct inode *inode,
 				      struct ext4_extent *ex1,
 				      struct ext4_extent *ex2);
-extern int ext4_ext_try_to_merge(struct inode *inode,
-				 struct ext4_ext_path *path,
-				 struct ext4_extent *);
-extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *);
 extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *, int);
-extern int ext4_ext_walk_space(struct inode *, ext4_lblk_t, ext4_lblk_t,
-							ext_prepare_callback, void *);
 extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
 							struct ext4_ext_path *);
-extern int ext4_ext_search_left(struct inode *, struct ext4_ext_path *,
-						ext4_lblk_t *, ext4_fsblk_t *);
-extern int ext4_ext_search_right(struct inode *, struct ext4_ext_path *,
-						ext4_lblk_t *, ext4_fsblk_t *);
 extern void ext4_ext_drop_refs(struct ext4_ext_path *);
 extern int ext4_ext_check_inode(struct inode *inode);
 #endif /* _EXT4_EXTENTS */
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 53d2764d71c..6e272ef6ba9 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -6,29 +6,29 @@
 
 #include <trace/events/ext4.h>
 
-int __ext4_journal_get_undo_access(const char *where, handle_t *handle,
-				struct buffer_head *bh)
+int __ext4_journal_get_undo_access(const char *where, unsigned int line,
+				   handle_t *handle, struct buffer_head *bh)
 {
 	int err = 0;
 
 	if (ext4_handle_valid(handle)) {
 		err = jbd2_journal_get_undo_access(handle, bh);
 		if (err)
-			ext4_journal_abort_handle(where, __func__, bh,
+			ext4_journal_abort_handle(where, line, __func__, bh,
 						  handle, err);
 	}
 	return err;
 }
 
-int __ext4_journal_get_write_access(const char *where, handle_t *handle,
-				struct buffer_head *bh)
+int __ext4_journal_get_write_access(const char *where, unsigned int line,
+				    handle_t *handle, struct buffer_head *bh)
 {
 	int err = 0;
 
 	if (ext4_handle_valid(handle)) {
 		err = jbd2_journal_get_write_access(handle, bh);
 		if (err)
-			ext4_journal_abort_handle(where, __func__, bh,
+			ext4_journal_abort_handle(where, line, __func__, bh,
 						  handle, err);
 	}
 	return err;
@@ -46,9 +46,9 @@ int __ext4_journal_get_write_access(const char *where, handle_t *handle,
  * If the handle isn't valid we're not journaling, but we still need to
  * call into ext4_journal_revoke() to put the buffer head.
  */
-int __ext4_forget(const char *where, handle_t *handle, int is_metadata,
-		  struct inode *inode, struct buffer_head *bh,
-		  ext4_fsblk_t blocknr)
+int __ext4_forget(const char *where, unsigned int line, handle_t *handle,
+		  int is_metadata, struct inode *inode,
+		  struct buffer_head *bh, ext4_fsblk_t blocknr)
 {
 	int err;
 
@@ -79,8 +79,8 @@ int __ext4_forget(const char *where, handle_t *handle, int is_metadata,
 			BUFFER_TRACE(bh, "call jbd2_journal_forget");
 			err = jbd2_journal_forget(handle, bh);
 			if (err)
-				ext4_journal_abort_handle(where, __func__, bh,
-							  handle, err);
+				ext4_journal_abort_handle(where, line, __func__,
+							  bh, handle, err);
 			return err;
 		}
 		return 0;
@@ -92,15 +92,16 @@ int __ext4_forget(const char *where, handle_t *handle, int is_metadata,
 	BUFFER_TRACE(bh, "call jbd2_journal_revoke");
 	err = jbd2_journal_revoke(handle, blocknr, bh);
 	if (err) {
-		ext4_journal_abort_handle(where, __func__, bh, handle, err);
-		ext4_abort(inode->i_sb, __func__,
+		ext4_journal_abort_handle(where, line, __func__,
+					  bh, handle, err);
+		__ext4_abort(inode->i_sb, where, line,
 			   "error %d when attempting revoke", err);
 	}
 	BUFFER_TRACE(bh, "exit");
 	return err;
 }
 
-int __ext4_journal_get_create_access(const char *where,
+int __ext4_journal_get_create_access(const char *where, unsigned int line,
 				handle_t *handle, struct buffer_head *bh)
 {
 	int err = 0;
@@ -108,22 +109,23 @@ int __ext4_journal_get_create_access(const char *where,
 	if (ext4_handle_valid(handle)) {
 		err = jbd2_journal_get_create_access(handle, bh);
 		if (err)
-			ext4_journal_abort_handle(where, __func__, bh,
-						  handle, err);
+			ext4_journal_abort_handle(where, line, __func__,
+						  bh, handle, err);
 	}
 	return err;
 }
 
-int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
-				 struct inode *inode, struct buffer_head *bh)
+int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
+				 handle_t *handle, struct inode *inode,
+				 struct buffer_head *bh)
 {
 	int err = 0;
 
 	if (ext4_handle_valid(handle)) {
 		err = jbd2_journal_dirty_metadata(handle, bh);
 		if (err)
-			ext4_journal_abort_handle(where, __func__, bh,
-						  handle, err);
+			ext4_journal_abort_handle(where, line, __func__,
+						  bh, handle, err);
 	} else {
 		if (inode)
 			mark_buffer_dirty_inode(bh, inode);
@@ -132,14 +134,33 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
 		if (inode && inode_needs_sync(inode)) {
 			sync_dirty_buffer(bh);
 			if (buffer_req(bh) && !buffer_uptodate(bh)) {
-				ext4_error(inode->i_sb,
-					   "IO error syncing inode, "
-					   "inode=%lu, block=%llu",
-					   inode->i_ino,
-					   (unsigned long long) bh->b_blocknr);
+				struct ext4_super_block *es;
+
+				es = EXT4_SB(inode->i_sb)->s_es;
+				es->s_last_error_block =
+					cpu_to_le64(bh->b_blocknr);
+				ext4_error_inode(inode, where, line,
+						 bh->b_blocknr,
+					"IO error syncing itable block");
 				err = -EIO;
 			}
 		}
 	}
 	return err;
 }
+
+int __ext4_handle_dirty_super(const char *where, unsigned int line,
+			      handle_t *handle, struct super_block *sb)
+{
+	struct buffer_head *bh = EXT4_SB(sb)->s_sbh;
+	int err = 0;
+
+	if (ext4_handle_valid(handle)) {
+		err = jbd2_journal_dirty_metadata(handle, bh);
+		if (err)
+			ext4_journal_abort_handle(where, line, __func__,
+						  bh, handle, err);
+	} else
+		sb->s_dirt = 1;
+	return err;
+}
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index dade0c02479..b0bd792c58c 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -122,39 +122,47 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode);
 /*
  * Wrapper functions with which ext4 calls into JBD.
  */
-void ext4_journal_abort_handle(const char *caller, const char *err_fn,
+void ext4_journal_abort_handle(const char *caller, unsigned int line,
+			       const char *err_fn,
 		struct buffer_head *bh, handle_t *handle, int err);
 
-int __ext4_journal_get_undo_access(const char *where, handle_t *handle,
-				struct buffer_head *bh);
+int __ext4_journal_get_undo_access(const char *where, unsigned int line,
+				   handle_t *handle, struct buffer_head *bh);
 
-int __ext4_journal_get_write_access(const char *where, handle_t *handle,
-				struct buffer_head *bh);
+int __ext4_journal_get_write_access(const char *where, unsigned int line,
+				    handle_t *handle, struct buffer_head *bh);
 
-int __ext4_forget(const char *where, handle_t *handle, int is_metadata,
-		  struct inode *inode, struct buffer_head *bh,
-		  ext4_fsblk_t blocknr);
+int __ext4_forget(const char *where, unsigned int line, handle_t *handle,
+		  int is_metadata, struct inode *inode,
+		  struct buffer_head *bh, ext4_fsblk_t blocknr);
 
-int __ext4_journal_get_create_access(const char *where,
+int __ext4_journal_get_create_access(const char *where, unsigned int line,
 				handle_t *handle, struct buffer_head *bh);
 
-int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
-				 struct inode *inode, struct buffer_head *bh);
+int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
+				 handle_t *handle, struct inode *inode,
+				 struct buffer_head *bh);
+
+int __ext4_handle_dirty_super(const char *where, unsigned int line,
+			      handle_t *handle, struct super_block *sb);
 
 #define ext4_journal_get_undo_access(handle, bh) \
-	__ext4_journal_get_undo_access(__func__, (handle), (bh))
+	__ext4_journal_get_undo_access(__func__, __LINE__, (handle), (bh))
 #define ext4_journal_get_write_access(handle, bh) \
-	__ext4_journal_get_write_access(__func__, (handle), (bh))
+	__ext4_journal_get_write_access(__func__, __LINE__, (handle), (bh))
 #define ext4_forget(handle, is_metadata, inode, bh, block_nr) \
-	__ext4_forget(__func__, (handle), (is_metadata), (inode), (bh),\
-		      (block_nr))
+	__ext4_forget(__func__, __LINE__, (handle), (is_metadata), (inode), \
+		      (bh), (block_nr))
 #define ext4_journal_get_create_access(handle, bh) \
-	__ext4_journal_get_create_access(__func__, (handle), (bh))
+	__ext4_journal_get_create_access(__func__, __LINE__, (handle), (bh))
 #define ext4_handle_dirty_metadata(handle, inode, bh) \
-	__ext4_handle_dirty_metadata(__func__, (handle), (inode), (bh))
+	__ext4_handle_dirty_metadata(__func__, __LINE__, (handle), (inode), \
+				     (bh))
+#define ext4_handle_dirty_super(handle, sb) \
+	__ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb))
 
 handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
-int __ext4_journal_stop(const char *where, handle_t *handle);
+int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle);
 
 #define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)
 
@@ -207,7 +215,7 @@ static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks)
 }
 
 #define ext4_journal_stop(handle) \
-	__ext4_journal_stop(__func__, (handle))
+	__ext4_journal_stop(__func__, __LINE__, (handle))
 
 static inline handle_t *ext4_journal_current_handle(void)
 {
@@ -308,17 +316,15 @@ static inline int ext4_should_writeback_data(struct inode *inode)
  * This function controls whether or not we should try to go down the
  * dioread_nolock code paths, which makes it safe to avoid taking
  * i_mutex for direct I/O reads.  This only works for extent-based
- * files, and it doesn't work for nobh or if data journaling is
- * enabled, since the dioread_nolock code uses b_private to pass
- * information back to the I/O completion handler, and this conflicts
- * with the jbd's use of b_private.
+ * files, and it doesn't work if data journaling is enabled, since the
+ * dioread_nolock code uses b_private to pass information back to the
+ * I/O completion handler, and this conflicts with the jbd's use of
+ * b_private.
  */
 static inline int ext4_should_dioread_nolock(struct inode *inode)
 {
 	if (!test_opt(inode->i_sb, DIOREAD_NOLOCK))
 		return 0;
-	if (test_opt(inode->i_sb, NOBH))
-		return 0;
 	if (!S_ISREG(inode->i_mode))
 		return 0;
 	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 377309c1af6..0554c48cb1f 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -44,55 +44,6 @@
 #include "ext4_jbd2.h"
 #include "ext4_extents.h"
 
-
-/*
- * ext_pblock:
- * combine low and high parts of physical block number into ext4_fsblk_t
- */
-ext4_fsblk_t ext_pblock(struct ext4_extent *ex)
-{
-	ext4_fsblk_t block;
-
-	block = le32_to_cpu(ex->ee_start_lo);
-	block |= ((ext4_fsblk_t) le16_to_cpu(ex->ee_start_hi) << 31) << 1;
-	return block;
-}
-
-/*
- * idx_pblock:
- * combine low and high parts of a leaf physical block number into ext4_fsblk_t
- */
-ext4_fsblk_t idx_pblock(struct ext4_extent_idx *ix)
-{
-	ext4_fsblk_t block;
-
-	block = le32_to_cpu(ix->ei_leaf_lo);
-	block |= ((ext4_fsblk_t) le16_to_cpu(ix->ei_leaf_hi) << 31) << 1;
-	return block;
-}
-
-/*
- * ext4_ext_store_pblock:
- * stores a large physical block number into an extent struct,
- * breaking it into parts
- */
-void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb)
-{
-	ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
-	ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
-}
-
-/*
- * ext4_idx_store_pblock:
- * stores a large physical block number into an index struct,
- * breaking it into parts
- */
-static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
-{
-	ix->ei_leaf_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
-	ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
-}
-
 static int ext4_ext_truncate_extend_restart(handle_t *handle,
 					    struct inode *inode,
 					    int needed)
@@ -169,7 +120,8 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
 		/* try to predict block placement */
 		ex = path[depth].p_ext;
 		if (ex)
-			return ext_pblock(ex)+(block-le32_to_cpu(ex->ee_block));
+			return (ext4_ext_pblock(ex) +
+				(block - le32_to_cpu(ex->ee_block)));
 
 		/* it looks like index is empty;
 		 * try to find starting block from index itself */
@@ -354,7 +306,7 @@ ext4_ext_max_entries(struct inode *inode, int depth)
 
 static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
 {
-	ext4_fsblk_t block = ext_pblock(ext);
+	ext4_fsblk_t block = ext4_ext_pblock(ext);
 	int len = ext4_ext_get_actual_len(ext);
 
 	return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
@@ -363,7 +315,7 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
 static int ext4_valid_extent_idx(struct inode *inode,
 				struct ext4_extent_idx *ext_idx)
 {
-	ext4_fsblk_t block = idx_pblock(ext_idx);
+	ext4_fsblk_t block = ext4_idx_pblock(ext_idx);
 
 	return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1);
 }
@@ -401,9 +353,9 @@ static int ext4_valid_extent_entries(struct inode *inode,
 	return 1;
 }
 
-static int __ext4_ext_check(const char *function, struct inode *inode,
-					struct ext4_extent_header *eh,
-					int depth)
+static int __ext4_ext_check(const char *function, unsigned int line,
+			    struct inode *inode, struct ext4_extent_header *eh,
+			    int depth)
 {
 	const char *error_msg;
 	int max = 0;
@@ -436,7 +388,7 @@ static int __ext4_ext_check(const char *function, struct inode *inode,
 	return 0;
 
 corrupted:
-	ext4_error_inode(function, inode,
+	ext4_error_inode(inode, function, line, 0,
 			"bad header/extent: %s - magic %x, "
 			"entries %u, max %u(%u), depth %u(%u)",
 			error_msg, le16_to_cpu(eh->eh_magic),
@@ -447,7 +399,7 @@ corrupted:
 }
 
 #define ext4_ext_check(inode, eh, depth)	\
-	__ext4_ext_check(__func__, inode, eh, depth)
+	__ext4_ext_check(__func__, __LINE__, inode, eh, depth)
 
 int ext4_ext_check_inode(struct inode *inode)
 {
@@ -463,13 +415,13 @@ static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
 	for (k = 0; k <= l; k++, path++) {
 		if (path->p_idx) {
 		  ext_debug("  %d->%llu", le32_to_cpu(path->p_idx->ei_block),
-			    idx_pblock(path->p_idx));
+			    ext4_idx_pblock(path->p_idx));
 		} else if (path->p_ext) {
 			ext_debug("  %d:[%d]%d:%llu ",
 				  le32_to_cpu(path->p_ext->ee_block),
 				  ext4_ext_is_uninitialized(path->p_ext),
 				  ext4_ext_get_actual_len(path->p_ext),
-				  ext_pblock(path->p_ext));
+				  ext4_ext_pblock(path->p_ext));
 		} else
 			ext_debug("  []");
 	}
@@ -494,7 +446,7 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
 	for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
 		ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
 			  ext4_ext_is_uninitialized(ex),
-			  ext4_ext_get_actual_len(ex), ext_pblock(ex));
+			  ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex));
 	}
 	ext_debug("\n");
 }
@@ -545,7 +497,7 @@ ext4_ext_binsearch_idx(struct inode *inode,
 
 	path->p_idx = l - 1;
 	ext_debug("  -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block),
-		  idx_pblock(path->p_idx));
+		  ext4_idx_pblock(path->p_idx));
 
 #ifdef CHECK_BINSEARCH
 	{
@@ -614,7 +566,7 @@ ext4_ext_binsearch(struct inode *inode,
 	path->p_ext = l - 1;
 	ext_debug("  -> %d:%llu:[%d]%d ",
 			le32_to_cpu(path->p_ext->ee_block),
-			ext_pblock(path->p_ext),
+			ext4_ext_pblock(path->p_ext),
 			ext4_ext_is_uninitialized(path->p_ext),
 			ext4_ext_get_actual_len(path->p_ext));
 
@@ -682,7 +634,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
 			  ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
 
 		ext4_ext_binsearch_idx(inode, path + ppos, block);
-		path[ppos].p_block = idx_pblock(path[ppos].p_idx);
+		path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
 		path[ppos].p_depth = i;
 		path[ppos].p_ext = NULL;
 
@@ -721,7 +673,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
 	ext4_ext_binsearch(inode, path + ppos, block);
 	/* if not an empty leaf */
 	if (path[ppos].p_ext)
-		path[ppos].p_block = ext_pblock(path[ppos].p_ext);
+		path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);
 
 	ext4_ext_show_path(inode, path);
 
@@ -739,9 +691,9 @@ err:
  * insert new index [@logical;@ptr] into the block at @curp;
  * check where to insert: before @curp or after @curp
  */
-int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
-				struct ext4_ext_path *curp,
-				int logical, ext4_fsblk_t ptr)
+static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
+				 struct ext4_ext_path *curp,
+				 int logical, ext4_fsblk_t ptr)
 {
 	struct ext4_extent_idx *ix;
 	int len, err;
@@ -917,7 +869,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 			EXT_MAX_EXTENT(path[depth].p_hdr)) {
 		ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
 				le32_to_cpu(path[depth].p_ext->ee_block),
-				ext_pblock(path[depth].p_ext),
+				ext4_ext_pblock(path[depth].p_ext),
 				ext4_ext_is_uninitialized(path[depth].p_ext),
 				ext4_ext_get_actual_len(path[depth].p_ext),
 				newblock);
@@ -1007,7 +959,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 		while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
 			ext_debug("%d: move %d:%llu in new index %llu\n", i,
 					le32_to_cpu(path[i].p_idx->ei_block),
-					idx_pblock(path[i].p_idx),
+					ext4_idx_pblock(path[i].p_idx),
 					newblock);
 			/*memmove(++fidx, path[i].p_idx++,
 					sizeof(struct ext4_extent_idx));
@@ -1083,7 +1035,6 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
 {
 	struct ext4_ext_path *curp = path;
 	struct ext4_extent_header *neh;
-	struct ext4_extent_idx *fidx;
 	struct buffer_head *bh;
 	ext4_fsblk_t newblock;
 	int err = 0;
@@ -1144,10 +1095,10 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
 	ext4_idx_store_pblock(curp->p_idx, newblock);
 
 	neh = ext_inode_hdr(inode);
-	fidx = EXT_FIRST_INDEX(neh);
 	ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
 		  le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
-		  le32_to_cpu(fidx->ei_block), idx_pblock(fidx));
+		  le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
+		  ext4_idx_pblock(EXT_FIRST_INDEX(neh)));
 
 	neh->eh_depth = cpu_to_le16(path->p_depth + 1);
 	err = ext4_ext_dirty(handle, inode, curp);
@@ -1233,9 +1184,9 @@ out:
  * returns 0 at @phys
  * return value contains 0 (success) or error code
  */
-int
-ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
-			ext4_lblk_t *logical, ext4_fsblk_t *phys)
+static int ext4_ext_search_left(struct inode *inode,
+				struct ext4_ext_path *path,
+				ext4_lblk_t *logical, ext4_fsblk_t *phys)
 {
 	struct ext4_extent_idx *ix;
 	struct ext4_extent *ex;
@@ -1287,7 +1238,7 @@ ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
 	}
 
 	*logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
-	*phys = ext_pblock(ex) + ee_len - 1;
+	*phys = ext4_ext_pblock(ex) + ee_len - 1;
 	return 0;
 }
 
@@ -1298,9 +1249,9 @@ ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
  * returns 0 at @phys
  * return value contains 0 (success) or error code
  */
-int
-ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
-			ext4_lblk_t *logical, ext4_fsblk_t *phys)
+static int ext4_ext_search_right(struct inode *inode,
+				 struct ext4_ext_path *path,
+				 ext4_lblk_t *logical, ext4_fsblk_t *phys)
 {
 	struct buffer_head *bh = NULL;
 	struct ext4_extent_header *eh;
@@ -1343,7 +1294,7 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
 			}
 		}
 		*logical = le32_to_cpu(ex->ee_block);
-		*phys = ext_pblock(ex);
+		*phys = ext4_ext_pblock(ex);
 		return 0;
 	}
 
@@ -1358,7 +1309,7 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
 		/* next allocated block in this leaf */
 		ex++;
 		*logical = le32_to_cpu(ex->ee_block);
-		*phys = ext_pblock(ex);
+		*phys = ext4_ext_pblock(ex);
 		return 0;
 	}
 
@@ -1377,7 +1328,7 @@ got_index:
 	 * follow it and find the closest allocated
 	 * block to the right */
 	ix++;
-	block = idx_pblock(ix);
+	block = ext4_idx_pblock(ix);
 	while (++depth < path->p_depth) {
 		bh = sb_bread(inode->i_sb, block);
 		if (bh == NULL)
@@ -1389,7 +1340,7 @@ got_index:
 			return -EIO;
 		}
 		ix = EXT_FIRST_INDEX(eh);
-		block = idx_pblock(ix);
+		block = ext4_idx_pblock(ix);
 		put_bh(bh);
 	}
 
@@ -1403,7 +1354,7 @@ got_index:
 	}
 	ex = EXT_FIRST_EXTENT(eh);
 	*logical = le32_to_cpu(ex->ee_block);
-	*phys = ext_pblock(ex);
+	*phys = ext4_ext_pblock(ex);
 	put_bh(bh);
 	return 0;
 }
@@ -1574,7 +1525,7 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
 		return 0;
 #endif
 
-	if (ext_pblock(ex1) + ext1_ee_len == ext_pblock(ex2))
+	if (ext4_ext_pblock(ex1) + ext1_ee_len == ext4_ext_pblock(ex2))
 		return 1;
 	return 0;
 }
@@ -1586,9 +1537,9 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
  * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
  * 1 if they got merged.
  */
-int ext4_ext_try_to_merge(struct inode *inode,
-			  struct ext4_ext_path *path,
-			  struct ext4_extent *ex)
+static int ext4_ext_try_to_merge(struct inode *inode,
+				 struct ext4_ext_path *path,
+				 struct ext4_extent *ex)
 {
 	struct ext4_extent_header *eh;
 	unsigned int depth, len;
@@ -1633,9 +1584,9 @@ int ext4_ext_try_to_merge(struct inode *inode,
  * such that there will be no overlap, and then returns 1.
  * If there is no overlap found, it returns 0.
  */
-unsigned int ext4_ext_check_overlap(struct inode *inode,
-				    struct ext4_extent *newext,
-				    struct ext4_ext_path *path)
+static unsigned int ext4_ext_check_overlap(struct inode *inode,
+					   struct ext4_extent *newext,
+					   struct ext4_ext_path *path)
 {
 	ext4_lblk_t b1, b2;
 	unsigned int depth, len1;
@@ -1707,11 +1658,12 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
 	if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)
 		&& ext4_can_extents_be_merged(inode, ex, newext)) {
 		ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
-				ext4_ext_is_uninitialized(newext),
-				ext4_ext_get_actual_len(newext),
-				le32_to_cpu(ex->ee_block),
-				ext4_ext_is_uninitialized(ex),
-				ext4_ext_get_actual_len(ex), ext_pblock(ex));
+			  ext4_ext_is_uninitialized(newext),
+			  ext4_ext_get_actual_len(newext),
+			  le32_to_cpu(ex->ee_block),
+			  ext4_ext_is_uninitialized(ex),
+			  ext4_ext_get_actual_len(ex),
+			  ext4_ext_pblock(ex));
 		err = ext4_ext_get_access(handle, inode, path + depth);
 		if (err)
 			return err;
@@ -1781,7 +1733,7 @@ has_space:
 		/* there is no extent in this leaf, create first one */
 		ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n",
 				le32_to_cpu(newext->ee_block),
-				ext_pblock(newext),
+				ext4_ext_pblock(newext),
 				ext4_ext_is_uninitialized(newext),
 				ext4_ext_get_actual_len(newext));
 		path[depth].p_ext = EXT_FIRST_EXTENT(eh);
@@ -1795,7 +1747,7 @@ has_space:
 			ext_debug("insert %d:%llu:[%d]%d after: nearest 0x%p, "
 					"move %d from 0x%p to 0x%p\n",
 					le32_to_cpu(newext->ee_block),
-					ext_pblock(newext),
+					ext4_ext_pblock(newext),
 					ext4_ext_is_uninitialized(newext),
 					ext4_ext_get_actual_len(newext),
 					nearex, len, nearex + 1, nearex + 2);
@@ -1809,7 +1761,7 @@ has_space:
 		ext_debug("insert %d:%llu:[%d]%d before: nearest 0x%p, "
 				"move %d from 0x%p to 0x%p\n",
 				le32_to_cpu(newext->ee_block),
-				ext_pblock(newext),
+				ext4_ext_pblock(newext),
 				ext4_ext_is_uninitialized(newext),
 				ext4_ext_get_actual_len(newext),
 				nearex, len, nearex + 1, nearex + 2);
@@ -1820,7 +1772,7 @@ has_space:
 	le16_add_cpu(&eh->eh_entries, 1);
 	nearex = path[depth].p_ext;
 	nearex->ee_block = newext->ee_block;
-	ext4_ext_store_pblock(nearex, ext_pblock(newext));
+	ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext));
 	nearex->ee_len = newext->ee_len;
 
 merge:
@@ -1846,9 +1798,9 @@ cleanup:
 	return err;
 }
 
-int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
-			ext4_lblk_t num, ext_prepare_callback func,
-			void *cbdata)
+static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
+			       ext4_lblk_t num, ext_prepare_callback func,
+			       void *cbdata)
 {
 	struct ext4_ext_path *path = NULL;
 	struct ext4_ext_cache cbex;
@@ -1924,7 +1876,7 @@ int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
 		} else {
 			cbex.ec_block = le32_to_cpu(ex->ee_block);
 			cbex.ec_len = ext4_ext_get_actual_len(ex);
-			cbex.ec_start = ext_pblock(ex);
+			cbex.ec_start = ext4_ext_pblock(ex);
 			cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
 		}
 
@@ -2074,7 +2026,7 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
 
 	/* free index block */
 	path--;
-	leaf = idx_pblock(path->p_idx);
+	leaf = ext4_idx_pblock(path->p_idx);
 	if (unlikely(path->p_hdr->eh_entries == 0)) {
 		EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
 		return -EIO;
@@ -2182,7 +2134,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
 		ext4_fsblk_t start;
 
 		num = le32_to_cpu(ex->ee_block) + ee_len - from;
-		start = ext_pblock(ex) + ee_len - num;
+		start = ext4_ext_pblock(ex) + ee_len - num;
 		ext_debug("free last %u blocks starting %llu\n", num, start);
 		ext4_free_blocks(handle, inode, 0, start, num, flags);
 	} else if (from == le32_to_cpu(ex->ee_block)
@@ -2311,7 +2263,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 			goto out;
 
 		ext_debug("new extent: %u:%u:%llu\n", block, num,
-				ext_pblock(ex));
+				ext4_ext_pblock(ex));
 		ex--;
 		ex_ee_block = le32_to_cpu(ex->ee_block);
 		ex_ee_len = ext4_ext_get_actual_len(ex);
@@ -2422,9 +2374,9 @@ again:
 			struct buffer_head *bh;
 			/* go to the next level */
 			ext_debug("move to level %d (block %llu)\n",
-				  i + 1, idx_pblock(path[i].p_idx));
+				  i + 1, ext4_idx_pblock(path[i].p_idx));
 			memset(path + i + 1, 0, sizeof(*path));
-			bh = sb_bread(sb, idx_pblock(path[i].p_idx));
+			bh = sb_bread(sb, ext4_idx_pblock(path[i].p_idx));
 			if (!bh) {
 				/* should we reset i_size? */
 				err = -EIO;
@@ -2536,77 +2488,21 @@ void ext4_ext_release(struct super_block *sb)
 #endif
 }
 
-static void bi_complete(struct bio *bio, int error)
-{
-	complete((struct completion *)bio->bi_private);
-}
-
 /* FIXME!! we need to try to merge to left or right after zero-out  */
 static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
 {
+	ext4_fsblk_t ee_pblock;
+	unsigned int ee_len;
 	int ret;
-	struct bio *bio;
-	int blkbits, blocksize;
-	sector_t ee_pblock;
-	struct completion event;
-	unsigned int ee_len, len, done, offset;
 
-
-	blkbits   = inode->i_blkbits;
-	blocksize = inode->i_sb->s_blocksize;
 	ee_len    = ext4_ext_get_actual_len(ex);
-	ee_pblock = ext_pblock(ex);
-
-	/* convert ee_pblock to 512 byte sectors */
-	ee_pblock = ee_pblock << (blkbits - 9);
-
-	while (ee_len > 0) {
-
-		if (ee_len > BIO_MAX_PAGES)
-			len = BIO_MAX_PAGES;
-		else
-			len = ee_len;
-
-		bio = bio_alloc(GFP_NOIO, len);
-		if (!bio)
-			return -ENOMEM;
-
-		bio->bi_sector = ee_pblock;
-		bio->bi_bdev   = inode->i_sb->s_bdev;
-
-		done = 0;
-		offset = 0;
-		while (done < len) {
-			ret = bio_add_page(bio, ZERO_PAGE(0),
-							blocksize, offset);
-			if (ret != blocksize) {
-				/*
-				 * We can't add any more pages because of
-				 * hardware limitations.  Start a new bio.
-				 */
-				break;
-			}
-			done++;
-			offset += blocksize;
-			if (offset >= PAGE_CACHE_SIZE)
-				offset = 0;
-		}
+	ee_pblock = ext4_ext_pblock(ex);
 
-		init_completion(&event);
-		bio->bi_private = &event;
-		bio->bi_end_io = bi_complete;
-		submit_bio(WRITE, bio);
-		wait_for_completion(&event);
+	ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, GFP_NOFS);
+	if (ret > 0)
+		ret = 0;
 
-		if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
-			bio_put(bio);
-			return -EIO;
-		}
-		bio_put(bio);
-		ee_len    -= done;
-		ee_pblock += done  << (blkbits - 9);
-	}
-	return 0;
+	return ret;
 }
 
 #define EXT4_EXT_ZERO_LEN 7
@@ -2652,12 +2548,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 	ee_block = le32_to_cpu(ex->ee_block);
 	ee_len = ext4_ext_get_actual_len(ex);
 	allocated = ee_len - (map->m_lblk - ee_block);
-	newblock = map->m_lblk - ee_block + ext_pblock(ex);
+	newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex);
 
 	ex2 = ex;
 	orig_ex.ee_block = ex->ee_block;
 	orig_ex.ee_len   = cpu_to_le16(ee_len);
-	ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
+	ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex));
 
 	/*
 	 * It is safe to convert extent to initialized via explicit
@@ -2676,7 +2572,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 		/* update the extent length and mark as initialized */
 		ex->ee_block = orig_ex.ee_block;
 		ex->ee_len   = orig_ex.ee_len;
-		ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+		ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
 		ext4_ext_dirty(handle, inode, path + depth);
 		/* zeroed the full extent */
 		return allocated;
@@ -2711,7 +2607,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 			ex->ee_block = orig_ex.ee_block;
 			ex->ee_len   = cpu_to_le16(ee_len - allocated);
 			ext4_ext_mark_uninitialized(ex);
-			ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+			ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
 			ext4_ext_dirty(handle, inode, path + depth);
 
 			ex3 = &newex;
@@ -2726,7 +2622,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 					goto fix_extent_len;
 				ex->ee_block = orig_ex.ee_block;
 				ex->ee_len   = orig_ex.ee_len;
-				ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+				ext4_ext_store_pblock(ex,
+					ext4_ext_pblock(&orig_ex));
 				ext4_ext_dirty(handle, inode, path + depth);
 				/* blocks available from map->m_lblk */
 				return allocated;
@@ -2783,7 +2680,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 			/* update the extent length and mark as initialized */
 			ex->ee_block = orig_ex.ee_block;
 			ex->ee_len   = orig_ex.ee_len;
-			ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+			ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
 			ext4_ext_dirty(handle, inode, path + depth);
 			/* zeroed the full extent */
 			/* blocks available from map->m_lblk */
@@ -2834,7 +2731,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 			/* update the extent length and mark as initialized */
 			ex->ee_block = orig_ex.ee_block;
 			ex->ee_len   = orig_ex.ee_len;
-			ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+			ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
 			ext4_ext_dirty(handle, inode, path + depth);
 			/* zero out the first half */
 			/* blocks available from map->m_lblk */
@@ -2903,7 +2800,7 @@ insert:
 		/* update the extent length and mark as initialized */
 		ex->ee_block = orig_ex.ee_block;
 		ex->ee_len   = orig_ex.ee_len;
-		ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+		ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
 		ext4_ext_dirty(handle, inode, path + depth);
 		/* zero out the first half */
 		return allocated;
@@ -2916,7 +2813,7 @@ out:
 fix_extent_len:
 	ex->ee_block = orig_ex.ee_block;
 	ex->ee_len   = orig_ex.ee_len;
-	ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+	ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
 	ext4_ext_mark_uninitialized(ex);
 	ext4_ext_dirty(handle, inode, path + depth);
 	return err;
@@ -2937,7 +2834,7 @@ fix_extent_len:
  * One of more index blocks maybe needed if the extent tree grow after
  * the unintialized extent split. To prevent ENOSPC occur at the IO
  * complete, we need to split the uninitialized extent before DIO submit
- * the IO. The uninitilized extent called at this time will be split
+ * the IO. The uninitialized extent called at this time will be split
  * into three uninitialized extent(at most). After IO complete, the part
  * being filled will be convert to initialized by the end_io callback function
  * via ext4_convert_unwritten_extents().
@@ -2954,7 +2851,6 @@ static int ext4_split_unwritten_extents(handle_t *handle,
 	struct ext4_extent *ex1 = NULL;
 	struct ext4_extent *ex2 = NULL;
 	struct ext4_extent *ex3 = NULL;
-	struct ext4_extent_header *eh;
 	ext4_lblk_t ee_block, eof_block;
 	unsigned int allocated, ee_len, depth;
 	ext4_fsblk_t newblock;
@@ -2971,17 +2867,16 @@ static int ext4_split_unwritten_extents(handle_t *handle,
 		eof_block = map->m_lblk + map->m_len;
 
 	depth = ext_depth(inode);
-	eh = path[depth].p_hdr;
 	ex = path[depth].p_ext;
 	ee_block = le32_to_cpu(ex->ee_block);
 	ee_len = ext4_ext_get_actual_len(ex);
 	allocated = ee_len - (map->m_lblk - ee_block);
-	newblock = map->m_lblk - ee_block + ext_pblock(ex);
+	newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex);
 
 	ex2 = ex;
 	orig_ex.ee_block = ex->ee_block;
 	orig_ex.ee_len   = cpu_to_le16(ee_len);
-	ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
+	ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex));
 
 	/*
 	 * It is safe to convert extent to initialized via explicit
@@ -3030,7 +2925,7 @@ static int ext4_split_unwritten_extents(handle_t *handle,
 			/* update the extent length and mark as initialized */
 			ex->ee_block = orig_ex.ee_block;
 			ex->ee_len   = orig_ex.ee_len;
-			ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+			ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
 			ext4_ext_dirty(handle, inode, path + depth);
 			/* zeroed the full extent */
 			/* blocks available from map->m_lblk */
@@ -3058,7 +2953,6 @@ static int ext4_split_unwritten_extents(handle_t *handle,
 			err = PTR_ERR(path);
 			goto out;
 		}
-		eh = path[depth].p_hdr;
 		ex = path[depth].p_ext;
 		if (ex2 != &newex)
 			ex2 = ex;
@@ -3103,7 +2997,7 @@ insert:
 		/* update the extent length and mark as initialized */
 		ex->ee_block = orig_ex.ee_block;
 		ex->ee_len   = orig_ex.ee_len;
-		ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+		ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
 		ext4_ext_dirty(handle, inode, path + depth);
 		/* zero out the first half */
 		return allocated;
@@ -3116,7 +3010,7 @@ out:
 fix_extent_len:
 	ex->ee_block = orig_ex.ee_block;
 	ex->ee_len   = orig_ex.ee_len;
-	ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+	ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
 	ext4_ext_mark_uninitialized(ex);
 	ext4_ext_dirty(handle, inode, path + depth);
 	return err;
@@ -3184,6 +3078,57 @@ static void unmap_underlying_metadata_blocks(struct block_device *bdev,
                 unmap_underlying_metadata(bdev, block + i);
 }
 
+/*
+ * Handle EOFBLOCKS_FL flag, clearing it if necessary
+ */
+static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
+			      struct ext4_map_blocks *map,
+			      struct ext4_ext_path *path,
+			      unsigned int len)
+{
+	int i, depth;
+	struct ext4_extent_header *eh;
+	struct ext4_extent *ex, *last_ex;
+
+	if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
+		return 0;
+
+	depth = ext_depth(inode);
+	eh = path[depth].p_hdr;
+	ex = path[depth].p_ext;
+
+	if (unlikely(!eh->eh_entries)) {
+		EXT4_ERROR_INODE(inode, "eh->eh_entries == 0 and "
+				 "EOFBLOCKS_FL set");
+		return -EIO;
+	}
+	last_ex = EXT_LAST_EXTENT(eh);
+	/*
+	 * We should clear the EOFBLOCKS_FL flag if we are writing the
+	 * last block in the last extent in the file.  We test this by
+	 * first checking to see if the caller to
+	 * ext4_ext_get_blocks() was interested in the last block (or
+	 * a block beyond the last block) in the current extent.  If
+	 * this turns out to be false, we can bail out from this
+	 * function immediately.
+	 */
+	if (map->m_lblk + len < le32_to_cpu(last_ex->ee_block) +
+	    ext4_ext_get_actual_len(last_ex))
+		return 0;
+	/*
+	 * If the caller does appear to be planning to write at or
+	 * beyond the end of the current extent, we then test to see
+	 * if the current extent is the last extent in the file, by
+	 * checking to make sure it was reached via the rightmost node
+	 * at each level of the tree.
+	 */
+	for (i = depth-1; i >= 0; i--)
+		if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
+			return 0;
+	ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
+	return ext4_mark_inode_dirty(handle, inode);
+}
+
 static int
 ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 			struct ext4_map_blocks *map,
@@ -3210,7 +3155,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 		 * completed
 		 */
 		if (io)
-			io->flag = EXT4_IO_UNWRITTEN;
+			io->flag = EXT4_IO_END_UNWRITTEN;
 		else
 			ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
 		if (ext4_should_dioread_nolock(inode))
@@ -3221,8 +3166,12 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 	if ((flags & EXT4_GET_BLOCKS_CONVERT)) {
 		ret = ext4_convert_unwritten_extents_endio(handle, inode,
 							path);
-		if (ret >= 0)
+		if (ret >= 0) {
 			ext4_update_inode_fsync_trans(handle, inode, 1);
+			err = check_eofblocks_fl(handle, inode, map, path,
+						 map->m_len);
+		} else
+			err = ret;
 		goto out2;
 	}
 	/* buffered IO case */
@@ -3248,8 +3197,13 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 
 	/* buffered write, writepage time, convert*/
 	ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
-	if (ret >= 0)
+	if (ret >= 0) {
 		ext4_update_inode_fsync_trans(handle, inode, 1);
+		err = check_eofblocks_fl(handle, inode, map, path, map->m_len);
+		if (err < 0)
+			goto out2;
+	}
+
 out:
 	if (ret <= 0) {
 		err = ret;
@@ -3296,6 +3250,7 @@ out2:
 	}
 	return err ? err : allocated;
 }
+
 /*
  * Block allocation/map/preallocation routine for extents based files
  *
@@ -3319,9 +3274,9 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 {
 	struct ext4_ext_path *path = NULL;
 	struct ext4_extent_header *eh;
-	struct ext4_extent newex, *ex, *last_ex;
+	struct ext4_extent newex, *ex;
 	ext4_fsblk_t newblock;
-	int i, err = 0, depth, ret, cache_type;
+	int err = 0, depth, ret, cache_type;
 	unsigned int allocated = 0;
 	struct ext4_allocation_request ar;
 	ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
@@ -3345,7 +3300,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 			/* block is already allocated */
 			newblock = map->m_lblk
 				   - le32_to_cpu(newex.ee_block)
-				   + ext_pblock(&newex);
+				   + ext4_ext_pblock(&newex);
 			/* number of remaining blocks in the extent */
 			allocated = ext4_ext_get_actual_len(&newex) -
 				(map->m_lblk - le32_to_cpu(newex.ee_block));
@@ -3383,7 +3338,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 	ex = path[depth].p_ext;
 	if (ex) {
 		ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
-		ext4_fsblk_t ee_start = ext_pblock(ex);
+		ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
 		unsigned short ee_len;
 
 		/*
@@ -3492,7 +3447,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 		 */
 		if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
 			if (io)
-				io->flag = EXT4_IO_UNWRITTEN;
+				io->flag = EXT4_IO_END_UNWRITTEN;
 			else
 				ext4_set_inode_state(inode,
 						     EXT4_STATE_DIO_UNWRITTEN);
@@ -3501,44 +3456,23 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 			map->m_flags |= EXT4_MAP_UNINIT;
 	}
 
-	if (unlikely(ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))) {
-		if (unlikely(!eh->eh_entries)) {
-			EXT4_ERROR_INODE(inode,
-					 "eh->eh_entries == 0 and "
-					 "EOFBLOCKS_FL set");
-			err = -EIO;
-			goto out2;
-		}
-		last_ex = EXT_LAST_EXTENT(eh);
-		/*
-		 * If the current leaf block was reached by looking at
-		 * the last index block all the way down the tree, and
-		 * we are extending the inode beyond the last extent
-		 * in the current leaf block, then clear the
-		 * EOFBLOCKS_FL flag.
-		 */
-		for (i = depth-1; i >= 0; i--) {
-			if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
-				break;
-		}
-		if ((i < 0) &&
-		    (map->m_lblk + ar.len > le32_to_cpu(last_ex->ee_block) +
-		     ext4_ext_get_actual_len(last_ex)))
-			ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
-	}
+	err = check_eofblocks_fl(handle, inode, map, path, ar.len);
+	if (err)
+		goto out2;
+
 	err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
 	if (err) {
 		/* free data blocks we just allocated */
 		/* not a good idea to call discard here directly,
 		 * but otherwise we'd need to call it every free() */
 		ext4_discard_preallocations(inode);
-		ext4_free_blocks(handle, inode, 0, ext_pblock(&newex),
+		ext4_free_blocks(handle, inode, 0, ext4_ext_pblock(&newex),
 				 ext4_ext_get_actual_len(&newex), 0);
 		goto out2;
 	}
 
 	/* previous routine could use block we allocated */
-	newblock = ext_pblock(&newex);
+	newblock = ext4_ext_pblock(&newex);
 	allocated = ext4_ext_get_actual_len(&newex);
 	if (allocated > map->m_len)
 		allocated = map->m_len;
@@ -3733,7 +3667,7 @@ retry:
 			printk(KERN_ERR "%s: ext4_ext_map_blocks "
 				    "returned error inode#%lu, block=%u, "
 				    "max_blocks=%u", __func__,
-				    inode->i_ino, block, max_blocks);
+				    inode->i_ino, map.m_lblk, max_blocks);
 #endif
 			ext4_mark_inode_dirty(handle, inode);
 			ret2 = ext4_journal_stop(handle);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 5313ae4cda2..5a5c55ddcee 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -70,7 +70,8 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
 		struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 		size_t length = iov_length(iov, nr_segs);
 
-		if (pos > sbi->s_bitmap_maxbytes)
+		if ((pos > sbi->s_bitmap_maxbytes ||
+		    (pos == sbi->s_bitmap_maxbytes && length > 0)))
 			return -EFBIG;
 
 		if (pos + length > sbi->s_bitmap_maxbytes) {
@@ -123,14 +124,56 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
 		if (!IS_ERR(cp)) {
 			memcpy(sbi->s_es->s_last_mounted, cp,
 			       sizeof(sbi->s_es->s_last_mounted));
-			sb->s_dirt = 1;
+			ext4_mark_super_dirty(sb);
 		}
 	}
 	return dquot_file_open(inode, filp);
 }
 
+/*
+ * ext4_llseek() copied from generic_file_llseek() to handle both
+ * block-mapped and extent-mapped maxbytes values. This should
+ * otherwise be identical with generic_file_llseek().
+ */
+loff_t ext4_llseek(struct file *file, loff_t offset, int origin)
+{
+	struct inode *inode = file->f_mapping->host;
+	loff_t maxbytes;
+
+	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+		maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
+	else
+		maxbytes = inode->i_sb->s_maxbytes;
+	mutex_lock(&inode->i_mutex);
+	switch (origin) {
+	case SEEK_END:
+		offset += inode->i_size;
+		break;
+	case SEEK_CUR:
+		if (offset == 0) {
+			mutex_unlock(&inode->i_mutex);
+			return file->f_pos;
+		}
+		offset += file->f_pos;
+		break;
+	}
+
+	if (offset < 0 || offset > maxbytes) {
+		mutex_unlock(&inode->i_mutex);
+		return -EINVAL;
+	}
+
+	if (offset != file->f_pos) {
+		file->f_pos = offset;
+		file->f_version = 0;
+	}
+	mutex_unlock(&inode->i_mutex);
+
+	return offset;
+}
+
 const struct file_operations ext4_file_operations = {
-	.llseek		= generic_file_llseek,
+	.llseek		= ext4_llseek,
 	.read		= do_sync_read,
 	.write		= do_sync_write,
 	.aio_read	= generic_file_aio_read,
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 592adf2e546..c1a7bc923cf 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -34,6 +34,89 @@
 
 #include <trace/events/ext4.h>
 
+static void dump_completed_IO(struct inode * inode)
+{
+#ifdef	EXT4_DEBUG
+	struct list_head *cur, *before, *after;
+	ext4_io_end_t *io, *io0, *io1;
+	unsigned long flags;
+
+	if (list_empty(&EXT4_I(inode)->i_completed_io_list)){
+		ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino);
+		return;
+	}
+
+	ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino);
+	spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
+	list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){
+		cur = &io->list;
+		before = cur->prev;
+		io0 = container_of(before, ext4_io_end_t, list);
+		after = cur->next;
+		io1 = container_of(after, ext4_io_end_t, list);
+
+		ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
+			    io, inode->i_ino, io0, io1);
+	}
+	spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
+#endif
+}
+
+/*
+ * This function is called from ext4_sync_file().
+ *
+ * When IO is completed, the work to convert unwritten extents to
+ * written is queued on workqueue but may not get immediately
+ * scheduled. When fsync is called, we need to ensure the
+ * conversion is complete before fsync returns.
+ * The inode keeps track of a list of pending/completed IO that
+ * might needs to do the conversion. This function walks through
+ * the list and convert the related unwritten extents for completed IO
+ * to written.
+ * The function return the number of pending IOs on success.
+ */
+static int flush_completed_IO(struct inode *inode)
+{
+	ext4_io_end_t *io;
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	unsigned long flags;
+	int ret = 0;
+	int ret2 = 0;
+
+	if (list_empty(&ei->i_completed_io_list))
+		return ret;
+
+	dump_completed_IO(inode);
+	spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+	while (!list_empty(&ei->i_completed_io_list)){
+		io = list_entry(ei->i_completed_io_list.next,
+				ext4_io_end_t, list);
+		/*
+		 * Calling ext4_end_io_nolock() to convert completed
+		 * IO to written.
+		 *
+		 * When ext4_sync_file() is called, run_queue() may already
+		 * about to flush the work corresponding to this io structure.
+		 * It will be upset if it founds the io structure related
+		 * to the work-to-be schedule is freed.
+		 *
+		 * Thus we need to keep the io structure still valid here after
+		 * convertion finished. The io structure has a flag to
+		 * avoid double converting from both fsync and background work
+		 * queue work.
+		 */
+		spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+		ret = ext4_end_io_nolock(io);
+		spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+		if (ret < 0)
+			ret2 = ret;
+		else
+			list_del_init(&io->list);
+	}
+	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+	return (ret2 < 0) ? ret2 : 0;
+}
+
 /*
  * If we're not journaling and this is a just-created file, we have to
  * sync our parent directory (if it was freshly created) since
@@ -128,10 +211,9 @@ int ext4_sync_file(struct file *file, int datasync)
 		    (journal->j_fs_dev != journal->j_dev) &&
 		    (journal->j_flags & JBD2_BARRIER))
 			blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL,
-					NULL, BLKDEV_IFL_WAIT);
+					NULL);
 		ret = jbd2_log_wait_commit(journal, commit_tid);
 	} else if (journal->j_flags & JBD2_BARRIER)
-		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
-			BLKDEV_IFL_WAIT);
+		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
 	return ret;
 }
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 25c4b3173fd..1ce240a23eb 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -50,7 +50,7 @@
  * need to use it within a single byte (to ensure we get endianness right).
  * We can use memset for the rest of the bitmap as there are no other users.
  */
-void mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
+void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
 {
 	int i;
 
@@ -65,9 +65,10 @@ void mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
 }
 
 /* Initializes an uninitialized inode bitmap */
-unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh,
-				ext4_group_t block_group,
-				struct ext4_group_desc *gdp)
+static unsigned ext4_init_inode_bitmap(struct super_block *sb,
+				       struct buffer_head *bh,
+				       ext4_group_t block_group,
+				       struct ext4_group_desc *gdp)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
@@ -85,7 +86,7 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh,
 	}
 
 	memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
-	mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
+	ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
 			bh->b_data);
 
 	return EXT4_INODES_PER_GROUP(sb);
@@ -107,6 +108,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
 	desc = ext4_get_group_desc(sb, block_group, NULL);
 	if (!desc)
 		return NULL;
+
 	bitmap_blk = ext4_inode_bitmap(sb, desc);
 	bh = sb_getblk(sb, bitmap_blk);
 	if (unlikely(!bh)) {
@@ -123,6 +125,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
 		unlock_buffer(bh);
 		return bh;
 	}
+
 	ext4_lock_group(sb, block_group);
 	if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
 		ext4_init_inode_bitmap(sb, bh, block_group, desc);
@@ -133,6 +136,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
 		return bh;
 	}
 	ext4_unlock_group(sb, block_group);
+
 	if (buffer_uptodate(bh)) {
 		/*
 		 * if not uninit if bh is uptodate,
@@ -222,7 +226,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
 	is_directory = S_ISDIR(inode->i_mode);
 
 	/* Do this BEFORE marking the inode not in use or returning an error */
-	clear_inode(inode);
+	ext4_clear_inode(inode);
 
 	es = EXT4_SB(sb)->s_es;
 	if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
@@ -279,7 +283,7 @@ out:
 		err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
 		if (!fatal)
 			fatal = err;
-		sb->s_dirt = 1;
+		ext4_mark_super_dirty(sb);
 	} else
 		ext4_error(sb, "bit already cleared for inode %lu", ino);
 
@@ -411,8 +415,8 @@ struct orlov_stats {
  * for a particular block group or flex_bg.  If flex_size is 1, then g
  * is a block group number; otherwise it is flex_bg number.
  */
-void get_orlov_stats(struct super_block *sb, ext4_group_t g,
-		       int flex_size, struct orlov_stats *stats)
+static void get_orlov_stats(struct super_block *sb, ext4_group_t g,
+			    int flex_size, struct orlov_stats *stats)
 {
 	struct ext4_group_desc *desc;
 	struct flex_groups *flex_group = EXT4_SB(sb)->s_flex_groups;
@@ -712,8 +716,17 @@ static int ext4_claim_inode(struct super_block *sb,
 {
 	int free = 0, retval = 0, count;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_group_info *grp = ext4_get_group_info(sb, group);
 	struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
 
+	/*
+	 * We have to be sure that new inode allocation does not race with
+	 * inode table initialization, because otherwise we may end up
+	 * allocating and writing new inode right before sb_issue_zeroout
+	 * takes place and overwriting our new inode with zeroes. So we
+	 * take alloc_sem to prevent it.
+	 */
+	down_read(&grp->alloc_sem);
 	ext4_lock_group(sb, group);
 	if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) {
 		/* not a free inode */
@@ -724,6 +737,7 @@ static int ext4_claim_inode(struct super_block *sb,
 	if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
 			ino > EXT4_INODES_PER_GROUP(sb)) {
 		ext4_unlock_group(sb, group);
+		up_read(&grp->alloc_sem);
 		ext4_error(sb, "reserved inode or inode > inodes count - "
 			   "block_group = %u, inode=%lu", group,
 			   ino + group * EXT4_INODES_PER_GROUP(sb));
@@ -772,6 +786,7 @@ static int ext4_claim_inode(struct super_block *sb,
 	gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
 err_ret:
 	ext4_unlock_group(sb, group);
+	up_read(&grp->alloc_sem);
 	return retval;
 }
 
@@ -965,7 +980,7 @@ got:
 	percpu_counter_dec(&sbi->s_freeinodes_counter);
 	if (S_ISDIR(mode))
 		percpu_counter_inc(&sbi->s_dirs_counter);
-	sb->s_dirt = 1;
+	ext4_mark_super_dirty(sb);
 
 	if (sbi->s_log_groups_per_flex) {
 		flex_group = ext4_flex_group(sbi, group);
@@ -1205,3 +1220,109 @@ unsigned long ext4_count_dirs(struct super_block * sb)
 	}
 	return count;
 }
+
+/*
+ * Zeroes not yet zeroed inode table - just write zeroes through the whole
+ * inode table. Must be called without any spinlock held. The only place
+ * where it is called from on active part of filesystem is ext4lazyinit
+ * thread, so we do not need any special locks, however we have to prevent
+ * inode allocation from the current group, so we take alloc_sem lock, to
+ * block ext4_claim_inode until we are finished.
+ */
+extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
+				 int barrier)
+{
+	struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_group_desc *gdp = NULL;
+	struct buffer_head *group_desc_bh;
+	handle_t *handle;
+	ext4_fsblk_t blk;
+	int num, ret = 0, used_blks = 0;
+
+	/* This should not happen, but just to be sure check this */
+	if (sb->s_flags & MS_RDONLY) {
+		ret = 1;
+		goto out;
+	}
+
+	gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
+	if (!gdp)
+		goto out;
+
+	/*
+	 * We do not need to lock this, because we are the only one
+	 * handling this flag.
+	 */
+	if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))
+		goto out;
+
+	handle = ext4_journal_start_sb(sb, 1);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		goto out;
+	}
+
+	down_write(&grp->alloc_sem);
+	/*
+	 * If inode bitmap was already initialized there may be some
+	 * used inodes so we need to skip blocks with used inodes in
+	 * inode table.
+	 */
+	if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)))
+		used_blks = DIV_ROUND_UP((EXT4_INODES_PER_GROUP(sb) -
+			    ext4_itable_unused_count(sb, gdp)),
+			    sbi->s_inodes_per_block);
+
+	if ((used_blks < 0) || (used_blks > sbi->s_itb_per_group)) {
+		ext4_error(sb, "Something is wrong with group %u\n"
+			   "Used itable blocks: %d"
+			   "itable unused count: %u\n",
+			   group, used_blks,
+			   ext4_itable_unused_count(sb, gdp));
+		ret = 1;
+		goto out;
+	}
+
+	blk = ext4_inode_table(sb, gdp) + used_blks;
+	num = sbi->s_itb_per_group - used_blks;
+
+	BUFFER_TRACE(group_desc_bh, "get_write_access");
+	ret = ext4_journal_get_write_access(handle,
+					    group_desc_bh);
+	if (ret)
+		goto err_out;
+
+	/*
+	 * Skip zeroout if the inode table is full. But we set the ZEROED
+	 * flag anyway, because obviously, when it is full it does not need
+	 * further zeroing.
+	 */
+	if (unlikely(num == 0))
+		goto skip_zeroout;
+
+	ext4_debug("going to zero out inode table in group %d\n",
+		   group);
+	ret = sb_issue_zeroout(sb, blk, num, GFP_NOFS);
+	if (ret < 0)
+		goto err_out;
+	if (barrier)
+		blkdev_issue_flush(sb->s_bdev, GFP_NOFS, NULL);
+
+skip_zeroout:
+	ext4_lock_group(sb, group);
+	gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED);
+	gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
+	ext4_unlock_group(sb, group);
+
+	BUFFER_TRACE(group_desc_bh,
+		     "call ext4_handle_dirty_metadata");
+	ret = ext4_handle_dirty_metadata(handle, NULL,
+					 group_desc_bh);
+
+err_out:
+	up_write(&grp->alloc_sem);
+	ext4_journal_stop(handle);
+out:
+	return ret;
+}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 42272d67955..19161647046 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -60,6 +60,12 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode,
 }
 
 static void ext4_invalidatepage(struct page *page, unsigned long offset);
+static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
+				   struct buffer_head *bh_result, int create);
+static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
+static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
+static int __ext4_journalled_writepage(struct page *page, unsigned int len);
+static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
 
 /*
  * Test whether an inode is a fast symlink.
@@ -167,11 +173,16 @@ int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
 /*
  * Called at the last iput() if i_nlink is zero.
  */
-void ext4_delete_inode(struct inode *inode)
+void ext4_evict_inode(struct inode *inode)
 {
 	handle_t *handle;
 	int err;
 
+	if (inode->i_nlink) {
+		truncate_inode_pages(&inode->i_data, 0);
+		goto no_delete;
+	}
+
 	if (!is_bad_inode(inode))
 		dquot_initialize(inode);
 
@@ -221,6 +232,7 @@ void ext4_delete_inode(struct inode *inode)
 				     "couldn't extend journal (err %d)", err);
 		stop_handle:
 			ext4_journal_stop(handle);
+			ext4_orphan_del(NULL, inode);
 			goto no_delete;
 		}
 	}
@@ -245,13 +257,13 @@ void ext4_delete_inode(struct inode *inode)
 	 */
 	if (ext4_mark_inode_dirty(handle, inode))
 		/* If that failed, just do the required in-core inode clear. */
-		clear_inode(inode);
+		ext4_clear_inode(inode);
 	else
 		ext4_free_inode(handle, inode);
 	ext4_journal_stop(handle);
 	return;
 no_delete:
-	clear_inode(inode);	/* We must guarantee clearing of inode... */
+	ext4_clear_inode(inode);	/* We must guarantee clearing of inode... */
 }
 
 typedef struct {
@@ -337,9 +349,11 @@ static int ext4_block_to_path(struct inode *inode,
 	return n;
 }
 
-static int __ext4_check_blockref(const char *function, struct inode *inode,
+static int __ext4_check_blockref(const char *function, unsigned int line,
+				 struct inode *inode,
 				 __le32 *p, unsigned int max)
 {
+	struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
 	__le32 *bref = p;
 	unsigned int blk;
 
@@ -348,8 +362,9 @@ static int __ext4_check_blockref(const char *function, struct inode *inode,
 		if (blk &&
 		    unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
 						    blk, 1))) {
-			ext4_error_inode(function, inode,
-					 "invalid block reference %u", blk);
+			es->s_last_error_block = cpu_to_le64(blk);
+			ext4_error_inode(inode, function, line, blk,
+					 "invalid block");
 			return -EIO;
 		}
 	}
@@ -358,11 +373,13 @@ static int __ext4_check_blockref(const char *function, struct inode *inode,
 
 
 #define ext4_check_indirect_blockref(inode, bh)                         \
-	__ext4_check_blockref(__func__, inode, (__le32 *)(bh)->b_data,  \
+	__ext4_check_blockref(__func__, __LINE__, inode,		\
+			      (__le32 *)(bh)->b_data,			\
 			      EXT4_ADDR_PER_BLOCK((inode)->i_sb))
 
 #define ext4_check_inode_blockref(inode)                                \
-	__ext4_check_blockref(__func__, inode, EXT4_I(inode)->i_data,   \
+	__ext4_check_blockref(__func__, __LINE__, inode,		\
+			      EXT4_I(inode)->i_data,			\
 			      EXT4_NDIR_BLOCKS)
 
 /**
@@ -744,6 +761,11 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
 		 * parent to disk.
 		 */
 		bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
+		if (unlikely(!bh)) {
+			err = -EIO;
+			goto failed;
+		}
+
 		branch[n].bh = bh;
 		lock_buffer(bh);
 		BUFFER_TRACE(bh, "call get_create_access");
@@ -1128,20 +1150,24 @@ void ext4_da_update_reserve_space(struct inode *inode,
 		ext4_discard_preallocations(inode);
 }
 
-static int check_block_validity(struct inode *inode, const char *func,
+static int __check_block_validity(struct inode *inode, const char *func,
+				unsigned int line,
 				struct ext4_map_blocks *map)
 {
 	if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk,
 				   map->m_len)) {
-		ext4_error_inode(func, inode,
-			   "lblock %lu mapped to illegal pblock %llu "
-			   "(length %d)", (unsigned long) map->m_lblk,
-				 map->m_pblk, map->m_len);
+		ext4_error_inode(inode, func, line, map->m_pblk,
+				 "lblock %lu mapped to illegal pblock "
+				 "(length %d)", (unsigned long) map->m_lblk,
+				 map->m_len);
 		return -EIO;
 	}
 	return 0;
 }
 
+#define check_block_validity(inode, map)	\
+	__check_block_validity((inode), __func__, __LINE__, (map))
+
 /*
  * Return the number of contiguous dirty pages in a given inode
  * starting at page frame idx.
@@ -1192,8 +1218,10 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
 				break;
 			idx++;
 			num++;
-			if (num >= max_pages)
+			if (num >= max_pages) {
+				done = 1;
 				break;
+			}
 		}
 		pagevec_release(&pvec);
 	}
@@ -1244,7 +1272,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
 	up_read((&EXT4_I(inode)->i_data_sem));
 
 	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
-		int ret = check_block_validity(inode, __func__, map);
+		int ret = check_block_validity(inode, map);
 		if (ret != 0)
 			return ret;
 	}
@@ -1324,9 +1352,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
 
 	up_write((&EXT4_I(inode)->i_data_sem));
 	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
-		int ret = check_block_validity(inode,
-					       "ext4_map_blocks_after_alloc",
-					       map);
+		int ret = check_block_validity(inode, map);
 		if (ret != 0)
 			return ret;
 	}
@@ -1519,9 +1545,25 @@ static int walk_page_buffers(handle_t *handle,
 static int do_journal_get_write_access(handle_t *handle,
 				       struct buffer_head *bh)
 {
+	int dirty = buffer_dirty(bh);
+	int ret;
+
 	if (!buffer_mapped(bh) || buffer_freed(bh))
 		return 0;
-	return ext4_journal_get_write_access(handle, bh);
+	/*
+	 * __block_write_begin() could have dirtied some buffers. Clean
+	 * the dirty bit as jbd2_journal_get_write_access() could complain
+	 * otherwise about fs integrity issues. Setting of the dirty bit
+	 * by __block_write_begin() isn't a real problem here as we clear
+	 * the bit before releasing a page lock and thus writeback cannot
+	 * ever write the buffer.
+	 */
+	if (dirty)
+		clear_buffer_dirty(bh);
+	ret = ext4_journal_get_write_access(handle, bh);
+	if (!ret && dirty)
+		ret = ext4_handle_dirty_metadata(handle, NULL, bh);
+	return ret;
 }
 
 /*
@@ -1578,11 +1620,9 @@ retry:
 	*pagep = page;
 
 	if (ext4_should_dioread_nolock(inode))
-		ret = block_write_begin(file, mapping, pos, len, flags, pagep,
-				fsdata, ext4_get_block_write);
+		ret = __block_write_begin(page, pos, len, ext4_get_block_write);
 	else
-		ret = block_write_begin(file, mapping, pos, len, flags, pagep,
-				fsdata, ext4_get_block);
+		ret = __block_write_begin(page, pos, len, ext4_get_block);
 
 	if (!ret && ext4_should_journal_data(inode)) {
 		ret = walk_page_buffers(handle, page_buffers(page),
@@ -1593,7 +1633,7 @@ retry:
 		unlock_page(page);
 		page_cache_release(page);
 		/*
-		 * block_write_begin may have instantiated a few blocks
+		 * __block_write_begin may have instantiated a few blocks
 		 * outside i_size.  Trim these off again. Don't need
 		 * i_size_read because we hold i_mutex.
 		 *
@@ -1968,16 +2008,23 @@ static void ext4_da_page_release_reservation(struct page *page,
  *
  * As pages are already locked by write_cache_pages(), we can't use it
  */
-static int mpage_da_submit_io(struct mpage_da_data *mpd)
+static int mpage_da_submit_io(struct mpage_da_data *mpd,
+			      struct ext4_map_blocks *map)
 {
-	long pages_skipped;
 	struct pagevec pvec;
 	unsigned long index, end;
 	int ret = 0, err, nr_pages, i;
 	struct inode *inode = mpd->inode;
 	struct address_space *mapping = inode->i_mapping;
+	loff_t size = i_size_read(inode);
+	unsigned int len, block_start;
+	struct buffer_head *bh, *page_bufs = NULL;
+	int journal_data = ext4_should_journal_data(inode);
+	sector_t pblock = 0, cur_logical = 0;
+	struct ext4_io_submit io_submit;
 
 	BUG_ON(mpd->next_page <= mpd->first_page);
+	memset(&io_submit, 0, sizeof(io_submit));
 	/*
 	 * We need to start from the first_page to the next_page - 1
 	 * to make sure we also write the mapped dirty buffer_heads.
@@ -1993,122 +2040,108 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
 		if (nr_pages == 0)
 			break;
 		for (i = 0; i < nr_pages; i++) {
+			int commit_write = 0, redirty_page = 0;
 			struct page *page = pvec.pages[i];
 
 			index = page->index;
 			if (index > end)
 				break;
+
+			if (index == size >> PAGE_CACHE_SHIFT)
+				len = size & ~PAGE_CACHE_MASK;
+			else
+				len = PAGE_CACHE_SIZE;
+			if (map) {
+				cur_logical = index << (PAGE_CACHE_SHIFT -
+							inode->i_blkbits);
+				pblock = map->m_pblk + (cur_logical -
+							map->m_lblk);
+			}
 			index++;
 
 			BUG_ON(!PageLocked(page));
 			BUG_ON(PageWriteback(page));
 
-			pages_skipped = mpd->wbc->pages_skipped;
-			err = mapping->a_ops->writepage(page, mpd->wbc);
-			if (!err && (pages_skipped == mpd->wbc->pages_skipped))
-				/*
-				 * have successfully written the page
-				 * without skipping the same
-				 */
-				mpd->pages_written++;
 			/*
-			 * In error case, we have to continue because
-			 * remaining pages are still locked
-			 * XXX: unlock and re-dirty them?
+			 * If the page does not have buffers (for
+			 * whatever reason), try to create them using
+			 * __block_write_begin.  If this fails,
+			 * redirty the page and move on.
 			 */
-			if (ret == 0)
-				ret = err;
-		}
-		pagevec_release(&pvec);
-	}
-	return ret;
-}
-
-/*
- * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers
- *
- * the function goes through all passed space and put actual disk
- * block numbers into buffer heads, dropping BH_Delay and BH_Unwritten
- */
-static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd,
-				 struct ext4_map_blocks *map)
-{
-	struct inode *inode = mpd->inode;
-	struct address_space *mapping = inode->i_mapping;
-	int blocks = map->m_len;
-	sector_t pblock = map->m_pblk, cur_logical;
-	struct buffer_head *head, *bh;
-	pgoff_t index, end;
-	struct pagevec pvec;
-	int nr_pages, i;
-
-	index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
-	end = (map->m_lblk + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
-	cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
-
-	pagevec_init(&pvec, 0);
-
-	while (index <= end) {
-		/* XXX: optimize tail */
-		nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
-		if (nr_pages == 0)
-			break;
-		for (i = 0; i < nr_pages; i++) {
-			struct page *page = pvec.pages[i];
-
-			index = page->index;
-			if (index > end)
-				break;
-			index++;
-
-			BUG_ON(!PageLocked(page));
-			BUG_ON(PageWriteback(page));
-			BUG_ON(!page_has_buffers(page));
-
-			bh = page_buffers(page);
-			head = bh;
-
-			/* skip blocks out of the range */
-			do {
-				if (cur_logical >= map->m_lblk)
-					break;
-				cur_logical++;
-			} while ((bh = bh->b_this_page) != head);
+			if (!page_has_buffers(page)) {
+				if (__block_write_begin(page, 0, len,
+						noalloc_get_block_write)) {
+				redirty_page:
+					redirty_page_for_writepage(mpd->wbc,
+								   page);
+					unlock_page(page);
+					continue;
+				}
+				commit_write = 1;
+			}
 
+			bh = page_bufs = page_buffers(page);
+			block_start = 0;
 			do {
-				if (cur_logical >= map->m_lblk + blocks)
-					break;
-
-				if (buffer_delay(bh) || buffer_unwritten(bh)) {
-
-					BUG_ON(bh->b_bdev != inode->i_sb->s_bdev);
-
+				if (!bh)
+					goto redirty_page;
+				if (map && (cur_logical >= map->m_lblk) &&
+				    (cur_logical <= (map->m_lblk +
+						     (map->m_len - 1)))) {
 					if (buffer_delay(bh)) {
 						clear_buffer_delay(bh);
 						bh->b_blocknr = pblock;
-					} else {
-						/*
-						 * unwritten already should have
-						 * blocknr assigned. Verify that
-						 */
-						clear_buffer_unwritten(bh);
-						BUG_ON(bh->b_blocknr != pblock);
 					}
+					if (buffer_unwritten(bh) ||
+					    buffer_mapped(bh))
+						BUG_ON(bh->b_blocknr != pblock);
+					if (map->m_flags & EXT4_MAP_UNINIT)
+						set_buffer_uninit(bh);
+					clear_buffer_unwritten(bh);
+				}
 
-				} else if (buffer_mapped(bh))
-					BUG_ON(bh->b_blocknr != pblock);
-
-				if (map->m_flags & EXT4_MAP_UNINIT)
-					set_buffer_uninit(bh);
+				/* redirty page if block allocation undone */
+				if (buffer_delay(bh) || buffer_unwritten(bh))
+					redirty_page = 1;
+				bh = bh->b_this_page;
+				block_start += bh->b_size;
 				cur_logical++;
 				pblock++;
-			} while ((bh = bh->b_this_page) != head);
+			} while (bh != page_bufs);
+
+			if (redirty_page)
+				goto redirty_page;
+
+			if (commit_write)
+				/* mark the buffer_heads as dirty & uptodate */
+				block_commit_write(page, 0, len);
+
+			/*
+			 * Delalloc doesn't support data journalling,
+			 * but eventually maybe we'll lift this
+			 * restriction.
+			 */
+			if (unlikely(journal_data && PageChecked(page)))
+				err = __ext4_journalled_writepage(page, len);
+			else
+				err = ext4_bio_write_page(&io_submit, page,
+							  len, mpd->wbc);
+
+			if (!err)
+				mpd->pages_written++;
+			/*
+			 * In error case, we have to continue because
+			 * remaining pages are still locked
+			 */
+			if (ret == 0)
+				ret = err;
 		}
 		pagevec_release(&pvec);
 	}
+	ext4_io_submit(&io_submit);
+	return ret;
 }
 
-
 static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
 					sector_t logical, long blk_cnt)
 {
@@ -2160,41 +2193,38 @@ static void ext4_print_free_blocks(struct inode *inode)
 }
 
 /*
- * mpage_da_map_blocks - go through given space
+ * mpage_da_map_and_submit - go through given space, map them
+ *       if necessary, and then submit them for I/O
  *
  * @mpd - bh describing space
  *
  * The function skips space we know is already mapped to disk blocks.
  *
  */
-static int mpage_da_map_blocks(struct mpage_da_data *mpd)
+static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
 {
 	int err, blks, get_blocks_flags;
-	struct ext4_map_blocks map;
+	struct ext4_map_blocks map, *mapp = NULL;
 	sector_t next = mpd->b_blocknr;
 	unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
 	loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
 	handle_t *handle = NULL;
 
 	/*
-	 * We consider only non-mapped and non-allocated blocks
-	 */
-	if ((mpd->b_state  & (1 << BH_Mapped)) &&
-		!(mpd->b_state & (1 << BH_Delay)) &&
-		!(mpd->b_state & (1 << BH_Unwritten)))
-		return 0;
-
-	/*
-	 * If we didn't accumulate anything to write simply return
+	 * If the blocks are mapped already, or we couldn't accumulate
+	 * any blocks, then proceed immediately to the submission stage.
 	 */
-	if (!mpd->b_size)
-		return 0;
+	if ((mpd->b_size == 0) ||
+	    ((mpd->b_state  & (1 << BH_Mapped)) &&
+	     !(mpd->b_state & (1 << BH_Delay)) &&
+	     !(mpd->b_state & (1 << BH_Unwritten))))
+		goto submit_io;
 
 	handle = ext4_journal_current_handle();
 	BUG_ON(!handle);
 
 	/*
-	 * Call ext4_get_blocks() to allocate any delayed allocation
+	 * Call ext4_map_blocks() to allocate any delayed allocation
 	 * blocks, or to convert an uninitialized extent to be
 	 * initialized (in the case where we have written into
 	 * one or more preallocated blocks).
@@ -2203,7 +2233,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 	 * indicate that we are on the delayed allocation path.  This
 	 * affects functions in many different parts of the allocation
 	 * call path.  This flag exists primarily because we don't
-	 * want to change *many* call functions, so ext4_get_blocks()
+	 * want to change *many* call functions, so ext4_map_blocks()
 	 * will set the magic i_delalloc_reserved_flag once the
 	 * inode's allocation semaphore is taken.
 	 *
@@ -2221,19 +2251,22 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 
 	blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags);
 	if (blks < 0) {
+		struct super_block *sb = mpd->inode->i_sb;
+
 		err = blks;
 		/*
-		 * If get block returns with error we simply
-		 * return. Later writepage will redirty the page and
-		 * writepages will find the dirty page again
+		 * If get block returns EAGAIN or ENOSPC and there
+		 * appears to be free blocks we will call
+		 * ext4_writepage() for all of the pages which will
+		 * just redirty the pages.
 		 */
 		if (err == -EAGAIN)
-			return 0;
+			goto submit_io;
 
 		if (err == -ENOSPC &&
-		    ext4_count_free_blocks(mpd->inode->i_sb)) {
+		    ext4_count_free_blocks(sb)) {
 			mpd->retval = err;
-			return 0;
+			goto submit_io;
 		}
 
 		/*
@@ -2243,24 +2276,26 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 		 * writepage and writepages will again try to write
 		 * the same.
 		 */
-		ext4_msg(mpd->inode->i_sb, KERN_CRIT,
-			 "delayed block allocation failed for inode %lu at "
-			 "logical offset %llu with max blocks %zd with "
-			 "error %d", mpd->inode->i_ino,
-			 (unsigned long long) next,
-			 mpd->b_size >> mpd->inode->i_blkbits, err);
-		printk(KERN_CRIT "This should not happen!!  "
-		       "Data will be lost\n");
-		if (err == -ENOSPC) {
-			ext4_print_free_blocks(mpd->inode);
+		if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) {
+			ext4_msg(sb, KERN_CRIT,
+				 "delayed block allocation failed for inode %lu "
+				 "at logical offset %llu with max blocks %zd "
+				 "with error %d", mpd->inode->i_ino,
+				 (unsigned long long) next,
+				 mpd->b_size >> mpd->inode->i_blkbits, err);
+			ext4_msg(sb, KERN_CRIT,
+				"This should not happen!! Data will be lost\n");
+			if (err == -ENOSPC)
+				ext4_print_free_blocks(mpd->inode);
 		}
 		/* invalidate all the pages */
 		ext4_da_block_invalidatepages(mpd, next,
 				mpd->b_size >> mpd->inode->i_blkbits);
-		return err;
+		return;
 	}
 	BUG_ON(blks == 0);
 
+	mapp = &map;
 	if (map.m_flags & EXT4_MAP_NEW) {
 		struct block_device *bdev = mpd->inode->i_sb->s_bdev;
 		int i;
@@ -2269,18 +2304,11 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 			unmap_underlying_metadata(bdev, map.m_pblk + i);
 	}
 
-	/*
-	 * If blocks are delayed marked, we need to
-	 * put actual blocknr and drop delayed bit
-	 */
-	if ((mpd->b_state & (1 << BH_Delay)) ||
-	    (mpd->b_state & (1 << BH_Unwritten)))
-		mpage_put_bnr_to_bhs(mpd, &map);
-
 	if (ext4_should_order_data(mpd->inode)) {
 		err = ext4_jbd2_file_inode(handle, mpd->inode);
 		if (err)
-			return err;
+			/* This only happens if the journal is aborted */
+			return;
 	}
 
 	/*
@@ -2291,10 +2319,16 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 		disksize = i_size_read(mpd->inode);
 	if (disksize > EXT4_I(mpd->inode)->i_disksize) {
 		ext4_update_i_disksize(mpd->inode, disksize);
-		return ext4_mark_inode_dirty(handle, mpd->inode);
+		err = ext4_mark_inode_dirty(handle, mpd->inode);
+		if (err)
+			ext4_error(mpd->inode->i_sb,
+				   "Failed to mark inode %lu dirty",
+				   mpd->inode->i_ino);
 	}
 
-	return 0;
+submit_io:
+	mpage_da_submit_io(mpd, mapp);
+	mpd->io_done = 1;
 }
 
 #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
@@ -2320,7 +2354,7 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
 	 * XXX Don't go larger than mballoc is willing to allocate
 	 * This is a stopgap solution.  We eventually need to fold
 	 * mpage_da_submit_io() into this function and then call
-	 * ext4_get_blocks() multiple times in a loop
+	 * ext4_map_blocks() multiple times in a loop
 	 */
 	if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize)
 		goto flush_it;
@@ -2371,9 +2405,7 @@ flush_it:
 	 * We couldn't merge the block to our extent, so we
 	 * need to flush current  extent and start new one
 	 */
-	if (mpage_da_map_blocks(mpd) == 0)
-		mpage_da_submit_io(mpd);
-	mpd->io_done = 1;
+	mpage_da_map_and_submit(mpd);
 	return;
 }
 
@@ -2392,9 +2424,9 @@ static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
  * The function finds extents of pages and scan them for all blocks.
  */
 static int __mpage_da_writepage(struct page *page,
-				struct writeback_control *wbc, void *data)
+				struct writeback_control *wbc,
+				struct mpage_da_data *mpd)
 {
-	struct mpage_da_data *mpd = data;
 	struct inode *inode = mpd->inode;
 	struct buffer_head *bh, *head;
 	sector_t logical;
@@ -2405,15 +2437,13 @@ static int __mpage_da_writepage(struct page *page,
 	if (mpd->next_page != page->index) {
 		/*
 		 * Nope, we can't. So, we map non-allocated blocks
-		 * and start IO on them using writepage()
+		 * and start IO on them
 		 */
 		if (mpd->next_page != mpd->first_page) {
-			if (mpage_da_map_blocks(mpd) == 0)
-				mpage_da_submit_io(mpd);
+			mpage_da_map_and_submit(mpd);
 			/*
 			 * skip rest of the page in the page_vec
 			 */
-			mpd->io_done = 1;
 			redirty_page_for_writepage(wbc, page);
 			unlock_page(page);
 			return MPAGE_DA_EXTENT_TAIL;
@@ -2520,8 +2550,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
 		if (buffer_delay(bh))
 			return 0; /* Not sure this could or should happen */
 		/*
-		 * XXX: __block_prepare_write() unmaps passed block,
-		 * is it OK?
+		 * XXX: __block_write_begin() unmaps passed block, is it OK?
 		 */
 		ret = ext4_da_reserve_space(inode, iblock);
 		if (ret)
@@ -2553,18 +2582,16 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
 /*
  * This function is used as a standard get_block_t calback function
  * when there is no desire to allocate any blocks.  It is used as a
- * callback function for block_prepare_write(), nobh_writepage(), and
- * block_write_full_page().  These functions should only try to map a
- * single block at a time.
+ * callback function for block_write_begin() and block_write_full_page().
+ * These functions should only try to map a single block at a time.
  *
  * Since this function doesn't do block allocations even if the caller
  * requests it by passing in create=1, it is critically important that
  * any caller checks to make sure that any buffer heads are returned
  * by this function are either all already mapped or marked for
- * delayed allocation before calling nobh_writepage() or
- * block_write_full_page().  Otherwise, b_blocknr could be left
- * unitialized, and the page write functions will be taken by
- * surprise.
+ * delayed allocation before calling  block_write_full_page().  Otherwise,
+ * b_blocknr could be left unitialized, and the page write functions will
+ * be taken by surprise.
  */
 static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
 				   struct buffer_head *bh_result, int create)
@@ -2595,6 +2622,7 @@ static int __ext4_journalled_writepage(struct page *page,
 	int ret = 0;
 	int err;
 
+	ClearPageChecked(page);
 	page_bufs = page_buffers(page);
 	BUG_ON(!page_bufs);
 	walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one);
@@ -2672,7 +2700,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
 static int ext4_writepage(struct page *page,
 			  struct writeback_control *wbc)
 {
-	int ret = 0;
+	int ret = 0, commit_write = 0;
 	loff_t size;
 	unsigned int len;
 	struct buffer_head *page_bufs = NULL;
@@ -2685,73 +2713,44 @@ static int ext4_writepage(struct page *page,
 	else
 		len = PAGE_CACHE_SIZE;
 
-	if (page_has_buffers(page)) {
-		page_bufs = page_buffers(page);
-		if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
-					ext4_bh_delay_or_unwritten)) {
-			/*
-			 * We don't want to do  block allocation
-			 * So redirty the page and return
-			 * We may reach here when we do a journal commit
-			 * via journal_submit_inode_data_buffers.
-			 * If we don't have mapping block we just ignore
-			 * them. We can also reach here via shrink_page_list
-			 */
+	/*
+	 * If the page does not have buffers (for whatever reason),
+	 * try to create them using __block_write_begin.  If this
+	 * fails, redirty the page and move on.
+	 */
+	if (!page_has_buffers(page)) {
+		if (__block_write_begin(page, 0, len,
+					noalloc_get_block_write)) {
+		redirty_page:
 			redirty_page_for_writepage(wbc, page);
 			unlock_page(page);
 			return 0;
 		}
-	} else {
+		commit_write = 1;
+	}
+	page_bufs = page_buffers(page);
+	if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
+			      ext4_bh_delay_or_unwritten)) {
 		/*
-		 * The test for page_has_buffers() is subtle:
-		 * We know the page is dirty but it lost buffers. That means
-		 * that at some moment in time after write_begin()/write_end()
-		 * has been called all buffers have been clean and thus they
-		 * must have been written at least once. So they are all
-		 * mapped and we can happily proceed with mapping them
-		 * and writing the page.
-		 *
-		 * Try to initialize the buffer_heads and check whether
-		 * all are mapped and non delay. We don't want to
-		 * do block allocation here.
+		 * We don't want to do block allocation, so redirty
+		 * the page and return.  We may reach here when we do
+		 * a journal commit via journal_submit_inode_data_buffers.
+		 * We can also reach here via shrink_page_list
 		 */
-		ret = block_prepare_write(page, 0, len,
-					  noalloc_get_block_write);
-		if (!ret) {
-			page_bufs = page_buffers(page);
-			/* check whether all are mapped and non delay */
-			if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
-						ext4_bh_delay_or_unwritten)) {
-				redirty_page_for_writepage(wbc, page);
-				unlock_page(page);
-				return 0;
-			}
-		} else {
-			/*
-			 * We can't do block allocation here
-			 * so just redity the page and unlock
-			 * and return
-			 */
-			redirty_page_for_writepage(wbc, page);
-			unlock_page(page);
-			return 0;
-		}
+		goto redirty_page;
+	}
+	if (commit_write)
 		/* now mark the buffer_heads as dirty and uptodate */
 		block_commit_write(page, 0, len);
-	}
 
-	if (PageChecked(page) && ext4_should_journal_data(inode)) {
+	if (PageChecked(page) && ext4_should_journal_data(inode))
 		/*
 		 * It's mmapped pagecache.  Add buffers and journal it.  There
 		 * doesn't seem much point in redirtying the page here.
 		 */
-		ClearPageChecked(page);
 		return __ext4_journalled_writepage(page, len);
-	}
 
-	if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
-		ret = nobh_writepage(page, noalloc_get_block_write, wbc);
-	else if (page_bufs && buffer_uninit(page_bufs)) {
+	if (buffer_uninit(page_bufs)) {
 		ext4_set_bh_endio(page_bufs, inode);
 		ret = block_write_full_page_endio(page, noalloc_get_block_write,
 					    wbc, ext4_end_io_buffer_write);
@@ -2798,25 +2797,32 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
  */
 static int write_cache_pages_da(struct address_space *mapping,
 				struct writeback_control *wbc,
-				struct mpage_da_data *mpd)
+				struct mpage_da_data *mpd,
+				pgoff_t *done_index)
 {
 	int ret = 0;
 	int done = 0;
 	struct pagevec pvec;
-	int nr_pages;
+	unsigned nr_pages;
 	pgoff_t index;
 	pgoff_t end;		/* Inclusive */
 	long nr_to_write = wbc->nr_to_write;
+	int tag;
 
 	pagevec_init(&pvec, 0);
 	index = wbc->range_start >> PAGE_CACHE_SHIFT;
 	end = wbc->range_end >> PAGE_CACHE_SHIFT;
 
+	if (wbc->sync_mode == WB_SYNC_ALL)
+		tag = PAGECACHE_TAG_TOWRITE;
+	else
+		tag = PAGECACHE_TAG_DIRTY;
+
+	*done_index = index;
 	while (!done && (index <= end)) {
 		int i;
 
-		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
-			      PAGECACHE_TAG_DIRTY,
+		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
 			      min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
 		if (nr_pages == 0)
 			break;
@@ -2836,6 +2842,8 @@ static int write_cache_pages_da(struct address_space *mapping,
 				break;
 			}
 
+			*done_index = page->index + 1;
+
 			lock_page(page);
 
 			/*
@@ -2921,6 +2929,8 @@ static int ext4_da_writepages(struct address_space *mapping,
 	long desired_nr_to_write, nr_to_writebump = 0;
 	loff_t range_start = wbc->range_start;
 	struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
+	pgoff_t done_index = 0;
+	pgoff_t end;
 
 	trace_ext4_da_writepages(inode, wbc);
 
@@ -2956,8 +2966,11 @@ static int ext4_da_writepages(struct address_space *mapping,
 		wbc->range_start = index << PAGE_CACHE_SHIFT;
 		wbc->range_end  = LLONG_MAX;
 		wbc->range_cyclic = 0;
-	} else
+		end = -1;
+	} else {
 		index = wbc->range_start >> PAGE_CACHE_SHIFT;
+		end = wbc->range_end >> PAGE_CACHE_SHIFT;
+	}
 
 	/*
 	 * This works around two forms of stupidity.  The first is in
@@ -2976,9 +2989,12 @@ static int ext4_da_writepages(struct address_space *mapping,
 	 * sbi->max_writeback_mb_bump whichever is smaller.
 	 */
 	max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
-	if (!range_cyclic && range_whole)
-		desired_nr_to_write = wbc->nr_to_write * 8;
-	else
+	if (!range_cyclic && range_whole) {
+		if (wbc->nr_to_write == LONG_MAX)
+			desired_nr_to_write = wbc->nr_to_write;
+		else
+			desired_nr_to_write = wbc->nr_to_write * 8;
+	} else
 		desired_nr_to_write = ext4_num_dirty_pages(inode, index,
 							   max_pages);
 	if (desired_nr_to_write > max_pages)
@@ -2995,6 +3011,9 @@ static int ext4_da_writepages(struct address_space *mapping,
 	pages_skipped = wbc->pages_skipped;
 
 retry:
+	if (wbc->sync_mode == WB_SYNC_ALL)
+		tag_pages_for_writeback(mapping, index, end);
+
 	while (!ret && wbc->nr_to_write > 0) {
 
 		/*
@@ -3033,16 +3052,14 @@ retry:
 		mpd.io_done = 0;
 		mpd.pages_written = 0;
 		mpd.retval = 0;
-		ret = write_cache_pages_da(mapping, wbc, &mpd);
+		ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index);
 		/*
 		 * If we have a contiguous extent of pages and we
 		 * haven't done the I/O yet, map the blocks and submit
 		 * them for I/O.
 		 */
 		if (!mpd.io_done && mpd.next_page != mpd.first_page) {
-			if (mpage_da_map_blocks(&mpd) == 0)
-				mpage_da_submit_io(&mpd);
-			mpd.io_done = 1;
+			mpage_da_map_and_submit(&mpd);
 			ret = MPAGE_DA_EXTENT_TAIL;
 		}
 		trace_ext4_da_write_pages(inode, &mpd);
@@ -3089,14 +3106,13 @@ retry:
 			 __func__, wbc->nr_to_write, ret);
 
 	/* Update index */
-	index += pages_written;
 	wbc->range_cyclic = range_cyclic;
 	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
 		/*
 		 * set the writeback_index so that range_cyclic
 		 * mode will write it back later
 		 */
-		mapping->writeback_index = index;
+		mapping->writeback_index = done_index;
 
 out_writepages:
 	wbc->nr_to_write -= nr_to_writebump;
@@ -3146,13 +3162,10 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
 	int ret, retries = 0;
 	struct page *page;
 	pgoff_t index;
-	unsigned from, to;
 	struct inode *inode = mapping->host;
 	handle_t *handle;
 
 	index = pos >> PAGE_CACHE_SHIFT;
-	from = pos & (PAGE_CACHE_SIZE - 1);
-	to = from + len;
 
 	if (ext4_nonda_switch(inode->i_sb)) {
 		*fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
@@ -3185,8 +3198,7 @@ retry:
 	}
 	*pagep = page;
 
-	ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
-				ext4_da_get_block_prep);
+	ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep);
 	if (ret < 0) {
 		unlock_page(page);
 		ext4_journal_stop(handle);
@@ -3435,15 +3447,6 @@ ext4_readpages(struct file *file, struct address_space *mapping,
 	return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
 }
 
-static void ext4_free_io_end(ext4_io_end_t *io)
-{
-	BUG_ON(!io);
-	if (io->page)
-		put_page(io->page);
-	iput(io->inode);
-	kfree(io);
-}
-
 static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset)
 {
 	struct buffer_head *head, *bh;
@@ -3545,15 +3548,24 @@ static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
 
 retry:
 	if (rw == READ && ext4_should_dioread_nolock(inode))
-		ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
+		ret = __blockdev_direct_IO(rw, iocb, inode,
 				 inode->i_sb->s_bdev, iov,
 				 offset, nr_segs,
-				 ext4_get_block, NULL);
-	else
+				 ext4_get_block, NULL, NULL, 0);
+	else {
 		ret = blockdev_direct_IO(rw, iocb, inode,
 				 inode->i_sb->s_bdev, iov,
 				 offset, nr_segs,
 				 ext4_get_block, NULL);
+
+		if (unlikely((rw & WRITE) && ret < 0)) {
+			loff_t isize = i_size_read(inode);
+			loff_t end = offset + iov_length(iov, nr_segs);
+
+			if (end > isize)
+				vmtruncate(inode, isize);
+		}
+	}
 	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
 		goto retry;
 
@@ -3611,171 +3623,9 @@ static int ext4_get_block_write(struct inode *inode, sector_t iblock,
 			       EXT4_GET_BLOCKS_IO_CREATE_EXT);
 }
 
-static void dump_completed_IO(struct inode * inode)
-{
-#ifdef	EXT4_DEBUG
-	struct list_head *cur, *before, *after;
-	ext4_io_end_t *io, *io0, *io1;
-	unsigned long flags;
-
-	if (list_empty(&EXT4_I(inode)->i_completed_io_list)){
-		ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino);
-		return;
-	}
-
-	ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino);
-	spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
-	list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){
-		cur = &io->list;
-		before = cur->prev;
-		io0 = container_of(before, ext4_io_end_t, list);
-		after = cur->next;
-		io1 = container_of(after, ext4_io_end_t, list);
-
-		ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
-			    io, inode->i_ino, io0, io1);
-	}
-	spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
-#endif
-}
-
-/*
- * check a range of space and convert unwritten extents to written.
- */
-static int ext4_end_io_nolock(ext4_io_end_t *io)
-{
-	struct inode *inode = io->inode;
-	loff_t offset = io->offset;
-	ssize_t size = io->size;
-	int ret = 0;
-
-	ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
-		   "list->prev 0x%p\n",
-	           io, inode->i_ino, io->list.next, io->list.prev);
-
-	if (list_empty(&io->list))
-		return ret;
-
-	if (io->flag != EXT4_IO_UNWRITTEN)
-		return ret;
-
-	ret = ext4_convert_unwritten_extents(inode, offset, size);
-	if (ret < 0) {
-		printk(KERN_EMERG "%s: failed to convert unwritten"
-			"extents to written extents, error is %d"
-			" io is still on inode %lu aio dio list\n",
-                       __func__, ret, inode->i_ino);
-		return ret;
-	}
-
-	/* clear the DIO AIO unwritten flag */
-	io->flag = 0;
-	return ret;
-}
-
-/*
- * work on completed aio dio IO, to convert unwritten extents to extents
- */
-static void ext4_end_io_work(struct work_struct *work)
-{
-	ext4_io_end_t		*io = container_of(work, ext4_io_end_t, work);
-	struct inode		*inode = io->inode;
-	struct ext4_inode_info	*ei = EXT4_I(inode);
-	unsigned long		flags;
-	int			ret;
-
-	mutex_lock(&inode->i_mutex);
-	ret = ext4_end_io_nolock(io);
-	if (ret < 0) {
-		mutex_unlock(&inode->i_mutex);
-		return;
-	}
-
-	spin_lock_irqsave(&ei->i_completed_io_lock, flags);
-	if (!list_empty(&io->list))
-		list_del_init(&io->list);
-	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
-	mutex_unlock(&inode->i_mutex);
-	ext4_free_io_end(io);
-}
-
-/*
- * This function is called from ext4_sync_file().
- *
- * When IO is completed, the work to convert unwritten extents to
- * written is queued on workqueue but may not get immediately
- * scheduled. When fsync is called, we need to ensure the
- * conversion is complete before fsync returns.
- * The inode keeps track of a list of pending/completed IO that
- * might needs to do the conversion. This function walks through
- * the list and convert the related unwritten extents for completed IO
- * to written.
- * The function return the number of pending IOs on success.
- */
-int flush_completed_IO(struct inode *inode)
-{
-	ext4_io_end_t *io;
-	struct ext4_inode_info *ei = EXT4_I(inode);
-	unsigned long flags;
-	int ret = 0;
-	int ret2 = 0;
-
-	if (list_empty(&ei->i_completed_io_list))
-		return ret;
-
-	dump_completed_IO(inode);
-	spin_lock_irqsave(&ei->i_completed_io_lock, flags);
-	while (!list_empty(&ei->i_completed_io_list)){
-		io = list_entry(ei->i_completed_io_list.next,
-				ext4_io_end_t, list);
-		/*
-		 * Calling ext4_end_io_nolock() to convert completed
-		 * IO to written.
-		 *
-		 * When ext4_sync_file() is called, run_queue() may already
-		 * about to flush the work corresponding to this io structure.
-		 * It will be upset if it founds the io structure related
-		 * to the work-to-be schedule is freed.
-		 *
-		 * Thus we need to keep the io structure still valid here after
-		 * convertion finished. The io structure has a flag to
-		 * avoid double converting from both fsync and background work
-		 * queue work.
-		 */
-		spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
-		ret = ext4_end_io_nolock(io);
-		spin_lock_irqsave(&ei->i_completed_io_lock, flags);
-		if (ret < 0)
-			ret2 = ret;
-		else
-			list_del_init(&io->list);
-	}
-	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
-	return (ret2 < 0) ? ret2 : 0;
-}
-
-static ext4_io_end_t *ext4_init_io_end (struct inode *inode, gfp_t flags)
-{
-	ext4_io_end_t *io = NULL;
-
-	io = kmalloc(sizeof(*io), flags);
-
-	if (io) {
-		igrab(inode);
-		io->inode = inode;
-		io->flag = 0;
-		io->offset = 0;
-		io->size = 0;
-		io->page = NULL;
-		INIT_WORK(&io->work, ext4_end_io_work);
-		INIT_LIST_HEAD(&io->list);
-	}
-
-	return io;
-}
-
 static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
-			    ssize_t size, void *private)
+			    ssize_t size, void *private, int ret,
+			    bool is_async)
 {
         ext4_io_end_t *io_end = iocb->private;
 	struct workqueue_struct *wq;
@@ -3784,7 +3634,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
 
 	/* if not async direct IO or dio with 0 bytes write, just return */
 	if (!io_end || !size)
-		return;
+		goto out;
 
 	ext_debug("ext4_end_io_dio(): io_end 0x%p"
 		  "for inode %lu, iocb 0x%p, offset %llu, size %llu\n",
@@ -3792,25 +3642,31 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
 		  size);
 
 	/* if not aio dio with unwritten extents, just free io and return */
-	if (io_end->flag != EXT4_IO_UNWRITTEN){
+	if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
 		ext4_free_io_end(io_end);
 		iocb->private = NULL;
+out:
+		if (is_async)
+			aio_complete(iocb, ret, 0);
 		return;
 	}
 
 	io_end->offset = offset;
 	io_end->size = size;
-	io_end->flag = EXT4_IO_UNWRITTEN;
+	if (is_async) {
+		io_end->iocb = iocb;
+		io_end->result = ret;
+	}
 	wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
 
-	/* queue the work to convert unwritten extents to written */
-	queue_work(wq, &io_end->work);
-
 	/* Add the io_end to per-inode completed aio dio list*/
 	ei = EXT4_I(io_end->inode);
 	spin_lock_irqsave(&ei->i_completed_io_lock, flags);
 	list_add_tail(&io_end->list, &ei->i_completed_io_list);
 	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+
+	/* queue the work to convert unwritten extents to written */
+	queue_work(wq, &io_end->work);
 	iocb->private = NULL;
 }
 
@@ -3831,7 +3687,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
 		goto out;
 	}
 
-	io_end->flag = EXT4_IO_UNWRITTEN;
+	io_end->flag = EXT4_IO_END_UNWRITTEN;
 	inode = io_end->inode;
 
 	/* Add the io_end to per-inode completed io list*/
@@ -3937,7 +3793,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
 				return -ENOMEM;
 			/*
 			 * we save the io structure for current async
-			 * direct IO, so that later ext4_get_blocks()
+			 * direct IO, so that later ext4_map_blocks()
 			 * could flag the io structure whether there
 			 * is a unwritten extents needs to be converted
 			 * when IO is completed.
@@ -4128,17 +3984,6 @@ int ext4_block_truncate_page(handle_t *handle,
 	length = blocksize - (offset & (blocksize - 1));
 	iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
 
-	/*
-	 * For "nobh" option,  we can only work if we don't need to
-	 * read-in the page - otherwise we create buffers to do the IO.
-	 */
-	if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) &&
-	     ext4_should_writeback_data(inode) && PageUptodate(page)) {
-		zero_user(page, offset, length);
-		set_page_dirty(page);
-		goto unlock;
-	}
-
 	if (!page_has_buffers(page))
 		create_empty_buffers(page, blocksize, 0);
 
@@ -4488,9 +4333,8 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
 			 * (should be rare).
 			 */
 			if (!bh) {
-				EXT4_ERROR_INODE(inode,
-						 "Read failure block=%llu",
-						 (unsigned long long) nr);
+				EXT4_ERROR_INODE_BLOCK(inode, nr,
+						       "Read failure");
 				continue;
 			}
 
@@ -4502,27 +4346,6 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
 					depth);
 
 			/*
-			 * We've probably journalled the indirect block several
-			 * times during the truncate.  But it's no longer
-			 * needed and we now drop it from the transaction via
-			 * jbd2_journal_revoke().
-			 *
-			 * That's easy if it's exclusively part of this
-			 * transaction.  But if it's part of the committing
-			 * transaction then jbd2_journal_forget() will simply
-			 * brelse() it.  That means that if the underlying
-			 * block is reallocated in ext4_get_block(),
-			 * unmap_underlying_metadata() will find this block
-			 * and will try to get rid of it.  damn, damn.
-			 *
-			 * If this block has already been committed to the
-			 * journal, a revoke record will be written.  And
-			 * revoke records must be emitted *before* clearing
-			 * this block's bit in the bitmaps.
-			 */
-			ext4_forget(handle, 1, inode, bh, bh->b_blocknr);
-
-			/*
 			 * Everything below this this pointer has been
 			 * released.  Now let this top-of-subtree go.
 			 *
@@ -4546,8 +4369,20 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
 					    blocks_for_truncate(inode));
 			}
 
+			/*
+			 * The forget flag here is critical because if
+			 * we are journaling (and not doing data
+			 * journaling), we have to make sure a revoke
+			 * record is written to prevent the journal
+			 * replay from overwriting the (former)
+			 * indirect block if it gets reallocated as a
+			 * data block.  This must happen in the same
+			 * transaction where the data blocks are
+			 * actually freed.
+			 */
 			ext4_free_blocks(handle, inode, 0, nr, 1,
-					 EXT4_FREE_BLOCKS_METADATA);
+					 EXT4_FREE_BLOCKS_METADATA|
+					 EXT4_FREE_BLOCKS_FORGET);
 
 			if (parent_bh) {
 				/*
@@ -4805,8 +4640,8 @@ static int __ext4_get_inode_loc(struct inode *inode,
 
 	bh = sb_getblk(sb, block);
 	if (!bh) {
-		EXT4_ERROR_INODE(inode, "unable to read inode block - "
-				 "block %llu", block);
+		EXT4_ERROR_INODE_BLOCK(inode, block,
+				       "unable to read itable block");
 		return -EIO;
 	}
 	if (!buffer_uptodate(bh)) {
@@ -4904,8 +4739,8 @@ make_io:
 		submit_bh(READ_META, bh);
 		wait_on_buffer(bh);
 		if (!buffer_uptodate(bh)) {
-			EXT4_ERROR_INODE(inode, "unable to read inode "
-					 "block %llu", block);
+			EXT4_ERROR_INODE_BLOCK(inode, block,
+					       "unable to read itable block");
 			brelse(bh);
 			return -EIO;
 		}
@@ -4976,7 +4811,7 @@ static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
 		/* we are using combined 48 bit field */
 		i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 |
 					le32_to_cpu(raw_inode->i_blocks_lo);
-		if (ei->i_flags & EXT4_HUGE_FILE_FL) {
+		if (ext4_test_inode_flag(inode, EXT4_INODE_HUGE_FILE)) {
 			/* i_blocks represent file system block size */
 			return i_blocks  << (inode->i_blkbits - 9);
 		} else {
@@ -5072,7 +4907,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 		transaction_t *transaction;
 		tid_t tid;
 
-		spin_lock(&journal->j_state_lock);
+		read_lock(&journal->j_state_lock);
 		if (journal->j_running_transaction)
 			transaction = journal->j_running_transaction;
 		else
@@ -5081,7 +4916,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 			tid = transaction->t_tid;
 		else
 			tid = journal->j_commit_sequence;
-		spin_unlock(&journal->j_state_lock);
+		read_unlock(&journal->j_state_lock);
 		ei->i_sync_tid = tid;
 		ei->i_datasync_tid = tid;
 	}
@@ -5126,7 +4961,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 				 ei->i_file_acl);
 		ret = -EIO;
 		goto bad_inode;
-	} else if (ei->i_flags & EXT4_EXTENTS_FL) {
+	} else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
 		if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
 		    (S_ISLNK(inode->i_mode) &&
 		     !ext4_inode_is_fast_symlink(inode)))
@@ -5406,9 +5241,8 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
 		if (wbc->sync_mode == WB_SYNC_ALL)
 			sync_dirty_buffer(iloc.bh);
 		if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
-			EXT4_ERROR_INODE(inode,
-				"IO error syncing inode (block=%llu)",
-				(unsigned long long) iloc.bh->b_blocknr);
+			EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr,
+					 "IO error syncing inode");
 			err = -EIO;
 		}
 		brelse(iloc.bh);
@@ -5444,6 +5278,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 {
 	struct inode *inode = dentry->d_inode;
 	int error, rc = 0;
+	int orphan = 0;
 	const unsigned int ia_valid = attr->ia_valid;
 
 	error = inode_change_ok(inode, attr);
@@ -5483,10 +5318,8 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 		if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
 			struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 
-			if (attr->ia_size > sbi->s_bitmap_maxbytes) {
-				error = -EFBIG;
-				goto err_out;
-			}
+			if (attr->ia_size > sbi->s_bitmap_maxbytes)
+				return -EFBIG;
 		}
 	}
 
@@ -5501,8 +5334,10 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 			error = PTR_ERR(handle);
 			goto err_out;
 		}
-
-		error = ext4_orphan_add(handle, inode);
+		if (ext4_handle_valid(handle)) {
+			error = ext4_orphan_add(handle, inode);
+			orphan = 1;
+		}
 		EXT4_I(inode)->i_disksize = attr->ia_size;
 		rc = ext4_mark_inode_dirty(handle, inode);
 		if (!error)
@@ -5520,6 +5355,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 					goto err_out;
 				}
 				ext4_orphan_del(handle, inode);
+				orphan = 0;
 				ext4_journal_stop(handle);
 				goto err_out;
 			}
@@ -5529,12 +5365,20 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 			ext4_truncate(inode);
 	}
 
-	rc = inode_setattr(inode, attr);
+	if ((attr->ia_valid & ATTR_SIZE) &&
+	    attr->ia_size != i_size_read(inode))
+		rc = vmtruncate(inode, attr->ia_size);
 
-	/* If inode_setattr's call to ext4_truncate failed to get a
-	 * transaction handle at all, we need to clean up the in-core
-	 * orphan list manually. */
-	if (inode->i_nlink)
+	if (!rc) {
+		setattr_copy(inode, attr);
+		mark_inode_dirty(inode);
+	}
+
+	/*
+	 * If the call to ext4_truncate failed to get a transaction handle at
+	 * all, we need to clean up the in-core orphan list manually.
+	 */
+	if (orphan && inode->i_nlink)
 		ext4_orphan_del(NULL, inode);
 
 	if (!rc && (ia_valid & ATTR_MODE))
@@ -5617,7 +5461,7 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
  *
  * Also account for superblock, inode, quota and xattr blocks
  */
-int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
 {
 	ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
 	int gdpblocks;
@@ -5688,7 +5532,7 @@ int ext4_writepage_trans_blocks(struct inode *inode)
  * Calculate the journal credits for a chunk of data modification.
  *
  * This is called from DIO, fallocate or whoever calling
- * ext4_get_blocks() to map/allocate a chunk of contiguous disk blocks.
+ * ext4_map_blocks() to map/allocate a chunk of contiguous disk blocks.
  *
  * journal buffers for data blocks are not included here, as DIO
  * and fallocate do no need to journal data buffers.
@@ -5754,7 +5598,6 @@ static int ext4_expand_extra_isize(struct inode *inode,
 {
 	struct ext4_inode *raw_inode;
 	struct ext4_xattr_ibody_header *header;
-	struct ext4_xattr_entry *entry;
 
 	if (EXT4_I(inode)->i_extra_isize >= new_extra_isize)
 		return 0;
@@ -5762,7 +5605,6 @@ static int ext4_expand_extra_isize(struct inode *inode,
 	raw_inode = ext4_raw_inode(&iloc);
 
 	header = IHDR(inode, raw_inode);
-	entry = IFIRST(header);
 
 	/* No extended attributes present */
 	if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 12b3bc026a6..c58eba34724 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -338,6 +338,14 @@
 static struct kmem_cache *ext4_pspace_cachep;
 static struct kmem_cache *ext4_ac_cachep;
 static struct kmem_cache *ext4_free_ext_cachep;
+
+/* We create slab caches for groupinfo data structures based on the
+ * superblock block size.  There will be one per mounted filesystem for
+ * each unique s_blocksize_bits */
+#define NR_GRPINFO_CACHES	\
+	(EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE + 1)
+static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES];
+
 static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
 					ext4_group_t group);
 static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
@@ -446,10 +454,11 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
 			blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
 			blocknr += first + i;
 			ext4_grp_locked_error(sb, e4b->bd_group,
-				   __func__, "double-free of inode"
-				   " %lu's block %llu(bit %u in group %u)",
-				   inode ? inode->i_ino : 0, blocknr,
-				   first + i, e4b->bd_group);
+					      inode ? inode->i_ino : 0,
+					      blocknr,
+					      "freeing block already freed "
+					      "(bit %u)",
+					      first + i);
 		}
 		mb_clear_bit(first + i, e4b->bd_info->bb_bitmap);
 	}
@@ -712,9 +721,9 @@ void ext4_mb_generate_buddy(struct super_block *sb,
 	grp->bb_fragments = fragments;
 
 	if (free != grp->bb_free) {
-		ext4_grp_locked_error(sb, group,  __func__,
-			"EXT4-fs: group %u: %u blocks in bitmap, %u in gd",
-			group, free, grp->bb_free);
+		ext4_grp_locked_error(sb, group, 0, 0,
+				      "%u blocks in bitmap, %u in gd",
+				      free, grp->bb_free);
 		/*
 		 * If we intent to continue, we consider group descritor
 		 * corrupt and update bb_free using bitmap value
@@ -938,6 +947,85 @@ out:
 }
 
 /*
+ * lock the group_info alloc_sem of all the groups
+ * belonging to the same buddy cache page. This
+ * make sure other parallel operation on the buddy
+ * cache doesn't happen  whild holding the buddy cache
+ * lock
+ */
+static int ext4_mb_get_buddy_cache_lock(struct super_block *sb,
+					ext4_group_t group)
+{
+	int i;
+	int block, pnum;
+	int blocks_per_page;
+	int groups_per_page;
+	ext4_group_t ngroups = ext4_get_groups_count(sb);
+	ext4_group_t first_group;
+	struct ext4_group_info *grp;
+
+	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+	/*
+	 * the buddy cache inode stores the block bitmap
+	 * and buddy information in consecutive blocks.
+	 * So for each group we need two blocks.
+	 */
+	block = group * 2;
+	pnum = block / blocks_per_page;
+	first_group = pnum * blocks_per_page / 2;
+
+	groups_per_page = blocks_per_page >> 1;
+	if (groups_per_page == 0)
+		groups_per_page = 1;
+	/* read all groups the page covers into the cache */
+	for (i = 0; i < groups_per_page; i++) {
+
+		if ((first_group + i) >= ngroups)
+			break;
+		grp = ext4_get_group_info(sb, first_group + i);
+		/* take all groups write allocation
+		 * semaphore. This make sure there is
+		 * no block allocation going on in any
+		 * of that groups
+		 */
+		down_write_nested(&grp->alloc_sem, i);
+	}
+	return i;
+}
+
+static void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
+					 ext4_group_t group, int locked_group)
+{
+	int i;
+	int block, pnum;
+	int blocks_per_page;
+	ext4_group_t first_group;
+	struct ext4_group_info *grp;
+
+	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+	/*
+	 * the buddy cache inode stores the block bitmap
+	 * and buddy information in consecutive blocks.
+	 * So for each group we need two blocks.
+	 */
+	block = group * 2;
+	pnum = block / blocks_per_page;
+	first_group = pnum * blocks_per_page / 2;
+	/* release locks on all the groups */
+	for (i = 0; i < locked_group; i++) {
+
+		grp = ext4_get_group_info(sb, first_group + i);
+		/* take all groups write allocation
+		 * semaphore. This make sure there is
+		 * no block allocation going on in any
+		 * of that groups
+		 */
+		up_write(&grp->alloc_sem);
+	}
+
+}
+
+/*
  * Locking note:  This routine calls ext4_mb_init_cache(), which takes the
  * block group lock of all groups for this page; do not hold the BG lock when
  * calling this routine!
@@ -1296,10 +1384,10 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
 			blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
 			blocknr += block;
 			ext4_grp_locked_error(sb, e4b->bd_group,
-				   __func__, "double-free of inode"
-				   " %lu's block %llu(bit %u in group %u)",
-				   inode ? inode->i_ino : 0, blocknr, block,
-				   e4b->bd_group);
+					      inode ? inode->i_ino : 0,
+					      blocknr,
+					      "freeing already freed block "
+					      "(bit %u)", block);
 		}
 		mb_clear_bit(block, EXT4_MB_BITMAP(e4b));
 		e4b->bd_info->bb_counters[order]++;
@@ -1788,8 +1876,8 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
 			 * free blocks even though group info says we
 			 * we have free blocks
 			 */
-			ext4_grp_locked_error(sb, e4b->bd_group,
-					__func__, "%d free blocks as per "
+			ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
+					"%d free blocks as per "
 					"group info. But bitmap says 0",
 					free);
 			break;
@@ -1798,8 +1886,8 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
 		mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex);
 		BUG_ON(ex.fe_len <= 0);
 		if (free < ex.fe_len) {
-			ext4_grp_locked_error(sb, e4b->bd_group,
-					__func__, "%d free blocks as per "
+			ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
+					"%d free blocks as per "
 					"group info. But got %d blocks",
 					free, ex.fe_len);
 			/*
@@ -1821,8 +1909,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
 
 /*
  * This is a special case for storages like raid5
- * we try to find stripe-aligned chunks for stripe-size requests
- * XXX should do so at least for multiples of stripe size as well
+ * we try to find stripe-aligned chunks for stripe-size-multiple requests
  */
 static noinline_for_stack
 void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
@@ -1915,91 +2002,12 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
 	return 0;
 }
 
-/*
- * lock the group_info alloc_sem of all the groups
- * belonging to the same buddy cache page. This
- * make sure other parallel operation on the buddy
- * cache doesn't happen  whild holding the buddy cache
- * lock
- */
-int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group)
-{
-	int i;
-	int block, pnum;
-	int blocks_per_page;
-	int groups_per_page;
-	ext4_group_t ngroups = ext4_get_groups_count(sb);
-	ext4_group_t first_group;
-	struct ext4_group_info *grp;
-
-	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
-	/*
-	 * the buddy cache inode stores the block bitmap
-	 * and buddy information in consecutive blocks.
-	 * So for each group we need two blocks.
-	 */
-	block = group * 2;
-	pnum = block / blocks_per_page;
-	first_group = pnum * blocks_per_page / 2;
-
-	groups_per_page = blocks_per_page >> 1;
-	if (groups_per_page == 0)
-		groups_per_page = 1;
-	/* read all groups the page covers into the cache */
-	for (i = 0; i < groups_per_page; i++) {
-
-		if ((first_group + i) >= ngroups)
-			break;
-		grp = ext4_get_group_info(sb, first_group + i);
-		/* take all groups write allocation
-		 * semaphore. This make sure there is
-		 * no block allocation going on in any
-		 * of that groups
-		 */
-		down_write_nested(&grp->alloc_sem, i);
-	}
-	return i;
-}
-
-void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
-					ext4_group_t group, int locked_group)
-{
-	int i;
-	int block, pnum;
-	int blocks_per_page;
-	ext4_group_t first_group;
-	struct ext4_group_info *grp;
-
-	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
-	/*
-	 * the buddy cache inode stores the block bitmap
-	 * and buddy information in consecutive blocks.
-	 * So for each group we need two blocks.
-	 */
-	block = group * 2;
-	pnum = block / blocks_per_page;
-	first_group = pnum * blocks_per_page / 2;
-	/* release locks on all the groups */
-	for (i = 0; i < locked_group; i++) {
-
-		grp = ext4_get_group_info(sb, first_group + i);
-		/* take all groups write allocation
-		 * semaphore. This make sure there is
-		 * no block allocation going on in any
-		 * of that groups
-		 */
-		up_write(&grp->alloc_sem);
-	}
-
-}
-
 static noinline_for_stack int
 ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 {
 	ext4_group_t ngroups, group, i;
 	int cr;
 	int err = 0;
-	int bsbits;
 	struct ext4_sb_info *sbi;
 	struct super_block *sb;
 	struct ext4_buddy e4b;
@@ -2041,8 +2049,6 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 			ac->ac_2order = i - 1;
 	}
 
-	bsbits = ac->ac_sb->s_blocksize_bits;
-
 	/* if stream allocation is enabled, use global goal */
 	if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
 		/* TBD: may be hot point */
@@ -2094,8 +2100,8 @@ repeat:
 			ac->ac_groups_scanned++;
 			if (cr == 0)
 				ext4_mb_simple_scan_group(ac, &e4b);
-			else if (cr == 1 &&
-					ac->ac_g_ex.fe_len == sbi->s_stripe)
+			else if (cr == 1 && sbi->s_stripe &&
+					!(ac->ac_g_ex.fe_len % sbi->s_stripe))
 				ext4_mb_scan_aligned(ac, &e4b);
 			else
 				ext4_mb_complex_scan_group(ac, &e4b);
@@ -2221,7 +2227,7 @@ static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file)
 
 	rc = seq_open(file, &ext4_mb_seq_groups_ops);
 	if (rc == 0) {
-		struct seq_file *m = (struct seq_file *)file->private_data;
+		struct seq_file *m = file->private_data;
 		m->private = sb;
 	}
 	return rc;
@@ -2236,15 +2242,24 @@ static const struct file_operations ext4_mb_seq_groups_fops = {
 	.release	= seq_release,
 };
 
+static struct kmem_cache *get_groupinfo_cache(int blocksize_bits)
+{
+	int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
+	struct kmem_cache *cachep = ext4_groupinfo_caches[cache_index];
+
+	BUG_ON(!cachep);
+	return cachep;
+}
 
 /* Create and initialize ext4_group_info data for the given group. */
 int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
 			  struct ext4_group_desc *desc)
 {
-	int i, len;
+	int i;
 	int metalen = 0;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct ext4_group_info **meta_group_info;
+	struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
 
 	/*
 	 * First check if this group is the first of a reserved block.
@@ -2264,22 +2279,16 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
 			meta_group_info;
 	}
 
-	/*
-	 * calculate needed size. if change bb_counters size,
-	 * don't forget about ext4_mb_generate_buddy()
-	 */
-	len = offsetof(typeof(**meta_group_info),
-		       bb_counters[sb->s_blocksize_bits + 2]);
-
 	meta_group_info =
 		sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)];
 	i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
 
-	meta_group_info[i] = kzalloc(len, GFP_KERNEL);
+	meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL);
 	if (meta_group_info[i] == NULL) {
 		printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n");
 		goto exit_group_info;
 	}
+	memset(meta_group_info[i], 0, kmem_cache_size(cachep));
 	set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
 		&(meta_group_info[i]->bb_state));
 
@@ -2334,6 +2343,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
 	int num_meta_group_infos_max;
 	int array_size;
 	struct ext4_group_desc *desc;
+	struct kmem_cache *cachep;
 
 	/* This is the number of blocks used by GDT */
 	num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) -
@@ -2376,6 +2386,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
 		printk(KERN_ERR "EXT4-fs: can't get new inode\n");
 		goto err_freesgi;
 	}
+	sbi->s_buddy_cache->i_ino = get_next_ino();
 	EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
 	for (i = 0; i < ngroups; i++) {
 		desc = ext4_get_group_desc(sb, i, NULL);
@@ -2391,8 +2402,9 @@ static int ext4_mb_init_backend(struct super_block *sb)
 	return 0;
 
 err_freebuddy:
+	cachep = get_groupinfo_cache(sb->s_blocksize_bits);
 	while (i-- > 0)
-		kfree(ext4_get_group_info(sb, i));
+		kmem_cache_free(cachep, ext4_get_group_info(sb, i));
 	i = num_meta_group_infos;
 	while (i-- > 0)
 		kfree(sbi->s_group_info[i]);
@@ -2409,19 +2421,48 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
 	unsigned offset;
 	unsigned max;
 	int ret;
+	int cache_index;
+	struct kmem_cache *cachep;
+	char *namep = NULL;
 
 	i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets);
 
 	sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
 	if (sbi->s_mb_offsets == NULL) {
-		return -ENOMEM;
+		ret = -ENOMEM;
+		goto out;
 	}
 
 	i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs);
 	sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
 	if (sbi->s_mb_maxs == NULL) {
-		kfree(sbi->s_mb_offsets);
-		return -ENOMEM;
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	cache_index = sb->s_blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
+	cachep = ext4_groupinfo_caches[cache_index];
+	if (!cachep) {
+		char name[32];
+		int len = offsetof(struct ext4_group_info,
+					bb_counters[sb->s_blocksize_bits + 2]);
+
+		sprintf(name, "ext4_groupinfo_%d", sb->s_blocksize_bits);
+		namep = kstrdup(name, GFP_KERNEL);
+		if (!namep) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		/* Need to free the kmem_cache_name() when we
+		 * destroy the slab */
+		cachep = kmem_cache_create(namep, len, 0,
+					     SLAB_RECLAIM_ACCOUNT, NULL);
+		if (!cachep) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		ext4_groupinfo_caches[cache_index] = cachep;
 	}
 
 	/* order 0 is regular bitmap */
@@ -2442,9 +2483,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
 	/* init file for buddy data */
 	ret = ext4_mb_init_backend(sb);
 	if (ret != 0) {
-		kfree(sbi->s_mb_offsets);
-		kfree(sbi->s_mb_maxs);
-		return ret;
+		goto out;
 	}
 
 	spin_lock_init(&sbi->s_md_lock);
@@ -2459,9 +2498,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
 
 	sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
 	if (sbi->s_locality_groups == NULL) {
-		kfree(sbi->s_mb_offsets);
-		kfree(sbi->s_mb_maxs);
-		return -ENOMEM;
+		ret = -ENOMEM;
+		goto out;
 	}
 	for_each_possible_cpu(i) {
 		struct ext4_locality_group *lg;
@@ -2478,7 +2516,13 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
 
 	if (sbi->s_journal)
 		sbi->s_journal->j_commit_callback = release_blocks_on_commit;
-	return 0;
+out:
+	if (ret) {
+		kfree(sbi->s_mb_offsets);
+		kfree(sbi->s_mb_maxs);
+		kfree(namep);
+	}
+	return ret;
 }
 
 /* need to called with the ext4 group lock held */
@@ -2506,6 +2550,7 @@ int ext4_mb_release(struct super_block *sb)
 	int num_meta_group_infos;
 	struct ext4_group_info *grinfo;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
 
 	if (sbi->s_group_info) {
 		for (i = 0; i < ngroups; i++) {
@@ -2516,7 +2561,7 @@ int ext4_mb_release(struct super_block *sb)
 			ext4_lock_group(sb, i);
 			ext4_mb_cleanup_pa(grinfo);
 			ext4_unlock_group(sb, i);
-			kfree(grinfo);
+			kmem_cache_free(cachep, grinfo);
 		}
 		num_meta_group_infos = (ngroups +
 				EXT4_DESC_PER_BLOCK(sb) - 1) >>
@@ -2560,6 +2605,23 @@ int ext4_mb_release(struct super_block *sb)
 	return 0;
 }
 
+static inline int ext4_issue_discard(struct super_block *sb,
+		ext4_group_t block_group, ext4_grpblk_t block, int count)
+{
+	int ret;
+	ext4_fsblk_t discard_block;
+
+	discard_block = block + ext4_group_first_block_no(sb, block_group);
+	trace_ext4_discard_blocks(sb,
+			(unsigned long long) discard_block, count);
+	ret = sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
+	if (ret == -EOPNOTSUPP) {
+		ext4_warning(sb, "discard not supported, disabling");
+		clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD);
+	}
+	return ret;
+}
+
 /*
  * This function is called by the jbd2 layer once the commit has finished,
  * so we know we can free the blocks that were released with that commit.
@@ -2579,22 +2641,9 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
 		mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
 			 entry->count, entry->group, entry);
 
-		if (test_opt(sb, DISCARD)) {
-			int ret;
-			ext4_fsblk_t discard_block;
-
-			discard_block = entry->start_blk +
-				ext4_group_first_block_no(sb, entry->group);
-			trace_ext4_discard_blocks(sb,
-					(unsigned long long)discard_block,
-					entry->count);
-			ret = sb_issue_discard(sb, discard_block, entry->count);
-			if (ret == EOPNOTSUPP) {
-				ext4_warning(sb,
-					"discard not supported, disabling");
-				clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD);
-			}
-		}
+		if (test_opt(sb, DISCARD))
+			ext4_issue_discard(sb, entry->group,
+					entry->start_blk, entry->count);
 
 		err = ext4_mb_load_buddy(sb, entry->group, &e4b);
 		/* we expect to find existing buddy because it's pinned */
@@ -2658,28 +2707,22 @@ static void ext4_remove_debugfs_entry(void)
 
 #endif
 
-int __init init_ext4_mballoc(void)
+int __init ext4_init_mballoc(void)
 {
-	ext4_pspace_cachep =
-		kmem_cache_create("ext4_prealloc_space",
-				     sizeof(struct ext4_prealloc_space),
-				     0, SLAB_RECLAIM_ACCOUNT, NULL);
+	ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space,
+					SLAB_RECLAIM_ACCOUNT);
 	if (ext4_pspace_cachep == NULL)
 		return -ENOMEM;
 
-	ext4_ac_cachep =
-		kmem_cache_create("ext4_alloc_context",
-				     sizeof(struct ext4_allocation_context),
-				     0, SLAB_RECLAIM_ACCOUNT, NULL);
+	ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context,
+				    SLAB_RECLAIM_ACCOUNT);
 	if (ext4_ac_cachep == NULL) {
 		kmem_cache_destroy(ext4_pspace_cachep);
 		return -ENOMEM;
 	}
 
-	ext4_free_ext_cachep =
-		kmem_cache_create("ext4_free_block_extents",
-				     sizeof(struct ext4_free_data),
-				     0, SLAB_RECLAIM_ACCOUNT, NULL);
+	ext4_free_ext_cachep = KMEM_CACHE(ext4_free_data,
+					  SLAB_RECLAIM_ACCOUNT);
 	if (ext4_free_ext_cachep == NULL) {
 		kmem_cache_destroy(ext4_pspace_cachep);
 		kmem_cache_destroy(ext4_ac_cachep);
@@ -2689,8 +2732,9 @@ int __init init_ext4_mballoc(void)
 	return 0;
 }
 
-void exit_ext4_mballoc(void)
+void ext4_exit_mballoc(void)
 {
+	int i;
 	/*
 	 * Wait for completion of call_rcu()'s on ext4_pspace_cachep
 	 * before destroying the slab cache.
@@ -2699,12 +2743,21 @@ void exit_ext4_mballoc(void)
 	kmem_cache_destroy(ext4_pspace_cachep);
 	kmem_cache_destroy(ext4_ac_cachep);
 	kmem_cache_destroy(ext4_free_ext_cachep);
+
+	for (i = 0; i < NR_GRPINFO_CACHES; i++) {
+		struct kmem_cache *cachep = ext4_groupinfo_caches[i];
+		if (cachep) {
+			char *name = (char *)kmem_cache_name(cachep);
+			kmem_cache_destroy(cachep);
+			kfree(name);
+		}
+	}
 	ext4_remove_debugfs_entry();
 }
 
 
 /*
- * Check quota and mark choosed space (ac->ac_b_ex) non-free in bitmaps
+ * Check quota and mark chosen space (ac->ac_b_ex) non-free in bitmaps
  * Returns 0 if success or error code
  */
 static noinline_for_stack int
@@ -2712,7 +2765,6 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
 				handle_t *handle, unsigned int reserv_blks)
 {
 	struct buffer_head *bitmap_bh = NULL;
-	struct ext4_super_block *es;
 	struct ext4_group_desc *gdp;
 	struct buffer_head *gdp_bh;
 	struct ext4_sb_info *sbi;
@@ -2725,8 +2777,6 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
 
 	sb = ac->ac_sb;
 	sbi = EXT4_SB(sb);
-	es = sbi->s_es;
-
 
 	err = -EIO;
 	bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group);
@@ -2812,7 +2862,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
 	err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh);
 
 out_err:
-	sb->s_dirt = 1;
+	ext4_mark_super_dirty(sb);
 	brelse(bitmap_bh);
 	return err;
 }
@@ -2850,7 +2900,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
 	int bsbits, max;
 	ext4_lblk_t end;
 	loff_t size, orig_size, start_off;
-	ext4_lblk_t start, orig_start;
+	ext4_lblk_t start;
 	struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
 	struct ext4_prealloc_space *pa;
 
@@ -2881,6 +2931,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
 	size = size << bsbits;
 	if (size < i_size_read(ac->ac_inode))
 		size = i_size_read(ac->ac_inode);
+	orig_size = size;
 
 	/* max size of free chunks */
 	max = 2 << bsbits;
@@ -2922,8 +2973,8 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
 		start_off = (loff_t)ac->ac_o_ex.fe_logical << bsbits;
 		size	  = ac->ac_o_ex.fe_len << bsbits;
 	}
-	orig_size = size = size >> bsbits;
-	orig_start = start = start_off >> bsbits;
+	size = size >> bsbits;
+	start = start_off >> bsbits;
 
 	/* don't cover already allocated blocks in selected range */
 	if (ar->pleft && start <= ar->lleft) {
@@ -3537,8 +3588,7 @@ static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
  */
 static noinline_for_stack int
 ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
-			struct ext4_prealloc_space *pa,
-			struct ext4_allocation_context *ac)
+			struct ext4_prealloc_space *pa)
 {
 	struct super_block *sb = e4b->bd_sb;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -3547,7 +3597,6 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
 	ext4_group_t group;
 	ext4_grpblk_t bit;
 	unsigned long long grp_blk_start;
-	sector_t start;
 	int err = 0;
 	int free = 0;
 
@@ -3557,32 +3606,19 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
 	BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
 	end = bit + pa->pa_len;
 
-	if (ac) {
-		ac->ac_sb = sb;
-		ac->ac_inode = pa->pa_inode;
-	}
-
 	while (bit < end) {
 		bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit);
 		if (bit >= end)
 			break;
 		next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
-		start = ext4_group_first_block_no(sb, group) + bit;
 		mb_debug(1, "    free preallocated %u/%u in group %u\n",
-				(unsigned) start, (unsigned) next - bit,
-				(unsigned) group);
+			 (unsigned) ext4_group_first_block_no(sb, group) + bit,
+			 (unsigned) next - bit, (unsigned) group);
 		free += next - bit;
 
-		if (ac) {
-			ac->ac_b_ex.fe_group = group;
-			ac->ac_b_ex.fe_start = bit;
-			ac->ac_b_ex.fe_len = next - bit;
-			ac->ac_b_ex.fe_logical = 0;
-			trace_ext4_mballoc_discard(ac);
-		}
-
-		trace_ext4_mb_release_inode_pa(ac, pa, grp_blk_start + bit,
-					       next - bit);
+		trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit);
+		trace_ext4_mb_release_inode_pa(sb, pa->pa_inode, pa,
+					       grp_blk_start + bit, next - bit);
 		mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
 		bit = next + 1;
 	}
@@ -3591,8 +3627,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
 			pa, (unsigned long) pa->pa_lstart,
 			(unsigned long) pa->pa_pstart,
 			(unsigned long) pa->pa_len);
-		ext4_grp_locked_error(sb, group,
-					__func__, "free %u, pa_free %u",
+		ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u",
 					free, pa->pa_free);
 		/*
 		 * pa is already deleted so we use the value obtained
@@ -3606,29 +3641,19 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
 
 static noinline_for_stack int
 ext4_mb_release_group_pa(struct ext4_buddy *e4b,
-				struct ext4_prealloc_space *pa,
-				struct ext4_allocation_context *ac)
+				struct ext4_prealloc_space *pa)
 {
 	struct super_block *sb = e4b->bd_sb;
 	ext4_group_t group;
 	ext4_grpblk_t bit;
 
-	trace_ext4_mb_release_group_pa(ac, pa);
+	trace_ext4_mb_release_group_pa(sb, pa);
 	BUG_ON(pa->pa_deleted == 0);
 	ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
 	BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
 	mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len);
 	atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded);
-
-	if (ac) {
-		ac->ac_sb = sb;
-		ac->ac_inode = NULL;
-		ac->ac_b_ex.fe_group = group;
-		ac->ac_b_ex.fe_start = bit;
-		ac->ac_b_ex.fe_len = pa->pa_len;
-		ac->ac_b_ex.fe_logical = 0;
-		trace_ext4_mballoc_discard(ac);
-	}
+	trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len);
 
 	return 0;
 }
@@ -3649,7 +3674,6 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
 	struct ext4_group_info *grp = ext4_get_group_info(sb, group);
 	struct buffer_head *bitmap_bh = NULL;
 	struct ext4_prealloc_space *pa, *tmp;
-	struct ext4_allocation_context *ac;
 	struct list_head list;
 	struct ext4_buddy e4b;
 	int err;
@@ -3678,9 +3702,6 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
 		needed = EXT4_BLOCKS_PER_GROUP(sb) + 1;
 
 	INIT_LIST_HEAD(&list);
-	ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
-	if (ac)
-		ac->ac_sb = sb;
 repeat:
 	ext4_lock_group(sb, group);
 	list_for_each_entry_safe(pa, tmp,
@@ -3735,9 +3756,9 @@ repeat:
 		spin_unlock(pa->pa_obj_lock);
 
 		if (pa->pa_type == MB_GROUP_PA)
-			ext4_mb_release_group_pa(&e4b, pa, ac);
+			ext4_mb_release_group_pa(&e4b, pa);
 		else
-			ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac);
+			ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
 
 		list_del(&pa->u.pa_tmp_list);
 		call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
@@ -3745,8 +3766,6 @@ repeat:
 
 out:
 	ext4_unlock_group(sb, group);
-	if (ac)
-		kmem_cache_free(ext4_ac_cachep, ac);
 	ext4_mb_unload_buddy(&e4b);
 	put_bh(bitmap_bh);
 	return free;
@@ -3767,7 +3786,6 @@ void ext4_discard_preallocations(struct inode *inode)
 	struct super_block *sb = inode->i_sb;
 	struct buffer_head *bitmap_bh = NULL;
 	struct ext4_prealloc_space *pa, *tmp;
-	struct ext4_allocation_context *ac;
 	ext4_group_t group = 0;
 	struct list_head list;
 	struct ext4_buddy e4b;
@@ -3783,11 +3801,6 @@ void ext4_discard_preallocations(struct inode *inode)
 
 	INIT_LIST_HEAD(&list);
 
-	ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
-	if (ac) {
-		ac->ac_sb = sb;
-		ac->ac_inode = inode;
-	}
 repeat:
 	/* first, collect all pa's in the inode */
 	spin_lock(&ei->i_prealloc_lock);
@@ -3857,7 +3870,7 @@ repeat:
 
 		ext4_lock_group(sb, group);
 		list_del(&pa->pa_group_list);
-		ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac);
+		ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
 		ext4_unlock_group(sb, group);
 
 		ext4_mb_unload_buddy(&e4b);
@@ -3866,8 +3879,6 @@ repeat:
 		list_del(&pa->u.pa_tmp_list);
 		call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
 	}
-	if (ac)
-		kmem_cache_free(ext4_ac_cachep, ac);
 }
 
 /*
@@ -3889,6 +3900,9 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
 	struct super_block *sb = ac->ac_sb;
 	ext4_group_t ngroups, i;
 
+	if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
+		return;
+
 	printk(KERN_ERR "EXT4-fs: Can't allocate:"
 			" Allocation context details:\n");
 	printk(KERN_ERR "EXT4-fs: status %d flags %d\n",
@@ -4062,14 +4076,10 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
 	struct ext4_buddy e4b;
 	struct list_head discard_list;
 	struct ext4_prealloc_space *pa, *tmp;
-	struct ext4_allocation_context *ac;
 
 	mb_debug(1, "discard locality group preallocation\n");
 
 	INIT_LIST_HEAD(&discard_list);
-	ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
-	if (ac)
-		ac->ac_sb = sb;
 
 	spin_lock(&lg->lg_prealloc_lock);
 	list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
@@ -4121,15 +4131,13 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
 		}
 		ext4_lock_group(sb, group);
 		list_del(&pa->pa_group_list);
-		ext4_mb_release_group_pa(&e4b, pa, ac);
+		ext4_mb_release_group_pa(&e4b, pa);
 		ext4_unlock_group(sb, group);
 
 		ext4_mb_unload_buddy(&e4b);
 		list_del(&pa->u.pa_tmp_list);
 		call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
 	}
-	if (ac)
-		kmem_cache_free(ext4_ac_cachep, ac);
 }
 
 /*
@@ -4255,7 +4263,7 @@ static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
  * to usual allocation
  */
 ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
-				 struct ext4_allocation_request *ar, int *errp)
+				struct ext4_allocation_request *ar, int *errp)
 {
 	int freed;
 	struct ext4_allocation_context *ac = NULL;
@@ -4299,7 +4307,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
 		inquota = ar->len;
 		if (ar->len == 0) {
 			*errp = -EDQUOT;
-			goto out3;
+			goto out;
 		}
 	}
 
@@ -4307,13 +4315,13 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
 	if (!ac) {
 		ar->len = 0;
 		*errp = -ENOMEM;
-		goto out1;
+		goto out;
 	}
 
 	*errp = ext4_mb_initialize_context(ac, ar);
 	if (*errp) {
 		ar->len = 0;
-		goto out2;
+		goto out;
 	}
 
 	ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
@@ -4322,7 +4330,9 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
 		ext4_mb_normalize_request(ac, ar);
 repeat:
 		/* allocate space in core */
-		ext4_mb_regular_allocator(ac);
+		*errp = ext4_mb_regular_allocator(ac);
+		if (*errp)
+			goto errout;
 
 		/* as we've just preallocated more space than
 		 * user requested orinally, we store allocated
@@ -4333,7 +4343,7 @@ repeat:
 	}
 	if (likely(ac->ac_status == AC_STATUS_FOUND)) {
 		*errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks);
-		if (*errp ==  -EAGAIN) {
+		if (*errp == -EAGAIN) {
 			/*
 			 * drop the reference that we took
 			 * in ext4_mb_use_best_found
@@ -4344,12 +4354,10 @@ repeat:
 			ac->ac_b_ex.fe_len = 0;
 			ac->ac_status = AC_STATUS_CONTINUE;
 			goto repeat;
-		} else if (*errp) {
+		} else if (*errp)
+		errout:
 			ext4_discard_allocated_blocks(ac);
-			ac->ac_b_ex.fe_len = 0;
-			ar->len = 0;
-			ext4_mb_show_ac(ac);
-		} else {
+		else {
 			block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
 			ar->len = ac->ac_b_ex.fe_len;
 		}
@@ -4358,19 +4366,19 @@ repeat:
 		if (freed)
 			goto repeat;
 		*errp = -ENOSPC;
+	}
+
+	if (*errp) {
 		ac->ac_b_ex.fe_len = 0;
 		ar->len = 0;
 		ext4_mb_show_ac(ac);
 	}
-
 	ext4_mb_release_context(ac);
-
-out2:
-	kmem_cache_free(ext4_ac_cachep, ac);
-out1:
+out:
+	if (ac)
+		kmem_cache_free(ext4_ac_cachep, ac);
 	if (inquota && ar->len < inquota)
 		dquot_free_block(ar->inode, inquota - ar->len);
-out3:
 	if (!ar->len) {
 		if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag)
 			/* release all the reserved blocks if non delalloc */
@@ -4402,6 +4410,7 @@ static noinline_for_stack int
 ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
 		      struct ext4_free_data *new_entry)
 {
+	ext4_group_t group = e4b->bd_group;
 	ext4_grpblk_t block;
 	struct ext4_free_data *entry;
 	struct ext4_group_info *db = e4b->bd_info;
@@ -4434,9 +4443,9 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
 		else if (block >= (entry->start_blk + entry->count))
 			n = &(*n)->rb_right;
 		else {
-			ext4_grp_locked_error(sb, e4b->bd_group, __func__,
-					"Double free of blocks %d (%d %d)",
-					block, entry->start_blk, entry->count);
+			ext4_grp_locked_error(sb, group, 0,
+				ext4_group_first_block_no(sb, group) + block,
+				"Block already on to-be-freed list");
 			return 0;
 		}
 	}
@@ -4492,9 +4501,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
 {
 	struct buffer_head *bitmap_bh = NULL;
 	struct super_block *sb = inode->i_sb;
-	struct ext4_allocation_context *ac = NULL;
 	struct ext4_group_desc *gdp;
-	struct ext4_super_block *es;
 	unsigned long freed = 0;
 	unsigned int overflow;
 	ext4_grpblk_t bit;
@@ -4513,7 +4520,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
 	}
 
 	sbi = EXT4_SB(sb);
-	es = EXT4_SB(sb)->s_es;
 	if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
 	    !ext4_data_block_valid(sbi, block, count)) {
 		ext4_error(sb, "Freeing blocks not in datazone - "
@@ -4534,6 +4540,8 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
 			if (!bh)
 				tbh = sb_find_get_block(inode->i_sb,
 							block + i);
+			if (unlikely(!tbh))
+				continue;
 			ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
 				    inode, tbh, block + i);
 		}
@@ -4549,12 +4557,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
 	if (!ext4_should_writeback_data(inode))
 		flags |= EXT4_FREE_BLOCKS_METADATA;
 
-	ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
-	if (ac) {
-		ac->ac_inode = inode;
-		ac->ac_sb = sb;
-	}
-
 do_more:
 	overflow = 0;
 	ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
@@ -4612,12 +4614,7 @@ do_more:
 			BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
 	}
 #endif
-	if (ac) {
-		ac->ac_b_ex.fe_group = block_group;
-		ac->ac_b_ex.fe_start = bit;
-		ac->ac_b_ex.fe_len = count;
-		trace_ext4_mballoc_free(ac);
-	}
+	trace_ext4_mballoc_free(sb, inode, block_group, bit, count);
 
 	err = ext4_mb_load_buddy(sb, block_group, &e4b);
 	if (err)
@@ -4643,6 +4640,8 @@ do_more:
 		 * with group lock held. generate_buddy look at
 		 * them with group lock_held
 		 */
+		if (test_opt(sb, DISCARD))
+			ext4_issue_discard(sb, block_group, bit, count);
 		ext4_lock_group(sb, block_group);
 		mb_clear_bits(bitmap_bh->b_data, bit, count);
 		mb_free_blocks(inode, &e4b, bit, count);
@@ -4680,13 +4679,196 @@ do_more:
 		put_bh(bitmap_bh);
 		goto do_more;
 	}
-	sb->s_dirt = 1;
+	ext4_mark_super_dirty(sb);
 error_return:
 	if (freed)
 		dquot_free_block(inode, freed);
 	brelse(bitmap_bh);
 	ext4_std_error(sb, err);
-	if (ac)
-		kmem_cache_free(ext4_ac_cachep, ac);
 	return;
 }
+
+/**
+ * ext4_trim_extent -- function to TRIM one single free extent in the group
+ * @sb:		super block for the file system
+ * @start:	starting block of the free extent in the alloc. group
+ * @count:	number of blocks to TRIM
+ * @group:	alloc. group we are working with
+ * @e4b:	ext4 buddy for the group
+ *
+ * Trim "count" blocks starting at "start" in the "group". To assure that no
+ * one will allocate those blocks, mark it as used in buddy bitmap. This must
+ * be called with under the group lock.
+ */
+static int ext4_trim_extent(struct super_block *sb, int start, int count,
+		ext4_group_t group, struct ext4_buddy *e4b)
+{
+	struct ext4_free_extent ex;
+	int ret = 0;
+
+	assert_spin_locked(ext4_group_lock_ptr(sb, group));
+
+	ex.fe_start = start;
+	ex.fe_group = group;
+	ex.fe_len = count;
+
+	/*
+	 * Mark blocks used, so no one can reuse them while
+	 * being trimmed.
+	 */
+	mb_mark_used(e4b, &ex);
+	ext4_unlock_group(sb, group);
+
+	ret = ext4_issue_discard(sb, group, start, count);
+	if (ret)
+		ext4_std_error(sb, ret);
+
+	ext4_lock_group(sb, group);
+	mb_free_blocks(NULL, e4b, start, ex.fe_len);
+	return ret;
+}
+
+/**
+ * ext4_trim_all_free -- function to trim all free space in alloc. group
+ * @sb:			super block for file system
+ * @e4b:		ext4 buddy
+ * @start:		first group block to examine
+ * @max:		last group block to examine
+ * @minblocks:		minimum extent block count
+ *
+ * ext4_trim_all_free walks through group's buddy bitmap searching for free
+ * extents. When the free block is found, ext4_trim_extent is called to TRIM
+ * the extent.
+ *
+ *
+ * ext4_trim_all_free walks through group's block bitmap searching for free
+ * extents. When the free extent is found, mark it as used in group buddy
+ * bitmap. Then issue a TRIM command on this extent and free the extent in
+ * the group buddy bitmap. This is done until whole group is scanned.
+ */
+ext4_grpblk_t ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
+		ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks)
+{
+	void *bitmap;
+	ext4_grpblk_t next, count = 0;
+	ext4_group_t group;
+	int ret = 0;
+
+	BUG_ON(e4b == NULL);
+
+	bitmap = e4b->bd_bitmap;
+	group = e4b->bd_group;
+	start = (e4b->bd_info->bb_first_free > start) ?
+		e4b->bd_info->bb_first_free : start;
+	ext4_lock_group(sb, group);
+
+	while (start < max) {
+		start = mb_find_next_zero_bit(bitmap, max, start);
+		if (start >= max)
+			break;
+		next = mb_find_next_bit(bitmap, max, start);
+
+		if ((next - start) >= minblocks) {
+			ret = ext4_trim_extent(sb, start,
+				next - start, group, e4b);
+			if (ret < 0)
+				break;
+			count += next - start;
+		}
+		start = next + 1;
+
+		if (fatal_signal_pending(current)) {
+			count = -ERESTARTSYS;
+			break;
+		}
+
+		if (need_resched()) {
+			ext4_unlock_group(sb, group);
+			cond_resched();
+			ext4_lock_group(sb, group);
+		}
+
+		if ((e4b->bd_info->bb_free - count) < minblocks)
+			break;
+	}
+	ext4_unlock_group(sb, group);
+
+	ext4_debug("trimmed %d blocks in the group %d\n",
+		count, group);
+
+	if (ret < 0)
+		count = ret;
+
+	return count;
+}
+
+/**
+ * ext4_trim_fs() -- trim ioctl handle function
+ * @sb:			superblock for filesystem
+ * @range:		fstrim_range structure
+ *
+ * start:	First Byte to trim
+ * len:		number of Bytes to trim from start
+ * minlen:	minimum extent length in Bytes
+ * ext4_trim_fs goes through all allocation groups containing Bytes from
+ * start to start+len. For each such a group ext4_trim_all_free function
+ * is invoked to trim all free space.
+ */
+int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
+{
+	struct ext4_buddy e4b;
+	ext4_group_t first_group, last_group;
+	ext4_group_t group, ngroups = ext4_get_groups_count(sb);
+	ext4_grpblk_t cnt = 0, first_block, last_block;
+	uint64_t start, len, minlen, trimmed;
+	int ret = 0;
+
+	start = range->start >> sb->s_blocksize_bits;
+	len = range->len >> sb->s_blocksize_bits;
+	minlen = range->minlen >> sb->s_blocksize_bits;
+	trimmed = 0;
+
+	if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb)))
+		return -EINVAL;
+
+	/* Determine first and last group to examine based on start and len */
+	ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start,
+				     &first_group, &first_block);
+	ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) (start + len),
+				     &last_group, &last_block);
+	last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group;
+	last_block = EXT4_BLOCKS_PER_GROUP(sb);
+
+	if (first_group > last_group)
+		return -EINVAL;
+
+	for (group = first_group; group <= last_group; group++) {
+		ret = ext4_mb_load_buddy(sb, group, &e4b);
+		if (ret) {
+			ext4_error(sb, "Error in loading buddy "
+					"information for %u", group);
+			break;
+		}
+
+		if (len >= EXT4_BLOCKS_PER_GROUP(sb))
+			len -= (EXT4_BLOCKS_PER_GROUP(sb) - first_block);
+		else
+			last_block = len;
+
+		if (e4b.bd_info->bb_free >= minlen) {
+			cnt = ext4_trim_all_free(sb, &e4b, first_block,
+						last_block, minlen);
+			if (cnt < 0) {
+				ret = cnt;
+				ext4_mb_unload_buddy(&e4b);
+				break;
+			}
+		}
+		ext4_mb_unload_buddy(&e4b);
+		trimmed += cnt;
+		first_block = 0;
+	}
+	range->len = trimmed * sb->s_blocksize;
+
+	return ret;
+}
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 6f3a27ec30b..25f3a974b72 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -376,7 +376,7 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
 	 * We have the extent map build with the tmp inode.
 	 * Now copy the i_data across
 	 */
-	ei->i_flags |= EXT4_EXTENTS_FL;
+	ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS);
 	memcpy(ei->i_data, tmp_ei->i_data, sizeof(ei->i_data));
 
 	/*
@@ -412,7 +412,7 @@ static int free_ext_idx(handle_t *handle, struct inode *inode,
 	struct buffer_head *bh;
 	struct ext4_extent_header *eh;
 
-	block = idx_pblock(ix);
+	block = ext4_idx_pblock(ix);
 	bh = sb_bread(inode->i_sb, block);
 	if (!bh)
 		return -EIO;
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 52abfa12762..b9f3e7862f1 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -85,7 +85,7 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
 	if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) {
 		/* leaf block */
 		*extent = ++path[ppos].p_ext;
-		path[ppos].p_block = ext_pblock(path[ppos].p_ext);
+		path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);
 		return 0;
 	}
 
@@ -96,7 +96,7 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
 
 			/* index block */
 			path[ppos].p_idx++;
-			path[ppos].p_block = idx_pblock(path[ppos].p_idx);
+			path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
 			if (path[ppos+1].p_bh)
 				brelse(path[ppos+1].p_bh);
 			path[ppos+1].p_bh =
@@ -111,7 +111,7 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
 				path[cur_ppos].p_idx =
 					EXT_FIRST_INDEX(path[cur_ppos].p_hdr);
 				path[cur_ppos].p_block =
-					idx_pblock(path[cur_ppos].p_idx);
+					ext4_idx_pblock(path[cur_ppos].p_idx);
 				if (path[cur_ppos+1].p_bh)
 					brelse(path[cur_ppos+1].p_bh);
 				path[cur_ppos+1].p_bh = sb_bread(inode->i_sb,
@@ -133,7 +133,7 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
 			path[leaf_ppos].p_ext = *extent =
 				EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr);
 			path[leaf_ppos].p_block =
-					ext_pblock(path[leaf_ppos].p_ext);
+					ext4_ext_pblock(path[leaf_ppos].p_ext);
 			return 0;
 		}
 	}
@@ -148,17 +148,17 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
  */
 static int
 mext_check_null_inode(struct inode *inode1, struct inode *inode2,
-		const char *function)
+		      const char *function, unsigned int line)
 {
 	int ret = 0;
 
 	if (inode1 == NULL) {
-		__ext4_error(inode2->i_sb, function,
+		__ext4_error(inode2->i_sb, function, line,
 			"Both inodes should not be NULL: "
 			"inode1 NULL inode2 %lu", inode2->i_ino);
 		ret = -EIO;
 	} else if (inode2 == NULL) {
-		__ext4_error(inode1->i_sb, function,
+		__ext4_error(inode1->i_sb, function, line,
 			"Both inodes should not be NULL: "
 			"inode1 %lu inode2 NULL", inode1->i_ino);
 		ret = -EIO;
@@ -249,7 +249,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
 			 */
 			o_end->ee_block = end_ext->ee_block;
 			o_end->ee_len = end_ext->ee_len;
-			ext4_ext_store_pblock(o_end, ext_pblock(end_ext));
+			ext4_ext_store_pblock(o_end, ext4_ext_pblock(end_ext));
 		}
 
 		o_start->ee_len = start_ext->ee_len;
@@ -276,7 +276,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
 		 */
 		o_end->ee_block = end_ext->ee_block;
 		o_end->ee_len = end_ext->ee_len;
-		ext4_ext_store_pblock(o_end, ext_pblock(end_ext));
+		ext4_ext_store_pblock(o_end, ext4_ext_pblock(end_ext));
 
 		/*
 		 * Set 0 to the extent block if new_ext was
@@ -361,7 +361,7 @@ mext_insert_inside_block(struct ext4_extent *o_start,
 	/* Insert new entry */
 	if (new_ext->ee_len) {
 		o_start[i] = *new_ext;
-		ext4_ext_store_pblock(&o_start[i++], ext_pblock(new_ext));
+		ext4_ext_store_pblock(&o_start[i++], ext4_ext_pblock(new_ext));
 	}
 
 	/* Insert end entry */
@@ -488,7 +488,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
 	start_ext.ee_len = end_ext.ee_len = 0;
 
 	new_ext.ee_block = cpu_to_le32(*from);
-	ext4_ext_store_pblock(&new_ext, ext_pblock(dext));
+	ext4_ext_store_pblock(&new_ext, ext4_ext_pblock(dext));
 	new_ext.ee_len = dext->ee_len;
 	new_ext_alen = ext4_ext_get_actual_len(&new_ext);
 	new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1;
@@ -553,7 +553,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
 		copy_extent_status(oext, &end_ext);
 		end_ext_alen = ext4_ext_get_actual_len(&end_ext);
 		ext4_ext_store_pblock(&end_ext,
-			(ext_pblock(o_end) + oext_alen - end_ext_alen));
+			(ext4_ext_pblock(o_end) + oext_alen - end_ext_alen));
 		end_ext.ee_block =
 			cpu_to_le32(le32_to_cpu(o_end->ee_block) +
 			oext_alen - end_ext_alen);
@@ -604,7 +604,7 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
 	/* When tmp_dext is too large, pick up the target range. */
 	diff = donor_off - le32_to_cpu(tmp_dext->ee_block);
 
-	ext4_ext_store_pblock(tmp_dext, ext_pblock(tmp_dext) + diff);
+	ext4_ext_store_pblock(tmp_dext, ext4_ext_pblock(tmp_dext) + diff);
 	tmp_dext->ee_block =
 			cpu_to_le32(le32_to_cpu(tmp_dext->ee_block) + diff);
 	tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_dext->ee_len) - diff);
@@ -613,7 +613,7 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
 		tmp_dext->ee_len = cpu_to_le16(max_count);
 
 	orig_diff = orig_off - le32_to_cpu(tmp_oext->ee_block);
-	ext4_ext_store_pblock(tmp_oext, ext_pblock(tmp_oext) + orig_diff);
+	ext4_ext_store_pblock(tmp_oext, ext4_ext_pblock(tmp_oext) + orig_diff);
 
 	/* Adjust extent length if donor extent is larger than orig */
 	if (ext4_ext_get_actual_len(tmp_dext) >
@@ -1084,7 +1084,7 @@ mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
 
 	BUG_ON(inode1 == NULL && inode2 == NULL);
 
-	ret = mext_check_null_inode(inode1, inode2, __func__);
+	ret = mext_check_null_inode(inode1, inode2, __func__, __LINE__);
 	if (ret < 0)
 		goto out;
 
@@ -1121,7 +1121,7 @@ mext_inode_double_unlock(struct inode *inode1, struct inode *inode2)
 
 	BUG_ON(inode1 == NULL && inode2 == NULL);
 
-	ret = mext_check_null_inode(inode1, inode2, __func__);
+	ret = mext_check_null_inode(inode1, inode2, __func__, __LINE__);
 	if (ret < 0)
 		goto out;
 
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index a43e6617b35..92203b8a099 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -179,30 +179,6 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
 static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
 			     struct inode *inode);
 
-unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize)
-{
-	unsigned len = le16_to_cpu(dlen);
-
-	if (len == EXT4_MAX_REC_LEN || len == 0)
-		return blocksize;
-	return (len & 65532) | ((len & 3) << 16);
-}
-
-__le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
-{
-	if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3))
-		BUG();
-	if (len < 65536)
-		return cpu_to_le16(len);
-	if (len == blocksize) {
-		if (blocksize == 65536)
-			return cpu_to_le16(EXT4_MAX_REC_LEN);
-		else
-			return cpu_to_le16(0);
-	}
-	return cpu_to_le16((len & 65532) | ((len >> 16) & 3));
-}
-
 /*
  * p is at least 6 bytes before the end of page
  */
@@ -605,7 +581,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
 					   dir->i_sb->s_blocksize -
 					   EXT4_DIR_REC_LEN(0));
 	for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) {
-		if (!ext4_check_dir_entry("htree_dirblock_to_tree", dir, de, bh,
+		if (!ext4_check_dir_entry(dir, de, bh,
 					(block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
 						+((char *)de - bh->b_data))) {
 			/* On error, skip the f_pos to the next block. */
@@ -844,8 +820,7 @@ static inline int search_dirblock(struct buffer_head *bh,
 		if ((char *) de + namelen <= dlimit &&
 		    ext4_match (namelen, name, de)) {
 			/* found a match - just to be sure, do a full check */
-			if (!ext4_check_dir_entry("ext4_find_entry",
-						  dir, de, bh, offset))
+			if (!ext4_check_dir_entry(dir, de, bh, offset))
 				return -1;
 			*res_dir = de;
 			return 1;
@@ -881,6 +856,7 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
 	struct buffer_head *bh_use[NAMEI_RA_SIZE];
 	struct buffer_head *bh, *ret = NULL;
 	ext4_lblk_t start, block, b;
+	const u8 *name = d_name->name;
 	int ra_max = 0;		/* Number of bh's in the readahead
 				   buffer, bh_use[] */
 	int ra_ptr = 0;		/* Current index into readahead
@@ -895,6 +871,16 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
 	namelen = d_name->len;
 	if (namelen > EXT4_NAME_LEN)
 		return NULL;
+	if ((namelen <= 2) && (name[0] == '.') &&
+	    (name[1] == '.' || name[1] == '0')) {
+		/*
+		 * "." or ".." will only be in the first block
+		 * NFS may look up ".."; "." should be handled by the VFS
+		 */
+		block = start = 0;
+		nblocks = 1;
+		goto restart;
+	}
 	if (is_dx(dir)) {
 		bh = ext4_dx_find_entry(dir, d_name, res_dir, &err);
 		/*
@@ -985,55 +971,35 @@ cleanup_and_exit:
 static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name,
 		       struct ext4_dir_entry_2 **res_dir, int *err)
 {
-	struct super_block * sb;
+	struct super_block * sb = dir->i_sb;
 	struct dx_hash_info	hinfo;
-	u32 hash;
 	struct dx_frame frames[2], *frame;
-	struct ext4_dir_entry_2 *de, *top;
 	struct buffer_head *bh;
 	ext4_lblk_t block;
 	int retval;
-	int namelen = d_name->len;
-	const u8 *name = d_name->name;
 
-	sb = dir->i_sb;
-	/* NFS may look up ".." - look at dx_root directory block */
-	if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){
-		if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err)))
-			return NULL;
-	} else {
-		frame = frames;
-		frame->bh = NULL;			/* for dx_release() */
-		frame->at = (struct dx_entry *)frames;	/* hack for zero entry*/
-		dx_set_block(frame->at, 0);		/* dx_root block is 0 */
-	}
-	hash = hinfo.hash;
+	if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err)))
+		return NULL;
 	do {
 		block = dx_get_block(frame->at);
-		if (!(bh = ext4_bread (NULL,dir, block, 0, err)))
+		if (!(bh = ext4_bread(NULL, dir, block, 0, err)))
 			goto errout;
-		de = (struct ext4_dir_entry_2 *) bh->b_data;
-		top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize -
-				       EXT4_DIR_REC_LEN(0));
-		for (; de < top; de = ext4_next_entry(de, sb->s_blocksize)) {
-			int off = (block << EXT4_BLOCK_SIZE_BITS(sb))
-				  + ((char *) de - bh->b_data);
-
-			if (!ext4_check_dir_entry(__func__, dir, de, bh, off)) {
-				brelse(bh);
-				*err = ERR_BAD_DX_DIR;
-				goto errout;
-			}
 
-			if (ext4_match(namelen, name, de)) {
-				*res_dir = de;
-				dx_release(frames);
-				return bh;
-			}
+		retval = search_dirblock(bh, dir, d_name,
+					 block << EXT4_BLOCK_SIZE_BITS(sb),
+					 res_dir);
+		if (retval == 1) { 	/* Success! */
+			dx_release(frames);
+			return bh;
 		}
 		brelse(bh);
+		if (retval == -1) {
+			*err = ERR_BAD_DX_DIR;
+			goto errout;
+		}
+
 		/* Check to see if we should continue to search */
-		retval = ext4_htree_next_block(dir, hash, frame,
+		retval = ext4_htree_next_block(dir, hinfo.hash, frame,
 					       frames, NULL);
 		if (retval < 0) {
 			ext4_warning(sb,
@@ -1088,7 +1054,6 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
 struct dentry *ext4_get_parent(struct dentry *child)
 {
 	__u32 ino;
-	struct inode *inode;
 	static const struct qstr dotdot = {
 		.name = "..",
 		.len = 2,
@@ -1097,7 +1062,6 @@ struct dentry *ext4_get_parent(struct dentry *child)
 	struct buffer_head *bh;
 
 	bh = ext4_find_entry(child->d_inode, &dotdot, &de);
-	inode = NULL;
 	if (!bh)
 		return ERR_PTR(-ENOENT);
 	ino = le32_to_cpu(de->inode);
@@ -1305,8 +1269,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
 		de = (struct ext4_dir_entry_2 *)bh->b_data;
 		top = bh->b_data + blocksize - reclen;
 		while ((char *) de <= top) {
-			if (!ext4_check_dir_entry("ext4_add_entry", dir, de,
-						  bh, offset))
+			if (!ext4_check_dir_entry(dir, de, bh, offset))
 				return -EIO;
 			if (ext4_match(namelen, name, de))
 				return -EEXIST;
@@ -1673,7 +1636,7 @@ static int ext4_delete_entry(handle_t *handle,
 	pde = NULL;
 	de = (struct ext4_dir_entry_2 *) bh->b_data;
 	while (i < bh->b_size) {
-		if (!ext4_check_dir_entry("ext4_delete_entry", dir, de, bh, i))
+		if (!ext4_check_dir_entry(dir, de, bh, i))
 			return -EIO;
 		if (de == de_del)  {
 			BUFFER_TRACE(bh, "get_write_access");
@@ -1956,7 +1919,7 @@ static int empty_dir(struct inode *inode)
 			}
 			de = (struct ext4_dir_entry_2 *) bh->b_data;
 		}
-		if (!ext4_check_dir_entry("empty_dir", inode, de, bh, offset)) {
+		if (!ext4_check_dir_entry(inode, de, bh, offset)) {
 			de = (struct ext4_dir_entry_2 *)(bh->b_data +
 							 sb->s_blocksize);
 			offset = (offset | (sb->s_blocksize - 1)) + 1;
@@ -2340,7 +2303,7 @@ retry:
 
 	inode->i_ctime = ext4_current_time(inode);
 	ext4_inc_count(handle, inode);
-	atomic_inc(&inode->i_count);
+	ihold(inode);
 
 	err = ext4_add_entry(handle, dentry, inode);
 	if (!err) {
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
new file mode 100644
index 00000000000..46a7d6a9d97
--- /dev/null
+++ b/fs/ext4/page-io.c
@@ -0,0 +1,430 @@
+/*
+ * linux/fs/ext4/page-io.c
+ *
+ * This contains the new page_io functions for ext4
+ *
+ * Written by Theodore Ts'o, 2010.
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/time.h>
+#include <linux/jbd2.h>
+#include <linux/highuid.h>
+#include <linux/pagemap.h>
+#include <linux/quotaops.h>
+#include <linux/string.h>
+#include <linux/buffer_head.h>
+#include <linux/writeback.h>
+#include <linux/pagevec.h>
+#include <linux/mpage.h>
+#include <linux/namei.h>
+#include <linux/uio.h>
+#include <linux/bio.h>
+#include <linux/workqueue.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+
+#include "ext4_jbd2.h"
+#include "xattr.h"
+#include "acl.h"
+#include "ext4_extents.h"
+
+static struct kmem_cache *io_page_cachep, *io_end_cachep;
+
+int __init ext4_init_pageio(void)
+{
+	io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT);
+	if (io_page_cachep == NULL)
+		return -ENOMEM;
+	io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT);
+	if (io_page_cachep == NULL) {
+		kmem_cache_destroy(io_page_cachep);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+void ext4_exit_pageio(void)
+{
+	kmem_cache_destroy(io_end_cachep);
+	kmem_cache_destroy(io_page_cachep);
+}
+
+void ext4_free_io_end(ext4_io_end_t *io)
+{
+	int i;
+
+	BUG_ON(!io);
+	if (io->page)
+		put_page(io->page);
+	for (i = 0; i < io->num_io_pages; i++) {
+		if (--io->pages[i]->p_count == 0) {
+			struct page *page = io->pages[i]->p_page;
+
+			end_page_writeback(page);
+			put_page(page);
+			kmem_cache_free(io_page_cachep, io->pages[i]);
+		}
+	}
+	io->num_io_pages = 0;
+	iput(io->inode);
+	kmem_cache_free(io_end_cachep, io);
+}
+
+/*
+ * check a range of space and convert unwritten extents to written.
+ */
+int ext4_end_io_nolock(ext4_io_end_t *io)
+{
+	struct inode *inode = io->inode;
+	loff_t offset = io->offset;
+	ssize_t size = io->size;
+	int ret = 0;
+
+	ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
+		   "list->prev 0x%p\n",
+		   io, inode->i_ino, io->list.next, io->list.prev);
+
+	if (list_empty(&io->list))
+		return ret;
+
+	if (!(io->flag & EXT4_IO_END_UNWRITTEN))
+		return ret;
+
+	ret = ext4_convert_unwritten_extents(inode, offset, size);
+	if (ret < 0) {
+		printk(KERN_EMERG "%s: failed to convert unwritten "
+			"extents to written extents, error is %d "
+			"io is still on inode %lu aio dio list\n",
+		       __func__, ret, inode->i_ino);
+		return ret;
+	}
+
+	if (io->iocb)
+		aio_complete(io->iocb, io->result, 0);
+	/* clear the DIO AIO unwritten flag */
+	io->flag &= ~EXT4_IO_END_UNWRITTEN;
+	return ret;
+}
+
+/*
+ * work on completed aio dio IO, to convert unwritten extents to extents
+ */
+static void ext4_end_io_work(struct work_struct *work)
+{
+	ext4_io_end_t		*io = container_of(work, ext4_io_end_t, work);
+	struct inode		*inode = io->inode;
+	struct ext4_inode_info	*ei = EXT4_I(inode);
+	unsigned long		flags;
+	int			ret;
+
+	mutex_lock(&inode->i_mutex);
+	ret = ext4_end_io_nolock(io);
+	if (ret < 0) {
+		mutex_unlock(&inode->i_mutex);
+		return;
+	}
+
+	spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+	if (!list_empty(&io->list))
+		list_del_init(&io->list);
+	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+	mutex_unlock(&inode->i_mutex);
+	ext4_free_io_end(io);
+}
+
+ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
+{
+	ext4_io_end_t *io = NULL;
+
+	io = kmem_cache_alloc(io_end_cachep, flags);
+	if (io) {
+		memset(io, 0, sizeof(*io));
+		io->inode = igrab(inode);
+		BUG_ON(!io->inode);
+		INIT_WORK(&io->work, ext4_end_io_work);
+		INIT_LIST_HEAD(&io->list);
+	}
+	return io;
+}
+
+/*
+ * Print an buffer I/O error compatible with the fs/buffer.c.  This
+ * provides compatibility with dmesg scrapers that look for a specific
+ * buffer I/O error message.  We really need a unified error reporting
+ * structure to userspace ala Digital Unix's uerf system, but it's
+ * probably not going to happen in my lifetime, due to LKML politics...
+ */
+static void buffer_io_error(struct buffer_head *bh)
+{
+	char b[BDEVNAME_SIZE];
+	printk(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n",
+			bdevname(bh->b_bdev, b),
+			(unsigned long long)bh->b_blocknr);
+}
+
+static void ext4_end_bio(struct bio *bio, int error)
+{
+	ext4_io_end_t *io_end = bio->bi_private;
+	struct workqueue_struct *wq;
+	struct inode *inode;
+	unsigned long flags;
+	ext4_fsblk_t err_block;
+	int i;
+
+	BUG_ON(!io_end);
+	inode = io_end->inode;
+	bio->bi_private = NULL;
+	bio->bi_end_io = NULL;
+	if (test_bit(BIO_UPTODATE, &bio->bi_flags))
+		error = 0;
+	err_block = bio->bi_sector >> (inode->i_blkbits - 9);
+	bio_put(bio);
+
+	if (!(inode->i_sb->s_flags & MS_ACTIVE)) {
+		pr_err("sb umounted, discard end_io request for inode %lu\n",
+			io_end->inode->i_ino);
+		ext4_free_io_end(io_end);
+		return;
+	}
+
+	if (error) {
+		io_end->flag |= EXT4_IO_END_ERROR;
+		ext4_warning(inode->i_sb, "I/O error writing to inode %lu "
+			     "(offset %llu size %ld starting block %llu)",
+			     inode->i_ino,
+			     (unsigned long long) io_end->offset,
+			     (long) io_end->size,
+			     (unsigned long long) err_block);
+	}
+
+	for (i = 0; i < io_end->num_io_pages; i++) {
+		struct page *page = io_end->pages[i]->p_page;
+		struct buffer_head *bh, *head;
+		int partial_write = 0;
+
+		head = page_buffers(page);
+		if (error)
+			SetPageError(page);
+		BUG_ON(!head);
+		if (head->b_size == PAGE_CACHE_SIZE)
+			clear_buffer_dirty(head);
+		else {
+			loff_t offset;
+			loff_t io_end_offset = io_end->offset + io_end->size;
+
+			offset = (sector_t) page->index << PAGE_CACHE_SHIFT;
+			bh = head;
+			do {
+				if ((offset >= io_end->offset) &&
+				    (offset+bh->b_size <= io_end_offset)) {
+					if (error)
+						buffer_io_error(bh);
+
+					clear_buffer_dirty(bh);
+				}
+				if (buffer_delay(bh))
+					partial_write = 1;
+				else if (!buffer_mapped(bh))
+					clear_buffer_dirty(bh);
+				else if (buffer_dirty(bh))
+					partial_write = 1;
+				offset += bh->b_size;
+				bh = bh->b_this_page;
+			} while (bh != head);
+		}
+
+		if (--io_end->pages[i]->p_count == 0) {
+			struct page *page = io_end->pages[i]->p_page;
+
+			end_page_writeback(page);
+			put_page(page);
+			kmem_cache_free(io_page_cachep, io_end->pages[i]);
+		}
+
+		/*
+		 * If this is a partial write which happened to make
+		 * all buffers uptodate then we can optimize away a
+		 * bogus readpage() for the next read(). Here we
+		 * 'discover' whether the page went uptodate as a
+		 * result of this (potentially partial) write.
+		 */
+		if (!partial_write)
+			SetPageUptodate(page);
+	}
+
+	io_end->num_io_pages = 0;
+
+	/* Add the io_end to per-inode completed io list*/
+	spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
+	list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
+	spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
+
+	wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
+	/* queue the work to convert unwritten extents to written */
+	queue_work(wq, &io_end->work);
+}
+
+void ext4_io_submit(struct ext4_io_submit *io)
+{
+	struct bio *bio = io->io_bio;
+
+	if (bio) {
+		bio_get(io->io_bio);
+		submit_bio(io->io_op, io->io_bio);
+		BUG_ON(bio_flagged(io->io_bio, BIO_EOPNOTSUPP));
+		bio_put(io->io_bio);
+	}
+	io->io_bio = 0;
+	io->io_op = 0;
+	io->io_end = 0;
+}
+
+static int io_submit_init(struct ext4_io_submit *io,
+			  struct inode *inode,
+			  struct writeback_control *wbc,
+			  struct buffer_head *bh)
+{
+	ext4_io_end_t *io_end;
+	struct page *page = bh->b_page;
+	int nvecs = bio_get_nr_vecs(bh->b_bdev);
+	struct bio *bio;
+
+	io_end = ext4_init_io_end(inode, GFP_NOFS);
+	if (!io_end)
+		return -ENOMEM;
+	do {
+		bio = bio_alloc(GFP_NOIO, nvecs);
+		nvecs >>= 1;
+	} while (bio == NULL);
+
+	bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
+	bio->bi_bdev = bh->b_bdev;
+	bio->bi_private = io->io_end = io_end;
+	bio->bi_end_io = ext4_end_bio;
+
+	io_end->inode = inode;
+	io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh);
+
+	io->io_bio = bio;
+	io->io_op = (wbc->sync_mode == WB_SYNC_ALL ?
+			WRITE_SYNC_PLUG : WRITE);
+	io->io_next_block = bh->b_blocknr;
+	return 0;
+}
+
+static int io_submit_add_bh(struct ext4_io_submit *io,
+			    struct ext4_io_page *io_page,
+			    struct inode *inode,
+			    struct writeback_control *wbc,
+			    struct buffer_head *bh)
+{
+	ext4_io_end_t *io_end;
+	int ret;
+
+	if (buffer_new(bh)) {
+		clear_buffer_new(bh);
+		unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
+	}
+
+	if (!buffer_mapped(bh) || buffer_delay(bh)) {
+		if (!buffer_mapped(bh))
+			clear_buffer_dirty(bh);
+		if (io->io_bio)
+			ext4_io_submit(io);
+		return 0;
+	}
+
+	if (io->io_bio && bh->b_blocknr != io->io_next_block) {
+submit_and_retry:
+		ext4_io_submit(io);
+	}
+	if (io->io_bio == NULL) {
+		ret = io_submit_init(io, inode, wbc, bh);
+		if (ret)
+			return ret;
+	}
+	io_end = io->io_end;
+	if ((io_end->num_io_pages >= MAX_IO_PAGES) &&
+	    (io_end->pages[io_end->num_io_pages-1] != io_page))
+		goto submit_and_retry;
+	if (buffer_uninit(bh))
+		io->io_end->flag |= EXT4_IO_END_UNWRITTEN;
+	io->io_end->size += bh->b_size;
+	io->io_next_block++;
+	ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh));
+	if (ret != bh->b_size)
+		goto submit_and_retry;
+	if ((io_end->num_io_pages == 0) ||
+	    (io_end->pages[io_end->num_io_pages-1] != io_page)) {
+		io_end->pages[io_end->num_io_pages++] = io_page;
+		io_page->p_count++;
+	}
+	return 0;
+}
+
+int ext4_bio_write_page(struct ext4_io_submit *io,
+			struct page *page,
+			int len,
+			struct writeback_control *wbc)
+{
+	struct inode *inode = page->mapping->host;
+	unsigned block_start, block_end, blocksize;
+	struct ext4_io_page *io_page;
+	struct buffer_head *bh, *head;
+	int ret = 0;
+
+	blocksize = 1 << inode->i_blkbits;
+
+	BUG_ON(PageWriteback(page));
+	set_page_writeback(page);
+	ClearPageError(page);
+
+	io_page = kmem_cache_alloc(io_page_cachep, GFP_NOFS);
+	if (!io_page) {
+		set_page_dirty(page);
+		unlock_page(page);
+		return -ENOMEM;
+	}
+	io_page->p_page = page;
+	io_page->p_count = 0;
+	get_page(page);
+
+	for (bh = head = page_buffers(page), block_start = 0;
+	     bh != head || !block_start;
+	     block_start = block_end, bh = bh->b_this_page) {
+		block_end = block_start + blocksize;
+		if (block_start >= len) {
+			clear_buffer_dirty(bh);
+			set_buffer_uptodate(bh);
+			continue;
+		}
+		ret = io_submit_add_bh(io, io_page, inode, wbc, bh);
+		if (ret) {
+			/*
+			 * We only get here on ENOMEM.  Not much else
+			 * we can do but mark the page as dirty, and
+			 * better luck next time.
+			 */
+			set_page_dirty(page);
+			break;
+		}
+	}
+	unlock_page(page);
+	/*
+	 * If the page was truncated before we could do the writeback,
+	 * or we had a memory allocation error while trying to write
+	 * the first buffer head, we won't have submitted any pages for
+	 * I/O.  In that case we need to make sure we've cleared the
+	 * PageWriteback bit from the page to prevent the system from
+	 * wedging later on.
+	 */
+	if (io_page->p_count == 0) {
+		put_page(page);
+		end_page_writeback(page);
+		kmem_cache_free(io_page_cachep, io_page);
+	}
+	return ret;
+}
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 6df797eb9ae..dc963929de6 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -226,23 +226,13 @@ static int setup_new_group_blocks(struct super_block *sb,
 	}
 
 	/* Zero out all of the reserved backup group descriptor table blocks */
-	for (i = 0, bit = gdblocks + 1, block = start + bit;
-	     i < reserved_gdb; i++, block++, bit++) {
-		struct buffer_head *gdb;
-
-		ext4_debug("clear reserved block %#04llx (+%d)\n", block, bit);
-
-		if ((err = extend_or_restart_transaction(handle, 1, bh)))
-			goto exit_bh;
+	ext4_debug("clear inode table blocks %#04llx -> %#04llx\n",
+			block, sbi->s_itb_per_group);
+	err = sb_issue_zeroout(sb, gdblocks + start + 1, reserved_gdb,
+			       GFP_NOFS);
+	if (err)
+		goto exit_bh;
 
-		if (IS_ERR(gdb = bclean(handle, sb, block))) {
-			err = PTR_ERR(gdb);
-			goto exit_bh;
-		}
-		ext4_handle_dirty_metadata(handle, NULL, gdb);
-		ext4_set_bit(bit, bh->b_data);
-		brelse(gdb);
-	}
 	ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap,
 		   input->block_bitmap - start);
 	ext4_set_bit(input->block_bitmap - start, bh->b_data);
@@ -251,28 +241,18 @@ static int setup_new_group_blocks(struct super_block *sb,
 	ext4_set_bit(input->inode_bitmap - start, bh->b_data);
 
 	/* Zero out all of the inode table blocks */
-	for (i = 0, block = input->inode_table, bit = block - start;
-	     i < sbi->s_itb_per_group; i++, bit++, block++) {
-		struct buffer_head *it;
-
-		ext4_debug("clear inode block %#04llx (+%d)\n", block, bit);
-
-		if ((err = extend_or_restart_transaction(handle, 1, bh)))
-			goto exit_bh;
-
-		if (IS_ERR(it = bclean(handle, sb, block))) {
-			err = PTR_ERR(it);
-			goto exit_bh;
-		}
-		ext4_handle_dirty_metadata(handle, NULL, it);
-		brelse(it);
-		ext4_set_bit(bit, bh->b_data);
-	}
+	block = input->inode_table;
+	ext4_debug("clear inode table blocks %#04llx -> %#04llx\n",
+			block, sbi->s_itb_per_group);
+	err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS);
+	if (err)
+		goto exit_bh;
 
 	if ((err = extend_or_restart_transaction(handle, 2, bh)))
 		goto exit_bh;
 
-	mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, bh->b_data);
+	ext4_mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8,
+			     bh->b_data);
 	ext4_handle_dirty_metadata(handle, NULL, bh);
 	brelse(bh);
 	/* Mark unused entries in inode bitmap used */
@@ -283,8 +263,8 @@ static int setup_new_group_blocks(struct super_block *sb,
 		goto exit_journal;
 	}
 
-	mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
-			bh->b_data);
+	ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
+			     bh->b_data);
 	ext4_handle_dirty_metadata(handle, NULL, bh);
 exit_bh:
 	brelse(bh);
@@ -921,8 +901,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 			   &sbi->s_flex_groups[flex_group].free_inodes);
 	}
 
-	ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
-	sb->s_dirt = 1;
+	ext4_handle_dirty_super(handle, sb);
 
 exit_journal:
 	mutex_unlock(&sbi->s_resize_lock);
@@ -953,7 +932,6 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
 		      ext4_fsblk_t n_blocks_count)
 {
 	ext4_fsblk_t o_blocks_count;
-	ext4_group_t o_groups_count;
 	ext4_grpblk_t last;
 	ext4_grpblk_t add;
 	struct buffer_head *bh;
@@ -965,7 +943,6 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
 	 * yet: we're going to revalidate es->s_blocks_count after
 	 * taking the s_resize_lock below. */
 	o_blocks_count = ext4_blocks_count(es);
-	o_groups_count = EXT4_SB(sb)->s_groups_count;
 
 	if (test_opt(sb, DEBUG))
 		printk(KERN_DEBUG "EXT4-fs: extending last group from %llu uto %llu blocks\n",
@@ -1045,13 +1022,12 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
 		goto exit_put;
 	}
 	ext4_blocks_count_set(es, o_blocks_count + add);
-	ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
-	sb->s_dirt = 1;
 	mutex_unlock(&EXT4_SB(sb)->s_resize_lock);
 	ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
 		   o_blocks_count + add);
 	/* We add the blocks to the bitmap and set the group need init bit */
 	ext4_add_groupblocks(handle, sb, o_blocks_count, add);
+	ext4_handle_dirty_super(handle, sb);
 	ext4_debug("freed blocks %llu through %llu\n", o_blocks_count,
 		   o_blocks_count + add);
 	if ((err = ext4_journal_stop(handle)))
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 4e8983a9811..40131b777af 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -26,7 +26,6 @@
 #include <linux/init.h>
 #include <linux/blkdev.h>
 #include <linux/parser.h>
-#include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include <linux/exportfs.h>
 #include <linux/vfs.h>
@@ -41,6 +40,9 @@
 #include <linux/crc16.h>
 #include <asm/uaccess.h>
 
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+
 #include "ext4.h"
 #include "ext4_jbd2.h"
 #include "xattr.h"
@@ -50,8 +52,11 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/ext4.h>
 
-struct proc_dir_entry *ext4_proc_root;
+static struct proc_dir_entry *ext4_proc_root;
 static struct kset *ext4_kset;
+struct ext4_lazy_init *ext4_li_info;
+struct mutex ext4_li_mtx;
+struct ext4_features *ext4_feat;
 
 static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
 			     unsigned long journal_devnum);
@@ -68,14 +73,16 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
 static int ext4_unfreeze(struct super_block *sb);
 static void ext4_write_super(struct super_block *sb);
 static int ext4_freeze(struct super_block *sb);
-static int ext4_get_sb(struct file_system_type *fs_type, int flags,
-		       const char *dev_name, void *data, struct vfsmount *mnt);
+static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
+		       const char *dev_name, void *data);
+static void ext4_destroy_lazyinit_thread(void);
+static void ext4_unregister_li_request(struct super_block *sb);
 
 #if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
 static struct file_system_type ext3_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "ext3",
-	.get_sb		= ext4_get_sb,
+	.mount		= ext4_mount,
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
@@ -241,14 +248,14 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
 	if (sb->s_flags & MS_RDONLY)
 		return ERR_PTR(-EROFS);
 
-	vfs_check_frozen(sb, SB_FREEZE_WRITE);
+	vfs_check_frozen(sb, SB_FREEZE_TRANS);
 	/* Special case here: if the journal has aborted behind our
 	 * backs (eg. EIO in the commit thread), then we still need to
 	 * take the FS itself readonly cleanly. */
 	journal = EXT4_SB(sb)->s_journal;
 	if (journal) {
 		if (is_journal_aborted(journal)) {
-			ext4_abort(sb, __func__, "Detected aborted journal");
+			ext4_abort(sb, "Detected aborted journal");
 			return ERR_PTR(-EROFS);
 		}
 		return jbd2_journal_start(journal, nblocks);
@@ -262,7 +269,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
  * that sync() will call the filesystem's write_super callback if
  * appropriate.
  */
-int __ext4_journal_stop(const char *where, handle_t *handle)
+int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
 {
 	struct super_block *sb;
 	int err;
@@ -279,12 +286,13 @@ int __ext4_journal_stop(const char *where, handle_t *handle)
 	if (!err)
 		err = rc;
 	if (err)
-		__ext4_std_error(sb, where, err);
+		__ext4_std_error(sb, where, line, err);
 	return err;
 }
 
-void ext4_journal_abort_handle(const char *caller, const char *err_fn,
-		struct buffer_head *bh, handle_t *handle, int err)
+void ext4_journal_abort_handle(const char *caller, unsigned int line,
+			       const char *err_fn, struct buffer_head *bh,
+			       handle_t *handle, int err)
 {
 	char nbuf[16];
 	const char *errstr = ext4_decode_error(NULL, err, nbuf);
@@ -300,12 +308,47 @@ void ext4_journal_abort_handle(const char *caller, const char *err_fn,
 	if (is_handle_aborted(handle))
 		return;
 
-	printk(KERN_ERR "%s: aborting transaction: %s in %s\n",
-	       caller, errstr, err_fn);
+	printk(KERN_ERR "%s:%d: aborting transaction: %s in %s\n",
+	       caller, line, errstr, err_fn);
 
 	jbd2_journal_abort_handle(handle);
 }
 
+static void __save_error_info(struct super_block *sb, const char *func,
+			    unsigned int line)
+{
+	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+
+	EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
+	es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
+	es->s_last_error_time = cpu_to_le32(get_seconds());
+	strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func));
+	es->s_last_error_line = cpu_to_le32(line);
+	if (!es->s_first_error_time) {
+		es->s_first_error_time = es->s_last_error_time;
+		strncpy(es->s_first_error_func, func,
+			sizeof(es->s_first_error_func));
+		es->s_first_error_line = cpu_to_le32(line);
+		es->s_first_error_ino = es->s_last_error_ino;
+		es->s_first_error_block = es->s_last_error_block;
+	}
+	/*
+	 * Start the daily error reporting function if it hasn't been
+	 * started already
+	 */
+	if (!es->s_error_count)
+		mod_timer(&EXT4_SB(sb)->s_err_report, jiffies + 24*60*60*HZ);
+	es->s_error_count = cpu_to_le32(le32_to_cpu(es->s_error_count) + 1);
+}
+
+static void save_error_info(struct super_block *sb, const char *func,
+			    unsigned int line)
+{
+	__save_error_info(sb, func, line);
+	ext4_commit_super(sb, 1);
+}
+
+
 /* Deal with the reporting of failure conditions on a filesystem such as
  * inconsistencies detected or read IO failures.
  *
@@ -323,11 +366,6 @@ void ext4_journal_abort_handle(const char *caller, const char *err_fn,
 
 static void ext4_handle_error(struct super_block *sb)
 {
-	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
-
-	EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
-	es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
-
 	if (sb->s_flags & MS_RDONLY)
 		return;
 
@@ -342,19 +380,19 @@ static void ext4_handle_error(struct super_block *sb)
 		ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
 		sb->s_flags |= MS_RDONLY;
 	}
-	ext4_commit_super(sb, 1);
 	if (test_opt(sb, ERRORS_PANIC))
 		panic("EXT4-fs (device %s): panic forced after error\n",
 			sb->s_id);
 }
 
 void __ext4_error(struct super_block *sb, const char *function,
-		const char *fmt, ...)
+		  unsigned int line, const char *fmt, ...)
 {
 	va_list args;
 
 	va_start(args, fmt);
-	printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function);
+	printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: ",
+	       sb->s_id, function, line, current->comm);
 	vprintk(fmt, args);
 	printk("\n");
 	va_end(args);
@@ -362,14 +400,22 @@ void __ext4_error(struct super_block *sb, const char *function,
 	ext4_handle_error(sb);
 }
 
-void ext4_error_inode(const char *function, struct inode *inode,
+void ext4_error_inode(struct inode *inode, const char *function,
+		      unsigned int line, ext4_fsblk_t block,
 		      const char *fmt, ...)
 {
 	va_list args;
+	struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
 
+	es->s_last_error_ino = cpu_to_le32(inode->i_ino);
+	es->s_last_error_block = cpu_to_le64(block);
+	save_error_info(inode->i_sb, function, line);
 	va_start(args, fmt);
-	printk(KERN_CRIT "EXT4-fs error (device %s): %s: inode #%lu: (comm %s) ",
-	       inode->i_sb->s_id, function, inode->i_ino, current->comm);
+	printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: inode #%lu: ",
+	       inode->i_sb->s_id, function, line, inode->i_ino);
+	if (block)
+		printk("block %llu: ", block);
+	printk("comm %s: ", current->comm);
 	vprintk(fmt, args);
 	printk("\n");
 	va_end(args);
@@ -377,20 +423,26 @@ void ext4_error_inode(const char *function, struct inode *inode,
 	ext4_handle_error(inode->i_sb);
 }
 
-void ext4_error_file(const char *function, struct file *file,
-		     const char *fmt, ...)
+void ext4_error_file(struct file *file, const char *function,
+		     unsigned int line, const char *fmt, ...)
 {
 	va_list args;
+	struct ext4_super_block *es;
 	struct inode *inode = file->f_dentry->d_inode;
 	char pathname[80], *path;
 
+	es = EXT4_SB(inode->i_sb)->s_es;
+	es->s_last_error_ino = cpu_to_le32(inode->i_ino);
+	save_error_info(inode->i_sb, function, line);
 	va_start(args, fmt);
 	path = d_path(&(file->f_path), pathname, sizeof(pathname));
 	if (!path)
 		path = "(unknown)";
 	printk(KERN_CRIT
-	       "EXT4-fs error (device %s): %s: inode #%lu (comm %s path %s): ",
-	       inode->i_sb->s_id, function, inode->i_ino, current->comm, path);
+	       "EXT4-fs error (device %s): %s:%d: inode #%lu "
+	       "(comm %s path %s): ",
+	       inode->i_sb->s_id, function, line, inode->i_ino,
+	       current->comm, path);
 	vprintk(fmt, args);
 	printk("\n");
 	va_end(args);
@@ -435,7 +487,8 @@ static const char *ext4_decode_error(struct super_block *sb, int errno,
 /* __ext4_std_error decodes expected errors from journaling functions
  * automatically and invokes the appropriate error response.  */
 
-void __ext4_std_error(struct super_block *sb, const char *function, int errno)
+void __ext4_std_error(struct super_block *sb, const char *function,
+		      unsigned int line, int errno)
 {
 	char nbuf[16];
 	const char *errstr;
@@ -448,8 +501,9 @@ void __ext4_std_error(struct super_block *sb, const char *function, int errno)
 		return;
 
 	errstr = ext4_decode_error(sb, errno, nbuf);
-	printk(KERN_CRIT "EXT4-fs error (device %s) in %s: %s\n",
-	       sb->s_id, function, errstr);
+	printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n",
+	       sb->s_id, function, line, errstr);
+	save_error_info(sb, function, line);
 
 	ext4_handle_error(sb);
 }
@@ -464,29 +518,29 @@ void __ext4_std_error(struct super_block *sb, const char *function, int errno)
  * case we take the easy way out and panic immediately.
  */
 
-void ext4_abort(struct super_block *sb, const char *function,
-		const char *fmt, ...)
+void __ext4_abort(struct super_block *sb, const char *function,
+		unsigned int line, const char *fmt, ...)
 {
 	va_list args;
 
+	save_error_info(sb, function, line);
 	va_start(args, fmt);
-	printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function);
+	printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: ", sb->s_id,
+	       function, line);
 	vprintk(fmt, args);
 	printk("\n");
 	va_end(args);
 
+	if ((sb->s_flags & MS_RDONLY) == 0) {
+		ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
+		sb->s_flags |= MS_RDONLY;
+		EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
+		if (EXT4_SB(sb)->s_journal)
+			jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
+		save_error_info(sb, function, line);
+	}
 	if (test_opt(sb, ERRORS_PANIC))
 		panic("EXT4-fs panic from previous error\n");
-
-	if (sb->s_flags & MS_RDONLY)
-		return;
-
-	ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
-	EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
-	sb->s_flags |= MS_RDONLY;
-	EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
-	if (EXT4_SB(sb)->s_journal)
-		jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
 }
 
 void ext4_msg (struct super_block * sb, const char *prefix,
@@ -502,38 +556,47 @@ void ext4_msg (struct super_block * sb, const char *prefix,
 }
 
 void __ext4_warning(struct super_block *sb, const char *function,
-		  const char *fmt, ...)
+		    unsigned int line, const char *fmt, ...)
 {
 	va_list args;
 
 	va_start(args, fmt);
-	printk(KERN_WARNING "EXT4-fs warning (device %s): %s: ",
-	       sb->s_id, function);
+	printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: ",
+	       sb->s_id, function, line);
 	vprintk(fmt, args);
 	printk("\n");
 	va_end(args);
 }
 
-void ext4_grp_locked_error(struct super_block *sb, ext4_group_t grp,
-			   const char *function, const char *fmt, ...)
+void __ext4_grp_locked_error(const char *function, unsigned int line,
+			     struct super_block *sb, ext4_group_t grp,
+			     unsigned long ino, ext4_fsblk_t block,
+			     const char *fmt, ...)
 __releases(bitlock)
 __acquires(bitlock)
 {
 	va_list args;
 	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
 
+	es->s_last_error_ino = cpu_to_le32(ino);
+	es->s_last_error_block = cpu_to_le64(block);
+	__save_error_info(sb, function, line);
 	va_start(args, fmt);
-	printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function);
+	printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u",
+	       sb->s_id, function, line, grp);
+	if (ino)
+		printk("inode %lu: ", ino);
+	if (block)
+		printk("block %llu:", (unsigned long long) block);
 	vprintk(fmt, args);
 	printk("\n");
 	va_end(args);
 
 	if (test_opt(sb, ERRORS_CONT)) {
-		EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
-		es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
 		ext4_commit_super(sb, 0);
 		return;
 	}
+
 	ext4_unlock_group(sb, grp);
 	ext4_handle_error(sb);
 	/*
@@ -646,13 +709,13 @@ static void ext4_put_super(struct super_block *sb)
 	struct ext4_super_block *es = sbi->s_es;
 	int i, err;
 
+	ext4_unregister_li_request(sb);
 	dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
 
 	flush_workqueue(sbi->dio_unwritten_wq);
 	destroy_workqueue(sbi->dio_unwritten_wq);
 
 	lock_super(sb);
-	lock_kernel();
 	if (sb->s_dirt)
 		ext4_commit_super(sb, 1);
 
@@ -660,10 +723,10 @@ static void ext4_put_super(struct super_block *sb)
 		err = jbd2_journal_destroy(sbi->s_journal);
 		sbi->s_journal = NULL;
 		if (err < 0)
-			ext4_abort(sb, __func__,
-				   "Couldn't clean up the journal");
+			ext4_abort(sb, "Couldn't clean up the journal");
 	}
 
+	del_timer(&sbi->s_err_report);
 	ext4_release_system_zone(sb);
 	ext4_mb_release(sb);
 	ext4_ext_release(sb);
@@ -720,7 +783,6 @@ static void ext4_put_super(struct super_block *sb)
 	 * Now that we are completely done shutting down the
 	 * superblock, we need to actually destroy the kobject.
 	 */
-	unlock_kernel();
 	unlock_super(sb);
 	kobject_put(&sbi->s_kobj);
 	wait_for_completion(&sbi->s_kobj_unregister);
@@ -813,8 +875,10 @@ static void destroy_inodecache(void)
 	kmem_cache_destroy(ext4_inode_cachep);
 }
 
-static void ext4_clear_inode(struct inode *inode)
+void ext4_clear_inode(struct inode *inode)
 {
+	invalidate_inode_buffers(inode);
+	end_writeback(inode);
 	dquot_drop(inode);
 	ext4_discard_preallocations(inode);
 	if (EXT4_JOURNAL(inode))
@@ -946,14 +1010,12 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 		seq_puts(seq, ",journal_async_commit");
 	else if (test_opt(sb, JOURNAL_CHECKSUM))
 		seq_puts(seq, ",journal_checksum");
-	if (test_opt(sb, NOBH))
-		seq_puts(seq, ",nobh");
 	if (test_opt(sb, I_VERSION))
 		seq_puts(seq, ",i_version");
-	if (!test_opt(sb, DELALLOC))
+	if (!test_opt(sb, DELALLOC) &&
+	    !(def_mount_opts & EXT4_DEFM_NODELALLOC))
 		seq_puts(seq, ",nodelalloc");
 
-
 	if (sbi->s_stripe)
 		seq_printf(seq, ",stripe=%lu", sbi->s_stripe);
 	/*
@@ -977,7 +1039,7 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	if (test_opt(sb, NO_AUTO_DA_ALLOC))
 		seq_puts(seq, ",noauto_da_alloc");
 
-	if (test_opt(sb, DISCARD))
+	if (test_opt(sb, DISCARD) && !(def_mount_opts & EXT4_DEFM_DISCARD))
 		seq_puts(seq, ",discard");
 
 	if (test_opt(sb, NOLOAD))
@@ -986,6 +1048,16 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	if (test_opt(sb, DIOREAD_NOLOCK))
 		seq_puts(seq, ",dioread_nolock");
 
+	if (test_opt(sb, BLOCK_VALIDITY) &&
+	    !(def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY))
+		seq_puts(seq, ",block_validity");
+
+	if (!test_opt(sb, INIT_INODE_TABLE))
+		seq_puts(seq, ",noinit_inode_table");
+	else if (sbi->s_li_wait_mult)
+		seq_printf(seq, ",init_inode_table=%u",
+			   (unsigned) sbi->s_li_wait_mult);
+
 	ext4_show_quota_options(seq, sb);
 
 	return 0;
@@ -1065,6 +1137,7 @@ static int ext4_mark_dquot_dirty(struct dquot *dquot);
 static int ext4_write_info(struct super_block *sb, int type);
 static int ext4_quota_on(struct super_block *sb, int type, int format_id,
 				char *path);
+static int ext4_quota_off(struct super_block *sb, int type);
 static int ext4_quota_on_mount(struct super_block *sb, int type);
 static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
 			       size_t len, loff_t off);
@@ -1086,7 +1159,7 @@ static const struct dquot_operations ext4_quota_operations = {
 
 static const struct quotactl_ops ext4_qctl_operations = {
 	.quota_on	= ext4_quota_on,
-	.quota_off	= dquot_quota_off,
+	.quota_off	= ext4_quota_off,
 	.quota_sync	= dquot_quota_sync,
 	.get_info	= dquot_get_dqinfo,
 	.set_info	= dquot_set_dqinfo,
@@ -1100,20 +1173,20 @@ static const struct super_operations ext4_sops = {
 	.destroy_inode	= ext4_destroy_inode,
 	.write_inode	= ext4_write_inode,
 	.dirty_inode	= ext4_dirty_inode,
-	.delete_inode	= ext4_delete_inode,
+	.evict_inode	= ext4_evict_inode,
 	.put_super	= ext4_put_super,
 	.sync_fs	= ext4_sync_fs,
 	.freeze_fs	= ext4_freeze,
 	.unfreeze_fs	= ext4_unfreeze,
 	.statfs		= ext4_statfs,
 	.remount_fs	= ext4_remount,
-	.clear_inode	= ext4_clear_inode,
 	.show_options	= ext4_show_options,
 #ifdef CONFIG_QUOTA
 	.quota_read	= ext4_quota_read,
 	.quota_write	= ext4_quota_write,
 #endif
 	.bdev_try_to_free_page = bdev_try_to_free_page,
+	.trim_fs	= ext4_trim_fs
 };
 
 static const struct super_operations ext4_nojournal_sops = {
@@ -1121,12 +1194,11 @@ static const struct super_operations ext4_nojournal_sops = {
 	.destroy_inode	= ext4_destroy_inode,
 	.write_inode	= ext4_write_inode,
 	.dirty_inode	= ext4_dirty_inode,
-	.delete_inode	= ext4_delete_inode,
+	.evict_inode	= ext4_evict_inode,
 	.write_super	= ext4_write_super,
 	.put_super	= ext4_put_super,
 	.statfs		= ext4_statfs,
 	.remount_fs	= ext4_remount,
-	.clear_inode	= ext4_clear_inode,
 	.show_options	= ext4_show_options,
 #ifdef CONFIG_QUOTA
 	.quota_read	= ext4_quota_read,
@@ -1161,6 +1233,7 @@ enum {
 	Opt_inode_readahead_blks, Opt_journal_ioprio,
 	Opt_dioread_nolock, Opt_dioread_lock,
 	Opt_discard, Opt_nodiscard,
+	Opt_init_inode_table, Opt_noinit_inode_table,
 };
 
 static const match_table_t tokens = {
@@ -1231,6 +1304,9 @@ static const match_table_t tokens = {
 	{Opt_dioread_lock, "dioread_lock"},
 	{Opt_discard, "discard"},
 	{Opt_nodiscard, "nodiscard"},
+	{Opt_init_inode_table, "init_itable=%u"},
+	{Opt_init_inode_table, "init_itable"},
+	{Opt_noinit_inode_table, "noinit_itable"},
 	{Opt_err, NULL},
 };
 
@@ -1624,10 +1700,12 @@ set_qf_format:
 			*n_blocks_count = option;
 			break;
 		case Opt_nobh:
-			set_opt(sbi->s_mount_opt, NOBH);
+			ext4_msg(sb, KERN_WARNING,
+				 "Ignoring deprecated nobh option");
 			break;
 		case Opt_bh:
-			clear_opt(sbi->s_mount_opt, NOBH);
+			ext4_msg(sb, KERN_WARNING,
+				 "Ignoring deprecated bh option");
 			break;
 		case Opt_i_version:
 			set_opt(sbi->s_mount_opt, I_VERSION);
@@ -1699,6 +1777,20 @@ set_qf_format:
 		case Opt_dioread_lock:
 			clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
 			break;
+		case Opt_init_inode_table:
+			set_opt(sbi->s_mount_opt, INIT_INODE_TABLE);
+			if (args[0].from) {
+				if (match_int(&args[0], &option))
+					return 0;
+			} else
+				option = EXT4_DEF_LI_WAIT_MULT;
+			if (option < 0)
+				return 0;
+			sbi->s_li_wait_mult = option;
+			break;
+		case Opt_noinit_inode_table:
+			clear_opt(sbi->s_mount_opt, INIT_INODE_TABLE);
+			break;
 		default:
 			ext4_msg(sb, KERN_ERR,
 			       "Unrecognized mount option \"%s\" "
@@ -1882,7 +1974,8 @@ int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 block_group,
 }
 
 /* Called at mount-time, super-block is locked */
-static int ext4_check_descriptors(struct super_block *sb)
+static int ext4_check_descriptors(struct super_block *sb,
+				  ext4_group_t *first_not_zeroed)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block);
@@ -1891,7 +1984,7 @@ static int ext4_check_descriptors(struct super_block *sb)
 	ext4_fsblk_t inode_bitmap;
 	ext4_fsblk_t inode_table;
 	int flexbg_flag = 0;
-	ext4_group_t i;
+	ext4_group_t i, grp = sbi->s_groups_count;
 
 	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
 		flexbg_flag = 1;
@@ -1907,6 +2000,10 @@ static int ext4_check_descriptors(struct super_block *sb)
 			last_block = first_block +
 				(EXT4_BLOCKS_PER_GROUP(sb) - 1);
 
+		if ((grp == sbi->s_groups_count) &&
+		   !(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
+			grp = i;
+
 		block_bitmap = ext4_block_bitmap(sb, gdp);
 		if (block_bitmap < first_block || block_bitmap > last_block) {
 			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
@@ -1944,6 +2041,8 @@ static int ext4_check_descriptors(struct super_block *sb)
 		if (!flexbg_flag)
 			first_block += EXT4_BLOCKS_PER_GROUP(sb);
 	}
+	if (NULL != first_not_zeroed)
+		*first_not_zeroed = grp;
 
 	ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb));
 	sbi->s_es->s_free_inodes_count =cpu_to_le32(ext4_count_free_inodes(sb));
@@ -2249,6 +2348,8 @@ static ssize_t session_write_kbytes_show(struct ext4_attr *a,
 {
 	struct super_block *sb = sbi->s_buddy_cache->i_sb;
 
+	if (!sb->s_bdev->bd_part)
+		return snprintf(buf, PAGE_SIZE, "0\n");
 	return snprintf(buf, PAGE_SIZE, "%lu\n",
 			(part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
 			 sbi->s_sectors_written_start) >> 1);
@@ -2259,6 +2360,8 @@ static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a,
 {
 	struct super_block *sb = sbi->s_buddy_cache->i_sb;
 
+	if (!sb->s_bdev->bd_part)
+		return snprintf(buf, PAGE_SIZE, "0\n");
 	return snprintf(buf, PAGE_SIZE, "%llu\n",
 			(unsigned long long)(sbi->s_kbytes_written +
 			((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
@@ -2312,6 +2415,7 @@ static struct ext4_attr ext4_attr_##_name = {			\
 #define EXT4_ATTR(name, mode, show, store) \
 static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
 
+#define EXT4_INFO_ATTR(name) EXT4_ATTR(name, 0444, NULL, NULL)
 #define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL)
 #define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store)
 #define EXT4_RW_ATTR_SBI_UI(name, elname)	\
@@ -2348,6 +2452,16 @@ static struct attribute *ext4_attrs[] = {
 	NULL,
 };
 
+/* Features this copy of ext4 supports */
+EXT4_INFO_ATTR(lazy_itable_init);
+EXT4_INFO_ATTR(batched_discard);
+
+static struct attribute *ext4_feat_attrs[] = {
+	ATTR_LIST(lazy_itable_init),
+	ATTR_LIST(batched_discard),
+	NULL,
+};
+
 static ssize_t ext4_attr_show(struct kobject *kobj,
 			      struct attribute *attr, char *buf)
 {
@@ -2376,7 +2490,6 @@ static void ext4_sb_release(struct kobject *kobj)
 	complete(&sbi->s_kobj_unregister);
 }
 
-
 static const struct sysfs_ops ext4_attr_ops = {
 	.show	= ext4_attr_show,
 	.store	= ext4_attr_store,
@@ -2388,6 +2501,17 @@ static struct kobj_type ext4_ktype = {
 	.release	= ext4_sb_release,
 };
 
+static void ext4_feat_release(struct kobject *kobj)
+{
+	complete(&ext4_feat->f_kobj_unregister);
+}
+
+static struct kobj_type ext4_feat_ktype = {
+	.default_attrs	= ext4_feat_attrs,
+	.sysfs_ops	= &ext4_attr_ops,
+	.release	= ext4_feat_release,
+};
+
 /*
  * Check whether this filesystem can be mounted based on
  * the features present and the RDONLY/RDWR mount requested.
@@ -2431,6 +2555,419 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly)
 	return 1;
 }
 
+/*
+ * This function is called once a day if we have errors logged
+ * on the file system
+ */
+static void print_daily_error_info(unsigned long arg)
+{
+	struct super_block *sb = (struct super_block *) arg;
+	struct ext4_sb_info *sbi;
+	struct ext4_super_block *es;
+
+	sbi = EXT4_SB(sb);
+	es = sbi->s_es;
+
+	if (es->s_error_count)
+		ext4_msg(sb, KERN_NOTICE, "error count: %u",
+			 le32_to_cpu(es->s_error_count));
+	if (es->s_first_error_time) {
+		printk(KERN_NOTICE "EXT4-fs (%s): initial error at %u: %.*s:%d",
+		       sb->s_id, le32_to_cpu(es->s_first_error_time),
+		       (int) sizeof(es->s_first_error_func),
+		       es->s_first_error_func,
+		       le32_to_cpu(es->s_first_error_line));
+		if (es->s_first_error_ino)
+			printk(": inode %u",
+			       le32_to_cpu(es->s_first_error_ino));
+		if (es->s_first_error_block)
+			printk(": block %llu", (unsigned long long)
+			       le64_to_cpu(es->s_first_error_block));
+		printk("\n");
+	}
+	if (es->s_last_error_time) {
+		printk(KERN_NOTICE "EXT4-fs (%s): last error at %u: %.*s:%d",
+		       sb->s_id, le32_to_cpu(es->s_last_error_time),
+		       (int) sizeof(es->s_last_error_func),
+		       es->s_last_error_func,
+		       le32_to_cpu(es->s_last_error_line));
+		if (es->s_last_error_ino)
+			printk(": inode %u",
+			       le32_to_cpu(es->s_last_error_ino));
+		if (es->s_last_error_block)
+			printk(": block %llu", (unsigned long long)
+			       le64_to_cpu(es->s_last_error_block));
+		printk("\n");
+	}
+	mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ);  /* Once a day */
+}
+
+static void ext4_lazyinode_timeout(unsigned long data)
+{
+	struct task_struct *p = (struct task_struct *)data;
+	wake_up_process(p);
+}
+
+/* Find next suitable group and run ext4_init_inode_table */
+static int ext4_run_li_request(struct ext4_li_request *elr)
+{
+	struct ext4_group_desc *gdp = NULL;
+	ext4_group_t group, ngroups;
+	struct super_block *sb;
+	unsigned long timeout = 0;
+	int ret = 0;
+
+	sb = elr->lr_super;
+	ngroups = EXT4_SB(sb)->s_groups_count;
+
+	for (group = elr->lr_next_group; group < ngroups; group++) {
+		gdp = ext4_get_group_desc(sb, group, NULL);
+		if (!gdp) {
+			ret = 1;
+			break;
+		}
+
+		if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
+			break;
+	}
+
+	if (group == ngroups)
+		ret = 1;
+
+	if (!ret) {
+		timeout = jiffies;
+		ret = ext4_init_inode_table(sb, group,
+					    elr->lr_timeout ? 0 : 1);
+		if (elr->lr_timeout == 0) {
+			timeout = jiffies - timeout;
+			if (elr->lr_sbi->s_li_wait_mult)
+				timeout *= elr->lr_sbi->s_li_wait_mult;
+			else
+				timeout *= 20;
+			elr->lr_timeout = timeout;
+		}
+		elr->lr_next_sched = jiffies + elr->lr_timeout;
+		elr->lr_next_group = group + 1;
+	}
+
+	return ret;
+}
+
+/*
+ * Remove lr_request from the list_request and free the
+ * request tructure. Should be called with li_list_mtx held
+ */
+static void ext4_remove_li_request(struct ext4_li_request *elr)
+{
+	struct ext4_sb_info *sbi;
+
+	if (!elr)
+		return;
+
+	sbi = elr->lr_sbi;
+
+	list_del(&elr->lr_request);
+	sbi->s_li_request = NULL;
+	kfree(elr);
+}
+
+static void ext4_unregister_li_request(struct super_block *sb)
+{
+	struct ext4_li_request *elr = EXT4_SB(sb)->s_li_request;
+
+	if (!ext4_li_info)
+		return;
+
+	mutex_lock(&ext4_li_info->li_list_mtx);
+	ext4_remove_li_request(elr);
+	mutex_unlock(&ext4_li_info->li_list_mtx);
+}
+
+/*
+ * This is the function where ext4lazyinit thread lives. It walks
+ * through the request list searching for next scheduled filesystem.
+ * When such a fs is found, run the lazy initialization request
+ * (ext4_rn_li_request) and keep track of the time spend in this
+ * function. Based on that time we compute next schedule time of
+ * the request. When walking through the list is complete, compute
+ * next waking time and put itself into sleep.
+ */
+static int ext4_lazyinit_thread(void *arg)
+{
+	struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg;
+	struct list_head *pos, *n;
+	struct ext4_li_request *elr;
+	unsigned long next_wakeup;
+	DEFINE_WAIT(wait);
+	int ret;
+
+	BUG_ON(NULL == eli);
+
+	eli->li_timer.data = (unsigned long)current;
+	eli->li_timer.function = ext4_lazyinode_timeout;
+
+	eli->li_task = current;
+	wake_up(&eli->li_wait_task);
+
+cont_thread:
+	while (true) {
+		next_wakeup = MAX_JIFFY_OFFSET;
+
+		mutex_lock(&eli->li_list_mtx);
+		if (list_empty(&eli->li_request_list)) {
+			mutex_unlock(&eli->li_list_mtx);
+			goto exit_thread;
+		}
+
+		list_for_each_safe(pos, n, &eli->li_request_list) {
+			elr = list_entry(pos, struct ext4_li_request,
+					 lr_request);
+
+			if (time_after_eq(jiffies, elr->lr_next_sched))
+				ret = ext4_run_li_request(elr);
+
+			if (ret) {
+				ret = 0;
+				ext4_remove_li_request(elr);
+				continue;
+			}
+
+			if (time_before(elr->lr_next_sched, next_wakeup))
+				next_wakeup = elr->lr_next_sched;
+		}
+		mutex_unlock(&eli->li_list_mtx);
+
+		if (freezing(current))
+			refrigerator();
+
+		if (time_after_eq(jiffies, next_wakeup)) {
+			cond_resched();
+			continue;
+		}
+
+		eli->li_timer.expires = next_wakeup;
+		add_timer(&eli->li_timer);
+		prepare_to_wait(&eli->li_wait_daemon, &wait,
+				TASK_INTERRUPTIBLE);
+		if (time_before(jiffies, next_wakeup))
+			schedule();
+		finish_wait(&eli->li_wait_daemon, &wait);
+	}
+
+exit_thread:
+	/*
+	 * It looks like the request list is empty, but we need
+	 * to check it under the li_list_mtx lock, to prevent any
+	 * additions into it, and of course we should lock ext4_li_mtx
+	 * to atomically free the list and ext4_li_info, because at
+	 * this point another ext4 filesystem could be registering
+	 * new one.
+	 */
+	mutex_lock(&ext4_li_mtx);
+	mutex_lock(&eli->li_list_mtx);
+	if (!list_empty(&eli->li_request_list)) {
+		mutex_unlock(&eli->li_list_mtx);
+		mutex_unlock(&ext4_li_mtx);
+		goto cont_thread;
+	}
+	mutex_unlock(&eli->li_list_mtx);
+	del_timer_sync(&ext4_li_info->li_timer);
+	eli->li_task = NULL;
+	wake_up(&eli->li_wait_task);
+
+	kfree(ext4_li_info);
+	ext4_li_info = NULL;
+	mutex_unlock(&ext4_li_mtx);
+
+	return 0;
+}
+
+static void ext4_clear_request_list(void)
+{
+	struct list_head *pos, *n;
+	struct ext4_li_request *elr;
+
+	mutex_lock(&ext4_li_info->li_list_mtx);
+	if (list_empty(&ext4_li_info->li_request_list))
+		return;
+
+	list_for_each_safe(pos, n, &ext4_li_info->li_request_list) {
+		elr = list_entry(pos, struct ext4_li_request,
+				 lr_request);
+		ext4_remove_li_request(elr);
+	}
+	mutex_unlock(&ext4_li_info->li_list_mtx);
+}
+
+static int ext4_run_lazyinit_thread(void)
+{
+	struct task_struct *t;
+
+	t = kthread_run(ext4_lazyinit_thread, ext4_li_info, "ext4lazyinit");
+	if (IS_ERR(t)) {
+		int err = PTR_ERR(t);
+		ext4_clear_request_list();
+		del_timer_sync(&ext4_li_info->li_timer);
+		kfree(ext4_li_info);
+		ext4_li_info = NULL;
+		printk(KERN_CRIT "EXT4: error %d creating inode table "
+				 "initialization thread\n",
+				 err);
+		return err;
+	}
+	ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING;
+
+	wait_event(ext4_li_info->li_wait_task, ext4_li_info->li_task != NULL);
+	return 0;
+}
+
+/*
+ * Check whether it make sense to run itable init. thread or not.
+ * If there is at least one uninitialized inode table, return
+ * corresponding group number, else the loop goes through all
+ * groups and return total number of groups.
+ */
+static ext4_group_t ext4_has_uninit_itable(struct super_block *sb)
+{
+	ext4_group_t group, ngroups = EXT4_SB(sb)->s_groups_count;
+	struct ext4_group_desc *gdp = NULL;
+
+	for (group = 0; group < ngroups; group++) {
+		gdp = ext4_get_group_desc(sb, group, NULL);
+		if (!gdp)
+			continue;
+
+		if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
+			break;
+	}
+
+	return group;
+}
+
+static int ext4_li_info_new(void)
+{
+	struct ext4_lazy_init *eli = NULL;
+
+	eli = kzalloc(sizeof(*eli), GFP_KERNEL);
+	if (!eli)
+		return -ENOMEM;
+
+	eli->li_task = NULL;
+	INIT_LIST_HEAD(&eli->li_request_list);
+	mutex_init(&eli->li_list_mtx);
+
+	init_waitqueue_head(&eli->li_wait_daemon);
+	init_waitqueue_head(&eli->li_wait_task);
+	init_timer(&eli->li_timer);
+	eli->li_state |= EXT4_LAZYINIT_QUIT;
+
+	ext4_li_info = eli;
+
+	return 0;
+}
+
+static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
+					    ext4_group_t start)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_li_request *elr;
+	unsigned long rnd;
+
+	elr = kzalloc(sizeof(*elr), GFP_KERNEL);
+	if (!elr)
+		return NULL;
+
+	elr->lr_super = sb;
+	elr->lr_sbi = sbi;
+	elr->lr_next_group = start;
+
+	/*
+	 * Randomize first schedule time of the request to
+	 * spread the inode table initialization requests
+	 * better.
+	 */
+	get_random_bytes(&rnd, sizeof(rnd));
+	elr->lr_next_sched = jiffies + (unsigned long)rnd %
+			     (EXT4_DEF_LI_MAX_START_DELAY * HZ);
+
+	return elr;
+}
+
+static int ext4_register_li_request(struct super_block *sb,
+				    ext4_group_t first_not_zeroed)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_li_request *elr;
+	ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
+	int ret;
+
+	if (sbi->s_li_request != NULL)
+		return 0;
+
+	if (first_not_zeroed == ngroups ||
+	    (sb->s_flags & MS_RDONLY) ||
+	    !test_opt(sb, INIT_INODE_TABLE)) {
+		sbi->s_li_request = NULL;
+		return 0;
+	}
+
+	if (first_not_zeroed == ngroups) {
+		sbi->s_li_request = NULL;
+		return 0;
+	}
+
+	elr = ext4_li_request_new(sb, first_not_zeroed);
+	if (!elr)
+		return -ENOMEM;
+
+	mutex_lock(&ext4_li_mtx);
+
+	if (NULL == ext4_li_info) {
+		ret = ext4_li_info_new();
+		if (ret)
+			goto out;
+	}
+
+	mutex_lock(&ext4_li_info->li_list_mtx);
+	list_add(&elr->lr_request, &ext4_li_info->li_request_list);
+	mutex_unlock(&ext4_li_info->li_list_mtx);
+
+	sbi->s_li_request = elr;
+
+	if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) {
+		ret = ext4_run_lazyinit_thread();
+		if (ret)
+			goto out;
+	}
+out:
+	mutex_unlock(&ext4_li_mtx);
+	if (ret)
+		kfree(elr);
+	return ret;
+}
+
+/*
+ * We do not need to lock anything since this is called on
+ * module unload.
+ */
+static void ext4_destroy_lazyinit_thread(void)
+{
+	/*
+	 * If thread exited earlier
+	 * there's nothing to be done.
+	 */
+	if (!ext4_li_info)
+		return;
+
+	ext4_clear_request_list();
+
+	while (ext4_li_info->li_task) {
+		wake_up(&ext4_li_info->li_wait_daemon);
+		wait_event(ext4_li_info->li_wait_task,
+			   ext4_li_info->li_task == NULL);
+	}
+}
+
 static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 				__releases(kernel_lock)
 				__acquires(kernel_lock)
@@ -2448,7 +2985,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	struct inode *root;
 	char *cp;
 	const char *descr;
-	int ret = -EINVAL;
+	int ret = -ENOMEM;
 	int blocksize;
 	unsigned int db_count;
 	unsigned int i;
@@ -2456,16 +2993,17 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	__u64 blocks_count;
 	int err;
 	unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
+	ext4_group_t first_not_zeroed;
 
 	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
 	if (!sbi)
-		return -ENOMEM;
+		goto out_free_orig;
 
 	sbi->s_blockgroup_lock =
 		kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
 	if (!sbi->s_blockgroup_lock) {
 		kfree(sbi);
-		return -ENOMEM;
+		goto out_free_orig;
 	}
 	sb->s_fs_info = sbi;
 	sbi->s_mount_opt = 0;
@@ -2473,15 +3011,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	sbi->s_resgid = EXT4_DEF_RESGID;
 	sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
 	sbi->s_sb_block = sb_block;
-	sbi->s_sectors_written_start = part_stat_read(sb->s_bdev->bd_part,
-						      sectors[1]);
-
-	unlock_kernel();
+	if (sb->s_bdev->bd_part)
+		sbi->s_sectors_written_start =
+			part_stat_read(sb->s_bdev->bd_part, sectors[1]);
 
 	/* Cleanup superblock name */
 	for (cp = sb->s_id; (cp = strchr(cp, '/'));)
 		*cp = '!';
 
+	ret = -EINVAL;
 	blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE);
 	if (!blocksize) {
 		ext4_msg(sb, KERN_ERR, "unable to set blocksize");
@@ -2516,6 +3054,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 
 	/* Set defaults before we parse the mount options */
 	def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
+	set_opt(sbi->s_mount_opt, INIT_INODE_TABLE);
 	if (def_mount_opts & EXT4_DEFM_DEBUG)
 		set_opt(sbi->s_mount_opt, DEBUG);
 	if (def_mount_opts & EXT4_DEFM_BSDGROUPS) {
@@ -2546,6 +3085,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		set_opt(sbi->s_mount_opt, ERRORS_CONT);
 	else
 		set_opt(sbi->s_mount_opt, ERRORS_RO);
+	if (def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY)
+		set_opt(sbi->s_mount_opt, BLOCK_VALIDITY);
+	if (def_mount_opts & EXT4_DEFM_DISCARD)
+		set_opt(sbi->s_mount_opt, DISCARD);
 
 	sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
 	sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
@@ -2553,15 +3096,23 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
 	sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
 
-	set_opt(sbi->s_mount_opt, BARRIER);
+	if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0)
+		set_opt(sbi->s_mount_opt, BARRIER);
 
 	/*
 	 * enable delayed allocation by default
 	 * Use -o nodelalloc to turn it off
 	 */
-	if (!IS_EXT3_SB(sb))
+	if (!IS_EXT3_SB(sb) &&
+	    ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0))
 		set_opt(sbi->s_mount_opt, DELALLOC);
 
+	if (!parse_options((char *) sbi->s_es->s_mount_opts, sb,
+			   &journal_devnum, &journal_ioprio, NULL, 0)) {
+		ext4_msg(sb, KERN_WARNING,
+			 "failed to parse options in superblock: %s",
+			 sbi->s_es->s_mount_opts);
+	}
 	if (!parse_options((char *) data, sb, &journal_devnum,
 			   &journal_ioprio, NULL, 0))
 		goto failed_mount;
@@ -2706,15 +3257,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	 * Test whether we have more sectors than will fit in sector_t,
 	 * and whether the max offset is addressable by the page cache.
 	 */
-	if ((ext4_blocks_count(es) >
-	     (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) ||
-	    (ext4_blocks_count(es) >
-	     (pgoff_t)(~0ULL) >> (PAGE_CACHE_SHIFT - sb->s_blocksize_bits))) {
+	ret = generic_check_addressable(sb->s_blocksize_bits,
+					ext4_blocks_count(es));
+	if (ret) {
 		ext4_msg(sb, KERN_ERR, "filesystem"
 			 " too large to mount safely on this system");
 		if (sizeof(sector_t) < 8)
 			ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled");
-		ret = -EFBIG;
 		goto failed_mount;
 	}
 
@@ -2783,7 +3332,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 			goto failed_mount2;
 		}
 	}
-	if (!ext4_check_descriptors(sb)) {
+	if (!ext4_check_descriptors(sb, &first_not_zeroed)) {
 		ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
 		goto failed_mount2;
 	}
@@ -2912,18 +3461,7 @@ no_journal:
 		ext4_msg(sb, KERN_ERR, "insufficient memory");
 		goto failed_mount_wq;
 	}
-	if (test_opt(sb, NOBH)) {
-		if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) {
-			ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - "
-				"its supported only with writeback mode");
-			clear_opt(sbi->s_mount_opt, NOBH);
-		}
-		if (test_opt(sb, DIOREAD_NOLOCK)) {
-			ext4_msg(sb, KERN_WARNING, "dioread_nolock option is "
-				"not supported with nobh mode");
-			goto failed_mount_wq;
-		}
-	}
+
 	EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten");
 	if (!EXT4_SB(sb)->dio_unwritten_wq) {
 		printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
@@ -3010,11 +3548,15 @@ no_journal:
 	ext4_ext_init(sb);
 	err = ext4_mb_init(sb, needs_recovery);
 	if (err) {
-		ext4_msg(sb, KERN_ERR, "failed to initalize mballoc (%d)",
+		ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)",
 			 err);
 		goto failed_mount4;
 	}
 
+	err = ext4_register_li_request(sb, first_not_zeroed);
+	if (err)
+		goto failed_mount4;
+
 	sbi->s_kobj.kset = ext4_kset;
 	init_completion(&sbi->s_kobj_unregister);
 	err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL,
@@ -3043,9 +3585,15 @@ no_journal:
 		descr = "out journal";
 
 	ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. "
-		"Opts: %s", descr, orig_data);
+		 "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts,
+		 *sbi->s_es->s_mount_opts ? "; " : "", orig_data);
+
+	init_timer(&sbi->s_err_report);
+	sbi->s_err_report.function = print_daily_error_info;
+	sbi->s_err_report.data = (unsigned long) sb;
+	if (es->s_error_count)
+		mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */
 
-	lock_kernel();
 	kfree(orig_data);
 	return 0;
 
@@ -3092,7 +3640,7 @@ out_fail:
 	sb->s_fs_info = NULL;
 	kfree(sbi->s_blockgroup_lock);
 	kfree(sbi);
-	lock_kernel();
+out_free_orig:
 	kfree(orig_data);
 	return ret;
 }
@@ -3110,7 +3658,7 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
 	journal->j_min_batch_time = sbi->s_min_batch_time;
 	journal->j_max_batch_time = sbi->s_max_batch_time;
 
-	spin_lock(&journal->j_state_lock);
+	write_lock(&journal->j_state_lock);
 	if (test_opt(sb, BARRIER))
 		journal->j_flags |= JBD2_BARRIER;
 	else
@@ -3119,7 +3667,7 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
 		journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR;
 	else
 		journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR;
-	spin_unlock(&journal->j_state_lock);
+	write_unlock(&journal->j_state_lock);
 }
 
 static journal_t *ext4_get_journal(struct super_block *sb,
@@ -3327,8 +3875,17 @@ static int ext4_load_journal(struct super_block *sb,
 
 	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER))
 		err = jbd2_journal_wipe(journal, !really_read_only);
-	if (!err)
+	if (!err) {
+		char *save = kmalloc(EXT4_S_ERR_LEN, GFP_KERNEL);
+		if (save)
+			memcpy(save, ((char *) es) +
+			       EXT4_S_ERR_START, EXT4_S_ERR_LEN);
 		err = jbd2_journal_load(journal);
+		if (save)
+			memcpy(((char *) es) + EXT4_S_ERR_START,
+			       save, EXT4_S_ERR_LEN);
+		kfree(save);
+	}
 
 	if (err) {
 		ext4_msg(sb, KERN_ERR, "error loading journal");
@@ -3339,7 +3896,7 @@ static int ext4_load_journal(struct super_block *sb,
 	EXT4_SB(sb)->s_journal = journal;
 	ext4_clear_journal_err(sb, es);
 
-	if (journal_devnum &&
+	if (!really_read_only && journal_devnum &&
 	    journal_devnum != le32_to_cpu(es->s_journal_dev)) {
 		es->s_journal_dev = cpu_to_le32(journal_devnum);
 
@@ -3384,13 +3941,20 @@ static int ext4_commit_super(struct super_block *sb, int sync)
 	 */
 	if (!(sb->s_flags & MS_RDONLY))
 		es->s_wtime = cpu_to_le32(get_seconds());
-	es->s_kbytes_written =
-		cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
+	if (sb->s_bdev->bd_part)
+		es->s_kbytes_written =
+			cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
 			    ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
 			      EXT4_SB(sb)->s_sectors_written_start) >> 1));
-	ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
+	else
+		es->s_kbytes_written =
+			cpu_to_le64(EXT4_SB(sb)->s_kbytes_written);
+	if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeblocks_counter))
+		ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
 					&EXT4_SB(sb)->s_freeblocks_counter));
-	es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive(
+	if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeinodes_counter))
+		es->s_free_inodes_count =
+			cpu_to_le32(percpu_counter_sum_positive(
 					&EXT4_SB(sb)->s_freeinodes_counter));
 	sb->s_dirt = 0;
 	BUFFER_TRACE(sbh, "marking dirty");
@@ -3491,7 +4055,7 @@ int ext4_force_commit(struct super_block *sb)
 
 	journal = EXT4_SB(sb)->s_journal;
 	if (journal) {
-		vfs_check_frozen(sb, SB_FREEZE_WRITE);
+		vfs_check_frozen(sb, SB_FREEZE_TRANS);
 		ret = ext4_journal_force_commit(journal);
 	}
 
@@ -3587,8 +4151,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 #endif
 	char *orig_data = kstrdup(data, GFP_KERNEL);
 
-	lock_kernel();
-
 	/* Store the original options */
 	lock_super(sb);
 	old_sb_flags = sb->s_flags;
@@ -3616,7 +4178,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 	}
 
 	if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED)
-		ext4_abort(sb, __func__, "Abort forced by user");
+		ext4_abort(sb, "Abort forced by user");
 
 	sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
 		(test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
@@ -3711,6 +4273,19 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 			enable_quota = 1;
 		}
 	}
+
+	/*
+	 * Reinitialize lazy itable initialization thread based on
+	 * current settings
+	 */
+	if ((sb->s_flags & MS_RDONLY) || !test_opt(sb, INIT_INODE_TABLE))
+		ext4_unregister_li_request(sb);
+	else {
+		ext4_group_t first_not_zeroed;
+		first_not_zeroed = ext4_has_uninit_itable(sb);
+		ext4_register_li_request(sb, first_not_zeroed);
+	}
+
 	ext4_setup_system_zone(sb);
 	if (sbi->s_journal == NULL)
 		ext4_commit_super(sb, 1);
@@ -3723,7 +4298,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 			kfree(old_opts.s_qf_names[i]);
 #endif
 	unlock_super(sb);
-	unlock_kernel();
 	if (enable_quota)
 		dquot_resume(sb, -1);
 
@@ -3749,7 +4323,6 @@ restore_opts:
 	}
 #endif
 	unlock_super(sb);
-	unlock_kernel();
 	kfree(orig_data);
 	return err;
 }
@@ -3981,6 +4554,18 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
 	return err;
 }
 
+static int ext4_quota_off(struct super_block *sb, int type)
+{
+	/* Force all delayed allocation blocks to be allocated */
+	if (test_opt(sb, DELALLOC)) {
+		down_read(&sb->s_umount);
+		sync_filesystem(sb);
+		up_read(&sb->s_umount);
+	}
+
+	return dquot_quota_off(sb, type);
+}
+
 /* Read data from quotafile - avoid pagecache and such because we cannot afford
  * acquiring the locks... As quota files are never truncated and quota code
  * itself serializes the operations (and noone else should touch the files)
@@ -4030,7 +4615,6 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
 	ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
 	int err = 0;
 	int offset = off & (sb->s_blocksize - 1);
-	int journal_quota = EXT4_SB(sb)->s_qf_names[type] != NULL;
 	struct buffer_head *bh;
 	handle_t *handle = journal_current_handle();
 
@@ -4055,24 +4639,16 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
 	bh = ext4_bread(handle, inode, blk, 1, &err);
 	if (!bh)
 		goto out;
-	if (journal_quota) {
-		err = ext4_journal_get_write_access(handle, bh);
-		if (err) {
-			brelse(bh);
-			goto out;
-		}
+	err = ext4_journal_get_write_access(handle, bh);
+	if (err) {
+		brelse(bh);
+		goto out;
 	}
 	lock_buffer(bh);
 	memcpy(bh->b_data+offset, data, len);
 	flush_dcache_page(bh->b_page);
 	unlock_buffer(bh);
-	if (journal_quota)
-		err = ext4_handle_dirty_metadata(handle, NULL, bh);
-	else {
-		/* Always do at least ordered writes for quotas */
-		err = ext4_jbd2_file_inode(handle, inode);
-		mark_buffer_dirty(bh);
-	}
+	err = ext4_handle_dirty_metadata(handle, NULL, bh);
 	brelse(bh);
 out:
 	if (err) {
@@ -4091,17 +4667,17 @@ out:
 
 #endif
 
-static int ext4_get_sb(struct file_system_type *fs_type, int flags,
-		       const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
+		       const char *dev_name, void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, ext4_fill_super);
 }
 
 #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
 static struct file_system_type ext2_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "ext2",
-	.get_sb		= ext4_get_sb,
+	.mount		= ext4_mount,
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
@@ -4146,28 +4722,58 @@ static inline void unregister_as_ext3(void) { }
 static struct file_system_type ext4_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "ext4",
-	.get_sb		= ext4_get_sb,
+	.mount		= ext4_mount,
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
 
-static int __init init_ext4_fs(void)
+int __init ext4_init_feat_adverts(void)
+{
+	struct ext4_features *ef;
+	int ret = -ENOMEM;
+
+	ef = kzalloc(sizeof(struct ext4_features), GFP_KERNEL);
+	if (!ef)
+		goto out;
+
+	ef->f_kobj.kset = ext4_kset;
+	init_completion(&ef->f_kobj_unregister);
+	ret = kobject_init_and_add(&ef->f_kobj, &ext4_feat_ktype, NULL,
+				   "features");
+	if (ret) {
+		kfree(ef);
+		goto out;
+	}
+
+	ext4_feat = ef;
+	ret = 0;
+out:
+	return ret;
+}
+
+static int __init ext4_init_fs(void)
 {
 	int err;
 
 	ext4_check_flag_values();
-	err = init_ext4_system_zone();
+	err = ext4_init_pageio();
 	if (err)
 		return err;
+	err = ext4_init_system_zone();
+	if (err)
+		goto out5;
 	ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
 	if (!ext4_kset)
 		goto out4;
 	ext4_proc_root = proc_mkdir("fs/ext4", NULL);
-	err = init_ext4_mballoc();
+
+	err = ext4_init_feat_adverts();
+
+	err = ext4_init_mballoc();
 	if (err)
 		goto out3;
 
-	err = init_ext4_xattr();
+	err = ext4_init_xattr();
 	if (err)
 		goto out2;
 	err = init_inodecache();
@@ -4178,38 +4784,46 @@ static int __init init_ext4_fs(void)
 	err = register_filesystem(&ext4_fs_type);
 	if (err)
 		goto out;
+
+	ext4_li_info = NULL;
+	mutex_init(&ext4_li_mtx);
 	return 0;
 out:
 	unregister_as_ext2();
 	unregister_as_ext3();
 	destroy_inodecache();
 out1:
-	exit_ext4_xattr();
+	ext4_exit_xattr();
 out2:
-	exit_ext4_mballoc();
+	ext4_exit_mballoc();
 out3:
+	kfree(ext4_feat);
 	remove_proc_entry("fs/ext4", NULL);
 	kset_unregister(ext4_kset);
 out4:
-	exit_ext4_system_zone();
+	ext4_exit_system_zone();
+out5:
+	ext4_exit_pageio();
 	return err;
 }
 
-static void __exit exit_ext4_fs(void)
+static void __exit ext4_exit_fs(void)
 {
+	ext4_destroy_lazyinit_thread();
 	unregister_as_ext2();
 	unregister_as_ext3();
 	unregister_filesystem(&ext4_fs_type);
 	destroy_inodecache();
-	exit_ext4_xattr();
-	exit_ext4_mballoc();
+	ext4_exit_xattr();
+	ext4_exit_mballoc();
 	remove_proc_entry("fs/ext4", NULL);
 	kset_unregister(ext4_kset);
-	exit_ext4_system_zone();
+	ext4_exit_system_zone();
+	ext4_exit_pageio();
 }
 
 MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
 MODULE_DESCRIPTION("Fourth Extended Filesystem");
 MODULE_LICENSE("GPL");
-module_init(init_ext4_fs)
-module_exit(exit_ext4_fs)
+module_init(ext4_init_fs)
+module_exit(ext4_exit_fs)
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 04338009793..fa4b899da4b 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -458,8 +458,7 @@ static void ext4_xattr_update_super_block(handle_t *handle,
 
 	if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) {
 		EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR);
-		sb->s_dirt = 1;
-		ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
+		ext4_handle_dirty_super(handle, sb);
 	}
 }
 
@@ -1418,7 +1417,7 @@ ext4_xattr_cache_insert(struct buffer_head *bh)
 		ea_bdebug(bh, "out of memory");
 		return;
 	}
-	error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, &hash);
+	error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash);
 	if (error) {
 		mb_cache_entry_free(ce);
 		if (error == -EBUSY) {
@@ -1490,8 +1489,8 @@ ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header,
 		return NULL;  /* never share */
 	ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
 again:
-	ce = mb_cache_entry_find_first(ext4_xattr_cache, 0,
-				       inode->i_sb->s_bdev, hash);
+	ce = mb_cache_entry_find_first(ext4_xattr_cache, inode->i_sb->s_bdev,
+				       hash);
 	while (ce) {
 		struct buffer_head *bh;
 
@@ -1515,7 +1514,7 @@ again:
 			return bh;
 		}
 		brelse(bh);
-		ce = mb_cache_entry_find_next(ce, 0, inode->i_sb->s_bdev, hash);
+		ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash);
 	}
 	return NULL;
 }
@@ -1589,18 +1588,16 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *header,
 #undef BLOCK_HASH_SHIFT
 
 int __init
-init_ext4_xattr(void)
+ext4_init_xattr(void)
 {
-	ext4_xattr_cache = mb_cache_create("ext4_xattr", NULL,
-		sizeof(struct mb_cache_entry) +
-		sizeof(((struct mb_cache_entry *) 0)->e_indexes[0]), 1, 6);
+	ext4_xattr_cache = mb_cache_create("ext4_xattr", 6);
 	if (!ext4_xattr_cache)
 		return -ENOMEM;
 	return 0;
 }
 
 void
-exit_ext4_xattr(void)
+ext4_exit_xattr(void)
 {
 	if (ext4_xattr_cache)
 		mb_cache_destroy(ext4_xattr_cache);
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 518e96e4390..1ef16520b95 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -83,8 +83,8 @@ extern void ext4_xattr_put_super(struct super_block *);
 extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
 			    struct ext4_inode *raw_inode, handle_t *handle);
 
-extern int init_ext4_xattr(void);
-extern void exit_ext4_xattr(void);
+extern int __init ext4_init_xattr(void);
+extern void ext4_exit_xattr(void);
 
 extern const struct xattr_handler *ext4_xattr_handlers[];
 
@@ -121,14 +121,14 @@ ext4_xattr_put_super(struct super_block *sb)
 {
 }
 
-static inline int
-init_ext4_xattr(void)
+static __init inline int
+ext4_init_xattr(void)
 {
 	return 0;
 }
 
 static inline void
-exit_ext4_xattr(void)
+ext4_exit_xattr(void)
 {
 }
 
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index 27ac2572595..d75a77f85c2 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -306,7 +306,6 @@ extern long fat_generic_ioctl(struct file *filp, unsigned int cmd,
 extern const struct file_operations fat_file_operations;
 extern const struct inode_operations fat_file_inode_operations;
 extern int fat_setattr(struct dentry * dentry, struct iattr * attr);
-extern int fat_setsize(struct inode *inode, loff_t offset);
 extern void fat_truncate_blocks(struct inode *inode, loff_t offset);
 extern int fat_getattr(struct vfsmount *mnt, struct dentry *dentry,
 		       struct kstat *stat);
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index 81184d3b75a..b47d2c9f4fa 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -577,7 +577,8 @@ int fat_free_clusters(struct inode *inode, int cluster)
 
 				sb_issue_discard(sb,
 					fat_clus_to_blknr(sbi, first_cl),
-					nr_clus * sbi->sec_per_clus);
+					nr_clus * sbi->sec_per_clus,
+					GFP_NOFS, 0);
 
 				first_cl = cluster;
 			}
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 990dfae022e..7257752b6d5 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -364,18 +364,6 @@ static int fat_allow_set_time(struct msdos_sb_info *sbi, struct inode *inode)
 	return 0;
 }
 
-int fat_setsize(struct inode *inode, loff_t offset)
-{
-	int error;
-
-	error = simple_setsize(inode, offset);
-	if (error)
-		return error;
-	fat_truncate_blocks(inode, offset);
-
-	return error;
-}
-
 #define TIMES_SET_FLAGS	(ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)
 /* valid file mode bits */
 #define FAT_VALID_MODE	(S_IFREG | S_IFDIR | S_IRWXUGO)
@@ -387,21 +375,6 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
 	unsigned int ia_valid;
 	int error;
 
-	/*
-	 * Expand the file. Since inode_setattr() updates ->i_size
-	 * before calling the ->truncate(), but FAT needs to fill the
-	 * hole before it. XXX: this is no longer true with new truncate
-	 * sequence.
-	 */
-	if (attr->ia_valid & ATTR_SIZE) {
-		if (attr->ia_size > inode->i_size) {
-			error = fat_cont_expand(inode, attr->ia_size);
-			if (error || attr->ia_valid == ATTR_SIZE)
-				goto out;
-			attr->ia_valid &= ~ATTR_SIZE;
-		}
-	}
-
 	/* Check for setting the inode time. */
 	ia_valid = attr->ia_valid;
 	if (ia_valid & TIMES_SET_FLAGS) {
@@ -417,6 +390,21 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
 		goto out;
 	}
 
+	/*
+	 * Expand the file. Since inode_setattr() updates ->i_size
+	 * before calling the ->truncate(), but FAT needs to fill the
+	 * hole before it. XXX: this is no longer true with new truncate
+	 * sequence.
+	 */
+	if (attr->ia_valid & ATTR_SIZE) {
+		if (attr->ia_size > inode->i_size) {
+			error = fat_cont_expand(inode, attr->ia_size);
+			if (error || attr->ia_valid == ATTR_SIZE)
+				goto out;
+			attr->ia_valid &= ~ATTR_SIZE;
+		}
+	}
+
 	if (((attr->ia_valid & ATTR_UID) &&
 	     (attr->ia_uid != sbi->options.fs_uid)) ||
 	    ((attr->ia_valid & ATTR_GID) &&
@@ -441,12 +429,11 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
 	}
 
 	if (attr->ia_valid & ATTR_SIZE) {
-		error = fat_setsize(inode, attr->ia_size);
-		if (error)
-			goto out;
+		truncate_setsize(inode, attr->ia_size);
+		fat_truncate_blocks(inode, attr->ia_size);
 	}
 
-	generic_setattr(inode, attr);
+	setattr_copy(inode, attr);
 	mark_inode_dirty(inode);
 out:
 	return error;
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 7bf45aee56d..ad6998a92c3 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -14,7 +14,6 @@
 #include <linux/init.h>
 #include <linux/time.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
 #include <linux/seq_file.h>
 #include <linux/pagemap.h>
 #include <linux/mpage.h>
@@ -159,7 +158,7 @@ static int fat_write_begin(struct file *file, struct address_space *mapping,
 	int err;
 
 	*pagep = NULL;
-	err = cont_write_begin_newtrunc(file, mapping, pos, len, flags,
+	err = cont_write_begin(file, mapping, pos, len, flags,
 				pagep, fsdata, fat_get_block,
 				&MSDOS_I(mapping->host)->mmu_private);
 	if (err < 0)
@@ -212,8 +211,8 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb,
 	 * FAT need to use the DIO_LOCKING for avoiding the race
 	 * condition of fat_get_block() and ->truncate().
 	 */
-	ret = blockdev_direct_IO_newtrunc(rw, iocb, inode, inode->i_sb->s_bdev,
-				iov, offset, nr_segs, fat_get_block, NULL);
+	ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
+				 iov, offset, nr_segs, fat_get_block, NULL);
 	if (ret < 0 && (rw & WRITE))
 		fat_write_failed(mapping, offset + iov_length(iov, nr_segs));
 
@@ -263,7 +262,7 @@ static const struct address_space_operations fat_aops = {
  *			check if the location is still valid and retry if it
  *			isn't. Otherwise we do changes.
  *		5. Spinlock is used to protect hash/unhash/location check/lookup
- *		6. fat_clear_inode() unhashes the F-d-c entry.
+ *		6. fat_evict_inode() unhashes the F-d-c entry.
  *		7. lookup() and readdir() do igrab() if they find a F-d-c entry
  *			and consider negative result as cache miss.
  */
@@ -448,16 +447,15 @@ out:
 
 EXPORT_SYMBOL_GPL(fat_build_inode);
 
-static void fat_delete_inode(struct inode *inode)
+static void fat_evict_inode(struct inode *inode)
 {
 	truncate_inode_pages(&inode->i_data, 0);
-	inode->i_size = 0;
-	fat_truncate_blocks(inode, 0);
-	clear_inode(inode);
-}
-
-static void fat_clear_inode(struct inode *inode)
-{
+	if (!inode->i_nlink) {
+		inode->i_size = 0;
+		fat_truncate_blocks(inode, 0);
+	}
+	invalidate_inode_buffers(inode);
+	end_writeback(inode);
 	fat_cache_inval_inode(inode);
 	fat_detach(inode);
 }
@@ -490,8 +488,6 @@ static void fat_put_super(struct super_block *sb)
 {
 	struct msdos_sb_info *sbi = MSDOS_SB(sb);
 
-	lock_kernel();
-
 	if (sb->s_dirt)
 		fat_write_super(sb);
 
@@ -505,8 +501,6 @@ static void fat_put_super(struct super_block *sb)
 
 	sb->s_fs_info = NULL;
 	kfree(sbi);
-
-	unlock_kernel();
 }
 
 static struct kmem_cache *fat_inode_cachep;
@@ -674,12 +668,11 @@ static const struct super_operations fat_sops = {
 	.alloc_inode	= fat_alloc_inode,
 	.destroy_inode	= fat_destroy_inode,
 	.write_inode	= fat_write_inode,
-	.delete_inode	= fat_delete_inode,
+	.evict_inode	= fat_evict_inode,
 	.put_super	= fat_put_super,
 	.write_super	= fat_write_super,
 	.sync_fs	= fat_sync_fs,
 	.statfs		= fat_statfs,
-	.clear_inode	= fat_clear_inode,
 	.remount_fs	= fat_remount,
 
 	.show_options	= fat_show_options,
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 1fa23f6ffba..970e682ea75 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -250,13 +250,12 @@ int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs)
 {
 	int i, err = 0;
 
-	ll_rw_block(SWRITE, nr_bhs, bhs);
+	for (i = 0; i < nr_bhs; i++)
+		write_dirty_buffer(bhs[i], WRITE);
+
 	for (i = 0; i < nr_bhs; i++) {
 		wait_on_buffer(bhs[i]);
-		if (buffer_eopnotsupp(bhs[i])) {
-			clear_buffer_eopnotsupp(bhs[i]);
-			err = -EOPNOTSUPP;
-		} else if (!err && !buffer_uptodate(bhs[i]))
+		if (!err && !buffer_uptodate(bhs[i]))
 			err = -EIO;
 	}
 	return err;
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index bbc94ae4fd7..3345aabd1dd 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -662,27 +662,30 @@ static int msdos_fill_super(struct super_block *sb, void *data, int silent)
 {
 	int res;
 
+	lock_super(sb);
 	res = fat_fill_super(sb, data, silent, &msdos_dir_inode_operations, 0);
-	if (res)
+	if (res) {
+		unlock_super(sb);
 		return res;
+	}
 
 	sb->s_flags |= MS_NOATIME;
 	sb->s_root->d_op = &msdos_dentry_operations;
+	unlock_super(sb);
 	return 0;
 }
 
-static int msdos_get_sb(struct file_system_type *fs_type,
+static struct dentry *msdos_mount(struct file_system_type *fs_type,
 			int flags, const char *dev_name,
-			void *data, struct vfsmount *mnt)
+			void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, msdos_fill_super,
-			   mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, msdos_fill_super);
 }
 
 static struct file_system_type msdos_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "msdos",
-	.get_sb		= msdos_get_sb,
+	.mount		= msdos_mount,
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 6fcc7e71fba..b936703b892 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -1055,30 +1055,33 @@ static int vfat_fill_super(struct super_block *sb, void *data, int silent)
 {
 	int res;
 
+	lock_super(sb);
 	res = fat_fill_super(sb, data, silent, &vfat_dir_inode_operations, 1);
-	if (res)
+	if (res) {
+		unlock_super(sb);
 		return res;
+	}
 
 	if (MSDOS_SB(sb)->options.name_check != 's')
 		sb->s_root->d_op = &vfat_ci_dentry_ops;
 	else
 		sb->s_root->d_op = &vfat_dentry_ops;
 
+	unlock_super(sb);
 	return 0;
 }
 
-static int vfat_get_sb(struct file_system_type *fs_type,
+static struct dentry *vfat_mount(struct file_system_type *fs_type,
 		       int flags, const char *dev_name,
-		       void *data, struct vfsmount *mnt)
+		       void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, vfat_fill_super,
-			   mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, vfat_fill_super);
 }
 
 static struct file_system_type vfat_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "vfat",
-	.get_sb		= vfat_get_sb,
+	.mount		= vfat_mount,
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 9d175d623aa..ecc8b3954ed 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -640,7 +640,7 @@ static void fasync_free_rcu(struct rcu_head *head)
  * match the state "is the filp on a fasync list".
  *
  */
-static int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
+int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
 {
 	struct fasync_struct *fa, **fp;
 	int result = 0;
@@ -666,21 +666,31 @@ static int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
 	return result;
 }
 
+struct fasync_struct *fasync_alloc(void)
+{
+	return kmem_cache_alloc(fasync_cache, GFP_KERNEL);
+}
+
 /*
- * Add a fasync entry. Return negative on error, positive if
- * added, and zero if did nothing but change an existing one.
+ * NOTE! This can be used only for unused fasync entries:
+ * entries that actually got inserted on the fasync list
+ * need to be released by rcu - see fasync_remove_entry.
+ */
+void fasync_free(struct fasync_struct *new)
+{
+	kmem_cache_free(fasync_cache, new);
+}
+
+/*
+ * Insert a new entry into the fasync list.  Return the pointer to the
+ * old one if we didn't use the new one.
  *
  * NOTE! It is very important that the FASYNC flag always
  * match the state "is the filp on a fasync list".
  */
-static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fapp)
+struct fasync_struct *fasync_insert_entry(int fd, struct file *filp, struct fasync_struct **fapp, struct fasync_struct *new)
 {
-	struct fasync_struct *new, *fa, **fp;
-	int result = 0;
-
-	new = kmem_cache_alloc(fasync_cache, GFP_KERNEL);
-	if (!new)
-		return -ENOMEM;
+        struct fasync_struct *fa, **fp;
 
 	spin_lock(&filp->f_lock);
 	spin_lock(&fasync_lock);
@@ -691,8 +701,6 @@ static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fa
 		spin_lock_irq(&fa->fa_lock);
 		fa->fa_fd = fd;
 		spin_unlock_irq(&fa->fa_lock);
-
-		kmem_cache_free(fasync_cache, new);
 		goto out;
 	}
 
@@ -702,13 +710,39 @@ static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fa
 	new->fa_fd = fd;
 	new->fa_next = *fapp;
 	rcu_assign_pointer(*fapp, new);
-	result = 1;
 	filp->f_flags |= FASYNC;
 
 out:
 	spin_unlock(&fasync_lock);
 	spin_unlock(&filp->f_lock);
-	return result;
+	return fa;
+}
+
+/*
+ * Add a fasync entry. Return negative on error, positive if
+ * added, and zero if did nothing but change an existing one.
+ */
+static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fapp)
+{
+	struct fasync_struct *new;
+
+	new = fasync_alloc();
+	if (!new)
+		return -ENOMEM;
+
+	/*
+	 * fasync_insert_entry() returns the old (update) entry if
+	 * it existed.
+	 *
+	 * So free the (unused) new entry and return 0 to let the
+	 * caller know that we didn't add any new fasync entries.
+	 */
+	if (fasync_insert_entry(fd, filp, fapp, new)) {
+		fasync_free(new);
+		return 0;
+	}
+
+	return 1;
 }
 
 /*
@@ -767,11 +801,26 @@ void kill_fasync(struct fasync_struct **fp, int sig, int band)
 }
 EXPORT_SYMBOL(kill_fasync);
 
-static int __init fasync_init(void)
+static int __init fcntl_init(void)
 {
+	/*
+	 * Please add new bits here to ensure allocation uniqueness.
+	 * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
+	 * is defined as O_NONBLOCK on some platforms and not on others.
+	 */
+	BUILD_BUG_ON(18 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32(
+		O_RDONLY	| O_WRONLY	| O_RDWR	|
+		O_CREAT		| O_EXCL	| O_NOCTTY	|
+		O_TRUNC		| O_APPEND	| /* O_NONBLOCK	| */
+		__O_SYNC	| O_DSYNC	| FASYNC	|
+		O_DIRECT	| O_LARGEFILE	| O_DIRECTORY	|
+		O_NOFOLLOW	| O_NOATIME	| O_CLOEXEC	|
+		FMODE_EXEC
+		));
+
 	fasync_cache = kmem_cache_create("fasync_cache",
 		sizeof(struct fasync_struct), 0, SLAB_PANIC, NULL);
 	return 0;
 }
 
-module_init(fasync_init)
+module_init(fcntl_init)
diff --git a/fs/fifo.c b/fs/fifo.c
index 5d6606ffc2d..4e303c22d5e 100644
--- a/fs/fifo.c
+++ b/fs/fifo.c
@@ -151,4 +151,5 @@ err_nocleanup:
  */
 const struct file_operations def_fifo_fops = {
 	.open		= fifo_open,	/* will set read_ or write_pipefifo_fops */
+	.llseek		= noop_llseek,
 };
diff --git a/fs/file.c b/fs/file.c
index 34bb7f71d99..0be344755c0 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -39,28 +39,27 @@ int sysctl_nr_open_max = 1024 * 1024; /* raised later */
  */
 static DEFINE_PER_CPU(struct fdtable_defer, fdtable_defer_list);
 
-static inline void * alloc_fdmem(unsigned int size)
+static inline void *alloc_fdmem(unsigned int size)
 {
-	if (size <= PAGE_SIZE)
-		return kmalloc(size, GFP_KERNEL);
-	else
-		return vmalloc(size);
+	void *data;
+
+	data = kmalloc(size, GFP_KERNEL|__GFP_NOWARN);
+	if (data != NULL)
+		return data;
+
+	return vmalloc(size);
 }
 
-static inline void free_fdarr(struct fdtable *fdt)
+static void free_fdmem(void *ptr)
 {
-	if (fdt->max_fds <= (PAGE_SIZE / sizeof(struct file *)))
-		kfree(fdt->fd);
-	else
-		vfree(fdt->fd);
+	is_vmalloc_addr(ptr) ? vfree(ptr) : kfree(ptr);
 }
 
-static inline void free_fdset(struct fdtable *fdt)
+static void __free_fdtable(struct fdtable *fdt)
 {
-	if (fdt->max_fds <= (PAGE_SIZE * BITS_PER_BYTE / 2))
-		kfree(fdt->open_fds);
-	else
-		vfree(fdt->open_fds);
+	free_fdmem(fdt->fd);
+	free_fdmem(fdt->open_fds);
+	kfree(fdt);
 }
 
 static void free_fdtable_work(struct work_struct *work)
@@ -75,9 +74,8 @@ static void free_fdtable_work(struct work_struct *work)
 	spin_unlock_bh(&f->lock);
 	while(fdt) {
 		struct fdtable *next = fdt->next;
-		vfree(fdt->fd);
-		free_fdset(fdt);
-		kfree(fdt);
+
+		__free_fdtable(fdt);
 		fdt = next;
 	}
 }
@@ -98,7 +96,7 @@ void free_fdtable_rcu(struct rcu_head *rcu)
 				container_of(fdt, struct files_struct, fdtab));
 		return;
 	}
-	if (fdt->max_fds <= (PAGE_SIZE / sizeof(struct file *))) {
+	if (!is_vmalloc_addr(fdt->fd) && !is_vmalloc_addr(fdt->open_fds)) {
 		kfree(fdt->fd);
 		kfree(fdt->open_fds);
 		kfree(fdt);
@@ -178,13 +176,12 @@ static struct fdtable * alloc_fdtable(unsigned int nr)
 	fdt->open_fds = (fd_set *)data;
 	data += nr / BITS_PER_BYTE;
 	fdt->close_on_exec = (fd_set *)data;
-	INIT_RCU_HEAD(&fdt->rcu);
 	fdt->next = NULL;
 
 	return fdt;
 
 out_arr:
-	free_fdarr(fdt);
+	free_fdmem(fdt->fd);
 out_fdt:
 	kfree(fdt);
 out:
@@ -214,9 +211,7 @@ static int expand_fdtable(struct files_struct *files, int nr)
 	 * caller and alloc_fdtable().  Cheaper to catch it here...
 	 */
 	if (unlikely(new_fdt->max_fds <= nr)) {
-		free_fdarr(new_fdt);
-		free_fdset(new_fdt);
-		kfree(new_fdt);
+		__free_fdtable(new_fdt);
 		return -EMFILE;
 	}
 	/*
@@ -232,9 +227,7 @@ static int expand_fdtable(struct files_struct *files, int nr)
 			free_fdtable(cur_fdt);
 	} else {
 		/* Somebody else expanded, so undo our attempt */
-		free_fdarr(new_fdt);
-		free_fdset(new_fdt);
-		kfree(new_fdt);
+		__free_fdtable(new_fdt);
 	}
 	return 1;
 }
@@ -312,7 +305,6 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
 	new_fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init;
 	new_fdt->open_fds = (fd_set *)&newf->open_fds_init;
 	new_fdt->fd = &newf->fd_array[0];
-	INIT_RCU_HEAD(&new_fdt->rcu);
 	new_fdt->next = NULL;
 
 	spin_lock(&oldf->file_lock);
@@ -325,11 +317,8 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
 	while (unlikely(open_files > new_fdt->max_fds)) {
 		spin_unlock(&oldf->file_lock);
 
-		if (new_fdt != &newf->fdtab) {
-			free_fdarr(new_fdt);
-			free_fdset(new_fdt);
-			kfree(new_fdt);
-		}
+		if (new_fdt != &newf->fdtab)
+			__free_fdtable(new_fdt);
 
 		new_fdt = alloc_fdtable(open_files - 1);
 		if (!new_fdt) {
@@ -339,9 +328,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
 
 		/* beyond sysctl_nr_open; nothing to do */
 		if (unlikely(new_fdt->max_fds < open_files)) {
-			free_fdarr(new_fdt);
-			free_fdset(new_fdt);
-			kfree(new_fdt);
+			__free_fdtable(new_fdt);
 			*errorp = -EMFILE;
 			goto out_release;
 		}
@@ -430,7 +417,6 @@ struct files_struct init_files = {
 		.fd		= &init_files.fd_array[0],
 		.close_on_exec	= (fd_set *)&init_files.close_on_exec_init,
 		.open_fds	= (fd_set *)&init_files.open_fds_init,
-		.rcu		= RCU_HEAD_INIT,
 	},
 	.file_lock	= __SPIN_LOCK_UNLOCKED(init_task.file_lock),
 };
diff --git a/fs/file_table.c b/fs/file_table.c
index 5c7d10ead4a..c3dee381f1b 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -20,7 +20,9 @@
 #include <linux/cdev.h>
 #include <linux/fsnotify.h>
 #include <linux/sysctl.h>
+#include <linux/lglock.h>
 #include <linux/percpu_counter.h>
+#include <linux/percpu.h>
 #include <linux/ima.h>
 
 #include <asm/atomic.h>
@@ -32,8 +34,8 @@ struct files_stat_struct files_stat = {
 	.max_files = NR_FILE
 };
 
-/* public. Not pretty! */
-__cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock);
+DECLARE_LGLOCK(files_lglock);
+DEFINE_LGLOCK(files_lglock);
 
 /* SLAB cache for file structures */
 static struct kmem_cache *filp_cachep __read_mostly;
@@ -58,7 +60,7 @@ static inline void file_free(struct file *f)
 /*
  * Return the total number of open files in the system
  */
-static int get_nr_files(void)
+static long get_nr_files(void)
 {
 	return percpu_counter_read_positive(&nr_files);
 }
@@ -66,7 +68,7 @@ static int get_nr_files(void)
 /*
  * Return the maximum number of open files in the system
  */
-int get_max_files(void)
+unsigned long get_max_files(void)
 {
 	return files_stat.max_files;
 }
@@ -80,7 +82,7 @@ int proc_nr_files(ctl_table *table, int write,
                      void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	files_stat.nr_files = get_nr_files();
-	return proc_dointvec(table, write, buffer, lenp, ppos);
+	return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
 }
 #else
 int proc_nr_files(ctl_table *table, int write,
@@ -103,7 +105,7 @@ int proc_nr_files(ctl_table *table, int write,
 struct file *get_empty_filp(void)
 {
 	const struct cred *cred = current_cred();
-	static int old_max;
+	static long old_max;
 	struct file * f;
 
 	/*
@@ -138,8 +140,7 @@ struct file *get_empty_filp(void)
 over:
 	/* Ran out of filps - report that */
 	if (get_nr_files() > old_max) {
-		printk(KERN_INFO "VFS: file-max limit %d reached\n",
-					get_max_files());
+		pr_info("VFS: file-max limit %lu reached\n", get_max_files());
 		old_max = get_nr_files();
 	}
 	goto fail;
@@ -249,7 +250,7 @@ static void __fput(struct file *file)
 		cdev_put(inode->i_cdev);
 	fops_put(file->f_op);
 	put_pid(file->f_owner.pid);
-	file_kill(file);
+	file_sb_list_del(file);
 	if (file->f_mode & FMODE_WRITE)
 		drop_file_write_access(file);
 	file->f_path.dentry = NULL;
@@ -289,11 +290,20 @@ struct file *fget(unsigned int fd)
 EXPORT_SYMBOL(fget);
 
 /*
- * Lightweight file lookup - no refcnt increment if fd table isn't shared. 
- * You can use this only if it is guranteed that the current task already 
- * holds a refcnt to that file. That check has to be done at fget() only
- * and a flag is returned to be passed to the corresponding fput_light().
- * There must not be a cloning between an fget_light/fput_light pair.
+ * Lightweight file lookup - no refcnt increment if fd table isn't shared.
+ *
+ * You can use this instead of fget if you satisfy all of the following
+ * conditions:
+ * 1) You must call fput_light before exiting the syscall and returning control
+ *    to userspace (i.e. you cannot remember the returned struct file * after
+ *    returning to userspace).
+ * 2) You must not call filp_close on the returned struct file * in between
+ *    calls to fget_light and fput_light.
+ * 3) You must not clone the current task in between the calls to fget_light
+ *    and fput_light.
+ *
+ * The fput_needed flag returned by fget_light should be passed to the
+ * corresponding fput_light.
  */
 struct file *fget_light(unsigned int fd, int *fput_needed)
 {
@@ -319,41 +329,107 @@ struct file *fget_light(unsigned int fd, int *fput_needed)
 	return file;
 }
 
-
 void put_filp(struct file *file)
 {
 	if (atomic_long_dec_and_test(&file->f_count)) {
 		security_file_free(file);
-		file_kill(file);
+		file_sb_list_del(file);
 		file_free(file);
 	}
 }
 
-void file_move(struct file *file, struct list_head *list)
+static inline int file_list_cpu(struct file *file)
 {
-	if (!list)
-		return;
-	file_list_lock();
-	list_move(&file->f_u.fu_list, list);
-	file_list_unlock();
+#ifdef CONFIG_SMP
+	return file->f_sb_list_cpu;
+#else
+	return smp_processor_id();
+#endif
 }
 
-void file_kill(struct file *file)
+/* helper for file_sb_list_add to reduce ifdefs */
+static inline void __file_sb_list_add(struct file *file, struct super_block *sb)
+{
+	struct list_head *list;
+#ifdef CONFIG_SMP
+	int cpu;
+	cpu = smp_processor_id();
+	file->f_sb_list_cpu = cpu;
+	list = per_cpu_ptr(sb->s_files, cpu);
+#else
+	list = &sb->s_files;
+#endif
+	list_add(&file->f_u.fu_list, list);
+}
+
+/**
+ * file_sb_list_add - add a file to the sb's file list
+ * @file: file to add
+ * @sb: sb to add it to
+ *
+ * Use this function to associate a file with the superblock of the inode it
+ * refers to.
+ */
+void file_sb_list_add(struct file *file, struct super_block *sb)
+{
+	lg_local_lock(files_lglock);
+	__file_sb_list_add(file, sb);
+	lg_local_unlock(files_lglock);
+}
+
+/**
+ * file_sb_list_del - remove a file from the sb's file list
+ * @file: file to remove
+ * @sb: sb to remove it from
+ *
+ * Use this function to remove a file from its superblock.
+ */
+void file_sb_list_del(struct file *file)
 {
 	if (!list_empty(&file->f_u.fu_list)) {
-		file_list_lock();
+		lg_local_lock_cpu(files_lglock, file_list_cpu(file));
 		list_del_init(&file->f_u.fu_list);
-		file_list_unlock();
+		lg_local_unlock_cpu(files_lglock, file_list_cpu(file));
 	}
 }
 
+#ifdef CONFIG_SMP
+
+/*
+ * These macros iterate all files on all CPUs for a given superblock.
+ * files_lglock must be held globally.
+ */
+#define do_file_list_for_each_entry(__sb, __file)		\
+{								\
+	int i;							\
+	for_each_possible_cpu(i) {				\
+		struct list_head *list;				\
+		list = per_cpu_ptr((__sb)->s_files, i);		\
+		list_for_each_entry((__file), list, f_u.fu_list)
+
+#define while_file_list_for_each_entry				\
+	}							\
+}
+
+#else
+
+#define do_file_list_for_each_entry(__sb, __file)		\
+{								\
+	struct list_head *list;					\
+	list = &(sb)->s_files;					\
+	list_for_each_entry((__file), list, f_u.fu_list)
+
+#define while_file_list_for_each_entry				\
+}
+
+#endif
+
 int fs_may_remount_ro(struct super_block *sb)
 {
 	struct file *file;
-
 	/* Check that no files are currently opened for writing. */
-	file_list_lock();
-	list_for_each_entry(file, &sb->s_files, f_u.fu_list) {
+	lg_global_lock(files_lglock);
+	do_file_list_for_each_entry(sb, file) {
 		struct inode *inode = file->f_path.dentry->d_inode;
 
 		/* File with pending delete? */
@@ -363,11 +439,11 @@ int fs_may_remount_ro(struct super_block *sb)
 		/* Writeable file? */
 		if (S_ISREG(inode->i_mode) && (file->f_mode & FMODE_WRITE))
 			goto too_bad;
-	}
-	file_list_unlock();
+	} while_file_list_for_each_entry;
+	lg_global_unlock(files_lglock);
 	return 1; /* Tis' cool bro. */
 too_bad:
-	file_list_unlock();
+	lg_global_unlock(files_lglock);
 	return 0;
 }
 
@@ -383,8 +459,8 @@ void mark_files_ro(struct super_block *sb)
 	struct file *f;
 
 retry:
-	file_list_lock();
-	list_for_each_entry(f, &sb->s_files, f_u.fu_list) {
+	lg_global_lock(files_lglock);
+	do_file_list_for_each_entry(sb, f) {
 		struct vfsmount *mnt;
 		if (!S_ISREG(f->f_path.dentry->d_inode->i_mode))
 		       continue;
@@ -399,21 +475,18 @@ retry:
 			continue;
 		file_release_write(f);
 		mnt = mntget(f->f_path.mnt);
-		file_list_unlock();
-		/*
-		 * This can sleep, so we can't hold
-		 * the file_list_lock() spinlock.
-		 */
+		/* This can sleep, so we can't hold the spinlock. */
+		lg_global_unlock(files_lglock);
 		mnt_drop_write(mnt);
 		mntput(mnt);
 		goto retry;
-	}
-	file_list_unlock();
+	} while_file_list_for_each_entry;
+	lg_global_unlock(files_lglock);
 }
 
 void __init files_init(unsigned long mempages)
 { 
-	int n; 
+	unsigned long n;
 
 	filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
 			SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
@@ -424,9 +497,8 @@ void __init files_init(unsigned long mempages)
 	 */ 
 
 	n = (mempages * (PAGE_SIZE / 1024)) / 10;
-	files_stat.max_files = n; 
-	if (files_stat.max_files < NR_FILE)
-		files_stat.max_files = NR_FILE;
+	files_stat.max_files = max_t(unsigned long, n, NR_FILE);
 	files_defer_init();
+	lg_lock_init(files_lglock);
 	percpu_counter_init(&nr_files, 0);
 } 
diff --git a/fs/freevxfs/vxfs_extern.h b/fs/freevxfs/vxfs_extern.h
index 50ab5eecb99..881aa3d217f 100644
--- a/fs/freevxfs/vxfs_extern.h
+++ b/fs/freevxfs/vxfs_extern.h
@@ -63,7 +63,7 @@ extern void			vxfs_put_fake_inode(struct inode *);
 extern struct vxfs_inode_info *	vxfs_blkiget(struct super_block *, u_long, ino_t);
 extern struct vxfs_inode_info *	vxfs_stiget(struct super_block *, ino_t);
 extern struct inode *		vxfs_iget(struct super_block *, ino_t);
-extern void			vxfs_clear_inode(struct inode *);
+extern void			vxfs_evict_inode(struct inode *);
 
 /* vxfs_lookup.c */
 extern const struct inode_operations	vxfs_dir_inode_ops;
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index 03a6ea5e99f..8c04eac5079 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -260,6 +260,7 @@ vxfs_get_fake_inode(struct super_block *sbp, struct vxfs_inode_info *vip)
 	struct inode			*ip = NULL;
 
 	if ((ip = new_inode(sbp))) {
+		ip->i_ino = get_next_ino();
 		vxfs_iinit(ip, vip);
 		ip->i_mapping->a_ops = &vxfs_aops;
 	}
@@ -337,15 +338,17 @@ vxfs_iget(struct super_block *sbp, ino_t ino)
 }
 
 /**
- * vxfs_clear_inode - remove inode from main memory
+ * vxfs_evict_inode - remove inode from main memory
  * @ip:		inode to discard.
  *
  * Description:
- *  vxfs_clear_inode() is called on the final iput and frees the private
+ *  vxfs_evict_inode() is called on the final iput and frees the private
  *  inode area.
  */
 void
-vxfs_clear_inode(struct inode *ip)
+vxfs_evict_inode(struct inode *ip)
 {
+	truncate_inode_pages(&ip->i_data, 0);
+	end_writeback(ip);
 	kmem_cache_free(vxfs_inode_cachep, ip->i_private);
 }
diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c
index 0ec7bb2c95c..6c5131d592f 100644
--- a/fs/freevxfs/vxfs_lookup.c
+++ b/fs/freevxfs/vxfs_lookup.c
@@ -36,7 +36,6 @@
 #include <linux/highmem.h>
 #include <linux/kernel.h>
 #include <linux/pagemap.h>
-#include <linux/smp_lock.h>
 
 #include "vxfs.h"
 #include "vxfs_dir.h"
@@ -212,16 +211,12 @@ vxfs_lookup(struct inode *dip, struct dentry *dp, struct nameidata *nd)
 	if (dp->d_name.len > VXFS_NAMELEN)
 		return ERR_PTR(-ENAMETOOLONG);
 				 
-	lock_kernel();
 	ino = vxfs_inode_by_name(dip, dp);
 	if (ino) {
 		ip = vxfs_iget(dip->i_sb, ino);
-		if (IS_ERR(ip)) {
-			unlock_kernel();
+		if (IS_ERR(ip))
 			return ERR_CAST(ip);
-		}
 	}
-	unlock_kernel();
 	d_add(dp, ip);
 	return NULL;
 }
@@ -248,8 +243,6 @@ vxfs_readdir(struct file *fp, void *retp, filldir_t filler)
 	u_long			page, npages, block, pblocks, nblocks, offset;
 	loff_t			pos;
 
-	lock_kernel();
-
 	switch ((long)fp->f_pos) {
 	case 0:
 		if (filler(retp, ".", 1, fp->f_pos, ip->i_ino, DT_DIR) < 0)
@@ -265,10 +258,8 @@ vxfs_readdir(struct file *fp, void *retp, filldir_t filler)
 
 	pos = fp->f_pos - 2;
 	
-	if (pos > VXFS_DIRROUND(ip->i_size)) {
-		unlock_kernel();
+	if (pos > VXFS_DIRROUND(ip->i_size))
 		return 0;
-	}
 
 	npages = dir_pages(ip);
 	nblocks = dir_blocks(ip);
@@ -327,6 +318,5 @@ vxfs_readdir(struct file *fp, void *retp, filldir_t filler)
 done:
 	fp->f_pos = ((page << PAGE_CACHE_SHIFT) | offset) + 2;
 out:
-	unlock_kernel();
 	return 0;
 }
diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c
index 1e8af939b3e..9d1c9955838 100644
--- a/fs/freevxfs/vxfs_super.c
+++ b/fs/freevxfs/vxfs_super.c
@@ -38,7 +38,6 @@
 #include <linux/buffer_head.h>
 #include <linux/kernel.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
 #include <linux/stat.h>
 #include <linux/vfs.h>
 #include <linux/mount.h>
@@ -61,7 +60,7 @@ static int		vxfs_statfs(struct dentry *, struct kstatfs *);
 static int		vxfs_remount(struct super_block *, int *, char *);
 
 static const struct super_operations vxfs_super_ops = {
-	.clear_inode =		vxfs_clear_inode,
+	.evict_inode =		vxfs_evict_inode,
 	.put_super =		vxfs_put_super,
 	.statfs =		vxfs_statfs,
 	.remount_fs =		vxfs_remount,
@@ -81,16 +80,12 @@ vxfs_put_super(struct super_block *sbp)
 {
 	struct vxfs_sb_info	*infp = VXFS_SBI(sbp);
 
-	lock_kernel();
-
 	vxfs_put_fake_inode(infp->vsi_fship);
 	vxfs_put_fake_inode(infp->vsi_ilist);
 	vxfs_put_fake_inode(infp->vsi_stilist);
 
 	brelse(infp->vsi_bp);
 	kfree(infp);
-
-	unlock_kernel();
 }
 
 /**
@@ -135,7 +130,7 @@ static int vxfs_remount(struct super_block *sb, int *flags, char *data)
 }
 
 /**
- * vxfs_read_super - read superblock into memory and initalize filesystem
+ * vxfs_read_super - read superblock into memory and initialize filesystem
  * @sbp:		VFS superblock (to fill)
  * @dp:			fs private mount data
  * @silent:		do not complain loudly when sth is wrong
@@ -148,7 +143,7 @@ static int vxfs_remount(struct super_block *sb, int *flags, char *data)
  *   The superblock on success, else %NULL.
  *
  * Locking:
- *   We are under the bkl and @sbp->s_lock.
+ *   We are under @sbp->s_lock.
  */
 static int vxfs_fill_super(struct super_block *sbp, void *dp, int silent)
 {
@@ -251,17 +246,16 @@ out:
 /*
  * The usual module blurb.
  */
-static int vxfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *vxfs_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, vxfs_fill_super,
-			   mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, vxfs_fill_super);
 }
 
 static struct file_system_type vxfs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "vxfs",
-	.get_sb		= vxfs_get_sb,
+	.mount		= vxfs_mount,
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index d5be1693ac9..3d06ccc953a 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -26,15 +26,9 @@
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
 #include <linux/buffer_head.h>
+#include <linux/tracepoint.h>
 #include "internal.h"
 
-#define inode_to_bdi(inode)	((inode)->i_mapping->backing_dev_info)
-
-/*
- * We don't actually have pdflush, but this one is exported though /proc...
- */
-int nr_pdflush_threads;
-
 /*
  * Passed into wb_writeback(), essentially a subset of writeback_control
  */
@@ -50,6 +44,19 @@ struct wb_writeback_work {
 	struct completion *done;	/* set if the caller waits */
 };
 
+/*
+ * Include the creation of the trace points after defining the
+ * wb_writeback_work structure so that the definition remains local to this
+ * file.
+ */
+#define CREATE_TRACE_POINTS
+#include <trace/events/writeback.h>
+
+/*
+ * We don't actually have pdflush, but this one is exported though /proc...
+ */
+int nr_pdflush_threads;
+
 /**
  * writeback_in_progress - determine whether there is writeback in progress
  * @bdi: the device's backing_dev_info structure.
@@ -59,28 +66,42 @@ struct wb_writeback_work {
  */
 int writeback_in_progress(struct backing_dev_info *bdi)
 {
-	return !list_empty(&bdi->work_list);
+	return test_bit(BDI_writeback_running, &bdi->state);
+}
+
+static inline struct backing_dev_info *inode_to_bdi(struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+
+	if (strcmp(sb->s_type->name, "bdev") == 0)
+		return inode->i_mapping->backing_dev_info;
+
+	return sb->s_bdi;
+}
+
+static inline struct inode *wb_inode(struct list_head *head)
+{
+	return list_entry(head, struct inode, i_wb_list);
 }
 
 static void bdi_queue_work(struct backing_dev_info *bdi,
 		struct wb_writeback_work *work)
 {
-	spin_lock(&bdi->wb_lock);
-	list_add_tail(&work->list, &bdi->work_list);
-	spin_unlock(&bdi->wb_lock);
+	trace_writeback_queue(bdi, work);
 
-	/*
-	 * If the default thread isn't there, make sure we add it. When
-	 * it gets created and wakes up, we'll run this work.
-	 */
-	if (unlikely(list_empty_careful(&bdi->wb_list)))
+	spin_lock_bh(&bdi->wb_lock);
+	list_add_tail(&work->list, &bdi->work_list);
+	if (bdi->wb.task) {
+		wake_up_process(bdi->wb.task);
+	} else {
+		/*
+		 * The bdi thread isn't there, wake up the forker thread which
+		 * will create and run it.
+		 */
+		trace_writeback_nothread(bdi, work);
 		wake_up_process(default_backing_dev_info.wb.task);
-	else {
-		struct bdi_writeback *wb = &bdi->wb;
-
-		if (wb->task)
-			wake_up_process(wb->task);
 	}
+	spin_unlock_bh(&bdi->wb_lock);
 }
 
 static void
@@ -95,8 +116,10 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
 	 */
 	work = kzalloc(sizeof(*work), GFP_ATOMIC);
 	if (!work) {
-		if (bdi->wb.task)
+		if (bdi->wb.task) {
+			trace_writeback_nowork(bdi);
 			wake_up_process(bdi->wb.task);
+		}
 		return;
 	}
 
@@ -154,11 +177,11 @@ static void redirty_tail(struct inode *inode)
 	if (!list_empty(&wb->b_dirty)) {
 		struct inode *tail;
 
-		tail = list_entry(wb->b_dirty.next, struct inode, i_list);
+		tail = wb_inode(wb->b_dirty.next);
 		if (time_before(inode->dirtied_when, tail->dirtied_when))
 			inode->dirtied_when = jiffies;
 	}
-	list_move(&inode->i_list, &wb->b_dirty);
+	list_move(&inode->i_wb_list, &wb->b_dirty);
 }
 
 /*
@@ -168,7 +191,7 @@ static void requeue_io(struct inode *inode)
 {
 	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
 
-	list_move(&inode->i_list, &wb->b_more_io);
+	list_move(&inode->i_wb_list, &wb->b_more_io);
 }
 
 static void inode_sync_complete(struct inode *inode)
@@ -209,14 +232,14 @@ static void move_expired_inodes(struct list_head *delaying_queue,
 	int do_sb_sort = 0;
 
 	while (!list_empty(delaying_queue)) {
-		inode = list_entry(delaying_queue->prev, struct inode, i_list);
+		inode = wb_inode(delaying_queue->prev);
 		if (older_than_this &&
 		    inode_dirtied_after(inode, *older_than_this))
 			break;
 		if (sb && sb != inode->i_sb)
 			do_sb_sort = 1;
 		sb = inode->i_sb;
-		list_move(&inode->i_list, &tmp);
+		list_move(&inode->i_wb_list, &tmp);
 	}
 
 	/* just one sb in list, splice to dispatch_queue and we're done */
@@ -227,22 +250,29 @@ static void move_expired_inodes(struct list_head *delaying_queue,
 
 	/* Move inodes from one superblock together */
 	while (!list_empty(&tmp)) {
-		inode = list_entry(tmp.prev, struct inode, i_list);
-		sb = inode->i_sb;
+		sb = wb_inode(tmp.prev)->i_sb;
 		list_for_each_prev_safe(pos, node, &tmp) {
-			inode = list_entry(pos, struct inode, i_list);
+			inode = wb_inode(pos);
 			if (inode->i_sb == sb)
-				list_move(&inode->i_list, dispatch_queue);
+				list_move(&inode->i_wb_list, dispatch_queue);
 		}
 	}
 }
 
 /*
  * Queue all expired dirty inodes for io, eldest first.
+ * Before
+ *         newly dirtied     b_dirty    b_io    b_more_io
+ *         =============>    gf         edc     BA
+ * After
+ *         newly dirtied     b_dirty    b_io    b_more_io
+ *         =============>    g          fBAedc
+ *                                           |
+ *                                           +--> dequeue for IO
  */
 static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
 {
-	list_splice_init(&wb->b_more_io, wb->b_io.prev);
+	list_splice_init(&wb->b_more_io, &wb->b_io);
 	move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
 }
 
@@ -352,73 +382,43 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 
 	spin_lock(&inode_lock);
 	inode->i_state &= ~I_SYNC;
-	if (!(inode->i_state & (I_FREEING | I_CLEAR))) {
-		if ((inode->i_state & I_DIRTY_PAGES) && wbc->for_kupdate) {
-			/*
-			 * More pages get dirtied by a fast dirtier.
-			 */
-			goto select_queue;
-		} else if (inode->i_state & I_DIRTY) {
-			/*
-			 * At least XFS will redirty the inode during the
-			 * writeback (delalloc) and on io completion (isize).
-			 */
-			redirty_tail(inode);
-		} else if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
+	if (!(inode->i_state & I_FREEING)) {
+		if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
 			/*
 			 * We didn't write back all the pages.  nfs_writepages()
-			 * sometimes bales out without doing anything. Redirty
-			 * the inode; Move it from b_io onto b_more_io/b_dirty.
+			 * sometimes bales out without doing anything.
 			 */
-			/*
-			 * akpm: if the caller was the kupdate function we put
-			 * this inode at the head of b_dirty so it gets first
-			 * consideration.  Otherwise, move it to the tail, for
-			 * the reasons described there.  I'm not really sure
-			 * how much sense this makes.  Presumably I had a good
-			 * reasons for doing it this way, and I'd rather not
-			 * muck with it at present.
-			 */
-			if (wbc->for_kupdate) {
+			inode->i_state |= I_DIRTY_PAGES;
+			if (wbc->nr_to_write <= 0) {
 				/*
-				 * For the kupdate function we move the inode
-				 * to b_more_io so it will get more writeout as
-				 * soon as the queue becomes uncongested.
+				 * slice used up: queue for next turn
 				 */
-				inode->i_state |= I_DIRTY_PAGES;
-select_queue:
-				if (wbc->nr_to_write <= 0) {
-					/*
-					 * slice used up: queue for next turn
-					 */
-					requeue_io(inode);
-				} else {
-					/*
-					 * somehow blocked: retry later
-					 */
-					redirty_tail(inode);
-				}
+				requeue_io(inode);
 			} else {
 				/*
-				 * Otherwise fully redirty the inode so that
-				 * other inodes on this superblock will get some
-				 * writeout.  Otherwise heavy writing to one
-				 * file would indefinitely suspend writeout of
-				 * all the other files.
+				 * Writeback blocked by something other than
+				 * congestion. Delay the inode for some time to
+				 * avoid spinning on the CPU (100% iowait)
+				 * retrying writeback of the dirty page/inode
+				 * that cannot be performed immediately.
 				 */
-				inode->i_state |= I_DIRTY_PAGES;
 				redirty_tail(inode);
 			}
-		} else if (atomic_read(&inode->i_count)) {
+		} else if (inode->i_state & I_DIRTY) {
 			/*
-			 * The inode is clean, inuse
+			 * Filesystems can dirty the inode during writeback
+			 * operations, such as delayed allocation during
+			 * submission or metadata updates after data IO
+			 * completion.
 			 */
-			list_move(&inode->i_list, &inode_in_use);
+			redirty_tail(inode);
 		} else {
 			/*
-			 * The inode is clean, unused
+			 * The inode is clean.  At this point we either have
+			 * a reference to the inode or it's on it's way out.
+			 * No need to add it back to the LRU.
 			 */
-			list_move(&inode->i_list, &inode_unused);
+			list_del_init(&inode->i_wb_list);
 		}
 	}
 	inode_sync_complete(inode);
@@ -466,8 +466,7 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
 {
 	while (!list_empty(&wb->b_io)) {
 		long pages_skipped;
-		struct inode *inode = list_entry(wb->b_io.prev,
-						 struct inode, i_list);
+		struct inode *inode = wb_inode(wb->b_io.prev);
 
 		if (inode->i_sb != sb) {
 			if (only_this_sb) {
@@ -488,10 +487,16 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
 			return 0;
 		}
 
-		if (inode->i_state & (I_NEW | I_WILL_FREE)) {
+		/*
+		 * Don't bother with new inodes or inodes beeing freed, first
+		 * kind does not need peridic writeout yet, and for the latter
+		 * kind writeout is handled by the freer.
+		 */
+		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
 			requeue_io(inode);
 			continue;
 		}
+
 		/*
 		 * Was this inode dirtied after sync_sb_inodes was called?
 		 * This keeps sync from extra jobs and livelock.
@@ -499,7 +504,6 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
 		if (inode_dirtied_after(inode, wbc->wb_start))
 			return 1;
 
-		BUG_ON(inode->i_state & (I_FREEING | I_CLEAR));
 		__iget(inode);
 		pages_skipped = wbc->pages_skipped;
 		writeback_single_inode(inode, wbc);
@@ -530,14 +534,14 @@ void writeback_inodes_wb(struct bdi_writeback *wb,
 {
 	int ret = 0;
 
-	wbc->wb_start = jiffies; /* livelock avoidance */
+	if (!wbc->wb_start)
+		wbc->wb_start = jiffies; /* livelock avoidance */
 	spin_lock(&inode_lock);
 	if (!wbc->for_kupdate || list_empty(&wb->b_io))
 		queue_io(wb, wbc->older_than_this);
 
 	while (!list_empty(&wb->b_io)) {
-		struct inode *inode = list_entry(wb->b_io.prev,
-						 struct inode, i_list);
+		struct inode *inode = wb_inode(wb->b_io.prev);
 		struct super_block *sb = inode->i_sb;
 
 		if (!pin_sb_for_writeback(sb)) {
@@ -559,7 +563,6 @@ static void __writeback_inodes_sb(struct super_block *sb,
 {
 	WARN_ON(!rwsem_is_locked(&sb->s_umount));
 
-	wbc->wb_start = jiffies; /* livelock avoidance */
 	spin_lock(&inode_lock);
 	if (!wbc->for_kupdate || list_empty(&wb->b_io))
 		queue_io(wb, wbc->older_than_this);
@@ -580,10 +583,10 @@ static inline bool over_bground_thresh(void)
 {
 	unsigned long background_thresh, dirty_thresh;
 
-	get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
+	global_dirty_limits(&background_thresh, &dirty_thresh);
 
 	return (global_page_state(NR_FILE_DIRTY) +
-		global_page_state(NR_UNSTABLE_NFS) >= background_thresh);
+		global_page_state(NR_UNSTABLE_NFS) > background_thresh);
 }
 
 /*
@@ -625,6 +628,7 @@ static long wb_writeback(struct bdi_writeback *wb,
 		wbc.range_end = LLONG_MAX;
 	}
 
+	wbc.wb_start = jiffies; /* livelock avoidance */
 	for (;;) {
 		/*
 		 * Stop writeback when nr_pages has been consumed
@@ -642,10 +646,14 @@ static long wb_writeback(struct bdi_writeback *wb,
 		wbc.more_io = 0;
 		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
 		wbc.pages_skipped = 0;
+
+		trace_wbc_writeback_start(&wbc, wb->bdi);
 		if (work->sb)
 			__writeback_inodes_sb(work->sb, wb, &wbc);
 		else
 			writeback_inodes_wb(wb, &wbc);
+		trace_wbc_writeback_written(&wbc, wb->bdi);
+
 		work->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
 		wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
 
@@ -671,8 +679,8 @@ static long wb_writeback(struct bdi_writeback *wb,
 		 */
 		spin_lock(&inode_lock);
 		if (!list_empty(&wb->b_more_io))  {
-			inode = list_entry(wb->b_more_io.prev,
-						struct inode, i_list);
+			inode = wb_inode(wb->b_more_io.prev);
+			trace_wbc_writeback_wait(&wbc, wb->bdi);
 			inode_wait_for_writeback(inode);
 		}
 		spin_unlock(&inode_lock);
@@ -685,20 +693,31 @@ static long wb_writeback(struct bdi_writeback *wb,
  * Return the next wb_writeback_work struct that hasn't been processed yet.
  */
 static struct wb_writeback_work *
-get_next_work_item(struct backing_dev_info *bdi, struct bdi_writeback *wb)
+get_next_work_item(struct backing_dev_info *bdi)
 {
 	struct wb_writeback_work *work = NULL;
 
-	spin_lock(&bdi->wb_lock);
+	spin_lock_bh(&bdi->wb_lock);
 	if (!list_empty(&bdi->work_list)) {
 		work = list_entry(bdi->work_list.next,
 				  struct wb_writeback_work, list);
 		list_del_init(&work->list);
 	}
-	spin_unlock(&bdi->wb_lock);
+	spin_unlock_bh(&bdi->wb_lock);
 	return work;
 }
 
+/*
+ * Add in the number of potentially dirty inodes, because each inode
+ * write can dirty pagecache in the underlying blockdev.
+ */
+static unsigned long get_nr_dirty_pages(void)
+{
+	return global_page_state(NR_FILE_DIRTY) +
+		global_page_state(NR_UNSTABLE_NFS) +
+		get_nr_dirty_inodes();
+}
+
 static long wb_check_old_data_flush(struct bdi_writeback *wb)
 {
 	unsigned long expired;
@@ -716,9 +735,7 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
 		return 0;
 
 	wb->last_old_flush = jiffies;
-	nr_pages = global_page_state(NR_FILE_DIRTY) +
-			global_page_state(NR_UNSTABLE_NFS) +
-			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
+	nr_pages = get_nr_dirty_pages();
 
 	if (nr_pages) {
 		struct wb_writeback_work work = {
@@ -743,7 +760,8 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
 	struct wb_writeback_work *work;
 	long wrote = 0;
 
-	while ((work = get_next_work_item(bdi, wb)) != NULL) {
+	set_bit(BDI_writeback_running, &wb->bdi->state);
+	while ((work = get_next_work_item(bdi)) != NULL) {
 		/*
 		 * Override sync mode, in case we must wait for completion
 		 * because this thread is exiting now.
@@ -751,6 +769,8 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
 		if (force_wait)
 			work->sync_mode = WB_SYNC_ALL;
 
+		trace_writeback_exec(bdi, work);
+
 		wrote += wb_writeback(wb, work);
 
 		/*
@@ -767,6 +787,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
 	 * Check for periodic writeback, kupdated() style
 	 */
 	wrote += wb_check_old_data_flush(wb);
+	clear_bit(BDI_writeback_running, &wb->bdi->state);
 
 	return wrote;
 }
@@ -775,47 +796,66 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
  * Handle writeback of dirty data for the device backed by this bdi. Also
  * wakes up periodically and does kupdated style flushing.
  */
-int bdi_writeback_task(struct bdi_writeback *wb)
+int bdi_writeback_thread(void *data)
 {
-	unsigned long last_active = jiffies;
-	unsigned long wait_jiffies = -1UL;
+	struct bdi_writeback *wb = data;
+	struct backing_dev_info *bdi = wb->bdi;
 	long pages_written;
 
+	current->flags |= PF_SWAPWRITE;
+	set_freezable();
+	wb->last_active = jiffies;
+
+	/*
+	 * Our parent may run at a different priority, just set us to normal
+	 */
+	set_user_nice(current, 0);
+
+	trace_writeback_thread_start(bdi);
+
 	while (!kthread_should_stop()) {
+		/*
+		 * Remove own delayed wake-up timer, since we are already awake
+		 * and we'll take care of the preriodic write-back.
+		 */
+		del_timer(&wb->wakeup_timer);
+
 		pages_written = wb_do_writeback(wb, 0);
 
+		trace_writeback_pages_written(pages_written);
+
 		if (pages_written)
-			last_active = jiffies;
-		else if (wait_jiffies != -1UL) {
-			unsigned long max_idle;
+			wb->last_active = jiffies;
 
-			/*
-			 * Longest period of inactivity that we tolerate. If we
-			 * see dirty data again later, the task will get
-			 * recreated automatically.
-			 */
-			max_idle = max(5UL * 60 * HZ, wait_jiffies);
-			if (time_after(jiffies, max_idle + last_active))
-				break;
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (!list_empty(&bdi->work_list) || kthread_should_stop()) {
+			__set_current_state(TASK_RUNNING);
+			continue;
 		}
 
-		if (dirty_writeback_interval) {
-			wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
-			schedule_timeout_interruptible(wait_jiffies);
-		} else {
-			set_current_state(TASK_INTERRUPTIBLE);
-			if (list_empty_careful(&wb->bdi->work_list) &&
-			    !kthread_should_stop())
-				schedule();
-			__set_current_state(TASK_RUNNING);
+		if (wb_has_dirty_io(wb) && dirty_writeback_interval)
+			schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
+		else {
+			/*
+			 * We have nothing to do, so can go sleep without any
+			 * timeout and save power. When a work is queued or
+			 * something is made dirty - we will be woken up.
+			 */
+			schedule();
 		}
 
 		try_to_freeze();
 	}
 
+	/* Flush any work that raced with us exiting */
+	if (!list_empty(&bdi->work_list))
+		wb_do_writeback(wb, 1);
+
+	trace_writeback_thread_stop(bdi);
 	return 0;
 }
 
+
 /*
  * Start writeback of `nr_pages' pages.  If `nr_pages' is zero, write back
  * the whole world.
@@ -890,6 +930,8 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode)
 void __mark_inode_dirty(struct inode *inode, int flags)
 {
 	struct super_block *sb = inode->i_sb;
+	struct backing_dev_info *bdi = NULL;
+	bool wakeup_bdi = false;
 
 	/*
 	 * Don't do this for I_DIRTY_PAGES - that doesn't actually
@@ -932,10 +974,10 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 		 * dirty list.  Add blockdev inodes as well.
 		 */
 		if (!S_ISBLK(inode->i_mode)) {
-			if (hlist_unhashed(&inode->i_hash))
+			if (inode_unhashed(inode))
 				goto out;
 		}
-		if (inode->i_state & (I_FREEING|I_CLEAR))
+		if (inode->i_state & I_FREEING)
 			goto out;
 
 		/*
@@ -943,22 +985,31 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 		 * reposition it (that would break b_dirty time-ordering).
 		 */
 		if (!was_dirty) {
-			struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
-			struct backing_dev_info *bdi = wb->bdi;
-
-			if (bdi_cap_writeback_dirty(bdi) &&
-			    !test_bit(BDI_registered, &bdi->state)) {
-				WARN_ON(1);
-				printk(KERN_ERR "bdi-%s not registered\n",
-								bdi->name);
+			bdi = inode_to_bdi(inode);
+
+			if (bdi_cap_writeback_dirty(bdi)) {
+				WARN(!test_bit(BDI_registered, &bdi->state),
+				     "bdi-%s not registered\n", bdi->name);
+
+				/*
+				 * If this is the first dirty inode for this
+				 * bdi, we have to wake-up the corresponding
+				 * bdi thread to make sure background
+				 * write-back happens later.
+				 */
+				if (!wb_has_dirty_io(&bdi->wb))
+					wakeup_bdi = true;
 			}
 
 			inode->dirtied_when = jiffies;
-			list_move(&inode->i_list, &wb->b_dirty);
+			list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
 		}
 	}
 out:
 	spin_unlock(&inode_lock);
+
+	if (wakeup_bdi)
+		bdi_wakeup_thread_delayed(bdi);
 }
 EXPORT_SYMBOL(__mark_inode_dirty);
 
@@ -1001,7 +1052,7 @@ static void wait_sb_inodes(struct super_block *sb)
 	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
 		struct address_space *mapping;
 
-		if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
+		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW))
 			continue;
 		mapping = inode->i_mapping;
 		if (mapping->nrpages == 0)
@@ -1030,33 +1081,42 @@ static void wait_sb_inodes(struct super_block *sb)
 }
 
 /**
- * writeback_inodes_sb	-	writeback dirty inodes from given super_block
+ * writeback_inodes_sb_nr -	writeback dirty inodes from given super_block
  * @sb: the superblock
+ * @nr: the number of pages to write
  *
  * Start writeback on some inodes on this super_block. No guarantees are made
  * on how many (if any) will be written, and this function does not wait
- * for IO completion of submitted IO. The number of pages submitted is
- * returned.
+ * for IO completion of submitted IO.
  */
-void writeback_inodes_sb(struct super_block *sb)
+void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr)
 {
-	unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
-	unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
 	DECLARE_COMPLETION_ONSTACK(done);
 	struct wb_writeback_work work = {
 		.sb		= sb,
 		.sync_mode	= WB_SYNC_NONE,
 		.done		= &done,
+		.nr_pages	= nr,
 	};
 
 	WARN_ON(!rwsem_is_locked(&sb->s_umount));
-
-	work.nr_pages = nr_dirty + nr_unstable +
-			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
-
 	bdi_queue_work(sb->s_bdi, &work);
 	wait_for_completion(&done);
 }
+EXPORT_SYMBOL(writeback_inodes_sb_nr);
+
+/**
+ * writeback_inodes_sb	-	writeback dirty inodes from given super_block
+ * @sb: the superblock
+ *
+ * Start writeback on some inodes on this super_block. No guarantees are made
+ * on how many (if any) will be written, and this function does not wait
+ * for IO completion of submitted IO.
+ */
+void writeback_inodes_sb(struct super_block *sb)
+{
+	return writeback_inodes_sb_nr(sb, get_nr_dirty_pages());
+}
 EXPORT_SYMBOL(writeback_inodes_sb);
 
 /**
@@ -1079,6 +1139,27 @@ int writeback_inodes_sb_if_idle(struct super_block *sb)
 EXPORT_SYMBOL(writeback_inodes_sb_if_idle);
 
 /**
+ * writeback_inodes_sb_if_idle	-	start writeback if none underway
+ * @sb: the superblock
+ * @nr: the number of pages to write
+ *
+ * Invoke writeback_inodes_sb if no writeback is currently underway.
+ * Returns 1 if writeback was started, 0 if not.
+ */
+int writeback_inodes_sb_nr_if_idle(struct super_block *sb,
+				   unsigned long nr)
+{
+	if (!writeback_in_progress(sb->s_bdi)) {
+		down_read(&sb->s_umount);
+		writeback_inodes_sb_nr(sb, nr);
+		up_read(&sb->s_umount);
+		return 1;
+	} else
+		return 0;
+}
+EXPORT_SYMBOL(writeback_inodes_sb_nr_if_idle);
+
+/**
  * sync_inodes_sb	-	sync sb inode pages
  * @sb: the superblock
  *
@@ -1159,3 +1240,23 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc)
 	return ret;
 }
 EXPORT_SYMBOL(sync_inode);
+
+/**
+ * sync_inode - write an inode to disk
+ * @inode: the inode to sync
+ * @wait: wait for I/O to complete.
+ *
+ * Write an inode to disk and adjust it's dirty state after completion.
+ *
+ * Note: only writes the actual inode, no associated data or other metadata.
+ */
+int sync_inode_metadata(struct inode *inode, int wait)
+{
+	struct writeback_control wbc = {
+		.sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
+		.nr_to_write = 0, /* metadata-only */
+	};
+
+	return sync_inode(inode, &wbc);
+}
+EXPORT_SYMBOL(sync_inode_metadata);
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index eee059052db..ed45a9cf5f3 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -13,11 +13,11 @@ void set_fs_root(struct fs_struct *fs, struct path *path)
 {
 	struct path old_root;
 
-	write_lock(&fs->lock);
+	spin_lock(&fs->lock);
 	old_root = fs->root;
 	fs->root = *path;
 	path_get(path);
-	write_unlock(&fs->lock);
+	spin_unlock(&fs->lock);
 	if (old_root.dentry)
 		path_put(&old_root);
 }
@@ -30,11 +30,11 @@ void set_fs_pwd(struct fs_struct *fs, struct path *path)
 {
 	struct path old_pwd;
 
-	write_lock(&fs->lock);
+	spin_lock(&fs->lock);
 	old_pwd = fs->pwd;
 	fs->pwd = *path;
 	path_get(path);
-	write_unlock(&fs->lock);
+	spin_unlock(&fs->lock);
 
 	if (old_pwd.dentry)
 		path_put(&old_pwd);
@@ -51,7 +51,7 @@ void chroot_fs_refs(struct path *old_root, struct path *new_root)
 		task_lock(p);
 		fs = p->fs;
 		if (fs) {
-			write_lock(&fs->lock);
+			spin_lock(&fs->lock);
 			if (fs->root.dentry == old_root->dentry
 			    && fs->root.mnt == old_root->mnt) {
 				path_get(new_root);
@@ -64,7 +64,7 @@ void chroot_fs_refs(struct path *old_root, struct path *new_root)
 				fs->pwd = *new_root;
 				count++;
 			}
-			write_unlock(&fs->lock);
+			spin_unlock(&fs->lock);
 		}
 		task_unlock(p);
 	} while_each_thread(g, p);
@@ -87,10 +87,10 @@ void exit_fs(struct task_struct *tsk)
 	if (fs) {
 		int kill;
 		task_lock(tsk);
-		write_lock(&fs->lock);
+		spin_lock(&fs->lock);
 		tsk->fs = NULL;
 		kill = !--fs->users;
-		write_unlock(&fs->lock);
+		spin_unlock(&fs->lock);
 		task_unlock(tsk);
 		if (kill)
 			free_fs_struct(fs);
@@ -104,14 +104,9 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old)
 	if (fs) {
 		fs->users = 1;
 		fs->in_exec = 0;
-		rwlock_init(&fs->lock);
+		spin_lock_init(&fs->lock);
 		fs->umask = old->umask;
-		read_lock(&old->lock);
-		fs->root = old->root;
-		path_get(&old->root);
-		fs->pwd = old->pwd;
-		path_get(&old->pwd);
-		read_unlock(&old->lock);
+		get_fs_root_and_pwd(old, &fs->root, &fs->pwd);
 	}
 	return fs;
 }
@@ -126,10 +121,10 @@ int unshare_fs_struct(void)
 		return -ENOMEM;
 
 	task_lock(current);
-	write_lock(&fs->lock);
+	spin_lock(&fs->lock);
 	kill = !--fs->users;
 	current->fs = new_fs;
-	write_unlock(&fs->lock);
+	spin_unlock(&fs->lock);
 	task_unlock(current);
 
 	if (kill)
@@ -148,7 +143,7 @@ EXPORT_SYMBOL(current_umask);
 /* to be mentioned only in INIT_TASK */
 struct fs_struct init_fs = {
 	.users		= 1,
-	.lock		= __RW_LOCK_UNLOCKED(init_fs.lock),
+	.lock		= __SPIN_LOCK_UNLOCKED(init_fs.lock),
 	.umask		= 0022,
 };
 
@@ -161,14 +156,14 @@ void daemonize_fs_struct(void)
 
 		task_lock(current);
 
-		write_lock(&init_fs.lock);
+		spin_lock(&init_fs.lock);
 		init_fs.users++;
-		write_unlock(&init_fs.lock);
+		spin_unlock(&init_fs.lock);
 
-		write_lock(&fs->lock);
+		spin_lock(&fs->lock);
 		current->fs = &init_fs;
 		kill = !--fs->users;
-		write_unlock(&fs->lock);
+		spin_unlock(&fs->lock);
 
 		task_unlock(current);
 		if (kill)
diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig
index cc94bb9563f..3f6dfa98988 100644
--- a/fs/fscache/Kconfig
+++ b/fs/fscache/Kconfig
@@ -1,7 +1,6 @@
 
 config FSCACHE
 	tristate "General filesystem local caching manager"
-	select SLOW_WORK
 	help
 	  This option enables a generic filesystem caching manager that can be
 	  used by various network and other filesystems to cache data locally.
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
index edd7434ab6e..f6aad48d38a 100644
--- a/fs/fscache/internal.h
+++ b/fs/fscache/internal.h
@@ -82,6 +82,14 @@ extern unsigned fscache_defer_lookup;
 extern unsigned fscache_defer_create;
 extern unsigned fscache_debug;
 extern struct kobject *fscache_root;
+extern struct workqueue_struct *fscache_object_wq;
+extern struct workqueue_struct *fscache_op_wq;
+DECLARE_PER_CPU(wait_queue_head_t, fscache_object_cong_wait);
+
+static inline bool fscache_object_congested(void)
+{
+	return workqueue_congested(WORK_CPU_UNBOUND, fscache_object_wq);
+}
 
 extern int fscache_wait_bit(void *);
 extern int fscache_wait_bit_interruptible(void *);
@@ -313,17 +321,11 @@ void fscache_put_context(struct fscache_cookie *cookie, void *context)
 #define dbgprintk(FMT, ...) \
 	printk(KERN_DEBUG "[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__)
 
-/* make sure we maintain the format strings, even when debugging is disabled */
-static inline __attribute__((format(printf, 1, 2)))
-void _dbprintk(const char *fmt, ...)
-{
-}
-
 #define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
 #define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
 #define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__)
 
-#define kjournal(FMT, ...) _dbprintk(FMT, ##__VA_ARGS__)
+#define kjournal(FMT, ...) no_printk(FMT, ##__VA_ARGS__)
 
 #ifdef __KDEBUG
 #define _enter(FMT, ...) kenter(FMT, ##__VA_ARGS__)
@@ -350,9 +352,9 @@ do {						\
 } while (0)
 
 #else
-#define _enter(FMT, ...) _dbprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
-#define _leave(FMT, ...) _dbprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
-#define _debug(FMT, ...) _dbprintk(FMT, ##__VA_ARGS__)
+#define _enter(FMT, ...) no_printk("==> %s("FMT")", __func__, ##__VA_ARGS__)
+#define _leave(FMT, ...) no_printk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
+#define _debug(FMT, ...) no_printk(FMT, ##__VA_ARGS__)
 #endif
 
 /*
diff --git a/fs/fscache/main.c b/fs/fscache/main.c
index add6bdb53f0..f9d856773f7 100644
--- a/fs/fscache/main.c
+++ b/fs/fscache/main.c
@@ -15,6 +15,7 @@
 #include <linux/sched.h>
 #include <linux/completion.h>
 #include <linux/slab.h>
+#include <linux/seq_file.h>
 #include "internal.h"
 
 MODULE_DESCRIPTION("FS Cache Manager");
@@ -40,22 +41,105 @@ MODULE_PARM_DESC(fscache_debug,
 		 "FS-Cache debugging mask");
 
 struct kobject *fscache_root;
+struct workqueue_struct *fscache_object_wq;
+struct workqueue_struct *fscache_op_wq;
+
+DEFINE_PER_CPU(wait_queue_head_t, fscache_object_cong_wait);
+
+/* these values serve as lower bounds, will be adjusted in fscache_init() */
+static unsigned fscache_object_max_active = 4;
+static unsigned fscache_op_max_active = 2;
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table_header *fscache_sysctl_header;
+
+static int fscache_max_active_sysctl(struct ctl_table *table, int write,
+				     void __user *buffer,
+				     size_t *lenp, loff_t *ppos)
+{
+	struct workqueue_struct **wqp = table->extra1;
+	unsigned int *datap = table->data;
+	int ret;
+
+	ret = proc_dointvec(table, write, buffer, lenp, ppos);
+	if (ret == 0)
+		workqueue_set_max_active(*wqp, *datap);
+	return ret;
+}
+
+ctl_table fscache_sysctls[] = {
+	{
+		.procname	= "object_max_active",
+		.data		= &fscache_object_max_active,
+		.maxlen		= sizeof(unsigned),
+		.mode		= 0644,
+		.proc_handler	= fscache_max_active_sysctl,
+		.extra1		= &fscache_object_wq,
+	},
+	{
+		.procname	= "operation_max_active",
+		.data		= &fscache_op_max_active,
+		.maxlen		= sizeof(unsigned),
+		.mode		= 0644,
+		.proc_handler	= fscache_max_active_sysctl,
+		.extra1		= &fscache_op_wq,
+	},
+	{}
+};
+
+ctl_table fscache_sysctls_root[] = {
+	{
+		.procname	= "fscache",
+		.mode		= 0555,
+		.child		= fscache_sysctls,
+	},
+	{}
+};
+#endif
 
 /*
  * initialise the fs caching module
  */
 static int __init fscache_init(void)
 {
+	unsigned int nr_cpus = num_possible_cpus();
+	unsigned int cpu;
 	int ret;
 
-	ret = slow_work_register_user(THIS_MODULE);
-	if (ret < 0)
-		goto error_slow_work;
+	fscache_object_max_active =
+		clamp_val(nr_cpus,
+			  fscache_object_max_active, WQ_UNBOUND_MAX_ACTIVE);
+
+	ret = -ENOMEM;
+	fscache_object_wq = alloc_workqueue("fscache_object", WQ_UNBOUND,
+					    fscache_object_max_active);
+	if (!fscache_object_wq)
+		goto error_object_wq;
+
+	fscache_op_max_active =
+		clamp_val(fscache_object_max_active / 2,
+			  fscache_op_max_active, WQ_UNBOUND_MAX_ACTIVE);
+
+	ret = -ENOMEM;
+	fscache_op_wq = alloc_workqueue("fscache_operation", WQ_UNBOUND,
+					fscache_op_max_active);
+	if (!fscache_op_wq)
+		goto error_op_wq;
+
+	for_each_possible_cpu(cpu)
+		init_waitqueue_head(&per_cpu(fscache_object_cong_wait, cpu));
 
 	ret = fscache_proc_init();
 	if (ret < 0)
 		goto error_proc;
 
+#ifdef CONFIG_SYSCTL
+	ret = -ENOMEM;
+	fscache_sysctl_header = register_sysctl_table(fscache_sysctls_root);
+	if (!fscache_sysctl_header)
+		goto error_sysctl;
+#endif
+
 	fscache_cookie_jar = kmem_cache_create("fscache_cookie_jar",
 					       sizeof(struct fscache_cookie),
 					       0,
@@ -78,10 +162,16 @@ static int __init fscache_init(void)
 error_kobj:
 	kmem_cache_destroy(fscache_cookie_jar);
 error_cookie_jar:
+#ifdef CONFIG_SYSCTL
+	unregister_sysctl_table(fscache_sysctl_header);
+error_sysctl:
+#endif
 	fscache_proc_cleanup();
 error_proc:
-	slow_work_unregister_user(THIS_MODULE);
-error_slow_work:
+	destroy_workqueue(fscache_op_wq);
+error_op_wq:
+	destroy_workqueue(fscache_object_wq);
+error_object_wq:
 	return ret;
 }
 
@@ -96,8 +186,12 @@ static void __exit fscache_exit(void)
 
 	kobject_put(fscache_root);
 	kmem_cache_destroy(fscache_cookie_jar);
+#ifdef CONFIG_SYSCTL
+	unregister_sysctl_table(fscache_sysctl_header);
+#endif
 	fscache_proc_cleanup();
-	slow_work_unregister_user(THIS_MODULE);
+	destroy_workqueue(fscache_op_wq);
+	destroy_workqueue(fscache_object_wq);
 	printk(KERN_NOTICE "FS-Cache: Unloaded\n");
 }
 
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
index 4a8eb31c533..ebe29c58138 100644
--- a/fs/fscache/object-list.c
+++ b/fs/fscache/object-list.c
@@ -34,8 +34,8 @@ struct fscache_objlist_data {
 #define FSCACHE_OBJLIST_CONFIG_NOREADS	0x00000200	/* show objects without active reads */
 #define FSCACHE_OBJLIST_CONFIG_EVENTS	0x00000400	/* show objects with events */
 #define FSCACHE_OBJLIST_CONFIG_NOEVENTS	0x00000800	/* show objects without no events */
-#define FSCACHE_OBJLIST_CONFIG_WORK	0x00001000	/* show objects with slow work */
-#define FSCACHE_OBJLIST_CONFIG_NOWORK	0x00002000	/* show objects without slow work */
+#define FSCACHE_OBJLIST_CONFIG_WORK	0x00001000	/* show objects with work */
+#define FSCACHE_OBJLIST_CONFIG_NOWORK	0x00002000	/* show objects without work */
 
 	u8		buf[512];	/* key and aux data buffer */
 };
@@ -231,12 +231,11 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
 		       READS, NOREADS);
 		FILTER(obj->events & obj->event_mask,
 		       EVENTS, NOEVENTS);
-		FILTER(obj->work.flags & ~(1UL << SLOW_WORK_VERY_SLOW),
-		       WORK, NOWORK);
+		FILTER(work_busy(&obj->work), WORK, NOWORK);
 	}
 
 	seq_printf(m,
-		   "%8x %8x %s %5u %3u %3u %3u %2u %5u %2lx %2lx %1lx %1lx | ",
+		   "%8x %8x %s %5u %3u %3u %3u %2u %5u %2lx %2lx %1lx %1x | ",
 		   obj->debug_id,
 		   obj->parent ? obj->parent->debug_id : -1,
 		   fscache_object_states_short[obj->state],
@@ -249,7 +248,7 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
 		   obj->event_mask & FSCACHE_OBJECT_EVENTS_MASK,
 		   obj->events,
 		   obj->flags,
-		   obj->work.flags);
+		   work_busy(&obj->work));
 
 	no_cookie = true;
 	keylen = auxlen = 0;
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index 0b589a9b4ff..b6b897c550a 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -14,7 +14,6 @@
 
 #define FSCACHE_DEBUG_LEVEL COOKIE
 #include <linux/module.h>
-#include <linux/seq_file.h>
 #include "internal.h"
 
 const char *fscache_object_states[FSCACHE_OBJECT__NSTATES] = {
@@ -50,12 +49,8 @@ const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5] = {
 	[FSCACHE_OBJECT_DEAD]		= "DEAD",
 };
 
-static void fscache_object_slow_work_put_ref(struct slow_work *);
-static int  fscache_object_slow_work_get_ref(struct slow_work *);
-static void fscache_object_slow_work_execute(struct slow_work *);
-#ifdef CONFIG_SLOW_WORK_DEBUG
-static void fscache_object_slow_work_desc(struct slow_work *, struct seq_file *);
-#endif
+static int  fscache_get_object(struct fscache_object *);
+static void fscache_put_object(struct fscache_object *);
 static void fscache_initialise_object(struct fscache_object *);
 static void fscache_lookup_object(struct fscache_object *);
 static void fscache_object_available(struct fscache_object *);
@@ -64,17 +59,6 @@ static void fscache_withdraw_object(struct fscache_object *);
 static void fscache_enqueue_dependents(struct fscache_object *);
 static void fscache_dequeue_object(struct fscache_object *);
 
-const struct slow_work_ops fscache_object_slow_work_ops = {
-	.owner		= THIS_MODULE,
-	.get_ref	= fscache_object_slow_work_get_ref,
-	.put_ref	= fscache_object_slow_work_put_ref,
-	.execute	= fscache_object_slow_work_execute,
-#ifdef CONFIG_SLOW_WORK_DEBUG
-	.desc		= fscache_object_slow_work_desc,
-#endif
-};
-EXPORT_SYMBOL(fscache_object_slow_work_ops);
-
 /*
  * we need to notify the parent when an op completes that we had outstanding
  * upon it
@@ -345,7 +329,7 @@ unsupported_event:
 /*
  * execute an object
  */
-static void fscache_object_slow_work_execute(struct slow_work *work)
+void fscache_object_work_func(struct work_struct *work)
 {
 	struct fscache_object *object =
 		container_of(work, struct fscache_object, work);
@@ -359,23 +343,9 @@ static void fscache_object_slow_work_execute(struct slow_work *work)
 	if (object->events & object->event_mask)
 		fscache_enqueue_object(object);
 	clear_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
+	fscache_put_object(object);
 }
-
-/*
- * describe an object for slow-work debugging
- */
-#ifdef CONFIG_SLOW_WORK_DEBUG
-static void fscache_object_slow_work_desc(struct slow_work *work,
-					  struct seq_file *m)
-{
-	struct fscache_object *object =
-		container_of(work, struct fscache_object, work);
-
-	seq_printf(m, "FSC: OBJ%x: %s",
-		   object->debug_id,
-		   fscache_object_states_short[object->state]);
-}
-#endif
+EXPORT_SYMBOL(fscache_object_work_func);
 
 /*
  * initialise an object
@@ -393,7 +363,6 @@ static void fscache_initialise_object(struct fscache_object *object)
 	_enter("");
 	ASSERT(object->cookie != NULL);
 	ASSERT(object->cookie->parent != NULL);
-	ASSERT(list_empty(&object->work.link));
 
 	if (object->events & ((1 << FSCACHE_OBJECT_EV_ERROR) |
 			      (1 << FSCACHE_OBJECT_EV_RELEASE) |
@@ -671,10 +640,8 @@ static void fscache_drop_object(struct fscache_object *object)
 		object->parent = NULL;
 	}
 
-	/* this just shifts the object release to the slow work processor */
-	fscache_stat(&fscache_n_cop_put_object);
-	object->cache->ops->put_object(object);
-	fscache_stat_d(&fscache_n_cop_put_object);
+	/* this just shifts the object release to the work processor */
+	fscache_put_object(object);
 
 	_leave("");
 }
@@ -758,12 +725,10 @@ void fscache_withdrawing_object(struct fscache_cache *cache,
 }
 
 /*
- * allow the slow work item processor to get a ref on an object
+ * get a ref on an object
  */
-static int fscache_object_slow_work_get_ref(struct slow_work *work)
+static int fscache_get_object(struct fscache_object *object)
 {
-	struct fscache_object *object =
-		container_of(work, struct fscache_object, work);
 	int ret;
 
 	fscache_stat(&fscache_n_cop_grab_object);
@@ -773,13 +738,10 @@ static int fscache_object_slow_work_get_ref(struct slow_work *work)
 }
 
 /*
- * allow the slow work item processor to discard a ref on a work item
+ * discard a ref on a work item
  */
-static void fscache_object_slow_work_put_ref(struct slow_work *work)
+static void fscache_put_object(struct fscache_object *object)
 {
-	struct fscache_object *object =
-		container_of(work, struct fscache_object, work);
-
 	fscache_stat(&fscache_n_cop_put_object);
 	object->cache->ops->put_object(object);
 	fscache_stat_d(&fscache_n_cop_put_object);
@@ -792,8 +754,48 @@ void fscache_enqueue_object(struct fscache_object *object)
 {
 	_enter("{OBJ%x}", object->debug_id);
 
-	slow_work_enqueue(&object->work);
+	if (fscache_get_object(object) >= 0) {
+		wait_queue_head_t *cong_wq =
+			&get_cpu_var(fscache_object_cong_wait);
+
+		if (queue_work(fscache_object_wq, &object->work)) {
+			if (fscache_object_congested())
+				wake_up(cong_wq);
+		} else
+			fscache_put_object(object);
+
+		put_cpu_var(fscache_object_cong_wait);
+	}
+}
+
+/**
+ * fscache_object_sleep_till_congested - Sleep until object wq is congested
+ * @timoutp: Scheduler sleep timeout
+ *
+ * Allow an object handler to sleep until the object workqueue is congested.
+ *
+ * The caller must set up a wake up event before calling this and must have set
+ * the appropriate sleep mode (such as TASK_UNINTERRUPTIBLE) and tested its own
+ * condition before calling this function as no test is made here.
+ *
+ * %true is returned if the object wq is congested, %false otherwise.
+ */
+bool fscache_object_sleep_till_congested(signed long *timeoutp)
+{
+	wait_queue_head_t *cong_wq = &__get_cpu_var(fscache_object_cong_wait);
+	DEFINE_WAIT(wait);
+
+	if (fscache_object_congested())
+		return true;
+
+	add_wait_queue_exclusive(cong_wq, &wait);
+	if (!fscache_object_congested())
+		*timeoutp = schedule_timeout(*timeoutp);
+	finish_wait(cong_wq, &wait);
+
+	return fscache_object_congested();
 }
+EXPORT_SYMBOL_GPL(fscache_object_sleep_till_congested);
 
 /*
  * enqueue the dependents of an object for metadata-type processing
@@ -819,9 +821,7 @@ static void fscache_enqueue_dependents(struct fscache_object *object)
 
 		/* sort onto appropriate lists */
 		fscache_enqueue_object(dep);
-		fscache_stat(&fscache_n_cop_put_object);
-		dep->cache->ops->put_object(dep);
-		fscache_stat_d(&fscache_n_cop_put_object);
+		fscache_put_object(dep);
 
 		if (!list_empty(&object->dependents))
 			cond_resched_lock(&object->lock);
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index f17cecafae4..b9f34eaede0 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -42,16 +42,12 @@ void fscache_enqueue_operation(struct fscache_operation *op)
 
 	fscache_stat(&fscache_n_op_enqueue);
 	switch (op->flags & FSCACHE_OP_TYPE) {
-	case FSCACHE_OP_FAST:
-		_debug("queue fast");
+	case FSCACHE_OP_ASYNC:
+		_debug("queue async");
 		atomic_inc(&op->usage);
-		if (!schedule_work(&op->fast_work))
+		if (!queue_work(fscache_op_wq, &op->work))
 			fscache_put_operation(op);
 		break;
-	case FSCACHE_OP_SLOW:
-		_debug("queue slow");
-		slow_work_enqueue(&op->slow_work);
-		break;
 	case FSCACHE_OP_MYTHREAD:
 		_debug("queue for caller's attention");
 		break;
@@ -455,36 +451,13 @@ void fscache_operation_gc(struct work_struct *work)
 }
 
 /*
- * allow the slow work item processor to get a ref on an operation
- */
-static int fscache_op_get_ref(struct slow_work *work)
-{
-	struct fscache_operation *op =
-		container_of(work, struct fscache_operation, slow_work);
-
-	atomic_inc(&op->usage);
-	return 0;
-}
-
-/*
- * allow the slow work item processor to discard a ref on an operation
- */
-static void fscache_op_put_ref(struct slow_work *work)
-{
-	struct fscache_operation *op =
-		container_of(work, struct fscache_operation, slow_work);
-
-	fscache_put_operation(op);
-}
-
-/*
- * execute an operation using the slow thread pool to provide processing context
- * - the caller holds a ref to this object, so we don't need to hold one
+ * execute an operation using fs_op_wq to provide processing context -
+ * the caller holds a ref to this object, so we don't need to hold one
  */
-static void fscache_op_execute(struct slow_work *work)
+void fscache_op_work_func(struct work_struct *work)
 {
 	struct fscache_operation *op =
-		container_of(work, struct fscache_operation, slow_work);
+		container_of(work, struct fscache_operation, work);
 	unsigned long start;
 
 	_enter("{OBJ%x OP%x,%d}",
@@ -494,31 +467,7 @@ static void fscache_op_execute(struct slow_work *work)
 	start = jiffies;
 	op->processor(op);
 	fscache_hist(fscache_ops_histogram, start);
+	fscache_put_operation(op);
 
 	_leave("");
 }
-
-/*
- * describe an operation for slow-work debugging
- */
-#ifdef CONFIG_SLOW_WORK_DEBUG
-static void fscache_op_desc(struct slow_work *work, struct seq_file *m)
-{
-	struct fscache_operation *op =
-		container_of(work, struct fscache_operation, slow_work);
-
-	seq_printf(m, "FSC: OBJ%x OP%x: %s/%s fl=%lx",
-		   op->object->debug_id, op->debug_id,
-		   op->name, op->state, op->flags);
-}
-#endif
-
-const struct slow_work_ops fscache_op_slow_work_ops = {
-	.owner		= THIS_MODULE,
-	.get_ref	= fscache_op_get_ref,
-	.put_ref	= fscache_op_put_ref,
-	.execute	= fscache_op_execute,
-#ifdef CONFIG_SLOW_WORK_DEBUG
-	.desc		= fscache_op_desc,
-#endif
-};
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 723b889fd21..41c441c2058 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -105,7 +105,7 @@ bool __fscache_maybe_release_page(struct fscache_cookie *cookie,
 
 page_busy:
 	/* we might want to wait here, but that could deadlock the allocator as
-	 * the slow-work threads writing to the cache may all end up sleeping
+	 * the work threads writing to the cache may all end up sleeping
 	 * on memory allocation */
 	fscache_stat(&fscache_n_store_vmscan_busy);
 	return false;
@@ -188,9 +188,8 @@ int __fscache_attr_changed(struct fscache_cookie *cookie)
 		return -ENOMEM;
 	}
 
-	fscache_operation_init(op, NULL);
-	fscache_operation_init_slow(op, fscache_attr_changed_op);
-	op->flags = FSCACHE_OP_SLOW | (1 << FSCACHE_OP_EXCLUSIVE);
+	fscache_operation_init(op, fscache_attr_changed_op, NULL);
+	op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE);
 	fscache_set_op_name(op, "Attr");
 
 	spin_lock(&cookie->lock);
@@ -218,24 +217,6 @@ nobufs:
 EXPORT_SYMBOL(__fscache_attr_changed);
 
 /*
- * handle secondary execution given to a retrieval op on behalf of the
- * cache
- */
-static void fscache_retrieval_work(struct work_struct *work)
-{
-	struct fscache_retrieval *op =
-		container_of(work, struct fscache_retrieval, op.fast_work);
-	unsigned long start;
-
-	_enter("{OP%x}", op->op.debug_id);
-
-	start = jiffies;
-	op->op.processor(&op->op);
-	fscache_hist(fscache_ops_histogram, start);
-	fscache_put_operation(&op->op);
-}
-
-/*
  * release a retrieval op reference
  */
 static void fscache_release_retrieval_op(struct fscache_operation *_op)
@@ -269,13 +250,12 @@ static struct fscache_retrieval *fscache_alloc_retrieval(
 		return NULL;
 	}
 
-	fscache_operation_init(&op->op, fscache_release_retrieval_op);
+	fscache_operation_init(&op->op, NULL, fscache_release_retrieval_op);
 	op->op.flags	= FSCACHE_OP_MYTHREAD | (1 << FSCACHE_OP_WAITING);
 	op->mapping	= mapping;
 	op->end_io_func	= end_io_func;
 	op->context	= context;
 	op->start_time	= jiffies;
-	INIT_WORK(&op->op.fast_work, fscache_retrieval_work);
 	INIT_LIST_HEAD(&op->to_do);
 	fscache_set_op_name(&op->op, "Retr");
 	return op;
@@ -795,9 +775,9 @@ int __fscache_write_page(struct fscache_cookie *cookie,
 	if (!op)
 		goto nomem;
 
-	fscache_operation_init(&op->op, fscache_release_write_op);
-	fscache_operation_init_slow(&op->op, fscache_write_op);
-	op->op.flags = FSCACHE_OP_SLOW | (1 << FSCACHE_OP_WAITING);
+	fscache_operation_init(&op->op, fscache_write_op,
+			       fscache_release_write_op);
+	op->op.flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_WAITING);
 	fscache_set_op_name(&op->op, "Write1");
 
 	ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM);
@@ -852,7 +832,7 @@ int __fscache_write_page(struct fscache_cookie *cookie,
 	fscache_stat(&fscache_n_store_ops);
 	fscache_stat(&fscache_n_stores_ok);
 
-	/* the slow work queue now carries its own ref on the object */
+	/* the work queue now carries its own ref on the object */
 	fscache_put_operation(&op->op);
 	_leave(" = 0");
 	return 0;
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 3773fd63d2f..85542a7daf4 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -179,23 +179,27 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file,
 static const struct file_operations fuse_ctl_abort_ops = {
 	.open = nonseekable_open,
 	.write = fuse_conn_abort_write,
+	.llseek = no_llseek,
 };
 
 static const struct file_operations fuse_ctl_waiting_ops = {
 	.open = nonseekable_open,
 	.read = fuse_conn_waiting_read,
+	.llseek = no_llseek,
 };
 
 static const struct file_operations fuse_conn_max_background_ops = {
 	.open = nonseekable_open,
 	.read = fuse_conn_max_background_read,
 	.write = fuse_conn_max_background_write,
+	.llseek = no_llseek,
 };
 
 static const struct file_operations fuse_conn_congestion_threshold_ops = {
 	.open = nonseekable_open,
 	.read = fuse_conn_congestion_threshold_read,
 	.write = fuse_conn_congestion_threshold_write,
+	.llseek = no_llseek,
 };
 
 static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
@@ -218,6 +222,7 @@ static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
 	if (!inode)
 		return NULL;
 
+	inode->i_ino = get_next_ino();
 	inode->i_mode = mode;
 	inode->i_uid = fc->user_id;
 	inode->i_gid = fc->group_id;
@@ -317,12 +322,10 @@ static int fuse_ctl_fill_super(struct super_block *sb, void *data, int silent)
 	return 0;
 }
 
-static int fuse_ctl_get_sb(struct file_system_type *fs_type, int flags,
-			const char *dev_name, void *raw_data,
-			struct vfsmount *mnt)
+static struct dentry *fuse_ctl_mount(struct file_system_type *fs_type,
+			int flags, const char *dev_name, void *raw_data)
 {
-	return get_sb_single(fs_type, flags, raw_data,
-				fuse_ctl_fill_super, mnt);
+	return mount_single(fs_type, flags, raw_data, fuse_ctl_fill_super);
 }
 
 static void fuse_ctl_kill_sb(struct super_block *sb)
@@ -341,7 +344,7 @@ static void fuse_ctl_kill_sb(struct super_block *sb)
 static struct file_system_type fuse_ctl_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "fusectl",
-	.get_sb		= fuse_ctl_get_sb,
+	.mount		= fuse_ctl_mount,
 	.kill_sb	= fuse_ctl_kill_sb,
 };
 
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index e1f8171278b..3e87cce5837 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -182,6 +182,7 @@ static const struct file_operations cuse_frontend_fops = {
 	.unlocked_ioctl		= cuse_file_ioctl,
 	.compat_ioctl		= cuse_file_compat_ioctl,
 	.poll			= fuse_file_poll,
+	.llseek		= noop_llseek,
 };
 
 
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 9424796d663..6e07696308d 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -239,7 +239,6 @@ static u64 fuse_get_unique(struct fuse_conn *fc)
 
 static void queue_request(struct fuse_conn *fc, struct fuse_req *req)
 {
-	req->in.h.unique = fuse_get_unique(fc);
 	req->in.h.len = sizeof(struct fuse_in_header) +
 		len_args(req->in.numargs, (struct fuse_arg *) req->in.args);
 	list_add_tail(&req->list, &fc->pending);
@@ -261,6 +260,7 @@ static void flush_bg_queue(struct fuse_conn *fc)
 		req = list_entry(fc->bg_queue.next, struct fuse_req, list);
 		list_del(&req->list);
 		fc->active_background++;
+		req->in.h.unique = fuse_get_unique(fc);
 		queue_request(fc, req);
 	}
 }
@@ -276,7 +276,7 @@ static void flush_bg_queue(struct fuse_conn *fc)
  * Called with fc->lock, unlocks it
  */
 static void request_end(struct fuse_conn *fc, struct fuse_req *req)
-__releases(&fc->lock)
+__releases(fc->lock)
 {
 	void (*end) (struct fuse_conn *, struct fuse_req *) = req->end;
 	req->end = NULL;
@@ -306,8 +306,8 @@ __releases(&fc->lock)
 
 static void wait_answer_interruptible(struct fuse_conn *fc,
 				      struct fuse_req *req)
-__releases(&fc->lock)
-__acquires(&fc->lock)
+__releases(fc->lock)
+__acquires(fc->lock)
 {
 	if (signal_pending(current))
 		return;
@@ -325,8 +325,8 @@ static void queue_interrupt(struct fuse_conn *fc, struct fuse_req *req)
 }
 
 static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
-__releases(&fc->lock)
-__acquires(&fc->lock)
+__releases(fc->lock)
+__acquires(fc->lock)
 {
 	if (!fc->no_interrupt) {
 		/* Any signal may interrupt this */
@@ -398,6 +398,7 @@ void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
 	else if (fc->conn_error)
 		req->out.h.error = -ECONNREFUSED;
 	else {
+		req->in.h.unique = fuse_get_unique(fc);
 		queue_request(fc, req);
 		/* acquire extra reference, since request is still needed
 		   after request_end() */
@@ -450,6 +451,23 @@ void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req)
 }
 EXPORT_SYMBOL_GPL(fuse_request_send_background);
 
+static int fuse_request_send_notify_reply(struct fuse_conn *fc,
+					  struct fuse_req *req, u64 unique)
+{
+	int err = -ENODEV;
+
+	req->isreply = 0;
+	req->in.h.unique = unique;
+	spin_lock(&fc->lock);
+	if (fc->connected) {
+		queue_request(fc, req);
+		err = 0;
+	}
+	spin_unlock(&fc->lock);
+
+	return err;
+}
+
 /*
  * Called under fc->lock
  *
@@ -535,13 +553,13 @@ static void fuse_copy_finish(struct fuse_copy_state *cs)
 		if (!cs->write) {
 			buf->ops->unmap(cs->pipe, buf, cs->mapaddr);
 		} else {
-			kunmap_atomic(cs->mapaddr, KM_USER0);
+			kunmap(buf->page);
 			buf->len = PAGE_SIZE - cs->len;
 		}
 		cs->currbuf = NULL;
 		cs->mapaddr = NULL;
 	} else if (cs->mapaddr) {
-		kunmap_atomic(cs->mapaddr, KM_USER0);
+		kunmap(cs->pg);
 		if (cs->write) {
 			flush_dcache_page(cs->pg);
 			set_page_dirty_lock(cs->pg);
@@ -572,7 +590,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
 
 			BUG_ON(!cs->nr_segs);
 			cs->currbuf = buf;
-			cs->mapaddr = buf->ops->map(cs->pipe, buf, 1);
+			cs->mapaddr = buf->ops->map(cs->pipe, buf, 0);
 			cs->len = buf->len;
 			cs->buf = cs->mapaddr + buf->offset;
 			cs->pipebufs++;
@@ -592,7 +610,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
 			buf->len = 0;
 
 			cs->currbuf = buf;
-			cs->mapaddr = kmap_atomic(page, KM_USER0);
+			cs->mapaddr = kmap(page);
 			cs->buf = cs->mapaddr;
 			cs->len = PAGE_SIZE;
 			cs->pipebufs++;
@@ -611,7 +629,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
 			return err;
 		BUG_ON(err != 1);
 		offset = cs->addr % PAGE_SIZE;
-		cs->mapaddr = kmap_atomic(cs->pg, KM_USER0);
+		cs->mapaddr = kmap(cs->pg);
 		cs->buf = cs->mapaddr + offset;
 		cs->len = min(PAGE_SIZE - offset, cs->seglen);
 		cs->seglen -= cs->len;
@@ -791,11 +809,9 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep,
 	int err;
 	struct page *page = *pagep;
 
-	if (page && zeroing && count < PAGE_SIZE) {
-		void *mapaddr = kmap_atomic(page, KM_USER1);
-		memset(mapaddr, 0, PAGE_SIZE);
-		kunmap_atomic(mapaddr, KM_USER1);
-	}
+	if (page && zeroing && count < PAGE_SIZE)
+		clear_highpage(page);
+
 	while (count) {
 		if (cs->write && cs->pipebufs && page) {
 			return fuse_ref_page(cs, page, offset, count);
@@ -812,10 +828,10 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep,
 			}
 		}
 		if (page) {
-			void *mapaddr = kmap_atomic(page, KM_USER1);
+			void *mapaddr = kmap_atomic(page, KM_USER0);
 			void *buf = mapaddr + offset;
 			offset += fuse_copy_do(cs, &buf, &count);
-			kunmap_atomic(mapaddr, KM_USER1);
+			kunmap_atomic(mapaddr, KM_USER0);
 		} else
 			offset += fuse_copy_do(cs, NULL, &count);
 	}
@@ -887,8 +903,8 @@ static int request_pending(struct fuse_conn *fc)
 
 /* Wait until a request is available on the pending list */
 static void request_wait(struct fuse_conn *fc)
-__releases(&fc->lock)
-__acquires(&fc->lock)
+__releases(fc->lock)
+__acquires(fc->lock)
 {
 	DECLARE_WAITQUEUE(wait, current);
 
@@ -916,7 +932,7 @@ __acquires(&fc->lock)
  */
 static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_copy_state *cs,
 			       size_t nbytes, struct fuse_req *req)
-__releases(&fc->lock)
+__releases(fc->lock)
 {
 	struct fuse_in_header ih;
 	struct fuse_interrupt_in arg;
@@ -1231,6 +1247,194 @@ err:
 	return err;
 }
 
+static int fuse_notify_store(struct fuse_conn *fc, unsigned int size,
+			     struct fuse_copy_state *cs)
+{
+	struct fuse_notify_store_out outarg;
+	struct inode *inode;
+	struct address_space *mapping;
+	u64 nodeid;
+	int err;
+	pgoff_t index;
+	unsigned int offset;
+	unsigned int num;
+	loff_t file_size;
+	loff_t end;
+
+	err = -EINVAL;
+	if (size < sizeof(outarg))
+		goto out_finish;
+
+	err = fuse_copy_one(cs, &outarg, sizeof(outarg));
+	if (err)
+		goto out_finish;
+
+	err = -EINVAL;
+	if (size - sizeof(outarg) != outarg.size)
+		goto out_finish;
+
+	nodeid = outarg.nodeid;
+
+	down_read(&fc->killsb);
+
+	err = -ENOENT;
+	if (!fc->sb)
+		goto out_up_killsb;
+
+	inode = ilookup5(fc->sb, nodeid, fuse_inode_eq, &nodeid);
+	if (!inode)
+		goto out_up_killsb;
+
+	mapping = inode->i_mapping;
+	index = outarg.offset >> PAGE_CACHE_SHIFT;
+	offset = outarg.offset & ~PAGE_CACHE_MASK;
+	file_size = i_size_read(inode);
+	end = outarg.offset + outarg.size;
+	if (end > file_size) {
+		file_size = end;
+		fuse_write_update_size(inode, file_size);
+	}
+
+	num = outarg.size;
+	while (num) {
+		struct page *page;
+		unsigned int this_num;
+
+		err = -ENOMEM;
+		page = find_or_create_page(mapping, index,
+					   mapping_gfp_mask(mapping));
+		if (!page)
+			goto out_iput;
+
+		this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset);
+		err = fuse_copy_page(cs, &page, offset, this_num, 0);
+		if (!err && offset == 0 && (num != 0 || file_size == end))
+			SetPageUptodate(page);
+		unlock_page(page);
+		page_cache_release(page);
+
+		if (err)
+			goto out_iput;
+
+		num -= this_num;
+		offset = 0;
+		index++;
+	}
+
+	err = 0;
+
+out_iput:
+	iput(inode);
+out_up_killsb:
+	up_read(&fc->killsb);
+out_finish:
+	fuse_copy_finish(cs);
+	return err;
+}
+
+static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_req *req)
+{
+	release_pages(req->pages, req->num_pages, 0);
+}
+
+static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
+			 struct fuse_notify_retrieve_out *outarg)
+{
+	int err;
+	struct address_space *mapping = inode->i_mapping;
+	struct fuse_req *req;
+	pgoff_t index;
+	loff_t file_size;
+	unsigned int num;
+	unsigned int offset;
+	size_t total_len = 0;
+
+	req = fuse_get_req(fc);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	offset = outarg->offset & ~PAGE_CACHE_MASK;
+
+	req->in.h.opcode = FUSE_NOTIFY_REPLY;
+	req->in.h.nodeid = outarg->nodeid;
+	req->in.numargs = 2;
+	req->in.argpages = 1;
+	req->page_offset = offset;
+	req->end = fuse_retrieve_end;
+
+	index = outarg->offset >> PAGE_CACHE_SHIFT;
+	file_size = i_size_read(inode);
+	num = outarg->size;
+	if (outarg->offset > file_size)
+		num = 0;
+	else if (outarg->offset + num > file_size)
+		num = file_size - outarg->offset;
+
+	while (num) {
+		struct page *page;
+		unsigned int this_num;
+
+		page = find_get_page(mapping, index);
+		if (!page)
+			break;
+
+		this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset);
+		req->pages[req->num_pages] = page;
+		req->num_pages++;
+
+		num -= this_num;
+		total_len += this_num;
+	}
+	req->misc.retrieve_in.offset = outarg->offset;
+	req->misc.retrieve_in.size = total_len;
+	req->in.args[0].size = sizeof(req->misc.retrieve_in);
+	req->in.args[0].value = &req->misc.retrieve_in;
+	req->in.args[1].size = total_len;
+
+	err = fuse_request_send_notify_reply(fc, req, outarg->notify_unique);
+	if (err)
+		fuse_retrieve_end(fc, req);
+
+	return err;
+}
+
+static int fuse_notify_retrieve(struct fuse_conn *fc, unsigned int size,
+				struct fuse_copy_state *cs)
+{
+	struct fuse_notify_retrieve_out outarg;
+	struct inode *inode;
+	int err;
+
+	err = -EINVAL;
+	if (size != sizeof(outarg))
+		goto copy_finish;
+
+	err = fuse_copy_one(cs, &outarg, sizeof(outarg));
+	if (err)
+		goto copy_finish;
+
+	fuse_copy_finish(cs);
+
+	down_read(&fc->killsb);
+	err = -ENOENT;
+	if (fc->sb) {
+		u64 nodeid = outarg.nodeid;
+
+		inode = ilookup5(fc->sb, nodeid, fuse_inode_eq, &nodeid);
+		if (inode) {
+			err = fuse_retrieve(fc, inode, &outarg);
+			iput(inode);
+		}
+	}
+	up_read(&fc->killsb);
+
+	return err;
+
+copy_finish:
+	fuse_copy_finish(cs);
+	return err;
+}
+
 static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
 		       unsigned int size, struct fuse_copy_state *cs)
 {
@@ -1244,6 +1448,12 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
 	case FUSE_NOTIFY_INVAL_ENTRY:
 		return fuse_notify_inval_entry(fc, size, cs);
 
+	case FUSE_NOTIFY_STORE:
+		return fuse_notify_store(fc, size, cs);
+
+	case FUSE_NOTIFY_RETRIEVE:
+		return fuse_notify_retrieve(fc, size, cs);
+
 	default:
 		fuse_copy_finish(cs);
 		return -EINVAL;
@@ -1503,8 +1713,8 @@ static unsigned fuse_dev_poll(struct file *file, poll_table *wait)
  * This function releases and reacquires fc->lock
  */
 static void end_requests(struct fuse_conn *fc, struct list_head *head)
-__releases(&fc->lock)
-__acquires(&fc->lock)
+__releases(fc->lock)
+__acquires(fc->lock)
 {
 	while (!list_empty(head)) {
 		struct fuse_req *req;
@@ -1527,8 +1737,8 @@ __acquires(&fc->lock)
  * locked).
  */
 static void end_io_requests(struct fuse_conn *fc)
-__releases(&fc->lock)
-__acquires(&fc->lock)
+__releases(fc->lock)
+__acquires(fc->lock)
 {
 	while (!list_empty(&fc->io)) {
 		struct fuse_req *req =
@@ -1552,6 +1762,16 @@ __acquires(&fc->lock)
 	}
 }
 
+static void end_queued_requests(struct fuse_conn *fc)
+__releases(fc->lock)
+__acquires(fc->lock)
+{
+	fc->max_background = UINT_MAX;
+	flush_bg_queue(fc);
+	end_requests(fc, &fc->pending);
+	end_requests(fc, &fc->processing);
+}
+
 /*
  * Abort all requests.
  *
@@ -1578,8 +1798,7 @@ void fuse_abort_conn(struct fuse_conn *fc)
 		fc->connected = 0;
 		fc->blocked = 0;
 		end_io_requests(fc);
-		end_requests(fc, &fc->pending);
-		end_requests(fc, &fc->processing);
+		end_queued_requests(fc);
 		wake_up_all(&fc->waitq);
 		wake_up_all(&fc->blocked_waitq);
 		kill_fasync(&fc->fasync, SIGIO, POLL_IN);
@@ -1594,8 +1813,9 @@ int fuse_dev_release(struct inode *inode, struct file *file)
 	if (fc) {
 		spin_lock(&fc->lock);
 		fc->connected = 0;
-		end_requests(fc, &fc->pending);
-		end_requests(fc, &fc->processing);
+		fc->blocked = 0;
+		end_queued_requests(fc);
+		wake_up_all(&fc->blocked_waitq);
 		spin_unlock(&fc->lock);
 		fuse_conn_put(fc);
 	}
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 3cdc5f78a40..c9627c95482 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1016,7 +1016,7 @@ static int fuse_permission(struct inode *inode, int mask)
 		   exist.  So if permissions are revoked this won't be
 		   noticed immediately, only after the attribute
 		   timeout has expired */
-	} else if (mask & MAY_ACCESS) {
+	} else if (mask & (MAY_ACCESS | MAY_CHDIR)) {
 		err = fuse_access(inode, mask);
 	} else if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) {
 		if (!(inode->i_mode & S_IXUGO)) {
@@ -1270,21 +1270,18 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr,
 	if (!fuse_allow_task(fc, current))
 		return -EACCES;
 
-	if (fc->flags & FUSE_DEFAULT_PERMISSIONS) {
-		err = inode_change_ok(inode, attr);
-		if (err)
-			return err;
-	}
+	if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS))
+		attr->ia_valid |= ATTR_FORCE;
+
+	err = inode_change_ok(inode, attr);
+	if (err)
+		return err;
 
 	if ((attr->ia_valid & ATTR_OPEN) && fc->atomic_o_trunc)
 		return 0;
 
-	if (attr->ia_valid & ATTR_SIZE) {
-		err = inode_newsize_ok(inode, attr->ia_size);
-		if (err)
-			return err;
+	if (attr->ia_valid & ATTR_SIZE)
 		is_truncate = true;
-	}
 
 	req = fuse_get_req(fc);
 	if (IS_ERR(req))
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index ada0adeb3bb..c8224587123 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -706,7 +706,7 @@ static int fuse_write_begin(struct file *file, struct address_space *mapping,
 	return 0;
 }
 
-static void fuse_write_update_size(struct inode *inode, loff_t pos)
+void fuse_write_update_size(struct inode *inode, loff_t pos)
 {
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_inode *fi = get_fuse_inode(inode);
@@ -1144,8 +1144,8 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
 
 /* Called under fc->lock, may release and reacquire it */
 static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req)
-__releases(&fc->lock)
-__acquires(&fc->lock)
+__releases(fc->lock)
+__acquires(fc->lock)
 {
 	struct fuse_inode *fi = get_fuse_inode(req->inode);
 	loff_t size = i_size_read(req->inode);
@@ -1183,8 +1183,8 @@ __acquires(&fc->lock)
  * Called with fc->lock
  */
 void fuse_flush_writepages(struct inode *inode)
-__releases(&fc->lock)
-__acquires(&fc->lock)
+__releases(fc->lock)
+__acquires(fc->lock)
 {
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_inode *fi = get_fuse_inode(inode);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 8f309f04064..57d4a3a0f10 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -272,6 +272,7 @@ struct fuse_req {
 			struct fuse_write_in in;
 			struct fuse_write_out out;
 		} write;
+		struct fuse_notify_retrieve_in retrieve_in;
 		struct fuse_lk_in lk_in;
 	} misc;
 
@@ -748,4 +749,6 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
 unsigned fuse_file_poll(struct file *file, poll_table *wait);
 int fuse_dev_release(struct inode *inode, struct file *file);
 
+void fuse_write_update_size(struct inode *inode, loff_t pos);
+
 #endif /* _FS_FUSE_I_H */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index ec14d19ce50..cfce3ad86a9 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -122,8 +122,10 @@ void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req,
 	fuse_request_send_noreply(fc, req);
 }
 
-static void fuse_clear_inode(struct inode *inode)
+static void fuse_evict_inode(struct inode *inode)
 {
+	truncate_inode_pages(&inode->i_data, 0);
+	end_writeback(inode);
 	if (inode->i_sb->s_flags & MS_ACTIVE) {
 		struct fuse_conn *fc = get_fuse_conn(inode);
 		struct fuse_inode *fi = get_fuse_inode(inode);
@@ -736,7 +738,7 @@ static const struct export_operations fuse_export_operations = {
 static const struct super_operations fuse_super_operations = {
 	.alloc_inode    = fuse_alloc_inode,
 	.destroy_inode  = fuse_destroy_inode,
-	.clear_inode	= fuse_clear_inode,
+	.evict_inode	= fuse_evict_inode,
 	.drop_inode	= generic_delete_inode,
 	.remount_fs	= fuse_remount_fs,
 	.put_super	= fuse_put_super,
@@ -1039,11 +1041,11 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	return err;
 }
 
-static int fuse_get_sb(struct file_system_type *fs_type,
+static struct dentry *fuse_mount(struct file_system_type *fs_type,
 		       int flags, const char *dev_name,
-		       void *raw_data, struct vfsmount *mnt)
+		       void *raw_data)
 {
-	return get_sb_nodev(fs_type, flags, raw_data, fuse_fill_super, mnt);
+	return mount_nodev(fs_type, flags, raw_data, fuse_fill_super);
 }
 
 static void fuse_kill_sb_anon(struct super_block *sb)
@@ -1063,17 +1065,16 @@ static struct file_system_type fuse_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "fuse",
 	.fs_flags	= FS_HAS_SUBTYPE,
-	.get_sb		= fuse_get_sb,
+	.mount		= fuse_mount,
 	.kill_sb	= fuse_kill_sb_anon,
 };
 
 #ifdef CONFIG_BLOCK
-static int fuse_get_sb_blk(struct file_system_type *fs_type,
+static struct dentry *fuse_mount_blk(struct file_system_type *fs_type,
 			   int flags, const char *dev_name,
-			   void *raw_data, struct vfsmount *mnt)
+			   void *raw_data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, raw_data, fuse_fill_super,
-			   mnt);
+	return mount_bdev(fs_type, flags, dev_name, raw_data, fuse_fill_super);
 }
 
 static void fuse_kill_sb_blk(struct super_block *sb)
@@ -1092,7 +1093,7 @@ static void fuse_kill_sb_blk(struct super_block *sb)
 static struct file_system_type fuseblk_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "fuseblk",
-	.get_sb		= fuse_get_sb_blk,
+	.mount		= fuse_mount_blk,
 	.kill_sb	= fuse_kill_sb_blk,
 	.fs_flags	= FS_REQUIRES_DEV | FS_HAS_SUBTYPE,
 };
diff --git a/fs/generic_acl.c b/fs/generic_acl.c
index 99800e56415..6bc9e3a5a69 100644
--- a/fs/generic_acl.c
+++ b/fs/generic_acl.c
@@ -94,6 +94,7 @@ generic_acl_set(struct dentry *dentry, const char *name, const void *value,
 			if (error < 0)
 				goto failed;
 			inode->i_mode = mode;
+			inode->i_ctime = CURRENT_TIME;
 			if (error == 0) {
 				posix_acl_release(acl);
 				acl = NULL;
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index a47b4310711..c465ae066c6 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -1,13 +1,12 @@
 config GFS2_FS
 	tristate "GFS2 file system support"
-	depends on EXPERIMENTAL && (64BIT || LBDAF)
+	depends on (64BIT || LBDAF)
 	select DLM if GFS2_FS_LOCKING_DLM
 	select CONFIGFS_FS if GFS2_FS_LOCKING_DLM
 	select SYSFS if GFS2_FS_LOCKING_DLM
 	select IP_SCTP if DLM_SCTP
 	select FS_POSIX_ACL
 	select CRC32
-	select SLOW_WORK
 	select QUOTACTL
 	help
 	  A cluster filesystem.
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 9f8b52500d6..4f36f8832b9 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -36,8 +36,8 @@
 #include "glops.h"
 
 
-static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
-				   unsigned int from, unsigned int to)
+void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
+			    unsigned int from, unsigned int to)
 {
 	struct buffer_head *head = page_buffers(page);
 	unsigned int bsize = head->b_size;
@@ -136,10 +136,7 @@ static int gfs2_writeback_writepage(struct page *page,
 	if (ret <= 0)
 		return ret;
 
-	ret = mpage_writepage(page, gfs2_get_block_noalloc, wbc);
-	if (ret == -EAGAIN)
-		ret = block_write_full_page(page, gfs2_get_block_noalloc, wbc);
-	return ret;
+	return nobh_writepage(page, gfs2_get_block_noalloc, wbc);
 }
 
 /**
@@ -618,10 +615,9 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
 	unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
 	int alloc_required;
 	int error = 0;
-	struct gfs2_alloc *al;
+	struct gfs2_alloc *al = NULL;
 	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
 	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
-	unsigned to = from + len;
 	struct page *page;
 
 	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
@@ -637,9 +633,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
 		}
 	}
 
-	error = gfs2_write_alloc_required(ip, pos, len, &alloc_required);
-	if (error)
-		goto out_unlock;
+	alloc_required = gfs2_write_alloc_required(ip, pos, len);
 
 	if (alloc_required || gfs2_is_jdata(ip))
 		gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks);
@@ -668,6 +662,8 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
 		rblocks += RES_STATFS + RES_QUOTA;
 	if (&ip->i_inode == sdp->sd_rindex)
 		rblocks += 2 * RES_STATFS;
+	if (alloc_required)
+		rblocks += gfs2_rg_blocks(al);
 
 	error = gfs2_trans_begin(sdp, rblocks,
 				 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
@@ -694,20 +690,18 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
 	}
 
 prepare_write:
-	error = block_prepare_write(page, from, to, gfs2_block_map);
+	error = __block_write_begin(page, from, len, gfs2_block_map);
 out:
 	if (error == 0)
 		return 0;
 
 	page_cache_release(page);
 
-	/*
-	 * XXX(hch): the call below should probably be replaced with
-	 * a call to the gfs2-specific truncate blocks helper to actually
-	 * release disk blocks..
-	 */
+	gfs2_trans_end(sdp);
 	if (pos + len > ip->i_inode.i_size)
-		simple_setsize(&ip->i_inode, ip->i_inode.i_size);
+		gfs2_trim_blocks(&ip->i_inode);
+	goto out_trans_fail;
+
 out_endtrans:
 	gfs2_trans_end(sdp);
 out_trans_fail:
@@ -807,10 +801,8 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
 	page_cache_release(page);
 
 	if (copied) {
-		if (inode->i_size < to) {
+		if (inode->i_size < to)
 			i_size_write(inode, to);
-			ip->i_disksize = inode->i_size;
-		}
 		gfs2_dinode_out(ip, di);
 		mark_inode_dirty(inode);
 	}
@@ -881,8 +873,6 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
 
 	ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
 	if (ret > 0) {
-		if (inode->i_size > ip->i_disksize)
-			ip->i_disksize = inode->i_size;
 		gfs2_dinode_out(ip, dibh->b_data);
 		mark_inode_dirty(inode);
 	}
@@ -1047,9 +1037,9 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
 	if (rv != 1)
 		goto out; /* dio not valid, fall back to buffered i/o */
 
-	rv = blockdev_direct_IO_no_locking(rw, iocb, inode, inode->i_sb->s_bdev,
-					   iov, offset, nr_segs,
-					   gfs2_get_block_direct, NULL);
+	rv = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
+				  offset, nr_segs, gfs2_get_block_direct,
+				  NULL, NULL, 0);
 out:
 	gfs2_glock_dq_m(1, &gh);
 	gfs2_holder_uninit(&gh);
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 84da64b551b..5476c066d4e 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -50,7 +50,7 @@ struct strip_mine {
  * @ip: the inode
  * @dibh: the dinode buffer
  * @block: the block number that was allocated
- * @private: any locked page held by the caller process
+ * @page: The (optional) page. This is looked up if @page is NULL
  *
  * Returns: errno
  */
@@ -109,8 +109,7 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
 /**
  * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big
  * @ip: The GFS2 inode to unstuff
- * @unstuffer: the routine that handles unstuffing a non-zero length file
- * @private: private data for the unstuffer
+ * @page: The (optional) page. This is looked up if the @page is NULL
  *
  * This routine unstuffs a dinode and returns it to a "normal" state such
  * that the height can be grown in the traditional way.
@@ -132,7 +131,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
 	if (error)
 		goto out;
 
-	if (ip->i_disksize) {
+	if (i_size_read(&ip->i_inode)) {
 		/* Get a free block, fill it with the stuffed data,
 		   and write it out to disk */
 
@@ -161,7 +160,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
 	di = (struct gfs2_dinode *)dibh->b_data;
 	gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
 
-	if (ip->i_disksize) {
+	if (i_size_read(&ip->i_inode)) {
 		*(__be64 *)(di + 1) = cpu_to_be64(block);
 		gfs2_add_inode_blocks(&ip->i_inode, 1);
 		di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
@@ -885,83 +884,14 @@ out:
 }
 
 /**
- * do_grow - Make a file look bigger than it is
- * @ip: the inode
- * @size: the size to set the file to
- *
- * Called with an exclusive lock on @ip.
- *
- * Returns: errno
- */
-
-static int do_grow(struct gfs2_inode *ip, u64 size)
-{
-	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-	struct gfs2_alloc *al;
-	struct buffer_head *dibh;
-	int error;
-
-	al = gfs2_alloc_get(ip);
-	if (!al)
-		return -ENOMEM;
-
-	error = gfs2_quota_lock_check(ip);
-	if (error)
-		goto out;
-
-	al->al_requested = sdp->sd_max_height + RES_DATA;
-
-	error = gfs2_inplace_reserve(ip);
-	if (error)
-		goto out_gunlock_q;
-
-	error = gfs2_trans_begin(sdp,
-			sdp->sd_max_height + al->al_rgd->rd_length +
-			RES_JDATA + RES_DINODE + RES_STATFS + RES_QUOTA, 0);
-	if (error)
-		goto out_ipres;
-
-	error = gfs2_meta_inode_buffer(ip, &dibh);
-	if (error)
-		goto out_end_trans;
-
-	if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) {
-		if (gfs2_is_stuffed(ip)) {
-			error = gfs2_unstuff_dinode(ip, NULL);
-			if (error)
-				goto out_brelse;
-		}
-	}
-
-	ip->i_disksize = size;
-	ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
-	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-	gfs2_dinode_out(ip, dibh->b_data);
-
-out_brelse:
-	brelse(dibh);
-out_end_trans:
-	gfs2_trans_end(sdp);
-out_ipres:
-	gfs2_inplace_release(ip);
-out_gunlock_q:
-	gfs2_quota_unlock(ip);
-out:
-	gfs2_alloc_put(ip);
-	return error;
-}
-
-
-/**
  * gfs2_block_truncate_page - Deal with zeroing out data for truncate
  *
  * This is partly borrowed from ext3.
  */
-static int gfs2_block_truncate_page(struct address_space *mapping)
+static int gfs2_block_truncate_page(struct address_space *mapping, loff_t from)
 {
 	struct inode *inode = mapping->host;
 	struct gfs2_inode *ip = GFS2_I(inode);
-	loff_t from = inode->i_size;
 	unsigned long index = from >> PAGE_CACHE_SHIFT;
 	unsigned offset = from & (PAGE_CACHE_SIZE-1);
 	unsigned blocksize, iblock, length, pos;
@@ -1023,9 +953,11 @@ unlock:
 	return err;
 }
 
-static int trunc_start(struct gfs2_inode *ip, u64 size)
+static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize)
 {
-	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	struct address_space *mapping = inode->i_mapping;
 	struct buffer_head *dibh;
 	int journaled = gfs2_is_jdata(ip);
 	int error;
@@ -1039,31 +971,26 @@ static int trunc_start(struct gfs2_inode *ip, u64 size)
 	if (error)
 		goto out;
 
+	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+
 	if (gfs2_is_stuffed(ip)) {
-		u64 dsize = size + sizeof(struct gfs2_inode);
-		ip->i_disksize = size;
-		ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
-		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-		gfs2_dinode_out(ip, dibh->b_data);
-		if (dsize > dibh->b_size)
-			dsize = dibh->b_size;
-		gfs2_buffer_clear_tail(dibh, dsize);
-		error = 1;
+		gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + newsize);
 	} else {
-		if (size & (u64)(sdp->sd_sb.sb_bsize - 1))
-			error = gfs2_block_truncate_page(ip->i_inode.i_mapping);
-
-		if (!error) {
-			ip->i_disksize = size;
-			ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
-			ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;
-			gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-			gfs2_dinode_out(ip, dibh->b_data);
+		if (newsize & (u64)(sdp->sd_sb.sb_bsize - 1)) {
+			error = gfs2_block_truncate_page(mapping, newsize);
+			if (error)
+				goto out_brelse;
 		}
+		ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;
 	}
 
-	brelse(dibh);
+	i_size_write(inode, newsize);
+	ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
+	gfs2_dinode_out(ip, dibh->b_data);
 
+	truncate_pagecache(inode, oldsize, newsize);
+out_brelse:
+	brelse(dibh);
 out:
 	gfs2_trans_end(sdp);
 	return error;
@@ -1123,7 +1050,7 @@ static int trunc_end(struct gfs2_inode *ip)
 	if (error)
 		goto out;
 
-	if (!ip->i_disksize) {
+	if (!i_size_read(&ip->i_inode)) {
 		ip->i_height = 0;
 		ip->i_goal = ip->i_no_addr;
 		gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
@@ -1143,92 +1070,154 @@ out:
 
 /**
  * do_shrink - make a file smaller
- * @ip: the inode
- * @size: the size to make the file
- * @truncator: function to truncate the last partial block
+ * @inode: the inode
+ * @oldsize: the current inode size
+ * @newsize: the size to make the file
  *
- * Called with an exclusive lock on @ip.
+ * Called with an exclusive lock on @inode. The @size must
+ * be equal to or smaller than the current inode size.
  *
  * Returns: errno
  */
 
-static int do_shrink(struct gfs2_inode *ip, u64 size)
+static int do_shrink(struct inode *inode, u64 oldsize, u64 newsize)
 {
+	struct gfs2_inode *ip = GFS2_I(inode);
 	int error;
 
-	error = trunc_start(ip, size);
+	error = trunc_start(inode, oldsize, newsize);
 	if (error < 0)
 		return error;
-	if (error > 0)
+	if (gfs2_is_stuffed(ip))
 		return 0;
 
-	error = trunc_dealloc(ip, size);
-	if (!error)
+	error = trunc_dealloc(ip, newsize);
+	if (error == 0)
 		error = trunc_end(ip);
 
 	return error;
 }
 
-static int do_touch(struct gfs2_inode *ip, u64 size)
+void gfs2_trim_blocks(struct inode *inode)
 {
-	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	u64 size = inode->i_size;
+	int ret;
+
+	ret = do_shrink(inode, size, size);
+	WARN_ON(ret != 0);
+}
+
+/**
+ * do_grow - Touch and update inode size
+ * @inode: The inode
+ * @size: The new size
+ *
+ * This function updates the timestamps on the inode and
+ * may also increase the size of the inode. This function
+ * must not be called with @size any smaller than the current
+ * inode size.
+ *
+ * Although it is not strictly required to unstuff files here,
+ * earlier versions of GFS2 have a bug in the stuffed file reading
+ * code which will result in a buffer overrun if the size is larger
+ * than the max stuffed file size. In order to prevent this from
+ * occuring, such files are unstuffed, but in other cases we can
+ * just update the inode size directly.
+ *
+ * Returns: 0 on success, or -ve on error
+ */
+
+static int do_grow(struct inode *inode, u64 size)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
 	struct buffer_head *dibh;
+	struct gfs2_alloc *al = NULL;
 	int error;
 
-	error = gfs2_trans_begin(sdp, RES_DINODE, 0);
+	if (gfs2_is_stuffed(ip) &&
+	    (size > (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)))) {
+		al = gfs2_alloc_get(ip);
+		if (al == NULL)
+			return -ENOMEM;
+
+		error = gfs2_quota_lock_check(ip);
+		if (error)
+			goto do_grow_alloc_put;
+
+		al->al_requested = 1;
+		error = gfs2_inplace_reserve(ip);
+		if (error)
+			goto do_grow_qunlock;
+	}
+
+	error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT, 0);
 	if (error)
-		return error;
+		goto do_grow_release;
 
-	down_write(&ip->i_rw_mutex);
+	if (al) {
+		error = gfs2_unstuff_dinode(ip, NULL);
+		if (error)
+			goto do_end_trans;
+	}
 
 	error = gfs2_meta_inode_buffer(ip, &dibh);
 	if (error)
-		goto do_touch_out;
+		goto do_end_trans;
 
+	i_size_write(inode, size);
 	ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
 	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
 	gfs2_dinode_out(ip, dibh->b_data);
 	brelse(dibh);
 
-do_touch_out:
-	up_write(&ip->i_rw_mutex);
+do_end_trans:
 	gfs2_trans_end(sdp);
+do_grow_release:
+	if (al) {
+		gfs2_inplace_release(ip);
+do_grow_qunlock:
+		gfs2_quota_unlock(ip);
+do_grow_alloc_put:
+		gfs2_alloc_put(ip);
+	}
 	return error;
 }
 
 /**
- * gfs2_truncatei - make a file a given size
- * @ip: the inode
- * @size: the size to make the file
- * @truncator: function to truncate the last partial block
+ * gfs2_setattr_size - make a file a given size
+ * @inode: the inode
+ * @newsize: the size to make the file
  *
- * The file size can grow, shrink, or stay the same size.
+ * The file size can grow, shrink, or stay the same size. This
+ * is called holding i_mutex and an exclusive glock on the inode
+ * in question.
  *
  * Returns: errno
  */
 
-int gfs2_truncatei(struct gfs2_inode *ip, u64 size)
+int gfs2_setattr_size(struct inode *inode, u64 newsize)
 {
-	int error;
+	int ret;
+	u64 oldsize;
 
-	if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), S_ISREG(ip->i_inode.i_mode)))
-		return -EINVAL;
+	BUG_ON(!S_ISREG(inode->i_mode));
 
-	if (size > ip->i_disksize)
-		error = do_grow(ip, size);
-	else if (size < ip->i_disksize)
-		error = do_shrink(ip, size);
-	else
-		/* update time stamps */
-		error = do_touch(ip, size);
+	ret = inode_newsize_ok(inode, newsize);
+	if (ret)
+		return ret;
 
-	return error;
+	oldsize = inode->i_size;
+	if (newsize >= oldsize)
+		return do_grow(inode, newsize);
+
+	return do_shrink(inode, oldsize, newsize);
 }
 
 int gfs2_truncatei_resume(struct gfs2_inode *ip)
 {
 	int error;
-	error = trunc_dealloc(ip, ip->i_disksize);
+	error = trunc_dealloc(ip, i_size_read(&ip->i_inode));
 	if (!error)
 		error = trunc_end(ip);
 	return error;
@@ -1244,13 +1233,12 @@ int gfs2_file_dealloc(struct gfs2_inode *ip)
  * @ip: the file being written to
  * @offset: the offset to write to
  * @len: the number of bytes being written
- * @alloc_required: set to 1 if an alloc is required, 0 otherwise
  *
- * Returns: errno
+ * Returns: 1 if an alloc is required, 0 otherwise
  */
 
 int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
-			      unsigned int len, int *alloc_required)
+			      unsigned int len)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct buffer_head bh;
@@ -1258,26 +1246,23 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
 	u64 lblock, lblock_stop, size;
 	u64 end_of_file;
 
-	*alloc_required = 0;
-
 	if (!len)
 		return 0;
 
 	if (gfs2_is_stuffed(ip)) {
 		if (offset + len >
 		    sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode))
-			*alloc_required = 1;
+			return 1;
 		return 0;
 	}
 
-	*alloc_required = 1;
 	shift = sdp->sd_sb.sb_bsize_shift;
 	BUG_ON(gfs2_is_dir(ip));
-	end_of_file = (ip->i_disksize + sdp->sd_sb.sb_bsize - 1) >> shift;
+	end_of_file = (i_size_read(&ip->i_inode) + sdp->sd_sb.sb_bsize - 1) >> shift;
 	lblock = offset >> shift;
 	lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
 	if (lblock_stop > end_of_file)
-		return 0;
+		return 1;
 
 	size = (lblock_stop - lblock) << shift;
 	do {
@@ -1285,12 +1270,11 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
 		bh.b_size = size;
 		gfs2_block_map(&ip->i_inode, lblock, &bh, 0);
 		if (!buffer_mapped(&bh))
-			return 0;
+			return 1;
 		size -= bh.b_size;
 		lblock += (bh.b_size >> ip->i_inode.i_blkbits);
 	} while(size > 0);
 
-	*alloc_required = 0;
 	return 0;
 }
 
diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h
index c983177e05a..42fea03e2bd 100644
--- a/fs/gfs2/bmap.h
+++ b/fs/gfs2/bmap.h
@@ -44,14 +44,16 @@ static inline void gfs2_write_calc_reserv(const struct gfs2_inode *ip,
 	}
 }
 
-int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page);
-int gfs2_block_map(struct inode *inode, sector_t lblock, struct buffer_head *bh, int create);
-int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen);
-
-int gfs2_truncatei(struct gfs2_inode *ip, u64 size);
-int gfs2_truncatei_resume(struct gfs2_inode *ip);
-int gfs2_file_dealloc(struct gfs2_inode *ip);
-int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
-			      unsigned int len, int *alloc_required);
+extern int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page);
+extern int gfs2_block_map(struct inode *inode, sector_t lblock,
+			  struct buffer_head *bh, int create);
+extern int gfs2_extent_map(struct inode *inode, u64 lblock, int *new,
+			   u64 *dblock, unsigned *extlen);
+extern int gfs2_setattr_size(struct inode *inode, u64 size);
+extern void gfs2_trim_blocks(struct inode *inode);
+extern int gfs2_truncatei_resume(struct gfs2_inode *ip);
+extern int gfs2_file_dealloc(struct gfs2_inode *ip);
+extern int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
+				     unsigned int len);
 
 #endif /* __BMAP_DOT_H__ */
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index bb7907bde3d..6798755b385 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -49,7 +49,7 @@ static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd)
 		ip = GFS2_I(inode);
 	}
 
-	if (sdp->sd_args.ar_localcaching)
+	if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL)
 		goto valid;
 
 	had_lock = (gfs2_glock_is_locked_by_me(dip->i_gl) != NULL);
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 6b48d7c268b..5c356d09c32 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -79,6 +79,9 @@
 #define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1)
 #define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1))
 
+struct qstr gfs2_qdot __read_mostly;
+struct qstr gfs2_qdotdot __read_mostly;
+
 typedef int (*leaf_call_t) (struct gfs2_inode *dip, u32 index, u32 len,
 			    u64 leaf_no, void *data);
 typedef int (*gfs2_dscan_t)(const struct gfs2_dirent *dent,
@@ -127,8 +130,8 @@ static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf,
 
 	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
 	memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size);
-	if (ip->i_disksize < offset + size)
-		ip->i_disksize = offset + size;
+	if (ip->i_inode.i_size < offset + size)
+		i_size_write(&ip->i_inode, offset + size);
 	ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
 	gfs2_dinode_out(ip, dibh->b_data);
 
@@ -225,8 +228,8 @@ out:
 	if (error)
 		return error;
 
-	if (ip->i_disksize < offset + copied)
-		ip->i_disksize = offset + copied;
+	if (ip->i_inode.i_size < offset + copied)
+		i_size_write(&ip->i_inode, offset + copied);
 	ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
 
 	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
@@ -275,12 +278,13 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset,
 	unsigned int o;
 	int copied = 0;
 	int error = 0;
+	u64 disksize = i_size_read(&ip->i_inode);
 
-	if (offset >= ip->i_disksize)
+	if (offset >= disksize)
 		return 0;
 
-	if (offset + size > ip->i_disksize)
-		size = ip->i_disksize - offset;
+	if (offset + size > disksize)
+		size = disksize - offset;
 
 	if (!size)
 		return 0;
@@ -727,7 +731,7 @@ static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode,
 		unsigned hsize = 1 << ip->i_depth;
 		unsigned index;
 		u64 ln;
-		if (hsize * sizeof(u64) != ip->i_disksize) {
+		if (hsize * sizeof(u64) != i_size_read(inode)) {
 			gfs2_consist_inode(ip);
 			return ERR_PTR(-EIO);
 		}
@@ -879,7 +883,7 @@ static int dir_make_exhash(struct inode *inode)
 	for (x = sdp->sd_hash_ptrs; x--; lp++)
 		*lp = cpu_to_be64(bn);
 
-	dip->i_disksize = sdp->sd_sb.sb_bsize / 2;
+	i_size_write(inode, sdp->sd_sb.sb_bsize / 2);
 	gfs2_add_inode_blocks(&dip->i_inode, 1);
 	dip->i_diskflags |= GFS2_DIF_EXHASH;
 
@@ -955,7 +959,12 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
 	/* Change the pointers.
 	   Don't bother distinguishing stuffed from non-stuffed.
 	   This code is complicated enough already. */
-	lp = kmalloc(half_len * sizeof(__be64), GFP_NOFS | __GFP_NOFAIL);
+	lp = kmalloc(half_len * sizeof(__be64), GFP_NOFS);
+	if (!lp) {
+		error = -ENOMEM;
+		goto fail_brelse;
+	}
+
 	/*  Change the pointers  */
 	for (x = 0; x < half_len; x++)
 		lp[x] = cpu_to_be64(bn);
@@ -1052,20 +1061,23 @@ static int dir_double_exhash(struct gfs2_inode *dip)
 	u64 *buf;
 	u64 *from, *to;
 	u64 block;
+	u64 disksize = i_size_read(&dip->i_inode);
 	int x;
 	int error = 0;
 
 	hsize = 1 << dip->i_depth;
-	if (hsize * sizeof(u64) != dip->i_disksize) {
+	if (hsize * sizeof(u64) != disksize) {
 		gfs2_consist_inode(dip);
 		return -EIO;
 	}
 
 	/*  Allocate both the "from" and "to" buffers in one big chunk  */
 
-	buf = kcalloc(3, sdp->sd_hash_bsize, GFP_NOFS | __GFP_NOFAIL);
+	buf = kcalloc(3, sdp->sd_hash_bsize, GFP_NOFS);
+	if (!buf)
+		return -ENOMEM;
 
-	for (block = dip->i_disksize >> sdp->sd_hash_bsize_shift; block--;) {
+	for (block = disksize >> sdp->sd_hash_bsize_shift; block--;) {
 		error = gfs2_dir_read_data(dip, (char *)buf,
 					    block * sdp->sd_hash_bsize,
 					    sdp->sd_hash_bsize, 1);
@@ -1363,7 +1375,7 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
 	unsigned depth = 0;
 
 	hsize = 1 << dip->i_depth;
-	if (hsize * sizeof(u64) != dip->i_disksize) {
+	if (hsize * sizeof(u64) != i_size_read(inode)) {
 		gfs2_consist_inode(dip);
 		return -EIO;
 	}
@@ -1777,7 +1789,7 @@ static int foreach_leaf(struct gfs2_inode *dip, leaf_call_t lc, void *data)
 	int error = 0;
 
 	hsize = 1 << dip->i_depth;
-	if (hsize * sizeof(u64) != dip->i_disksize) {
+	if (hsize * sizeof(u64) != i_size_read(&dip->i_inode)) {
 		gfs2_consist_inode(dip);
 		return -EIO;
 	}
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
index 4f919440c3b..a98f644bd3d 100644
--- a/fs/gfs2/dir.h
+++ b/fs/gfs2/dir.h
@@ -17,23 +17,24 @@ struct inode;
 struct gfs2_inode;
 struct gfs2_inum;
 
-struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *filename);
-int gfs2_dir_check(struct inode *dir, const struct qstr *filename,
-		   const struct gfs2_inode *ip);
-int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
-		 const struct gfs2_inode *ip, unsigned int type);
-int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *filename);
-int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
-		  filldir_t filldir);
-int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
-		   const struct gfs2_inode *nip, unsigned int new_type);
+extern struct inode *gfs2_dir_search(struct inode *dir,
+				     const struct qstr *filename);
+extern int gfs2_dir_check(struct inode *dir, const struct qstr *filename,
+			  const struct gfs2_inode *ip);
+extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
+			const struct gfs2_inode *ip, unsigned int type);
+extern int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *filename);
+extern int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
+			 filldir_t filldir);
+extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
+			  const struct gfs2_inode *nip, unsigned int new_type);
 
-int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip);
+extern int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip);
 
-int gfs2_diradd_alloc_required(struct inode *dir,
-			       const struct qstr *filename);
-int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block,
-			    struct buffer_head **bhp);
+extern int gfs2_diradd_alloc_required(struct inode *dir,
+				      const struct qstr *filename);
+extern int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block,
+				   struct buffer_head **bhp);
 
 static inline u32 gfs2_disk_hash(const char *data, int len)
 {
@@ -61,4 +62,7 @@ static inline void gfs2_qstr2dirent(const struct qstr *name, u16 reclen, struct
 	memcpy(dent + 1, name->name, name->len);
 }
 
+extern struct qstr gfs2_qdot;
+extern struct qstr gfs2_qdotdot;
+
 #endif /* __DIR_DOT_H__ */
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index dfe237a3f8a..06d582732d3 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -126,16 +126,9 @@ static int gfs2_get_name(struct dentry *parent, char *name,
 
 static struct dentry *gfs2_get_parent(struct dentry *child)
 {
-	struct qstr dotdot;
 	struct dentry *dentry;
 
-	/*
-	 * XXX(hch): it would be a good idea to keep this around as a
-	 *	     static variable.
-	 */
-	gfs2_str2qstr(&dotdot, "..");
-
-	dentry = d_obtain_alias(gfs2_lookupi(child->d_inode, &dotdot, 1));
+	dentry = d_obtain_alias(gfs2_lookupi(child->d_inode, &gfs2_qdotdot, 1));
 	if (!IS_ERR(dentry))
 		dentry->d_op = &gfs2_dops;
 	return dentry;
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index ed9a94f0ef1..aa996471ec5 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -351,7 +351,6 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	unsigned long last_index;
 	u64 pos = page->index << PAGE_CACHE_SHIFT;
 	unsigned int data_blocks, ind_blocks, rblocks;
-	int alloc_required = 0;
 	struct gfs2_holder gh;
 	struct gfs2_alloc *al;
 	int ret;
@@ -364,8 +363,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	set_bit(GLF_DIRTY, &ip->i_gl->gl_flags);
 	set_bit(GIF_SW_PAGED, &ip->i_flags);
 
-	ret = gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE, &alloc_required);
-	if (ret || !alloc_required)
+	if (!gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE))
 		goto out_unlock;
 	ret = -ENOMEM;
 	al = gfs2_alloc_get(ip);
@@ -384,8 +382,10 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	rblocks = RES_DINODE + ind_blocks;
 	if (gfs2_is_jdata(ip))
 		rblocks += data_blocks ? data_blocks : 1;
-	if (ind_blocks || data_blocks)
+	if (ind_blocks || data_blocks) {
 		rblocks += RES_STATFS + RES_QUOTA;
+		rblocks += gfs2_rg_blocks(al);
+	}
 	ret = gfs2_trans_begin(sdp, rblocks, 0);
 	if (ret)
 		goto out_trans_fail;
@@ -493,7 +493,7 @@ static int gfs2_open(struct inode *inode, struct file *file)
 			goto fail;
 
 		if (!(file->f_flags & O_LARGEFILE) &&
-		    ip->i_disksize > MAX_NON_LFS) {
+		    i_size_read(inode) > MAX_NON_LFS) {
 			error = -EOVERFLOW;
 			goto fail_gunlock;
 		}
@@ -622,6 +622,8 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
  * cluster; until we do, disable leases (by just returning -EINVAL),
  * unless the administrator has requested purely local locking.
  *
+ * Locking: called under lock_flocks
+ *
  * Returns: errno
  */
 
@@ -773,6 +775,7 @@ const struct file_operations gfs2_dir_fops = {
 	.fsync		= gfs2_fsync,
 	.lock		= gfs2_lock,
 	.flock		= gfs2_flock,
+	.llseek		= default_llseek,
 };
 
 #endif /* CONFIG_GFS2_FS_LOCKING_DLM */
@@ -799,5 +802,6 @@ const struct file_operations gfs2_dir_fops_nolock = {
 	.open		= gfs2_open,
 	.release	= gfs2_close,
 	.fsync		= gfs2_fsync,
+	.llseek		= default_llseek,
 };
 
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 0898f3ec821..87778857f09 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -328,6 +328,30 @@ static void gfs2_holder_wake(struct gfs2_holder *gh)
 }
 
 /**
+ * do_error - Something unexpected has happened during a lock request
+ *
+ */
+
+static inline void do_error(struct gfs2_glock *gl, const int ret)
+{
+	struct gfs2_holder *gh, *tmp;
+
+	list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
+		if (test_bit(HIF_HOLDER, &gh->gh_iflags))
+			continue;
+		if (ret & LM_OUT_ERROR)
+			gh->gh_error = -EIO;
+		else if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))
+			gh->gh_error = GLR_TRYFAILED;
+		else
+			continue;
+		list_del_init(&gh->gh_list);
+		trace_gfs2_glock_queue(gh, 0);
+		gfs2_holder_wake(gh);
+	}
+}
+
+/**
  * do_promote - promote as many requests as possible on the current queue
  * @gl: The glock
  * 
@@ -375,36 +399,13 @@ restart:
 		}
 		if (gh->gh_list.prev == &gl->gl_holders)
 			return 1;
+		do_error(gl, 0);
 		break;
 	}
 	return 0;
 }
 
 /**
- * do_error - Something unexpected has happened during a lock request
- *
- */
-
-static inline void do_error(struct gfs2_glock *gl, const int ret)
-{
-	struct gfs2_holder *gh, *tmp;
-
-	list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
-		if (test_bit(HIF_HOLDER, &gh->gh_iflags))
-			continue;
-		if (ret & LM_OUT_ERROR)
-			gh->gh_error = -EIO;
-		else if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))
-			gh->gh_error = GLR_TRYFAILED;
-		else
-			continue;
-		list_del_init(&gh->gh_list);
-		trace_gfs2_glock_queue(gh, 0);
-		gfs2_holder_wake(gh);
-	}
-}
-
-/**
  * find_first_waiter - find the first gh that's waiting for the glock
  * @gl: the glock
  */
@@ -440,6 +441,8 @@ static void state_change(struct gfs2_glock *gl, unsigned int new_state)
 		else
 			gfs2_glock_put_nolock(gl);
 	}
+	if (held1 && held2 && list_empty(&gl->gl_holders))
+		clear_bit(GLF_QUEUED, &gl->gl_flags);
 
 	gl->gl_state = new_state;
 	gl->gl_tchange = jiffies;
@@ -706,18 +709,8 @@ static void glock_work_func(struct work_struct *work)
 {
 	unsigned long delay = 0;
 	struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_work.work);
-	struct gfs2_holder *gh;
 	int drop_ref = 0;
 
-	if (unlikely(test_bit(GLF_FROZEN, &gl->gl_flags))) {
-		spin_lock(&gl->gl_spin);
-		gh = find_first_waiter(gl);
-		if (gh && (gh->gh_flags & LM_FLAG_NOEXP) &&
-		    test_and_clear_bit(GLF_FROZEN, &gl->gl_flags))
-			set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
-		spin_unlock(&gl->gl_spin);
-	}
-
 	if (test_and_clear_bit(GLF_REPLY_PENDING, &gl->gl_flags)) {
 		finish_xmote(gl, gl->gl_reply);
 		drop_ref = 1;
@@ -1021,6 +1014,7 @@ fail:
 		if (unlikely((gh->gh_flags & LM_FLAG_PRIORITY) && !insert_pt))
 			insert_pt = &gh2->gh_list;
 	}
+	set_bit(GLF_QUEUED, &gl->gl_flags);
 	if (likely(insert_pt == NULL)) {
 		list_add_tail(&gh->gh_list, &gl->gl_holders);
 		if (unlikely(gh->gh_flags & LM_FLAG_PRIORITY))
@@ -1072,6 +1066,9 @@ int gfs2_glock_nq(struct gfs2_holder *gh)
 
 	spin_lock(&gl->gl_spin);
 	add_to_queue(gh);
+	if ((LM_FLAG_NOEXP & gh->gh_flags) &&
+	    test_and_clear_bit(GLF_FROZEN, &gl->gl_flags))
+		set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
 	run_queue(gl, 1);
 	spin_unlock(&gl->gl_spin);
 
@@ -1316,10 +1313,12 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
 
 	gfs2_glock_hold(gl);
 	holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time;
-	if (time_before(now, holdtime))
-		delay = holdtime - now;
-	if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags))
-		delay = gl->gl_ops->go_min_hold_time;
+	if (test_bit(GLF_QUEUED, &gl->gl_flags)) {
+		if (time_before(now, holdtime))
+			delay = holdtime - now;
+		if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags))
+			delay = gl->gl_ops->go_min_hold_time;
+	}
 
 	spin_lock(&gl->gl_spin);
 	handle_callback(gl, state, delay);
@@ -1329,6 +1328,36 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
 }
 
 /**
+ * gfs2_should_freeze - Figure out if glock should be frozen
+ * @gl: The glock in question
+ *
+ * Glocks are not frozen if (a) the result of the dlm operation is
+ * an error, (b) the locking operation was an unlock operation or
+ * (c) if there is a "noexp" flagged request anywhere in the queue
+ *
+ * Returns: 1 if freezing should occur, 0 otherwise
+ */
+
+static int gfs2_should_freeze(const struct gfs2_glock *gl)
+{
+	const struct gfs2_holder *gh;
+
+	if (gl->gl_reply & ~LM_OUT_ST_MASK)
+		return 0;
+	if (gl->gl_target == LM_ST_UNLOCKED)
+		return 0;
+
+	list_for_each_entry(gh, &gl->gl_holders, gh_list) {
+		if (test_bit(HIF_HOLDER, &gh->gh_iflags))
+			continue;
+		if (LM_FLAG_NOEXP & gh->gh_flags)
+			return 0;
+	}
+
+	return 1;
+}
+
+/**
  * gfs2_glock_complete - Callback used by locking
  * @gl: Pointer to the glock
  * @ret: The return value from the dlm
@@ -1338,18 +1367,17 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
 void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
 {
 	struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
+
 	gl->gl_reply = ret;
+
 	if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))) {
-		struct gfs2_holder *gh;
 		spin_lock(&gl->gl_spin);
-		gh = find_first_waiter(gl);
-		if ((!(gh && (gh->gh_flags & LM_FLAG_NOEXP)) &&
-		     (gl->gl_target != LM_ST_UNLOCKED)) ||
-		    ((ret & ~LM_OUT_ST_MASK) != 0))
+		if (gfs2_should_freeze(gl)) {
 			set_bit(GLF_FROZEN, &gl->gl_flags);
-		spin_unlock(&gl->gl_spin);
-		if (test_bit(GLF_FROZEN, &gl->gl_flags))
+			spin_unlock(&gl->gl_spin);
 			return;
+		}
+		spin_unlock(&gl->gl_spin);
 	}
 	set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
 	gfs2_glock_hold(gl);
@@ -1489,7 +1517,7 @@ static void clear_glock(struct gfs2_glock *gl)
 	spin_unlock(&lru_lock);
 
 	spin_lock(&gl->gl_spin);
-	if (find_first_holder(gl) == NULL && gl->gl_state != LM_ST_UNLOCKED)
+	if (gl->gl_state != LM_ST_UNLOCKED)
 		handle_callback(gl, LM_ST_UNLOCKED, 0);
 	spin_unlock(&gl->gl_spin);
 	gfs2_glock_hold(gl);
@@ -1637,6 +1665,8 @@ static const char *gflags2str(char *buf, const unsigned long *gflags)
 		*p++ = 'I';
 	if (test_bit(GLF_FROZEN, gflags))
 		*p++ = 'F';
+	if (test_bit(GLF_QUEUED, gflags))
+		*p++ = 'q';
 	*p = 0;
 	return buf;
 }
@@ -1753,10 +1783,12 @@ int __init gfs2_glock_init(void)
 	}
 #endif
 
-	glock_workqueue = create_workqueue("glock_workqueue");
+	glock_workqueue = alloc_workqueue("glock_workqueue", WQ_RESCUER |
+					  WQ_HIGHPRI | WQ_FREEZEABLE, 0);
 	if (IS_ERR(glock_workqueue))
 		return PTR_ERR(glock_workqueue);
-	gfs2_delete_workqueue = create_workqueue("delete_workqueue");
+	gfs2_delete_workqueue = alloc_workqueue("delete_workqueue", WQ_RESCUER |
+						WQ_FREEZEABLE, 0);
 	if (IS_ERR(gfs2_delete_workqueue)) {
 		destroy_workqueue(glock_workqueue);
 		return PTR_ERR(gfs2_delete_workqueue);
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 2bda1911b15..db1c26d6d22 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -215,7 +215,7 @@ void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs);
 void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...);
 
 /**
- * gfs2_glock_nq_init - intialize a holder and enqueue it on a glock
+ * gfs2_glock_nq_init - initialize a holder and enqueue it on a glock
  * @gl: the glock
  * @state: the state we're requesting
  * @flags: the modifier flags
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 49f97d3bb69..0d149dcc04e 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -262,13 +262,12 @@ static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
 	const struct gfs2_inode *ip = gl->gl_object;
 	if (ip == NULL)
 		return 0;
-	gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu/%llu\n",
+	gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu\n",
 		  (unsigned long long)ip->i_no_formal_ino,
 		  (unsigned long long)ip->i_no_addr,
 		  IF2DT(ip->i_inode.i_mode), ip->i_flags,
 		  (unsigned int)ip->i_diskflags,
-		  (unsigned long long)ip->i_inode.i_size,
-		  (unsigned long long)ip->i_disksize);
+		  (unsigned long long)i_size_read(&ip->i_inode));
 	return 0;
 }
 
@@ -453,7 +452,6 @@ const struct gfs2_glock_operations *gfs2_glops_list[] = {
 	[LM_TYPE_META] = &gfs2_meta_glops,
 	[LM_TYPE_INODE] = &gfs2_inode_glops,
 	[LM_TYPE_RGRP] = &gfs2_rgrp_glops,
-	[LM_TYPE_NONDISK] = &gfs2_trans_glops,
 	[LM_TYPE_IOPEN] = &gfs2_iopen_glops,
 	[LM_TYPE_FLOCK] = &gfs2_flock_glops,
 	[LM_TYPE_NONDISK] = &gfs2_nondisk_glops,
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index b5d7363b22d..764fbb49efc 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -12,7 +12,6 @@
 
 #include <linux/fs.h>
 #include <linux/workqueue.h>
-#include <linux/slow-work.h>
 #include <linux/dlm.h>
 #include <linux/buffer_head.h>
 
@@ -197,6 +196,7 @@ enum {
 	GLF_REPLY_PENDING		= 9,
 	GLF_INITIAL			= 10,
 	GLF_FROZEN			= 11,
+	GLF_QUEUED			= 12,
 };
 
 struct gfs2_glock {
@@ -268,7 +268,6 @@ struct gfs2_inode {
 	u64 i_no_formal_ino;
 	u64 i_generation;
 	u64 i_eattr;
-	loff_t i_disksize;
 	unsigned long i_flags;		/* GIF_... */
 	struct gfs2_glock *i_gl; /* Move into i_gh? */
 	struct gfs2_holder i_iopen_gh;
@@ -383,7 +382,7 @@ struct gfs2_journal_extent {
 struct gfs2_jdesc {
 	struct list_head jd_list;
 	struct list_head extent_list;
-	struct slow_work jd_work;
+	struct work_struct jd_work;
 	struct inode *jd_inode;
 	unsigned long jd_flags;
 #define JDF_RECOVERY 1
@@ -417,11 +416,8 @@ struct gfs2_args {
 	char ar_locktable[GFS2_LOCKNAME_LEN];	/* Name of the Lock Table */
 	char ar_hostdata[GFS2_LOCKNAME_LEN];	/* Host specific data */
 	unsigned int ar_spectator:1;		/* Don't get a journal */
-	unsigned int ar_ignore_local_fs:1;	/* Ignore optimisations */
 	unsigned int ar_localflocks:1;		/* Let the VFS do flock|fcntl */
-	unsigned int ar_localcaching:1;		/* Local caching */
 	unsigned int ar_debug:1;		/* Oops on errors */
-	unsigned int ar_upgrade:1;		/* Upgrade ondisk format */
 	unsigned int ar_posix_acl:1;		/* Enable posix acls */
 	unsigned int ar_quota:2;		/* off/account/on */
 	unsigned int ar_suiddir:1;		/* suiddir support */
@@ -460,6 +456,7 @@ enum {
 	SDF_NOBARRIERS		= 3,
 	SDF_NORECOVERY		= 4,
 	SDF_DEMOTE		= 5,
+	SDF_NOJOURNALID		= 6,
 };
 
 #define GFS2_FSNAME_LEN		256
@@ -497,7 +494,7 @@ struct gfs2_sb_host {
  */
 
 struct lm_lockstruct {
-	unsigned int ls_jid;
+	int ls_jid;
 	unsigned int ls_first;
 	unsigned int ls_first_done;
 	unsigned int ls_nodir;
@@ -572,6 +569,7 @@ struct gfs2_sbd {
 	struct list_head sd_rindex_mru_list;
 	struct gfs2_rgrpd *sd_rindex_forward;
 	unsigned int sd_rgrps;
+	unsigned int sd_max_rg_data;
 
 	/* Journal index stuff */
 
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index f03afd9c44b..06370f8bd8c 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -84,7 +84,7 @@ static int iget_skip_test(struct inode *inode, void *opaque)
 	struct gfs2_skip_data *data = opaque;
 
 	if (ip->i_no_addr == data->no_addr) {
-		if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)){
+		if (inode->i_state & (I_FREEING|I_WILL_FREE)){
 			data->skipped = 1;
 			return 0;
 		}
@@ -359,8 +359,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
 	 * to do that.
 	 */
 	ip->i_inode.i_nlink = be32_to_cpu(str->di_nlink);
-	ip->i_disksize = be64_to_cpu(str->di_size);
-	i_size_write(&ip->i_inode, ip->i_disksize);
+	i_size_write(&ip->i_inode, be64_to_cpu(str->di_size));
 	gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks));
 	atime.tv_sec = be64_to_cpu(str->di_atime);
 	atime.tv_nsec = be32_to_cpu(str->di_atime_nsec);
@@ -991,18 +990,29 @@ fail:
 
 static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
 {
+	struct inode *inode = &ip->i_inode;
 	struct buffer_head *dibh;
 	int error;
 
 	error = gfs2_meta_inode_buffer(ip, &dibh);
-	if (!error) {
-		error = inode_setattr(&ip->i_inode, attr);
-		gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error);
-		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-		gfs2_dinode_out(ip, dibh->b_data);
-		brelse(dibh);
+	if (error)
+		return error;
+
+	if ((attr->ia_valid & ATTR_SIZE) &&
+	    attr->ia_size != i_size_read(inode)) {
+		error = vmtruncate(inode, attr->ia_size);
+		if (error)
+			return error;
 	}
-	return error;
+
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+
+	gfs2_assert_warn(GFS2_SB(inode), !error);
+	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+	gfs2_dinode_out(ip, dibh->b_data);
+	brelse(dibh);
+	return 0;
 }
 
 /**
@@ -1044,7 +1054,7 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
 	str->di_uid = cpu_to_be32(ip->i_inode.i_uid);
 	str->di_gid = cpu_to_be32(ip->i_inode.i_gid);
 	str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink);
-	str->di_size = cpu_to_be64(ip->i_disksize);
+	str->di_size = cpu_to_be64(i_size_read(&ip->i_inode));
 	str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
 	str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec);
 	str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec);
@@ -1074,8 +1084,8 @@ void gfs2_dinode_print(const struct gfs2_inode *ip)
 	       (unsigned long long)ip->i_no_formal_ino);
 	printk(KERN_INFO "  no_addr = %llu\n",
 	       (unsigned long long)ip->i_no_addr);
-	printk(KERN_INFO "  i_disksize = %llu\n",
-	       (unsigned long long)ip->i_disksize);
+	printk(KERN_INFO "  i_size = %llu\n",
+	       (unsigned long long)i_size_read(&ip->i_inode));
 	printk(KERN_INFO "  blocks = %llu\n",
 	       (unsigned long long)gfs2_get_inode_blocks(&ip->i_inode));
 	printk(KERN_INFO "  i_goal = %llu\n",
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 300ada3f21d..6720d7d5fbc 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -19,6 +19,8 @@ extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask);
 extern int gfs2_internal_read(struct gfs2_inode *ip,
 			      struct file_ra_state *ra_state,
 			      char *buf, loff_t *pos, unsigned size);
+extern void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
+				   unsigned int from, unsigned int to);
 extern void gfs2_set_aops(struct inode *inode);
 
 static inline int gfs2_is_stuffed(const struct gfs2_inode *ip)
@@ -80,6 +82,19 @@ static inline void gfs2_inum_out(const struct gfs2_inode *ip,
 	dent->de_inum.no_addr = cpu_to_be64(ip->i_no_addr);
 }
 
+static inline int gfs2_check_internal_file_size(struct inode *inode,
+						u64 minsize, u64 maxsize)
+{
+	u64 size = i_size_read(inode);
+	if (size < minsize || size > maxsize)
+		goto err;
+	if (size & ((1 << inode->i_blkbits) - 1))
+		goto err;
+	return 0;
+err:
+	gfs2_consist_inode(GFS2_I(inode));
+	return -EIO;
+}
 
 extern void gfs2_set_iop(struct inode *inode);
 extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, 
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 0e0470ed34c..1c09425b45f 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -42,9 +42,9 @@ static void gdlm_ast(void *arg)
 		ret |= LM_OUT_CANCELED;
 		goto out;
 	case -EAGAIN: /* Try lock fails */
+	case -EDEADLK: /* Deadlock detected */
 		goto out;
-	case -EINVAL: /* Invalid */
-	case -ENOMEM: /* Out of memory */
+	case -ETIMEDOUT: /* Canceled due to timeout */
 		ret |= LM_OUT_ERROR;
 		goto out;
 	case 0: /* Success */
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 6a857e24f94..eb01f3575e1 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -592,22 +592,13 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
 	lh->lh_hash = cpu_to_be32(hash);
 
 	bh->b_end_io = end_buffer_write_sync;
-	if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
-		goto skip_barrier;
 	get_bh(bh);
-	submit_bh(WRITE_SYNC | (1 << BIO_RW_BARRIER) | (1 << BIO_RW_META), bh);
+	if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
+		submit_bh(WRITE_SYNC | REQ_META, bh);
+	else
+		submit_bh(WRITE_FLUSH_FUA | REQ_META, bh);
 	wait_on_buffer(bh);
-	if (buffer_eopnotsupp(bh)) {
-		clear_buffer_eopnotsupp(bh);
-		set_buffer_uptodate(bh);
-		fs_info(sdp, "barrier sync failed - disabling barriers\n");
-		set_bit(SDF_NOBARRIERS, &sdp->sd_flags);
-		lock_buffer(bh);
-skip_barrier:
-		get_bh(bh);
-		submit_bh(WRITE_SYNC | (1 << BIO_RW_META), bh);
-		wait_on_buffer(bh);
-	}
+
 	if (!buffer_uptodate(bh))
 		gfs2_io_error_bh(sdp, bh);
 	brelse(bh);
@@ -932,7 +923,7 @@ int gfs2_logd(void *data)
 
 		do {
 			prepare_to_wait(&sdp->sd_logd_waitq, &wait,
-					TASK_UNINTERRUPTIBLE);
+					TASK_INTERRUPTIBLE);
 			if (!gfs2_ail_flush_reqd(sdp) &&
 			    !gfs2_jrnl_flush_reqd(sdp) &&
 			    !kthread_should_stop())
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index fb2a5f93b7c..ebef7ab6e17 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -15,7 +15,6 @@
 #include <linux/init.h>
 #include <linux/gfs2_ondisk.h>
 #include <asm/atomic.h>
-#include <linux/slow-work.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -24,6 +23,8 @@
 #include "util.h"
 #include "glock.h"
 #include "quota.h"
+#include "recovery.h"
+#include "dir.h"
 
 static struct shrinker qd_shrinker = {
 	.shrink = gfs2_shrink_qd_memory,
@@ -78,6 +79,9 @@ static int __init init_gfs2_fs(void)
 {
 	int error;
 
+	gfs2_str2qstr(&gfs2_qdot, ".");
+	gfs2_str2qstr(&gfs2_qdotdot, "..");
+
 	error = gfs2_sys_init();
 	if (error)
 		return error;
@@ -138,9 +142,11 @@ static int __init init_gfs2_fs(void)
 	if (error)
 		goto fail_unregister;
 
-	error = slow_work_register_user(THIS_MODULE);
-	if (error)
-		goto fail_slow;
+	error = -ENOMEM;
+	gfs_recovery_wq = alloc_workqueue("gfs_recovery",
+					  WQ_MEM_RECLAIM | WQ_FREEZEABLE, 0);
+	if (!gfs_recovery_wq)
+		goto fail_wq;
 
 	gfs2_register_debugfs();
 
@@ -148,7 +154,7 @@ static int __init init_gfs2_fs(void)
 
 	return 0;
 
-fail_slow:
+fail_wq:
 	unregister_filesystem(&gfs2meta_fs_type);
 fail_unregister:
 	unregister_filesystem(&gfs2_fs_type);
@@ -190,7 +196,7 @@ static void __exit exit_gfs2_fs(void)
 	gfs2_unregister_debugfs();
 	unregister_filesystem(&gfs2_fs_type);
 	unregister_filesystem(&gfs2meta_fs_type);
-	slow_work_unregister_user(THIS_MODULE);
+	destroy_workqueue(gfs_recovery_wq);
 
 	kmem_cache_destroy(gfs2_quotad_cachep);
 	kmem_cache_destroy(gfs2_rgrpd_cachep);
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 18176d0b75d..939739c7b3f 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -36,8 +36,8 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
 {
 	struct buffer_head *bh, *head;
 	int nr_underway = 0;
-	int write_op = (1 << BIO_RW_META) | ((wbc->sync_mode == WB_SYNC_ALL ?
-			WRITE_SYNC_PLUG : WRITE));
+	int write_op = REQ_META |
+		(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC_PLUG : WRITE);
 
 	BUG_ON(!PageLocked(page));
 	BUG_ON(!page_has_buffers(page));
@@ -55,7 +55,7 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
 		 * activity, but those code paths have their own higher-level
 		 * throttling.
 		 */
-		if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
+		if (wbc->sync_mode != WB_SYNC_NONE) {
 			lock_buffer(bh);
 		} else if (!trylock_buffer(bh)) {
 			redirty_page_for_writepage(wbc, page);
@@ -225,7 +225,7 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
 	}
 	bh->b_end_io = end_buffer_read_sync;
 	get_bh(bh);
-	submit_bh(READ_SYNC | (1 << BIO_RW_META), bh);
+	submit_bh(READ_SYNC | REQ_META, bh);
 	if (!(flags & DIO_WAIT))
 		return 0;
 
@@ -432,7 +432,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
 	if (buffer_uptodate(first_bh))
 		goto out;
 	if (!buffer_locked(first_bh))
-		ll_rw_block(READ_SYNC | (1 << BIO_RW_META), 1, &first_bh);
+		ll_rw_block(READ_SYNC | REQ_META, 1, &first_bh);
 
 	dblock++;
 	extlen--;
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 3593b3a7290..3eb1393f7b8 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -17,7 +17,6 @@
 #include <linux/namei.h>
 #include <linux/mount.h>
 #include <linux/gfs2_ondisk.h>
-#include <linux/slow-work.h>
 #include <linux/quotaops.h>
 
 #include "gfs2.h"
@@ -39,14 +38,6 @@
 #define DO 0
 #define UNDO 1
 
-static const u32 gfs2_old_fs_formats[] = {
-        0
-};
-
-static const u32 gfs2_old_multihost_formats[] = {
-        0
-};
-
 /**
  * gfs2_tune_init - Fill a gfs2_tune structure with default values
  * @gt: tune
@@ -76,7 +67,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
 
 	sb->s_fs_info = sdp;
 	sdp->sd_vfs = sb;
-
+	set_bit(SDF_NOJOURNALID, &sdp->sd_flags);
 	gfs2_tune_init(&sdp->sd_tune);
 
 	init_waitqueue_head(&sdp->sd_glock_wait);
@@ -136,8 +127,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
 
 static int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent)
 {
-	unsigned int x;
-
 	if (sb->sb_magic != GFS2_MAGIC ||
 	    sb->sb_type != GFS2_METATYPE_SB) {
 		if (!silent)
@@ -151,55 +140,9 @@ static int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int sile
 	    sb->sb_multihost_format == GFS2_FORMAT_MULTI)
 		return 0;
 
-	if (sb->sb_fs_format != GFS2_FORMAT_FS) {
-		for (x = 0; gfs2_old_fs_formats[x]; x++)
-			if (gfs2_old_fs_formats[x] == sb->sb_fs_format)
-				break;
-
-		if (!gfs2_old_fs_formats[x]) {
-			printk(KERN_WARNING
-			       "GFS2: code version (%u, %u) is incompatible "
-			       "with ondisk format (%u, %u)\n",
-			       GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
-			       sb->sb_fs_format, sb->sb_multihost_format);
-			printk(KERN_WARNING
-			       "GFS2: I don't know how to upgrade this FS\n");
-			return -EINVAL;
-		}
-	}
-
-	if (sb->sb_multihost_format != GFS2_FORMAT_MULTI) {
-		for (x = 0; gfs2_old_multihost_formats[x]; x++)
-			if (gfs2_old_multihost_formats[x] ==
-			    sb->sb_multihost_format)
-				break;
-
-		if (!gfs2_old_multihost_formats[x]) {
-			printk(KERN_WARNING
-			       "GFS2: code version (%u, %u) is incompatible "
-			       "with ondisk format (%u, %u)\n",
-			       GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
-			       sb->sb_fs_format, sb->sb_multihost_format);
-			printk(KERN_WARNING
-			       "GFS2: I don't know how to upgrade this FS\n");
-			return -EINVAL;
-		}
-	}
-
-	if (!sdp->sd_args.ar_upgrade) {
-		printk(KERN_WARNING
-		       "GFS2: code version (%u, %u) is incompatible "
-		       "with ondisk format (%u, %u)\n",
-		       GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
-		       sb->sb_fs_format, sb->sb_multihost_format);
-		printk(KERN_INFO
-		       "GFS2: Use the \"upgrade\" mount option to upgrade "
-		       "the FS\n");
-		printk(KERN_INFO "GFS2: See the manual for more details\n");
-		return -EINVAL;
-	}
+	fs_warn(sdp, "Unknown on-disk format, unable to mount\n");
 
-	return 0;
+	return -EINVAL;
 }
 
 static void end_bio_io_page(struct bio *bio, int error)
@@ -275,7 +218,7 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector)
 
 	bio->bi_end_io = end_bio_io_page;
 	bio->bi_private = page;
-	submit_bio(READ_SYNC | (1 << BIO_RW_META), bio);
+	submit_bio(READ_SYNC | REQ_META, bio);
 	wait_on_page_locked(page);
 	bio_put(bio);
 	if (!PageUptodate(page)) {
@@ -587,7 +530,7 @@ static int map_journal_extents(struct gfs2_sbd *sdp)
 
 	prev_db = 0;
 
-	for (lb = 0; lb < ip->i_disksize >> sdp->sd_sb.sb_bsize_shift; lb++) {
+	for (lb = 0; lb < i_size_read(jd->jd_inode) >> sdp->sd_sb.sb_bsize_shift; lb++) {
 		bh.b_state = 0;
 		bh.b_blocknr = 0;
 		bh.b_size = 1 << ip->i_inode.i_blkbits;
@@ -673,7 +616,7 @@ static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
 			break;
 
 		INIT_LIST_HEAD(&jd->extent_list);
-		slow_work_init(&jd->jd_work, &gfs2_recover_ops);
+		INIT_WORK(&jd->jd_work, gfs2_recover_func);
 		jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1);
 		if (!jd->jd_inode || IS_ERR(jd->jd_inode)) {
 			if (!jd->jd_inode)
@@ -782,7 +725,8 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
 	if (sdp->sd_lockstruct.ls_first) {
 		unsigned int x;
 		for (x = 0; x < sdp->sd_journals; x++) {
-			error = gfs2_recover_journal(gfs2_jdesc_find(sdp, x));
+			error = gfs2_recover_journal(gfs2_jdesc_find(sdp, x),
+						     true);
 			if (error) {
 				fs_err(sdp, "error recovering journal %u: %d\n",
 				       x, error);
@@ -792,7 +736,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
 
 		gfs2_others_may_mount(sdp);
 	} else if (!sdp->sd_args.ar_spectator) {
-		error = gfs2_recover_journal(sdp->sd_jdesc);
+		error = gfs2_recover_journal(sdp->sd_jdesc, true);
 		if (error) {
 			fs_err(sdp, "error recovering my journal: %d\n", error);
 			goto fail_jinode_gh;
@@ -1022,7 +966,6 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
 	if (!strcmp("lock_nolock", proto)) {
 		lm = &nolock_ops;
 		sdp->sd_args.ar_localflocks = 1;
-		sdp->sd_args.ar_localcaching = 1;
 #ifdef CONFIG_GFS2_FS_LOCKING_DLM
 	} else if (!strcmp("lock_dlm", proto)) {
 		lm = &gfs2_dlm_ops;
@@ -1050,7 +993,8 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
 			ret = match_int(&tmp[0], &option);
 			if (ret || option < 0) 
 				goto hostdata_error;
-			ls->ls_jid = option;
+			if (test_and_clear_bit(SDF_NOJOURNALID, &sdp->sd_flags))
+				ls->ls_jid = option;
 			break;
 		case Opt_id:
 			/* Obsolete, but left for backward compat purposes */
@@ -1102,6 +1046,22 @@ void gfs2_lm_unmount(struct gfs2_sbd *sdp)
 		lm->lm_unmount(sdp);
 }
 
+static int gfs2_journalid_wait(void *word)
+{
+	if (signal_pending(current))
+		return -EINTR;
+	schedule();
+	return 0;
+}
+
+static int wait_on_journal(struct gfs2_sbd *sdp)
+{
+	if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL)
+		return 0;
+
+	return wait_on_bit(&sdp->sd_flags, SDF_NOJOURNALID, gfs2_journalid_wait, TASK_INTERRUPTIBLE);
+}
+
 void gfs2_online_uevent(struct gfs2_sbd *sdp)
 {
 	struct super_block *sb = sdp->sd_vfs;
@@ -1194,6 +1154,24 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
 	if (error)
 		goto fail_locking;
 
+	error = wait_on_journal(sdp);
+	if (error)
+		goto fail_sb;
+
+	/*
+	 * If user space has failed to join the cluster or some similar
+	 * failure has occurred, then the journal id will contain a
+	 * negative (error) number. This will then be returned to the
+	 * caller (of the mount syscall). We do this even for spectator
+	 * mounts (which just write a jid of 0 to indicate "ok" even though
+	 * the jid is unused in the spectator case)
+	 */
+	if (sdp->sd_lockstruct.ls_jid < 0) {
+		error = sdp->sd_lockstruct.ls_jid;
+		sdp->sd_lockstruct.ls_jid = 0;
+		goto fail_sb;
+	}
+
 	error = init_inodes(sdp, DO);
 	if (error)
 		goto fail_sb;
@@ -1241,7 +1219,6 @@ fail_sb:
 fail_locking:
 	init_locking(sdp, &mount_gh, UNDO);
 fail_lm:
-	invalidate_inodes(sb);
 	gfs2_gl_hash_clear(sdp);
 	gfs2_lm_unmount(sdp);
 fail_sys:
@@ -1273,12 +1250,11 @@ static int test_gfs2_super(struct super_block *s, void *ptr)
 }
 
 /**
- * gfs2_get_sb - Get the GFS2 superblock
+ * gfs2_mount - Get the GFS2 superblock
  * @fs_type: The GFS2 filesystem type
  * @flags: Mount flags
  * @dev_name: The name of the device
  * @data: The mount arguments
- * @mnt: The vfsmnt for this mount
  *
  * Q. Why not use get_sb_bdev() ?
  * A. We need to select one of two root directories to mount, independent
@@ -1287,8 +1263,8 @@ static int test_gfs2_super(struct super_block *s, void *ptr)
  * Returns: 0 or -ve on error
  */
 
-static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
-		       const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags,
+		       const char *dev_name, void *data)
 {
 	struct block_device *bdev;
 	struct super_block *s;
@@ -1302,7 +1278,7 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
 
 	bdev = open_bdev_exclusive(dev_name, mode, fs_type);
 	if (IS_ERR(bdev))
-		return PTR_ERR(bdev);
+		return ERR_CAST(bdev);
 
 	/*
 	 * once the super is inserted into the list by sget, s_umount
@@ -1321,6 +1297,9 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
 	if (IS_ERR(s))
 		goto error_bdev;
 
+	if (s->s_root)
+		close_bdev_exclusive(bdev, mode);
+
 	memset(&args, 0, sizeof(args));
 	args.ar_quota = GFS2_QUOTA_DEFAULT;
 	args.ar_data = GFS2_DATA_DEFAULT;
@@ -1332,17 +1311,13 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
 	error = gfs2_mount_args(&args, data);
 	if (error) {
 		printk(KERN_WARNING "GFS2: can't parse mount arguments\n");
-		if (s->s_root)
-			goto error_super;
-		deactivate_locked_super(s);
-		return error;
+		goto error_super;
 	}
 
 	if (s->s_root) {
 		error = -EBUSY;
 		if ((flags ^ s->s_flags) & MS_RDONLY)
 			goto error_super;
-		close_bdev_exclusive(bdev, mode);
 	} else {
 		char b[BDEVNAME_SIZE];
 
@@ -1351,27 +1326,24 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
 		strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
 		sb_set_blocksize(s, block_size(bdev));
 		error = fill_super(s, &args, flags & MS_SILENT ? 1 : 0);
-		if (error) {
-			deactivate_locked_super(s);
-			return error;
-		}
+		if (error)
+			goto error_super;
 		s->s_flags |= MS_ACTIVE;
 		bdev->bd_super = s;
 	}
 
 	sdp = s->s_fs_info;
-	mnt->mnt_sb = s;
 	if (args.ar_meta)
-		mnt->mnt_root = dget(sdp->sd_master_dir);
+		return dget(sdp->sd_master_dir);
 	else
-		mnt->mnt_root = dget(sdp->sd_root_dir);
-	return 0;
+		return dget(sdp->sd_root_dir);
 
 error_super:
 	deactivate_locked_super(s);
+	return ERR_PTR(error);
 error_bdev:
 	close_bdev_exclusive(bdev, mode);
-	return error;
+	return ERR_PTR(error);
 }
 
 static int set_meta_super(struct super_block *s, void *ptr)
@@ -1379,8 +1351,8 @@ static int set_meta_super(struct super_block *s, void *ptr)
 	return -EINVAL;
 }
 
-static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags,
-			    const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *gfs2_mount_meta(struct file_system_type *fs_type,
+			int flags, const char *dev_name, void *data)
 {
 	struct super_block *s;
 	struct gfs2_sbd *sdp;
@@ -1391,23 +1363,21 @@ static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags,
 	if (error) {
 		printk(KERN_WARNING "GFS2: path_lookup on %s returned error %d\n",
 		       dev_name, error);
-		return error;
+		return ERR_PTR(error);
 	}
 	s = sget(&gfs2_fs_type, test_gfs2_super, set_meta_super,
 		 path.dentry->d_inode->i_sb->s_bdev);
 	path_put(&path);
 	if (IS_ERR(s)) {
 		printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n");
-		return PTR_ERR(s);
+		return ERR_CAST(s);
 	}
 	if ((flags ^ s->s_flags) & MS_RDONLY) {
 		deactivate_locked_super(s);
-		return -EBUSY;
+		return ERR_PTR(-EBUSY);
 	}
 	sdp = s->s_fs_info;
-	mnt->mnt_sb = s;
-	mnt->mnt_root = dget(sdp->sd_master_dir);
-	return 0;
+	return dget(sdp->sd_master_dir);
 }
 
 static void gfs2_kill_sb(struct super_block *sb)
@@ -1433,7 +1403,7 @@ static void gfs2_kill_sb(struct super_block *sb)
 struct file_system_type gfs2_fs_type = {
 	.name = "gfs2",
 	.fs_flags = FS_REQUIRES_DEV,
-	.get_sb = gfs2_get_sb,
+	.mount = gfs2_mount,
 	.kill_sb = gfs2_kill_sb,
 	.owner = THIS_MODULE,
 };
@@ -1441,7 +1411,7 @@ struct file_system_type gfs2_fs_type = {
 struct file_system_type gfs2meta_fs_type = {
 	.name = "gfs2meta",
 	.fs_flags = FS_REQUIRES_DEV,
-	.get_sb = gfs2_get_sb_meta,
+	.mount = gfs2_mount_meta,
 	.owner = THIS_MODULE,
 };
 
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 98cdd05f331..12cbea7502c 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -18,6 +18,8 @@
 #include <linux/gfs2_ondisk.h>
 #include <linux/crc32.h>
 #include <linux/fiemap.h>
+#include <linux/swap.h>
+#include <linux/falloc.h>
 #include <asm/uaccess.h>
 
 #include "gfs2.h"
@@ -217,7 +219,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
 			goto out_gunlock_q;
 
 		error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
-					 al->al_rgd->rd_length +
+					 gfs2_rg_blocks(al) +
 					 2 * RES_DINODE + RES_STATFS +
 					 RES_QUOTA, 0);
 		if (error)
@@ -253,7 +255,7 @@ out_parent:
 	gfs2_holder_uninit(ghs);
 	gfs2_holder_uninit(ghs + 1);
 	if (!error) {
-		atomic_inc(&inode->i_count);
+		ihold(inode);
 		d_instantiate(dentry, inode);
 		mark_inode_dirty(inode);
 	}
@@ -406,7 +408,6 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
 
 	ip = ghs[1].gh_gl->gl_object;
 
-	ip->i_disksize = size;
 	i_size_write(inode, size);
 
 	error = gfs2_meta_inode_buffer(ip, &dibh);
@@ -461,7 +462,7 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	ip = ghs[1].gh_gl->gl_object;
 
 	ip->i_inode.i_nlink = 2;
-	ip->i_disksize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode);
+	i_size_write(inode, sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode));
 	ip->i_diskflags |= GFS2_DIF_JDATA;
 	ip->i_entries = 2;
 
@@ -470,18 +471,15 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	if (!gfs2_assert_withdraw(sdp, !error)) {
 		struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data;
 		struct gfs2_dirent *dent = (struct gfs2_dirent *)(di+1);
-		struct qstr str;
 
-		gfs2_str2qstr(&str, ".");
 		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-		gfs2_qstr2dirent(&str, GFS2_DIRENT_SIZE(str.len), dent);
+		gfs2_qstr2dirent(&gfs2_qdot, GFS2_DIRENT_SIZE(gfs2_qdot.len), dent);
 		dent->de_inum = di->di_num; /* already GFS2 endian */
 		dent->de_type = cpu_to_be16(DT_DIR);
 		di->di_entries = cpu_to_be32(1);
 
-		gfs2_str2qstr(&str, "..");
 		dent = (struct gfs2_dirent *)((char*)dent + GFS2_DIRENT_SIZE(1));
-		gfs2_qstr2dirent(&str, dibh->b_size - GFS2_DIRENT_SIZE(1) - sizeof(struct gfs2_dinode), dent);
+		gfs2_qstr2dirent(&gfs2_qdotdot, dibh->b_size - GFS2_DIRENT_SIZE(1) - sizeof(struct gfs2_dinode), dent);
 
 		gfs2_inum_out(dip, dent);
 		dent->de_type = cpu_to_be16(DT_DIR);
@@ -522,7 +520,6 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 static int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
 		       struct gfs2_inode *ip)
 {
-	struct qstr dotname;
 	int error;
 
 	if (ip->i_entries != 2) {
@@ -539,13 +536,11 @@ static int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
 	if (error)
 		return error;
 
-	gfs2_str2qstr(&dotname, ".");
-	error = gfs2_dir_del(ip, &dotname);
+	error = gfs2_dir_del(ip, &gfs2_qdot);
 	if (error)
 		return error;
 
-	gfs2_str2qstr(&dotname, "..");
-	error = gfs2_dir_del(ip, &dotname);
+	error = gfs2_dir_del(ip, &gfs2_qdotdot);
 	if (error)
 		return error;
 
@@ -694,11 +689,8 @@ static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
 	struct inode *dir = &to->i_inode;
 	struct super_block *sb = dir->i_sb;
 	struct inode *tmp;
-	struct qstr dotdot;
 	int error = 0;
 
-	gfs2_str2qstr(&dotdot, "..");
-
 	igrab(dir);
 
 	for (;;) {
@@ -711,7 +703,7 @@ static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
 			break;
 		}
 
-		tmp = gfs2_lookupi(dir, &dotdot, 1);
+		tmp = gfs2_lookupi(dir, &gfs2_qdotdot, 1);
 		if (IS_ERR(tmp)) {
 			error = PTR_ERR(tmp);
 			break;
@@ -744,7 +736,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 	struct gfs2_inode *ip = GFS2_I(odentry->d_inode);
 	struct gfs2_inode *nip = NULL;
 	struct gfs2_sbd *sdp = GFS2_SB(odir);
-	struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, };
+	struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, }, ri_gh;
 	struct gfs2_rgrpd *nrgd;
 	unsigned int num_gh;
 	int dir_rename = 0;
@@ -758,6 +750,9 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 			return 0;
 	}
 
+	error = gfs2_rindex_hold(sdp, &ri_gh);
+	if (error)
+		return error;
 
 	if (odip != ndip) {
 		error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE,
@@ -887,12 +882,12 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 
 		al->al_requested = sdp->sd_max_dirres;
 
-		error = gfs2_inplace_reserve(ndip);
+		error = gfs2_inplace_reserve_ri(ndip);
 		if (error)
 			goto out_gunlock_q;
 
 		error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
-					 al->al_rgd->rd_length +
+					 gfs2_rg_blocks(al) +
 					 4 * RES_DINODE + 4 * RES_LEAF +
 					 RES_STATFS + RES_QUOTA + 4, 0);
 		if (error)
@@ -920,9 +915,6 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 	}
 
 	if (dir_rename) {
-		struct qstr name;
-		gfs2_str2qstr(&name, "..");
-
 		error = gfs2_change_nlink(ndip, +1);
 		if (error)
 			goto out_end_trans;
@@ -930,7 +922,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 		if (error)
 			goto out_end_trans;
 
-		error = gfs2_dir_mvino(ip, &name, ndip, DT_DIR);
+		error = gfs2_dir_mvino(ip, &gfs2_qdotdot, ndip, DT_DIR);
 		if (error)
 			goto out_end_trans;
 	} else {
@@ -972,6 +964,7 @@ out_gunlock_r:
 	if (r_gh.gh_gl)
 		gfs2_glock_dq_uninit(&r_gh);
 out:
+	gfs2_glock_dq_uninit(&ri_gh);
 	return error;
 }
 
@@ -990,7 +983,7 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
 	struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
 	struct gfs2_holder i_gh;
 	struct buffer_head *dibh;
-	unsigned int x;
+	unsigned int x, size;
 	char *buf;
 	int error;
 
@@ -1002,7 +995,8 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
 		return NULL;
 	}
 
-	if (!ip->i_disksize) {
+	size = (unsigned int)i_size_read(&ip->i_inode);
+	if (size == 0) {
 		gfs2_consist_inode(ip);
 		buf = ERR_PTR(-EIO);
 		goto out;
@@ -1014,7 +1008,7 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
 		goto out;
 	}
 
-	x = ip->i_disksize + 1;
+	x = size + 1;
 	buf = kmalloc(x, GFP_NOFS);
 	if (!buf)
 		buf = ERR_PTR(-ENOMEM);
@@ -1071,32 +1065,6 @@ int gfs2_permission(struct inode *inode, int mask)
 	return error;
 }
 
-/*
- * XXX: should be changed to have proper ordering by opencoding simple_setsize
- */
-static int setattr_size(struct inode *inode, struct iattr *attr)
-{
-	struct gfs2_inode *ip = GFS2_I(inode);
-	struct gfs2_sbd *sdp = GFS2_SB(inode);
-	int error;
-
-	if (attr->ia_size != ip->i_disksize) {
-		error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
-		if (error)
-			return error;
-		error = simple_setsize(inode, attr->ia_size);
-		gfs2_trans_end(sdp);
-		if (error) 
-			return error;
-	}
-
-	error = gfs2_truncatei(ip, attr->ia_size);
-	if (error && (inode->i_size != ip->i_disksize))
-		i_size_write(inode, ip->i_disksize);
-
-	return error;
-}
-
 static int setattr_chown(struct inode *inode, struct iattr *attr)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
@@ -1136,8 +1104,16 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
 	if (error)
 		goto out_end_trans;
 
-	error = inode_setattr(inode, attr);
-	gfs2_assert_warn(sdp, !error);
+	if ((attr->ia_valid & ATTR_SIZE) &&
+	    attr->ia_size != i_size_read(inode)) {
+		int error;
+
+		error = vmtruncate(inode, attr->ia_size);
+		gfs2_assert_warn(sdp, !error);
+	}
+
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
 
 	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
 	gfs2_dinode_out(ip, dibh->b_data);
@@ -1189,7 +1165,7 @@ static int gfs2_setattr(struct dentry *dentry, struct iattr *attr)
 		goto out;
 
 	if (attr->ia_valid & ATTR_SIZE)
-		error = setattr_size(inode, attr);
+		error = gfs2_setattr_size(inode, attr->ia_size);
 	else if (attr->ia_valid & (ATTR_UID | ATTR_GID))
 		error = setattr_chown(inode, attr);
 	else if ((attr->ia_valid & ATTR_MODE) && IS_POSIXACL(inode))
@@ -1295,6 +1271,257 @@ static int gfs2_removexattr(struct dentry *dentry, const char *name)
 	return ret;
 }
 
+static void empty_write_end(struct page *page, unsigned from,
+			   unsigned to)
+{
+	struct gfs2_inode *ip = GFS2_I(page->mapping->host);
+
+	page_zero_new_buffers(page, from, to);
+	flush_dcache_page(page);
+	mark_page_accessed(page);
+
+	if (!gfs2_is_writeback(ip))
+		gfs2_page_add_databufs(ip, page, from, to);
+
+	block_commit_write(page, from, to);
+}
+
+
+static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
+{
+	unsigned start, end, next;
+	struct buffer_head *bh, *head;
+	int error;
+
+	if (!page_has_buffers(page)) {
+		error = __block_write_begin(page, from, to - from, gfs2_block_map);
+		if (unlikely(error))
+			return error;
+
+		empty_write_end(page, from, to);
+		return 0;
+	}
+
+	bh = head = page_buffers(page);
+	next = end = 0;
+	while (next < from) {
+		next += bh->b_size;
+		bh = bh->b_this_page;
+	}
+	start = next;
+	do {
+		next += bh->b_size;
+		if (buffer_mapped(bh)) {
+			if (end) {
+				error = __block_write_begin(page, start, end - start,
+							    gfs2_block_map);
+				if (unlikely(error))
+					return error;
+				empty_write_end(page, start, end);
+				end = 0;
+			}
+			start = next;
+		}
+		else
+			end = next;
+		bh = bh->b_this_page;
+	} while (next < to);
+
+	if (end) {
+		error = __block_write_begin(page, start, end - start, gfs2_block_map);
+		if (unlikely(error))
+			return error;
+		empty_write_end(page, start, end);
+	}
+
+	return 0;
+}
+
+static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
+			   int mode)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct buffer_head *dibh;
+	int error;
+	u64 start = offset >> PAGE_CACHE_SHIFT;
+	unsigned int start_offset = offset & ~PAGE_CACHE_MASK;
+	u64 end = (offset + len - 1) >> PAGE_CACHE_SHIFT;
+	pgoff_t curr;
+	struct page *page;
+	unsigned int end_offset = (offset + len) & ~PAGE_CACHE_MASK;
+	unsigned int from, to;
+
+	if (!end_offset)
+		end_offset = PAGE_CACHE_SIZE;
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (unlikely(error))
+		goto out;
+
+	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+
+	if (gfs2_is_stuffed(ip)) {
+		error = gfs2_unstuff_dinode(ip, NULL);
+		if (unlikely(error))
+			goto out;
+	}
+
+	curr = start;
+	offset = start << PAGE_CACHE_SHIFT;
+	from = start_offset;
+	to = PAGE_CACHE_SIZE;
+	while (curr <= end) {
+		page = grab_cache_page_write_begin(inode->i_mapping, curr,
+						   AOP_FLAG_NOFS);
+		if (unlikely(!page)) {
+			error = -ENOMEM;
+			goto out;
+		}
+
+		if (curr == end)
+			to = end_offset;
+		error = write_empty_blocks(page, from, to);
+		if (!error && offset + to > inode->i_size &&
+		    !(mode & FALLOC_FL_KEEP_SIZE)) {
+			i_size_write(inode, offset + to);
+		}
+		unlock_page(page);
+		page_cache_release(page);
+		if (error)
+			goto out;
+		curr++;
+		offset += PAGE_CACHE_SIZE;
+		from = 0;
+	}
+
+	gfs2_dinode_out(ip, dibh->b_data);
+	mark_inode_dirty(inode);
+
+	brelse(dibh);
+
+out:
+	return error;
+}
+
+static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len,
+			    unsigned int *data_blocks, unsigned int *ind_blocks)
+{
+	const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	unsigned int max_blocks = ip->i_alloc->al_rgd->rd_free_clone;
+	unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1);
+
+	for (tmp = max_data; tmp > sdp->sd_diptrs;) {
+		tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs);
+		max_data -= tmp;
+	}
+	/* This calculation isn't the exact reverse of gfs2_write_calc_reserve,
+	   so it might end up with fewer data blocks */
+	if (max_data <= *data_blocks)
+		return;
+	*data_blocks = max_data;
+	*ind_blocks = max_blocks - max_data;
+	*len = ((loff_t)max_data - 3) << sdp->sd_sb.sb_bsize_shift;
+	if (*len > max) {
+		*len = max;
+		gfs2_write_calc_reserv(ip, max, data_blocks, ind_blocks);
+	}
+}
+
+static long gfs2_fallocate(struct inode *inode, int mode, loff_t offset,
+			   loff_t len)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	struct gfs2_inode *ip = GFS2_I(inode);
+	unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
+	loff_t bytes, max_bytes;
+	struct gfs2_alloc *al;
+	int error;
+	loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift;
+	next = (next + 1) << sdp->sd_sb.sb_bsize_shift;
+
+	offset = (offset >> sdp->sd_sb.sb_bsize_shift) <<
+		 sdp->sd_sb.sb_bsize_shift;
+
+	len = next - offset;
+	bytes = sdp->sd_max_rg_data * sdp->sd_sb.sb_bsize / 2;
+	if (!bytes)
+		bytes = UINT_MAX;
+
+	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
+	error = gfs2_glock_nq(&ip->i_gh);
+	if (unlikely(error))
+		goto out_uninit;
+
+	if (!gfs2_write_alloc_required(ip, offset, len))
+		goto out_unlock;
+
+	while (len > 0) {
+		if (len < bytes)
+			bytes = len;
+		al = gfs2_alloc_get(ip);
+		if (!al) {
+			error = -ENOMEM;
+			goto out_unlock;
+		}
+
+		error = gfs2_quota_lock_check(ip);
+		if (error)
+			goto out_alloc_put;
+
+retry:
+		gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks);
+
+		al->al_requested = data_blocks + ind_blocks;
+		error = gfs2_inplace_reserve(ip);
+		if (error) {
+			if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) {
+				bytes >>= 1;
+				goto retry;
+			}
+			goto out_qunlock;
+		}
+		max_bytes = bytes;
+		calc_max_reserv(ip, len, &max_bytes, &data_blocks, &ind_blocks);
+		al->al_requested = data_blocks + ind_blocks;
+
+		rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA +
+			  RES_RG_HDR + gfs2_rg_blocks(al);
+		if (gfs2_is_jdata(ip))
+			rblocks += data_blocks ? data_blocks : 1;
+
+		error = gfs2_trans_begin(sdp, rblocks,
+					 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
+		if (error)
+			goto out_trans_fail;
+
+		error = fallocate_chunk(inode, offset, max_bytes, mode);
+		gfs2_trans_end(sdp);
+
+		if (error)
+			goto out_trans_fail;
+
+		len -= max_bytes;
+		offset += max_bytes;
+		gfs2_inplace_release(ip);
+		gfs2_quota_unlock(ip);
+		gfs2_alloc_put(ip);
+	}
+	goto out_unlock;
+
+out_trans_fail:
+	gfs2_inplace_release(ip);
+out_qunlock:
+	gfs2_quota_unlock(ip);
+out_alloc_put:
+	gfs2_alloc_put(ip);
+out_unlock:
+	gfs2_glock_dq(&ip->i_gh);
+out_uninit:
+	gfs2_holder_uninit(&ip->i_gh);
+	return error;
+}
+
+
 static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		       u64 start, u64 len)
 {
@@ -1345,6 +1572,7 @@ const struct inode_operations gfs2_file_iops = {
 	.getxattr = gfs2_getxattr,
 	.listxattr = gfs2_listxattr,
 	.removexattr = gfs2_removexattr,
+	.fallocate = gfs2_fallocate,
 	.fiemap = gfs2_fiemap,
 };
 
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 8f02d3db8f4..58a9b9998b4 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -735,10 +735,8 @@ get_a_page:
 		goto out;
 
 	size = loc + sizeof(struct gfs2_quota);
-	if (size > inode->i_size) {
-		ip->i_disksize = size;
+	if (size > inode->i_size)
 		i_size_write(inode, size);
-	}
 	inode->i_mtime = inode->i_atime = CURRENT_TIME;
 	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
 	gfs2_dinode_out(ip, dibh->b_data);
@@ -787,15 +785,9 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
 		goto out;
 
 	for (x = 0; x < num_qd; x++) {
-		int alloc_required;
-
 		offset = qd2offset(qda[x]);
-		error = gfs2_write_alloc_required(ip, offset,
-						  sizeof(struct gfs2_quota),
-						  &alloc_required);
-		if (error)
-			goto out_gunlock;
-		if (alloc_required)
+		if (gfs2_write_alloc_required(ip, offset,
+					      sizeof(struct gfs2_quota)))
 			nalloc++;
 	}
 
@@ -823,7 +815,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
 		goto out_alloc;
 
 	if (nalloc)
-		blocks += al->al_rgd->rd_length + nalloc * ind_blocks + RES_STATFS;
+		blocks += gfs2_rg_blocks(al) + nalloc * ind_blocks + RES_STATFS;
 
 	error = gfs2_trans_begin(sdp, blocks, 0);
 	if (error)
@@ -1196,18 +1188,17 @@ static void gfs2_quota_change_in(struct gfs2_quota_change_host *qc, const void *
 int gfs2_quota_init(struct gfs2_sbd *sdp)
 {
 	struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
-	unsigned int blocks = ip->i_disksize >> sdp->sd_sb.sb_bsize_shift;
+	u64 size = i_size_read(sdp->sd_qc_inode);
+	unsigned int blocks = size >> sdp->sd_sb.sb_bsize_shift;
 	unsigned int x, slot = 0;
 	unsigned int found = 0;
 	u64 dblock;
 	u32 extlen = 0;
 	int error;
 
-	if (!ip->i_disksize || ip->i_disksize > (64 << 20) ||
-	    ip->i_disksize & (sdp->sd_sb.sb_bsize - 1)) {
-		gfs2_consist_inode(ip);
+	if (gfs2_check_internal_file_size(sdp->sd_qc_inode, 1, 64 << 20))
 		return -EIO;
-	}
+
 	sdp->sd_quota_slots = blocks * sdp->sd_qc_per_block;
 	sdp->sd_quota_chunks = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * PAGE_SIZE);
 
@@ -1455,10 +1446,10 @@ static int gfs2_quota_get_xstate(struct super_block *sb,
 
 	switch (sdp->sd_args.ar_quota) {
 	case GFS2_QUOTA_ON:
-		fqs->qs_flags |= (XFS_QUOTA_UDQ_ENFD | XFS_QUOTA_GDQ_ENFD);
+		fqs->qs_flags |= (FS_QUOTA_UDQ_ENFD | FS_QUOTA_GDQ_ENFD);
 		/*FALLTHRU*/
 	case GFS2_QUOTA_ACCOUNT:
-		fqs->qs_flags |= (XFS_QUOTA_UDQ_ACCT | XFS_QUOTA_GDQ_ACCT);
+		fqs->qs_flags |= (FS_QUOTA_UDQ_ACCT | FS_QUOTA_GDQ_ACCT);
 		break;
 	case GFS2_QUOTA_OFF:
 		break;
@@ -1504,7 +1495,7 @@ static int gfs2_get_dqblk(struct super_block *sb, int type, qid_t id,
 
 	qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb;
 	fdq->d_version = FS_DQUOT_VERSION;
-	fdq->d_flags = (type == QUOTA_USER) ? XFS_USER_QUOTA : XFS_GROUP_QUOTA;
+	fdq->d_flags = (type == QUOTA_USER) ? FS_USER_QUOTA : FS_GROUP_QUOTA;
 	fdq->d_id = id;
 	fdq->d_blk_hardlimit = be64_to_cpu(qlvb->qb_limit);
 	fdq->d_blk_softlimit = be64_to_cpu(qlvb->qb_warn);
@@ -1539,12 +1530,12 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
 	switch(type) {
 	case USRQUOTA:
 		type = QUOTA_USER;
-		if (fdq->d_flags != XFS_USER_QUOTA)
+		if (fdq->d_flags != FS_USER_QUOTA)
 			return -EINVAL;
 		break;
 	case GRPQUOTA:
 		type = QUOTA_GROUP;
-		if (fdq->d_flags != XFS_GROUP_QUOTA)
+		if (fdq->d_flags != FS_GROUP_QUOTA)
 			return -EINVAL;
 		break;
 	default:
@@ -1584,10 +1575,7 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
 		goto out_i;
 
 	offset = qd2offset(qd);
-	error = gfs2_write_alloc_required(ip, offset, sizeof(struct gfs2_quota),
-					  &alloc_required);
-	if (error)
-		goto out_i;
+	alloc_required = gfs2_write_alloc_required(ip, offset, sizeof(struct gfs2_quota));
 	if (alloc_required) {
 		al = gfs2_alloc_get(ip);
 		if (al == NULL)
@@ -1598,6 +1586,7 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
 		error = gfs2_inplace_reserve(ip);
 		if (error)
 			goto out_alloc;
+		blocks += gfs2_rg_blocks(al);
 	}
 
 	error = gfs2_trans_begin(sdp, blocks + RES_DINODE + 1, 0);
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 4b9bece3d43..f2a02edcac8 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -14,7 +14,6 @@
 #include <linux/buffer_head.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/crc32.h>
-#include <linux/slow-work.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -28,6 +27,8 @@
 #include "util.h"
 #include "dir.h"
 
+struct workqueue_struct *gfs_recovery_wq;
+
 int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk,
 			   struct buffer_head **bh)
 {
@@ -443,23 +444,7 @@ static void gfs2_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
         kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp);
 }
 
-static int gfs2_recover_get_ref(struct slow_work *work)
-{
-	struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work);
-	if (test_and_set_bit(JDF_RECOVERY, &jd->jd_flags))
-		return -EBUSY;
-	return 0;
-}
-
-static void gfs2_recover_put_ref(struct slow_work *work)
-{
-	struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work);
-	clear_bit(JDF_RECOVERY, &jd->jd_flags);
-	smp_mb__after_clear_bit();
-	wake_up_bit(&jd->jd_flags, JDF_RECOVERY);
-}
-
-static void gfs2_recover_work(struct slow_work *work)
+void gfs2_recover_func(struct work_struct *work)
 {
 	struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work);
 	struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
@@ -470,11 +455,13 @@ static void gfs2_recover_work(struct slow_work *work)
 	int ro = 0;
 	unsigned int pass;
 	int error;
+	int jlocked = 0;
 
-	if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) {
+	if (sdp->sd_args.ar_spectator ||
+	    (jd->jd_jid != sdp->sd_lockstruct.ls_jid)) {
 		fs_info(sdp, "jid=%u: Trying to acquire journal lock...\n",
 			jd->jd_jid);
-
+		jlocked = 1;
 		/* Acquire the journal lock so we can do recovery */
 
 		error = gfs2_glock_nq_num(sdp, jd->jd_jid, &gfs2_journal_glops,
@@ -569,53 +556,55 @@ static void gfs2_recover_work(struct slow_work *work)
 			jd->jd_jid, t);
 	}
 
-	if (jd->jd_jid != sdp->sd_lockstruct.ls_jid)
-		gfs2_glock_dq_uninit(&ji_gh);
-
 	gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_SUCCESS);
 
-	if (jd->jd_jid != sdp->sd_lockstruct.ls_jid)
+	if (jlocked) {
+		gfs2_glock_dq_uninit(&ji_gh);
 		gfs2_glock_dq_uninit(&j_gh);
+	}
 
 	fs_info(sdp, "jid=%u: Done\n", jd->jd_jid);
-	return;
+	goto done;
 
 fail_gunlock_tr:
 	gfs2_glock_dq_uninit(&t_gh);
 fail_gunlock_ji:
-	if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) {
+	if (jlocked) {
 		gfs2_glock_dq_uninit(&ji_gh);
 fail_gunlock_j:
 		gfs2_glock_dq_uninit(&j_gh);
 	}
 
 	fs_info(sdp, "jid=%u: %s\n", jd->jd_jid, (error) ? "Failed" : "Done");
-
 fail:
 	gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP);
+done:
+	clear_bit(JDF_RECOVERY, &jd->jd_flags);
+	smp_mb__after_clear_bit();
+	wake_up_bit(&jd->jd_flags, JDF_RECOVERY);
 }
 
-struct slow_work_ops gfs2_recover_ops = {
-	.owner	 = THIS_MODULE,
-	.get_ref = gfs2_recover_get_ref,
-	.put_ref = gfs2_recover_put_ref,
-	.execute = gfs2_recover_work,
-};
-
-
 static int gfs2_recovery_wait(void *word)
 {
 	schedule();
 	return 0;
 }
 
-int gfs2_recover_journal(struct gfs2_jdesc *jd)
+int gfs2_recover_journal(struct gfs2_jdesc *jd, bool wait)
 {
 	int rv;
-	rv = slow_work_enqueue(&jd->jd_work);
-	if (rv)
-		return rv;
-	wait_on_bit(&jd->jd_flags, JDF_RECOVERY, gfs2_recovery_wait, TASK_UNINTERRUPTIBLE);
+
+	if (test_and_set_bit(JDF_RECOVERY, &jd->jd_flags))
+		return -EBUSY;
+
+	/* we have JDF_RECOVERY, queue should always succeed */
+	rv = queue_work(gfs_recovery_wq, &jd->jd_work);
+	BUG_ON(!rv);
+
+	if (wait)
+		wait_on_bit(&jd->jd_flags, JDF_RECOVERY, gfs2_recovery_wait,
+			    TASK_UNINTERRUPTIBLE);
+
 	return 0;
 }
 
diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h
index 1616ac22569..2226136c764 100644
--- a/fs/gfs2/recovery.h
+++ b/fs/gfs2/recovery.h
@@ -12,6 +12,8 @@
 
 #include "incore.h"
 
+extern struct workqueue_struct *gfs_recovery_wq;
+
 static inline void gfs2_replay_incr_blk(struct gfs2_sbd *sdp, unsigned int *blk)
 {
 	if (++*blk == sdp->sd_jdesc->jd_blocks)
@@ -27,8 +29,8 @@ extern void gfs2_revoke_clean(struct gfs2_sbd *sdp);
 
 extern int gfs2_find_jhead(struct gfs2_jdesc *jd,
 		    struct gfs2_log_header_host *head);
-extern int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd);
-extern struct slow_work_ops gfs2_recover_ops;
+extern int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd, bool wait);
+extern void gfs2_recover_func(struct work_struct *work);
 
 #endif /* __RECOVERY_DOT_H__ */
 
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 171a744f8e4..bef3ab6cf5c 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -500,7 +500,7 @@ u64 gfs2_ri_total(struct gfs2_sbd *sdp)
 	for (rgrps = 0;; rgrps++) {
 		loff_t pos = rgrps * sizeof(struct gfs2_rindex);
 
-		if (pos + sizeof(struct gfs2_rindex) >= ip->i_disksize)
+		if (pos + sizeof(struct gfs2_rindex) >= i_size_read(inode))
 			break;
 		error = gfs2_internal_read(ip, &ra_state, buf, &pos,
 					   sizeof(struct gfs2_rindex));
@@ -588,7 +588,9 @@ static int gfs2_ri_update(struct gfs2_inode *ip)
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct inode *inode = &ip->i_inode;
 	struct file_ra_state ra_state;
-	u64 rgrp_count = ip->i_disksize;
+	u64 rgrp_count = i_size_read(inode);
+	struct gfs2_rgrpd *rgd;
+	unsigned int max_data = 0;
 	int error;
 
 	do_div(rgrp_count, sizeof(struct gfs2_rindex));
@@ -603,6 +605,10 @@ static int gfs2_ri_update(struct gfs2_inode *ip)
 		}
 	}
 
+	list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list)
+		if (rgd->rd_data > max_data)
+			max_data = rgd->rd_data;
+	sdp->sd_max_rg_data = max_data;
 	sdp->sd_rindex_uptodate = 1;
 	return 0;
 }
@@ -622,13 +628,15 @@ static int gfs2_ri_update_special(struct gfs2_inode *ip)
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct inode *inode = &ip->i_inode;
 	struct file_ra_state ra_state;
+	struct gfs2_rgrpd *rgd;
+	unsigned int max_data = 0;
 	int error;
 
 	file_ra_state_init(&ra_state, inode->i_mapping);
 	for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) {
 		/* Ignore partials */
 		if ((sdp->sd_rgrps + 1) * sizeof(struct gfs2_rindex) >
-		    ip->i_disksize)
+		    i_size_read(inode))
 			break;
 		error = read_rindex_entry(ip, &ra_state);
 		if (error) {
@@ -636,6 +644,10 @@ static int gfs2_ri_update_special(struct gfs2_inode *ip)
 			return error;
 		}
 	}
+	list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list)
+		if (rgd->rd_data > max_data)
+			max_data = rgd->rd_data;
+	sdp->sd_max_rg_data = max_data;
 
 	sdp->sd_rindex_uptodate = 1;
 	return 0;
@@ -854,8 +866,7 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
 				if ((start + nr_sects) != blk) {
 					rv = blkdev_issue_discard(bdev, start,
 							    nr_sects, GFP_NOFS,
-							    BLKDEV_IFL_WAIT |
-							    BLKDEV_IFL_BARRIER);
+							    0);
 					if (rv)
 						goto fail;
 					nr_sects = 0;
@@ -869,8 +880,7 @@ start_new_extent:
 		}
 	}
 	if (nr_sects) {
-		rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS,
-					 BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
+		rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS, 0);
 		if (rv)
 			goto fail;
 	}
@@ -1188,7 +1198,8 @@ out:
  * Returns: errno
  */
 
-int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line)
+int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
+			   char *file, unsigned int line)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct gfs2_alloc *al = ip->i_alloc;
@@ -1199,12 +1210,15 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line)
 		return -EINVAL;
 
 try_again:
-	/* We need to hold the rindex unless the inode we're using is
-	   the rindex itself, in which case it's already held. */
-	if (ip != GFS2_I(sdp->sd_rindex))
-		error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
-	else if (!sdp->sd_rgrps) /* We may not have the rindex read in, so: */
-		error = gfs2_ri_update_special(ip);
+	if (hold_rindex) {
+		/* We need to hold the rindex unless the inode we're using is
+		   the rindex itself, in which case it's already held. */
+		if (ip != GFS2_I(sdp->sd_rindex))
+			error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
+		else if (!sdp->sd_rgrps) /* We may not have the rindex read
+					    in, so: */
+			error = gfs2_ri_update_special(ip);
+	}
 
 	if (error)
 		return error;
@@ -1215,7 +1229,7 @@ try_again:
 	   try to free it, and try the allocation again. */
 	error = get_local_rgrp(ip, &unlinked, &last_unlinked);
 	if (error) {
-		if (ip != GFS2_I(sdp->sd_rindex))
+		if (hold_rindex && ip != GFS2_I(sdp->sd_rindex))
 			gfs2_glock_dq_uninit(&al->al_ri_gh);
 		if (error != -EAGAIN)
 			return error;
@@ -1257,7 +1271,7 @@ void gfs2_inplace_release(struct gfs2_inode *ip)
 	al->al_rgd = NULL;
 	if (al->al_rgd_gh.gh_gl)
 		gfs2_glock_dq_uninit(&al->al_rgd_gh);
-	if (ip != GFS2_I(sdp->sd_rindex))
+	if (ip != GFS2_I(sdp->sd_rindex) && al->al_ri_gh.gh_gl)
 		gfs2_glock_dq_uninit(&al->al_ri_gh);
 }
 
@@ -1496,11 +1510,19 @@ int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n)
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct buffer_head *dibh;
 	struct gfs2_alloc *al = ip->i_alloc;
-	struct gfs2_rgrpd *rgd = al->al_rgd;
+	struct gfs2_rgrpd *rgd;
 	u32 goal, blk;
 	u64 block;
 	int error;
 
+	/* Only happens if there is a bug in gfs2, return something distinctive
+	 * to ensure that it is noticed.
+	 */
+	if (al == NULL)
+		return -ECANCELED;
+
+	rgd = al->al_rgd;
+
 	if (rgrp_contains_block(rgd, ip->i_goal))
 		goal = ip->i_goal - rgd->rd_data0;
 	else
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index f07119d8955..0e35c0466f9 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -39,10 +39,12 @@ static inline void gfs2_alloc_put(struct gfs2_inode *ip)
 	ip->i_alloc = NULL;
 }
 
-extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file,
-				  unsigned int line);
+extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
+				  char *file, unsigned int line);
 #define gfs2_inplace_reserve(ip) \
-gfs2_inplace_reserve_i((ip), __FILE__, __LINE__)
+	gfs2_inplace_reserve_i((ip), 1, __FILE__, __LINE__)
+#define gfs2_inplace_reserve_ri(ip) \
+	gfs2_inplace_reserve_i((ip), 0, __FILE__, __LINE__)
 
 extern void gfs2_inplace_release(struct gfs2_inode *ip);
 
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 4d1aad38f1b..2b2c4997430 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -85,6 +85,7 @@ static const match_table_t tokens = {
 	{Opt_locktable, "locktable=%s"},
 	{Opt_hostdata, "hostdata=%s"},
 	{Opt_spectator, "spectator"},
+	{Opt_spectator, "norecovery"},
 	{Opt_ignore_local_fs, "ignore_local_fs"},
 	{Opt_localflocks, "localflocks"},
 	{Opt_localcaching, "localcaching"},
@@ -159,13 +160,13 @@ int gfs2_mount_args(struct gfs2_args *args, char *options)
 			args->ar_spectator = 1;
 			break;
 		case Opt_ignore_local_fs:
-			args->ar_ignore_local_fs = 1;
+			/* Retained for backwards compat only */
 			break;
 		case Opt_localflocks:
 			args->ar_localflocks = 1;
 			break;
 		case Opt_localcaching:
-			args->ar_localcaching = 1;
+			/* Retained for backwards compat only */
 			break;
 		case Opt_debug:
 			if (args->ar_errors == GFS2_ERRORS_PANIC) {
@@ -179,7 +180,7 @@ int gfs2_mount_args(struct gfs2_args *args, char *options)
 			args->ar_debug = 0;
 			break;
 		case Opt_upgrade:
-			args->ar_upgrade = 1;
+			/* Retained for backwards compat only */
 			break;
 		case Opt_acl:
 			args->ar_posix_acl = 1;
@@ -342,23 +343,19 @@ int gfs2_jdesc_check(struct gfs2_jdesc *jd)
 {
 	struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
 	struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
-	int ar;
-	int error;
+	u64 size = i_size_read(jd->jd_inode);
 
-	if (ip->i_disksize < (8 << 20) || ip->i_disksize > (1 << 30) ||
-	    (ip->i_disksize & (sdp->sd_sb.sb_bsize - 1))) {
-		gfs2_consist_inode(ip);
+	if (gfs2_check_internal_file_size(jd->jd_inode, 8 << 20, 1 << 30))
 		return -EIO;
-	}
-	jd->jd_blocks = ip->i_disksize >> sdp->sd_sb.sb_bsize_shift;
 
-	error = gfs2_write_alloc_required(ip, 0, ip->i_disksize, &ar);
-	if (!error && ar) {
+	jd->jd_blocks = size >> sdp->sd_sb.sb_bsize_shift;
+
+	if (gfs2_write_alloc_required(ip, 0, size)) {
 		gfs2_consist_inode(ip);
-		error = -EIO;
+		return -EIO;
 	}
 
-	return error;
+	return 0;
 }
 
 /**
@@ -860,7 +857,6 @@ restart:
 	gfs2_clear_rgrpd(sdp);
 	gfs2_jindex_free(sdp);
 	/*  Take apart glock structures and buffer lists  */
-	invalidate_inodes(sdp->sd_vfs);
 	gfs2_gl_hash_clear(sdp);
 	/*  Unmount the locking protocol  */
 	gfs2_lm_unmount(sdp);
@@ -1132,9 +1128,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
 
 	/* Some flags must not be changed */
 	if (args_neq(&args, &sdp->sd_args, spectator) ||
-	    args_neq(&args, &sdp->sd_args, ignore_local_fs) ||
 	    args_neq(&args, &sdp->sd_args, localflocks) ||
-	    args_neq(&args, &sdp->sd_args, localcaching) ||
 	    args_neq(&args, &sdp->sd_args, meta))
 		return -EINVAL;
 
@@ -1191,7 +1185,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
  * node for later deallocation.
  */
 
-static void gfs2_drop_inode(struct inode *inode)
+static int gfs2_drop_inode(struct inode *inode)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
 
@@ -1200,26 +1194,7 @@ static void gfs2_drop_inode(struct inode *inode)
 		if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags))
 			clear_nlink(inode);
 	}
-	generic_drop_inode(inode);
-}
-
-/**
- * gfs2_clear_inode - Deallocate an inode when VFS is done with it
- * @inode: The VFS inode
- *
- */
-
-static void gfs2_clear_inode(struct inode *inode)
-{
-	struct gfs2_inode *ip = GFS2_I(inode);
-
-	ip->i_gl->gl_object = NULL;
-	gfs2_glock_put(ip->i_gl);
-	ip->i_gl = NULL;
-	if (ip->i_iopen_gh.gh_gl) {
-		ip->i_iopen_gh.gh_gl->gl_object = NULL;
-		gfs2_glock_dq_uninit(&ip->i_iopen_gh);
-	}
+	return generic_drop_inode(inode);
 }
 
 static int is_ancestor(const struct dentry *d1, const struct dentry *d2)
@@ -1256,16 +1231,10 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 		seq_printf(s, ",hostdata=%s", args->ar_hostdata);
 	if (args->ar_spectator)
 		seq_printf(s, ",spectator");
-	if (args->ar_ignore_local_fs)
-		seq_printf(s, ",ignore_local_fs");
 	if (args->ar_localflocks)
 		seq_printf(s, ",localflocks");
-	if (args->ar_localcaching)
-		seq_printf(s, ",localcaching");
 	if (args->ar_debug)
 		seq_printf(s, ",debug");
-	if (args->ar_upgrade)
-		seq_printf(s, ",upgrade");
 	if (args->ar_posix_acl)
 		seq_printf(s, ",acl");
 	if (args->ar_quota != GFS2_QUOTA_DEFAULT) {
@@ -1347,13 +1316,16 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
  * is safe, just less efficient.
  */
 
-static void gfs2_delete_inode(struct inode *inode)
+static void gfs2_evict_inode(struct inode *inode)
 {
 	struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_holder gh;
 	int error;
 
+	if (inode->i_nlink)
+		goto out;
+
 	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
 	if (unlikely(error)) {
 		gfs2_glock_dq_uninit(&ip->i_iopen_gh);
@@ -1407,10 +1379,18 @@ out_unlock:
 	gfs2_holder_uninit(&ip->i_iopen_gh);
 	gfs2_glock_dq_uninit(&gh);
 	if (error && error != GLR_TRYFAILED && error != -EROFS)
-		fs_warn(sdp, "gfs2_delete_inode: %d\n", error);
+		fs_warn(sdp, "gfs2_evict_inode: %d\n", error);
 out:
 	truncate_inode_pages(&inode->i_data, 0);
-	clear_inode(inode);
+	end_writeback(inode);
+
+	ip->i_gl->gl_object = NULL;
+	gfs2_glock_put(ip->i_gl);
+	ip->i_gl = NULL;
+	if (ip->i_iopen_gh.gh_gl) {
+		ip->i_iopen_gh.gh_gl->gl_object = NULL;
+		gfs2_glock_dq_uninit(&ip->i_iopen_gh);
+	}
 }
 
 static struct inode *gfs2_alloc_inode(struct super_block *sb)
@@ -1434,14 +1414,13 @@ const struct super_operations gfs2_super_ops = {
 	.alloc_inode		= gfs2_alloc_inode,
 	.destroy_inode		= gfs2_destroy_inode,
 	.write_inode		= gfs2_write_inode,
-	.delete_inode		= gfs2_delete_inode,
+	.evict_inode		= gfs2_evict_inode,
 	.put_super		= gfs2_put_super,
 	.sync_fs		= gfs2_sync_fs,
 	.freeze_fs 		= gfs2_freeze,
 	.unfreeze_fs		= gfs2_unfreeze,
 	.statfs			= gfs2_statfs,
 	.remount_fs		= gfs2_remount_fs,
-	.clear_inode		= gfs2_clear_inode,
 	.drop_inode		= gfs2_drop_inode,
 	.show_options		= gfs2_show_options,
 };
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 37f5393e68e..748ccb557c1 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -25,6 +25,7 @@
 #include "quota.h"
 #include "util.h"
 #include "glops.h"
+#include "recovery.h"
 
 struct gfs2_attr {
 	struct attribute attr;
@@ -229,7 +230,10 @@ static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len
 
 	if (gltype > LM_TYPE_JOURNAL)
 		return -EINVAL;
-	glops = gfs2_glops_list[gltype];
+	if (gltype == LM_TYPE_NONDISK && glnum == GFS2_TRANS_LOCK)
+		glops = &gfs2_trans_glops;
+	else
+		glops = gfs2_glops_list[gltype];
 	if (glops == NULL)
 		return -EINVAL;
 	if (!test_and_set_bit(SDF_DEMOTE, &sdp->sd_flags))
@@ -325,6 +329,30 @@ static ssize_t lkfirst_show(struct gfs2_sbd *sdp, char *buf)
 	return sprintf(buf, "%d\n", ls->ls_first);
 }
 
+static ssize_t lkfirst_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
+{
+	unsigned first;
+	int rv;
+
+	rv = sscanf(buf, "%u", &first);
+	if (rv != 1 || first > 1)
+		return -EINVAL;
+	spin_lock(&sdp->sd_jindex_spin);
+	rv = -EBUSY;
+	if (test_bit(SDF_NOJOURNALID, &sdp->sd_flags) == 0)
+		goto out;
+	rv = -EINVAL;
+	if (sdp->sd_args.ar_spectator)
+		goto out;
+	if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL)
+		goto out;
+        sdp->sd_lockstruct.ls_first = first;
+        rv = 0;
+out:
+        spin_unlock(&sdp->sd_jindex_spin);
+        return rv ? rv : len;
+}
+
 static ssize_t first_done_show(struct gfs2_sbd *sdp, char *buf)
 {
 	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
@@ -352,7 +380,7 @@ static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
 	list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
 		if (jd->jd_jid != jid)
 			continue;
-		rv = slow_work_enqueue(&jd->jd_work);
+		rv = gfs2_recover_journal(jd, false);
 		break;
 	}
 out:
@@ -374,7 +402,35 @@ static ssize_t recover_status_show(struct gfs2_sbd *sdp, char *buf)
 
 static ssize_t jid_show(struct gfs2_sbd *sdp, char *buf)
 {
-	return sprintf(buf, "%u\n", sdp->sd_lockstruct.ls_jid);
+	return sprintf(buf, "%d\n", sdp->sd_lockstruct.ls_jid);
+}
+
+static ssize_t jid_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
+{
+        int jid;
+	int rv;
+
+	rv = sscanf(buf, "%d", &jid);
+	if (rv != 1)
+		return -EINVAL;
+
+	spin_lock(&sdp->sd_jindex_spin);
+	rv = -EINVAL;
+	if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL)
+		goto out;
+	rv = -EBUSY;
+	if (test_bit(SDF_NOJOURNALID, &sdp->sd_flags) == 0)
+		goto out;
+	rv = 0;
+	if (sdp->sd_args.ar_spectator && jid > 0)
+		rv = jid = -EINVAL;
+	sdp->sd_lockstruct.ls_jid = jid;
+	clear_bit(SDF_NOJOURNALID, &sdp->sd_flags);
+	smp_mb__after_clear_bit();
+	wake_up_bit(&sdp->sd_flags, SDF_NOJOURNALID);
+out:
+	spin_unlock(&sdp->sd_jindex_spin);
+	return rv ? rv : len;
 }
 
 #define GDLM_ATTR(_name,_mode,_show,_store) \
@@ -383,8 +439,8 @@ static struct gfs2_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store)
 GDLM_ATTR(proto_name,		0444, proto_name_show,		NULL);
 GDLM_ATTR(block,		0644, block_show,		block_store);
 GDLM_ATTR(withdraw,		0644, withdraw_show,		withdraw_store);
-GDLM_ATTR(jid,			0444, jid_show,			NULL);
-GDLM_ATTR(first,		0444, lkfirst_show,		NULL);
+GDLM_ATTR(jid,			0644, jid_show,			jid_store);
+GDLM_ATTR(first,		0644, lkfirst_show,		lkfirst_store);
 GDLM_ATTR(first_done,		0444, first_done_show,		NULL);
 GDLM_ATTR(recover,		0600, NULL,			recover_store);
 GDLM_ATTR(recover_done,		0444, recover_done_show,	NULL);
@@ -564,8 +620,8 @@ static int gfs2_uevent(struct kset *kset, struct kobject *kobj,
 
 	add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name);
 	add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name);
-	if (!sdp->sd_args.ar_spectator)
-		add_uevent_var(env, "JOURNALID=%u", sdp->sd_lockstruct.ls_jid);
+	if (!test_bit(SDF_NOJOURNALID, &sdp->sd_flags))
+		add_uevent_var(env, "JOURNALID=%d", sdp->sd_lockstruct.ls_jid);
 	if (gfs2_uuid_valid(uuid))
 		add_uevent_var(env, "UUID=%pUB", uuid);
 	return 0;
diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h
index 148d55c1417..cedb0bb96d9 100644
--- a/fs/gfs2/trace_gfs2.h
+++ b/fs/gfs2/trace_gfs2.h
@@ -39,7 +39,8 @@
 	{(1UL << GLF_INVALIDATE_IN_PROGRESS),	"i" },		\
 	{(1UL << GLF_REPLY_PENDING),		"r" },		\
 	{(1UL << GLF_INITIAL),			"I" },		\
-	{(1UL << GLF_FROZEN),			"F" })
+	{(1UL << GLF_FROZEN),			"F" },		\
+	{(1UL << GLF_QUEUED),			"q" })
 
 #ifndef NUMPTY
 #define NUMPTY
diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h
index edf9d4bd908..fb56b783e02 100644
--- a/fs/gfs2/trans.h
+++ b/fs/gfs2/trans.h
@@ -20,11 +20,20 @@ struct gfs2_glock;
 #define RES_JDATA	1
 #define RES_DATA	1
 #define RES_LEAF	1
+#define RES_RG_HDR	1
 #define RES_RG_BIT	2
 #define RES_EATTR	1
 #define RES_STATFS	1
 #define RES_QUOTA	2
 
+/* reserve either the number of blocks to be allocated plus the rg header
+ * block, or all of the blocks in the rg, whichever is smaller */
+static inline unsigned int gfs2_rg_blocks(const struct gfs2_alloc *al)
+{
+	return (al->al_requested < al->al_rgd->rd_length)?
+	       al->al_requested + 1 : al->al_rgd->rd_length;
+}
+
 int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
 		     unsigned int revokes);
 
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 82f93da00d1..30b58f07c8a 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -734,7 +734,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
 		goto out_gunlock_q;
 
 	error = gfs2_trans_begin(GFS2_SB(&ip->i_inode),
-				 blks + al->al_rgd->rd_length +
+				 blks + gfs2_rg_blocks(al) +
 				 RES_DINODE + RES_STATFS + RES_QUOTA, 0);
 	if (error)
 		goto out_ipres;
@@ -1296,6 +1296,7 @@ fail:
 
 int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
 {
+	struct inode *inode = &ip->i_inode;
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct gfs2_ea_location el;
 	struct buffer_head *dibh;
@@ -1321,14 +1322,25 @@ int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
 		return error;
 
 	error = gfs2_meta_inode_buffer(ip, &dibh);
-	if (!error) {
-		error = inode_setattr(&ip->i_inode, attr);
-		gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error);
-		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-		gfs2_dinode_out(ip, dibh->b_data);
-		brelse(dibh);
+	if (error)
+		goto out_trans_end;
+
+	if ((attr->ia_valid & ATTR_SIZE) &&
+	    attr->ia_size != i_size_read(inode)) {
+		int error;
+
+		error = vmtruncate(inode, attr->ia_size);
+		gfs2_assert_warn(GFS2_SB(inode), !error);
 	}
 
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+
+	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+	gfs2_dinode_out(ip, dibh->b_data);
+	brelse(dibh);
+
+out_trans_end:
 	gfs2_trans_end(sdp);
 	return error;
 }
diff --git a/fs/hfs/bfind.c b/fs/hfs/bfind.c
index 4129cdb3f0d..571abe97b42 100644
--- a/fs/hfs/bfind.c
+++ b/fs/hfs/bfind.c
@@ -23,7 +23,7 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd)
 	fd->search_key = ptr;
 	fd->key = ptr + tree->max_key_len + 2;
 	dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n", tree->cnid, __builtin_return_address(0));
-	down(&tree->tree_lock);
+	mutex_lock(&tree->tree_lock);
 	return 0;
 }
 
@@ -32,7 +32,7 @@ void hfs_find_exit(struct hfs_find_data *fd)
 	hfs_bnode_put(fd->bnode);
 	kfree(fd->search_key);
 	dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n", fd->tree->cnid, __builtin_return_address(0));
-	up(&fd->tree->tree_lock);
+	mutex_unlock(&fd->tree->tree_lock);
 	fd->tree = NULL;
 }
 
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c
index 38a0a9917d7..3ebc437736f 100644
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -27,7 +27,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
 	if (!tree)
 		return NULL;
 
-	init_MUTEX(&tree->tree_lock);
+	mutex_init(&tree->tree_lock);
 	spin_lock_init(&tree->hash_lock);
 	/* Set the correct compare function */
 	tree->sb = sb;
diff --git a/fs/hfs/btree.h b/fs/hfs/btree.h
index cc51905ac21..2a1d712f85d 100644
--- a/fs/hfs/btree.h
+++ b/fs/hfs/btree.h
@@ -33,7 +33,7 @@ struct hfs_btree {
 	unsigned int depth;
 
 	//unsigned int map1_size, map_size;
-	struct semaphore tree_lock;
+	struct mutex tree_lock;
 
 	unsigned int pages_per_bnode;
 	spinlock_t hash_lock;
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index fe35e3b626c..c8cffb81e84 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -147,8 +147,6 @@ struct hfs_sb_info {
 	u16 blockoffset;
 
 	int fs_div;
-
-	struct hlist_head rsrc_inodes;
 };
 
 #define HFS_FLG_BITMAP_DIRTY	0
@@ -193,7 +191,7 @@ extern int hfs_inode_setattr(struct dentry *, struct iattr *);
 extern void hfs_inode_read_fork(struct inode *inode, struct hfs_extent *ext,
 			__be32 log_size, __be32 phys_size, u32 clump_size);
 extern struct inode *hfs_iget(struct super_block *, struct hfs_cat_key *, hfs_cat_rec *);
-extern void hfs_clear_inode(struct inode *);
+extern void hfs_evict_inode(struct inode *);
 extern void hfs_delete_inode(struct inode *);
 
 /* attr.c */
@@ -254,17 +252,6 @@ static inline void hfs_bitmap_dirty(struct super_block *sb)
 	sb->s_dirt = 1;
 }
 
-static inline void hfs_buffer_sync(struct buffer_head *bh)
-{
-	while (buffer_locked(bh)) {
-		wait_on_buffer(bh);
-	}
-	if (buffer_dirty(bh)) {
-		ll_rw_block(WRITE, 1, &bh);
-		wait_on_buffer(bh);
-	}
-}
-
 #define sb_bread512(sb, sec, data) ({			\
 	struct buffer_head *__bh;			\
 	sector_t __block;				\
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 14f5cb1b9fd..dffb4e99664 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -39,10 +39,19 @@ static int hfs_write_begin(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
 			struct page **pagep, void **fsdata)
 {
+	int ret;
+
 	*pagep = NULL;
-	return cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+	ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
 				hfs_get_block,
 				&HFS_I(mapping->host)->phys_size);
+	if (unlikely(ret)) {
+		loff_t isize = mapping->host->i_size;
+		if (pos + len > isize)
+			vmtruncate(mapping->host, isize);
+	}
+
+	return ret;
 }
 
 static sector_t hfs_bmap(struct address_space *mapping, sector_t block)
@@ -112,9 +121,24 @@ static ssize_t hfs_direct_IO(int rw, struct kiocb *iocb,
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host;
+	ssize_t ret;
 
-	return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
+	ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
 				  offset, nr_segs, hfs_get_block, NULL);
+
+	/*
+	 * In case of error extending write may have instantiated a few
+	 * blocks outside i_size. Trim these off again.
+	 */
+	if (unlikely((rw & WRITE) && ret < 0)) {
+		loff_t isize = i_size_read(inode);
+		loff_t end = offset + iov_length(iov, nr_segs);
+
+		if (end > isize)
+			vmtruncate(inode, isize);
+	}
+
+	return ret;
 }
 
 static int hfs_writepages(struct address_space *mapping,
@@ -500,15 +524,17 @@ static struct dentry *hfs_file_lookup(struct inode *dir, struct dentry *dentry,
 	HFS_I(inode)->rsrc_inode = dir;
 	HFS_I(dir)->rsrc_inode = inode;
 	igrab(dir);
-	hlist_add_head(&inode->i_hash, &HFS_SB(dir->i_sb)->rsrc_inodes);
+	hlist_add_fake(&inode->i_hash);
 	mark_inode_dirty(inode);
 out:
 	d_add(dentry, inode);
 	return NULL;
 }
 
-void hfs_clear_inode(struct inode *inode)
+void hfs_evict_inode(struct inode *inode)
 {
+	truncate_inode_pages(&inode->i_data, 0);
+	end_writeback(inode);
 	if (HFS_IS_RSRC(inode) && HFS_I(inode)->rsrc_inode) {
 		HFS_I(HFS_I(inode)->rsrc_inode)->rsrc_inode = NULL;
 		iput(HFS_I(inode)->rsrc_inode);
@@ -588,13 +614,43 @@ int hfs_inode_setattr(struct dentry *dentry, struct iattr * attr)
 			attr->ia_mode = inode->i_mode & ~S_IWUGO;
 		attr->ia_mode &= S_ISDIR(inode->i_mode) ? ~hsb->s_dir_umask: ~hsb->s_file_umask;
 	}
-	error = inode_setattr(inode, attr);
-	if (error)
-		return error;
 
+	if ((attr->ia_valid & ATTR_SIZE) &&
+	    attr->ia_size != i_size_read(inode)) {
+		error = vmtruncate(inode, attr->ia_size);
+		if (error)
+			return error;
+	}
+
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
 	return 0;
 }
 
+static int hfs_file_fsync(struct file *filp, int datasync)
+{
+	struct inode *inode = filp->f_mapping->host;
+	struct super_block * sb;
+	int ret, err;
+
+	/* sync the inode to buffers */
+	ret = write_inode_now(inode, 0);
+
+	/* sync the superblock to buffers */
+	sb = inode->i_sb;
+	if (sb->s_dirt) {
+		lock_super(sb);
+		sb->s_dirt = 0;
+		if (!(sb->s_flags & MS_RDONLY))
+			hfs_mdb_commit(sb);
+		unlock_super(sb);
+	}
+	/* .. finally sync the buffers to disk */
+	err = sync_blockdev(sb->s_bdev);
+	if (!ret)
+		ret = err;
+	return ret;
+}
 
 static const struct file_operations hfs_file_operations = {
 	.llseek		= generic_file_llseek,
@@ -604,7 +660,7 @@ static const struct file_operations hfs_file_operations = {
 	.aio_write	= generic_file_aio_write,
 	.mmap		= generic_file_mmap,
 	.splice_read	= generic_file_splice_read,
-	.fsync		= file_fsync,
+	.fsync		= hfs_file_fsync,
 	.open		= hfs_file_open,
 	.release	= hfs_file_release,
 };
diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c
index 86428f5ac99..1563d5ce576 100644
--- a/fs/hfs/mdb.c
+++ b/fs/hfs/mdb.c
@@ -220,7 +220,7 @@ int hfs_mdb_get(struct super_block *sb)
 		mdb->drLsMod = hfs_mtime();
 
 		mark_buffer_dirty(HFS_SB(sb)->mdb_bh);
-		hfs_buffer_sync(HFS_SB(sb)->mdb_bh);
+		sync_dirty_buffer(HFS_SB(sb)->mdb_bh);
 	}
 
 	return 0;
@@ -287,7 +287,7 @@ void hfs_mdb_commit(struct super_block *sb)
 		HFS_SB(sb)->alt_mdb->drAtrb |= cpu_to_be16(HFS_SB_ATTRIB_UNMNT);
 		HFS_SB(sb)->alt_mdb->drAtrb &= cpu_to_be16(~HFS_SB_ATTRIB_INCNSTNT);
 		mark_buffer_dirty(HFS_SB(sb)->alt_mdb_bh);
-		hfs_buffer_sync(HFS_SB(sb)->alt_mdb_bh);
+		sync_dirty_buffer(HFS_SB(sb)->alt_mdb_bh);
 	}
 
 	if (test_and_clear_bit(HFS_FLG_BITMAP_DIRTY, &HFS_SB(sb)->flags)) {
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 0a81eb7111f..4824c27cebb 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -20,7 +20,6 @@
 #include <linux/parser.h>
 #include <linux/seq_file.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
 #include <linux/vfs.h>
 
 #include "hfs_fs.h"
@@ -79,15 +78,11 @@ static int hfs_sync_fs(struct super_block *sb, int wait)
  */
 static void hfs_put_super(struct super_block *sb)
 {
-	lock_kernel();
-
 	if (sb->s_dirt)
 		hfs_write_super(sb);
 	hfs_mdb_close(sb);
 	/* release the MDB's resources */
 	hfs_mdb_put(sb);
-
-	unlock_kernel();
 }
 
 /*
@@ -181,7 +176,7 @@ static const struct super_operations hfs_super_operations = {
 	.alloc_inode	= hfs_alloc_inode,
 	.destroy_inode	= hfs_destroy_inode,
 	.write_inode	= hfs_write_inode,
-	.clear_inode	= hfs_clear_inode,
+	.evict_inode	= hfs_evict_inode,
 	.put_super	= hfs_put_super,
 	.write_super	= hfs_write_super,
 	.sync_fs	= hfs_sync_fs,
@@ -385,8 +380,8 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent)
 	sbi = kzalloc(sizeof(struct hfs_sb_info), GFP_KERNEL);
 	if (!sbi)
 		return -ENOMEM;
+
 	sb->s_fs_info = sbi;
-	INIT_HLIST_HEAD(&sbi->rsrc_inodes);
 
 	res = -EINVAL;
 	if (!parse_options((char *)data, sbi)) {
@@ -446,17 +441,16 @@ bail:
 	return res;
 }
 
-static int hfs_get_sb(struct file_system_type *fs_type,
-		      int flags, const char *dev_name, void *data,
-		      struct vfsmount *mnt)
+static struct dentry *hfs_mount(struct file_system_type *fs_type,
+		      int flags, const char *dev_name, void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, hfs_fill_super, mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, hfs_fill_super);
 }
 
 static struct file_system_type hfs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "hfs",
-	.get_sb		= hfs_get_sb,
+	.mount		= hfs_mount,
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
diff --git a/fs/hfsplus/bfind.c b/fs/hfsplus/bfind.c
index 5007a41f1be..d182438c7ae 100644
--- a/fs/hfsplus/bfind.c
+++ b/fs/hfsplus/bfind.c
@@ -23,7 +23,7 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd)
 	fd->search_key = ptr;
 	fd->key = ptr + tree->max_key_len + 2;
 	dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n", tree->cnid, __builtin_return_address(0));
-	down(&tree->tree_lock);
+	mutex_lock(&tree->tree_lock);
 	return 0;
 }
 
@@ -32,7 +32,7 @@ void hfs_find_exit(struct hfs_find_data *fd)
 	hfs_bnode_put(fd->bnode);
 	kfree(fd->search_key);
 	dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n", fd->tree->cnid, __builtin_return_address(0));
-	up(&fd->tree->tree_lock);
+	mutex_unlock(&fd->tree->tree_lock);
 	fd->tree = NULL;
 }
 
@@ -52,6 +52,10 @@ int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd)
 		rec = (e + b) / 2;
 		len = hfs_brec_lenoff(bnode, rec, &off);
 		keylen = hfs_brec_keylen(bnode, rec);
+		if (keylen == 0) {
+			res = -EINVAL;
+			goto fail;
+		}
 		hfs_bnode_read(bnode, fd->key, off, keylen);
 		cmpval = bnode->tree->keycmp(fd->key, fd->search_key);
 		if (!cmpval) {
@@ -67,6 +71,10 @@ int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd)
 	if (rec != e && e >= 0) {
 		len = hfs_brec_lenoff(bnode, e, &off);
 		keylen = hfs_brec_keylen(bnode, e);
+		if (keylen == 0) {
+			res = -EINVAL;
+			goto fail;
+		}
 		hfs_bnode_read(bnode, fd->key, off, keylen);
 	}
 done:
@@ -75,6 +83,7 @@ done:
 	fd->keylength = keylen;
 	fd->entryoffset = off + keylen;
 	fd->entrylength = len - keylen;
+fail:
 	return res;
 }
 
@@ -198,6 +207,10 @@ int hfs_brec_goto(struct hfs_find_data *fd, int cnt)
 
 	len = hfs_brec_lenoff(bnode, fd->record, &off);
 	keylen = hfs_brec_keylen(bnode, fd->record);
+	if (keylen == 0) {
+		res = -EINVAL;
+		goto out;
+	}
 	fd->keyoffset = off;
 	fd->keylength = keylen;
 	fd->entryoffset = off + keylen;
diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c
index ea30afc2a03..ad57f5991eb 100644
--- a/fs/hfsplus/bitmap.c
+++ b/fs/hfsplus/bitmap.c
@@ -17,6 +17,7 @@
 
 int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *max)
 {
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
 	struct page *page;
 	struct address_space *mapping;
 	__be32 *pptr, *curr, *end;
@@ -29,8 +30,8 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *ma
 		return size;
 
 	dprint(DBG_BITMAP, "block_allocate: %u,%u,%u\n", size, offset, len);
-	mutex_lock(&HFSPLUS_SB(sb).alloc_file->i_mutex);
-	mapping = HFSPLUS_SB(sb).alloc_file->i_mapping;
+	mutex_lock(&sbi->alloc_mutex);
+	mapping = sbi->alloc_file->i_mapping;
 	page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, NULL);
 	if (IS_ERR(page)) {
 		start = size;
@@ -150,16 +151,17 @@ done:
 	set_page_dirty(page);
 	kunmap(page);
 	*max = offset + (curr - pptr) * 32 + i - start;
-	HFSPLUS_SB(sb).free_blocks -= *max;
+	sbi->free_blocks -= *max;
 	sb->s_dirt = 1;
 	dprint(DBG_BITMAP, "-> %u,%u\n", start, *max);
 out:
-	mutex_unlock(&HFSPLUS_SB(sb).alloc_file->i_mutex);
+	mutex_unlock(&sbi->alloc_mutex);
 	return start;
 }
 
 int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count)
 {
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
 	struct page *page;
 	struct address_space *mapping;
 	__be32 *pptr, *curr, *end;
@@ -172,11 +174,11 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count)
 
 	dprint(DBG_BITMAP, "block_free: %u,%u\n", offset, count);
 	/* are all of the bits in range? */
-	if ((offset + count) > HFSPLUS_SB(sb).total_blocks)
+	if ((offset + count) > sbi->total_blocks)
 		return -2;
 
-	mutex_lock(&HFSPLUS_SB(sb).alloc_file->i_mutex);
-	mapping = HFSPLUS_SB(sb).alloc_file->i_mapping;
+	mutex_lock(&sbi->alloc_mutex);
+	mapping = sbi->alloc_file->i_mapping;
 	pnr = offset / PAGE_CACHE_BITS;
 	page = read_mapping_page(mapping, pnr, NULL);
 	pptr = kmap(page);
@@ -224,9 +226,9 @@ done:
 out:
 	set_page_dirty(page);
 	kunmap(page);
-	HFSPLUS_SB(sb).free_blocks += len;
+	sbi->free_blocks += len;
 	sb->s_dirt = 1;
-	mutex_unlock(&HFSPLUS_SB(sb).alloc_file->i_mutex);
+	mutex_unlock(&sbi->alloc_mutex);
 
 	return 0;
 }
diff --git a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c
index c88e5d72a40..2f39d05443e 100644
--- a/fs/hfsplus/brec.c
+++ b/fs/hfsplus/brec.c
@@ -42,10 +42,13 @@ u16 hfs_brec_keylen(struct hfs_bnode *node, u16 rec)
 		recoff = hfs_bnode_read_u16(node, node->tree->node_size - (rec + 1) * 2);
 		if (!recoff)
 			return 0;
-		if (node->tree->attributes & HFS_TREE_BIGKEYS)
-			retval = hfs_bnode_read_u16(node, recoff) + 2;
-		else
-			retval = (hfs_bnode_read_u8(node, recoff) | 1) + 1;
+
+		retval = hfs_bnode_read_u16(node, recoff) + 2;
+		if (retval > node->tree->max_key_len + 2) {
+			printk(KERN_ERR "hfs: keylen %d too large\n",
+				retval);
+			retval = 0;
+		}
 	}
 	return retval;
 }
@@ -216,7 +219,7 @@ skip:
 static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
 {
 	struct hfs_btree *tree;
-	struct hfs_bnode *node, *new_node;
+	struct hfs_bnode *node, *new_node, *next_node;
 	struct hfs_bnode_desc node_desc;
 	int num_recs, new_rec_off, new_off, old_rec_off;
 	int data_start, data_end, size;
@@ -235,6 +238,17 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
 	new_node->type = node->type;
 	new_node->height = node->height;
 
+	if (node->next)
+		next_node = hfs_bnode_find(tree, node->next);
+	else
+		next_node = NULL;
+
+	if (IS_ERR(next_node)) {
+		hfs_bnode_put(node);
+		hfs_bnode_put(new_node);
+		return next_node;
+	}
+
 	size = tree->node_size / 2 - node->num_recs * 2 - 14;
 	old_rec_off = tree->node_size - 4;
 	num_recs = 1;
@@ -248,6 +262,8 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
 		/* panic? */
 		hfs_bnode_put(node);
 		hfs_bnode_put(new_node);
+		if (next_node)
+			hfs_bnode_put(next_node);
 		return ERR_PTR(-ENOSPC);
 	}
 
@@ -302,8 +318,7 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
 	hfs_bnode_write(node, &node_desc, 0, sizeof(node_desc));
 
 	/* update next bnode header */
-	if (new_node->next) {
-		struct hfs_bnode *next_node = hfs_bnode_find(tree, new_node->next);
+	if (next_node) {
 		next_node->prev = new_node->this;
 		hfs_bnode_read(next_node, &node_desc, 0, sizeof(node_desc));
 		node_desc.prev = cpu_to_be32(next_node->prev);
diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c
index e49fcee1e29..22e4d4e3299 100644
--- a/fs/hfsplus/btree.c
+++ b/fs/hfsplus/btree.c
@@ -30,7 +30,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
 	if (!tree)
 		return NULL;
 
-	init_MUTEX(&tree->tree_lock);
+	mutex_init(&tree->tree_lock);
 	spin_lock_init(&tree->hash_lock);
 	tree->sb = sb;
 	tree->cnid = id;
@@ -39,10 +39,16 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
 		goto free_tree;
 	tree->inode = inode;
 
+	if (!HFSPLUS_I(tree->inode)->first_blocks) {
+		printk(KERN_ERR
+		       "hfs: invalid btree extent records (0 size).\n");
+		goto free_inode;
+	}
+
 	mapping = tree->inode->i_mapping;
 	page = read_mapping_page(mapping, 0, NULL);
 	if (IS_ERR(page))
-		goto free_tree;
+		goto free_inode;
 
 	/* Load the header */
 	head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc));
@@ -57,27 +63,56 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
 	tree->max_key_len = be16_to_cpu(head->max_key_len);
 	tree->depth = be16_to_cpu(head->depth);
 
-	/* Set the correct compare function */
-	if (id == HFSPLUS_EXT_CNID) {
+	/* Verify the tree and set the correct compare function */
+	switch (id) {
+	case HFSPLUS_EXT_CNID:
+		if (tree->max_key_len != HFSPLUS_EXT_KEYLEN - sizeof(u16)) {
+			printk(KERN_ERR "hfs: invalid extent max_key_len %d\n",
+				tree->max_key_len);
+			goto fail_page;
+		}
+		if (tree->attributes & HFS_TREE_VARIDXKEYS) {
+			printk(KERN_ERR "hfs: invalid extent btree flag\n");
+			goto fail_page;
+		}
+
 		tree->keycmp = hfsplus_ext_cmp_key;
-	} else if (id == HFSPLUS_CAT_CNID) {
-		if ((HFSPLUS_SB(sb).flags & HFSPLUS_SB_HFSX) &&
+		break;
+	case HFSPLUS_CAT_CNID:
+		if (tree->max_key_len != HFSPLUS_CAT_KEYLEN - sizeof(u16)) {
+			printk(KERN_ERR "hfs: invalid catalog max_key_len %d\n",
+				tree->max_key_len);
+			goto fail_page;
+		}
+		if (!(tree->attributes & HFS_TREE_VARIDXKEYS)) {
+			printk(KERN_ERR "hfs: invalid catalog btree flag\n");
+			goto fail_page;
+		}
+
+		if (test_bit(HFSPLUS_SB_HFSX, &HFSPLUS_SB(sb)->flags) &&
 		    (head->key_type == HFSPLUS_KEY_BINARY))
 			tree->keycmp = hfsplus_cat_bin_cmp_key;
 		else {
 			tree->keycmp = hfsplus_cat_case_cmp_key;
-			HFSPLUS_SB(sb).flags |= HFSPLUS_SB_CASEFOLD;
+			set_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags);
 		}
-	} else {
+		break;
+	default:
 		printk(KERN_ERR "hfs: unknown B*Tree requested\n");
 		goto fail_page;
 	}
 
+	if (!(tree->attributes & HFS_TREE_BIGKEYS)) {
+		printk(KERN_ERR "hfs: invalid btree flag\n");
+		goto fail_page;
+	}
+
 	size = tree->node_size;
 	if (!is_power_of_2(size))
 		goto fail_page;
 	if (!tree->node_count)
 		goto fail_page;
+
 	tree->node_size_shift = ffs(size) - 1;
 
 	tree->pages_per_bnode = (tree->node_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
@@ -87,10 +122,11 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
 	return tree;
 
  fail_page:
-	tree->inode->i_mapping->a_ops = &hfsplus_aops;
 	page_cache_release(page);
- free_tree:
+ free_inode:
+	tree->inode->i_mapping->a_ops = &hfsplus_aops;
 	iput(tree->inode);
+ free_tree:
 	kfree(tree);
 	return NULL;
 }
@@ -192,17 +228,18 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
 
 	while (!tree->free_nodes) {
 		struct inode *inode = tree->inode;
+		struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
 		u32 count;
 		int res;
 
 		res = hfsplus_file_extend(inode);
 		if (res)
 			return ERR_PTR(res);
-		HFSPLUS_I(inode).phys_size = inode->i_size =
-				(loff_t)HFSPLUS_I(inode).alloc_blocks <<
-				HFSPLUS_SB(tree->sb).alloc_blksz_shift;
-		HFSPLUS_I(inode).fs_blocks = HFSPLUS_I(inode).alloc_blocks <<
-					     HFSPLUS_SB(tree->sb).fs_shift;
+		hip->phys_size = inode->i_size =
+			(loff_t)hip->alloc_blocks <<
+				HFSPLUS_SB(tree->sb)->alloc_blksz_shift;
+		hip->fs_blocks =
+			hip->alloc_blocks << HFSPLUS_SB(tree->sb)->fs_shift;
 		inode_set_bytes(inode, inode->i_size);
 		count = inode->i_size >> tree->node_size_shift;
 		tree->free_nodes = count - tree->node_count;
diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c
index f6874acb2cf..8af45fc5b05 100644
--- a/fs/hfsplus/catalog.c
+++ b/fs/hfsplus/catalog.c
@@ -67,7 +67,7 @@ static void hfsplus_cat_build_key_uni(hfsplus_btree_key *key, u32 parent,
 	key->key_len = cpu_to_be16(6 + ustrlen);
 }
 
-static void hfsplus_set_perms(struct inode *inode, struct hfsplus_perm *perms)
+void hfsplus_cat_set_perms(struct inode *inode, struct hfsplus_perm *perms)
 {
 	if (inode->i_flags & S_IMMUTABLE)
 		perms->rootflags |= HFSPLUS_FLG_IMMUTABLE;
@@ -77,15 +77,24 @@ static void hfsplus_set_perms(struct inode *inode, struct hfsplus_perm *perms)
 		perms->rootflags |= HFSPLUS_FLG_APPEND;
 	else
 		perms->rootflags &= ~HFSPLUS_FLG_APPEND;
-	HFSPLUS_I(inode).rootflags = perms->rootflags;
-	HFSPLUS_I(inode).userflags = perms->userflags;
+
+	perms->userflags = HFSPLUS_I(inode)->userflags;
 	perms->mode = cpu_to_be16(inode->i_mode);
 	perms->owner = cpu_to_be32(inode->i_uid);
 	perms->group = cpu_to_be32(inode->i_gid);
+
+	if (S_ISREG(inode->i_mode))
+		perms->dev = cpu_to_be32(inode->i_nlink);
+	else if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode))
+		perms->dev = cpu_to_be32(inode->i_rdev);
+	else
+		perms->dev = 0;
 }
 
 static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct inode *inode)
 {
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
+
 	if (S_ISDIR(inode->i_mode)) {
 		struct hfsplus_cat_folder *folder;
 
@@ -93,13 +102,13 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct i
 		memset(folder, 0, sizeof(*folder));
 		folder->type = cpu_to_be16(HFSPLUS_FOLDER);
 		folder->id = cpu_to_be32(inode->i_ino);
-		HFSPLUS_I(inode).create_date =
+		HFSPLUS_I(inode)->create_date =
 			folder->create_date =
 			folder->content_mod_date =
 			folder->attribute_mod_date =
 			folder->access_date = hfsp_now2mt();
-		hfsplus_set_perms(inode, &folder->permissions);
-		if (inode == HFSPLUS_SB(inode->i_sb).hidden_dir)
+		hfsplus_cat_set_perms(inode, &folder->permissions);
+		if (inode == sbi->hidden_dir)
 			/* invisible and namelocked */
 			folder->user_info.frFlags = cpu_to_be16(0x5000);
 		return sizeof(*folder);
@@ -111,19 +120,19 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct i
 		file->type = cpu_to_be16(HFSPLUS_FILE);
 		file->flags = cpu_to_be16(HFSPLUS_FILE_THREAD_EXISTS);
 		file->id = cpu_to_be32(cnid);
-		HFSPLUS_I(inode).create_date =
+		HFSPLUS_I(inode)->create_date =
 			file->create_date =
 			file->content_mod_date =
 			file->attribute_mod_date =
 			file->access_date = hfsp_now2mt();
 		if (cnid == inode->i_ino) {
-			hfsplus_set_perms(inode, &file->permissions);
+			hfsplus_cat_set_perms(inode, &file->permissions);
 			if (S_ISLNK(inode->i_mode)) {
 				file->user_info.fdType = cpu_to_be32(HFSP_SYMLINK_TYPE);
 				file->user_info.fdCreator = cpu_to_be32(HFSP_SYMLINK_CREATOR);
 			} else {
-				file->user_info.fdType = cpu_to_be32(HFSPLUS_SB(inode->i_sb).type);
-				file->user_info.fdCreator = cpu_to_be32(HFSPLUS_SB(inode->i_sb).creator);
+				file->user_info.fdType = cpu_to_be32(sbi->type);
+				file->user_info.fdCreator = cpu_to_be32(sbi->creator);
 			}
 			if ((file->permissions.rootflags | file->permissions.userflags) & HFSPLUS_FLG_IMMUTABLE)
 				file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED);
@@ -131,8 +140,8 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct i
 			file->user_info.fdType = cpu_to_be32(HFSP_HARDLINK_TYPE);
 			file->user_info.fdCreator = cpu_to_be32(HFSP_HFSPLUS_CREATOR);
 			file->user_info.fdFlags = cpu_to_be16(0x100);
-			file->create_date = HFSPLUS_I(HFSPLUS_SB(inode->i_sb).hidden_dir).create_date;
-			file->permissions.dev = cpu_to_be32(HFSPLUS_I(inode).dev);
+			file->create_date = HFSPLUS_I(sbi->hidden_dir)->create_date;
+			file->permissions.dev = cpu_to_be32(HFSPLUS_I(inode)->linkid);
 		}
 		return sizeof(*file);
 	}
@@ -180,15 +189,14 @@ int hfsplus_find_cat(struct super_block *sb, u32 cnid,
 
 int hfsplus_create_cat(u32 cnid, struct inode *dir, struct qstr *str, struct inode *inode)
 {
+	struct super_block *sb = dir->i_sb;
 	struct hfs_find_data fd;
-	struct super_block *sb;
 	hfsplus_cat_entry entry;
 	int entry_size;
 	int err;
 
 	dprint(DBG_CAT_MOD, "create_cat: %s,%u(%d)\n", str->name, cnid, inode->i_nlink);
-	sb = dir->i_sb;
-	hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd);
+	hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
 
 	hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL);
 	entry_size = hfsplus_fill_cat_thread(sb, &entry, S_ISDIR(inode->i_mode) ?
@@ -234,7 +242,7 @@ err2:
 
 int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
 {
-	struct super_block *sb;
+	struct super_block *sb = dir->i_sb;
 	struct hfs_find_data fd;
 	struct hfsplus_fork_raw fork;
 	struct list_head *pos;
@@ -242,8 +250,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
 	u16 type;
 
 	dprint(DBG_CAT_MOD, "delete_cat: %s,%u\n", str ? str->name : NULL, cnid);
-	sb = dir->i_sb;
-	hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd);
+	hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
 
 	if (!str) {
 		int len;
@@ -279,7 +286,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
 		hfsplus_free_fork(sb, cnid, &fork, HFSPLUS_TYPE_RSRC);
 	}
 
-	list_for_each(pos, &HFSPLUS_I(dir).open_dir_list) {
+	list_for_each(pos, &HFSPLUS_I(dir)->open_dir_list) {
 		struct hfsplus_readdir_data *rd =
 			list_entry(pos, struct hfsplus_readdir_data, list);
 		if (fd.tree->keycmp(fd.search_key, (void *)&rd->key) < 0)
@@ -312,7 +319,7 @@ int hfsplus_rename_cat(u32 cnid,
 		       struct inode *src_dir, struct qstr *src_name,
 		       struct inode *dst_dir, struct qstr *dst_name)
 {
-	struct super_block *sb;
+	struct super_block *sb = src_dir->i_sb;
 	struct hfs_find_data src_fd, dst_fd;
 	hfsplus_cat_entry entry;
 	int entry_size, type;
@@ -320,8 +327,7 @@ int hfsplus_rename_cat(u32 cnid,
 
 	dprint(DBG_CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n", cnid, src_dir->i_ino, src_name->name,
 		dst_dir->i_ino, dst_name->name);
-	sb = src_dir->i_sb;
-	hfs_find_init(HFSPLUS_SB(sb).cat_tree, &src_fd);
+	hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &src_fd);
 	dst_fd = src_fd;
 
 	/* find the old dir entry and read the data */
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 764fd1bdca8..9d59c0571f5 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -39,7 +39,7 @@ static struct dentry *hfsplus_lookup(struct inode *dir, struct dentry *dentry,
 
 	dentry->d_op = &hfsplus_dentry_operations;
 	dentry->d_fsdata = NULL;
-	hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd);
+	hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
 	hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, &dentry->d_name);
 again:
 	err = hfs_brec_read(&fd, &entry, sizeof(entry));
@@ -68,9 +68,9 @@ again:
 		cnid = be32_to_cpu(entry.file.id);
 		if (entry.file.user_info.fdType == cpu_to_be32(HFSP_HARDLINK_TYPE) &&
 		    entry.file.user_info.fdCreator == cpu_to_be32(HFSP_HFSPLUS_CREATOR) &&
-		    (entry.file.create_date == HFSPLUS_I(HFSPLUS_SB(sb).hidden_dir).create_date ||
-		     entry.file.create_date == HFSPLUS_I(sb->s_root->d_inode).create_date) &&
-		    HFSPLUS_SB(sb).hidden_dir) {
+		    (entry.file.create_date == HFSPLUS_I(HFSPLUS_SB(sb)->hidden_dir)->create_date ||
+		     entry.file.create_date == HFSPLUS_I(sb->s_root->d_inode)->create_date) &&
+		    HFSPLUS_SB(sb)->hidden_dir) {
 			struct qstr str;
 			char name[32];
 
@@ -86,7 +86,8 @@ again:
 				linkid = be32_to_cpu(entry.file.permissions.dev);
 				str.len = sprintf(name, "iNode%d", linkid);
 				str.name = name;
-				hfsplus_cat_build_key(sb, fd.search_key, HFSPLUS_SB(sb).hidden_dir->i_ino, &str);
+				hfsplus_cat_build_key(sb, fd.search_key,
+					HFSPLUS_SB(sb)->hidden_dir->i_ino, &str);
 				goto again;
 			}
 		} else if (!dentry->d_fsdata)
@@ -101,7 +102,7 @@ again:
 	if (IS_ERR(inode))
 		return ERR_CAST(inode);
 	if (S_ISREG(inode->i_mode))
-		HFSPLUS_I(inode).dev = linkid;
+		HFSPLUS_I(inode)->linkid = linkid;
 out:
 	d_add(dentry, inode);
 	return NULL;
@@ -124,7 +125,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	if (filp->f_pos >= inode->i_size)
 		return 0;
 
-	hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd);
+	hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
 	hfsplus_cat_build_key(sb, fd.search_key, inode->i_ino, NULL);
 	err = hfs_brec_find(&fd);
 	if (err)
@@ -180,8 +181,9 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
 				err = -EIO;
 				goto out;
 			}
-			if (HFSPLUS_SB(sb).hidden_dir &&
-			    HFSPLUS_SB(sb).hidden_dir->i_ino == be32_to_cpu(entry.folder.id))
+			if (HFSPLUS_SB(sb)->hidden_dir &&
+			    HFSPLUS_SB(sb)->hidden_dir->i_ino ==
+					be32_to_cpu(entry.folder.id))
 				goto next;
 			if (filldir(dirent, strbuf, len, filp->f_pos,
 				    be32_to_cpu(entry.folder.id), DT_DIR))
@@ -217,7 +219,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		}
 		filp->private_data = rd;
 		rd->file = filp;
-		list_add(&rd->list, &HFSPLUS_I(inode).open_dir_list);
+		list_add(&rd->list, &HFSPLUS_I(inode)->open_dir_list);
 	}
 	memcpy(&rd->key, fd.key, sizeof(struct hfsplus_cat_key));
 out:
@@ -229,38 +231,18 @@ static int hfsplus_dir_release(struct inode *inode, struct file *file)
 {
 	struct hfsplus_readdir_data *rd = file->private_data;
 	if (rd) {
+		mutex_lock(&inode->i_mutex);
 		list_del(&rd->list);
+		mutex_unlock(&inode->i_mutex);
 		kfree(rd);
 	}
 	return 0;
 }
 
-static int hfsplus_create(struct inode *dir, struct dentry *dentry, int mode,
-			  struct nameidata *nd)
-{
-	struct inode *inode;
-	int res;
-
-	inode = hfsplus_new_inode(dir->i_sb, mode);
-	if (!inode)
-		return -ENOSPC;
-
-	res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode);
-	if (res) {
-		inode->i_nlink = 0;
-		hfsplus_delete_inode(inode);
-		iput(inode);
-		return res;
-	}
-	hfsplus_instantiate(dentry, inode, inode->i_ino);
-	mark_inode_dirty(inode);
-	return 0;
-}
-
 static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir,
 			struct dentry *dst_dentry)
 {
-	struct super_block *sb = dst_dir->i_sb;
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(dst_dir->i_sb);
 	struct inode *inode = src_dentry->d_inode;
 	struct inode *src_dir = src_dentry->d_parent->d_inode;
 	struct qstr str;
@@ -270,7 +252,10 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir,
 
 	if (HFSPLUS_IS_RSRC(inode))
 		return -EPERM;
+	if (!S_ISREG(inode->i_mode))
+		return -EPERM;
 
+	mutex_lock(&sbi->vh_mutex);
 	if (inode->i_ino == (u32)(unsigned long)src_dentry->d_fsdata) {
 		for (;;) {
 			get_random_bytes(&id, sizeof(cnid));
@@ -279,40 +264,41 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir,
 			str.len = sprintf(name, "iNode%d", id);
 			res = hfsplus_rename_cat(inode->i_ino,
 						 src_dir, &src_dentry->d_name,
-						 HFSPLUS_SB(sb).hidden_dir, &str);
+						 sbi->hidden_dir, &str);
 			if (!res)
 				break;
 			if (res != -EEXIST)
-				return res;
+				goto out;
 		}
-		HFSPLUS_I(inode).dev = id;
-		cnid = HFSPLUS_SB(sb).next_cnid++;
+		HFSPLUS_I(inode)->linkid = id;
+		cnid = sbi->next_cnid++;
 		src_dentry->d_fsdata = (void *)(unsigned long)cnid;
 		res = hfsplus_create_cat(cnid, src_dir, &src_dentry->d_name, inode);
 		if (res)
 			/* panic? */
-			return res;
-		HFSPLUS_SB(sb).file_count++;
+			goto out;
+		sbi->file_count++;
 	}
-	cnid = HFSPLUS_SB(sb).next_cnid++;
+	cnid = sbi->next_cnid++;
 	res = hfsplus_create_cat(cnid, dst_dir, &dst_dentry->d_name, inode);
 	if (res)
-		return res;
+		goto out;
 
 	inc_nlink(inode);
 	hfsplus_instantiate(dst_dentry, inode, cnid);
-	atomic_inc(&inode->i_count);
+	ihold(inode);
 	inode->i_ctime = CURRENT_TIME_SEC;
 	mark_inode_dirty(inode);
-	HFSPLUS_SB(sb).file_count++;
-	sb->s_dirt = 1;
-
-	return 0;
+	sbi->file_count++;
+	dst_dir->i_sb->s_dirt = 1;
+out:
+	mutex_unlock(&sbi->vh_mutex);
+	return res;
 }
 
 static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
 {
-	struct super_block *sb = dir->i_sb;
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
 	struct inode *inode = dentry->d_inode;
 	struct qstr str;
 	char name[32];
@@ -322,21 +308,24 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
 	if (HFSPLUS_IS_RSRC(inode))
 		return -EPERM;
 
+	mutex_lock(&sbi->vh_mutex);
 	cnid = (u32)(unsigned long)dentry->d_fsdata;
 	if (inode->i_ino == cnid &&
-	    atomic_read(&HFSPLUS_I(inode).opencnt)) {
+	    atomic_read(&HFSPLUS_I(inode)->opencnt)) {
 		str.name = name;
 		str.len = sprintf(name, "temp%lu", inode->i_ino);
 		res = hfsplus_rename_cat(inode->i_ino,
 					 dir, &dentry->d_name,
-					 HFSPLUS_SB(sb).hidden_dir, &str);
-		if (!res)
+					 sbi->hidden_dir, &str);
+		if (!res) {
 			inode->i_flags |= S_DEAD;
-		return res;
+			drop_nlink(inode);
+		}
+		goto out;
 	}
 	res = hfsplus_delete_cat(cnid, dir, &dentry->d_name);
 	if (res)
-		return res;
+		goto out;
 
 	if (inode->i_nlink > 0)
 		drop_nlink(inode);
@@ -344,10 +333,10 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
 		clear_nlink(inode);
 	if (!inode->i_nlink) {
 		if (inode->i_ino != cnid) {
-			HFSPLUS_SB(sb).file_count--;
-			if (!atomic_read(&HFSPLUS_I(inode).opencnt)) {
+			sbi->file_count--;
+			if (!atomic_read(&HFSPLUS_I(inode)->opencnt)) {
 				res = hfsplus_delete_cat(inode->i_ino,
-							 HFSPLUS_SB(sb).hidden_dir,
+							 sbi->hidden_dir,
 							 NULL);
 				if (!res)
 					hfsplus_delete_inode(inode);
@@ -356,107 +345,108 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
 		} else
 			hfsplus_delete_inode(inode);
 	} else
-		HFSPLUS_SB(sb).file_count--;
+		sbi->file_count--;
 	inode->i_ctime = CURRENT_TIME_SEC;
 	mark_inode_dirty(inode);
-
+out:
+	mutex_unlock(&sbi->vh_mutex);
 	return res;
 }
 
-static int hfsplus_mkdir(struct inode *dir, struct dentry *dentry, int mode)
-{
-	struct inode *inode;
-	int res;
-
-	inode = hfsplus_new_inode(dir->i_sb, S_IFDIR | mode);
-	if (!inode)
-		return -ENOSPC;
-
-	res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode);
-	if (res) {
-		inode->i_nlink = 0;
-		hfsplus_delete_inode(inode);
-		iput(inode);
-		return res;
-	}
-	hfsplus_instantiate(dentry, inode, inode->i_ino);
-	mark_inode_dirty(inode);
-	return 0;
-}
-
 static int hfsplus_rmdir(struct inode *dir, struct dentry *dentry)
 {
-	struct inode *inode;
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
+	struct inode *inode = dentry->d_inode;
 	int res;
 
-	inode = dentry->d_inode;
 	if (inode->i_size != 2)
 		return -ENOTEMPTY;
+
+	mutex_lock(&sbi->vh_mutex);
 	res = hfsplus_delete_cat(inode->i_ino, dir, &dentry->d_name);
 	if (res)
-		return res;
+		goto out;
 	clear_nlink(inode);
 	inode->i_ctime = CURRENT_TIME_SEC;
 	hfsplus_delete_inode(inode);
 	mark_inode_dirty(inode);
-	return 0;
+out:
+	mutex_unlock(&sbi->vh_mutex);
+	return res;
 }
 
 static int hfsplus_symlink(struct inode *dir, struct dentry *dentry,
 			   const char *symname)
 {
-	struct super_block *sb;
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
 	struct inode *inode;
-	int res;
+	int res = -ENOSPC;
 
-	sb = dir->i_sb;
-	inode = hfsplus_new_inode(sb, S_IFLNK | S_IRWXUGO);
+	mutex_lock(&sbi->vh_mutex);
+	inode = hfsplus_new_inode(dir->i_sb, S_IFLNK | S_IRWXUGO);
 	if (!inode)
-		return -ENOSPC;
+		goto out;
 
 	res = page_symlink(inode, symname, strlen(symname) + 1);
-	if (res) {
-		inode->i_nlink = 0;
-		hfsplus_delete_inode(inode);
-		iput(inode);
-		return res;
-	}
+	if (res)
+		goto out_err;
 
-	mark_inode_dirty(inode);
 	res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode);
+	if (res)
+		goto out_err;
 
-	if (!res) {
-		hfsplus_instantiate(dentry, inode, inode->i_ino);
-		mark_inode_dirty(inode);
-	}
+	hfsplus_instantiate(dentry, inode, inode->i_ino);
+	mark_inode_dirty(inode);
+	goto out;
 
+out_err:
+	inode->i_nlink = 0;
+	hfsplus_delete_inode(inode);
+	iput(inode);
+out:
+	mutex_unlock(&sbi->vh_mutex);
 	return res;
 }
 
 static int hfsplus_mknod(struct inode *dir, struct dentry *dentry,
 			 int mode, dev_t rdev)
 {
-	struct super_block *sb;
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
 	struct inode *inode;
-	int res;
+	int res = -ENOSPC;
 
-	sb = dir->i_sb;
-	inode = hfsplus_new_inode(sb, mode);
+	mutex_lock(&sbi->vh_mutex);
+	inode = hfsplus_new_inode(dir->i_sb, mode);
 	if (!inode)
-		return -ENOSPC;
+		goto out;
+
+	if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISFIFO(mode) || S_ISSOCK(mode))
+		init_special_inode(inode, mode, rdev);
 
 	res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode);
 	if (res) {
 		inode->i_nlink = 0;
 		hfsplus_delete_inode(inode);
 		iput(inode);
-		return res;
+		goto out;
 	}
-	init_special_inode(inode, mode, rdev);
+
 	hfsplus_instantiate(dentry, inode, inode->i_ino);
 	mark_inode_dirty(inode);
+out:
+	mutex_unlock(&sbi->vh_mutex);
+	return res;
+}
 
-	return 0;
+static int hfsplus_create(struct inode *dir, struct dentry *dentry, int mode,
+			  struct nameidata *nd)
+{
+	return hfsplus_mknod(dir, dentry, mode, 0);
+}
+
+static int hfsplus_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+	return hfsplus_mknod(dir, dentry, mode | S_IFDIR, 0);
 }
 
 static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry,
@@ -466,7 +456,10 @@ static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry,
 
 	/* Unlink destination if it already exists */
 	if (new_dentry->d_inode) {
-		res = hfsplus_unlink(new_dir, new_dentry);
+		if (S_ISDIR(new_dentry->d_inode->i_mode))
+			res = hfsplus_rmdir(new_dir, new_dentry);
+		else
+			res = hfsplus_unlink(new_dir, new_dentry);
 		if (res)
 			return res;
 	}
diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index 0022eec63cd..0c9cb1820a5 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -85,35 +85,49 @@ static u32 hfsplus_ext_lastblock(struct hfsplus_extent *ext)
 
 static void __hfsplus_ext_write_extent(struct inode *inode, struct hfs_find_data *fd)
 {
+	struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
 	int res;
 
-	hfsplus_ext_build_key(fd->search_key, inode->i_ino, HFSPLUS_I(inode).cached_start,
-			      HFSPLUS_IS_RSRC(inode) ?  HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA);
+	WARN_ON(!mutex_is_locked(&hip->extents_lock));
+
+	hfsplus_ext_build_key(fd->search_key, inode->i_ino, hip->cached_start,
+			      HFSPLUS_IS_RSRC(inode) ?
+				HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA);
+
 	res = hfs_brec_find(fd);
-	if (HFSPLUS_I(inode).flags & HFSPLUS_FLG_EXT_NEW) {
+	if (hip->flags & HFSPLUS_FLG_EXT_NEW) {
 		if (res != -ENOENT)
 			return;
-		hfs_brec_insert(fd, HFSPLUS_I(inode).cached_extents, sizeof(hfsplus_extent_rec));
-		HFSPLUS_I(inode).flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW);
+		hfs_brec_insert(fd, hip->cached_extents,
+				sizeof(hfsplus_extent_rec));
+		hip->flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW);
 	} else {
 		if (res)
 			return;
-		hfs_bnode_write(fd->bnode, HFSPLUS_I(inode).cached_extents, fd->entryoffset, fd->entrylength);
-		HFSPLUS_I(inode).flags &= ~HFSPLUS_FLG_EXT_DIRTY;
+		hfs_bnode_write(fd->bnode, hip->cached_extents,
+				fd->entryoffset, fd->entrylength);
+		hip->flags &= ~HFSPLUS_FLG_EXT_DIRTY;
 	}
 }
 
-void hfsplus_ext_write_extent(struct inode *inode)
+static void hfsplus_ext_write_extent_locked(struct inode *inode)
 {
-	if (HFSPLUS_I(inode).flags & HFSPLUS_FLG_EXT_DIRTY) {
+	if (HFSPLUS_I(inode)->flags & HFSPLUS_FLG_EXT_DIRTY) {
 		struct hfs_find_data fd;
 
-		hfs_find_init(HFSPLUS_SB(inode->i_sb).ext_tree, &fd);
+		hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd);
 		__hfsplus_ext_write_extent(inode, &fd);
 		hfs_find_exit(&fd);
 	}
 }
 
+void hfsplus_ext_write_extent(struct inode *inode)
+{
+	mutex_lock(&HFSPLUS_I(inode)->extents_lock);
+	hfsplus_ext_write_extent_locked(inode);
+	mutex_unlock(&HFSPLUS_I(inode)->extents_lock);
+}
+
 static inline int __hfsplus_ext_read_extent(struct hfs_find_data *fd,
 					    struct hfsplus_extent *extent,
 					    u32 cnid, u32 block, u8 type)
@@ -136,33 +150,39 @@ static inline int __hfsplus_ext_read_extent(struct hfs_find_data *fd,
 
 static inline int __hfsplus_ext_cache_extent(struct hfs_find_data *fd, struct inode *inode, u32 block)
 {
+	struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
 	int res;
 
-	if (HFSPLUS_I(inode).flags & HFSPLUS_FLG_EXT_DIRTY)
+	WARN_ON(!mutex_is_locked(&hip->extents_lock));
+
+	if (hip->flags & HFSPLUS_FLG_EXT_DIRTY)
 		__hfsplus_ext_write_extent(inode, fd);
 
-	res = __hfsplus_ext_read_extent(fd, HFSPLUS_I(inode).cached_extents, inode->i_ino,
-					block, HFSPLUS_IS_RSRC(inode) ? HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA);
+	res = __hfsplus_ext_read_extent(fd, hip->cached_extents, inode->i_ino,
+					block, HFSPLUS_IS_RSRC(inode) ?
+						HFSPLUS_TYPE_RSRC :
+						HFSPLUS_TYPE_DATA);
 	if (!res) {
-		HFSPLUS_I(inode).cached_start = be32_to_cpu(fd->key->ext.start_block);
-		HFSPLUS_I(inode).cached_blocks = hfsplus_ext_block_count(HFSPLUS_I(inode).cached_extents);
+		hip->cached_start = be32_to_cpu(fd->key->ext.start_block);
+		hip->cached_blocks = hfsplus_ext_block_count(hip->cached_extents);
 	} else {
-		HFSPLUS_I(inode).cached_start = HFSPLUS_I(inode).cached_blocks = 0;
-		HFSPLUS_I(inode).flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW);
+		hip->cached_start = hip->cached_blocks = 0;
+		hip->flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW);
 	}
 	return res;
 }
 
 static int hfsplus_ext_read_extent(struct inode *inode, u32 block)
 {
+	struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
 	struct hfs_find_data fd;
 	int res;
 
-	if (block >= HFSPLUS_I(inode).cached_start &&
-	    block < HFSPLUS_I(inode).cached_start + HFSPLUS_I(inode).cached_blocks)
+	if (block >= hip->cached_start &&
+	    block < hip->cached_start + hip->cached_blocks)
 		return 0;
 
-	hfs_find_init(HFSPLUS_SB(inode->i_sb).ext_tree, &fd);
+	hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd);
 	res = __hfsplus_ext_cache_extent(&fd, inode, block);
 	hfs_find_exit(&fd);
 	return res;
@@ -172,21 +192,21 @@ static int hfsplus_ext_read_extent(struct inode *inode, u32 block)
 int hfsplus_get_block(struct inode *inode, sector_t iblock,
 		      struct buffer_head *bh_result, int create)
 {
-	struct super_block *sb;
+	struct super_block *sb = inode->i_sb;
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+	struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
 	int res = -EIO;
 	u32 ablock, dblock, mask;
 	int shift;
 
-	sb = inode->i_sb;
-
 	/* Convert inode block to disk allocation block */
-	shift = HFSPLUS_SB(sb).alloc_blksz_shift - sb->s_blocksize_bits;
-	ablock = iblock >> HFSPLUS_SB(sb).fs_shift;
+	shift = sbi->alloc_blksz_shift - sb->s_blocksize_bits;
+	ablock = iblock >> sbi->fs_shift;
 
-	if (iblock >= HFSPLUS_I(inode).fs_blocks) {
-		if (iblock > HFSPLUS_I(inode).fs_blocks || !create)
+	if (iblock >= hip->fs_blocks) {
+		if (iblock > hip->fs_blocks || !create)
 			return -EIO;
-		if (ablock >= HFSPLUS_I(inode).alloc_blocks) {
+		if (ablock >= hip->alloc_blocks) {
 			res = hfsplus_file_extend(inode);
 			if (res)
 				return res;
@@ -194,33 +214,33 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock,
 	} else
 		create = 0;
 
-	if (ablock < HFSPLUS_I(inode).first_blocks) {
-		dblock = hfsplus_ext_find_block(HFSPLUS_I(inode).first_extents, ablock);
+	if (ablock < hip->first_blocks) {
+		dblock = hfsplus_ext_find_block(hip->first_extents, ablock);
 		goto done;
 	}
 
 	if (inode->i_ino == HFSPLUS_EXT_CNID)
 		return -EIO;
 
-	mutex_lock(&HFSPLUS_I(inode).extents_lock);
+	mutex_lock(&hip->extents_lock);
 	res = hfsplus_ext_read_extent(inode, ablock);
 	if (!res) {
-		dblock = hfsplus_ext_find_block(HFSPLUS_I(inode).cached_extents, ablock -
-					     HFSPLUS_I(inode).cached_start);
+		dblock = hfsplus_ext_find_block(hip->cached_extents,
+						ablock - hip->cached_start);
 	} else {
-		mutex_unlock(&HFSPLUS_I(inode).extents_lock);
+		mutex_unlock(&hip->extents_lock);
 		return -EIO;
 	}
-	mutex_unlock(&HFSPLUS_I(inode).extents_lock);
+	mutex_unlock(&hip->extents_lock);
 
 done:
 	dprint(DBG_EXTENT, "get_block(%lu): %llu - %u\n", inode->i_ino, (long long)iblock, dblock);
-	mask = (1 << HFSPLUS_SB(sb).fs_shift) - 1;
-	map_bh(bh_result, sb, (dblock << HFSPLUS_SB(sb).fs_shift) + HFSPLUS_SB(sb).blockoffset + (iblock & mask));
+	mask = (1 << sbi->fs_shift) - 1;
+	map_bh(bh_result, sb, (dblock << sbi->fs_shift) + sbi->blockoffset + (iblock & mask));
 	if (create) {
 		set_buffer_new(bh_result);
-		HFSPLUS_I(inode).phys_size += sb->s_blocksize;
-		HFSPLUS_I(inode).fs_blocks++;
+		hip->phys_size += sb->s_blocksize;
+		hip->fs_blocks++;
 		inode_add_bytes(inode, sb->s_blocksize);
 		mark_inode_dirty(inode);
 	}
@@ -327,7 +347,7 @@ int hfsplus_free_fork(struct super_block *sb, u32 cnid, struct hfsplus_fork_raw
 	if (total_blocks == blocks)
 		return 0;
 
-	hfs_find_init(HFSPLUS_SB(sb).ext_tree, &fd);
+	hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd);
 	do {
 		res = __hfsplus_ext_read_extent(&fd, ext_entry, cnid,
 						total_blocks, type);
@@ -348,29 +368,33 @@ int hfsplus_free_fork(struct super_block *sb, u32 cnid, struct hfsplus_fork_raw
 int hfsplus_file_extend(struct inode *inode)
 {
 	struct super_block *sb = inode->i_sb;
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+	struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
 	u32 start, len, goal;
 	int res;
 
-	if (HFSPLUS_SB(sb).alloc_file->i_size * 8 < HFSPLUS_SB(sb).total_blocks - HFSPLUS_SB(sb).free_blocks + 8) {
+	if (sbi->alloc_file->i_size * 8 <
+	    sbi->total_blocks - sbi->free_blocks + 8) {
 		// extend alloc file
-		printk(KERN_ERR "hfs: extend alloc file! (%Lu,%u,%u)\n", HFSPLUS_SB(sb).alloc_file->i_size * 8,
-			HFSPLUS_SB(sb).total_blocks, HFSPLUS_SB(sb).free_blocks);
+		printk(KERN_ERR "hfs: extend alloc file! (%Lu,%u,%u)\n",
+				sbi->alloc_file->i_size * 8,
+				sbi->total_blocks, sbi->free_blocks);
 		return -ENOSPC;
 	}
 
-	mutex_lock(&HFSPLUS_I(inode).extents_lock);
-	if (HFSPLUS_I(inode).alloc_blocks == HFSPLUS_I(inode).first_blocks)
-		goal = hfsplus_ext_lastblock(HFSPLUS_I(inode).first_extents);
+	mutex_lock(&hip->extents_lock);
+	if (hip->alloc_blocks == hip->first_blocks)
+		goal = hfsplus_ext_lastblock(hip->first_extents);
 	else {
-		res = hfsplus_ext_read_extent(inode, HFSPLUS_I(inode).alloc_blocks);
+		res = hfsplus_ext_read_extent(inode, hip->alloc_blocks);
 		if (res)
 			goto out;
-		goal = hfsplus_ext_lastblock(HFSPLUS_I(inode).cached_extents);
+		goal = hfsplus_ext_lastblock(hip->cached_extents);
 	}
 
-	len = HFSPLUS_I(inode).clump_blocks;
-	start = hfsplus_block_allocate(sb, HFSPLUS_SB(sb).total_blocks, goal, &len);
-	if (start >= HFSPLUS_SB(sb).total_blocks) {
+	len = hip->clump_blocks;
+	start = hfsplus_block_allocate(sb, sbi->total_blocks, goal, &len);
+	if (start >= sbi->total_blocks) {
 		start = hfsplus_block_allocate(sb, goal, 0, &len);
 		if (start >= goal) {
 			res = -ENOSPC;
@@ -379,56 +403,56 @@ int hfsplus_file_extend(struct inode *inode)
 	}
 
 	dprint(DBG_EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len);
-	if (HFSPLUS_I(inode).alloc_blocks <= HFSPLUS_I(inode).first_blocks) {
-		if (!HFSPLUS_I(inode).first_blocks) {
+
+	if (hip->alloc_blocks <= hip->first_blocks) {
+		if (!hip->first_blocks) {
 			dprint(DBG_EXTENT, "first extents\n");
 			/* no extents yet */
-			HFSPLUS_I(inode).first_extents[0].start_block = cpu_to_be32(start);
-			HFSPLUS_I(inode).first_extents[0].block_count = cpu_to_be32(len);
+			hip->first_extents[0].start_block = cpu_to_be32(start);
+			hip->first_extents[0].block_count = cpu_to_be32(len);
 			res = 0;
 		} else {
 			/* try to append to extents in inode */
-			res = hfsplus_add_extent(HFSPLUS_I(inode).first_extents,
-						 HFSPLUS_I(inode).alloc_blocks,
+			res = hfsplus_add_extent(hip->first_extents,
+						 hip->alloc_blocks,
 						 start, len);
 			if (res == -ENOSPC)
 				goto insert_extent;
 		}
 		if (!res) {
-			hfsplus_dump_extent(HFSPLUS_I(inode).first_extents);
-			HFSPLUS_I(inode).first_blocks += len;
+			hfsplus_dump_extent(hip->first_extents);
+			hip->first_blocks += len;
 		}
 	} else {
-		res = hfsplus_add_extent(HFSPLUS_I(inode).cached_extents,
-					 HFSPLUS_I(inode).alloc_blocks -
-					 HFSPLUS_I(inode).cached_start,
+		res = hfsplus_add_extent(hip->cached_extents,
+					 hip->alloc_blocks - hip->cached_start,
 					 start, len);
 		if (!res) {
-			hfsplus_dump_extent(HFSPLUS_I(inode).cached_extents);
-			HFSPLUS_I(inode).flags |= HFSPLUS_FLG_EXT_DIRTY;
-			HFSPLUS_I(inode).cached_blocks += len;
+			hfsplus_dump_extent(hip->cached_extents);
+			hip->flags |= HFSPLUS_FLG_EXT_DIRTY;
+			hip->cached_blocks += len;
 		} else if (res == -ENOSPC)
 			goto insert_extent;
 	}
 out:
-	mutex_unlock(&HFSPLUS_I(inode).extents_lock);
+	mutex_unlock(&hip->extents_lock);
 	if (!res) {
-		HFSPLUS_I(inode).alloc_blocks += len;
+		hip->alloc_blocks += len;
 		mark_inode_dirty(inode);
 	}
 	return res;
 
 insert_extent:
 	dprint(DBG_EXTENT, "insert new extent\n");
-	hfsplus_ext_write_extent(inode);
+	hfsplus_ext_write_extent_locked(inode);
 
-	memset(HFSPLUS_I(inode).cached_extents, 0, sizeof(hfsplus_extent_rec));
-	HFSPLUS_I(inode).cached_extents[0].start_block = cpu_to_be32(start);
-	HFSPLUS_I(inode).cached_extents[0].block_count = cpu_to_be32(len);
-	hfsplus_dump_extent(HFSPLUS_I(inode).cached_extents);
-	HFSPLUS_I(inode).flags |= HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW;
-	HFSPLUS_I(inode).cached_start = HFSPLUS_I(inode).alloc_blocks;
-	HFSPLUS_I(inode).cached_blocks = len;
+	memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec));
+	hip->cached_extents[0].start_block = cpu_to_be32(start);
+	hip->cached_extents[0].block_count = cpu_to_be32(len);
+	hfsplus_dump_extent(hip->cached_extents);
+	hip->flags |= HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW;
+	hip->cached_start = hip->alloc_blocks;
+	hip->cached_blocks = len;
 
 	res = 0;
 	goto out;
@@ -437,13 +461,15 @@ insert_extent:
 void hfsplus_file_truncate(struct inode *inode)
 {
 	struct super_block *sb = inode->i_sb;
+	struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
 	struct hfs_find_data fd;
 	u32 alloc_cnt, blk_cnt, start;
 	int res;
 
-	dprint(DBG_INODE, "truncate: %lu, %Lu -> %Lu\n", inode->i_ino,
-	       (long long)HFSPLUS_I(inode).phys_size, inode->i_size);
-	if (inode->i_size > HFSPLUS_I(inode).phys_size) {
+	dprint(DBG_INODE, "truncate: %lu, %Lu -> %Lu\n",
+		inode->i_ino, (long long)hip->phys_size, inode->i_size);
+
+	if (inode->i_size > hip->phys_size) {
 		struct address_space *mapping = inode->i_mapping;
 		struct page *page;
 		void *fsdata;
@@ -460,47 +486,48 @@ void hfsplus_file_truncate(struct inode *inode)
 			return;
 		mark_inode_dirty(inode);
 		return;
-	} else if (inode->i_size == HFSPLUS_I(inode).phys_size)
+	} else if (inode->i_size == hip->phys_size)
 		return;
 
-	blk_cnt = (inode->i_size + HFSPLUS_SB(sb).alloc_blksz - 1) >> HFSPLUS_SB(sb).alloc_blksz_shift;
-	alloc_cnt = HFSPLUS_I(inode).alloc_blocks;
+	blk_cnt = (inode->i_size + HFSPLUS_SB(sb)->alloc_blksz - 1) >>
+			HFSPLUS_SB(sb)->alloc_blksz_shift;
+	alloc_cnt = hip->alloc_blocks;
 	if (blk_cnt == alloc_cnt)
 		goto out;
 
-	mutex_lock(&HFSPLUS_I(inode).extents_lock);
-	hfs_find_init(HFSPLUS_SB(sb).ext_tree, &fd);
+	mutex_lock(&hip->extents_lock);
+	hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd);
 	while (1) {
-		if (alloc_cnt == HFSPLUS_I(inode).first_blocks) {
-			hfsplus_free_extents(sb, HFSPLUS_I(inode).first_extents,
+		if (alloc_cnt == hip->first_blocks) {
+			hfsplus_free_extents(sb, hip->first_extents,
 					     alloc_cnt, alloc_cnt - blk_cnt);
-			hfsplus_dump_extent(HFSPLUS_I(inode).first_extents);
-			HFSPLUS_I(inode).first_blocks = blk_cnt;
+			hfsplus_dump_extent(hip->first_extents);
+			hip->first_blocks = blk_cnt;
 			break;
 		}
 		res = __hfsplus_ext_cache_extent(&fd, inode, alloc_cnt);
 		if (res)
 			break;
-		start = HFSPLUS_I(inode).cached_start;
-		hfsplus_free_extents(sb, HFSPLUS_I(inode).cached_extents,
+		start = hip->cached_start;
+		hfsplus_free_extents(sb, hip->cached_extents,
 				     alloc_cnt - start, alloc_cnt - blk_cnt);
-		hfsplus_dump_extent(HFSPLUS_I(inode).cached_extents);
+		hfsplus_dump_extent(hip->cached_extents);
 		if (blk_cnt > start) {
-			HFSPLUS_I(inode).flags |= HFSPLUS_FLG_EXT_DIRTY;
+			hip->flags |= HFSPLUS_FLG_EXT_DIRTY;
 			break;
 		}
 		alloc_cnt = start;
-		HFSPLUS_I(inode).cached_start = HFSPLUS_I(inode).cached_blocks = 0;
-		HFSPLUS_I(inode).flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW);
+		hip->cached_start = hip->cached_blocks = 0;
+		hip->flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW);
 		hfs_brec_remove(&fd);
 	}
 	hfs_find_exit(&fd);
-	mutex_unlock(&HFSPLUS_I(inode).extents_lock);
+	mutex_unlock(&hip->extents_lock);
 
-	HFSPLUS_I(inode).alloc_blocks = blk_cnt;
+	hip->alloc_blocks = blk_cnt;
 out:
-	HFSPLUS_I(inode).phys_size = inode->i_size;
-	HFSPLUS_I(inode).fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
-	inode_set_bytes(inode, HFSPLUS_I(inode).fs_blocks << sb->s_blocksize_bits);
+	hip->phys_size = inode->i_size;
+	hip->fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
+	inode_set_bytes(inode, hip->fs_blocks << sb->s_blocksize_bits);
 	mark_inode_dirty(inode);
 }
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 6505c30ad96..cb3653efb57 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -62,7 +62,7 @@ struct hfs_btree {
 	unsigned int depth;
 
 	//unsigned int map1_size, map_size;
-	struct semaphore tree_lock;
+	struct mutex tree_lock;
 
 	unsigned int pages_per_bnode;
 	spinlock_t hash_lock;
@@ -121,16 +121,21 @@ struct hfsplus_sb_info {
 	u32 sect_count;
 	int fs_shift;
 
-	/* Stuff in host order from Vol Header */
+	/* immutable data from the volume header */
 	u32 alloc_blksz;
 	int alloc_blksz_shift;
 	u32 total_blocks;
+	u32 data_clump_blocks, rsrc_clump_blocks;
+
+	/* mutable data from the volume header, protected by alloc_mutex */
 	u32 free_blocks;
-	u32 next_alloc;
+	struct mutex alloc_mutex;
+
+	/* mutable data from the volume header, protected by vh_mutex */
 	u32 next_cnid;
 	u32 file_count;
 	u32 folder_count;
-	u32 data_clump_blocks, rsrc_clump_blocks;
+	struct mutex vh_mutex;
 
 	/* Config options */
 	u32 creator;
@@ -143,40 +148,50 @@ struct hfsplus_sb_info {
 	int part, session;
 
 	unsigned long flags;
-
-	struct hlist_head rsrc_inodes;
 };
 
-#define HFSPLUS_SB_WRITEBACKUP	0x0001
-#define HFSPLUS_SB_NODECOMPOSE	0x0002
-#define HFSPLUS_SB_FORCE	0x0004
-#define HFSPLUS_SB_HFSX		0x0008
-#define HFSPLUS_SB_CASEFOLD	0x0010
+#define HFSPLUS_SB_WRITEBACKUP	0
+#define HFSPLUS_SB_NODECOMPOSE	1
+#define HFSPLUS_SB_FORCE	2
+#define HFSPLUS_SB_HFSX		3
+#define HFSPLUS_SB_CASEFOLD	4
 
 
 struct hfsplus_inode_info {
-	struct mutex extents_lock;
-	u32 clump_blocks, alloc_blocks;
-	sector_t fs_blocks;
-	/* Allocation extents from catalog record or volume header */
-	hfsplus_extent_rec first_extents;
-	u32 first_blocks;
-	hfsplus_extent_rec cached_extents;
-	u32 cached_start, cached_blocks;
 	atomic_t opencnt;
 
-	struct inode *rsrc_inode;
+	/*
+	 * Extent allocation information, protected by extents_lock.
+	 */
+	u32 first_blocks;
+	u32 clump_blocks;
+	u32 alloc_blocks;
+	u32 cached_start;
+	u32 cached_blocks;
+	hfsplus_extent_rec first_extents;
+	hfsplus_extent_rec cached_extents;
 	unsigned long flags;
+	struct mutex extents_lock;
 
+	/*
+	 * Immutable data.
+	 */
+	struct inode *rsrc_inode;
 	__be32 create_date;
-	/* Device number in hfsplus_permissions in catalog */
-	u32 dev;
-	/* BSD system and user file flags */
-	u8 rootflags;
-	u8 userflags;
 
+	/*
+	 * Protected by sbi->vh_mutex.
+	 */
+	u32 linkid;
+
+	/*
+	 * Protected by i_mutex.
+	 */
+	sector_t fs_blocks;
+	u8 userflags;		/* BSD user file flags */
 	struct list_head open_dir_list;
 	loff_t phys_size;
+
 	struct inode vfs_inode;
 };
 
@@ -184,8 +199,8 @@ struct hfsplus_inode_info {
 #define HFSPLUS_FLG_EXT_DIRTY	0x0002
 #define HFSPLUS_FLG_EXT_NEW	0x0004
 
-#define HFSPLUS_IS_DATA(inode)   (!(HFSPLUS_I(inode).flags & HFSPLUS_FLG_RSRC))
-#define HFSPLUS_IS_RSRC(inode)   (HFSPLUS_I(inode).flags & HFSPLUS_FLG_RSRC)
+#define HFSPLUS_IS_DATA(inode)   (!(HFSPLUS_I(inode)->flags & HFSPLUS_FLG_RSRC))
+#define HFSPLUS_IS_RSRC(inode)   (HFSPLUS_I(inode)->flags & HFSPLUS_FLG_RSRC)
 
 struct hfs_find_data {
 	/* filled by caller */
@@ -311,6 +326,7 @@ int hfsplus_create_cat(u32, struct inode *, struct qstr *, struct inode *);
 int hfsplus_delete_cat(u32, struct inode *, struct qstr *);
 int hfsplus_rename_cat(u32, struct inode *, struct qstr *,
 		       struct inode *, struct qstr *);
+void hfsplus_cat_set_perms(struct inode *inode, struct hfsplus_perm *perms);
 
 /* dir.c */
 extern const struct inode_operations hfsplus_dir_inode_operations;
@@ -351,6 +367,7 @@ int hfsplus_show_options(struct seq_file *, struct vfsmount *);
 
 /* super.c */
 struct inode *hfsplus_iget(struct super_block *, unsigned long);
+int hfsplus_sync_fs(struct super_block *sb, int wait);
 
 /* tables.c */
 extern u16 hfsplus_case_fold_table[];
@@ -371,26 +388,15 @@ int hfsplus_read_wrapper(struct super_block *);
 int hfs_part_find(struct super_block *, sector_t *, sector_t *);
 
 /* access macros */
-/*
 static inline struct hfsplus_sb_info *HFSPLUS_SB(struct super_block *sb)
 {
 	return sb->s_fs_info;
 }
+
 static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode)
 {
 	return list_entry(inode, struct hfsplus_inode_info, vfs_inode);
 }
-*/
-#define HFSPLUS_SB(super)	(*(struct hfsplus_sb_info *)(super)->s_fs_info)
-#define HFSPLUS_I(inode)	(*list_entry(inode, struct hfsplus_inode_info, vfs_inode))
-
-#if 1
-#define hfsplus_kmap(p)		({ struct page *__p = (p); kmap(__p); })
-#define hfsplus_kunmap(p)	({ struct page *__p = (p); kunmap(__p); __p; })
-#else
-#define hfsplus_kmap(p)		kmap(p)
-#define hfsplus_kunmap(p)	kunmap(p)
-#endif
 
 #define sb_bread512(sb, sec, data) ({			\
 	struct buffer_head *__bh;			\
@@ -418,6 +424,4 @@ static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode)
 #define hfsp_ut2mt(t)		__hfsp_ut2mt((t).tv_sec)
 #define hfsp_now2mt()		__hfsp_ut2mt(get_seconds())
 
-#define kdev_t_to_nr(x)		(x)
-
 #endif
diff --git a/fs/hfsplus/hfsplus_raw.h b/fs/hfsplus/hfsplus_raw.h
index fe99fe8db61..6892899fd6f 100644
--- a/fs/hfsplus/hfsplus_raw.h
+++ b/fs/hfsplus/hfsplus_raw.h
@@ -200,6 +200,7 @@ struct hfsplus_cat_key {
 	struct hfsplus_unistr name;
 } __packed;
 
+#define HFSPLUS_CAT_KEYLEN	(sizeof(struct hfsplus_cat_key))
 
 /* Structs from hfs.h */
 struct hfsp_point {
@@ -323,7 +324,7 @@ struct hfsplus_ext_key {
 	__be32 start_block;
 } __packed;
 
-#define HFSPLUS_EXT_KEYLEN 12
+#define HFSPLUS_EXT_KEYLEN	sizeof(struct hfsplus_ext_key)
 
 /* HFS+ generic BTree key */
 typedef union {
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 9bbb82924a2..8afd7e84f98 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -31,10 +31,19 @@ static int hfsplus_write_begin(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
 			struct page **pagep, void **fsdata)
 {
+	int ret;
+
 	*pagep = NULL;
-	return cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+	ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
 				hfsplus_get_block,
-				&HFSPLUS_I(mapping->host).phys_size);
+				&HFSPLUS_I(mapping->host)->phys_size);
+	if (unlikely(ret)) {
+		loff_t isize = mapping->host->i_size;
+		if (pos + len > isize)
+			vmtruncate(mapping->host, isize);
+	}
+
+	return ret;
 }
 
 static sector_t hfsplus_bmap(struct address_space *mapping, sector_t block)
@@ -53,13 +62,13 @@ static int hfsplus_releasepage(struct page *page, gfp_t mask)
 
 	switch (inode->i_ino) {
 	case HFSPLUS_EXT_CNID:
-		tree = HFSPLUS_SB(sb).ext_tree;
+		tree = HFSPLUS_SB(sb)->ext_tree;
 		break;
 	case HFSPLUS_CAT_CNID:
-		tree = HFSPLUS_SB(sb).cat_tree;
+		tree = HFSPLUS_SB(sb)->cat_tree;
 		break;
 	case HFSPLUS_ATTR_CNID:
-		tree = HFSPLUS_SB(sb).attr_tree;
+		tree = HFSPLUS_SB(sb)->attr_tree;
 		break;
 	default:
 		BUG();
@@ -105,9 +114,24 @@ static ssize_t hfsplus_direct_IO(int rw, struct kiocb *iocb,
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host;
+	ssize_t ret;
 
-	return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
+	ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
 				  offset, nr_segs, hfsplus_get_block, NULL);
+
+	/*
+	 * In case of error extending write may have instantiated a few
+	 * blocks outside i_size. Trim these off again.
+	 */
+	if (unlikely((rw & WRITE) && ret < 0)) {
+		loff_t isize = i_size_read(inode);
+		loff_t end = offset + iov_length(iov, nr_segs);
+
+		if (end > isize)
+			vmtruncate(inode, isize);
+	}
+
+	return ret;
 }
 
 static int hfsplus_writepages(struct address_space *mapping,
@@ -148,12 +172,13 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent
 	struct hfs_find_data fd;
 	struct super_block *sb = dir->i_sb;
 	struct inode *inode = NULL;
+	struct hfsplus_inode_info *hip;
 	int err;
 
 	if (HFSPLUS_IS_RSRC(dir) || strcmp(dentry->d_name.name, "rsrc"))
 		goto out;
 
-	inode = HFSPLUS_I(dir).rsrc_inode;
+	inode = HFSPLUS_I(dir)->rsrc_inode;
 	if (inode)
 		goto out;
 
@@ -161,12 +186,13 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent
 	if (!inode)
 		return ERR_PTR(-ENOMEM);
 
+	hip = HFSPLUS_I(inode);
 	inode->i_ino = dir->i_ino;
-	INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list);
-	mutex_init(&HFSPLUS_I(inode).extents_lock);
-	HFSPLUS_I(inode).flags = HFSPLUS_FLG_RSRC;
+	INIT_LIST_HEAD(&hip->open_dir_list);
+	mutex_init(&hip->extents_lock);
+	hip->flags = HFSPLUS_FLG_RSRC;
 
-	hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd);
+	hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
 	err = hfsplus_find_cat(sb, dir->i_ino, &fd);
 	if (!err)
 		err = hfsplus_cat_read_inode(inode, &fd);
@@ -175,10 +201,18 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent
 		iput(inode);
 		return ERR_PTR(err);
 	}
-	HFSPLUS_I(inode).rsrc_inode = dir;
-	HFSPLUS_I(dir).rsrc_inode = inode;
+	hip->rsrc_inode = dir;
+	HFSPLUS_I(dir)->rsrc_inode = inode;
 	igrab(dir);
-	hlist_add_head(&inode->i_hash, &HFSPLUS_SB(sb).rsrc_inodes);
+
+	/*
+	 * __mark_inode_dirty expects inodes to be hashed.  Since we don't
+	 * want resource fork inodes in the regular inode space, we make them
+	 * appear hashed, but do not put on any lists.  hlist_del()
+	 * will work fine and require no locking.
+	 */
+	hlist_add_fake(&inode->i_hash);
+
 	mark_inode_dirty(inode);
 out:
 	d_add(dentry, inode);
@@ -187,30 +221,27 @@ out:
 
 static void hfsplus_get_perms(struct inode *inode, struct hfsplus_perm *perms, int dir)
 {
-	struct super_block *sb = inode->i_sb;
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
 	u16 mode;
 
 	mode = be16_to_cpu(perms->mode);
 
 	inode->i_uid = be32_to_cpu(perms->owner);
 	if (!inode->i_uid && !mode)
-		inode->i_uid = HFSPLUS_SB(sb).uid;
+		inode->i_uid = sbi->uid;
 
 	inode->i_gid = be32_to_cpu(perms->group);
 	if (!inode->i_gid && !mode)
-		inode->i_gid = HFSPLUS_SB(sb).gid;
+		inode->i_gid = sbi->gid;
 
 	if (dir) {
-		mode = mode ? (mode & S_IALLUGO) :
-			(S_IRWXUGO & ~(HFSPLUS_SB(sb).umask));
+		mode = mode ? (mode & S_IALLUGO) : (S_IRWXUGO & ~(sbi->umask));
 		mode |= S_IFDIR;
 	} else if (!mode)
-		mode = S_IFREG | ((S_IRUGO|S_IWUGO) &
-			~(HFSPLUS_SB(sb).umask));
+		mode = S_IFREG | ((S_IRUGO|S_IWUGO) & ~(sbi->umask));
 	inode->i_mode = mode;
 
-	HFSPLUS_I(inode).rootflags = perms->rootflags;
-	HFSPLUS_I(inode).userflags = perms->userflags;
+	HFSPLUS_I(inode)->userflags = perms->userflags;
 	if (perms->rootflags & HFSPLUS_FLG_IMMUTABLE)
 		inode->i_flags |= S_IMMUTABLE;
 	else
@@ -221,30 +252,13 @@ static void hfsplus_get_perms(struct inode *inode, struct hfsplus_perm *perms, i
 		inode->i_flags &= ~S_APPEND;
 }
 
-static void hfsplus_set_perms(struct inode *inode, struct hfsplus_perm *perms)
-{
-	if (inode->i_flags & S_IMMUTABLE)
-		perms->rootflags |= HFSPLUS_FLG_IMMUTABLE;
-	else
-		perms->rootflags &= ~HFSPLUS_FLG_IMMUTABLE;
-	if (inode->i_flags & S_APPEND)
-		perms->rootflags |= HFSPLUS_FLG_APPEND;
-	else
-		perms->rootflags &= ~HFSPLUS_FLG_APPEND;
-	perms->userflags = HFSPLUS_I(inode).userflags;
-	perms->mode = cpu_to_be16(inode->i_mode);
-	perms->owner = cpu_to_be32(inode->i_uid);
-	perms->group = cpu_to_be32(inode->i_gid);
-	perms->dev = cpu_to_be32(HFSPLUS_I(inode).dev);
-}
-
 static int hfsplus_file_open(struct inode *inode, struct file *file)
 {
 	if (HFSPLUS_IS_RSRC(inode))
-		inode = HFSPLUS_I(inode).rsrc_inode;
+		inode = HFSPLUS_I(inode)->rsrc_inode;
 	if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
 		return -EOVERFLOW;
-	atomic_inc(&HFSPLUS_I(inode).opencnt);
+	atomic_inc(&HFSPLUS_I(inode)->opencnt);
 	return 0;
 }
 
@@ -253,12 +267,13 @@ static int hfsplus_file_release(struct inode *inode, struct file *file)
 	struct super_block *sb = inode->i_sb;
 
 	if (HFSPLUS_IS_RSRC(inode))
-		inode = HFSPLUS_I(inode).rsrc_inode;
-	if (atomic_dec_and_test(&HFSPLUS_I(inode).opencnt)) {
+		inode = HFSPLUS_I(inode)->rsrc_inode;
+	if (atomic_dec_and_test(&HFSPLUS_I(inode)->opencnt)) {
 		mutex_lock(&inode->i_mutex);
 		hfsplus_file_truncate(inode);
 		if (inode->i_flags & S_DEAD) {
-			hfsplus_delete_cat(inode->i_ino, HFSPLUS_SB(sb).hidden_dir, NULL);
+			hfsplus_delete_cat(inode->i_ino,
+					   HFSPLUS_SB(sb)->hidden_dir, NULL);
 			hfsplus_delete_inode(inode);
 		}
 		mutex_unlock(&inode->i_mutex);
@@ -266,9 +281,56 @@ static int hfsplus_file_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
+static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = dentry->d_inode;
+	int error;
+
+	error = inode_change_ok(inode, attr);
+	if (error)
+		return error;
+
+	if ((attr->ia_valid & ATTR_SIZE) &&
+	    attr->ia_size != i_size_read(inode)) {
+		error = vmtruncate(inode, attr->ia_size);
+		if (error)
+			return error;
+	}
+
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+	return 0;
+}
+
+static int hfsplus_file_fsync(struct file *filp, int datasync)
+{
+	struct inode *inode = filp->f_mapping->host;
+	struct super_block * sb;
+	int ret, err;
+
+	/* sync the inode to buffers */
+	ret = write_inode_now(inode, 0);
+
+	/* sync the superblock to buffers */
+	sb = inode->i_sb;
+	if (sb->s_dirt) {
+		if (!(sb->s_flags & MS_RDONLY))
+			hfsplus_sync_fs(sb, 1);
+		else
+			sb->s_dirt = 0;
+	}
+
+	/* .. finally sync the buffers to disk */
+	err = sync_blockdev(sb->s_bdev);
+	if (!ret)
+		ret = err;
+	return ret;
+}
+
 static const struct inode_operations hfsplus_file_inode_operations = {
 	.lookup		= hfsplus_file_lookup,
 	.truncate	= hfsplus_file_truncate,
+	.setattr	= hfsplus_setattr,
 	.setxattr	= hfsplus_setxattr,
 	.getxattr	= hfsplus_getxattr,
 	.listxattr	= hfsplus_listxattr,
@@ -282,7 +344,7 @@ static const struct file_operations hfsplus_file_operations = {
 	.aio_write	= generic_file_aio_write,
 	.mmap		= generic_file_mmap,
 	.splice_read	= generic_file_splice_read,
-	.fsync		= file_fsync,
+	.fsync		= hfsplus_file_fsync,
 	.open		= hfsplus_file_open,
 	.release	= hfsplus_file_release,
 	.unlocked_ioctl = hfsplus_ioctl,
@@ -290,47 +352,52 @@ static const struct file_operations hfsplus_file_operations = {
 
 struct inode *hfsplus_new_inode(struct super_block *sb, int mode)
 {
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
 	struct inode *inode = new_inode(sb);
+	struct hfsplus_inode_info *hip;
+
 	if (!inode)
 		return NULL;
 
-	inode->i_ino = HFSPLUS_SB(sb).next_cnid++;
+	inode->i_ino = sbi->next_cnid++;
 	inode->i_mode = mode;
 	inode->i_uid = current_fsuid();
 	inode->i_gid = current_fsgid();
 	inode->i_nlink = 1;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
-	INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list);
-	mutex_init(&HFSPLUS_I(inode).extents_lock);
-	atomic_set(&HFSPLUS_I(inode).opencnt, 0);
-	HFSPLUS_I(inode).flags = 0;
-	memset(HFSPLUS_I(inode).first_extents, 0, sizeof(hfsplus_extent_rec));
-	memset(HFSPLUS_I(inode).cached_extents, 0, sizeof(hfsplus_extent_rec));
-	HFSPLUS_I(inode).alloc_blocks = 0;
-	HFSPLUS_I(inode).first_blocks = 0;
-	HFSPLUS_I(inode).cached_start = 0;
-	HFSPLUS_I(inode).cached_blocks = 0;
-	HFSPLUS_I(inode).phys_size = 0;
-	HFSPLUS_I(inode).fs_blocks = 0;
-	HFSPLUS_I(inode).rsrc_inode = NULL;
+
+	hip = HFSPLUS_I(inode);
+	INIT_LIST_HEAD(&hip->open_dir_list);
+	mutex_init(&hip->extents_lock);
+	atomic_set(&hip->opencnt, 0);
+	hip->flags = 0;
+	memset(hip->first_extents, 0, sizeof(hfsplus_extent_rec));
+	memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec));
+	hip->alloc_blocks = 0;
+	hip->first_blocks = 0;
+	hip->cached_start = 0;
+	hip->cached_blocks = 0;
+	hip->phys_size = 0;
+	hip->fs_blocks = 0;
+	hip->rsrc_inode = NULL;
 	if (S_ISDIR(inode->i_mode)) {
 		inode->i_size = 2;
-		HFSPLUS_SB(sb).folder_count++;
+		sbi->folder_count++;
 		inode->i_op = &hfsplus_dir_inode_operations;
 		inode->i_fop = &hfsplus_dir_operations;
 	} else if (S_ISREG(inode->i_mode)) {
-		HFSPLUS_SB(sb).file_count++;
+		sbi->file_count++;
 		inode->i_op = &hfsplus_file_inode_operations;
 		inode->i_fop = &hfsplus_file_operations;
 		inode->i_mapping->a_ops = &hfsplus_aops;
-		HFSPLUS_I(inode).clump_blocks = HFSPLUS_SB(sb).data_clump_blocks;
+		hip->clump_blocks = sbi->data_clump_blocks;
 	} else if (S_ISLNK(inode->i_mode)) {
-		HFSPLUS_SB(sb).file_count++;
+		sbi->file_count++;
 		inode->i_op = &page_symlink_inode_operations;
 		inode->i_mapping->a_ops = &hfsplus_aops;
-		HFSPLUS_I(inode).clump_blocks = 1;
+		hip->clump_blocks = 1;
 	} else
-		HFSPLUS_SB(sb).file_count++;
+		sbi->file_count++;
 	insert_inode_hash(inode);
 	mark_inode_dirty(inode);
 	sb->s_dirt = 1;
@@ -343,11 +410,11 @@ void hfsplus_delete_inode(struct inode *inode)
 	struct super_block *sb = inode->i_sb;
 
 	if (S_ISDIR(inode->i_mode)) {
-		HFSPLUS_SB(sb).folder_count--;
+		HFSPLUS_SB(sb)->folder_count--;
 		sb->s_dirt = 1;
 		return;
 	}
-	HFSPLUS_SB(sb).file_count--;
+	HFSPLUS_SB(sb)->file_count--;
 	if (S_ISREG(inode->i_mode)) {
 		if (!inode->i_nlink) {
 			inode->i_size = 0;
@@ -363,34 +430,39 @@ void hfsplus_delete_inode(struct inode *inode)
 void hfsplus_inode_read_fork(struct inode *inode, struct hfsplus_fork_raw *fork)
 {
 	struct super_block *sb = inode->i_sb;
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+	struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
 	u32 count;
 	int i;
 
-	memcpy(&HFSPLUS_I(inode).first_extents, &fork->extents,
-	       sizeof(hfsplus_extent_rec));
+	memcpy(&hip->first_extents, &fork->extents, sizeof(hfsplus_extent_rec));
 	for (count = 0, i = 0; i < 8; i++)
 		count += be32_to_cpu(fork->extents[i].block_count);
-	HFSPLUS_I(inode).first_blocks = count;
-	memset(HFSPLUS_I(inode).cached_extents, 0, sizeof(hfsplus_extent_rec));
-	HFSPLUS_I(inode).cached_start = 0;
-	HFSPLUS_I(inode).cached_blocks = 0;
-
-	HFSPLUS_I(inode).alloc_blocks = be32_to_cpu(fork->total_blocks);
-	inode->i_size = HFSPLUS_I(inode).phys_size = be64_to_cpu(fork->total_size);
-	HFSPLUS_I(inode).fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
-	inode_set_bytes(inode, HFSPLUS_I(inode).fs_blocks << sb->s_blocksize_bits);
-	HFSPLUS_I(inode).clump_blocks = be32_to_cpu(fork->clump_size) >> HFSPLUS_SB(sb).alloc_blksz_shift;
-	if (!HFSPLUS_I(inode).clump_blocks)
-		HFSPLUS_I(inode).clump_blocks = HFSPLUS_IS_RSRC(inode) ? HFSPLUS_SB(sb).rsrc_clump_blocks :
-				HFSPLUS_SB(sb).data_clump_blocks;
+	hip->first_blocks = count;
+	memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec));
+	hip->cached_start = 0;
+	hip->cached_blocks = 0;
+
+	hip->alloc_blocks = be32_to_cpu(fork->total_blocks);
+	hip->phys_size = inode->i_size = be64_to_cpu(fork->total_size);
+	hip->fs_blocks =
+		(inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
+	inode_set_bytes(inode, hip->fs_blocks << sb->s_blocksize_bits);
+	hip->clump_blocks =
+		be32_to_cpu(fork->clump_size) >> sbi->alloc_blksz_shift;
+	if (!hip->clump_blocks) {
+		hip->clump_blocks = HFSPLUS_IS_RSRC(inode) ?
+			sbi->rsrc_clump_blocks :
+			sbi->data_clump_blocks;
+	}
 }
 
 void hfsplus_inode_write_fork(struct inode *inode, struct hfsplus_fork_raw *fork)
 {
-	memcpy(&fork->extents, &HFSPLUS_I(inode).first_extents,
+	memcpy(&fork->extents, &HFSPLUS_I(inode)->first_extents,
 	       sizeof(hfsplus_extent_rec));
 	fork->total_size = cpu_to_be64(inode->i_size);
-	fork->total_blocks = cpu_to_be32(HFSPLUS_I(inode).alloc_blocks);
+	fork->total_blocks = cpu_to_be32(HFSPLUS_I(inode)->alloc_blocks);
 }
 
 int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
@@ -401,7 +473,7 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
 
 	type = hfs_bnode_read_u16(fd->bnode, fd->entryoffset);
 
-	HFSPLUS_I(inode).dev = 0;
+	HFSPLUS_I(inode)->linkid = 0;
 	if (type == HFSPLUS_FOLDER) {
 		struct hfsplus_cat_folder *folder = &entry.folder;
 
@@ -415,8 +487,8 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
 		inode->i_atime = hfsp_mt2ut(folder->access_date);
 		inode->i_mtime = hfsp_mt2ut(folder->content_mod_date);
 		inode->i_ctime = hfsp_mt2ut(folder->attribute_mod_date);
-		HFSPLUS_I(inode).create_date = folder->create_date;
-		HFSPLUS_I(inode).fs_blocks = 0;
+		HFSPLUS_I(inode)->create_date = folder->create_date;
+		HFSPLUS_I(inode)->fs_blocks = 0;
 		inode->i_op = &hfsplus_dir_inode_operations;
 		inode->i_fop = &hfsplus_dir_operations;
 	} else if (type == HFSPLUS_FILE) {
@@ -447,7 +519,7 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
 		inode->i_atime = hfsp_mt2ut(file->access_date);
 		inode->i_mtime = hfsp_mt2ut(file->content_mod_date);
 		inode->i_ctime = hfsp_mt2ut(file->attribute_mod_date);
-		HFSPLUS_I(inode).create_date = file->create_date;
+		HFSPLUS_I(inode)->create_date = file->create_date;
 	} else {
 		printk(KERN_ERR "hfs: bad catalog entry used to create inode\n");
 		res = -EIO;
@@ -462,12 +534,12 @@ int hfsplus_cat_write_inode(struct inode *inode)
 	hfsplus_cat_entry entry;
 
 	if (HFSPLUS_IS_RSRC(inode))
-		main_inode = HFSPLUS_I(inode).rsrc_inode;
+		main_inode = HFSPLUS_I(inode)->rsrc_inode;
 
 	if (!main_inode->i_nlink)
 		return 0;
 
-	if (hfs_find_init(HFSPLUS_SB(main_inode->i_sb).cat_tree, &fd))
+	if (hfs_find_init(HFSPLUS_SB(main_inode->i_sb)->cat_tree, &fd))
 		/* panic? */
 		return -EIO;
 
@@ -483,7 +555,7 @@ int hfsplus_cat_write_inode(struct inode *inode)
 		hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
 					sizeof(struct hfsplus_cat_folder));
 		/* simple node checks? */
-		hfsplus_set_perms(inode, &folder->permissions);
+		hfsplus_cat_set_perms(inode, &folder->permissions);
 		folder->access_date = hfsp_ut2mt(inode->i_atime);
 		folder->content_mod_date = hfsp_ut2mt(inode->i_mtime);
 		folder->attribute_mod_date = hfsp_ut2mt(inode->i_ctime);
@@ -505,11 +577,7 @@ int hfsplus_cat_write_inode(struct inode *inode)
 		hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
 					sizeof(struct hfsplus_cat_file));
 		hfsplus_inode_write_fork(inode, &file->data_fork);
-		if (S_ISREG(inode->i_mode))
-			HFSPLUS_I(inode).dev = inode->i_nlink;
-		if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
-			HFSPLUS_I(inode).dev = kdev_t_to_nr(inode->i_rdev);
-		hfsplus_set_perms(inode, &file->permissions);
+		hfsplus_cat_set_perms(inode, &file->permissions);
 		if ((file->permissions.rootflags | file->permissions.userflags) & HFSPLUS_FLG_IMMUTABLE)
 			file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED);
 		else
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index ac405f09902..40a85a3ded6 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -17,83 +17,98 @@
 #include <linux/mount.h>
 #include <linux/sched.h>
 #include <linux/xattr.h>
-#include <linux/smp_lock.h>
 #include <asm/uaccess.h>
 #include "hfsplus_fs.h"
 
-long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+static int hfsplus_ioctl_getflags(struct file *file, int __user *user_flags)
 {
-	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct inode *inode = file->f_path.dentry->d_inode;
+	struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
+	unsigned int flags = 0;
+
+	if (inode->i_flags & S_IMMUTABLE)
+		flags |= FS_IMMUTABLE_FL;
+	if (inode->i_flags |= S_APPEND)
+		flags |= FS_APPEND_FL;
+	if (hip->userflags & HFSPLUS_FLG_NODUMP)
+		flags |= FS_NODUMP_FL;
+
+	return put_user(flags, user_flags);
+}
+
+static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags)
+{
+	struct inode *inode = file->f_path.dentry->d_inode;
+	struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
 	unsigned int flags;
+	int err = 0;
 
-	lock_kernel();
-	switch (cmd) {
-	case HFSPLUS_IOC_EXT2_GETFLAGS:
-		flags = 0;
-		if (HFSPLUS_I(inode).rootflags & HFSPLUS_FLG_IMMUTABLE)
-			flags |= FS_IMMUTABLE_FL; /* EXT2_IMMUTABLE_FL */
-		if (HFSPLUS_I(inode).rootflags & HFSPLUS_FLG_APPEND)
-			flags |= FS_APPEND_FL; /* EXT2_APPEND_FL */
-		if (HFSPLUS_I(inode).userflags & HFSPLUS_FLG_NODUMP)
-			flags |= FS_NODUMP_FL; /* EXT2_NODUMP_FL */
-		return put_user(flags, (int __user *)arg);
-	case HFSPLUS_IOC_EXT2_SETFLAGS: {
-		int err = 0;
-		err = mnt_want_write(filp->f_path.mnt);
-		if (err) {
-			unlock_kernel();
-			return err;
-		}
+	err = mnt_want_write(file->f_path.mnt);
+	if (err)
+		goto out;
 
-		if (!is_owner_or_cap(inode)) {
-			err = -EACCES;
-			goto setflags_out;
-		}
-		if (get_user(flags, (int __user *)arg)) {
-			err = -EFAULT;
-			goto setflags_out;
-		}
-		if (flags & (FS_IMMUTABLE_FL|FS_APPEND_FL) ||
-		    HFSPLUS_I(inode).rootflags & (HFSPLUS_FLG_IMMUTABLE|HFSPLUS_FLG_APPEND)) {
-			if (!capable(CAP_LINUX_IMMUTABLE)) {
-				err = -EPERM;
-				goto setflags_out;
-			}
-		}
+	if (!is_owner_or_cap(inode)) {
+		err = -EACCES;
+		goto out_drop_write;
+	}
 
-		/* don't silently ignore unsupported ext2 flags */
-		if (flags & ~(FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NODUMP_FL)) {
-			err = -EOPNOTSUPP;
-			goto setflags_out;
-		}
-		if (flags & FS_IMMUTABLE_FL) { /* EXT2_IMMUTABLE_FL */
-			inode->i_flags |= S_IMMUTABLE;
-			HFSPLUS_I(inode).rootflags |= HFSPLUS_FLG_IMMUTABLE;
-		} else {
-			inode->i_flags &= ~S_IMMUTABLE;
-			HFSPLUS_I(inode).rootflags &= ~HFSPLUS_FLG_IMMUTABLE;
-		}
-		if (flags & FS_APPEND_FL) { /* EXT2_APPEND_FL */
-			inode->i_flags |= S_APPEND;
-			HFSPLUS_I(inode).rootflags |= HFSPLUS_FLG_APPEND;
-		} else {
-			inode->i_flags &= ~S_APPEND;
-			HFSPLUS_I(inode).rootflags &= ~HFSPLUS_FLG_APPEND;
+	if (get_user(flags, user_flags)) {
+		err = -EFAULT;
+		goto out_drop_write;
+	}
+
+	mutex_lock(&inode->i_mutex);
+
+	if ((flags & (FS_IMMUTABLE_FL|FS_APPEND_FL)) ||
+	    inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
+		if (!capable(CAP_LINUX_IMMUTABLE)) {
+			err = -EPERM;
+			goto out_unlock_inode;
 		}
-		if (flags & FS_NODUMP_FL) /* EXT2_NODUMP_FL */
-			HFSPLUS_I(inode).userflags |= HFSPLUS_FLG_NODUMP;
-		else
-			HFSPLUS_I(inode).userflags &= ~HFSPLUS_FLG_NODUMP;
-
-		inode->i_ctime = CURRENT_TIME_SEC;
-		mark_inode_dirty(inode);
-setflags_out:
-		mnt_drop_write(filp->f_path.mnt);
-		unlock_kernel();
-		return err;
 	}
+
+	/* don't silently ignore unsupported ext2 flags */
+	if (flags & ~(FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NODUMP_FL)) {
+		err = -EOPNOTSUPP;
+		goto out_unlock_inode;
+	}
+
+	if (flags & FS_IMMUTABLE_FL)
+		inode->i_flags |= S_IMMUTABLE;
+	else
+		inode->i_flags &= ~S_IMMUTABLE;
+
+	if (flags & FS_APPEND_FL)
+		inode->i_flags |= S_APPEND;
+	else
+		inode->i_flags &= ~S_APPEND;
+
+	if (flags & FS_NODUMP_FL)
+		hip->userflags |= HFSPLUS_FLG_NODUMP;
+	else
+		hip->userflags &= ~HFSPLUS_FLG_NODUMP;
+
+	inode->i_ctime = CURRENT_TIME_SEC;
+	mark_inode_dirty(inode);
+
+out_unlock_inode:
+	mutex_unlock(&inode->i_mutex);
+out_drop_write:
+	mnt_drop_write(file->f_path.mnt);
+out:
+	return err;
+}
+
+long hfsplus_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	void __user *argp = (void __user *)arg;
+
+	switch (cmd) {
+	case HFSPLUS_IOC_EXT2_GETFLAGS:
+		return hfsplus_ioctl_getflags(file, argp);
+	case HFSPLUS_IOC_EXT2_SETFLAGS:
+		return hfsplus_ioctl_setflags(file, argp);
 	default:
-		unlock_kernel();
 		return -ENOTTY;
 	}
 }
@@ -110,7 +125,7 @@ int hfsplus_setxattr(struct dentry *dentry, const char *name,
 	if (!S_ISREG(inode->i_mode) || HFSPLUS_IS_RSRC(inode))
 		return -EOPNOTSUPP;
 
-	res = hfs_find_init(HFSPLUS_SB(inode->i_sb).cat_tree, &fd);
+	res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd);
 	if (res)
 		return res;
 	res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd);
@@ -153,7 +168,7 @@ ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
 		return -EOPNOTSUPP;
 
 	if (size) {
-		res = hfs_find_init(HFSPLUS_SB(inode->i_sb).cat_tree, &fd);
+		res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd);
 		if (res)
 			return res;
 		res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd);
@@ -177,7 +192,7 @@ ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
 		} else
 			res = size ? -ERANGE : 4;
 	} else
-		res = -ENODATA;
+		res = -EOPNOTSUPP;
 out:
 	if (size)
 		hfs_find_exit(&fd);
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index 572628b4b07..f9ab276a4d8 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -143,13 +143,13 @@ int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi)
 			kfree(p);
 			break;
 		case opt_decompose:
-			sbi->flags &= ~HFSPLUS_SB_NODECOMPOSE;
+			clear_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags);
 			break;
 		case opt_nodecompose:
-			sbi->flags |= HFSPLUS_SB_NODECOMPOSE;
+			set_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags);
 			break;
 		case opt_force:
-			sbi->flags |= HFSPLUS_SB_FORCE;
+			set_bit(HFSPLUS_SB_FORCE, &sbi->flags);
 			break;
 		default:
 			return 0;
@@ -171,7 +171,7 @@ done:
 
 int hfsplus_show_options(struct seq_file *seq, struct vfsmount *mnt)
 {
-	struct hfsplus_sb_info *sbi = &HFSPLUS_SB(mnt->mnt_sb);
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(mnt->mnt_sb);
 
 	if (sbi->creator != HFSPLUS_DEF_CR_TYPE)
 		seq_printf(seq, ",creator=%.4s", (char *)&sbi->creator);
@@ -184,7 +184,7 @@ int hfsplus_show_options(struct seq_file *seq, struct vfsmount *mnt)
 		seq_printf(seq, ",session=%u", sbi->session);
 	if (sbi->nls)
 		seq_printf(seq, ",nls=%s", sbi->nls->charset);
-	if (sbi->flags & HFSPLUS_SB_NODECOMPOSE)
+	if (test_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags))
 		seq_printf(seq, ",nodecompose");
 	return 0;
 }
diff --git a/fs/hfsplus/part_tbl.c b/fs/hfsplus/part_tbl.c
index 1528a6fd029..208b16c645c 100644
--- a/fs/hfsplus/part_tbl.c
+++ b/fs/hfsplus/part_tbl.c
@@ -74,6 +74,7 @@ struct old_pmap {
 int hfs_part_find(struct super_block *sb,
 		  sector_t *part_start, sector_t *part_size)
 {
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
 	struct buffer_head *bh;
 	__be16 *data;
 	int i, size, res;
@@ -95,7 +96,7 @@ int hfs_part_find(struct super_block *sb,
 		for (i = 0; i < size; p++, i++) {
 			if (p->pdStart && p->pdSize &&
 			    p->pdFSID == cpu_to_be32(0x54465331)/*"TFS1"*/ &&
-			    (HFSPLUS_SB(sb).part < 0 || HFSPLUS_SB(sb).part == i)) {
+			    (sbi->part < 0 || sbi->part == i)) {
 				*part_start += be32_to_cpu(p->pdStart);
 				*part_size = be32_to_cpu(p->pdSize);
 				res = 0;
@@ -111,7 +112,7 @@ int hfs_part_find(struct super_block *sb,
 		size = be32_to_cpu(pm->pmMapBlkCnt);
 		for (i = 0; i < size;) {
 			if (!memcmp(pm->pmPartType,"Apple_HFS", 9) &&
-			    (HFSPLUS_SB(sb).part < 0 || HFSPLUS_SB(sb).part == i)) {
+			    (sbi->part < 0 || sbi->part == i)) {
 				*part_start += be32_to_cpu(pm->pmPyPartStart);
 				*part_size = be32_to_cpu(pm->pmPartBlkCnt);
 				res = 0;
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 74b473a8ef9..52cc746d3ba 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -12,7 +12,6 @@
 #include <linux/pagemap.h>
 #include <linux/fs.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
 #include <linux/vfs.h>
 #include <linux/nls.h>
 
@@ -21,40 +20,11 @@ static void hfsplus_destroy_inode(struct inode *inode);
 
 #include "hfsplus_fs.h"
 
-struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino)
+static int hfsplus_system_read_inode(struct inode *inode)
 {
-	struct hfs_find_data fd;
-	struct hfsplus_vh *vhdr;
-	struct inode *inode;
-	long err = -EIO;
-
-	inode = iget_locked(sb, ino);
-	if (!inode)
-		return ERR_PTR(-ENOMEM);
-	if (!(inode->i_state & I_NEW))
-		return inode;
+	struct hfsplus_vh *vhdr = HFSPLUS_SB(inode->i_sb)->s_vhdr;
 
-	INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list);
-	mutex_init(&HFSPLUS_I(inode).extents_lock);
-	HFSPLUS_I(inode).flags = 0;
-	HFSPLUS_I(inode).rsrc_inode = NULL;
-	atomic_set(&HFSPLUS_I(inode).opencnt, 0);
-
-	if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID) {
-	read_inode:
-		hfs_find_init(HFSPLUS_SB(inode->i_sb).cat_tree, &fd);
-		err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd);
-		if (!err)
-			err = hfsplus_cat_read_inode(inode, &fd);
-		hfs_find_exit(&fd);
-		if (err)
-			goto bad_inode;
-		goto done;
-	}
-	vhdr = HFSPLUS_SB(inode->i_sb).s_vhdr;
-	switch(inode->i_ino) {
-	case HFSPLUS_ROOT_CNID:
-		goto read_inode;
+	switch (inode->i_ino) {
 	case HFSPLUS_EXT_CNID:
 		hfsplus_inode_read_fork(inode, &vhdr->ext_file);
 		inode->i_mapping->a_ops = &hfsplus_btree_aops;
@@ -75,125 +45,156 @@ struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino)
 		inode->i_mapping->a_ops = &hfsplus_btree_aops;
 		break;
 	default:
-		goto bad_inode;
+		return -EIO;
+	}
+
+	return 0;
+}
+
+struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino)
+{
+	struct hfs_find_data fd;
+	struct inode *inode;
+	int err;
+
+	inode = iget_locked(sb, ino);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+	if (!(inode->i_state & I_NEW))
+		return inode;
+
+	INIT_LIST_HEAD(&HFSPLUS_I(inode)->open_dir_list);
+	mutex_init(&HFSPLUS_I(inode)->extents_lock);
+	HFSPLUS_I(inode)->flags = 0;
+	HFSPLUS_I(inode)->rsrc_inode = NULL;
+	atomic_set(&HFSPLUS_I(inode)->opencnt, 0);
+
+	if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID ||
+	    inode->i_ino == HFSPLUS_ROOT_CNID) {
+		hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd);
+		err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd);
+		if (!err)
+			err = hfsplus_cat_read_inode(inode, &fd);
+		hfs_find_exit(&fd);
+	} else {
+		err = hfsplus_system_read_inode(inode);
+	}
+
+	if (err) {
+		iget_failed(inode);
+		return ERR_PTR(err);
 	}
 
-done:
 	unlock_new_inode(inode);
 	return inode;
-
-bad_inode:
-	iget_failed(inode);
-	return ERR_PTR(err);
 }
 
-static int hfsplus_write_inode(struct inode *inode,
-		struct writeback_control *wbc)
+static int hfsplus_system_write_inode(struct inode *inode)
 {
-	struct hfsplus_vh *vhdr;
-	int ret = 0;
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
+	struct hfsplus_vh *vhdr = sbi->s_vhdr;
+	struct hfsplus_fork_raw *fork;
+	struct hfs_btree *tree = NULL;
 
-	dprint(DBG_INODE, "hfsplus_write_inode: %lu\n", inode->i_ino);
-	hfsplus_ext_write_extent(inode);
-	if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID) {
-		return hfsplus_cat_write_inode(inode);
-	}
-	vhdr = HFSPLUS_SB(inode->i_sb).s_vhdr;
 	switch (inode->i_ino) {
-	case HFSPLUS_ROOT_CNID:
-		ret = hfsplus_cat_write_inode(inode);
-		break;
 	case HFSPLUS_EXT_CNID:
-		if (vhdr->ext_file.total_size != cpu_to_be64(inode->i_size)) {
-			HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP;
-			inode->i_sb->s_dirt = 1;
-		}
-		hfsplus_inode_write_fork(inode, &vhdr->ext_file);
-		hfs_btree_write(HFSPLUS_SB(inode->i_sb).ext_tree);
+		fork = &vhdr->ext_file;
+		tree = sbi->ext_tree;
 		break;
 	case HFSPLUS_CAT_CNID:
-		if (vhdr->cat_file.total_size != cpu_to_be64(inode->i_size)) {
-			HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP;
-			inode->i_sb->s_dirt = 1;
-		}
-		hfsplus_inode_write_fork(inode, &vhdr->cat_file);
-		hfs_btree_write(HFSPLUS_SB(inode->i_sb).cat_tree);
+		fork = &vhdr->cat_file;
+		tree = sbi->cat_tree;
 		break;
 	case HFSPLUS_ALLOC_CNID:
-		if (vhdr->alloc_file.total_size != cpu_to_be64(inode->i_size)) {
-			HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP;
-			inode->i_sb->s_dirt = 1;
-		}
-		hfsplus_inode_write_fork(inode, &vhdr->alloc_file);
+		fork = &vhdr->alloc_file;
 		break;
 	case HFSPLUS_START_CNID:
-		if (vhdr->start_file.total_size != cpu_to_be64(inode->i_size)) {
-			HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP;
-			inode->i_sb->s_dirt = 1;
-		}
-		hfsplus_inode_write_fork(inode, &vhdr->start_file);
+		fork = &vhdr->start_file;
 		break;
 	case HFSPLUS_ATTR_CNID:
-		if (vhdr->attr_file.total_size != cpu_to_be64(inode->i_size)) {
-			HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP;
-			inode->i_sb->s_dirt = 1;
-		}
-		hfsplus_inode_write_fork(inode, &vhdr->attr_file);
-		hfs_btree_write(HFSPLUS_SB(inode->i_sb).attr_tree);
-		break;
+		fork = &vhdr->attr_file;
+		tree = sbi->attr_tree;
+	default:
+		return -EIO;
+	}
+
+	if (fork->total_size != cpu_to_be64(inode->i_size)) {
+		set_bit(HFSPLUS_SB_WRITEBACKUP, &sbi->flags);
+		inode->i_sb->s_dirt = 1;
 	}
-	return ret;
+	hfsplus_inode_write_fork(inode, fork);
+	if (tree)
+		hfs_btree_write(tree);
+	return 0;
 }
 
-static void hfsplus_clear_inode(struct inode *inode)
+static int hfsplus_write_inode(struct inode *inode,
+		struct writeback_control *wbc)
 {
-	dprint(DBG_INODE, "hfsplus_clear_inode: %lu\n", inode->i_ino);
+	dprint(DBG_INODE, "hfsplus_write_inode: %lu\n", inode->i_ino);
+
+	hfsplus_ext_write_extent(inode);
+
+	if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID ||
+	    inode->i_ino == HFSPLUS_ROOT_CNID)
+		return hfsplus_cat_write_inode(inode);
+	else
+		return hfsplus_system_write_inode(inode);
+}
+
+static void hfsplus_evict_inode(struct inode *inode)
+{
+	dprint(DBG_INODE, "hfsplus_evict_inode: %lu\n", inode->i_ino);
+	truncate_inode_pages(&inode->i_data, 0);
+	end_writeback(inode);
 	if (HFSPLUS_IS_RSRC(inode)) {
-		HFSPLUS_I(HFSPLUS_I(inode).rsrc_inode).rsrc_inode = NULL;
-		iput(HFSPLUS_I(inode).rsrc_inode);
+		HFSPLUS_I(HFSPLUS_I(inode)->rsrc_inode)->rsrc_inode = NULL;
+		iput(HFSPLUS_I(inode)->rsrc_inode);
 	}
 }
 
-static int hfsplus_sync_fs(struct super_block *sb, int wait)
+int hfsplus_sync_fs(struct super_block *sb, int wait)
 {
-	struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr;
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+	struct hfsplus_vh *vhdr = sbi->s_vhdr;
 
 	dprint(DBG_SUPER, "hfsplus_write_super\n");
 
-	lock_super(sb);
+	mutex_lock(&sbi->vh_mutex);
+	mutex_lock(&sbi->alloc_mutex);
 	sb->s_dirt = 0;
 
-	vhdr->free_blocks = cpu_to_be32(HFSPLUS_SB(sb).free_blocks);
-	vhdr->next_alloc = cpu_to_be32(HFSPLUS_SB(sb).next_alloc);
-	vhdr->next_cnid = cpu_to_be32(HFSPLUS_SB(sb).next_cnid);
-	vhdr->folder_count = cpu_to_be32(HFSPLUS_SB(sb).folder_count);
-	vhdr->file_count = cpu_to_be32(HFSPLUS_SB(sb).file_count);
+	vhdr->free_blocks = cpu_to_be32(sbi->free_blocks);
+	vhdr->next_cnid = cpu_to_be32(sbi->next_cnid);
+	vhdr->folder_count = cpu_to_be32(sbi->folder_count);
+	vhdr->file_count = cpu_to_be32(sbi->file_count);
 
-	mark_buffer_dirty(HFSPLUS_SB(sb).s_vhbh);
-	if (HFSPLUS_SB(sb).flags & HFSPLUS_SB_WRITEBACKUP) {
-		if (HFSPLUS_SB(sb).sect_count) {
+	mark_buffer_dirty(sbi->s_vhbh);
+	if (test_and_clear_bit(HFSPLUS_SB_WRITEBACKUP, &sbi->flags)) {
+		if (sbi->sect_count) {
 			struct buffer_head *bh;
 			u32 block, offset;
 
-			block = HFSPLUS_SB(sb).blockoffset;
-			block += (HFSPLUS_SB(sb).sect_count - 2) >> (sb->s_blocksize_bits - 9);
-			offset = ((HFSPLUS_SB(sb).sect_count - 2) << 9) & (sb->s_blocksize - 1);
-			printk(KERN_DEBUG "hfs: backup: %u,%u,%u,%u\n", HFSPLUS_SB(sb).blockoffset,
-				HFSPLUS_SB(sb).sect_count, block, offset);
+			block = sbi->blockoffset;
+			block += (sbi->sect_count - 2) >> (sb->s_blocksize_bits - 9);
+			offset = ((sbi->sect_count - 2) << 9) & (sb->s_blocksize - 1);
+			printk(KERN_DEBUG "hfs: backup: %u,%u,%u,%u\n",
+					  sbi->blockoffset, sbi->sect_count,
+					  block, offset);
 			bh = sb_bread(sb, block);
 			if (bh) {
 				vhdr = (struct hfsplus_vh *)(bh->b_data + offset);
 				if (be16_to_cpu(vhdr->signature) == HFSPLUS_VOLHEAD_SIG) {
-					memcpy(vhdr, HFSPLUS_SB(sb).s_vhdr, sizeof(*vhdr));
+					memcpy(vhdr, sbi->s_vhdr, sizeof(*vhdr));
 					mark_buffer_dirty(bh);
 					brelse(bh);
 				} else
 					printk(KERN_WARNING "hfs: backup not found!\n");
 			}
 		}
-		HFSPLUS_SB(sb).flags &= ~HFSPLUS_SB_WRITEBACKUP;
 	}
-	unlock_super(sb);
+	mutex_unlock(&sbi->alloc_mutex);
+	mutex_unlock(&sbi->vh_mutex);
 	return 0;
 }
 
@@ -207,48 +208,48 @@ static void hfsplus_write_super(struct super_block *sb)
 
 static void hfsplus_put_super(struct super_block *sb)
 {
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+
 	dprint(DBG_SUPER, "hfsplus_put_super\n");
+
 	if (!sb->s_fs_info)
 		return;
 
-	lock_kernel();
-
 	if (sb->s_dirt)
 		hfsplus_write_super(sb);
-	if (!(sb->s_flags & MS_RDONLY) && HFSPLUS_SB(sb).s_vhdr) {
-		struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr;
+	if (!(sb->s_flags & MS_RDONLY) && sbi->s_vhdr) {
+		struct hfsplus_vh *vhdr = sbi->s_vhdr;
 
 		vhdr->modify_date = hfsp_now2mt();
 		vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_UNMNT);
 		vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_INCNSTNT);
-		mark_buffer_dirty(HFSPLUS_SB(sb).s_vhbh);
-		sync_dirty_buffer(HFSPLUS_SB(sb).s_vhbh);
+		mark_buffer_dirty(sbi->s_vhbh);
+		sync_dirty_buffer(sbi->s_vhbh);
 	}
 
-	hfs_btree_close(HFSPLUS_SB(sb).cat_tree);
-	hfs_btree_close(HFSPLUS_SB(sb).ext_tree);
-	iput(HFSPLUS_SB(sb).alloc_file);
-	iput(HFSPLUS_SB(sb).hidden_dir);
-	brelse(HFSPLUS_SB(sb).s_vhbh);
-	unload_nls(HFSPLUS_SB(sb).nls);
+	hfs_btree_close(sbi->cat_tree);
+	hfs_btree_close(sbi->ext_tree);
+	iput(sbi->alloc_file);
+	iput(sbi->hidden_dir);
+	brelse(sbi->s_vhbh);
+	unload_nls(sbi->nls);
 	kfree(sb->s_fs_info);
 	sb->s_fs_info = NULL;
-
-	unlock_kernel();
 }
 
 static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *sb = dentry->d_sb;
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
 	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
 
 	buf->f_type = HFSPLUS_SUPER_MAGIC;
 	buf->f_bsize = sb->s_blocksize;
-	buf->f_blocks = HFSPLUS_SB(sb).total_blocks << HFSPLUS_SB(sb).fs_shift;
-	buf->f_bfree = HFSPLUS_SB(sb).free_blocks << HFSPLUS_SB(sb).fs_shift;
+	buf->f_blocks = sbi->total_blocks << sbi->fs_shift;
+	buf->f_bfree = sbi->free_blocks << sbi->fs_shift;
 	buf->f_bavail = buf->f_bfree;
 	buf->f_files = 0xFFFFFFFF;
-	buf->f_ffree = 0xFFFFFFFF - HFSPLUS_SB(sb).next_cnid;
+	buf->f_ffree = 0xFFFFFFFF - sbi->next_cnid;
 	buf->f_fsid.val[0] = (u32)id;
 	buf->f_fsid.val[1] = (u32)(id >> 32);
 	buf->f_namelen = HFSPLUS_MAX_STRLEN;
@@ -261,11 +262,11 @@ static int hfsplus_remount(struct super_block *sb, int *flags, char *data)
 	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
 		return 0;
 	if (!(*flags & MS_RDONLY)) {
-		struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr;
+		struct hfsplus_vh *vhdr = HFSPLUS_SB(sb)->s_vhdr;
 		struct hfsplus_sb_info sbi;
 
 		memset(&sbi, 0, sizeof(struct hfsplus_sb_info));
-		sbi.nls = HFSPLUS_SB(sb).nls;
+		sbi.nls = HFSPLUS_SB(sb)->nls;
 		if (!hfsplus_parse_options(data, &sbi))
 			return -EINVAL;
 
@@ -274,7 +275,7 @@ static int hfsplus_remount(struct super_block *sb, int *flags, char *data)
 			       "running fsck.hfsplus is recommended.  leaving read-only.\n");
 			sb->s_flags |= MS_RDONLY;
 			*flags |= MS_RDONLY;
-		} else if (sbi.flags & HFSPLUS_SB_FORCE) {
+		} else if (test_bit(HFSPLUS_SB_FORCE, &sbi.flags)) {
 			/* nothing */
 		} else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) {
 			printk(KERN_WARNING "hfs: filesystem is marked locked, leaving read-only.\n");
@@ -293,7 +294,7 @@ static const struct super_operations hfsplus_sops = {
 	.alloc_inode	= hfsplus_alloc_inode,
 	.destroy_inode	= hfsplus_destroy_inode,
 	.write_inode	= hfsplus_write_inode,
-	.clear_inode	= hfsplus_clear_inode,
+	.evict_inode	= hfsplus_evict_inode,
 	.put_super	= hfsplus_put_super,
 	.write_super	= hfsplus_write_super,
 	.sync_fs	= hfsplus_sync_fs,
@@ -318,7 +319,8 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 		return -ENOMEM;
 
 	sb->s_fs_info = sbi;
-	INIT_HLIST_HEAD(&sbi->rsrc_inodes);
+	mutex_init(&sbi->alloc_mutex);
+	mutex_init(&sbi->vh_mutex);
 	hfsplus_fill_defaults(sbi);
 	if (!hfsplus_parse_options(data, sbi)) {
 		printk(KERN_ERR "hfs: unable to parse mount options\n");
@@ -342,7 +344,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 		err = -EINVAL;
 		goto cleanup;
 	}
-	vhdr = HFSPLUS_SB(sb).s_vhdr;
+	vhdr = sbi->s_vhdr;
 
 	/* Copy parts of the volume header into the superblock */
 	sb->s_magic = HFSPLUS_VOLHEAD_SIG;
@@ -351,18 +353,19 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 		printk(KERN_ERR "hfs: wrong filesystem version\n");
 		goto cleanup;
 	}
-	HFSPLUS_SB(sb).total_blocks = be32_to_cpu(vhdr->total_blocks);
-	HFSPLUS_SB(sb).free_blocks = be32_to_cpu(vhdr->free_blocks);
-	HFSPLUS_SB(sb).next_alloc = be32_to_cpu(vhdr->next_alloc);
-	HFSPLUS_SB(sb).next_cnid = be32_to_cpu(vhdr->next_cnid);
-	HFSPLUS_SB(sb).file_count = be32_to_cpu(vhdr->file_count);
-	HFSPLUS_SB(sb).folder_count = be32_to_cpu(vhdr->folder_count);
-	HFSPLUS_SB(sb).data_clump_blocks = be32_to_cpu(vhdr->data_clump_sz) >> HFSPLUS_SB(sb).alloc_blksz_shift;
-	if (!HFSPLUS_SB(sb).data_clump_blocks)
-		HFSPLUS_SB(sb).data_clump_blocks = 1;
-	HFSPLUS_SB(sb).rsrc_clump_blocks = be32_to_cpu(vhdr->rsrc_clump_sz) >> HFSPLUS_SB(sb).alloc_blksz_shift;
-	if (!HFSPLUS_SB(sb).rsrc_clump_blocks)
-		HFSPLUS_SB(sb).rsrc_clump_blocks = 1;
+	sbi->total_blocks = be32_to_cpu(vhdr->total_blocks);
+	sbi->free_blocks = be32_to_cpu(vhdr->free_blocks);
+	sbi->next_cnid = be32_to_cpu(vhdr->next_cnid);
+	sbi->file_count = be32_to_cpu(vhdr->file_count);
+	sbi->folder_count = be32_to_cpu(vhdr->folder_count);
+	sbi->data_clump_blocks =
+		be32_to_cpu(vhdr->data_clump_sz) >> sbi->alloc_blksz_shift;
+	if (!sbi->data_clump_blocks)
+		sbi->data_clump_blocks = 1;
+	sbi->rsrc_clump_blocks =
+		be32_to_cpu(vhdr->rsrc_clump_sz) >> sbi->alloc_blksz_shift;
+	if (!sbi->rsrc_clump_blocks)
+		sbi->rsrc_clump_blocks = 1;
 
 	/* Set up operations so we can load metadata */
 	sb->s_op = &hfsplus_sops;
@@ -372,7 +375,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 		printk(KERN_WARNING "hfs: Filesystem was not cleanly unmounted, "
 		       "running fsck.hfsplus is recommended.  mounting read-only.\n");
 		sb->s_flags |= MS_RDONLY;
-	} else if (sbi->flags & HFSPLUS_SB_FORCE) {
+	} else if (test_and_clear_bit(HFSPLUS_SB_FORCE, &sbi->flags)) {
 		/* nothing */
 	} else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) {
 		printk(KERN_WARNING "hfs: Filesystem is marked locked, mounting read-only.\n");
@@ -382,16 +385,15 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 		       "use the force option at your own risk, mounting read-only.\n");
 		sb->s_flags |= MS_RDONLY;
 	}
-	sbi->flags &= ~HFSPLUS_SB_FORCE;
 
 	/* Load metadata objects (B*Trees) */
-	HFSPLUS_SB(sb).ext_tree = hfs_btree_open(sb, HFSPLUS_EXT_CNID);
-	if (!HFSPLUS_SB(sb).ext_tree) {
+	sbi->ext_tree = hfs_btree_open(sb, HFSPLUS_EXT_CNID);
+	if (!sbi->ext_tree) {
 		printk(KERN_ERR "hfs: failed to load extents file\n");
 		goto cleanup;
 	}
-	HFSPLUS_SB(sb).cat_tree = hfs_btree_open(sb, HFSPLUS_CAT_CNID);
-	if (!HFSPLUS_SB(sb).cat_tree) {
+	sbi->cat_tree = hfs_btree_open(sb, HFSPLUS_CAT_CNID);
+	if (!sbi->cat_tree) {
 		printk(KERN_ERR "hfs: failed to load catalog file\n");
 		goto cleanup;
 	}
@@ -402,7 +404,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 		err = PTR_ERR(inode);
 		goto cleanup;
 	}
-	HFSPLUS_SB(sb).alloc_file = inode;
+	sbi->alloc_file = inode;
 
 	/* Load the root directory */
 	root = hfsplus_iget(sb, HFSPLUS_ROOT_CNID);
@@ -421,7 +423,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 
 	str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1;
 	str.name = HFSP_HIDDENDIR_NAME;
-	hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd);
+	hfs_find_init(sbi->cat_tree, &fd);
 	hfsplus_cat_build_key(sb, fd.search_key, HFSPLUS_ROOT_CNID, &str);
 	if (!hfs_brec_read(&fd, &entry, sizeof(entry))) {
 		hfs_find_exit(&fd);
@@ -432,7 +434,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 			err = PTR_ERR(inode);
 			goto cleanup;
 		}
-		HFSPLUS_SB(sb).hidden_dir = inode;
+		sbi->hidden_dir = inode;
 	} else
 		hfs_find_exit(&fd);
 
@@ -447,15 +449,19 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 	be32_add_cpu(&vhdr->write_count, 1);
 	vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_UNMNT);
 	vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_INCNSTNT);
-	mark_buffer_dirty(HFSPLUS_SB(sb).s_vhbh);
-	sync_dirty_buffer(HFSPLUS_SB(sb).s_vhbh);
+	mark_buffer_dirty(sbi->s_vhbh);
+	sync_dirty_buffer(sbi->s_vhbh);
 
-	if (!HFSPLUS_SB(sb).hidden_dir) {
+	if (!sbi->hidden_dir) {
 		printk(KERN_DEBUG "hfs: create hidden dir...\n");
-		HFSPLUS_SB(sb).hidden_dir = hfsplus_new_inode(sb, S_IFDIR);
-		hfsplus_create_cat(HFSPLUS_SB(sb).hidden_dir->i_ino, sb->s_root->d_inode,
-				   &str, HFSPLUS_SB(sb).hidden_dir);
-		mark_inode_dirty(HFSPLUS_SB(sb).hidden_dir);
+
+		mutex_lock(&sbi->vh_mutex);
+		sbi->hidden_dir = hfsplus_new_inode(sb, S_IFDIR);
+		hfsplus_create_cat(sbi->hidden_dir->i_ino, sb->s_root->d_inode,
+				   &str, sbi->hidden_dir);
+		mutex_unlock(&sbi->vh_mutex);
+
+		mark_inode_dirty(sbi->hidden_dir);
 	}
 out:
 	unload_nls(sbi->nls);
@@ -484,23 +490,21 @@ static struct inode *hfsplus_alloc_inode(struct super_block *sb)
 
 static void hfsplus_destroy_inode(struct inode *inode)
 {
-	kmem_cache_free(hfsplus_inode_cachep, &HFSPLUS_I(inode));
+	kmem_cache_free(hfsplus_inode_cachep, HFSPLUS_I(inode));
 }
 
 #define HFSPLUS_INODE_SIZE	sizeof(struct hfsplus_inode_info)
 
-static int hfsplus_get_sb(struct file_system_type *fs_type,
-			  int flags, const char *dev_name, void *data,
-			  struct vfsmount *mnt)
+static struct dentry *hfsplus_mount(struct file_system_type *fs_type,
+			  int flags, const char *dev_name, void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, hfsplus_fill_super,
-			   mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, hfsplus_fill_super);
 }
 
 static struct file_system_type hfsplus_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "hfsplus",
-	.get_sb		= hfsplus_get_sb,
+	.mount		= hfsplus_mount,
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c
index 628ccf6fa40..b66d67de882 100644
--- a/fs/hfsplus/unicode.c
+++ b/fs/hfsplus/unicode.c
@@ -121,7 +121,7 @@ static u16 *hfsplus_compose_lookup(u16 *p, u16 cc)
 int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, char *astr, int *len_p)
 {
 	const hfsplus_unichr *ip;
-	struct nls_table *nls = HFSPLUS_SB(sb).nls;
+	struct nls_table *nls = HFSPLUS_SB(sb)->nls;
 	u8 *op;
 	u16 cc, c0, c1;
 	u16 *ce1, *ce2;
@@ -132,7 +132,7 @@ int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, c
 	ustrlen = be16_to_cpu(ustr->length);
 	len = *len_p;
 	ce1 = NULL;
-	compose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE);
+	compose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
 
 	while (ustrlen > 0) {
 		c0 = be16_to_cpu(*ip++);
@@ -246,7 +246,7 @@ out:
 static inline int asc2unichar(struct super_block *sb, const char *astr, int len,
 			      wchar_t *uc)
 {
-	int size = HFSPLUS_SB(sb).nls->char2uni(astr, len, uc);
+	int size = HFSPLUS_SB(sb)->nls->char2uni(astr, len, uc);
 	if (size <= 0) {
 		*uc = '?';
 		size = 1;
@@ -293,7 +293,7 @@ int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr,
 	u16 *dstr, outlen = 0;
 	wchar_t c;
 
-	decompose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE);
+	decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
 	while (outlen < HFSPLUS_MAX_STRLEN && len > 0) {
 		size = asc2unichar(sb, astr, len, &c);
 
@@ -330,8 +330,8 @@ int hfsplus_hash_dentry(struct dentry *dentry, struct qstr *str)
 	wchar_t c;
 	u16 c2;
 
-	casefold = (HFSPLUS_SB(sb).flags & HFSPLUS_SB_CASEFOLD);
-	decompose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE);
+	casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags);
+	decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
 	hash = init_name_hash();
 	astr = str->name;
 	len = str->len;
@@ -373,8 +373,8 @@ int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *
 	u16 c1, c2;
 	wchar_t c;
 
-	casefold = (HFSPLUS_SB(sb).flags & HFSPLUS_SB_CASEFOLD);
-	decompose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE);
+	casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags);
+	decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
 	astr1 = s1->name;
 	len1 = s1->len;
 	astr2 = s2->name;
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index bed78ac8f6d..8972c20b321 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -65,8 +65,8 @@ static int hfsplus_get_last_session(struct super_block *sb,
 	*start = 0;
 	*size = sb->s_bdev->bd_inode->i_size >> 9;
 
-	if (HFSPLUS_SB(sb).session >= 0) {
-		te.cdte_track = HFSPLUS_SB(sb).session;
+	if (HFSPLUS_SB(sb)->session >= 0) {
+		te.cdte_track = HFSPLUS_SB(sb)->session;
 		te.cdte_format = CDROM_LBA;
 		res = ioctl_by_bdev(sb->s_bdev, CDROMREADTOCENTRY, (unsigned long)&te);
 		if (!res && (te.cdte_ctrl & CDROM_DATA_TRACK) == 4) {
@@ -87,6 +87,7 @@ static int hfsplus_get_last_session(struct super_block *sb,
 /* Takes in super block, returns true if good data read */
 int hfsplus_read_wrapper(struct super_block *sb)
 {
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
 	struct buffer_head *bh;
 	struct hfsplus_vh *vhdr;
 	struct hfsplus_wd wd;
@@ -122,7 +123,7 @@ int hfsplus_read_wrapper(struct super_block *sb)
 		if (vhdr->signature == cpu_to_be16(HFSPLUS_VOLHEAD_SIG))
 			break;
 		if (vhdr->signature == cpu_to_be16(HFSPLUS_VOLHEAD_SIGX)) {
-			HFSPLUS_SB(sb).flags |= HFSPLUS_SB_HFSX;
+			set_bit(HFSPLUS_SB_HFSX, &sbi->flags);
 			break;
 		}
 		brelse(bh);
@@ -143,11 +144,11 @@ int hfsplus_read_wrapper(struct super_block *sb)
 	if (blocksize < HFSPLUS_SECTOR_SIZE ||
 	    ((blocksize - 1) & blocksize))
 		return -EINVAL;
-	HFSPLUS_SB(sb).alloc_blksz = blocksize;
-	HFSPLUS_SB(sb).alloc_blksz_shift = 0;
+	sbi->alloc_blksz = blocksize;
+	sbi->alloc_blksz_shift = 0;
 	while ((blocksize >>= 1) != 0)
-		HFSPLUS_SB(sb).alloc_blksz_shift++;
-	blocksize = min(HFSPLUS_SB(sb).alloc_blksz, (u32)PAGE_SIZE);
+		sbi->alloc_blksz_shift++;
+	blocksize = min(sbi->alloc_blksz, (u32)PAGE_SIZE);
 
 	/* align block size to block offset */
 	while (part_start & ((blocksize >> HFSPLUS_SECTOR_SHIFT) - 1))
@@ -158,23 +159,26 @@ int hfsplus_read_wrapper(struct super_block *sb)
 		return -EINVAL;
 	}
 
-	HFSPLUS_SB(sb).blockoffset = part_start >>
-			(sb->s_blocksize_bits - HFSPLUS_SECTOR_SHIFT);
-	HFSPLUS_SB(sb).sect_count = part_size;
-	HFSPLUS_SB(sb).fs_shift = HFSPLUS_SB(sb).alloc_blksz_shift -
-			sb->s_blocksize_bits;
+	sbi->blockoffset =
+		part_start >> (sb->s_blocksize_bits - HFSPLUS_SECTOR_SHIFT);
+	sbi->sect_count = part_size;
+	sbi->fs_shift = sbi->alloc_blksz_shift - sb->s_blocksize_bits;
 
 	bh = sb_bread512(sb, part_start + HFSPLUS_VOLHEAD_SECTOR, vhdr);
 	if (!bh)
 		return -EIO;
 
 	/* should still be the same... */
-	if (vhdr->signature != (HFSPLUS_SB(sb).flags & HFSPLUS_SB_HFSX ?
-				cpu_to_be16(HFSPLUS_VOLHEAD_SIGX) :
-				cpu_to_be16(HFSPLUS_VOLHEAD_SIG)))
-		goto error;
-	HFSPLUS_SB(sb).s_vhbh = bh;
-	HFSPLUS_SB(sb).s_vhdr = vhdr;
+	if (test_bit(HFSPLUS_SB_HFSX, &sbi->flags)) {
+		if (vhdr->signature != cpu_to_be16(HFSPLUS_VOLHEAD_SIGX))
+			goto error;
+	} else {
+		if (vhdr->signature != cpu_to_be16(HFSPLUS_VOLHEAD_SIG))
+			goto error;
+	}
+
+	sbi->s_vhbh = bh;
+	sbi->s_vhdr = vhdr;
 
 	return 0;
  error:
diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h
index 2f34f8f2134..bf15a43016b 100644
--- a/fs/hostfs/hostfs.h
+++ b/fs/hostfs/hostfs.h
@@ -28,12 +28,7 @@
  * #define ATTR_KILL_SUID	2048
  * #define ATTR_KILL_SGID	4096
  *
- * and this is because they were added in 2.5 development in this patch:
- *
- * http://linux.bkbits.net:8080/linux-2.5/
- * cset@3caf4a12k4XgDzK7wyK-TGpSZ9u2Ww?nav=index.html
- * |src/.|src/include|src/include/linux|related/include/linux/fs.h
- *
+ * and this is because they were added in 2.5 development.
  * Actually, they are not needed by most ->setattr() methods - they are set by
  * callers of notify_change() to notify that the setuid/setgid bits must be
  * dropped.
@@ -53,18 +48,28 @@ struct hostfs_iattr {
 	struct timespec	ia_ctime;
 };
 
-extern int stat_file(const char *path, unsigned long long *inode_out,
-		     int *mode_out, int *nlink_out, int *uid_out, int *gid_out,
-		     unsigned long long *size_out, struct timespec *atime_out,
-		     struct timespec *mtime_out, struct timespec *ctime_out,
-		     int *blksize_out, unsigned long long *blocks_out, int fd);
+struct hostfs_stat {
+	unsigned long long ino;
+	unsigned int mode;
+	unsigned int nlink;
+	unsigned int uid;
+	unsigned int gid;
+	unsigned long long size;
+	struct timespec atime, mtime, ctime;
+	unsigned int blksize;
+	unsigned long long blocks;
+	unsigned int maj;
+	unsigned int min;
+};
+
+extern int stat_file(const char *path, struct hostfs_stat *p, int fd);
 extern int access_file(char *path, int r, int w, int x);
 extern int open_file(char *path, int r, int w, int append);
-extern int file_type(const char *path, int *maj, int *min);
 extern void *open_dir(char *path, int *err_out);
 extern char *read_dir(void *stream, unsigned long long *pos,
 		      unsigned long long *ino_out, int *len_out);
 extern void close_file(void *stream);
+extern int replace_file(int oldfd, int fd);
 extern void close_dir(void *stream);
 extern int read_file(int fd, unsigned long long *offset, char *buf, int len);
 extern int write_file(int fd, unsigned long long *offset, const char *buf,
@@ -86,7 +91,6 @@ extern int rename_file(char *from, char *to);
 extern int do_statfs(char *root, long *bsize_out, long long *blocks_out,
 		     long long *bfree_out, long long *bavail_out,
 		     long long *files_out, long long *ffree_out,
-		     void *fsid_out, int fsid_size, long *namelen_out,
-		     long *spare_out);
+		     void *fsid_out, int fsid_size, long *namelen_out);
 
 #endif
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 87ac1891a18..2c0f148a49e 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -14,12 +14,12 @@
 #include <linux/slab.h>
 #include <linux/seq_file.h>
 #include <linux/mount.h>
+#include <linux/namei.h>
 #include "hostfs.h"
 #include "init.h"
 #include "kern.h"
 
 struct hostfs_inode_info {
-	char *host_filename;
 	int fd;
 	fmode_t mode;
 	struct inode vfs_inode;
@@ -49,7 +49,7 @@ static int append = 0;
 
 static const struct inode_operations hostfs_iops;
 static const struct inode_operations hostfs_dir_iops;
-static const struct address_space_operations hostfs_link_aops;
+static const struct inode_operations hostfs_link_iops;
 
 #ifndef MODULE
 static int __init hostfs_args(char *options, int *add)
@@ -90,71 +90,58 @@ __uml_setup("hostfs=", hostfs_args,
 );
 #endif
 
-static char *dentry_name(struct dentry *dentry, int extra)
+static char *__dentry_name(struct dentry *dentry, char *name)
 {
-	struct dentry *parent;
-	char *root, *name;
-	int len;
-
-	len = 0;
-	parent = dentry;
-	while (parent->d_parent != parent) {
-		len += parent->d_name.len + 1;
-		parent = parent->d_parent;
-	}
+	char *p = __dentry_path(dentry, name, PATH_MAX);
+	char *root;
+	size_t len;
 
-	root = HOSTFS_I(parent->d_inode)->host_filename;
-	len += strlen(root);
-	name = kmalloc(len + extra + 1, GFP_KERNEL);
-	if (name == NULL)
-		return NULL;
+	spin_unlock(&dcache_lock);
 
-	name[len] = '\0';
-	parent = dentry;
-	while (parent->d_parent != parent) {
-		len -= parent->d_name.len + 1;
-		name[len] = '/';
-		strncpy(&name[len + 1], parent->d_name.name,
-			parent->d_name.len);
-		parent = parent->d_parent;
+	root = dentry->d_sb->s_fs_info;
+	len = strlen(root);
+	if (IS_ERR(p)) {
+		__putname(name);
+		return NULL;
+	}
+	strlcpy(name, root, PATH_MAX);
+	if (len > p - name) {
+		__putname(name);
+		return NULL;
+	}
+	if (p > name + len) {
+		char *s = name + len;
+		while ((*s++ = *p++) != '\0')
+			;
 	}
-	strncpy(name, root, strlen(root));
 	return name;
 }
 
-static char *inode_name(struct inode *ino, int extra)
+static char *dentry_name(struct dentry *dentry)
 {
-	struct dentry *dentry;
+	char *name = __getname();
+	if (!name)
+		return NULL;
 
-	dentry = list_entry(ino->i_dentry.next, struct dentry, d_alias);
-	return dentry_name(dentry, extra);
+	spin_lock(&dcache_lock);
+	return __dentry_name(dentry, name); /* will unlock */
 }
 
-static int read_name(struct inode *ino, char *name)
+static char *inode_name(struct inode *ino)
 {
-	/*
-	 * The non-int inode fields are copied into ints by stat_file and
-	 * then copied into the inode because passing the actual pointers
-	 * in and having them treated as int * breaks on big-endian machines
-	 */
-	int err;
-	int i_mode, i_nlink, i_blksize;
-	unsigned long long i_size;
-	unsigned long long i_ino;
-	unsigned long long i_blocks;
-
-	err = stat_file(name, &i_ino, &i_mode, &i_nlink, &ino->i_uid,
-			&ino->i_gid, &i_size, &ino->i_atime, &ino->i_mtime,
-			&ino->i_ctime, &i_blksize, &i_blocks, -1);
-	if (err)
-		return err;
+	struct dentry *dentry;
+	char *name = __getname();
+	if (!name)
+		return NULL;
 
-	ino->i_ino = i_ino;
-	ino->i_mode = i_mode;
-	ino->i_nlink = i_nlink;
-	ino->i_size = i_size;
-	ino->i_blocks = i_blocks;
-	return 0;
+	spin_lock(&dcache_lock);
+	if (list_empty(&ino->i_dentry)) {
+		spin_unlock(&dcache_lock);
+		__putname(name);
+		return NULL;
+	}
+	dentry = list_first_entry(&ino->i_dentry, struct dentry, d_alias);
+	return __dentry_name(dentry, name); /* will unlock */
 }
 
 static char *follow_link(char *link)
@@ -205,53 +192,11 @@ static char *follow_link(char *link)
 	return ERR_PTR(n);
 }
 
-static int hostfs_read_inode(struct inode *ino)
-{
-	char *name;
-	int err = 0;
-
-	/*
-	 * Unfortunately, we are called from iget() when we don't have a dentry
-	 * allocated yet.
-	 */
-	if (list_empty(&ino->i_dentry))
-		goto out;
-
-	err = -ENOMEM;
-	name = inode_name(ino, 0);
-	if (name == NULL)
-		goto out;
-
-	if (file_type(name, NULL, NULL) == OS_TYPE_SYMLINK) {
-		name = follow_link(name);
-		if (IS_ERR(name)) {
-			err = PTR_ERR(name);
-			goto out;
-		}
-	}
-
-	err = read_name(ino, name);
-	kfree(name);
- out:
-	return err;
-}
-
 static struct inode *hostfs_iget(struct super_block *sb)
 {
-	struct inode *inode;
-	long ret;
-
-	inode = iget_locked(sb, 0);
+	struct inode *inode = new_inode(sb);
 	if (!inode)
 		return ERR_PTR(-ENOMEM);
-	if (inode->i_state & I_NEW) {
-		ret = hostfs_read_inode(inode);
-		if (ret < 0) {
-			iget_failed(inode);
-			return ERR_PTR(ret);
-		}
-		unlock_new_inode(inode);
-	}
 	return inode;
 }
 
@@ -269,10 +214,10 @@ int hostfs_statfs(struct dentry *dentry, struct kstatfs *sf)
 	long long f_files;
 	long long f_ffree;
 
-	err = do_statfs(HOSTFS_I(dentry->d_sb->s_root->d_inode)->host_filename,
+	err = do_statfs(dentry->d_sb->s_fs_info,
 			&sf->f_bsize, &f_blocks, &f_bfree, &f_bavail, &f_files,
 			&f_ffree, &sf->f_fsid, sizeof(sf->f_fsid),
-			&sf->f_namelen, sf->f_spare);
+			&sf->f_namelen);
 	if (err)
 		return err;
 	sf->f_blocks = f_blocks;
@@ -288,47 +233,32 @@ static struct inode *hostfs_alloc_inode(struct super_block *sb)
 {
 	struct hostfs_inode_info *hi;
 
-	hi = kmalloc(sizeof(*hi), GFP_KERNEL);
+	hi = kzalloc(sizeof(*hi), GFP_KERNEL);
 	if (hi == NULL)
 		return NULL;
-
-	*hi = ((struct hostfs_inode_info) { .host_filename	= NULL,
-					    .fd			= -1,
-					    .mode		= 0 });
+	hi->fd = -1;
 	inode_init_once(&hi->vfs_inode);
 	return &hi->vfs_inode;
 }
 
-static void hostfs_delete_inode(struct inode *inode)
+static void hostfs_evict_inode(struct inode *inode)
 {
 	truncate_inode_pages(&inode->i_data, 0);
+	end_writeback(inode);
 	if (HOSTFS_I(inode)->fd != -1) {
 		close_file(&HOSTFS_I(inode)->fd);
 		HOSTFS_I(inode)->fd = -1;
 	}
-	clear_inode(inode);
 }
 
 static void hostfs_destroy_inode(struct inode *inode)
 {
-	kfree(HOSTFS_I(inode)->host_filename);
-
-	/*
-	 * XXX: This should not happen, probably. The check is here for
-	 * additional safety.
-	 */
-	if (HOSTFS_I(inode)->fd != -1) {
-		close_file(&HOSTFS_I(inode)->fd);
-		printk(KERN_DEBUG "Closing host fd in .destroy_inode\n");
-	}
-
 	kfree(HOSTFS_I(inode));
 }
 
 static int hostfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
 {
-	struct inode *root = vfs->mnt_sb->s_root->d_inode;
-	const char *root_path = HOSTFS_I(root)->host_filename;
+	const char *root_path = vfs->mnt_sb->s_fs_info;
 	size_t offset = strlen(root_ino) + 1;
 
 	if (strlen(root_path) > offset)
@@ -339,9 +269,8 @@ static int hostfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
 
 static const struct super_operations hostfs_sbops = {
 	.alloc_inode	= hostfs_alloc_inode,
-	.drop_inode	= generic_delete_inode,
-	.delete_inode   = hostfs_delete_inode,
 	.destroy_inode	= hostfs_destroy_inode,
+	.evict_inode	= hostfs_evict_inode,
 	.statfs		= hostfs_statfs,
 	.show_options	= hostfs_show_options,
 };
@@ -353,11 +282,11 @@ int hostfs_readdir(struct file *file, void *ent, filldir_t filldir)
 	unsigned long long next, ino;
 	int error, len;
 
-	name = dentry_name(file->f_path.dentry, 0);
+	name = dentry_name(file->f_path.dentry);
 	if (name == NULL)
 		return -ENOMEM;
 	dir = open_dir(name, &error);
-	kfree(name);
+	__putname(name);
 	if (dir == NULL)
 		return -error;
 	next = file->f_pos;
@@ -373,40 +302,59 @@ int hostfs_readdir(struct file *file, void *ent, filldir_t filldir)
 
 int hostfs_file_open(struct inode *ino, struct file *file)
 {
+	static DEFINE_MUTEX(open_mutex);
 	char *name;
 	fmode_t mode = 0;
+	int err;
 	int r = 0, w = 0, fd;
 
 	mode = file->f_mode & (FMODE_READ | FMODE_WRITE);
 	if ((mode & HOSTFS_I(ino)->mode) == mode)
 		return 0;
 
-	/*
-	 * The file may already have been opened, but with the wrong access,
-	 * so this resets things and reopens the file with the new access.
-	 */
-	if (HOSTFS_I(ino)->fd != -1) {
-		close_file(&HOSTFS_I(ino)->fd);
-		HOSTFS_I(ino)->fd = -1;
-	}
+	mode |= HOSTFS_I(ino)->mode;
 
-	HOSTFS_I(ino)->mode |= mode;
-	if (HOSTFS_I(ino)->mode & FMODE_READ)
+retry:
+	if (mode & FMODE_READ)
 		r = 1;
-	if (HOSTFS_I(ino)->mode & FMODE_WRITE)
+	if (mode & FMODE_WRITE)
 		w = 1;
 	if (w)
 		r = 1;
 
-	name = dentry_name(file->f_path.dentry, 0);
+	name = dentry_name(file->f_path.dentry);
 	if (name == NULL)
 		return -ENOMEM;
 
 	fd = open_file(name, r, w, append);
-	kfree(name);
+	__putname(name);
 	if (fd < 0)
 		return fd;
-	FILE_HOSTFS_I(file)->fd = fd;
+
+	mutex_lock(&open_mutex);
+	/* somebody else had handled it first? */
+	if ((mode & HOSTFS_I(ino)->mode) == mode) {
+		mutex_unlock(&open_mutex);
+		return 0;
+	}
+	if ((mode | HOSTFS_I(ino)->mode) != mode) {
+		mode |= HOSTFS_I(ino)->mode;
+		mutex_unlock(&open_mutex);
+		close_file(&fd);
+		goto retry;
+	}
+	if (HOSTFS_I(ino)->fd == -1) {
+		HOSTFS_I(ino)->fd = fd;
+	} else {
+		err = replace_file(fd, HOSTFS_I(ino)->fd);
+		close_file(&fd);
+		if (err < 0) {
+			mutex_unlock(&open_mutex);
+			return err;
+		}
+	}
+	HOSTFS_I(ino)->mode = mode;
+	mutex_unlock(&open_mutex);
 
 	return 0;
 }
@@ -544,54 +492,50 @@ static const struct address_space_operations hostfs_aops = {
 	.write_end	= hostfs_write_end,
 };
 
-static int init_inode(struct inode *inode, struct dentry *dentry)
+static int read_name(struct inode *ino, char *name)
 {
-	char *name;
-	int type, err = -ENOMEM;
-	int maj, min;
-	dev_t rdev = 0;
+	dev_t rdev;
+	struct hostfs_stat st;
+	int err = stat_file(name, &st, -1);
+	if (err)
+		return err;
 
-	if (dentry) {
-		name = dentry_name(dentry, 0);
-		if (name == NULL)
-			goto out;
-		type = file_type(name, &maj, &min);
-		/* Reencode maj and min with the kernel encoding.*/
-		rdev = MKDEV(maj, min);
-		kfree(name);
-	}
-	else type = OS_TYPE_DIR;
+	/* Reencode maj and min with the kernel encoding.*/
+	rdev = MKDEV(st.maj, st.min);
 
-	err = 0;
-	if (type == OS_TYPE_SYMLINK)
-		inode->i_op = &page_symlink_inode_operations;
-	else if (type == OS_TYPE_DIR)
-		inode->i_op = &hostfs_dir_iops;
-	else inode->i_op = &hostfs_iops;
-
-	if (type == OS_TYPE_DIR) inode->i_fop = &hostfs_dir_fops;
-	else inode->i_fop = &hostfs_file_fops;
-
-	if (type == OS_TYPE_SYMLINK)
-		inode->i_mapping->a_ops = &hostfs_link_aops;
-	else inode->i_mapping->a_ops = &hostfs_aops;
-
-	switch (type) {
-	case OS_TYPE_CHARDEV:
-		init_special_inode(inode, S_IFCHR, rdev);
+	switch (st.mode & S_IFMT) {
+	case S_IFLNK:
+		ino->i_op = &hostfs_link_iops;
 		break;
-	case OS_TYPE_BLOCKDEV:
-		init_special_inode(inode, S_IFBLK, rdev);
+	case S_IFDIR:
+		ino->i_op = &hostfs_dir_iops;
+		ino->i_fop = &hostfs_dir_fops;
 		break;
-	case OS_TYPE_FIFO:
-		init_special_inode(inode, S_IFIFO, 0);
+	case S_IFCHR:
+	case S_IFBLK:
+	case S_IFIFO:
+	case S_IFSOCK:
+		init_special_inode(ino, st.mode & S_IFMT, rdev);
+		ino->i_op = &hostfs_iops;
 		break;
-	case OS_TYPE_SOCK:
-		init_special_inode(inode, S_IFSOCK, 0);
-		break;
-	}
- out:
-	return err;
+
+	default:
+		ino->i_op = &hostfs_iops;
+		ino->i_fop = &hostfs_file_fops;
+		ino->i_mapping->a_ops = &hostfs_aops;
+	}
+
+	ino->i_ino = st.ino;
+	ino->i_mode = st.mode;
+	ino->i_nlink = st.nlink;
+	ino->i_uid = st.uid;
+	ino->i_gid = st.gid;
+	ino->i_atime = st.atime;
+	ino->i_mtime = st.mtime;
+	ino->i_ctime = st.ctime;
+	ino->i_size = st.size;
+	ino->i_blocks = st.blocks;
+	return 0;
 }
 
 int hostfs_create(struct inode *dir, struct dentry *dentry, int mode,
@@ -607,12 +551,8 @@ int hostfs_create(struct inode *dir, struct dentry *dentry, int mode,
 		goto out;
 	}
 
-	error = init_inode(inode, dentry);
-	if (error)
-		goto out_put;
-
 	error = -ENOMEM;
-	name = dentry_name(dentry, 0);
+	name = dentry_name(dentry);
 	if (name == NULL)
 		goto out_put;
 
@@ -622,9 +562,10 @@ int hostfs_create(struct inode *dir, struct dentry *dentry, int mode,
 			 mode & S_IROTH, mode & S_IWOTH, mode & S_IXOTH);
 	if (fd < 0)
 		error = fd;
-	else error = read_name(inode, name);
+	else
+		error = read_name(inode, name);
 
-	kfree(name);
+	__putname(name);
 	if (error)
 		goto out_put;
 
@@ -652,17 +593,14 @@ struct dentry *hostfs_lookup(struct inode *ino, struct dentry *dentry,
 		goto out;
 	}
 
-	err = init_inode(inode, dentry);
-	if (err)
-		goto out_put;
-
 	err = -ENOMEM;
-	name = dentry_name(dentry, 0);
+	name = dentry_name(dentry);
 	if (name == NULL)
 		goto out_put;
 
 	err = read_name(inode, name);
-	kfree(name);
+
+	__putname(name);
 	if (err == -ENOENT) {
 		iput(inode);
 		inode = NULL;
@@ -680,36 +618,21 @@ struct dentry *hostfs_lookup(struct inode *ino, struct dentry *dentry,
 	return ERR_PTR(err);
 }
 
-static char *inode_dentry_name(struct inode *ino, struct dentry *dentry)
-{
-	char *file;
-	int len;
-
-	file = inode_name(ino, dentry->d_name.len + 1);
-	if (file == NULL)
-		return NULL;
-	strcat(file, "/");
-	len = strlen(file);
-	strncat(file, dentry->d_name.name, dentry->d_name.len);
-	file[len + dentry->d_name.len] = '\0';
-	return file;
-}
-
 int hostfs_link(struct dentry *to, struct inode *ino, struct dentry *from)
 {
 	char *from_name, *to_name;
 	int err;
 
-	if ((from_name = inode_dentry_name(ino, from)) == NULL)
+	if ((from_name = dentry_name(from)) == NULL)
 		return -ENOMEM;
-	to_name = dentry_name(to, 0);
+	to_name = dentry_name(to);
 	if (to_name == NULL) {
-		kfree(from_name);
+		__putname(from_name);
 		return -ENOMEM;
 	}
 	err = link_file(to_name, from_name);
-	kfree(from_name);
-	kfree(to_name);
+	__putname(from_name);
+	__putname(to_name);
 	return err;
 }
 
@@ -718,13 +641,14 @@ int hostfs_unlink(struct inode *ino, struct dentry *dentry)
 	char *file;
 	int err;
 
-	if ((file = inode_dentry_name(ino, dentry)) == NULL)
-		return -ENOMEM;
 	if (append)
 		return -EPERM;
 
+	if ((file = dentry_name(dentry)) == NULL)
+		return -ENOMEM;
+
 	err = unlink_file(file);
-	kfree(file);
+	__putname(file);
 	return err;
 }
 
@@ -733,10 +657,10 @@ int hostfs_symlink(struct inode *ino, struct dentry *dentry, const char *to)
 	char *file;
 	int err;
 
-	if ((file = inode_dentry_name(ino, dentry)) == NULL)
+	if ((file = dentry_name(dentry)) == NULL)
 		return -ENOMEM;
 	err = make_symlink(file, to);
-	kfree(file);
+	__putname(file);
 	return err;
 }
 
@@ -745,10 +669,10 @@ int hostfs_mkdir(struct inode *ino, struct dentry *dentry, int mode)
 	char *file;
 	int err;
 
-	if ((file = inode_dentry_name(ino, dentry)) == NULL)
+	if ((file = dentry_name(dentry)) == NULL)
 		return -ENOMEM;
 	err = do_mkdir(file, mode);
-	kfree(file);
+	__putname(file);
 	return err;
 }
 
@@ -757,10 +681,10 @@ int hostfs_rmdir(struct inode *ino, struct dentry *dentry)
 	char *file;
 	int err;
 
-	if ((file = inode_dentry_name(ino, dentry)) == NULL)
+	if ((file = dentry_name(dentry)) == NULL)
 		return -ENOMEM;
 	err = do_rmdir(file);
-	kfree(file);
+	__putname(file);
 	return err;
 }
 
@@ -776,22 +700,20 @@ int hostfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
 		goto out;
 	}
 
-	err = init_inode(inode, dentry);
-	if (err)
-		goto out_put;
-
 	err = -ENOMEM;
-	name = dentry_name(dentry, 0);
+	name = dentry_name(dentry);
 	if (name == NULL)
 		goto out_put;
 
 	init_special_inode(inode, mode, dev);
 	err = do_mknod(name, mode, MAJOR(dev), MINOR(dev));
-	if (err)
+	if (!err)
 		goto out_free;
 
 	err = read_name(inode, name);
-	kfree(name);
+	__putname(name);
+	if (err)
+		goto out_put;
 	if (err)
 		goto out_put;
 
@@ -799,7 +721,7 @@ int hostfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
 	return 0;
 
  out_free:
-	kfree(name);
+	__putname(name);
  out_put:
 	iput(inode);
  out:
@@ -812,15 +734,15 @@ int hostfs_rename(struct inode *from_ino, struct dentry *from,
 	char *from_name, *to_name;
 	int err;
 
-	if ((from_name = inode_dentry_name(from_ino, from)) == NULL)
+	if ((from_name = dentry_name(from)) == NULL)
 		return -ENOMEM;
-	if ((to_name = inode_dentry_name(to_ino, to)) == NULL) {
-		kfree(from_name);
+	if ((to_name = dentry_name(to)) == NULL) {
+		__putname(from_name);
 		return -ENOMEM;
 	}
 	err = rename_file(from_name, to_name);
-	kfree(from_name);
-	kfree(to_name);
+	__putname(from_name);
+	__putname(to_name);
 	return err;
 }
 
@@ -832,7 +754,7 @@ int hostfs_permission(struct inode *ino, int desired)
 	if (desired & MAY_READ) r = 1;
 	if (desired & MAY_WRITE) w = 1;
 	if (desired & MAY_EXEC) x = 1;
-	name = inode_name(ino, 0);
+	name = inode_name(ino);
 	if (name == NULL)
 		return -ENOMEM;
 
@@ -841,7 +763,7 @@ int hostfs_permission(struct inode *ino, int desired)
 		err = 0;
 	else
 		err = access_file(name, r, w, x);
-	kfree(name);
+	__putname(name);
 	if (!err)
 		err = generic_permission(ino, desired, NULL);
 	return err;
@@ -849,13 +771,14 @@ int hostfs_permission(struct inode *ino, int desired)
 
 int hostfs_setattr(struct dentry *dentry, struct iattr *attr)
 {
+	struct inode *inode = dentry->d_inode;
 	struct hostfs_iattr attrs;
 	char *name;
 	int err;
 
-	int fd = HOSTFS_I(dentry->d_inode)->fd;
+	int fd = HOSTFS_I(inode)->fd;
 
-	err = inode_change_ok(dentry->d_inode, attr);
+	err = inode_change_ok(inode, attr);
 	if (err)
 		return err;
 
@@ -897,15 +820,26 @@ int hostfs_setattr(struct dentry *dentry, struct iattr *attr)
 	if (attr->ia_valid & ATTR_MTIME_SET) {
 		attrs.ia_valid |= HOSTFS_ATTR_MTIME_SET;
 	}
-	name = dentry_name(dentry, 0);
+	name = dentry_name(dentry);
 	if (name == NULL)
 		return -ENOMEM;
 	err = set_attr(name, &attrs, fd);
-	kfree(name);
+	__putname(name);
 	if (err)
 		return err;
 
-	return inode_setattr(dentry->d_inode, attr);
+	if ((attr->ia_valid & ATTR_SIZE) &&
+	    attr->ia_size != i_size_read(inode)) {
+		int error;
+
+		error = vmtruncate(inode, attr->ia_size);
+		if (err)
+			return err;
+	}
+
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+	return 0;
 }
 
 static const struct inode_operations hostfs_iops = {
@@ -935,32 +869,41 @@ static const struct inode_operations hostfs_dir_iops = {
 	.setattr	= hostfs_setattr,
 };
 
-int hostfs_link_readpage(struct file *file, struct page *page)
-{
-	char *buffer, *name;
-	int err;
-
-	buffer = kmap(page);
-	name = inode_name(page->mapping->host, 0);
-	if (name == NULL)
-		return -ENOMEM;
-	err = hostfs_do_readlink(name, buffer, PAGE_CACHE_SIZE);
-	kfree(name);
-	if (err == PAGE_CACHE_SIZE)
-		err = -E2BIG;
-	else if (err > 0) {
-		flush_dcache_page(page);
-		SetPageUptodate(page);
-		if (PageError(page)) ClearPageError(page);
-		err = 0;
+static void *hostfs_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	char *link = __getname();
+	if (link) {
+		char *path = dentry_name(dentry);
+		int err = -ENOMEM;
+		if (path) {
+			err = hostfs_do_readlink(path, link, PATH_MAX);
+			if (err == PATH_MAX)
+				err = -E2BIG;
+			__putname(path);
+		}
+		if (err < 0) {
+			__putname(link);
+			link = ERR_PTR(err);
+		}
+	} else {
+		link = ERR_PTR(-ENOMEM);
 	}
-	kunmap(page);
-	unlock_page(page);
-	return err;
+
+	nd_set_link(nd, link);
+	return NULL;
 }
 
-static const struct address_space_operations hostfs_link_aops = {
-	.readpage	= hostfs_link_readpage,
+static void hostfs_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
+{
+	char *s = nd_get_link(nd);
+	if (!IS_ERR(s))
+		__putname(s);
+}
+
+static const struct inode_operations hostfs_link_iops = {
+	.readlink	= generic_readlink,
+	.follow_link	= hostfs_follow_link,
+	.put_link	= hostfs_put_link,
 };
 
 static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent)
@@ -980,65 +923,63 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent)
 		req_root = "";
 
 	err = -ENOMEM;
-	host_root_path = kmalloc(strlen(root_ino) + 1
-				 + strlen(req_root) + 1, GFP_KERNEL);
+	sb->s_fs_info = host_root_path =
+		kmalloc(strlen(root_ino) + strlen(req_root) + 2, GFP_KERNEL);
 	if (host_root_path == NULL)
 		goto out;
 
 	sprintf(host_root_path, "%s/%s", root_ino, req_root);
 
-	root_inode = hostfs_iget(sb);
-	if (IS_ERR(root_inode)) {
-		err = PTR_ERR(root_inode);
-		goto out_free;
-	}
+	root_inode = new_inode(sb);
+	if (!root_inode)
+		goto out;
 
-	err = init_inode(root_inode, NULL);
+	err = read_name(root_inode, host_root_path);
 	if (err)
 		goto out_put;
 
-	HOSTFS_I(root_inode)->host_filename = host_root_path;
-	/*
-	 * Avoid that in the error path, iput(root_inode) frees again
-	 * host_root_path through hostfs_destroy_inode!
-	 */
-	host_root_path = NULL;
+	if (S_ISLNK(root_inode->i_mode)) {
+		char *name = follow_link(host_root_path);
+		if (IS_ERR(name))
+			err = PTR_ERR(name);
+		else
+			err = read_name(root_inode, name);
+		kfree(name);
+		if (err)
+			goto out_put;
+	}
 
 	err = -ENOMEM;
 	sb->s_root = d_alloc_root(root_inode);
 	if (sb->s_root == NULL)
 		goto out_put;
 
-	err = hostfs_read_inode(root_inode);
-	if (err) {
-		/* No iput in this case because the dput does that for us */
-		dput(sb->s_root);
-		sb->s_root = NULL;
-		goto out;
-	}
-
 	return 0;
 
 out_put:
 	iput(root_inode);
-out_free:
-	kfree(host_root_path);
 out:
 	return err;
 }
 
-static int hostfs_read_sb(struct file_system_type *type,
+static struct dentry *hostfs_read_sb(struct file_system_type *type,
 			  int flags, const char *dev_name,
-			  void *data, struct vfsmount *mnt)
+			  void *data)
+{
+	return mount_nodev(type, flags, data, hostfs_fill_sb_common);
+}
+
+static void hostfs_kill_sb(struct super_block *s)
 {
-	return get_sb_nodev(type, flags, data, hostfs_fill_sb_common, mnt);
+	kill_anon_super(s);
+	kfree(s->s_fs_info);
 }
 
 static struct file_system_type hostfs_type = {
 	.owner 		= THIS_MODULE,
 	.name 		= "hostfs",
-	.get_sb 	= hostfs_read_sb,
-	.kill_sb	= kill_anon_super,
+	.mount	 	= hostfs_read_sb,
+	.kill_sb	= hostfs_kill_sb,
 	.fs_flags 	= 0,
 };
 
diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c
index b79424f9328..d51a98384bc 100644
--- a/fs/hostfs/hostfs_user.c
+++ b/fs/hostfs/hostfs_user.c
@@ -19,11 +19,27 @@
 #include "user.h"
 #include <utime.h>
 
-int stat_file(const char *path, unsigned long long *inode_out, int *mode_out,
-	      int *nlink_out, int *uid_out, int *gid_out,
-	      unsigned long long *size_out, struct timespec *atime_out,
-	      struct timespec *mtime_out, struct timespec *ctime_out,
-	      int *blksize_out, unsigned long long *blocks_out, int fd)
+static void stat64_to_hostfs(const struct stat64 *buf, struct hostfs_stat *p)
+{
+	p->ino = buf->st_ino;
+	p->mode = buf->st_mode;
+	p->nlink = buf->st_nlink;
+	p->uid = buf->st_uid;
+	p->gid = buf->st_gid;
+	p->size = buf->st_size;
+	p->atime.tv_sec = buf->st_atime;
+	p->atime.tv_nsec = 0;
+	p->ctime.tv_sec = buf->st_ctime;
+	p->ctime.tv_nsec = 0;
+	p->mtime.tv_sec = buf->st_mtime;
+	p->mtime.tv_nsec = 0;
+	p->blksize = buf->st_blksize;
+	p->blocks = buf->st_blocks;
+	p->maj = os_major(buf->st_rdev);
+	p->min = os_minor(buf->st_rdev);
+}
+
+int stat_file(const char *path, struct hostfs_stat *p, int fd)
 {
 	struct stat64 buf;
 
@@ -33,68 +49,10 @@ int stat_file(const char *path, unsigned long long *inode_out, int *mode_out,
 	} else if (lstat64(path, &buf) < 0) {
 		return -errno;
 	}
-
-	if (inode_out != NULL)
-		*inode_out = buf.st_ino;
-	if (mode_out != NULL)
-		*mode_out = buf.st_mode;
-	if (nlink_out != NULL)
-		*nlink_out = buf.st_nlink;
-	if (uid_out != NULL)
-		*uid_out = buf.st_uid;
-	if (gid_out != NULL)
-		*gid_out = buf.st_gid;
-	if (size_out != NULL)
-		*size_out = buf.st_size;
-	if (atime_out != NULL) {
-		atime_out->tv_sec = buf.st_atime;
-		atime_out->tv_nsec = 0;
-	}
-	if (mtime_out != NULL) {
-		mtime_out->tv_sec = buf.st_mtime;
-		mtime_out->tv_nsec = 0;
-	}
-	if (ctime_out != NULL) {
-		ctime_out->tv_sec = buf.st_ctime;
-		ctime_out->tv_nsec = 0;
-	}
-	if (blksize_out != NULL)
-		*blksize_out = buf.st_blksize;
-	if (blocks_out != NULL)
-		*blocks_out = buf.st_blocks;
+	stat64_to_hostfs(&buf, p);
 	return 0;
 }
 
-int file_type(const char *path, int *maj, int *min)
-{
- 	struct stat64 buf;
-
-	if (lstat64(path, &buf) < 0)
-		return -errno;
-	/*
-	 * We cannot pass rdev as is because glibc and the kernel disagree
-	 * about its definition.
-	 */
-	if (maj != NULL)
-		*maj = major(buf.st_rdev);
-	if (min != NULL)
-		*min = minor(buf.st_rdev);
-
-	if (S_ISDIR(buf.st_mode))
-		return OS_TYPE_DIR;
-	else if (S_ISLNK(buf.st_mode))
-		return OS_TYPE_SYMLINK;
-	else if (S_ISCHR(buf.st_mode))
-		return OS_TYPE_CHARDEV;
-	else if (S_ISBLK(buf.st_mode))
-		return OS_TYPE_BLOCKDEV;
-	else if (S_ISFIFO(buf.st_mode))
-		return OS_TYPE_FIFO;
-	else if (S_ISSOCK(buf.st_mode))
-		return OS_TYPE_SOCK;
-	else return OS_TYPE_FILE;
-}
-
 int access_file(char *path, int r, int w, int x)
 {
 	int mode = 0;
@@ -136,8 +94,7 @@ void *open_dir(char *path, int *err_out)
 
 	dir = opendir(path);
 	*err_out = errno;
-	if (dir == NULL)
-		return NULL;
+
 	return dir;
 }
 
@@ -202,6 +159,11 @@ int fsync_file(int fd, int datasync)
 	return 0;
 }
 
+int replace_file(int oldfd, int fd)
+{
+	return dup2(oldfd, fd);
+}
+
 void close_file(void *stream)
 {
 	close(*((int *) stream));
@@ -235,14 +197,14 @@ int file_create(char *name, int ur, int uw, int ux, int gr,
 
 int set_attr(const char *file, struct hostfs_iattr *attrs, int fd)
 {
+	struct hostfs_stat st;
 	struct timeval times[2];
-	struct timespec atime_ts, mtime_ts;
 	int err, ma;
 
 	if (attrs->ia_valid & HOSTFS_ATTR_MODE) {
 		if (fd >= 0) {
 			if (fchmod(fd, attrs->ia_mode) != 0)
-				return (-errno);
+				return -errno;
 		} else if (chmod(file, attrs->ia_mode) != 0) {
 			return -errno;
 		}
@@ -279,15 +241,14 @@ int set_attr(const char *file, struct hostfs_iattr *attrs, int fd)
 	 */
 	ma = (HOSTFS_ATTR_ATIME_SET | HOSTFS_ATTR_MTIME_SET);
 	if (attrs->ia_valid & ma) {
-		err = stat_file(file, NULL, NULL, NULL, NULL, NULL, NULL,
-				&atime_ts, &mtime_ts, NULL, NULL, NULL, fd);
+		err = stat_file(file, &st, fd);
 		if (err != 0)
 			return err;
 
-		times[0].tv_sec = atime_ts.tv_sec;
-		times[0].tv_usec = atime_ts.tv_nsec / 1000;
-		times[1].tv_sec = mtime_ts.tv_sec;
-		times[1].tv_usec = mtime_ts.tv_nsec / 1000;
+		times[0].tv_sec = st.atime.tv_sec;
+		times[0].tv_usec = st.atime.tv_nsec / 1000;
+		times[1].tv_sec = st.mtime.tv_sec;
+		times[1].tv_usec = st.mtime.tv_nsec / 1000;
 
 		if (attrs->ia_valid & HOSTFS_ATTR_ATIME_SET) {
 			times[0].tv_sec = attrs->ia_atime.tv_sec;
@@ -308,9 +269,9 @@ int set_attr(const char *file, struct hostfs_iattr *attrs, int fd)
 
 	/* Note: ctime is not handled */
 	if (attrs->ia_valid & (HOSTFS_ATTR_ATIME | HOSTFS_ATTR_MTIME)) {
-		err = stat_file(file, NULL, NULL, NULL, NULL, NULL, NULL,
-				&attrs->ia_atime, &attrs->ia_mtime, NULL,
-				NULL, NULL, fd);
+		err = stat_file(file, &st, fd);
+		attrs->ia_atime = st.atime;
+		attrs->ia_mtime = st.mtime;
 		if (err != 0)
 			return err;
 	}
@@ -361,7 +322,7 @@ int do_mknod(const char *file, int mode, unsigned int major, unsigned int minor)
 {
 	int err;
 
-	err = mknod(file, mode, makedev(major, minor));
+	err = mknod(file, mode, os_makedev(major, minor));
 	if (err)
 		return -errno;
 	return 0;
@@ -402,8 +363,7 @@ int rename_file(char *from, char *to)
 int do_statfs(char *root, long *bsize_out, long long *blocks_out,
 	      long long *bfree_out, long long *bavail_out,
 	      long long *files_out, long long *ffree_out,
-	      void *fsid_out, int fsid_size, long *namelen_out,
-	      long *spare_out)
+	      void *fsid_out, int fsid_size, long *namelen_out)
 {
 	struct statfs64 buf;
 	int err;
@@ -422,10 +382,6 @@ int do_statfs(char *root, long *bsize_out, long long *blocks_out,
 	       sizeof(buf.f_fsid) > fsid_size ? fsid_size :
 	       sizeof(buf.f_fsid));
 	*namelen_out = buf.f_namelen;
-	spare_out[0] = buf.f_spare[0];
-	spare_out[1] = buf.f_spare[1];
-	spare_out[2] = buf.f_spare[2];
-	spare_out[3] = buf.f_spare[3];
-	spare_out[4] = buf.f_spare[4];
+
 	return 0;
 }
diff --git a/fs/hpfs/Kconfig b/fs/hpfs/Kconfig
index 56bd15c5bf6..63b6f563231 100644
--- a/fs/hpfs/Kconfig
+++ b/fs/hpfs/Kconfig
@@ -1,6 +1,7 @@
 config HPFS_FS
 	tristate "OS/2 HPFS file system support"
 	depends on BLOCK
+	depends on BKL # nontrivial to fix
 	help
 	  OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS
 	  is the file system used for organizing files on OS/2 hard disk
diff --git a/fs/hpfs/buffer.c b/fs/hpfs/buffer.c
index eac5f96323e..793cb9d943d 100644
--- a/fs/hpfs/buffer.c
+++ b/fs/hpfs/buffer.c
@@ -14,7 +14,7 @@ void hpfs_lock_creation(struct super_block *s)
 #ifdef DEBUG_LOCKS
 	printk("lock creation\n");
 #endif
-	down(&hpfs_sb(s)->hpfs_creation_de);
+	mutex_lock(&hpfs_sb(s)->hpfs_creation_de);
 }
 
 void hpfs_unlock_creation(struct super_block *s)
@@ -22,7 +22,7 @@ void hpfs_unlock_creation(struct super_block *s)
 #ifdef DEBUG_LOCKS
 	printk("unlock creation\n");
 #endif
-	up(&hpfs_sb(s)->hpfs_creation_de);
+	mutex_unlock(&hpfs_sb(s)->hpfs_creation_de);
 }
 
 /* Map a sector into a buffer and return pointers to it and to the buffer. */
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index a9ae9bfa752..c0340887c7e 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -97,10 +97,19 @@ static int hpfs_write_begin(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
 			struct page **pagep, void **fsdata)
 {
+	int ret;
+
 	*pagep = NULL;
-	return cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+	ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
 				hpfs_get_block,
 				&hpfs_i(mapping->host)->mmu_private);
+	if (unlikely(ret)) {
+		loff_t isize = mapping->host->i_size;
+		if (pos + len > isize)
+			vmtruncate(mapping->host, isize);
+	}
+
+	return ret;
 }
 
 static sector_t _hpfs_bmap(struct address_space *mapping, sector_t block)
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index 75f9d432485..2fee17d0d9a 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -87,7 +87,7 @@ struct hpfs_sb_info {
 	unsigned *sb_bmp_dir;		/* main bitmap directory */
 	unsigned sb_c_bitmap;		/* current bitmap */
 	unsigned sb_max_fwd_alloc;	/* max forwad allocation */
-	struct semaphore hpfs_creation_de; /* when creating dirents, nobody else
+	struct mutex hpfs_creation_de;	/* when creating dirents, nobody else
 					   can alloc blocks */
 	/*unsigned sb_mounting : 1;*/
 	int sb_timeshift;
@@ -281,7 +281,7 @@ void hpfs_write_inode(struct inode *);
 void hpfs_write_inode_nolock(struct inode *);
 int hpfs_setattr(struct dentry *, struct iattr *);
 void hpfs_write_if_changed(struct inode *);
-void hpfs_delete_inode(struct inode *);
+void hpfs_evict_inode(struct inode *);
 
 /* map.c */
 
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index 1042a9bc97f..56f0da1cfd1 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -277,9 +277,15 @@ int hpfs_setattr(struct dentry *dentry, struct iattr *attr)
 	if (error)
 		goto out_unlock;
 
-	error = inode_setattr(inode, attr);
-	if (error)
-		goto out_unlock;
+	if ((attr->ia_valid & ATTR_SIZE) &&
+	    attr->ia_size != i_size_read(inode)) {
+		error = vmtruncate(inode, attr->ia_size);
+		if (error)
+			return error;
+	}
+
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
 
 	hpfs_write_inode(inode);
 
@@ -296,11 +302,13 @@ void hpfs_write_if_changed(struct inode *inode)
 		hpfs_write_inode(inode);
 }
 
-void hpfs_delete_inode(struct inode *inode)
+void hpfs_evict_inode(struct inode *inode)
 {
 	truncate_inode_pages(&inode->i_data, 0);
-	lock_kernel();
-	hpfs_remove_fnode(inode->i_sb, inode->i_ino);
-	unlock_kernel();
-	clear_inode(inode);
+	end_writeback(inode);
+	if (!inode->i_nlink) {
+		lock_kernel();
+		hpfs_remove_fnode(inode->i_sb, inode->i_ino);
+		unlock_kernel();
+	}
 }
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index aa53842c599..6c5f01597c3 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -450,7 +450,7 @@ static const struct super_operations hpfs_sops =
 {
 	.alloc_inode	= hpfs_alloc_inode,
 	.destroy_inode	= hpfs_destroy_inode,
-	.delete_inode	= hpfs_delete_inode,
+	.evict_inode	= hpfs_evict_inode,
 	.put_super	= hpfs_put_super,
 	.statfs		= hpfs_statfs,
 	.remount_fs	= hpfs_remount_fs,
@@ -477,17 +477,21 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
 
 	int o;
 
+	lock_kernel();
+
 	save_mount_options(s, options);
 
 	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
-	if (!sbi)
+	if (!sbi) {
+		unlock_kernel();
 		return -ENOMEM;
+	}
 	s->s_fs_info = sbi;
 
 	sbi->sb_bmp_dir = NULL;
 	sbi->sb_cp_table = NULL;
 
-	init_MUTEX(&sbi->hpfs_creation_de);
+	mutex_init(&sbi->hpfs_creation_de);
 
 	uid = current_uid();
 	gid = current_gid();
@@ -666,6 +670,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
 			root->i_blocks = 5;
 		hpfs_brelse4(&qbh);
 	}
+	unlock_kernel();
 	return 0;
 
 bail4:	brelse(bh2);
@@ -677,20 +682,20 @@ bail0:
 	kfree(sbi->sb_cp_table);
 	s->s_fs_info = NULL;
 	kfree(sbi);
+	unlock_kernel();
 	return -EINVAL;
 }
 
-static int hpfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *hpfs_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, hpfs_fill_super,
-			   mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, hpfs_fill_super);
 }
 
 static struct file_system_type hpfs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "hpfs",
-	.get_sb		= hpfs_get_sb,
+	.mount		= hpfs_mount,
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index 826c3f9d29a..f702b5f713f 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -15,6 +15,7 @@
 #include <linux/slab.h>
 #include <linux/statfs.h>
 #include <linux/types.h>
+#include <linux/pid_namespace.h>
 #include <asm/uaccess.h>
 #include "os.h"
 
@@ -597,6 +598,7 @@ static const struct file_operations hppfs_dir_fops = {
 	.readdir	= hppfs_readdir,
 	.open		= hppfs_dir_open,
 	.fsync		= hppfs_fsync,
+	.llseek		= default_llseek,
 };
 
 static int hppfs_statfs(struct dentry *dentry, struct kstatfs *sf)
@@ -623,12 +625,11 @@ static struct inode *hppfs_alloc_inode(struct super_block *sb)
 	return &hi->vfs_inode;
 }
 
-void hppfs_delete_inode(struct inode *ino)
+void hppfs_evict_inode(struct inode *ino)
 {
+	end_writeback(ino);
 	dput(HPPFS_I(ino)->proc_dentry);
 	mntput(ino->i_sb->s_fs_info);
-
-	clear_inode(ino);
 }
 
 static void hppfs_destroy_inode(struct inode *inode)
@@ -639,7 +640,7 @@ static void hppfs_destroy_inode(struct inode *inode)
 static const struct super_operations hppfs_sbops = {
 	.alloc_inode	= hppfs_alloc_inode,
 	.destroy_inode	= hppfs_destroy_inode,
-	.delete_inode	= hppfs_delete_inode,
+	.evict_inode	= hppfs_evict_inode,
 	.statfs		= hppfs_statfs,
 };
 
@@ -747,17 +748,17 @@ static int hppfs_fill_super(struct super_block *sb, void *d, int silent)
 	return(err);
 }
 
-static int hppfs_read_super(struct file_system_type *type,
+static struct dentry *hppfs_read_super(struct file_system_type *type,
 			    int flags, const char *dev_name,
-			    void *data, struct vfsmount *mnt)
+			    void *data)
 {
-	return get_sb_nodev(type, flags, data, hppfs_fill_super, mnt);
+	return mount_nodev(type, flags, data, hppfs_fill_super);
 }
 
 static struct file_system_type hppfs_type = {
 	.owner 		= THIS_MODULE,
 	.name 		= "hppfs",
-	.get_sb 	= hppfs_read_super,
+	.mount 		= hppfs_read_super,
 	.kill_sb	= kill_anon_super,
 	.fs_flags 	= 0,
 };
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index a4e9a7ec369..d6cfac1f0a4 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -31,6 +31,7 @@
 #include <linux/statfs.h>
 #include <linux/security.h>
 #include <linux/magic.h>
+#include <linux/migrate.h>
 
 #include <asm/uaccess.h>
 
@@ -371,27 +372,10 @@ static void truncate_hugepages(struct inode *inode, loff_t lstart)
 	hugetlb_unreserve_pages(inode, start, freed);
 }
 
-static void hugetlbfs_delete_inode(struct inode *inode)
+static void hugetlbfs_evict_inode(struct inode *inode)
 {
 	truncate_hugepages(inode, 0);
-	clear_inode(inode);
-}
-
-static void hugetlbfs_forget_inode(struct inode *inode) __releases(inode_lock)
-{
-	if (generic_detach_inode(inode)) {
-		truncate_hugepages(inode, 0);
-		clear_inode(inode);
-		destroy_inode(inode);
-	}
-}
-
-static void hugetlbfs_drop_inode(struct inode *inode)
-{
-	if (!inode->i_nlink)
-		generic_delete_inode(inode);
-	else
-		hugetlbfs_forget_inode(inode);
+	end_writeback(inode);
 }
 
 static inline void
@@ -448,19 +432,20 @@ static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
 
 	error = inode_change_ok(inode, attr);
 	if (error)
-		goto out;
+		return error;
 
 	if (ia_valid & ATTR_SIZE) {
 		error = -EINVAL;
-		if (!(attr->ia_size & ~huge_page_mask(h)))
-			error = hugetlb_vmtruncate(inode, attr->ia_size);
+		if (attr->ia_size & ~huge_page_mask(h))
+			return -EINVAL;
+		error = hugetlb_vmtruncate(inode, attr->ia_size);
 		if (error)
-			goto out;
-		attr->ia_valid &= ~ATTR_SIZE;
+			return error;
 	}
-	error = inode_setattr(inode, attr);
-out:
-	return error;
+
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+	return 0;
 }
 
 static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid, 
@@ -471,6 +456,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
 	inode = new_inode(sb);
 	if (inode) {
 		struct hugetlbfs_inode_info *info;
+		inode->i_ino = get_next_ino();
 		inode->i_mode = mode;
 		inode->i_uid = uid;
 		inode->i_gid = gid;
@@ -589,6 +575,19 @@ static int hugetlbfs_set_page_dirty(struct page *page)
 	return 0;
 }
 
+static int hugetlbfs_migrate_page(struct address_space *mapping,
+				struct page *newpage, struct page *page)
+{
+	int rc;
+
+	rc = migrate_huge_page_move_mapping(mapping, newpage, page);
+	if (rc)
+		return rc;
+	migrate_page_copy(newpage, page);
+
+	return 0;
+}
+
 static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb);
@@ -675,6 +674,7 @@ static const struct address_space_operations hugetlbfs_aops = {
 	.write_begin	= hugetlbfs_write_begin,
 	.write_end	= hugetlbfs_write_end,
 	.set_page_dirty	= hugetlbfs_set_page_dirty,
+	.migratepage    = hugetlbfs_migrate_page,
 };
 
 
@@ -690,6 +690,7 @@ const struct file_operations hugetlbfs_file_operations = {
 	.mmap			= hugetlbfs_file_mmap,
 	.fsync			= noop_fsync,
 	.get_unmapped_area	= hugetlb_get_unmapped_area,
+	.llseek		= default_llseek,
 };
 
 static const struct inode_operations hugetlbfs_dir_inode_operations = {
@@ -712,9 +713,8 @@ static const struct inode_operations hugetlbfs_inode_operations = {
 static const struct super_operations hugetlbfs_ops = {
 	.alloc_inode    = hugetlbfs_alloc_inode,
 	.destroy_inode  = hugetlbfs_destroy_inode,
+	.evict_inode	= hugetlbfs_evict_inode,
 	.statfs		= hugetlbfs_statfs,
-	.delete_inode	= hugetlbfs_delete_inode,
-	.drop_inode	= hugetlbfs_drop_inode,
 	.put_super	= hugetlbfs_put_super,
 	.show_options	= generic_show_options,
 };
@@ -896,15 +896,15 @@ void hugetlb_put_quota(struct address_space *mapping, long delta)
 	}
 }
 
-static int hugetlbfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *hugetlbfs_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_nodev(fs_type, flags, data, hugetlbfs_fill_super, mnt);
+	return mount_nodev(fs_type, flags, data, hugetlbfs_fill_super);
 }
 
 static struct file_system_type hugetlbfs_fs_type = {
 	.name		= "hugetlbfs",
-	.get_sb		= hugetlbfs_get_sb,
+	.mount		= hugetlbfs_mount,
 	.kill_sb	= kill_litter_super,
 };
 
diff --git a/fs/inode.c b/fs/inode.c
index 722860b323a..ae2727ab0c3 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -20,16 +20,15 @@
 #include <linux/pagemap.h>
 #include <linux/cdev.h>
 #include <linux/bootmem.h>
-#include <linux/inotify.h>
 #include <linux/fsnotify.h>
 #include <linux/mount.h>
 #include <linux/async.h>
 #include <linux/posix_acl.h>
+#include <linux/ima.h>
 
 /*
  * This is needed for the following functions:
  *  - inode_has_buffers
- *  - invalidate_inode_buffers
  *  - invalidate_bdev
  *
  * FIXME: remove all knowledge of the buffer layer from this file
@@ -73,8 +72,7 @@ static unsigned int i_hash_shift __read_mostly;
  * allowing for low-overhead inode sync() operations.
  */
 
-LIST_HEAD(inode_in_use);
-LIST_HEAD(inode_unused);
+static LIST_HEAD(inode_lru);
 static struct hlist_head *inode_hashtable __read_mostly;
 
 /*
@@ -104,8 +102,41 @@ static DECLARE_RWSEM(iprune_sem);
  */
 struct inodes_stat_t inodes_stat;
 
+static struct percpu_counter nr_inodes __cacheline_aligned_in_smp;
+static struct percpu_counter nr_inodes_unused __cacheline_aligned_in_smp;
+
 static struct kmem_cache *inode_cachep __read_mostly;
 
+static inline int get_nr_inodes(void)
+{
+	return percpu_counter_sum_positive(&nr_inodes);
+}
+
+static inline int get_nr_inodes_unused(void)
+{
+	return percpu_counter_sum_positive(&nr_inodes_unused);
+}
+
+int get_nr_dirty_inodes(void)
+{
+	int nr_dirty = get_nr_inodes() - get_nr_inodes_unused();
+	return nr_dirty > 0 ? nr_dirty : 0;
+
+}
+
+/*
+ * Handle nr_inode sysctl
+ */
+#ifdef CONFIG_SYSCTL
+int proc_nr_inodes(ctl_table *table, int write,
+		   void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	inodes_stat.nr_inodes = get_nr_inodes();
+	inodes_stat.nr_unused = get_nr_inodes_unused();
+	return proc_dointvec(table, write, buffer, lenp, ppos);
+}
+#endif
+
 static void wake_up_inode(struct inode *inode)
 {
 	/*
@@ -193,6 +224,8 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
 	inode->i_fsnotify_mask = 0;
 #endif
 
+	percpu_counter_inc(&nr_inodes);
+
 	return 0;
 out:
 	return -ENOMEM;
@@ -233,11 +266,13 @@ void __destroy_inode(struct inode *inode)
 	if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED)
 		posix_acl_release(inode->i_default_acl);
 #endif
+	percpu_counter_dec(&nr_inodes);
 }
 EXPORT_SYMBOL(__destroy_inode);
 
-void destroy_inode(struct inode *inode)
+static void destroy_inode(struct inode *inode)
 {
+	BUG_ON(!list_empty(&inode->i_lru));
 	__destroy_inode(inode);
 	if (inode->i_sb->s_op->destroy_inode)
 		inode->i_sb->s_op->destroy_inode(inode);
@@ -256,6 +291,8 @@ void inode_init_once(struct inode *inode)
 	INIT_HLIST_NODE(&inode->i_hash);
 	INIT_LIST_HEAD(&inode->i_dentry);
 	INIT_LIST_HEAD(&inode->i_devices);
+	INIT_LIST_HEAD(&inode->i_wb_list);
+	INIT_LIST_HEAD(&inode->i_lru);
 	INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
 	spin_lock_init(&inode->i_data.tree_lock);
 	spin_lock_init(&inode->i_data.i_mmap_lock);
@@ -264,12 +301,8 @@ void inode_init_once(struct inode *inode)
 	INIT_RAW_PRIO_TREE_ROOT(&inode->i_data.i_mmap);
 	INIT_LIST_HEAD(&inode->i_data.i_mmap_nonlinear);
 	i_size_ordered_init(inode);
-#ifdef CONFIG_INOTIFY
-	INIT_LIST_HEAD(&inode->inotify_watches);
-	mutex_init(&inode->inotify_mutex);
-#endif
 #ifdef CONFIG_FSNOTIFY
-	INIT_HLIST_HEAD(&inode->i_fsnotify_mark_entries);
+	INIT_HLIST_HEAD(&inode->i_fsnotify_marks);
 #endif
 }
 EXPORT_SYMBOL(inode_init_once);
@@ -286,40 +319,137 @@ static void init_once(void *foo)
  */
 void __iget(struct inode *inode)
 {
-	if (atomic_inc_return(&inode->i_count) != 1)
-		return;
+	atomic_inc(&inode->i_count);
+}
+
+/*
+ * get additional reference to inode; caller must already hold one.
+ */
+void ihold(struct inode *inode)
+{
+	WARN_ON(atomic_inc_return(&inode->i_count) < 2);
+}
+EXPORT_SYMBOL(ihold);
+
+static void inode_lru_list_add(struct inode *inode)
+{
+	if (list_empty(&inode->i_lru)) {
+		list_add(&inode->i_lru, &inode_lru);
+		percpu_counter_inc(&nr_inodes_unused);
+	}
+}
+
+static void inode_lru_list_del(struct inode *inode)
+{
+	if (!list_empty(&inode->i_lru)) {
+		list_del_init(&inode->i_lru);
+		percpu_counter_dec(&nr_inodes_unused);
+	}
+}
+
+static inline void __inode_sb_list_add(struct inode *inode)
+{
+	list_add(&inode->i_sb_list, &inode->i_sb->s_inodes);
+}
 
-	if (!(inode->i_state & (I_DIRTY|I_SYNC)))
-		list_move(&inode->i_list, &inode_in_use);
-	inodes_stat.nr_unused--;
+/**
+ * inode_sb_list_add - add inode to the superblock list of inodes
+ * @inode: inode to add
+ */
+void inode_sb_list_add(struct inode *inode)
+{
+	spin_lock(&inode_lock);
+	__inode_sb_list_add(inode);
+	spin_unlock(&inode_lock);
+}
+EXPORT_SYMBOL_GPL(inode_sb_list_add);
+
+static inline void __inode_sb_list_del(struct inode *inode)
+{
+	list_del_init(&inode->i_sb_list);
+}
+
+static unsigned long hash(struct super_block *sb, unsigned long hashval)
+{
+	unsigned long tmp;
+
+	tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
+			L1_CACHE_BYTES;
+	tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> I_HASHBITS);
+	return tmp & I_HASHMASK;
 }
 
 /**
- * clear_inode - clear an inode
- * @inode: inode to clear
+ *	__insert_inode_hash - hash an inode
+ *	@inode: unhashed inode
+ *	@hashval: unsigned long value used to locate this object in the
+ *		inode_hashtable.
  *
- * This is called by the filesystem to tell us
- * that the inode is no longer useful. We just
- * terminate it with extreme prejudice.
+ *	Add an inode to the inode hash for this superblock.
  */
-void clear_inode(struct inode *inode)
+void __insert_inode_hash(struct inode *inode, unsigned long hashval)
 {
-	might_sleep();
-	invalidate_inode_buffers(inode);
+	struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval);
+
+	spin_lock(&inode_lock);
+	hlist_add_head(&inode->i_hash, b);
+	spin_unlock(&inode_lock);
+}
+EXPORT_SYMBOL(__insert_inode_hash);
 
+/**
+ *	__remove_inode_hash - remove an inode from the hash
+ *	@inode: inode to unhash
+ *
+ *	Remove an inode from the superblock.
+ */
+static void __remove_inode_hash(struct inode *inode)
+{
+	hlist_del_init(&inode->i_hash);
+}
+
+/**
+ *	remove_inode_hash - remove an inode from the hash
+ *	@inode: inode to unhash
+ *
+ *	Remove an inode from the superblock.
+ */
+void remove_inode_hash(struct inode *inode)
+{
+	spin_lock(&inode_lock);
+	hlist_del_init(&inode->i_hash);
+	spin_unlock(&inode_lock);
+}
+EXPORT_SYMBOL(remove_inode_hash);
+
+void end_writeback(struct inode *inode)
+{
+	might_sleep();
 	BUG_ON(inode->i_data.nrpages);
+	BUG_ON(!list_empty(&inode->i_data.private_list));
 	BUG_ON(!(inode->i_state & I_FREEING));
 	BUG_ON(inode->i_state & I_CLEAR);
 	inode_sync_wait(inode);
-	if (inode->i_sb->s_op->clear_inode)
-		inode->i_sb->s_op->clear_inode(inode);
+	inode->i_state = I_FREEING | I_CLEAR;
+}
+EXPORT_SYMBOL(end_writeback);
+
+static void evict(struct inode *inode)
+{
+	const struct super_operations *op = inode->i_sb->s_op;
+
+	if (op->evict_inode) {
+		op->evict_inode(inode);
+	} else {
+		if (inode->i_data.nrpages)
+			truncate_inode_pages(&inode->i_data, 0);
+		end_writeback(inode);
+	}
 	if (S_ISBLK(inode->i_mode) && inode->i_bdev)
 		bd_forget(inode);
 	if (S_ISCHR(inode->i_mode) && inode->i_cdev)
 		cd_forget(inode);
-	inode->i_state = I_CLEAR;
 }
-EXPORT_SYMBOL(clear_inode);
 
 /*
  * dispose_list - dispose of the contents of a local list
@@ -330,104 +460,113 @@ EXPORT_SYMBOL(clear_inode);
  */
 static void dispose_list(struct list_head *head)
 {
-	int nr_disposed = 0;
-
 	while (!list_empty(head)) {
 		struct inode *inode;
 
-		inode = list_first_entry(head, struct inode, i_list);
-		list_del(&inode->i_list);
+		inode = list_first_entry(head, struct inode, i_lru);
+		list_del_init(&inode->i_lru);
 
-		if (inode->i_data.nrpages)
-			truncate_inode_pages(&inode->i_data, 0);
-		clear_inode(inode);
+		evict(inode);
 
 		spin_lock(&inode_lock);
-		hlist_del_init(&inode->i_hash);
-		list_del_init(&inode->i_sb_list);
+		__remove_inode_hash(inode);
+		__inode_sb_list_del(inode);
 		spin_unlock(&inode_lock);
 
 		wake_up_inode(inode);
 		destroy_inode(inode);
-		nr_disposed++;
 	}
-	spin_lock(&inode_lock);
-	inodes_stat.nr_inodes -= nr_disposed;
-	spin_unlock(&inode_lock);
 }
 
-/*
- * Invalidate all inodes for a device.
+/**
+ * evict_inodes	- evict all evictable inodes for a superblock
+ * @sb:		superblock to operate on
+ *
+ * Make sure that no inodes with zero refcount are retained.  This is
+ * called by superblock shutdown after having MS_ACTIVE flag removed,
+ * so any inode reaching zero refcount during or after that call will
+ * be immediately evicted.
  */
-static int invalidate_list(struct list_head *head, struct list_head *dispose)
+void evict_inodes(struct super_block *sb)
 {
-	struct list_head *next;
-	int busy = 0, count = 0;
+	struct inode *inode, *next;
+	LIST_HEAD(dispose);
 
-	next = head->next;
-	for (;;) {
-		struct list_head *tmp = next;
-		struct inode *inode;
-
-		/*
-		 * We can reschedule here without worrying about the list's
-		 * consistency because the per-sb list of inodes must not
-		 * change during umount anymore, and because iprune_sem keeps
-		 * shrink_icache_memory() away.
-		 */
-		cond_resched_lock(&inode_lock);
+	down_write(&iprune_sem);
 
-		next = next->next;
-		if (tmp == head)
-			break;
-		inode = list_entry(tmp, struct inode, i_sb_list);
-		if (inode->i_state & I_NEW)
+	spin_lock(&inode_lock);
+	list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
+		if (atomic_read(&inode->i_count))
 			continue;
-		invalidate_inode_buffers(inode);
-		if (!atomic_read(&inode->i_count)) {
-			list_move(&inode->i_list, dispose);
-			WARN_ON(inode->i_state & I_NEW);
-			inode->i_state |= I_FREEING;
-			count++;
+
+		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
+			WARN_ON(1);
 			continue;
 		}
-		busy = 1;
+
+		inode->i_state |= I_FREEING;
+
+		/*
+		 * Move the inode off the IO lists and LRU once I_FREEING is
+		 * set so that it won't get moved back on there if it is dirty.
+		 */
+		list_move(&inode->i_lru, &dispose);
+		list_del_init(&inode->i_wb_list);
+		if (!(inode->i_state & (I_DIRTY | I_SYNC)))
+			percpu_counter_dec(&nr_inodes_unused);
 	}
-	/* only unused inodes may be cached with i_count zero */
-	inodes_stat.nr_unused -= count;
-	return busy;
+	spin_unlock(&inode_lock);
+
+	dispose_list(&dispose);
+	up_write(&iprune_sem);
 }
 
 /**
- *	invalidate_inodes	- discard the inodes on a device
- *	@sb: superblock
+ * invalidate_inodes	- attempt to free all inodes on a superblock
+ * @sb:		superblock to operate on
  *
- *	Discard all of the inodes for a given superblock. If the discard
- *	fails because there are busy inodes then a non zero value is returned.
- *	If the discard is successful all the inodes have been discarded.
+ * Attempts to free all inodes for a given superblock.  If there were any
+ * busy inodes return a non-zero value, else zero.
  */
 int invalidate_inodes(struct super_block *sb)
 {
-	int busy;
-	LIST_HEAD(throw_away);
+	int busy = 0;
+	struct inode *inode, *next;
+	LIST_HEAD(dispose);
 
 	down_write(&iprune_sem);
+
 	spin_lock(&inode_lock);
-	inotify_unmount_inodes(&sb->s_inodes);
-	fsnotify_unmount_inodes(&sb->s_inodes);
-	busy = invalidate_list(&sb->s_inodes, &throw_away);
+	list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
+		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE))
+			continue;
+		if (atomic_read(&inode->i_count)) {
+			busy = 1;
+			continue;
+		}
+
+		inode->i_state |= I_FREEING;
+
+		/*
+		 * Move the inode off the IO lists and LRU once I_FREEING is
+		 * set so that it won't get moved back on there if it is dirty.
+		 */
+		list_move(&inode->i_lru, &dispose);
+		list_del_init(&inode->i_wb_list);
+		if (!(inode->i_state & (I_DIRTY | I_SYNC)))
+			percpu_counter_dec(&nr_inodes_unused);
+	}
 	spin_unlock(&inode_lock);
 
-	dispose_list(&throw_away);
+	dispose_list(&dispose);
 	up_write(&iprune_sem);
 
 	return busy;
 }
-EXPORT_SYMBOL(invalidate_inodes);
 
 static int can_unuse(struct inode *inode)
 {
-	if (inode->i_state)
+	if (inode->i_state & ~I_REFERENCED)
 		return 0;
 	if (inode_has_buffers(inode))
 		return 0;
@@ -439,22 +578,24 @@ static int can_unuse(struct inode *inode)
 }
 
 /*
- * Scan `goal' inodes on the unused list for freeable ones. They are moved to
- * a temporary list and then are freed outside inode_lock by dispose_list().
+ * Scan `goal' inodes on the unused list for freeable ones. They are moved to a
+ * temporary list and then are freed outside inode_lock by dispose_list().
  *
  * Any inodes which are pinned purely because of attached pagecache have their
- * pagecache removed.  We expect the final iput() on that inode to add it to
- * the front of the inode_unused list.  So look for it there and if the
- * inode is still freeable, proceed.  The right inode is found 99.9% of the
- * time in testing on a 4-way.
+ * pagecache removed.  If the inode has metadata buffers attached to
+ * mapping->private_list then try to remove them.
  *
- * If the inode has metadata buffers attached to mapping->private_list then
- * try to remove them.
+ * If the inode has the I_REFERENCED flag set, then it means that it has been
+ * used recently - the flag is set in iput_final(). When we encounter such an
+ * inode, clear the flag and move it to the back of the LRU so it gets another
+ * pass through the LRU before it gets reclaimed. This is necessary because of
+ * the fact we are doing lazy LRU updates to minimise lock contention so the
+ * LRU does not have strict ordering. Hence we don't want to reclaim inodes
+ * with this flag set because they are the inodes that are out of order.
  */
 static void prune_icache(int nr_to_scan)
 {
 	LIST_HEAD(freeable);
-	int nr_pruned = 0;
 	int nr_scanned;
 	unsigned long reap = 0;
 
@@ -463,13 +604,26 @@ static void prune_icache(int nr_to_scan)
 	for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) {
 		struct inode *inode;
 
-		if (list_empty(&inode_unused))
+		if (list_empty(&inode_lru))
 			break;
 
-		inode = list_entry(inode_unused.prev, struct inode, i_list);
+		inode = list_entry(inode_lru.prev, struct inode, i_lru);
 
-		if (inode->i_state || atomic_read(&inode->i_count)) {
-			list_move(&inode->i_list, &inode_unused);
+		/*
+		 * Referenced or dirty inodes are still in use. Give them
+		 * another pass through the LRU as we canot reclaim them now.
+		 */
+		if (atomic_read(&inode->i_count) ||
+		    (inode->i_state & ~I_REFERENCED)) {
+			list_del_init(&inode->i_lru);
+			percpu_counter_dec(&nr_inodes_unused);
+			continue;
+		}
+
+		/* recently referenced inodes get one more pass */
+		if (inode->i_state & I_REFERENCED) {
+			list_move(&inode->i_lru, &inode_lru);
+			inode->i_state &= ~I_REFERENCED;
 			continue;
 		}
 		if (inode_has_buffers(inode) || inode->i_data.nrpages) {
@@ -481,18 +635,23 @@ static void prune_icache(int nr_to_scan)
 			iput(inode);
 			spin_lock(&inode_lock);
 
-			if (inode != list_entry(inode_unused.next,
-						struct inode, i_list))
+			if (inode != list_entry(inode_lru.next,
+						struct inode, i_lru))
 				continue;	/* wrong inode or list_empty */
 			if (!can_unuse(inode))
 				continue;
 		}
-		list_move(&inode->i_list, &freeable);
 		WARN_ON(inode->i_state & I_NEW);
 		inode->i_state |= I_FREEING;
-		nr_pruned++;
+
+		/*
+		 * Move the inode off the IO lists and LRU once I_FREEING is
+		 * set so that it won't get moved back on there if it is dirty.
+		 */
+		list_move(&inode->i_lru, &freeable);
+		list_del_init(&inode->i_wb_list);
+		percpu_counter_dec(&nr_inodes_unused);
 	}
-	inodes_stat.nr_unused -= nr_pruned;
 	if (current_is_kswapd())
 		__count_vm_events(KSWAPD_INODESTEAL, reap);
 	else
@@ -524,7 +683,7 @@ static int shrink_icache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
 			return -1;
 		prune_icache(nr);
 	}
-	return (inodes_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
+	return (get_nr_inodes_unused() / 100) * sysctl_vfs_cache_pressure;
 }
 
 static struct shrinker icache_shrinker = {
@@ -535,9 +694,6 @@ static struct shrinker icache_shrinker = {
 static void __wait_on_freeing_inode(struct inode *inode);
 /*
  * Called with the inode lock held.
- * NOTE: we are not increasing the inode-refcount, you must call __iget()
- * by hand after calling find_inode now! This simplifies iunique and won't
- * add any additional branch in the common code.
  */
 static struct inode *find_inode(struct super_block *sb,
 				struct hlist_head *head,
@@ -553,13 +709,14 @@ repeat:
 			continue;
 		if (!test(inode, data))
 			continue;
-		if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)) {
+		if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
 			__wait_on_freeing_inode(inode);
 			goto repeat;
 		}
-		break;
+		__iget(inode);
+		return inode;
 	}
-	return node ? inode : NULL;
+	return NULL;
 }
 
 /*
@@ -578,57 +735,53 @@ repeat:
 			continue;
 		if (inode->i_sb != sb)
 			continue;
-		if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)) {
+		if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
 			__wait_on_freeing_inode(inode);
 			goto repeat;
 		}
-		break;
+		__iget(inode);
+		return inode;
 	}
-	return node ? inode : NULL;
-}
-
-static unsigned long hash(struct super_block *sb, unsigned long hashval)
-{
-	unsigned long tmp;
-
-	tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
-			L1_CACHE_BYTES;
-	tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> I_HASHBITS);
-	return tmp & I_HASHMASK;
-}
-
-static inline void
-__inode_add_to_lists(struct super_block *sb, struct hlist_head *head,
-			struct inode *inode)
-{
-	inodes_stat.nr_inodes++;
-	list_add(&inode->i_list, &inode_in_use);
-	list_add(&inode->i_sb_list, &sb->s_inodes);
-	if (head)
-		hlist_add_head(&inode->i_hash, head);
+	return NULL;
 }
 
-/**
- * inode_add_to_lists - add a new inode to relevant lists
- * @sb: superblock inode belongs to
- * @inode: inode to mark in use
+/*
+ * Each cpu owns a range of LAST_INO_BATCH numbers.
+ * 'shared_last_ino' is dirtied only once out of LAST_INO_BATCH allocations,
+ * to renew the exhausted range.
+ *
+ * This does not significantly increase overflow rate because every CPU can
+ * consume at most LAST_INO_BATCH-1 unused inode numbers. So there is
+ * NR_CPUS*(LAST_INO_BATCH-1) wastage. At 4096 and 1024, this is ~0.1% of the
+ * 2^32 range, and is a worst-case. Even a 50% wastage would only increase
+ * overflow rate by 2x, which does not seem too significant.
  *
- * When an inode is allocated it needs to be accounted for, added to the in use
- * list, the owning superblock and the inode hash. This needs to be done under
- * the inode_lock, so export a function to do this rather than the inode lock
- * itself. We calculate the hash list to add to here so it is all internal
- * which requires the caller to have already set up the inode number in the
- * inode to add.
+ * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
+ * error if st_ino won't fit in target struct field. Use 32bit counter
+ * here to attempt to avoid that.
  */
-void inode_add_to_lists(struct super_block *sb, struct inode *inode)
+#define LAST_INO_BATCH 1024
+static DEFINE_PER_CPU(unsigned int, last_ino);
+
+unsigned int get_next_ino(void)
 {
-	struct hlist_head *head = inode_hashtable + hash(sb, inode->i_ino);
+	unsigned int *p = &get_cpu_var(last_ino);
+	unsigned int res = *p;
 
-	spin_lock(&inode_lock);
-	__inode_add_to_lists(sb, head, inode);
-	spin_unlock(&inode_lock);
+#ifdef CONFIG_SMP
+	if (unlikely((res & (LAST_INO_BATCH-1)) == 0)) {
+		static atomic_t shared_last_ino;
+		int next = atomic_add_return(LAST_INO_BATCH, &shared_last_ino);
+
+		res = next - LAST_INO_BATCH;
+	}
+#endif
+
+	*p = ++res;
+	put_cpu_var(last_ino);
+	return res;
 }
-EXPORT_SYMBOL_GPL(inode_add_to_lists);
+EXPORT_SYMBOL(get_next_ino);
 
 /**
  *	new_inode 	- obtain an inode
@@ -644,12 +797,6 @@ EXPORT_SYMBOL_GPL(inode_add_to_lists);
  */
 struct inode *new_inode(struct super_block *sb)
 {
-	/*
-	 * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
-	 * error if st_ino won't fit in target struct field. Use 32bit counter
-	 * here to attempt to avoid that.
-	 */
-	static unsigned int last_ino;
 	struct inode *inode;
 
 	spin_lock_prefetch(&inode_lock);
@@ -657,8 +804,7 @@ struct inode *new_inode(struct super_block *sb)
 	inode = alloc_inode(sb);
 	if (inode) {
 		spin_lock(&inode_lock);
-		__inode_add_to_lists(sb, NULL, inode);
-		inode->i_ino = ++last_ino;
+		__inode_sb_list_add(inode);
 		inode->i_state = 0;
 		spin_unlock(&inode_lock);
 	}
@@ -669,7 +815,7 @@ EXPORT_SYMBOL(new_inode);
 void unlock_new_inode(struct inode *inode)
 {
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
-	if (inode->i_mode & S_IFDIR) {
+	if (S_ISDIR(inode->i_mode)) {
 		struct file_system_type *type = inode->i_sb->s_type;
 
 		/* Set new key only if filesystem hasn't already changed it */
@@ -726,7 +872,8 @@ static struct inode *get_new_inode(struct super_block *sb,
 			if (set(inode, data))
 				goto set_failed;
 
-			__inode_add_to_lists(sb, head, inode);
+			hlist_add_head(&inode->i_hash, head);
+			__inode_sb_list_add(inode);
 			inode->i_state = I_NEW;
 			spin_unlock(&inode_lock);
 
@@ -741,7 +888,6 @@ static struct inode *get_new_inode(struct super_block *sb,
 		 * us. Use the old inode instead of the one we just
 		 * allocated.
 		 */
-		__iget(old);
 		spin_unlock(&inode_lock);
 		destroy_inode(inode);
 		inode = old;
@@ -773,7 +919,8 @@ static struct inode *get_new_inode_fast(struct super_block *sb,
 		old = find_inode_fast(sb, head, ino);
 		if (!old) {
 			inode->i_ino = ino;
-			__inode_add_to_lists(sb, head, inode);
+			hlist_add_head(&inode->i_hash, head);
+			__inode_sb_list_add(inode);
 			inode->i_state = I_NEW;
 			spin_unlock(&inode_lock);
 
@@ -788,7 +935,6 @@ static struct inode *get_new_inode_fast(struct super_block *sb,
 		 * us. Use the old inode instead of the one we just
 		 * allocated.
 		 */
-		__iget(old);
 		spin_unlock(&inode_lock);
 		destroy_inode(inode);
 		inode = old;
@@ -797,6 +943,27 @@ static struct inode *get_new_inode_fast(struct super_block *sb,
 	return inode;
 }
 
+/*
+ * search the inode cache for a matching inode number.
+ * If we find one, then the inode number we are trying to
+ * allocate is not unique and so we should not use it.
+ *
+ * Returns 1 if the inode number is unique, 0 if it is not.
+ */
+static int test_inode_iunique(struct super_block *sb, unsigned long ino)
+{
+	struct hlist_head *b = inode_hashtable + hash(sb, ino);
+	struct hlist_node *node;
+	struct inode *inode;
+
+	hlist_for_each_entry(inode, node, b, i_hash) {
+		if (inode->i_ino == ino && inode->i_sb == sb)
+			return 0;
+	}
+
+	return 1;
+}
+
 /**
  *	iunique - get a unique inode number
  *	@sb: superblock
@@ -818,19 +985,18 @@ ino_t iunique(struct super_block *sb, ino_t max_reserved)
 	 * error if st_ino won't fit in target struct field. Use 32bit counter
 	 * here to attempt to avoid that.
 	 */
+	static DEFINE_SPINLOCK(iunique_lock);
 	static unsigned int counter;
-	struct inode *inode;
-	struct hlist_head *head;
 	ino_t res;
 
 	spin_lock(&inode_lock);
+	spin_lock(&iunique_lock);
 	do {
 		if (counter <= max_reserved)
 			counter = max_reserved + 1;
 		res = counter++;
-		head = inode_hashtable + hash(sb, res);
-		inode = find_inode_fast(sb, head, res);
-	} while (inode != NULL);
+	} while (!test_inode_iunique(sb, res));
+	spin_unlock(&iunique_lock);
 	spin_unlock(&inode_lock);
 
 	return res;
@@ -840,7 +1006,7 @@ EXPORT_SYMBOL(iunique);
 struct inode *igrab(struct inode *inode)
 {
 	spin_lock(&inode_lock);
-	if (!(inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)))
+	if (!(inode->i_state & (I_FREEING|I_WILL_FREE)))
 		__iget(inode);
 	else
 		/*
@@ -882,7 +1048,6 @@ static struct inode *ifind(struct super_block *sb,
 	spin_lock(&inode_lock);
 	inode = find_inode(sb, head, test, data);
 	if (inode) {
-		__iget(inode);
 		spin_unlock(&inode_lock);
 		if (likely(wait))
 			wait_on_inode(inode);
@@ -915,7 +1080,6 @@ static struct inode *ifind_fast(struct super_block *sb,
 	spin_lock(&inode_lock);
 	inode = find_inode_fast(sb, head, ino);
 	if (inode) {
-		__iget(inode);
 		spin_unlock(&inode_lock);
 		wait_on_inode(inode);
 		return inode;
@@ -1089,7 +1253,7 @@ int insert_inode_locked(struct inode *inode)
 				continue;
 			if (old->i_sb != sb)
 				continue;
-			if (old->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE))
+			if (old->i_state & (I_FREEING|I_WILL_FREE))
 				continue;
 			break;
 		}
@@ -1101,7 +1265,7 @@ int insert_inode_locked(struct inode *inode)
 		__iget(old);
 		spin_unlock(&inode_lock);
 		wait_on_inode(old);
-		if (unlikely(!hlist_unhashed(&old->i_hash))) {
+		if (unlikely(!inode_unhashed(old))) {
 			iput(old);
 			return -EBUSY;
 		}
@@ -1128,7 +1292,7 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
 				continue;
 			if (!test(old, data))
 				continue;
-			if (old->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE))
+			if (old->i_state & (I_FREEING|I_WILL_FREE))
 				continue;
 			break;
 		}
@@ -1140,7 +1304,7 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
 		__iget(old);
 		spin_unlock(&inode_lock);
 		wait_on_inode(old);
-		if (unlikely(!hlist_unhashed(&old->i_hash))) {
+		if (unlikely(!inode_unhashed(old))) {
 			iput(old);
 			return -EBUSY;
 		}
@@ -1149,100 +1313,53 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
 }
 EXPORT_SYMBOL(insert_inode_locked4);
 
-/**
- *	__insert_inode_hash - hash an inode
- *	@inode: unhashed inode
- *	@hashval: unsigned long value used to locate this object in the
- *		inode_hashtable.
- *
- *	Add an inode to the inode hash for this superblock.
- */
-void __insert_inode_hash(struct inode *inode, unsigned long hashval)
-{
-	struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval);
-	spin_lock(&inode_lock);
-	hlist_add_head(&inode->i_hash, head);
-	spin_unlock(&inode_lock);
-}
-EXPORT_SYMBOL(__insert_inode_hash);
 
-/**
- *	remove_inode_hash - remove an inode from the hash
- *	@inode: inode to unhash
- *
- *	Remove an inode from the superblock.
- */
-void remove_inode_hash(struct inode *inode)
+int generic_delete_inode(struct inode *inode)
 {
-	spin_lock(&inode_lock);
-	hlist_del_init(&inode->i_hash);
-	spin_unlock(&inode_lock);
+	return 1;
 }
-EXPORT_SYMBOL(remove_inode_hash);
+EXPORT_SYMBOL(generic_delete_inode);
 
 /*
- * Tell the filesystem that this inode is no longer of any interest and should
- * be completely destroyed.
- *
- * We leave the inode in the inode hash table until *after* the filesystem's
- * ->delete_inode completes.  This ensures that an iget (such as nfsd might
- * instigate) will always find up-to-date information either in the hash or on
- * disk.
- *
- * I_FREEING is set so that no-one will take a new reference to the inode while
- * it is being deleted.
+ * Normal UNIX filesystem behaviour: delete the
+ * inode when the usage count drops to zero, and
+ * i_nlink is zero.
  */
-void generic_delete_inode(struct inode *inode)
+int generic_drop_inode(struct inode *inode)
 {
-	const struct super_operations *op = inode->i_sb->s_op;
-
-	list_del_init(&inode->i_list);
-	list_del_init(&inode->i_sb_list);
-	WARN_ON(inode->i_state & I_NEW);
-	inode->i_state |= I_FREEING;
-	inodes_stat.nr_inodes--;
-	spin_unlock(&inode_lock);
-
-	if (op->delete_inode) {
-		void (*delete)(struct inode *) = op->delete_inode;
-		/* Filesystems implementing their own
-		 * s_op->delete_inode are required to call
-		 * truncate_inode_pages and clear_inode()
-		 * internally */
-		delete(inode);
-	} else {
-		truncate_inode_pages(&inode->i_data, 0);
-		clear_inode(inode);
-	}
-	spin_lock(&inode_lock);
-	hlist_del_init(&inode->i_hash);
-	spin_unlock(&inode_lock);
-	wake_up_inode(inode);
-	BUG_ON(inode->i_state != I_CLEAR);
-	destroy_inode(inode);
+	return !inode->i_nlink || inode_unhashed(inode);
 }
-EXPORT_SYMBOL(generic_delete_inode);
+EXPORT_SYMBOL_GPL(generic_drop_inode);
 
-/**
- *	generic_detach_inode - remove inode from inode lists
- *	@inode: inode to remove
- *
- *	Remove inode from inode lists, write it if it's dirty. This is just an
- *	internal VFS helper exported for hugetlbfs. Do not use!
+/*
+ * Called when we're dropping the last reference
+ * to an inode.
  *
- *	Returns 1 if inode should be completely destroyed.
+ * Call the FS "drop_inode()" function, defaulting to
+ * the legacy UNIX filesystem behaviour.  If it tells
+ * us to evict inode, do so.  Otherwise, retain inode
+ * in cache if fs is alive, sync and evict if fs is
+ * shutting down.
  */
-int generic_detach_inode(struct inode *inode)
+static void iput_final(struct inode *inode)
 {
 	struct super_block *sb = inode->i_sb;
+	const struct super_operations *op = inode->i_sb->s_op;
+	int drop;
 
-	if (!hlist_unhashed(&inode->i_hash)) {
-		if (!(inode->i_state & (I_DIRTY|I_SYNC)))
-			list_move(&inode->i_list, &inode_unused);
-		inodes_stat.nr_unused++;
+	if (op && op->drop_inode)
+		drop = op->drop_inode(inode);
+	else
+		drop = generic_drop_inode(inode);
+
+	if (!drop) {
 		if (sb->s_flags & MS_ACTIVE) {
+			inode->i_state |= I_REFERENCED;
+			if (!(inode->i_state & (I_DIRTY|I_SYNC))) {
+				inode_lru_list_add(inode);
+			}
 			spin_unlock(&inode_lock);
-			return 0;
+			return;
 		}
 		WARN_ON(inode->i_state & I_NEW);
 		inode->i_state |= I_WILL_FREE;
@@ -1251,65 +1368,28 @@ int generic_detach_inode(struct inode *inode)
 		spin_lock(&inode_lock);
 		WARN_ON(inode->i_state & I_NEW);
 		inode->i_state &= ~I_WILL_FREE;
-		inodes_stat.nr_unused--;
-		hlist_del_init(&inode->i_hash);
+		__remove_inode_hash(inode);
 	}
-	list_del_init(&inode->i_list);
-	list_del_init(&inode->i_sb_list);
+
 	WARN_ON(inode->i_state & I_NEW);
 	inode->i_state |= I_FREEING;
-	inodes_stat.nr_inodes--;
-	spin_unlock(&inode_lock);
-	return 1;
-}
-EXPORT_SYMBOL_GPL(generic_detach_inode);
 
-static void generic_forget_inode(struct inode *inode)
-{
-	if (!generic_detach_inode(inode))
-		return;
-	if (inode->i_data.nrpages)
-		truncate_inode_pages(&inode->i_data, 0);
-	clear_inode(inode);
+	/*
+	 * Move the inode off the IO lists and LRU once I_FREEING is
+	 * set so that it won't get moved back on there if it is dirty.
+	 */
+	inode_lru_list_del(inode);
+	list_del_init(&inode->i_wb_list);
+
+	__inode_sb_list_del(inode);
+	spin_unlock(&inode_lock);
+	evict(inode);
+	remove_inode_hash(inode);
 	wake_up_inode(inode);
+	BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
 	destroy_inode(inode);
 }
 
-/*
- * Normal UNIX filesystem behaviour: delete the
- * inode when the usage count drops to zero, and
- * i_nlink is zero.
- */
-void generic_drop_inode(struct inode *inode)
-{
-	if (!inode->i_nlink)
-		generic_delete_inode(inode);
-	else
-		generic_forget_inode(inode);
-}
-EXPORT_SYMBOL_GPL(generic_drop_inode);
-
-/*
- * Called when we're dropping the last reference
- * to an inode.
- *
- * Call the FS "drop()" function, defaulting to
- * the legacy UNIX filesystem behaviour..
- *
- * NOTE! NOTE! NOTE! We're called with the inode lock
- * held, and the drop function is supposed to release
- * the lock!
- */
-static inline void iput_final(struct inode *inode)
-{
-	const struct super_operations *op = inode->i_sb->s_op;
-	void (*drop)(struct inode *) = generic_drop_inode;
-
-	if (op && op->drop_inode)
-		drop = op->drop_inode;
-	drop(inode);
-}
-
 /**
  *	iput	- put an inode
  *	@inode: inode to put
@@ -1322,7 +1402,7 @@ static inline void iput_final(struct inode *inode)
 void iput(struct inode *inode)
 {
 	if (inode) {
-		BUG_ON(inode->i_state == I_CLEAR);
+		BUG_ON(inode->i_state & I_CLEAR);
 
 		if (atomic_dec_and_lock(&inode->i_count, &inode_lock))
 			iput_final(inode);
@@ -1568,6 +1648,8 @@ void __init inode_init(void)
 					 SLAB_MEM_SPREAD),
 					 init_once);
 	register_shrinker(&icache_shrinker);
+	percpu_counter_init(&nr_inodes, 0);
+	percpu_counter_init(&nr_inodes_unused, 0);
 
 	/* Hash may have been set up in inode_init_early */
 	if (!hashdist)
diff --git a/fs/internal.h b/fs/internal.h
index 6b706bc60a6..e43b9a4dbf4 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -9,6 +9,8 @@
  * 2 of the License, or (at your option) any later version.
  */
 
+#include <linux/lglock.h>
+
 struct super_block;
 struct linux_binprm;
 struct path;
@@ -70,7 +72,8 @@ extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int);
 
 extern void __init mnt_init(void);
 
-extern spinlock_t vfsmount_lock;
+DECLARE_BRLOCK(vfsmount_lock);
+
 
 /*
  * fs_struct.c
@@ -80,6 +83,8 @@ extern void chroot_fs_refs(struct path *, struct path *);
 /*
  * file_table.c
  */
+extern void file_sb_list_add(struct file *f, struct super_block *sb);
+extern void file_sb_list_del(struct file *f);
 extern void mark_files_ro(struct super_block *);
 extern struct file *get_empty_filp(void);
 
@@ -96,3 +101,10 @@ extern void put_super(struct super_block *sb);
 struct nameidata;
 extern struct file *nameidata_to_filp(struct nameidata *);
 extern void release_open_intent(struct nameidata *);
+
+/*
+ * inode.c
+ */
+extern int get_nr_dirty_inodes(void);
+extern void evict_inodes(struct super_block *);
+extern int invalidate_inodes(struct super_block *);
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 2d140a71386..e92fdbb3bc3 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -29,7 +29,6 @@
  * @arg:	command-specific argument for ioctl
  *
  * Invokes filesystem specific ->unlocked_ioctl, if one exists; otherwise
- * invokes filesystem specific ->ioctl method.  If neither method exists,
  * returns -ENOTTY.
  *
  * Returns 0 on success, -errno on error.
@@ -39,21 +38,12 @@ static long vfs_ioctl(struct file *filp, unsigned int cmd,
 {
 	int error = -ENOTTY;
 
-	if (!filp->f_op)
+	if (!filp->f_op || !filp->f_op->unlocked_ioctl)
 		goto out;
 
-	if (filp->f_op->unlocked_ioctl) {
-		error = filp->f_op->unlocked_ioctl(filp, cmd, arg);
-		if (error == -ENOIOCTLCMD)
-			error = -EINVAL;
-		goto out;
-	} else if (filp->f_op->ioctl) {
-		lock_kernel();
-		error = filp->f_op->ioctl(filp->f_path.dentry->d_inode,
-					  filp, cmd, arg);
-		unlock_kernel();
-	}
-
+	error = filp->f_op->unlocked_ioctl(filp, cmd, arg);
+	if (error == -ENOIOCTLCMD)
+		error = -EINVAL;
  out:
 	return error;
 }
@@ -540,6 +530,41 @@ static int ioctl_fsthaw(struct file *filp)
 	return thaw_super(sb);
 }
 
+static int ioctl_fstrim(struct file *filp, void __user *argp)
+{
+	struct super_block *sb = filp->f_path.dentry->d_inode->i_sb;
+	struct fstrim_range range;
+	int ret = 0;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	/* If filesystem doesn't support trim feature, return. */
+	if (sb->s_op->trim_fs == NULL)
+		return -EOPNOTSUPP;
+
+	/* If a blockdevice-backed filesystem isn't specified, return EINVAL. */
+	if (sb->s_bdev == NULL)
+		return -EINVAL;
+
+	if (argp == NULL) {
+		range.start = 0;
+		range.len = ULLONG_MAX;
+		range.minlen = 0;
+	} else if (copy_from_user(&range, argp, sizeof(range)))
+		return -EFAULT;
+
+	ret = sb->s_op->trim_fs(sb, &range);
+	if (ret < 0)
+		return ret;
+
+	if ((argp != NULL) &&
+	    (copy_to_user(argp, &range, sizeof(range))))
+		return -EFAULT;
+
+	return 0;
+}
+
 /*
  * When you add any new common ioctls to the switches above and below
  * please update compat_sys_ioctl() too.
@@ -590,6 +615,10 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
 		error = ioctl_fsthaw(filp);
 		break;
 
+	case FITRIM:
+		error = ioctl_fstrim(filp, argp);
+		break;
+
 	case FS_IOC_FIEMAP:
 		return ioctl_fiemap(filp, arg);
 
diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c
index e0aca9a0ac6..0542b6eedf8 100644
--- a/fs/isofs/dir.c
+++ b/fs/isofs/dir.c
@@ -10,7 +10,6 @@
  *
  *  isofs directory handling functions
  */
-#include <linux/smp_lock.h>
 #include <linux/gfp.h>
 #include "isofs.h"
 
@@ -255,18 +254,19 @@ static int isofs_readdir(struct file *filp,
 	char *tmpname;
 	struct iso_directory_record *tmpde;
 	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct isofs_sb_info *sbi = ISOFS_SB(inode->i_sb);
 
 	tmpname = (char *)__get_free_page(GFP_KERNEL);
 	if (tmpname == NULL)
 		return -ENOMEM;
 
-	lock_kernel();
+	mutex_lock(&sbi->s_mutex);
 	tmpde = (struct iso_directory_record *) (tmpname+1024);
 
 	result = do_isofs_readdir(inode, filp, dirent, filldir, tmpname, tmpde);
 
 	free_page((unsigned long) tmpname);
-	unlock_kernel();
+	mutex_unlock(&sbi->s_mutex);
 	return result;
 }
 
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 6b4dcd4f294..bfdeb82a53b 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -17,7 +17,6 @@
 #include <linux/slab.h>
 #include <linux/nls.h>
 #include <linux/ctype.h>
-#include <linux/smp_lock.h>
 #include <linux/statfs.h>
 #include <linux/cdrom.h>
 #include <linux/parser.h>
@@ -44,11 +43,7 @@ static void isofs_put_super(struct super_block *sb)
 	struct isofs_sb_info *sbi = ISOFS_SB(sb);
 
 #ifdef CONFIG_JOLIET
-	lock_kernel();
-
 	unload_nls(sbi->s_nls_iocharset);
-
-	unlock_kernel();
 #endif
 
 	kfree(sbi);
@@ -549,6 +544,34 @@ static unsigned int isofs_get_last_session(struct super_block *sb, s32 session)
 }
 
 /*
+ * Check if root directory is empty (has less than 3 files).
+ *
+ * Used to detect broken CDs where ISO root directory is empty but Joliet root
+ * directory is OK. If such CD has Rock Ridge extensions, they will be disabled
+ * (and Joliet used instead) or else no files would be visible.
+ */
+static bool rootdir_empty(struct super_block *sb, unsigned long block)
+{
+	int offset = 0, files = 0, de_len;
+	struct iso_directory_record *de;
+	struct buffer_head *bh;
+
+	bh = sb_bread(sb, block);
+	if (!bh)
+		return true;
+	while (files < 3) {
+		de = (struct iso_directory_record *) (bh->b_data + offset);
+		de_len = *(unsigned char *) de;
+		if (de_len == 0)
+			break;
+		files++;
+		offset += de_len;
+	}
+	brelse(bh);
+	return files < 3;
+}
+
+/*
  * Initialize the superblock and read the root inode.
  *
  * Note: a check_disk_change() has been done immediately prior
@@ -722,7 +745,12 @@ root_found:
 	}
 
 	s->s_magic = ISOFS_SUPER_MAGIC;
-	s->s_maxbytes = 0xffffffff; /* We can handle files up to 4 GB */
+
+	/*
+	 * With multi-extent files, file size is only limited by the maximum
+	 * size of a file system, which is 8 TB.
+	 */
+	s->s_maxbytes = 0x80000000000LL;
 
 	/*
 	 * The CDROM is read-only, has no nodes (devices) on it, and since
@@ -818,6 +846,7 @@ root_found:
 	sbi->s_utf8 = opt.utf8;
 	sbi->s_nocompress = opt.nocompress;
 	sbi->s_overriderockperm = opt.overriderockperm;
+	mutex_init(&sbi->s_mutex);
 	/*
 	 * It would be incredibly stupid to allow people to mark every file
 	 * on the disk as suid, so we merely allow them to set the default
@@ -842,6 +871,18 @@ root_found:
 		goto out_no_root;
 
 	/*
+	 * Fix for broken CDs with Rock Ridge and empty ISO root directory but
+	 * correct Joliet root directory.
+	 */
+	if (sbi->s_rock == 1 && joliet_level &&
+				rootdir_empty(s, sbi->s_firstdatazone)) {
+		printk(KERN_NOTICE
+			"ISOFS: primary root directory is empty. "
+			"Disabling Rock Ridge and switching to Joliet.");
+		sbi->s_rock = 0;
+	}
+
+	/*
 	 * If this disk has both Rock Ridge and Joliet on it, then we
 	 * want to use Rock Ridge by default.  This can be overridden
 	 * by using the norock mount option.  There is still one other
@@ -961,27 +1002,23 @@ static int isofs_statfs (struct dentry *dentry, struct kstatfs *buf)
  * or getblk() if they are not.  Returns the number of blocks inserted
  * (-ve == error.)
  */
-int isofs_get_blocks(struct inode *inode, sector_t iblock_s,
+int isofs_get_blocks(struct inode *inode, sector_t iblock,
 		     struct buffer_head **bh, unsigned long nblocks)
 {
-	unsigned long b_off;
+	unsigned long b_off = iblock;
 	unsigned offset, sect_size;
 	unsigned int firstext;
 	unsigned long nextblk, nextoff;
-	long iblock = (long)iblock_s;
 	int section, rv, error;
 	struct iso_inode_info *ei = ISOFS_I(inode);
 
-	lock_kernel();
-
 	error = -EIO;
 	rv = 0;
-	if (iblock < 0 || iblock != iblock_s) {
+	if (iblock != b_off) {
 		printk(KERN_DEBUG "%s: block number too large\n", __func__);
 		goto abort;
 	}
 
-	b_off = iblock;
 
 	offset = 0;
 	firstext = ei->i_first_extent;
@@ -999,8 +1036,9 @@ int isofs_get_blocks(struct inode *inode, sector_t iblock_s,
 		 * I/O errors.
 		 */
 		if (b_off > ((inode->i_size + PAGE_CACHE_SIZE - 1) >> ISOFS_BUFFER_BITS(inode))) {
-			printk(KERN_DEBUG "%s: block >= EOF (%ld, %ld)\n",
-				__func__, iblock, (unsigned long) inode->i_size);
+			printk(KERN_DEBUG "%s: block >= EOF (%lu, %llu)\n",
+				__func__, b_off,
+				(unsigned long long)inode->i_size);
 			goto abort;
 		}
 
@@ -1026,9 +1064,9 @@ int isofs_get_blocks(struct inode *inode, sector_t iblock_s,
 			if (++section > 100) {
 				printk(KERN_DEBUG "%s: More than 100 file sections ?!?"
 					" aborting...\n", __func__);
-				printk(KERN_DEBUG "%s: block=%ld firstext=%u sect_size=%u "
+				printk(KERN_DEBUG "%s: block=%lu firstext=%u sect_size=%u "
 					"nextblk=%lu nextoff=%lu\n", __func__,
-					iblock, firstext, (unsigned) sect_size,
+					b_off, firstext, (unsigned) sect_size,
 					nextblk, nextoff);
 				goto abort;
 			}
@@ -1049,7 +1087,6 @@ int isofs_get_blocks(struct inode *inode, sector_t iblock_s,
 
 	error = 0;
 abort:
-	unlock_kernel();
 	return rv != 0 ? rv : error;
 }
 
@@ -1470,17 +1507,16 @@ struct inode *isofs_iget(struct super_block *sb,
 	return inode;
 }
 
-static int isofs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *isofs_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, isofs_fill_super,
-				mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, isofs_fill_super);
 }
 
 static struct file_system_type iso9660_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "iso9660",
-	.get_sb		= isofs_get_sb,
+	.mount		= isofs_mount,
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
diff --git a/fs/isofs/isofs.h b/fs/isofs/isofs.h
index 7d33de84f52..2882dc089f8 100644
--- a/fs/isofs/isofs.h
+++ b/fs/isofs/isofs.h
@@ -55,6 +55,7 @@ struct isofs_sb_info {
 	gid_t s_gid;
 	uid_t s_uid;
 	struct nls_table *s_nls_iocharset; /* Native language support table */
+	struct mutex s_mutex; /* replaces BKL, please remove if possible */
 };
 
 #define ISOFS_INVALID_MODE ((mode_t) -1)
diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c
index ab438beb867..0d23abfd428 100644
--- a/fs/isofs/namei.c
+++ b/fs/isofs/namei.c
@@ -6,7 +6,6 @@
  *  (C) 1991  Linus Torvalds - minix filesystem
  */
 
-#include <linux/smp_lock.h>
 #include <linux/gfp.h>
 #include "isofs.h"
 
@@ -168,6 +167,7 @@ struct dentry *isofs_lookup(struct inode *dir, struct dentry *dentry, struct nam
 	int found;
 	unsigned long uninitialized_var(block);
 	unsigned long uninitialized_var(offset);
+	struct isofs_sb_info *sbi = ISOFS_SB(dir->i_sb);
 	struct inode *inode;
 	struct page *page;
 
@@ -177,7 +177,7 @@ struct dentry *isofs_lookup(struct inode *dir, struct dentry *dentry, struct nam
 	if (!page)
 		return ERR_PTR(-ENOMEM);
 
-	lock_kernel();
+	mutex_lock(&sbi->s_mutex);
 	found = isofs_find_entry(dir, dentry,
 				&block, &offset,
 				page_address(page),
@@ -188,10 +188,10 @@ struct dentry *isofs_lookup(struct inode *dir, struct dentry *dentry, struct nam
 	if (found) {
 		inode = isofs_iget(dir->i_sb, block, offset);
 		if (IS_ERR(inode)) {
-			unlock_kernel();
+			mutex_unlock(&sbi->s_mutex);
 			return ERR_CAST(inode);
 		}
 	}
-	unlock_kernel();
+	mutex_unlock(&sbi->s_mutex);
 	return d_splice_alias(inode, dentry);
 }
diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c
index 96a685c550f..f9cd04db6ea 100644
--- a/fs/isofs/rock.c
+++ b/fs/isofs/rock.c
@@ -8,7 +8,6 @@
 
 #include <linux/slab.h>
 #include <linux/pagemap.h>
-#include <linux/smp_lock.h>
 
 #include "isofs.h"
 #include "rock.h"
@@ -661,6 +660,7 @@ static int rock_ridge_symlink_readpage(struct file *file, struct page *page)
 {
 	struct inode *inode = page->mapping->host;
 	struct iso_inode_info *ei = ISOFS_I(inode);
+	struct isofs_sb_info *sbi = ISOFS_SB(inode->i_sb);
 	char *link = kmap(page);
 	unsigned long bufsize = ISOFS_BUFFER_SIZE(inode);
 	struct buffer_head *bh;
@@ -673,12 +673,12 @@ static int rock_ridge_symlink_readpage(struct file *file, struct page *page)
 	struct rock_state rs;
 	int ret;
 
-	if (!ISOFS_SB(inode->i_sb)->s_rock)
+	if (!sbi->s_rock)
 		goto error;
 
 	init_rock_state(&rs, inode);
 	block = ei->i_iget5_block;
-	lock_kernel();
+	mutex_lock(&sbi->s_mutex);
 	bh = sb_bread(inode->i_sb, block);
 	if (!bh)
 		goto out_noread;
@@ -748,7 +748,7 @@ repeat:
 		goto fail;
 	brelse(bh);
 	*rpnt = '\0';
-	unlock_kernel();
+	mutex_unlock(&sbi->s_mutex);
 	SetPageUptodate(page);
 	kunmap(page);
 	unlock_page(page);
@@ -765,7 +765,7 @@ out_bad_span:
 	printk("symlink spans iso9660 blocks\n");
 fail:
 	brelse(bh);
-	unlock_kernel();
+	mutex_unlock(&sbi->s_mutex);
 error:
 	SetPageError(page);
 	kunmap(page);
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index b0435dd0654..e4b87bc1fa5 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -221,7 +221,7 @@ restart:
 			goto restart;
 		}
 		if (buffer_locked(bh)) {
-			atomic_inc(&bh->b_count);
+			get_bh(bh);
 			spin_unlock(&journal->j_list_lock);
 			jbd_unlock_bh_state(bh);
 			wait_on_buffer(bh);
@@ -254,7 +254,9 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
 {
 	int i;
 
-	ll_rw_block(SWRITE, *batch_count, bhs);
+	for (i = 0; i < *batch_count; i++)
+		write_dirty_buffer(bhs[i], WRITE);
+
 	for (i = 0; i < *batch_count; i++) {
 		struct buffer_head *bh = bhs[i];
 		clear_buffer_jwrite(bh);
@@ -281,7 +283,7 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
 	int ret = 0;
 
 	if (buffer_locked(bh)) {
-		atomic_inc(&bh->b_count);
+		get_bh(bh);
 		spin_unlock(&journal->j_list_lock);
 		jbd_unlock_bh_state(bh);
 		wait_on_buffer(bh);
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 28a9ddaa0c4..34a4861c14b 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -119,7 +119,6 @@ static int journal_write_commit_record(journal_t *journal,
 	struct buffer_head *bh;
 	journal_header_t *header;
 	int ret;
-	int barrier_done = 0;
 
 	if (is_journal_aborted(journal))
 		return 0;
@@ -137,34 +136,12 @@ static int journal_write_commit_record(journal_t *journal,
 
 	JBUFFER_TRACE(descriptor, "write commit block");
 	set_buffer_dirty(bh);
-	if (journal->j_flags & JFS_BARRIER) {
-		set_buffer_ordered(bh);
-		barrier_done = 1;
-	}
-	ret = sync_dirty_buffer(bh);
-	if (barrier_done)
-		clear_buffer_ordered(bh);
-	/* is it possible for another commit to fail at roughly
-	 * the same time as this one?  If so, we don't want to
-	 * trust the barrier flag in the super, but instead want
-	 * to remember if we sent a barrier request
-	 */
-	if (ret == -EOPNOTSUPP && barrier_done) {
-		char b[BDEVNAME_SIZE];
-
-		printk(KERN_WARNING
-			"JBD: barrier-based sync failed on %s - "
-			"disabling barriers\n",
-			bdevname(journal->j_dev, b));
-		spin_lock(&journal->j_state_lock);
-		journal->j_flags &= ~JFS_BARRIER;
-		spin_unlock(&journal->j_state_lock);
 
-		/* And try again, without the barrier */
-		set_buffer_uptodate(bh);
-		set_buffer_dirty(bh);
+	if (journal->j_flags & JFS_BARRIER)
+		ret = __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_FLUSH_FUA);
+	else
 		ret = sync_dirty_buffer(bh);
-	}
+
 	put_bh(bh);		/* One for getblk() */
 	journal_put_journal_head(descriptor);
 
@@ -317,7 +294,7 @@ void journal_commit_transaction(journal_t *journal)
 	int first_tag = 0;
 	int tag_flag;
 	int i;
-	int write_op = WRITE;
+	int write_op = WRITE_SYNC;
 
 	/*
 	 * First job: lock down the current transaction and wait for
@@ -610,13 +587,13 @@ void journal_commit_transaction(journal_t *journal)
 		/* Bump b_count to prevent truncate from stumbling over
                    the shadowed buffer!  @@@ This can go if we ever get
                    rid of the BJ_IO/BJ_Shadow pairing of buffers. */
-		atomic_inc(&jh2bh(jh)->b_count);
+		get_bh(jh2bh(jh));
 
 		/* Make a temporary IO buffer with which to write it out
                    (this will requeue both the metadata buffer and the
                    temporary IO buffer). new_bh goes on BJ_IO*/
 
-		set_bit(BH_JWrite, &jh2bh(jh)->b_state);
+		set_buffer_jwrite(jh2bh(jh));
 		/*
 		 * akpm: journal_write_metadata_buffer() sets
 		 * new_bh->b_transaction to commit_transaction.
@@ -626,7 +603,7 @@ void journal_commit_transaction(journal_t *journal)
 		JBUFFER_TRACE(jh, "ph3: write metadata");
 		flags = journal_write_metadata_buffer(commit_transaction,
 						      jh, &new_jh, blocknr);
-		set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
+		set_buffer_jwrite(jh2bh(new_jh));
 		wbuf[bufs++] = jh2bh(new_jh);
 
 		/* Record the new block's tag in the current descriptor
@@ -736,7 +713,7 @@ wait_for_iobuf:
                    shadowed buffer */
 		jh = commit_transaction->t_shadow_list->b_tprev;
 		bh = jh2bh(jh);
-		clear_bit(BH_JWrite, &bh->b_state);
+		clear_buffer_jwrite(bh);
 		J_ASSERT_BH(bh, buffer_jbddirty(bh));
 
 		/* The metadata is now released for reuse, but we need
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 93d1e47647b..da1b5e4ffce 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -36,6 +36,7 @@
 #include <linux/poison.h>
 #include <linux/proc_fs.h>
 #include <linux/debugfs.h>
+#include <linux/ratelimit.h>
 
 #include <asm/uaccess.h>
 #include <asm/page.h>
@@ -84,6 +85,7 @@ EXPORT_SYMBOL(journal_force_commit);
 
 static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
 static void __journal_abort_soft (journal_t *journal, int errno);
+static const char *journal_dev_name(journal_t *journal, char *buffer);
 
 /*
  * Helper function used to manage commit timeouts
@@ -439,7 +441,7 @@ int __log_start_commit(journal_t *journal, tid_t target)
 	 */
 	if (!tid_geq(journal->j_commit_request, target)) {
 		/*
-		 * We want a new commit: OK, mark the request and wakup the
+		 * We want a new commit: OK, mark the request and wakeup the
 		 * commit thread.  We do _not_ do the commit ourselves.
 		 */
 
@@ -950,6 +952,8 @@ int journal_create(journal_t *journal)
 		if (err)
 			return err;
 		bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
+		if (unlikely(!bh))
+			return -ENOMEM;
 		lock_buffer(bh);
 		memset (bh->b_data, 0, journal->j_blocksize);
 		BUFFER_TRACE(bh, "marking dirty");
@@ -1010,6 +1014,23 @@ void journal_update_superblock(journal_t *journal, int wait)
 		goto out;
 	}
 
+	if (buffer_write_io_error(bh)) {
+		char b[BDEVNAME_SIZE];
+		/*
+		 * Oh, dear.  A previous attempt to write the journal
+		 * superblock failed.  This could happen because the
+		 * USB device was yanked out.  Or it could happen to
+		 * be a transient write error and maybe the block will
+		 * be remapped.  Nothing we can do but to retry the
+		 * write and hope for the best.
+		 */
+		printk(KERN_ERR "JBD: previous I/O error detected "
+		       "for journal superblock update for %s.\n",
+		       journal_dev_name(journal, b));
+		clear_buffer_write_io_error(bh);
+		set_buffer_uptodate(bh);
+	}
+
 	spin_lock(&journal->j_state_lock);
 	jbd_debug(1,"JBD: updating superblock (start %u, seq %d, errno %d)\n",
 		  journal->j_tail, journal->j_tail_sequence, journal->j_errno);
@@ -1021,10 +1042,18 @@ void journal_update_superblock(journal_t *journal, int wait)
 
 	BUFFER_TRACE(bh, "marking dirty");
 	mark_buffer_dirty(bh);
-	if (wait)
+	if (wait) {
 		sync_dirty_buffer(bh);
-	else
-		ll_rw_block(SWRITE, 1, &bh);
+		if (buffer_write_io_error(bh)) {
+			char b[BDEVNAME_SIZE];
+			printk(KERN_ERR "JBD: I/O error detected "
+			       "when updating journal superblock for %s.\n",
+			       journal_dev_name(journal, b));
+			clear_buffer_write_io_error(bh);
+			set_buffer_uptodate(bh);
+		}
+	} else
+		write_dirty_buffer(bh, WRITE);
 
 out:
 	/* If we have just flushed the log (by marking s_start==0), then
@@ -1281,13 +1310,9 @@ int journal_check_used_features (journal_t *journal, unsigned long compat,
 int journal_check_available_features (journal_t *journal, unsigned long compat,
 				      unsigned long ro, unsigned long incompat)
 {
-	journal_superblock_t *sb;
-
 	if (!compat && !ro && !incompat)
 		return 1;
 
-	sb = journal->j_superblock;
-
 	/* We can support any known requested features iff the
 	 * superblock is in version 2.  Otherwise we fail to support any
 	 * extended sb features. */
@@ -1481,7 +1506,6 @@ int journal_flush(journal_t *journal)
 
 int journal_wipe(journal_t *journal, int write)
 {
-	journal_superblock_t *sb;
 	int err = 0;
 
 	J_ASSERT (!(journal->j_flags & JFS_LOADED));
@@ -1490,8 +1514,6 @@ int journal_wipe(journal_t *journal, int write)
 	if (err)
 		return err;
 
-	sb = journal->j_superblock;
-
 	if (!journal->j_tail)
 		goto no_recovery;
 
@@ -1726,7 +1748,6 @@ static void journal_destroy_journal_head_cache(void)
 static struct journal_head *journal_alloc_journal_head(void)
 {
 	struct journal_head *ret;
-	static unsigned long last_warning;
 
 #ifdef CONFIG_JBD_DEBUG
 	atomic_inc(&nr_journal_heads);
@@ -1734,11 +1755,9 @@ static struct journal_head *journal_alloc_journal_head(void)
 	ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS);
 	if (ret == NULL) {
 		jbd_debug(1, "out of memory for journal_head\n");
-		if (time_after(jiffies, last_warning + 5*HZ)) {
-			printk(KERN_NOTICE "ENOMEM in %s, retrying.\n",
-			       __func__);
-			last_warning = jiffies;
-		}
+		printk_ratelimited(KERN_NOTICE "ENOMEM in %s, retrying.\n",
+				   __func__);
+
 		while (ret == NULL) {
 			yield();
 			ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS);
diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c
index 54c9bc9e1b1..5b43e96788e 100644
--- a/fs/jbd/recovery.c
+++ b/fs/jbd/recovery.c
@@ -283,12 +283,9 @@ int journal_recover(journal_t *journal)
 int journal_skip_recovery(journal_t *journal)
 {
 	int			err;
-	journal_superblock_t *	sb;
-
 	struct recovery_info	info;
 
 	memset (&info, 0, sizeof(info));
-	sb = journal->j_superblock;
 
 	err = do_one_pass(journal, &info, PASS_SCAN);
 
@@ -297,11 +294,12 @@ int journal_skip_recovery(journal_t *journal)
 		++journal->j_transaction_sequence;
 	} else {
 #ifdef CONFIG_JBD_DEBUG
-		int dropped = info.end_transaction - be32_to_cpu(sb->s_sequence);
-#endif
+		int dropped = info.end_transaction -
+			      be32_to_cpu(journal->j_superblock->s_sequence);
 		jbd_debug(1,
 			  "JBD: ignoring %d transaction%s from the journal.\n",
 			  dropped, (dropped == 1) ? "" : "s");
+#endif
 		journal->j_transaction_sequence = ++info.end_transaction;
 	}
 
@@ -321,11 +319,6 @@ static int do_one_pass(journal_t *journal,
 	unsigned int		sequence;
 	int			blocktype;
 
-	/* Precompute the maximum metadata descriptors in a descriptor block */
-	int			MAX_BLOCKS_PER_DESC;
-	MAX_BLOCKS_PER_DESC = ((journal->j_blocksize-sizeof(journal_header_t))
-			       / sizeof(journal_block_tag_t));
-
 	/*
 	 * First thing is to establish what we expect to find in the log
 	 * (in terms of transaction IDs), and where (in terms of log
diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c
index ad717328343..d29018307e2 100644
--- a/fs/jbd/revoke.c
+++ b/fs/jbd/revoke.c
@@ -617,7 +617,7 @@ static void flush_descriptor(journal_t *journal,
 	set_buffer_jwrite(bh);
 	BUFFER_TRACE(bh, "write");
 	set_buffer_dirty(bh);
-	ll_rw_block((write_op == WRITE) ? SWRITE : SWRITE_SYNC_PLUG, 1, &bh);
+	write_dirty_buffer(bh, write_op);
 }
 #endif
 
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 5ae71e75a49..846a3f31411 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -293,9 +293,7 @@ handle_t *journal_start(journal_t *journal, int nblocks)
 		jbd_free_handle(handle);
 		current->journal_info = NULL;
 		handle = ERR_PTR(err);
-		goto out;
 	}
-out:
 	return handle;
 }
 
@@ -528,7 +526,7 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
 	transaction = handle->h_transaction;
 	journal = transaction->t_journal;
 
-	jbd_debug(5, "buffer_head %p, force_copy %d\n", jh, force_copy);
+	jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy);
 
 	JBUFFER_TRACE(jh, "entry");
 repeat:
@@ -713,7 +711,7 @@ done:
 		J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)),
 			    "Possible IO failure.\n");
 		page = jh2bh(jh)->b_page;
-		offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK;
+		offset = offset_in_page(jh2bh(jh)->b_data);
 		source = kmap_atomic(page, KM_USER0);
 		memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
 		kunmap_atomic(source, KM_USER0);
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 076d1cc44f9..6a79fd0a1a3 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -118,13 +118,13 @@ static int __try_to_free_cp_buf(struct journal_head *jh)
 void __jbd2_log_wait_for_space(journal_t *journal)
 {
 	int nblocks, space_left;
-	assert_spin_locked(&journal->j_state_lock);
+	/* assert_spin_locked(&journal->j_state_lock); */
 
 	nblocks = jbd_space_needed(journal);
 	while (__jbd2_log_space_left(journal) < nblocks) {
 		if (journal->j_flags & JBD2_ABORT)
 			return;
-		spin_unlock(&journal->j_state_lock);
+		write_unlock(&journal->j_state_lock);
 		mutex_lock(&journal->j_checkpoint_mutex);
 
 		/*
@@ -138,7 +138,7 @@ void __jbd2_log_wait_for_space(journal_t *journal)
 		 * filesystem, so abort the journal and leave a stack
 		 * trace for forensic evidence.
 		 */
-		spin_lock(&journal->j_state_lock);
+		write_lock(&journal->j_state_lock);
 		spin_lock(&journal->j_list_lock);
 		nblocks = jbd_space_needed(journal);
 		space_left = __jbd2_log_space_left(journal);
@@ -149,7 +149,7 @@ void __jbd2_log_wait_for_space(journal_t *journal)
 			if (journal->j_committing_transaction)
 				tid = journal->j_committing_transaction->t_tid;
 			spin_unlock(&journal->j_list_lock);
-			spin_unlock(&journal->j_state_lock);
+			write_unlock(&journal->j_state_lock);
 			if (chkpt) {
 				jbd2_log_do_checkpoint(journal);
 			} else if (jbd2_cleanup_journal_tail(journal) == 0) {
@@ -167,7 +167,7 @@ void __jbd2_log_wait_for_space(journal_t *journal)
 				WARN_ON(1);
 				jbd2_journal_abort(journal, 0);
 			}
-			spin_lock(&journal->j_state_lock);
+			write_lock(&journal->j_state_lock);
 		} else {
 			spin_unlock(&journal->j_list_lock);
 		}
@@ -255,7 +255,9 @@ __flush_batch(journal_t *journal, int *batch_count)
 {
 	int i;
 
-	ll_rw_block(SWRITE, *batch_count, journal->j_chkpt_bhs);
+	for (i = 0; i < *batch_count; i++)
+		write_dirty_buffer(journal->j_chkpt_bhs[i], WRITE);
+
 	for (i = 0; i < *batch_count; i++) {
 		struct buffer_head *bh = journal->j_chkpt_bhs[i];
 		clear_buffer_jwrite(bh);
@@ -297,6 +299,16 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
 		transaction->t_chp_stats.cs_forced_to_close++;
 		spin_unlock(&journal->j_list_lock);
 		jbd_unlock_bh_state(bh);
+		if (unlikely(journal->j_flags & JBD2_UNMOUNT))
+			/*
+			 * The journal thread is dead; so starting and
+			 * waiting for a commit to finish will cause
+			 * us to wait for a _very_ long time.
+			 */
+			printk(KERN_ERR "JBD2: %s: "
+			       "Waiting for Godot: block %llu\n",
+			       journal->j_devname,
+			       (unsigned long long) bh->b_blocknr);
 		jbd2_log_start_commit(journal, tid);
 		jbd2_log_wait_commit(journal, tid);
 		ret = 1;
@@ -474,7 +486,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
 	 * next transaction ID we will write, and where it will
 	 * start. */
 
-	spin_lock(&journal->j_state_lock);
+	write_lock(&journal->j_state_lock);
 	spin_lock(&journal->j_list_lock);
 	transaction = journal->j_checkpoint_transactions;
 	if (transaction) {
@@ -496,7 +508,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
 	/* If the oldest pinned transaction is at the tail of the log
            already then there's not much we can do right now. */
 	if (journal->j_tail_sequence == first_tid) {
-		spin_unlock(&journal->j_state_lock);
+		write_unlock(&journal->j_state_lock);
 		return 1;
 	}
 
@@ -516,7 +528,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
 	journal->j_free += freed;
 	journal->j_tail_sequence = first_tid;
 	journal->j_tail = blocknr;
-	spin_unlock(&journal->j_state_lock);
+	write_unlock(&journal->j_state_lock);
 
 	/*
 	 * If there is an external journal, we need to make sure that
@@ -530,8 +542,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
 	 */
 	if ((journal->j_fs_dev != journal->j_dev) &&
 	    (journal->j_flags & JBD2_BARRIER))
-		blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL,
-			BLKDEV_IFL_WAIT);
+		blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
 	if (!(journal->j_flags & JBD2_ABORT))
 		jbd2_journal_update_superblock(journal, 1);
 	return 0;
@@ -775,7 +786,7 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact
 	J_ASSERT(transaction->t_log_list == NULL);
 	J_ASSERT(transaction->t_checkpoint_list == NULL);
 	J_ASSERT(transaction->t_checkpoint_io_list == NULL);
-	J_ASSERT(transaction->t_updates == 0);
+	J_ASSERT(atomic_read(&transaction->t_updates) == 0);
 	J_ASSERT(journal->j_committing_transaction != transaction);
 	J_ASSERT(journal->j_running_transaction != transaction);
 
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 75716d3d2be..f3ad1598b20 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -26,7 +26,9 @@
 #include <linux/backing-dev.h>
 #include <linux/bio.h>
 #include <linux/blkdev.h>
+#include <linux/bitops.h>
 #include <trace/events/jbd2.h>
+#include <asm/system.h>
 
 /*
  * Default IO end handler for temporary BJ_IO buffer_heads.
@@ -101,7 +103,6 @@ static int journal_submit_commit_record(journal_t *journal,
 	struct commit_header *tmp;
 	struct buffer_head *bh;
 	int ret;
-	int barrier_done = 0;
 	struct timespec now = current_kernel_time();
 
 	if (is_journal_aborted(journal))
@@ -135,33 +136,11 @@ static int journal_submit_commit_record(journal_t *journal,
 
 	if (journal->j_flags & JBD2_BARRIER &&
 	    !JBD2_HAS_INCOMPAT_FEATURE(journal,
-				       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
-		set_buffer_ordered(bh);
-		barrier_done = 1;
-	}
-	ret = submit_bh(WRITE_SYNC_PLUG, bh);
-	if (barrier_done)
-		clear_buffer_ordered(bh);
-
-	/* is it possible for another commit to fail at roughly
-	 * the same time as this one?  If so, we don't want to
-	 * trust the barrier flag in the super, but instead want
-	 * to remember if we sent a barrier request
-	 */
-	if (ret == -EOPNOTSUPP && barrier_done) {
-		printk(KERN_WARNING
-		       "JBD: barrier-based sync failed on %s - "
-		       "disabling barriers\n", journal->j_devname);
-		spin_lock(&journal->j_state_lock);
-		journal->j_flags &= ~JBD2_BARRIER;
-		spin_unlock(&journal->j_state_lock);
-
-		/* And try again, without the barrier */
-		lock_buffer(bh);
-		set_buffer_uptodate(bh);
-		clear_buffer_dirty(bh);
+				       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
+		ret = submit_bh(WRITE_SYNC_PLUG | WRITE_FLUSH_FUA, bh);
+	else
 		ret = submit_bh(WRITE_SYNC_PLUG, bh);
-	}
+
 	*cbh = bh;
 	return ret;
 }
@@ -175,29 +154,8 @@ static int journal_wait_on_commit_record(journal_t *journal,
 {
 	int ret = 0;
 
-retry:
 	clear_buffer_dirty(bh);
 	wait_on_buffer(bh);
-	if (buffer_eopnotsupp(bh) && (journal->j_flags & JBD2_BARRIER)) {
-		printk(KERN_WARNING
-		       "JBD2: wait_on_commit_record: sync failed on %s - "
-		       "disabling barriers\n", journal->j_devname);
-		spin_lock(&journal->j_state_lock);
-		journal->j_flags &= ~JBD2_BARRIER;
-		spin_unlock(&journal->j_state_lock);
-
-		lock_buffer(bh);
-		clear_buffer_dirty(bh);
-		set_buffer_uptodate(bh);
-		bh->b_end_io = journal_end_buffer_io_sync;
-
-		ret = submit_bh(WRITE_SYNC_PLUG, bh);
-		if (ret) {
-			unlock_buffer(bh);
-			return ret;
-		}
-		goto retry;
-	}
 
 	if (unlikely(!buffer_uptodate(bh)))
 		ret = -EIO;
@@ -245,7 +203,7 @@ static int journal_submit_data_buffers(journal_t *journal,
 	spin_lock(&journal->j_list_lock);
 	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
 		mapping = jinode->i_vfs_inode->i_mapping;
-		jinode->i_flags |= JI_COMMIT_RUNNING;
+		set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
 		spin_unlock(&journal->j_list_lock);
 		/*
 		 * submit the inode data buffers. We use writepage
@@ -260,7 +218,8 @@ static int journal_submit_data_buffers(journal_t *journal,
 		spin_lock(&journal->j_list_lock);
 		J_ASSERT(jinode->i_transaction == commit_transaction);
 		commit_transaction->t_flushed_data_blocks = 1;
-		jinode->i_flags &= ~JI_COMMIT_RUNNING;
+		clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
+		smp_mb__after_clear_bit();
 		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
 	}
 	spin_unlock(&journal->j_list_lock);
@@ -281,7 +240,7 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
 	/* For locking, see the comment in journal_submit_data_buffers() */
 	spin_lock(&journal->j_list_lock);
 	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
-		jinode->i_flags |= JI_COMMIT_RUNNING;
+		set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
 		spin_unlock(&journal->j_list_lock);
 		err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
 		if (err) {
@@ -297,7 +256,8 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
 				ret = err;
 		}
 		spin_lock(&journal->j_list_lock);
-		jinode->i_flags &= ~JI_COMMIT_RUNNING;
+		clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
+		smp_mb__after_clear_bit();
 		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
 	}
 
@@ -369,7 +329,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	int tag_bytes = journal_tag_bytes(journal);
 	struct buffer_head *cbh = NULL; /* For transactional checksums */
 	__u32 crc32_sum = ~0;
-	int write_op = WRITE;
+	int write_op = WRITE_SYNC;
 
 	/*
 	 * First job: lock down the current transaction and wait for
@@ -400,7 +360,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	jbd_debug(1, "JBD: starting commit of transaction %d\n",
 			commit_transaction->t_tid);
 
-	spin_lock(&journal->j_state_lock);
+	write_lock(&journal->j_state_lock);
 	commit_transaction->t_state = T_LOCKED;
 
 	/*
@@ -417,23 +377,23 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 					      stats.run.rs_locked);
 
 	spin_lock(&commit_transaction->t_handle_lock);
-	while (commit_transaction->t_updates) {
+	while (atomic_read(&commit_transaction->t_updates)) {
 		DEFINE_WAIT(wait);
 
 		prepare_to_wait(&journal->j_wait_updates, &wait,
 					TASK_UNINTERRUPTIBLE);
-		if (commit_transaction->t_updates) {
+		if (atomic_read(&commit_transaction->t_updates)) {
 			spin_unlock(&commit_transaction->t_handle_lock);
-			spin_unlock(&journal->j_state_lock);
+			write_unlock(&journal->j_state_lock);
 			schedule();
-			spin_lock(&journal->j_state_lock);
+			write_lock(&journal->j_state_lock);
 			spin_lock(&commit_transaction->t_handle_lock);
 		}
 		finish_wait(&journal->j_wait_updates, &wait);
 	}
 	spin_unlock(&commit_transaction->t_handle_lock);
 
-	J_ASSERT (commit_transaction->t_outstanding_credits <=
+	J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
 			journal->j_max_transaction_buffers);
 
 	/*
@@ -497,7 +457,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	start_time = ktime_get();
 	commit_transaction->t_log_start = journal->j_head;
 	wake_up(&journal->j_wait_transaction_locked);
-	spin_unlock(&journal->j_state_lock);
+	write_unlock(&journal->j_state_lock);
 
 	jbd_debug (3, "JBD: commit phase 2\n");
 
@@ -519,19 +479,20 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	 * transaction!  Now comes the tricky part: we need to write out
 	 * metadata.  Loop over the transaction's entire buffer list:
 	 */
-	spin_lock(&journal->j_state_lock);
+	write_lock(&journal->j_state_lock);
 	commit_transaction->t_state = T_COMMIT;
-	spin_unlock(&journal->j_state_lock);
+	write_unlock(&journal->j_state_lock);
 
 	trace_jbd2_commit_logging(journal, commit_transaction);
 	stats.run.rs_logging = jiffies;
 	stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
 					       stats.run.rs_logging);
-	stats.run.rs_blocks = commit_transaction->t_outstanding_credits;
+	stats.run.rs_blocks =
+		atomic_read(&commit_transaction->t_outstanding_credits);
 	stats.run.rs_blocks_logged = 0;
 
 	J_ASSERT(commit_transaction->t_nr_buffers <=
-		 commit_transaction->t_outstanding_credits);
+		 atomic_read(&commit_transaction->t_outstanding_credits));
 
 	err = 0;
 	descriptor = NULL;
@@ -616,7 +577,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 		 * the free space in the log, but this counter is changed
 		 * by jbd2_journal_next_log_block() also.
 		 */
-		commit_transaction->t_outstanding_credits--;
+		atomic_dec(&commit_transaction->t_outstanding_credits);
 
 		/* Bump b_count to prevent truncate from stumbling over
                    the shadowed buffer!  @@@ This can go if we ever get
@@ -709,6 +670,16 @@ start_journal_io:
 		}
 	}
 
+	err = journal_finish_inode_data_buffers(journal, commit_transaction);
+	if (err) {
+		printk(KERN_WARNING
+			"JBD2: Detected IO errors while flushing file data "
+		       "on %s\n", journal->j_devname);
+		if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
+			jbd2_journal_abort(journal, err);
+		err = 0;
+	}
+
 	/* 
 	 * If the journal is not located on the file system device,
 	 * then we must flush the file system device before we issue
@@ -717,8 +688,7 @@ start_journal_io:
 	if (commit_transaction->t_flushed_data_blocks &&
 	    (journal->j_fs_dev != journal->j_dev) &&
 	    (journal->j_flags & JBD2_BARRIER))
-		blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL,
-			BLKDEV_IFL_WAIT);
+		blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
 
 	/* Done it all: now write the commit record asynchronously. */
 	if (JBD2_HAS_INCOMPAT_FEATURE(journal,
@@ -727,19 +697,6 @@ start_journal_io:
 						 &cbh, crc32_sum);
 		if (err)
 			__jbd2_journal_abort_hard(journal);
-		if (journal->j_flags & JBD2_BARRIER)
-			blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL,
-				BLKDEV_IFL_WAIT);
-	}
-
-	err = journal_finish_inode_data_buffers(journal, commit_transaction);
-	if (err) {
-		printk(KERN_WARNING
-			"JBD2: Detected IO errors while flushing file data "
-		       "on %s\n", journal->j_devname);
-		if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
-			jbd2_journal_abort(journal, err);
-		err = 0;
 	}
 
 	/* Lo and behold: we have just managed to send a transaction to
@@ -853,6 +810,11 @@ wait_for_iobuf:
 	}
 	if (!err && !is_journal_aborted(journal))
 		err = journal_wait_on_commit_record(journal, cbh);
+	if (JBD2_HAS_INCOMPAT_FEATURE(journal,
+				      JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) &&
+	    journal->j_flags & JBD2_BARRIER) {
+		blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL);
+	}
 
 	if (err)
 		jbd2_journal_abort(journal, err);
@@ -977,7 +939,7 @@ restart_loop:
 	 * __jbd2_journal_drop_transaction(). Otherwise we could race with
 	 * other checkpointing code processing the transaction...
 	 */
-	spin_lock(&journal->j_state_lock);
+	write_lock(&journal->j_state_lock);
 	spin_lock(&journal->j_list_lock);
 	/*
 	 * Now recheck if some buffers did not get attached to the transaction
@@ -985,7 +947,7 @@ restart_loop:
 	 */
 	if (commit_transaction->t_forget) {
 		spin_unlock(&journal->j_list_lock);
-		spin_unlock(&journal->j_state_lock);
+		write_unlock(&journal->j_state_lock);
 		goto restart_loop;
 	}
 
@@ -1003,7 +965,8 @@ restart_loop:
 	 * File the transaction statistics
 	 */
 	stats.ts_tid = commit_transaction->t_tid;
-	stats.run.rs_handle_count = commit_transaction->t_handle_count;
+	stats.run.rs_handle_count =
+		atomic_read(&commit_transaction->t_handle_count);
 	trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
 			     commit_transaction->t_tid, &stats.run);
 
@@ -1037,7 +1000,7 @@ restart_loop:
 				journal->j_average_commit_time*3) / 4;
 	else
 		journal->j_average_commit_time = commit_time;
-	spin_unlock(&journal->j_state_lock);
+	write_unlock(&journal->j_state_lock);
 
 	if (commit_transaction->t_checkpoint_list == NULL &&
 	    commit_transaction->t_checkpoint_io_list == NULL) {
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 036880895bf..c590d155c09 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -41,15 +41,16 @@
 #include <linux/hash.h>
 #include <linux/log2.h>
 #include <linux/vmalloc.h>
+#include <linux/backing-dev.h>
+#include <linux/bitops.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/jbd2.h>
 
 #include <asm/uaccess.h>
 #include <asm/page.h>
+#include <asm/system.h>
 
-EXPORT_SYMBOL(jbd2_journal_start);
-EXPORT_SYMBOL(jbd2_journal_restart);
 EXPORT_SYMBOL(jbd2_journal_extend);
 EXPORT_SYMBOL(jbd2_journal_stop);
 EXPORT_SYMBOL(jbd2_journal_lock_updates);
@@ -143,7 +144,7 @@ static int kjournald2(void *arg)
 	/*
 	 * And now, wait forever for commit wakeup events.
 	 */
-	spin_lock(&journal->j_state_lock);
+	write_lock(&journal->j_state_lock);
 
 loop:
 	if (journal->j_flags & JBD2_UNMOUNT)
@@ -154,10 +155,10 @@ loop:
 
 	if (journal->j_commit_sequence != journal->j_commit_request) {
 		jbd_debug(1, "OK, requests differ\n");
-		spin_unlock(&journal->j_state_lock);
+		write_unlock(&journal->j_state_lock);
 		del_timer_sync(&journal->j_commit_timer);
 		jbd2_journal_commit_transaction(journal);
-		spin_lock(&journal->j_state_lock);
+		write_lock(&journal->j_state_lock);
 		goto loop;
 	}
 
@@ -169,9 +170,9 @@ loop:
 		 * be already stopped.
 		 */
 		jbd_debug(1, "Now suspending kjournald2\n");
-		spin_unlock(&journal->j_state_lock);
+		write_unlock(&journal->j_state_lock);
 		refrigerator();
-		spin_lock(&journal->j_state_lock);
+		write_lock(&journal->j_state_lock);
 	} else {
 		/*
 		 * We assume on resume that commits are already there,
@@ -191,9 +192,9 @@ loop:
 		if (journal->j_flags & JBD2_UNMOUNT)
 			should_sleep = 0;
 		if (should_sleep) {
-			spin_unlock(&journal->j_state_lock);
+			write_unlock(&journal->j_state_lock);
 			schedule();
-			spin_lock(&journal->j_state_lock);
+			write_lock(&journal->j_state_lock);
 		}
 		finish_wait(&journal->j_wait_commit, &wait);
 	}
@@ -211,7 +212,7 @@ loop:
 	goto loop;
 
 end_loop:
-	spin_unlock(&journal->j_state_lock);
+	write_unlock(&journal->j_state_lock);
 	del_timer_sync(&journal->j_commit_timer);
 	journal->j_task = NULL;
 	wake_up(&journal->j_wait_done_commit);
@@ -234,16 +235,16 @@ static int jbd2_journal_start_thread(journal_t *journal)
 
 static void journal_kill_thread(journal_t *journal)
 {
-	spin_lock(&journal->j_state_lock);
+	write_lock(&journal->j_state_lock);
 	journal->j_flags |= JBD2_UNMOUNT;
 
 	while (journal->j_task) {
 		wake_up(&journal->j_wait_commit);
-		spin_unlock(&journal->j_state_lock);
+		write_unlock(&journal->j_state_lock);
 		wait_event(journal->j_wait_done_commit, journal->j_task == NULL);
-		spin_lock(&journal->j_state_lock);
+		write_lock(&journal->j_state_lock);
 	}
-	spin_unlock(&journal->j_state_lock);
+	write_unlock(&journal->j_state_lock);
 }
 
 /*
@@ -310,7 +311,17 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
 	 */
 	J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));
 
-	new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL);
+retry_alloc:
+	new_bh = alloc_buffer_head(GFP_NOFS);
+	if (!new_bh) {
+		/*
+		 * Failure is not an option, but __GFP_NOFAIL is going
+		 * away; so we retry ourselves here.
+		 */
+		congestion_wait(BLK_RW_ASYNC, HZ/50);
+		goto retry_alloc;
+	}
+
 	/* keep subsequent assertions sane */
 	new_bh->b_state = 0;
 	init_buffer(new_bh, NULL, NULL);
@@ -442,7 +453,7 @@ int __jbd2_log_space_left(journal_t *journal)
 {
 	int left = journal->j_free;
 
-	assert_spin_locked(&journal->j_state_lock);
+	/* assert_spin_locked(&journal->j_state_lock); */
 
 	/*
 	 * Be pessimistic here about the number of those free blocks which
@@ -469,7 +480,7 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t target)
 	 */
 	if (!tid_geq(journal->j_commit_request, target)) {
 		/*
-		 * We want a new commit: OK, mark the request and wakup the
+		 * We want a new commit: OK, mark the request and wakeup the
 		 * commit thread.  We do _not_ do the commit ourselves.
 		 */
 
@@ -487,9 +498,9 @@ int jbd2_log_start_commit(journal_t *journal, tid_t tid)
 {
 	int ret;
 
-	spin_lock(&journal->j_state_lock);
+	write_lock(&journal->j_state_lock);
 	ret = __jbd2_log_start_commit(journal, tid);
-	spin_unlock(&journal->j_state_lock);
+	write_unlock(&journal->j_state_lock);
 	return ret;
 }
 
@@ -508,7 +519,7 @@ int jbd2_journal_force_commit_nested(journal_t *journal)
 	transaction_t *transaction = NULL;
 	tid_t tid;
 
-	spin_lock(&journal->j_state_lock);
+	read_lock(&journal->j_state_lock);
 	if (journal->j_running_transaction && !current->journal_info) {
 		transaction = journal->j_running_transaction;
 		__jbd2_log_start_commit(journal, transaction->t_tid);
@@ -516,12 +527,12 @@ int jbd2_journal_force_commit_nested(journal_t *journal)
 		transaction = journal->j_committing_transaction;
 
 	if (!transaction) {
-		spin_unlock(&journal->j_state_lock);
+		read_unlock(&journal->j_state_lock);
 		return 0;	/* Nothing to retry */
 	}
 
 	tid = transaction->t_tid;
-	spin_unlock(&journal->j_state_lock);
+	read_unlock(&journal->j_state_lock);
 	jbd2_log_wait_commit(journal, tid);
 	return 1;
 }
@@ -535,7 +546,7 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
 {
 	int ret = 0;
 
-	spin_lock(&journal->j_state_lock);
+	write_lock(&journal->j_state_lock);
 	if (journal->j_running_transaction) {
 		tid_t tid = journal->j_running_transaction->t_tid;
 
@@ -554,7 +565,7 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
 			*ptid = journal->j_committing_transaction->t_tid;
 		ret = 1;
 	}
-	spin_unlock(&journal->j_state_lock);
+	write_unlock(&journal->j_state_lock);
 	return ret;
 }
 
@@ -566,26 +577,24 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
 {
 	int err = 0;
 
+	read_lock(&journal->j_state_lock);
 #ifdef CONFIG_JBD2_DEBUG
-	spin_lock(&journal->j_state_lock);
 	if (!tid_geq(journal->j_commit_request, tid)) {
 		printk(KERN_EMERG
 		       "%s: error: j_commit_request=%d, tid=%d\n",
 		       __func__, journal->j_commit_request, tid);
 	}
-	spin_unlock(&journal->j_state_lock);
 #endif
-	spin_lock(&journal->j_state_lock);
 	while (tid_gt(tid, journal->j_commit_sequence)) {
 		jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n",
 				  tid, journal->j_commit_sequence);
 		wake_up(&journal->j_wait_commit);
-		spin_unlock(&journal->j_state_lock);
+		read_unlock(&journal->j_state_lock);
 		wait_event(journal->j_wait_done_commit,
 				!tid_gt(tid, journal->j_commit_sequence));
-		spin_lock(&journal->j_state_lock);
+		read_lock(&journal->j_state_lock);
 	}
-	spin_unlock(&journal->j_state_lock);
+	read_unlock(&journal->j_state_lock);
 
 	if (unlikely(is_journal_aborted(journal))) {
 		printk(KERN_EMERG "journal commit I/O error\n");
@@ -602,7 +611,7 @@ int jbd2_journal_next_log_block(journal_t *journal, unsigned long long *retp)
 {
 	unsigned long blocknr;
 
-	spin_lock(&journal->j_state_lock);
+	write_lock(&journal->j_state_lock);
 	J_ASSERT(journal->j_free > 1);
 
 	blocknr = journal->j_head;
@@ -610,7 +619,7 @@ int jbd2_journal_next_log_block(journal_t *journal, unsigned long long *retp)
 	journal->j_free--;
 	if (journal->j_head == journal->j_last)
 		journal->j_head = journal->j_first;
-	spin_unlock(&journal->j_state_lock);
+	write_unlock(&journal->j_state_lock);
 	return jbd2_journal_bmap(journal, blocknr, retp);
 }
 
@@ -830,7 +839,7 @@ static journal_t * journal_init_common (void)
 	mutex_init(&journal->j_checkpoint_mutex);
 	spin_lock_init(&journal->j_revoke_lock);
 	spin_lock_init(&journal->j_list_lock);
-	spin_lock_init(&journal->j_state_lock);
+	rwlock_init(&journal->j_state_lock);
 
 	journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE);
 	journal->j_min_batch_time = 0;
@@ -1096,14 +1105,14 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait)
 		set_buffer_uptodate(bh);
 	}
 
-	spin_lock(&journal->j_state_lock);
+	read_lock(&journal->j_state_lock);
 	jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n",
 		  journal->j_tail, journal->j_tail_sequence, journal->j_errno);
 
 	sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
 	sb->s_start    = cpu_to_be32(journal->j_tail);
 	sb->s_errno    = cpu_to_be32(journal->j_errno);
-	spin_unlock(&journal->j_state_lock);
+	read_unlock(&journal->j_state_lock);
 
 	BUFFER_TRACE(bh, "marking dirty");
 	mark_buffer_dirty(bh);
@@ -1117,19 +1126,19 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait)
 			set_buffer_uptodate(bh);
 		}
 	} else
-		ll_rw_block(SWRITE, 1, &bh);
+		write_dirty_buffer(bh, WRITE);
 
 out:
 	/* If we have just flushed the log (by marking s_start==0), then
 	 * any future commit will have to be careful to update the
 	 * superblock again to re-record the true start of the log. */
 
-	spin_lock(&journal->j_state_lock);
+	write_lock(&journal->j_state_lock);
 	if (sb->s_start)
 		journal->j_flags &= ~JBD2_FLUSHED;
 	else
 		journal->j_flags |= JBD2_FLUSHED;
-	spin_unlock(&journal->j_state_lock);
+	write_unlock(&journal->j_state_lock);
 }
 
 /*
@@ -1364,6 +1373,10 @@ int jbd2_journal_check_used_features (journal_t *journal, unsigned long compat,
 
 	if (!compat && !ro && !incompat)
 		return 1;
+	/* Load journal superblock if it is not loaded yet. */
+	if (journal->j_format_version == 0 &&
+	    journal_get_superblock(journal) != 0)
+		return 0;
 	if (journal->j_format_version == 1)
 		return 0;
 
@@ -1391,13 +1404,9 @@ int jbd2_journal_check_used_features (journal_t *journal, unsigned long compat,
 int jbd2_journal_check_available_features (journal_t *journal, unsigned long compat,
 				      unsigned long ro, unsigned long incompat)
 {
-	journal_superblock_t *sb;
-
 	if (!compat && !ro && !incompat)
 		return 1;
 
-	sb = journal->j_superblock;
-
 	/* We can support any known requested features iff the
 	 * superblock is in version 2.  Otherwise we fail to support any
 	 * extended sb features. */
@@ -1545,7 +1554,7 @@ int jbd2_journal_flush(journal_t *journal)
 	transaction_t *transaction = NULL;
 	unsigned long old_tail;
 
-	spin_lock(&journal->j_state_lock);
+	write_lock(&journal->j_state_lock);
 
 	/* Force everything buffered to the log... */
 	if (journal->j_running_transaction) {
@@ -1558,10 +1567,10 @@ int jbd2_journal_flush(journal_t *journal)
 	if (transaction) {
 		tid_t tid = transaction->t_tid;
 
-		spin_unlock(&journal->j_state_lock);
+		write_unlock(&journal->j_state_lock);
 		jbd2_log_wait_commit(journal, tid);
 	} else {
-		spin_unlock(&journal->j_state_lock);
+		write_unlock(&journal->j_state_lock);
 	}
 
 	/* ...and flush everything in the log out to disk. */
@@ -1585,12 +1594,12 @@ int jbd2_journal_flush(journal_t *journal)
 	 * the magic code for a fully-recovered superblock.  Any future
 	 * commits of data to the journal will restore the current
 	 * s_start value. */
-	spin_lock(&journal->j_state_lock);
+	write_lock(&journal->j_state_lock);
 	old_tail = journal->j_tail;
 	journal->j_tail = 0;
-	spin_unlock(&journal->j_state_lock);
+	write_unlock(&journal->j_state_lock);
 	jbd2_journal_update_superblock(journal, 1);
-	spin_lock(&journal->j_state_lock);
+	write_lock(&journal->j_state_lock);
 	journal->j_tail = old_tail;
 
 	J_ASSERT(!journal->j_running_transaction);
@@ -1598,7 +1607,7 @@ int jbd2_journal_flush(journal_t *journal)
 	J_ASSERT(!journal->j_checkpoint_transactions);
 	J_ASSERT(journal->j_head == journal->j_tail);
 	J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence);
-	spin_unlock(&journal->j_state_lock);
+	write_unlock(&journal->j_state_lock);
 	return 0;
 }
 
@@ -1617,7 +1626,6 @@ int jbd2_journal_flush(journal_t *journal)
 
 int jbd2_journal_wipe(journal_t *journal, int write)
 {
-	journal_superblock_t *sb;
 	int err = 0;
 
 	J_ASSERT (!(journal->j_flags & JBD2_LOADED));
@@ -1626,8 +1634,6 @@ int jbd2_journal_wipe(journal_t *journal, int write)
 	if (err)
 		return err;
 
-	sb = journal->j_superblock;
-
 	if (!journal->j_tail)
 		goto no_recovery;
 
@@ -1665,12 +1671,12 @@ void __jbd2_journal_abort_hard(journal_t *journal)
 	printk(KERN_ERR "Aborting journal on device %s.\n",
 	       journal->j_devname);
 
-	spin_lock(&journal->j_state_lock);
+	write_lock(&journal->j_state_lock);
 	journal->j_flags |= JBD2_ABORT;
 	transaction = journal->j_running_transaction;
 	if (transaction)
 		__jbd2_log_start_commit(journal, transaction->t_tid);
-	spin_unlock(&journal->j_state_lock);
+	write_unlock(&journal->j_state_lock);
 }
 
 /* Soft abort: record the abort error status in the journal superblock,
@@ -1755,12 +1761,12 @@ int jbd2_journal_errno(journal_t *journal)
 {
 	int err;
 
-	spin_lock(&journal->j_state_lock);
+	read_lock(&journal->j_state_lock);
 	if (journal->j_flags & JBD2_ABORT)
 		err = -EROFS;
 	else
 		err = journal->j_errno;
-	spin_unlock(&journal->j_state_lock);
+	read_unlock(&journal->j_state_lock);
 	return err;
 }
 
@@ -1775,12 +1781,12 @@ int jbd2_journal_clear_err(journal_t *journal)
 {
 	int err = 0;
 
-	spin_lock(&journal->j_state_lock);
+	write_lock(&journal->j_state_lock);
 	if (journal->j_flags & JBD2_ABORT)
 		err = -EROFS;
 	else
 		journal->j_errno = 0;
-	spin_unlock(&journal->j_state_lock);
+	write_unlock(&journal->j_state_lock);
 	return err;
 }
 
@@ -1793,10 +1799,10 @@ int jbd2_journal_clear_err(journal_t *journal)
  */
 void jbd2_journal_ack_err(journal_t *journal)
 {
-	spin_lock(&journal->j_state_lock);
+	write_lock(&journal->j_state_lock);
 	if (journal->j_errno)
 		journal->j_flags |= JBD2_ACK_ERR;
-	spin_unlock(&journal->j_state_lock);
+	write_unlock(&journal->j_state_lock);
 }
 
 int jbd2_journal_blocks_per_page(struct inode *inode)
@@ -1832,7 +1838,6 @@ size_t journal_tag_bytes(journal_t *journal)
  */
 #define JBD2_MAX_SLABS 8
 static struct kmem_cache *jbd2_slab[JBD2_MAX_SLABS];
-static DECLARE_MUTEX(jbd2_slab_create_sem);
 
 static const char *jbd2_slab_names[JBD2_MAX_SLABS] = {
 	"jbd2_1k", "jbd2_2k", "jbd2_4k", "jbd2_8k",
@@ -1853,6 +1858,7 @@ static void jbd2_journal_destroy_slabs(void)
 
 static int jbd2_journal_create_slab(size_t size)
 {
+	static DEFINE_MUTEX(jbd2_slab_create_mutex);
 	int i = order_base_2(size) - 10;
 	size_t slab_size;
 
@@ -1864,16 +1870,16 @@ static int jbd2_journal_create_slab(size_t size)
 
 	if (unlikely(i < 0))
 		i = 0;
-	down(&jbd2_slab_create_sem);
+	mutex_lock(&jbd2_slab_create_mutex);
 	if (jbd2_slab[i]) {
-		up(&jbd2_slab_create_sem);
+		mutex_unlock(&jbd2_slab_create_mutex);
 		return 0;	/* Already created */
 	}
 
 	slab_size = 1 << (i+10);
 	jbd2_slab[i] = kmem_cache_create(jbd2_slab_names[i], slab_size,
 					 slab_size, 0, NULL);
-	up(&jbd2_slab_create_sem);
+	mutex_unlock(&jbd2_slab_create_mutex);
 	if (!jbd2_slab[i]) {
 		printk(KERN_EMERG "JBD2: no memory for jbd2_slab cache\n");
 		return -ENOMEM;
@@ -2201,14 +2207,12 @@ void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode)
 void jbd2_journal_release_jbd_inode(journal_t *journal,
 				    struct jbd2_inode *jinode)
 {
-	int writeout = 0;
-
 	if (!journal)
 		return;
 restart:
 	spin_lock(&journal->j_list_lock);
 	/* Is commit writing out inode - we have to wait */
-	if (jinode->i_flags & JI_COMMIT_RUNNING) {
+	if (test_bit(__JI_COMMIT_RUNNING, &jinode->i_flags)) {
 		wait_queue_head_t *wq;
 		DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
 		wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING);
@@ -2219,9 +2223,6 @@ restart:
 		goto restart;
 	}
 
-	/* Do we need to wait for data writeback? */
-	if (journal->j_committing_transaction == jinode->i_transaction)
-		writeout = 1;
 	if (jinode->i_transaction) {
 		list_del(&jinode->i_list);
 		jinode->i_transaction = NULL;
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 049281b7cb8..2bc4d5f116f 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -285,12 +285,10 @@ int jbd2_journal_recover(journal_t *journal)
 int jbd2_journal_skip_recovery(journal_t *journal)
 {
 	int			err;
-	journal_superblock_t *	sb;
 
 	struct recovery_info	info;
 
 	memset (&info, 0, sizeof(info));
-	sb = journal->j_superblock;
 
 	err = do_one_pass(journal, &info, PASS_SCAN);
 
@@ -299,7 +297,8 @@ int jbd2_journal_skip_recovery(journal_t *journal)
 		++journal->j_transaction_sequence;
 	} else {
 #ifdef CONFIG_JBD2_DEBUG
-		int dropped = info.end_transaction - be32_to_cpu(sb->s_sequence);
+		int dropped = info.end_transaction - 
+			be32_to_cpu(journal->j_superblock->s_sequence);
 #endif
 		jbd_debug(1,
 			  "JBD: ignoring %d transaction%s from the journal.\n",
@@ -365,11 +364,6 @@ static int do_one_pass(journal_t *journal,
 	int			tag_bytes = journal_tag_bytes(journal);
 	__u32			crc32_sum = ~0; /* Transactional Checksums */
 
-	/* Precompute the maximum metadata descriptors in a descriptor block */
-	int			MAX_BLOCKS_PER_DESC;
-	MAX_BLOCKS_PER_DESC = ((journal->j_blocksize-sizeof(journal_header_t))
-			       / tag_bytes);
-
 	/*
 	 * First thing is to establish what we expect to find in the log
 	 * (in terms of transaction IDs), and where (in terms of log
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index a360b06af2e..9ad321fd63f 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -625,7 +625,7 @@ static void flush_descriptor(journal_t *journal,
 	set_buffer_jwrite(bh);
 	BUFFER_TRACE(bh, "write");
 	set_buffer_dirty(bh);
-	ll_rw_block((write_op == WRITE) ? SWRITE : SWRITE_SYNC_PLUG, 1, &bh);
+	write_dirty_buffer(bh, write_op);
 }
 #endif
 
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index b8e0806681b..6bf0a242613 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -26,6 +26,8 @@
 #include <linux/mm.h>
 #include <linux/highmem.h>
 #include <linux/hrtimer.h>
+#include <linux/backing-dev.h>
+#include <linux/module.h>
 
 static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
 
@@ -53,6 +55,9 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
 	transaction->t_tid = journal->j_transaction_sequence++;
 	transaction->t_expires = jiffies + journal->j_commit_interval;
 	spin_lock_init(&transaction->t_handle_lock);
+	atomic_set(&transaction->t_updates, 0);
+	atomic_set(&transaction->t_outstanding_credits, 0);
+	atomic_set(&transaction->t_handle_count, 0);
 	INIT_LIST_HEAD(&transaction->t_inode_list);
 	INIT_LIST_HEAD(&transaction->t_private_list);
 
@@ -77,71 +82,107 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
  */
 
 /*
+ * Update transiaction's maximum wait time, if debugging is enabled.
+ *
+ * In order for t_max_wait to be reliable, it must be protected by a
+ * lock.  But doing so will mean that start_this_handle() can not be
+ * run in parallel on SMP systems, which limits our scalability.  So
+ * unless debugging is enabled, we no longer update t_max_wait, which
+ * means that maximum wait time reported by the jbd2_run_stats
+ * tracepoint will always be zero.
+ */
+static inline void update_t_max_wait(transaction_t *transaction)
+{
+#ifdef CONFIG_JBD2_DEBUG
+	unsigned long ts = jiffies;
+
+	if (jbd2_journal_enable_debug &&
+	    time_after(transaction->t_start, ts)) {
+		ts = jbd2_time_diff(ts, transaction->t_start);
+		spin_lock(&transaction->t_handle_lock);
+		if (ts > transaction->t_max_wait)
+			transaction->t_max_wait = ts;
+		spin_unlock(&transaction->t_handle_lock);
+	}
+#endif
+}
+
+/*
  * start_this_handle: Given a handle, deal with any locking or stalling
  * needed to make sure that there is enough journal space for the handle
  * to begin.  Attach the handle to a transaction and set up the
  * transaction's buffer credits.
  */
 
-static int start_this_handle(journal_t *journal, handle_t *handle)
+static int start_this_handle(journal_t *journal, handle_t *handle,
+			     int gfp_mask)
 {
 	transaction_t *transaction;
 	int needed;
 	int nblocks = handle->h_buffer_credits;
 	transaction_t *new_transaction = NULL;
-	int ret = 0;
-	unsigned long ts = jiffies;
 
 	if (nblocks > journal->j_max_transaction_buffers) {
 		printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n",
 		       current->comm, nblocks,
 		       journal->j_max_transaction_buffers);
-		ret = -ENOSPC;
-		goto out;
+		return -ENOSPC;
 	}
 
 alloc_transaction:
 	if (!journal->j_running_transaction) {
-		new_transaction = kzalloc(sizeof(*new_transaction),
-						GFP_NOFS|__GFP_NOFAIL);
+		new_transaction = kzalloc(sizeof(*new_transaction), gfp_mask);
 		if (!new_transaction) {
-			ret = -ENOMEM;
-			goto out;
+			/*
+			 * If __GFP_FS is not present, then we may be
+			 * being called from inside the fs writeback
+			 * layer, so we MUST NOT fail.  Since
+			 * __GFP_NOFAIL is going away, we will arrange
+			 * to retry the allocation ourselves.
+			 */
+			if ((gfp_mask & __GFP_FS) == 0) {
+				congestion_wait(BLK_RW_ASYNC, HZ/50);
+				goto alloc_transaction;
+			}
+			return -ENOMEM;
 		}
 	}
 
 	jbd_debug(3, "New handle %p going live.\n", handle);
 
-repeat:
-
 	/*
 	 * We need to hold j_state_lock until t_updates has been incremented,
 	 * for proper journal barrier handling
 	 */
-	spin_lock(&journal->j_state_lock);
-repeat_locked:
+repeat:
+	read_lock(&journal->j_state_lock);
+	BUG_ON(journal->j_flags & JBD2_UNMOUNT);
 	if (is_journal_aborted(journal) ||
 	    (journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) {
-		spin_unlock(&journal->j_state_lock);
-		ret = -EROFS;
-		goto out;
+		read_unlock(&journal->j_state_lock);
+		kfree(new_transaction);
+		return -EROFS;
 	}
 
 	/* Wait on the journal's transaction barrier if necessary */
 	if (journal->j_barrier_count) {
-		spin_unlock(&journal->j_state_lock);
+		read_unlock(&journal->j_state_lock);
 		wait_event(journal->j_wait_transaction_locked,
 				journal->j_barrier_count == 0);
 		goto repeat;
 	}
 
 	if (!journal->j_running_transaction) {
-		if (!new_transaction) {
-			spin_unlock(&journal->j_state_lock);
+		read_unlock(&journal->j_state_lock);
+		if (!new_transaction)
 			goto alloc_transaction;
+		write_lock(&journal->j_state_lock);
+		if (!journal->j_running_transaction) {
+			jbd2_get_transaction(journal, new_transaction);
+			new_transaction = NULL;
 		}
-		jbd2_get_transaction(journal, new_transaction);
-		new_transaction = NULL;
+		write_unlock(&journal->j_state_lock);
+		goto repeat;
 	}
 
 	transaction = journal->j_running_transaction;
@@ -155,7 +196,7 @@ repeat_locked:
 
 		prepare_to_wait(&journal->j_wait_transaction_locked,
 					&wait, TASK_UNINTERRUPTIBLE);
-		spin_unlock(&journal->j_state_lock);
+		read_unlock(&journal->j_state_lock);
 		schedule();
 		finish_wait(&journal->j_wait_transaction_locked, &wait);
 		goto repeat;
@@ -166,8 +207,8 @@ repeat_locked:
 	 * buffers requested by this operation, we need to stall pending a log
 	 * checkpoint to free some more log space.
 	 */
-	spin_lock(&transaction->t_handle_lock);
-	needed = transaction->t_outstanding_credits + nblocks;
+	needed = atomic_add_return(nblocks,
+				   &transaction->t_outstanding_credits);
 
 	if (needed > journal->j_max_transaction_buffers) {
 		/*
@@ -178,11 +219,11 @@ repeat_locked:
 		DEFINE_WAIT(wait);
 
 		jbd_debug(2, "Handle %p starting new commit...\n", handle);
-		spin_unlock(&transaction->t_handle_lock);
+		atomic_sub(nblocks, &transaction->t_outstanding_credits);
 		prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
 				TASK_UNINTERRUPTIBLE);
 		__jbd2_log_start_commit(journal, transaction->t_tid);
-		spin_unlock(&journal->j_state_lock);
+		read_unlock(&journal->j_state_lock);
 		schedule();
 		finish_wait(&journal->j_wait_transaction_locked, &wait);
 		goto repeat;
@@ -215,35 +256,31 @@ repeat_locked:
 	 */
 	if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) {
 		jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle);
-		spin_unlock(&transaction->t_handle_lock);
-		__jbd2_log_wait_for_space(journal);
-		goto repeat_locked;
+		atomic_sub(nblocks, &transaction->t_outstanding_credits);
+		read_unlock(&journal->j_state_lock);
+		write_lock(&journal->j_state_lock);
+		if (__jbd2_log_space_left(journal) < jbd_space_needed(journal))
+			__jbd2_log_wait_for_space(journal);
+		write_unlock(&journal->j_state_lock);
+		goto repeat;
 	}
 
 	/* OK, account for the buffers that this operation expects to
-	 * use and add the handle to the running transaction. */
-
-	if (time_after(transaction->t_start, ts)) {
-		ts = jbd2_time_diff(ts, transaction->t_start);
-		if (ts > transaction->t_max_wait)
-			transaction->t_max_wait = ts;
-	}
-
+	 * use and add the handle to the running transaction. 
+	 */
+	update_t_max_wait(transaction);
 	handle->h_transaction = transaction;
-	transaction->t_outstanding_credits += nblocks;
-	transaction->t_updates++;
-	transaction->t_handle_count++;
+	atomic_inc(&transaction->t_updates);
+	atomic_inc(&transaction->t_handle_count);
 	jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n",
-		  handle, nblocks, transaction->t_outstanding_credits,
+		  handle, nblocks,
+		  atomic_read(&transaction->t_outstanding_credits),
 		  __jbd2_log_space_left(journal));
-	spin_unlock(&transaction->t_handle_lock);
-	spin_unlock(&journal->j_state_lock);
+	read_unlock(&journal->j_state_lock);
 
 	lock_map_acquire(&handle->h_lockdep_map);
-out:
-	if (unlikely(new_transaction))		/* It's usually NULL */
-		kfree(new_transaction);
-	return ret;
+	kfree(new_transaction);
+	return 0;
 }
 
 static struct lock_class_key jbd2_handle_key;
@@ -278,7 +315,7 @@ static handle_t *new_handle(int nblocks)
  *
  * Return a pointer to a newly allocated handle, or NULL on failure
  */
-handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
+handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int gfp_mask)
 {
 	handle_t *handle = journal_current_handle();
 	int err;
@@ -298,7 +335,7 @@ handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
 
 	current->journal_info = handle;
 
-	err = start_this_handle(journal, handle);
+	err = start_this_handle(journal, handle, gfp_mask);
 	if (err < 0) {
 		jbd2_free_handle(handle);
 		current->journal_info = NULL;
@@ -308,6 +345,15 @@ handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
 out:
 	return handle;
 }
+EXPORT_SYMBOL(jbd2__journal_start);
+
+
+handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
+{
+	return jbd2__journal_start(journal, nblocks, GFP_NOFS);
+}
+EXPORT_SYMBOL(jbd2_journal_start);
+
 
 /**
  * int jbd2_journal_extend() - extend buffer credits.
@@ -342,7 +388,7 @@ int jbd2_journal_extend(handle_t *handle, int nblocks)
 
 	result = 1;
 
-	spin_lock(&journal->j_state_lock);
+	read_lock(&journal->j_state_lock);
 
 	/* Don't extend a locked-down transaction! */
 	if (handle->h_transaction->t_state != T_RUNNING) {
@@ -352,7 +398,7 @@ int jbd2_journal_extend(handle_t *handle, int nblocks)
 	}
 
 	spin_lock(&transaction->t_handle_lock);
-	wanted = transaction->t_outstanding_credits + nblocks;
+	wanted = atomic_read(&transaction->t_outstanding_credits) + nblocks;
 
 	if (wanted > journal->j_max_transaction_buffers) {
 		jbd_debug(3, "denied handle %p %d blocks: "
@@ -367,14 +413,14 @@ int jbd2_journal_extend(handle_t *handle, int nblocks)
 	}
 
 	handle->h_buffer_credits += nblocks;
-	transaction->t_outstanding_credits += nblocks;
+	atomic_add(nblocks, &transaction->t_outstanding_credits);
 	result = 0;
 
 	jbd_debug(3, "extended handle %p by %d\n", handle, nblocks);
 unlock:
 	spin_unlock(&transaction->t_handle_lock);
 error_out:
-	spin_unlock(&journal->j_state_lock);
+	read_unlock(&journal->j_state_lock);
 out:
 	return result;
 }
@@ -394,8 +440,7 @@ out:
  * transaction capabable of guaranteeing the requested number of
  * credits.
  */
-
-int jbd2_journal_restart(handle_t *handle, int nblocks)
+int jbd2__journal_restart(handle_t *handle, int nblocks, int gfp_mask)
 {
 	transaction_t *transaction = handle->h_transaction;
 	journal_t *journal = transaction->t_journal;
@@ -410,29 +455,35 @@ int jbd2_journal_restart(handle_t *handle, int nblocks)
 	 * First unlink the handle from its current transaction, and start the
 	 * commit on that.
 	 */
-	J_ASSERT(transaction->t_updates > 0);
+	J_ASSERT(atomic_read(&transaction->t_updates) > 0);
 	J_ASSERT(journal_current_handle() == handle);
 
-	spin_lock(&journal->j_state_lock);
+	read_lock(&journal->j_state_lock);
 	spin_lock(&transaction->t_handle_lock);
-	transaction->t_outstanding_credits -= handle->h_buffer_credits;
-	transaction->t_updates--;
-
-	if (!transaction->t_updates)
+	atomic_sub(handle->h_buffer_credits,
+		   &transaction->t_outstanding_credits);
+	if (atomic_dec_and_test(&transaction->t_updates))
 		wake_up(&journal->j_wait_updates);
 	spin_unlock(&transaction->t_handle_lock);
 
 	jbd_debug(2, "restarting handle %p\n", handle);
 	__jbd2_log_start_commit(journal, transaction->t_tid);
-	spin_unlock(&journal->j_state_lock);
+	read_unlock(&journal->j_state_lock);
 
 	lock_map_release(&handle->h_lockdep_map);
 	handle->h_buffer_credits = nblocks;
-	ret = start_this_handle(journal, handle);
+	ret = start_this_handle(journal, handle, gfp_mask);
 	return ret;
 }
+EXPORT_SYMBOL(jbd2__journal_restart);
 
 
+int jbd2_journal_restart(handle_t *handle, int nblocks)
+{
+	return jbd2__journal_restart(handle, nblocks, GFP_NOFS);
+}
+EXPORT_SYMBOL(jbd2_journal_restart);
+
 /**
  * void jbd2_journal_lock_updates () - establish a transaction barrier.
  * @journal:  Journal to establish a barrier on.
@@ -447,7 +498,7 @@ void jbd2_journal_lock_updates(journal_t *journal)
 {
 	DEFINE_WAIT(wait);
 
-	spin_lock(&journal->j_state_lock);
+	write_lock(&journal->j_state_lock);
 	++journal->j_barrier_count;
 
 	/* Wait until there are no running updates */
@@ -458,19 +509,19 @@ void jbd2_journal_lock_updates(journal_t *journal)
 			break;
 
 		spin_lock(&transaction->t_handle_lock);
-		if (!transaction->t_updates) {
+		if (!atomic_read(&transaction->t_updates)) {
 			spin_unlock(&transaction->t_handle_lock);
 			break;
 		}
 		prepare_to_wait(&journal->j_wait_updates, &wait,
 				TASK_UNINTERRUPTIBLE);
 		spin_unlock(&transaction->t_handle_lock);
-		spin_unlock(&journal->j_state_lock);
+		write_unlock(&journal->j_state_lock);
 		schedule();
 		finish_wait(&journal->j_wait_updates, &wait);
-		spin_lock(&journal->j_state_lock);
+		write_lock(&journal->j_state_lock);
 	}
-	spin_unlock(&journal->j_state_lock);
+	write_unlock(&journal->j_state_lock);
 
 	/*
 	 * We have now established a barrier against other normal updates, but
@@ -494,9 +545,9 @@ void jbd2_journal_unlock_updates (journal_t *journal)
 	J_ASSERT(journal->j_barrier_count != 0);
 
 	mutex_unlock(&journal->j_barrier);
-	spin_lock(&journal->j_state_lock);
+	write_lock(&journal->j_state_lock);
 	--journal->j_barrier_count;
-	spin_unlock(&journal->j_state_lock);
+	write_unlock(&journal->j_state_lock);
 	wake_up(&journal->j_wait_transaction_locked);
 }
 
@@ -1238,7 +1289,8 @@ int jbd2_journal_stop(handle_t *handle)
 {
 	transaction_t *transaction = handle->h_transaction;
 	journal_t *journal = transaction->t_journal;
-	int err;
+	int err, wait_for_commit = 0;
+	tid_t tid;
 	pid_t pid;
 
 	J_ASSERT(journal_current_handle() == handle);
@@ -1246,7 +1298,7 @@ int jbd2_journal_stop(handle_t *handle)
 	if (is_handle_aborted(handle))
 		err = -EIO;
 	else {
-		J_ASSERT(transaction->t_updates > 0);
+		J_ASSERT(atomic_read(&transaction->t_updates) > 0);
 		err = 0;
 	}
 
@@ -1291,9 +1343,9 @@ int jbd2_journal_stop(handle_t *handle)
 
 		journal->j_last_sync_writer = pid;
 
-		spin_lock(&journal->j_state_lock);
+		read_lock(&journal->j_state_lock);
 		commit_time = journal->j_average_commit_time;
-		spin_unlock(&journal->j_state_lock);
+		read_unlock(&journal->j_state_lock);
 
 		trans_time = ktime_to_ns(ktime_sub(ktime_get(),
 						   transaction->t_start_time));
@@ -1314,14 +1366,8 @@ int jbd2_journal_stop(handle_t *handle)
 	if (handle->h_sync)
 		transaction->t_synchronous_commit = 1;
 	current->journal_info = NULL;
-	spin_lock(&transaction->t_handle_lock);
-	transaction->t_outstanding_credits -= handle->h_buffer_credits;
-	transaction->t_updates--;
-	if (!transaction->t_updates) {
-		wake_up(&journal->j_wait_updates);
-		if (journal->j_barrier_count)
-			wake_up(&journal->j_wait_transaction_locked);
-	}
+	atomic_sub(handle->h_buffer_credits,
+		   &transaction->t_outstanding_credits);
 
 	/*
 	 * If the handle is marked SYNC, we need to set another commit
@@ -1330,15 +1376,13 @@ int jbd2_journal_stop(handle_t *handle)
 	 * transaction is too old now.
 	 */
 	if (handle->h_sync ||
-			transaction->t_outstanding_credits >
-				journal->j_max_transaction_buffers ||
-			time_after_eq(jiffies, transaction->t_expires)) {
+	    (atomic_read(&transaction->t_outstanding_credits) >
+	     journal->j_max_transaction_buffers) ||
+	    time_after_eq(jiffies, transaction->t_expires)) {
 		/* Do this even for aborted journals: an abort still
 		 * completes the commit thread, it just doesn't write
 		 * anything to disk. */
-		tid_t tid = transaction->t_tid;
 
-		spin_unlock(&transaction->t_handle_lock);
 		jbd_debug(2, "transaction too old, requesting commit for "
 					"handle %p\n", handle);
 		/* This is non-blocking */
@@ -1349,11 +1393,25 @@ int jbd2_journal_stop(handle_t *handle)
 		 * to wait for the commit to complete.
 		 */
 		if (handle->h_sync && !(current->flags & PF_MEMALLOC))
-			err = jbd2_log_wait_commit(journal, tid);
-	} else {
-		spin_unlock(&transaction->t_handle_lock);
+			wait_for_commit = 1;
 	}
 
+	/*
+	 * Once we drop t_updates, if it goes to zero the transaction
+	 * could start commiting on us and eventually disappear.  So
+	 * once we do this, we must not dereference transaction
+	 * pointer again.
+	 */
+	tid = transaction->t_tid;
+	if (atomic_dec_and_test(&transaction->t_updates)) {
+		wake_up(&journal->j_wait_updates);
+		if (journal->j_barrier_count)
+			wake_up(&journal->j_wait_transaction_locked);
+	}
+
+	if (wait_for_commit)
+		err = jbd2_log_wait_commit(journal, tid);
+
 	lock_map_release(&handle->h_lockdep_map);
 
 	jbd2_free_handle(handle);
@@ -1719,7 +1777,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
 		goto zap_buffer_unlocked;
 
 	/* OK, we have data buffer in journaled mode */
-	spin_lock(&journal->j_state_lock);
+	write_lock(&journal->j_state_lock);
 	jbd_lock_bh_state(bh);
 	spin_lock(&journal->j_list_lock);
 
@@ -1772,7 +1830,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
 			jbd2_journal_put_journal_head(jh);
 			spin_unlock(&journal->j_list_lock);
 			jbd_unlock_bh_state(bh);
-			spin_unlock(&journal->j_state_lock);
+			write_unlock(&journal->j_state_lock);
 			return ret;
 		} else {
 			/* There is no currently-running transaction. So the
@@ -1786,7 +1844,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
 				jbd2_journal_put_journal_head(jh);
 				spin_unlock(&journal->j_list_lock);
 				jbd_unlock_bh_state(bh);
-				spin_unlock(&journal->j_state_lock);
+				write_unlock(&journal->j_state_lock);
 				return ret;
 			} else {
 				/* The orphan record's transaction has
@@ -1810,7 +1868,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
 		jbd2_journal_put_journal_head(jh);
 		spin_unlock(&journal->j_list_lock);
 		jbd_unlock_bh_state(bh);
-		spin_unlock(&journal->j_state_lock);
+		write_unlock(&journal->j_state_lock);
 		return 0;
 	} else {
 		/* Good, the buffer belongs to the running transaction.
@@ -1829,7 +1887,7 @@ zap_buffer:
 zap_buffer_no_jh:
 	spin_unlock(&journal->j_list_lock);
 	jbd_unlock_bh_state(bh);
-	spin_unlock(&journal->j_state_lock);
+	write_unlock(&journal->j_state_lock);
 zap_buffer_unlocked:
 	clear_buffer_dirty(bh);
 	J_ASSERT_BH(bh, !buffer_jbddirty(bh));
@@ -2136,9 +2194,9 @@ int jbd2_journal_begin_ordered_truncate(journal_t *journal,
 	/* Locks are here just to force reading of recent values, it is
 	 * enough that the transaction was not committing before we started
 	 * a transaction adding the inode to orphan list */
-	spin_lock(&journal->j_state_lock);
+	read_lock(&journal->j_state_lock);
 	commit_trans = journal->j_committing_transaction;
-	spin_unlock(&journal->j_state_lock);
+	read_unlock(&journal->j_state_lock);
 	spin_lock(&journal->j_list_lock);
 	inode_trans = jinode->i_transaction;
 	spin_unlock(&journal->j_list_lock);
diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c
index 55f1dde2fa8..404111b016c 100644
--- a/fs/jffs2/background.c
+++ b/fs/jffs2/background.c
@@ -2,6 +2,7 @@
  * JFFS2 -- Journalling Flash File System, Version 2.
  *
  * Copyright © 2001-2007 Red Hat, Inc.
+ * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
  *
  * Created by David Woodhouse <dwmw2@infradead.org>
  *
diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c
index c5e1450d79f..85c6be2db02 100644
--- a/fs/jffs2/build.c
+++ b/fs/jffs2/build.c
@@ -2,6 +2,7 @@
  * JFFS2 -- Journalling Flash File System, Version 2.
  *
  * Copyright © 2001-2007 Red Hat, Inc.
+ * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
  *
  * Created by David Woodhouse <dwmw2@infradead.org>
  *
@@ -22,7 +23,7 @@ static void jffs2_build_remove_unlinked_inode(struct jffs2_sb_info *,
 static inline struct jffs2_inode_cache *
 first_inode_chain(int *i, struct jffs2_sb_info *c)
 {
-	for (; *i < INOCACHE_HASHSIZE; (*i)++) {
+	for (; *i < c->inocache_hashsize; (*i)++) {
 		if (c->inocache_list[*i])
 			return c->inocache_list[*i];
 	}
diff --git a/fs/jffs2/compr.c b/fs/jffs2/compr.c
index f0294410868..de4247021d2 100644
--- a/fs/jffs2/compr.c
+++ b/fs/jffs2/compr.c
@@ -2,11 +2,12 @@
  * JFFS2 -- Journalling Flash File System, Version 2.
  *
  * Copyright © 2001-2007 Red Hat, Inc.
- * Created by Arjan van de Ven <arjanv@redhat.com>
- *
+ * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
  * Copyright © 2004 Ferenc Havasi <havasi@inf.u-szeged.hu>,
  *		    University of Szeged, Hungary
  *
+ * Created by Arjan van de Ven <arjan@infradead.org>
+ *
  * For licensing information, see the file 'LICENCE' in this directory.
  *
  */
@@ -102,7 +103,7 @@ uint16_t jffs2_compress(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
 			spin_unlock(&jffs2_compressor_list_lock);
 			*datalen  = orig_slen;
 			*cdatalen = orig_dlen;
-			compr_ret = this->compress(data_in, output_buf, datalen, cdatalen, NULL);
+			compr_ret = this->compress(data_in, output_buf, datalen, cdatalen);
 			spin_lock(&jffs2_compressor_list_lock);
 			this->usecount--;
 			if (!compr_ret) {
@@ -151,7 +152,7 @@ uint16_t jffs2_compress(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
 			spin_unlock(&jffs2_compressor_list_lock);
 			*datalen  = orig_slen;
 			*cdatalen = orig_dlen;
-			compr_ret = this->compress(data_in, this->compr_buf, datalen, cdatalen, NULL);
+			compr_ret = this->compress(data_in, this->compr_buf, datalen, cdatalen);
 			spin_lock(&jffs2_compressor_list_lock);
 			this->usecount--;
 			if (!compr_ret) {
@@ -219,7 +220,7 @@ int jffs2_decompress(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
 			if (comprtype == this->compr) {
 				this->usecount++;
 				spin_unlock(&jffs2_compressor_list_lock);
-				ret = this->decompress(cdata_in, data_out, cdatalen, datalen, NULL);
+				ret = this->decompress(cdata_in, data_out, cdatalen, datalen);
 				spin_lock(&jffs2_compressor_list_lock);
 				if (ret) {
 					printk(KERN_WARNING "Decompressor \"%s\" returned %d\n", this->name, ret);
diff --git a/fs/jffs2/compr.h b/fs/jffs2/compr.h
index 7d1d72faa77..13bb7597ab3 100644
--- a/fs/jffs2/compr.h
+++ b/fs/jffs2/compr.h
@@ -3,6 +3,7 @@
  *
  * Copyright © 2004   Ferenc Havasi <havasi@inf.u-szeged.hu>,
  *		      University of Szeged, Hungary
+ * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
  *
  * For licensing information, see the file 'LICENCE' in this directory.
  *
@@ -48,9 +49,9 @@ struct jffs2_compressor {
 	char *name;
 	char compr;			/* JFFS2_COMPR_XXX */
 	int (*compress)(unsigned char *data_in, unsigned char *cpage_out,
-			uint32_t *srclen, uint32_t *destlen, void *model);
+			uint32_t *srclen, uint32_t *destlen);
 	int (*decompress)(unsigned char *cdata_in, unsigned char *data_out,
-			  uint32_t cdatalen, uint32_t datalen, void *model);
+			  uint32_t cdatalen, uint32_t datalen);
 	int usecount;
 	int disabled;		/* if set the compressor won't compress */
 	unsigned char *compr_buf;	/* used by size compr. mode */
diff --git a/fs/jffs2/compr_lzo.c b/fs/jffs2/compr_lzo.c
index cd02acafde8..af186ee674d 100644
--- a/fs/jffs2/compr_lzo.c
+++ b/fs/jffs2/compr_lzo.c
@@ -2,6 +2,7 @@
  * JFFS2 -- Journalling Flash File System, Version 2.
  *
  * Copyright © 2007 Nokia Corporation. All rights reserved.
+ * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
  *
  * Created by Richard Purdie <rpurdie@openedhand.com>
  *
@@ -41,7 +42,7 @@ static int __init alloc_workspace(void)
 }
 
 static int jffs2_lzo_compress(unsigned char *data_in, unsigned char *cpage_out,
-			      uint32_t *sourcelen, uint32_t *dstlen, void *model)
+			      uint32_t *sourcelen, uint32_t *dstlen)
 {
 	size_t compress_size;
 	int ret;
@@ -66,7 +67,7 @@ static int jffs2_lzo_compress(unsigned char *data_in, unsigned char *cpage_out,
 }
 
 static int jffs2_lzo_decompress(unsigned char *data_in, unsigned char *cpage_out,
-				 uint32_t srclen, uint32_t destlen, void *model)
+				 uint32_t srclen, uint32_t destlen)
 {
 	size_t dl = destlen;
 	int ret;
diff --git a/fs/jffs2/compr_rtime.c b/fs/jffs2/compr_rtime.c
index 546d1538d07..16a5047903a 100644
--- a/fs/jffs2/compr_rtime.c
+++ b/fs/jffs2/compr_rtime.c
@@ -2,6 +2,7 @@
  * JFFS2 -- Journalling Flash File System, Version 2.
  *
  * Copyright © 2001-2007 Red Hat, Inc.
+ * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
  *
  * Created by Arjan van de Ven <arjanv@redhat.com>
  *
@@ -30,8 +31,7 @@
 /* _compress returns the compressed size, -1 if bigger */
 static int jffs2_rtime_compress(unsigned char *data_in,
 				unsigned char *cpage_out,
-				uint32_t *sourcelen, uint32_t *dstlen,
-				void *model)
+				uint32_t *sourcelen, uint32_t *dstlen)
 {
 	short positions[256];
 	int outpos = 0;
@@ -72,8 +72,7 @@ static int jffs2_rtime_compress(unsigned char *data_in,
 
 static int jffs2_rtime_decompress(unsigned char *data_in,
 				  unsigned char *cpage_out,
-				  uint32_t srclen, uint32_t destlen,
-				  void *model)
+				  uint32_t srclen, uint32_t destlen)
 {
 	short positions[256];
 	int outpos = 0;
diff --git a/fs/jffs2/compr_rubin.c b/fs/jffs2/compr_rubin.c
index 170d289ac78..9e7cec808c4 100644
--- a/fs/jffs2/compr_rubin.c
+++ b/fs/jffs2/compr_rubin.c
@@ -2,6 +2,7 @@
  * JFFS2 -- Journalling Flash File System, Version 2.
  *
  * Copyright © 2001-2007 Red Hat, Inc.
+ * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
  *
  * Created by Arjan van de Ven <arjanv@redhat.com>
  *
@@ -297,7 +298,7 @@ static int rubin_do_compress(int bit_divider, int *bits, unsigned char *data_in,
 #if 0
 /* _compress returns the compressed size, -1 if bigger */
 int jffs2_rubinmips_compress(unsigned char *data_in, unsigned char *cpage_out,
-		   uint32_t *sourcelen, uint32_t *dstlen, void *model)
+		   uint32_t *sourcelen, uint32_t *dstlen)
 {
 	return rubin_do_compress(BIT_DIVIDER_MIPS, bits_mips, data_in,
 				 cpage_out, sourcelen, dstlen);
@@ -305,8 +306,7 @@ int jffs2_rubinmips_compress(unsigned char *data_in, unsigned char *cpage_out,
 #endif
 static int jffs2_dynrubin_compress(unsigned char *data_in,
 				   unsigned char *cpage_out,
-				   uint32_t *sourcelen, uint32_t *dstlen,
-				   void *model)
+				   uint32_t *sourcelen, uint32_t *dstlen)
 {
 	int bits[8];
 	unsigned char histo[256];
@@ -386,8 +386,7 @@ static void rubin_do_decompress(int bit_divider, int *bits,
 
 static int jffs2_rubinmips_decompress(unsigned char *data_in,
 				      unsigned char *cpage_out,
-				      uint32_t sourcelen, uint32_t dstlen,
-				      void *model)
+				      uint32_t sourcelen, uint32_t dstlen)
 {
 	rubin_do_decompress(BIT_DIVIDER_MIPS, bits_mips, data_in,
 			    cpage_out, sourcelen, dstlen);
@@ -396,8 +395,7 @@ static int jffs2_rubinmips_decompress(unsigned char *data_in,
 
 static int jffs2_dynrubin_decompress(unsigned char *data_in,
 				     unsigned char *cpage_out,
-				     uint32_t sourcelen, uint32_t dstlen,
-				     void *model)
+				     uint32_t sourcelen, uint32_t dstlen)
 {
 	int bits[8];
 	int c;
diff --git a/fs/jffs2/compr_zlib.c b/fs/jffs2/compr_zlib.c
index b46661a4275..fd05a0b9431 100644
--- a/fs/jffs2/compr_zlib.c
+++ b/fs/jffs2/compr_zlib.c
@@ -2,6 +2,7 @@
  * JFFS2 -- Journalling Flash File System, Version 2.
  *
  * Copyright © 2001-2007 Red Hat, Inc.
+ * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
  *
  * Created by David Woodhouse <dwmw2@infradead.org>
  *
@@ -67,8 +68,7 @@ static void free_workspaces(void)
 
 static int jffs2_zlib_compress(unsigned char *data_in,
 			       unsigned char *cpage_out,
-			       uint32_t *sourcelen, uint32_t *dstlen,
-			       void *model)
+			       uint32_t *sourcelen, uint32_t *dstlen)
 {
 	int ret;
 
@@ -135,8 +135,7 @@ static int jffs2_zlib_compress(unsigned char *data_in,
 
 static int jffs2_zlib_decompress(unsigned char *data_in,
 				 unsigned char *cpage_out,
-				 uint32_t srclen, uint32_t destlen,
-				 void *model)
+				 uint32_t srclen, uint32_t destlen)
 {
 	int ret;
 	int wbits = MAX_WBITS;
diff --git a/fs/jffs2/debug.c b/fs/jffs2/debug.c
index ec353841392..e0b76c87a91 100644
--- a/fs/jffs2/debug.c
+++ b/fs/jffs2/debug.c
@@ -2,6 +2,7 @@
  * JFFS2 -- Journalling Flash File System, Version 2.
  *
  * Copyright © 2001-2007 Red Hat, Inc.
+ * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
  *
  * Created by David Woodhouse <dwmw2@infradead.org>
  *
diff --git a/fs/jffs2/debug.h b/fs/jffs2/debug.h
index a113ecc3baf..c4f8eef5ca6 100644
--- a/fs/jffs2/debug.h
+++ b/fs/jffs2/debug.h
@@ -2,6 +2,7 @@
  * JFFS2 -- Journalling Flash File System, Version 2.
  *
  * Copyright © 2001-2007 Red Hat, Inc.
+ * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
  *
  * Created by David Woodhouse <dwmw2@infradead.org>
  *
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 166062a6823..92978658ed1 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -2,6 +2,7 @@
  * JFFS2 -- Journalling Flash File System, Version 2.
  *
  * Copyright © 2001-2007 Red Hat, Inc.
+ * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
  *
  * Created by David Woodhouse <dwmw2@infradead.org>
  *
@@ -232,9 +233,7 @@ static int jffs2_create(struct inode *dir_i, struct dentry *dentry, int mode,
 	return 0;
 
  fail:
-	make_bad_inode(inode);
-	unlock_new_inode(inode);
-	iput(inode);
+	iget_failed(inode);
 	jffs2_free_raw_inode(ri);
 	return ret;
 }
@@ -290,7 +289,7 @@ static int jffs2_link (struct dentry *old_dentry, struct inode *dir_i, struct de
 		mutex_unlock(&f->sem);
 		d_instantiate(dentry, old_dentry->d_inode);
 		dir_i->i_mtime = dir_i->i_ctime = ITIME(now);
-		atomic_inc(&old_dentry->d_inode->i_count);
+		ihold(old_dentry->d_inode);
 	}
 	return ret;
 }
@@ -368,7 +367,7 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
 	}
 
 	/* We use f->target field to store the target path. */
-	f->target = kmalloc(targetlen + 1, GFP_KERNEL);
+	f->target = kmemdup(target, targetlen + 1, GFP_KERNEL);
 	if (!f->target) {
 		printk(KERN_WARNING "Can't allocate %d bytes of memory\n", targetlen + 1);
 		mutex_unlock(&f->sem);
@@ -377,7 +376,6 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
 		goto fail;
 	}
 
-	memcpy(f->target, target, targetlen + 1);
 	D1(printk(KERN_DEBUG "jffs2_symlink: symlink's target '%s' cached\n", (char *)f->target));
 
 	/* No data here. Only a metadata node, which will be
@@ -454,9 +452,7 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
 	return 0;
 
  fail:
-	make_bad_inode(inode);
-	unlock_new_inode(inode);
-	iput(inode);
+	iget_failed(inode);
 	return ret;
 }
 
@@ -601,9 +597,7 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
 	return 0;
 
  fail:
-	make_bad_inode(inode);
-	unlock_new_inode(inode);
-	iput(inode);
+	iget_failed(inode);
 	return ret;
 }
 
@@ -778,9 +772,7 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de
 	return 0;
 
  fail:
-	make_bad_inode(inode);
-	unlock_new_inode(inode);
-	iput(inode);
+	iget_failed(inode);
 	return ret;
 }
 
@@ -871,7 +863,7 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
 		printk(KERN_NOTICE "jffs2_rename(): Link succeeded, unlink failed (err %d). You now have a hard link\n", ret);
 		/* Might as well let the VFS know */
 		d_instantiate(new_dentry, old_dentry->d_inode);
-		atomic_inc(&old_dentry->d_inode->i_count);
+		ihold(old_dentry->d_inode);
 		new_dir_i->i_mtime = new_dir_i->i_ctime = ITIME(now);
 		return ret;
 	}
diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index 6286ad9b00f..e513f1913c1 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -2,6 +2,7 @@
  * JFFS2 -- Journalling Flash File System, Version 2.
  *
  * Copyright © 2001-2007 Red Hat, Inc.
+ * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
  *
  * Created by David Woodhouse <dwmw2@infradead.org>
  *
@@ -150,7 +151,7 @@ int jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count)
 		}
 
 		/* Be nice */
-		yield();
+		cond_resched();
 		mutex_lock(&c->erase_free_sem);
 		spin_lock(&c->erase_completion_lock);
 	}
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index 81349702443..1c0a08d711a 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -2,6 +2,7 @@
  * JFFS2 -- Journalling Flash File System, Version 2.
  *
  * Copyright © 2001-2007 Red Hat, Inc.
+ * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
  *
  * Created by David Woodhouse <dwmw2@infradead.org>
  *
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 459d39d1ea0..e896e67767e 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -2,6 +2,7 @@
  * JFFS2 -- Journalling Flash File System, Version 2.
  *
  * Copyright © 2001-2007 Red Hat, Inc.
+ * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
  *
  * Created by David Woodhouse <dwmw2@infradead.org>
  *
@@ -20,7 +21,6 @@
 #include <linux/vmalloc.h>
 #include <linux/vfs.h>
 #include <linux/crc32.h>
-#include <linux/smp_lock.h>
 #include "nodelist.h"
 
 static int jffs2_flash_setup(struct jffs2_sb_info *c);
@@ -169,13 +169,13 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr)
 	mutex_unlock(&f->sem);
 	jffs2_complete_reservation(c);
 
-	/* We have to do the simple_setsize() without f->sem held, since
+	/* We have to do the truncate_setsize() without f->sem held, since
 	   some pages may be locked and waiting for it in readpage().
 	   We are protected from a simultaneous write() extending i_size
 	   back past iattr->ia_size, because do_truncate() holds the
 	   generic inode semaphore. */
 	if (ivalid & ATTR_SIZE && inode->i_size > iattr->ia_size) {
-		simple_setsize(inode, iattr->ia_size);
+		truncate_setsize(inode, iattr->ia_size);
 		inode->i_blocks = (inode->i_size + 511) >> 9;
 	}	
 
@@ -225,7 +225,7 @@ int jffs2_statfs(struct dentry *dentry, struct kstatfs *buf)
 }
 
 
-void jffs2_clear_inode (struct inode *inode)
+void jffs2_evict_inode (struct inode *inode)
 {
 	/* We can forget about this inode for now - drop all
 	 *  the nodelists associated with it, etc.
@@ -233,7 +233,9 @@ void jffs2_clear_inode (struct inode *inode)
 	struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
 	struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
 
-	D1(printk(KERN_DEBUG "jffs2_clear_inode(): ino #%lu mode %o\n", inode->i_ino, inode->i_mode));
+	D1(printk(KERN_DEBUG "jffs2_evict_inode(): ino #%lu mode %o\n", inode->i_ino, inode->i_mode));
+	truncate_inode_pages(&inode->i_data, 0);
+	end_writeback(inode);
 	jffs2_do_clear_inode(c, f);
 }
 
@@ -388,7 +390,6 @@ int jffs2_remount_fs (struct super_block *sb, int *flags, char *data)
 	   This also catches the case where it was stopped and this
 	   is just a remount to restart it.
 	   Flush the writebuffer, if neccecary, else we loose it */
-	lock_kernel();
 	if (!(sb->s_flags & MS_RDONLY)) {
 		jffs2_stop_garbage_collect_thread(c);
 		mutex_lock(&c->alloc_sem);
@@ -400,8 +401,6 @@ int jffs2_remount_fs (struct super_block *sb, int *flags, char *data)
 		jffs2_start_garbage_collect_thread(c);
 
 	*flags |= MS_NOATIME;
-
-	unlock_kernel();
 	return 0;
 }
 
@@ -475,6 +474,25 @@ struct inode *jffs2_new_inode (struct inode *dir_i, int mode, struct jffs2_raw_i
 	return inode;
 }
 
+static int calculate_inocache_hashsize(uint32_t flash_size)
+{
+	/*
+	 * Pick a inocache hash size based on the size of the medium.
+	 * Count how many megabytes we're dealing with, apply a hashsize twice
+	 * that size, but rounding down to the usual big powers of 2. And keep
+	 * to sensible bounds.
+	 */
+
+	int size_mb = flash_size / 1024 / 1024;
+	int hashsize = (size_mb * 2) & ~0x3f;
+
+	if (hashsize < INOCACHE_HASHSIZE_MIN)
+		return INOCACHE_HASHSIZE_MIN;
+	if (hashsize > INOCACHE_HASHSIZE_MAX)
+		return INOCACHE_HASHSIZE_MAX;
+
+	return hashsize;
+}
 
 int jffs2_do_fill_super(struct super_block *sb, void *data, int silent)
 {
@@ -521,7 +539,8 @@ int jffs2_do_fill_super(struct super_block *sb, void *data, int silent)
 	if (ret)
 		return ret;
 
-	c->inocache_list = kcalloc(INOCACHE_HASHSIZE, sizeof(struct jffs2_inode_cache *), GFP_KERNEL);
+	c->inocache_hashsize = calculate_inocache_hashsize(c->flash_size);
+	c->inocache_list = kcalloc(c->inocache_hashsize, sizeof(struct jffs2_inode_cache *), GFP_KERNEL);
 	if (!c->inocache_list) {
 		ret = -ENOMEM;
 		goto out_wbuf;
diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c
index f5e96bd656e..31dce611337 100644
--- a/fs/jffs2/gc.c
+++ b/fs/jffs2/gc.c
@@ -2,6 +2,7 @@
  * JFFS2 -- Journalling Flash File System, Version 2.
  *
  * Copyright © 2001-2007 Red Hat, Inc.
+ * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
  *
  * Created by David Woodhouse <dwmw2@infradead.org>
  *
@@ -218,13 +219,14 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
 	if (!list_empty(&c->erase_complete_list) ||
 	    !list_empty(&c->erase_pending_list)) {
 		spin_unlock(&c->erase_completion_lock);
+		mutex_unlock(&c->alloc_sem);
 		D1(printk(KERN_DEBUG "jffs2_garbage_collect_pass() erasing pending blocks\n"));
-		if (jffs2_erase_pending_blocks(c, 1)) {
-			mutex_unlock(&c->alloc_sem);
+		if (jffs2_erase_pending_blocks(c, 1))
 			return 0;
-		}
+
 		D1(printk(KERN_DEBUG "No progress from erasing blocks; doing GC anyway\n"));
 		spin_lock(&c->erase_completion_lock);
+		mutex_lock(&c->alloc_sem);
 	}
 
 	/* First, work out which block we're garbage-collecting */
diff --git a/fs/jffs2/ioctl.c b/fs/jffs2/ioctl.c
index 9d41f43e47b..859a598af02 100644
--- a/fs/jffs2/ioctl.c
+++ b/fs/jffs2/ioctl.c
@@ -2,6 +2,7 @@
  * JFFS2 -- Journalling Flash File System, Version 2.
  *
  * Copyright © 2001-2007 Red Hat, Inc.
+ * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
  *
  * Created by David Woodhouse <dwmw2@infradead.org>
  *
diff --git a/fs/jffs2/jffs2_fs_i.h b/fs/jffs2/jffs2_fs_i.h
index c6923da9826..2e4a86763c0 100644
--- a/fs/jffs2/jffs2_fs_i.h
+++ b/fs/jffs2/jffs2_fs_i.h
@@ -2,6 +2,7 @@
  * JFFS2 -- Journalling Flash File System, Version 2.
  *
  * Copyright © 2001-2007 Red Hat, Inc.
+ * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
  *
  * Created by David Woodhouse <dwmw2@infradead.org>
  *
diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h
index 85ef6dbb1be..f864005de64 100644
--- a/fs/jffs2/jffs2_fs_sb.h
+++ b/fs/jffs2/jffs2_fs_sb.h
@@ -2,6 +2,7 @@
  * JFFS2 -- Journalling Flash File System, Version 2.
  *
  * Copyright © 2001-2007 Red Hat, Inc.
+ * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
  *
  * Created by David Woodhouse <dwmw2@infradead.org>
  *
@@ -99,6 +100,7 @@ struct jffs2_sb_info {
 	wait_queue_head_t erase_wait;		/* For waiting for erases to complete */
 
 	wait_queue_head_t inocache_wq;
+	int inocache_hashsize;
 	struct jffs2_inode_cache **inocache_list;
 	spinlock_t inocache_lock;
 
diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c
index af02bd13846..5e03233c236 100644
--- a/fs/jffs2/nodelist.c
+++ b/fs/jffs2/nodelist.c
@@ -420,7 +420,7 @@ struct jffs2_inode_cache *jffs2_get_ino_cache(struct jffs2_sb_info *c, uint32_t
 {
 	struct jffs2_inode_cache *ret;
 
-	ret = c->inocache_list[ino % INOCACHE_HASHSIZE];
+	ret = c->inocache_list[ino % c->inocache_hashsize];
 	while (ret && ret->ino < ino) {
 		ret = ret->next;
 	}
@@ -441,7 +441,7 @@ void jffs2_add_ino_cache (struct jffs2_sb_info *c, struct jffs2_inode_cache *new
 
 	dbg_inocache("add %p (ino #%u)\n", new, new->ino);
 
-	prev = &c->inocache_list[new->ino % INOCACHE_HASHSIZE];
+	prev = &c->inocache_list[new->ino % c->inocache_hashsize];
 
 	while ((*prev) && (*prev)->ino < new->ino) {
 		prev = &(*prev)->next;
@@ -462,7 +462,7 @@ void jffs2_del_ino_cache(struct jffs2_sb_info *c, struct jffs2_inode_cache *old)
 	dbg_inocache("del %p (ino #%u)\n", old, old->ino);
 	spin_lock(&c->inocache_lock);
 
-	prev = &c->inocache_list[old->ino % INOCACHE_HASHSIZE];
+	prev = &c->inocache_list[old->ino % c->inocache_hashsize];
 
 	while ((*prev) && (*prev)->ino < old->ino) {
 		prev = &(*prev)->next;
@@ -487,7 +487,7 @@ void jffs2_free_ino_caches(struct jffs2_sb_info *c)
 	int i;
 	struct jffs2_inode_cache *this, *next;
 
-	for (i=0; i<INOCACHE_HASHSIZE; i++) {
+	for (i=0; i < c->inocache_hashsize; i++) {
 		this = c->inocache_list[i];
 		while (this) {
 			next = this->next;
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index a881a42f19e..5a53d9bdb2b 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -24,7 +24,6 @@
 #ifdef __ECOS
 #include "os-ecos.h"
 #else
-#include <linux/mtd/compatmac.h> /* For compatibility with older kernels */
 #include "os-linux.h"
 #endif
 
@@ -200,7 +199,8 @@ struct jffs2_inode_cache {
 #define RAWNODE_CLASS_XATTR_DATUM	1
 #define RAWNODE_CLASS_XATTR_REF		2
 
-#define INOCACHE_HASHSIZE 128
+#define INOCACHE_HASHSIZE_MIN 128
+#define INOCACHE_HASHSIZE_MAX 1024
 
 #define write_ofs(c) ((c)->nextblock->offset + (c)->sector_size - (c)->nextblock->free_size)
 
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index 4791aacf308..00bae7cc2e4 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -171,7 +171,7 @@ extern const struct inode_operations jffs2_symlink_inode_operations;
 int jffs2_setattr (struct dentry *, struct iattr *);
 int jffs2_do_setattr (struct inode *, struct iattr *);
 struct inode *jffs2_iget(struct super_block *, unsigned long);
-void jffs2_clear_inode (struct inode *);
+void jffs2_evict_inode (struct inode *);
 void jffs2_dirty_inode(struct inode *inode);
 struct inode *jffs2_new_inode (struct inode *dir_i, int mode,
 			       struct jffs2_raw_inode *ri);
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 46f870d1cc3..b632dddcb48 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -20,7 +20,7 @@
 #include "summary.h"
 #include "debug.h"
 
-#define DEFAULT_EMPTY_SCAN_SIZE 1024
+#define DEFAULT_EMPTY_SCAN_SIZE 256
 
 #define noisy_printk(noise, args...) do { \
 	if (*(noise)) { \
@@ -435,7 +435,7 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo
 				  unsigned char *buf, uint32_t buf_size, struct jffs2_summary *s) {
 	struct jffs2_unknown_node *node;
 	struct jffs2_unknown_node crcnode;
-	uint32_t ofs, prevofs;
+	uint32_t ofs, prevofs, max_ofs;
 	uint32_t hdr_crc, buf_ofs, buf_len;
 	int err;
 	int noise = 0;
@@ -550,12 +550,12 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo
 
 	/* We temporarily use 'ofs' as a pointer into the buffer/jeb */
 	ofs = 0;
-
-	/* Scan only 4KiB of 0xFF before declaring it's empty */
-	while(ofs < EMPTY_SCAN_SIZE(c->sector_size) && *(uint32_t *)(&buf[ofs]) == 0xFFFFFFFF)
+	max_ofs = EMPTY_SCAN_SIZE(c->sector_size);
+	/* Scan only EMPTY_SCAN_SIZE of 0xFF before declaring it's empty */
+	while(ofs < max_ofs && *(uint32_t *)(&buf[ofs]) == 0xFFFFFFFF)
 		ofs += 4;
 
-	if (ofs == EMPTY_SCAN_SIZE(c->sector_size)) {
+	if (ofs == max_ofs) {
 #ifdef CONFIG_JFFS2_FS_WRITEBUFFER
 		if (jffs2_cleanmarker_oob(c)) {
 			/* scan oob, take care of cleanmarker */
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 511e2d609d1..c86041b866a 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -12,7 +12,6 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
 #include <linux/init.h>
 #include <linux/list.h>
 #include <linux/fs.h>
@@ -135,7 +134,7 @@ static const struct super_operations jffs2_super_operations =
 	.write_super =	jffs2_write_super,
 	.statfs =	jffs2_statfs,
 	.remount_fs =	jffs2_remount_fs,
-	.clear_inode =	jffs2_clear_inode,
+	.evict_inode =	jffs2_evict_inode,
 	.dirty_inode =	jffs2_dirty_inode,
 	.sync_fs =	jffs2_sync_fs,
 };
@@ -146,6 +145,7 @@ static const struct super_operations jffs2_super_operations =
 static int jffs2_fill_super(struct super_block *sb, void *data, int silent)
 {
 	struct jffs2_sb_info *c;
+	int ret;
 
 	D1(printk(KERN_DEBUG "jffs2_get_sb_mtd():"
 		  " New superblock for device %d (\"%s\")\n",
@@ -175,15 +175,15 @@ static int jffs2_fill_super(struct super_block *sb, void *data, int silent)
 #ifdef CONFIG_JFFS2_FS_POSIX_ACL
 	sb->s_flags |= MS_POSIXACL;
 #endif
-	return jffs2_do_fill_super(sb, data, silent);
+	ret = jffs2_do_fill_super(sb, data, silent);
+	return ret;
 }
 
-static int jffs2_get_sb(struct file_system_type *fs_type,
+static struct dentry *jffs2_mount(struct file_system_type *fs_type,
 			int flags, const char *dev_name,
-			void *data, struct vfsmount *mnt)
+			void *data)
 {
-	return get_sb_mtd(fs_type, flags, dev_name, data, jffs2_fill_super,
-			  mnt);
+	return mount_mtd(fs_type, flags, dev_name, data, jffs2_fill_super);
 }
 
 static void jffs2_put_super (struct super_block *sb)
@@ -192,8 +192,6 @@ static void jffs2_put_super (struct super_block *sb)
 
 	D2(printk(KERN_DEBUG "jffs2: jffs2_put_super()\n"));
 
-	lock_kernel();
-
 	if (sb->s_dirt)
 		jffs2_write_super(sb);
 
@@ -215,8 +213,6 @@ static void jffs2_put_super (struct super_block *sb)
 	if (c->mtd->sync)
 		c->mtd->sync(c->mtd);
 
-	unlock_kernel();
-
 	D1(printk(KERN_DEBUG "jffs2_put_super returning\n"));
 }
 
@@ -232,7 +228,7 @@ static void jffs2_kill_sb(struct super_block *sb)
 static struct file_system_type jffs2_fs_type = {
 	.owner =	THIS_MODULE,
 	.name =		"jffs2",
-	.get_sb =	jffs2_get_sb,
+	.mount =	jffs2_mount,
 	.kill_sb =	jffs2_kill_sb,
 };
 
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index d258e261bdc..9b572ca40a4 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -588,7 +588,7 @@ static void delete_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *re
 
 void jffs2_xattr_delete_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic)
 {
-	/* It's called from jffs2_clear_inode() on inode removing.
+	/* It's called from jffs2_evict_inode() on inode removing.
 	   When an inode with XATTR is removed, those XATTRs must be removed. */
 	struct jffs2_xattr_ref *ref, *_ref;
 
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 127263cc865..c5ce6c1d1ff 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -17,6 +17,7 @@
  *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
+#include <linux/mm.h>
 #include <linux/fs.h>
 #include <linux/quotaops.h>
 #include "jfs_incore.h"
@@ -107,11 +108,18 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr)
 			return rc;
 	}
 
-	rc = inode_setattr(inode, iattr);
+	if ((iattr->ia_valid & ATTR_SIZE) &&
+	    iattr->ia_size != i_size_read(inode)) {
+		rc = vmtruncate(inode, iattr->ia_size);
+		if (rc)
+			return rc;
+	}
 
-	if (!rc && (iattr->ia_valid & ATTR_MODE))
-		rc = jfs_acl_chmod(inode);
+	setattr_copy(inode, iattr);
+	mark_inode_dirty(inode);
 
+	if (iattr->ia_valid & ATTR_MODE)
+		rc = jfs_acl_chmod(inode);
 	return rc;
 }
 
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index ed9ba6fe04f..9978803ceed 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -145,31 +145,32 @@ int jfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 		return 0;
 }
 
-void jfs_delete_inode(struct inode *inode)
+void jfs_evict_inode(struct inode *inode)
 {
-	jfs_info("In jfs_delete_inode, inode = 0x%p", inode);
+	jfs_info("In jfs_evict_inode, inode = 0x%p", inode);
 
-	if (!is_bad_inode(inode))
+	if (!inode->i_nlink && !is_bad_inode(inode)) {
 		dquot_initialize(inode);
 
-	if (!is_bad_inode(inode) &&
-	    (JFS_IP(inode)->fileset == FILESYSTEM_I)) {
-		truncate_inode_pages(&inode->i_data, 0);
+		if (JFS_IP(inode)->fileset == FILESYSTEM_I) {
+			truncate_inode_pages(&inode->i_data, 0);
 
-		if (test_cflag(COMMIT_Freewmap, inode))
-			jfs_free_zero_link(inode);
+			if (test_cflag(COMMIT_Freewmap, inode))
+				jfs_free_zero_link(inode);
 
-		diFree(inode);
+			diFree(inode);
 
-		/*
-		 * Free the inode from the quota allocation.
-		 */
-		dquot_initialize(inode);
-		dquot_free_inode(inode);
-		dquot_drop(inode);
+			/*
+			 * Free the inode from the quota allocation.
+			 */
+			dquot_initialize(inode);
+			dquot_free_inode(inode);
+		}
+	} else {
+		truncate_inode_pages(&inode->i_data, 0);
 	}
-
-	clear_inode(inode);
+	end_writeback(inode);
+	dquot_drop(inode);
 }
 
 void jfs_dirty_inode(struct inode *inode)
@@ -303,8 +304,17 @@ static int jfs_write_begin(struct file *file, struct address_space *mapping,
 				loff_t pos, unsigned len, unsigned flags,
 				struct page **pagep, void **fsdata)
 {
-	return nobh_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+	int ret;
+
+	ret = nobh_write_begin(mapping, pos, len, flags, pagep, fsdata,
 				jfs_get_block);
+	if (unlikely(ret)) {
+		loff_t isize = mapping->host->i_size;
+		if (pos + len > isize)
+			vmtruncate(mapping->host, isize);
+	}
+
+	return ret;
 }
 
 static sector_t jfs_bmap(struct address_space *mapping, sector_t block)
@@ -317,9 +327,24 @@ static ssize_t jfs_direct_IO(int rw, struct kiocb *iocb,
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
+	ssize_t ret;
 
-	return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
+	ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
 				offset, nr_segs, jfs_get_block, NULL);
+
+	/*
+	 * In case of error extending write may have instantiated a few
+	 * blocks outside i_size. Trim these off again.
+	 */
+	if (unlikely((rw & WRITE) && ret < 0)) {
+		loff_t isize = i_size_read(inode);
+		loff_t end = offset + iov_length(iov, nr_segs);
+
+		if (end > isize)
+			vmtruncate(inode, isize);
+	}
+
+	return ret;
 }
 
 const struct address_space_operations jfs_aops = {
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index f8332dc8eeb..3a09423b6c2 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -497,7 +497,7 @@ struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary)
 	 * appear hashed, but do not put on any lists.  hlist_del()
 	 * will work fine and require no locking.
 	 */
-	ip->i_hash.pprev = &ip->i_hash.next;
+	hlist_add_fake(&ip->i_hash);
 
 	return (ip);
 }
diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h
index 11042b1f44b..155e91eff07 100644
--- a/fs/jfs/jfs_inode.h
+++ b/fs/jfs/jfs_inode.h
@@ -27,7 +27,7 @@ extern long jfs_compat_ioctl(struct file *, unsigned int, unsigned long);
 extern struct inode *jfs_iget(struct super_block *, unsigned long);
 extern int jfs_commit_inode(struct inode *, int);
 extern int jfs_write_inode(struct inode *, struct writeback_control *);
-extern void jfs_delete_inode(struct inode *);
+extern void jfs_evict_inode(struct inode *);
 extern void jfs_dirty_inode(struct inode *);
 extern void jfs_truncate(struct inode *);
 extern void jfs_truncate_nolock(struct inode *, loff_t);
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index c51af2a1451..e1b8493b9aa 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -1010,15 +1010,13 @@ static int lmLogSync(struct jfs_log * log, int hard_sync)
 		 * option 2 - shutdown file systems
 		 *	      associated with log ?
 		 * option 3 - extend log ?
-		 */
-		/*
 		 * option 4 - second chance
 		 *
 		 * mark log wrapped, and continue.
 		 * when all active transactions are completed,
-		 * mark log vaild for recovery.
+		 * mark log valid for recovery.
 		 * if crashed during invalid state, log state
-		 * implies invald log, forcing fsck().
+		 * implies invalid log, forcing fsck().
 		 */
 		/* mark log state log wrap in log superblock */
 		/* log->state = LOGWRAP; */
diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c
index 7b698f2ec45..9895595fd2f 100644
--- a/fs/jfs/jfs_mount.c
+++ b/fs/jfs/jfs_mount.c
@@ -97,7 +97,7 @@ int jfs_mount(struct super_block *sb)
 
 	ipaimap = diReadSpecial(sb, AGGREGATE_I, 0);
 	if (ipaimap == NULL) {
-		jfs_err("jfs_mount: Faild to read AGGREGATE_I");
+		jfs_err("jfs_mount: Failed to read AGGREGATE_I");
 		rc = -EIO;
 		goto errout20;
 	}
@@ -148,7 +148,7 @@ int jfs_mount(struct super_block *sb)
 	if ((sbi->mntflag & JFS_BAD_SAIT) == 0) {
 		ipaimap2 = diReadSpecial(sb, AGGREGATE_I, 1);
 		if (!ipaimap2) {
-			jfs_err("jfs_mount: Faild to read AGGREGATE_I");
+			jfs_err("jfs_mount: Failed to read AGGREGATE_I");
 			rc = -EIO;
 			goto errout35;
 		}
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index d945ea76b44..9466957ec84 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -1279,7 +1279,7 @@ int txCommit(tid_t tid,		/* transaction identifier */
 	 * lazy commit thread finishes processing
 	 */
 	if (tblk->xflag & COMMIT_DELETE) {
-		atomic_inc(&tblk->u.ip->i_count);
+		ihold(tblk->u.ip);
 		/*
 		 * Avoid a rare deadlock
 		 *
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index a9cf8e8675b..231ca4af9bc 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -839,7 +839,7 @@ static int jfs_link(struct dentry *old_dentry,
 	ip->i_ctime = CURRENT_TIME;
 	dir->i_ctime = dir->i_mtime = CURRENT_TIME;
 	mark_inode_dirty(dir);
-	atomic_inc(&ip->i_count);
+	ihold(ip);
 
 	iplist[0] = ip;
 	iplist[1] = dir;
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index b38f96bef82..0669fc1cc3b 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -33,7 +33,6 @@
 #include <linux/slab.h>
 #include <asm/uaccess.h>
 #include <linux/seq_file.h>
-#include <linux/smp_lock.h>
 
 #include "jfs_incore.h"
 #include "jfs_filsys.h"
@@ -132,11 +131,6 @@ static void jfs_destroy_inode(struct inode *inode)
 	kmem_cache_free(jfs_inode_cachep, ji);
 }
 
-static void jfs_clear_inode(struct inode *inode)
-{
-	dquot_drop(inode);
-}
-
 static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct jfs_sb_info *sbi = JFS_SBI(dentry->d_sb);
@@ -181,8 +175,6 @@ static void jfs_put_super(struct super_block *sb)
 
 	dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
 
-	lock_kernel();
-
 	rc = jfs_umount(sb);
 	if (rc)
 		jfs_err("jfs_umount failed with return code %d", rc);
@@ -193,8 +185,6 @@ static void jfs_put_super(struct super_block *sb)
 	iput(sbi->direct_inode);
 
 	kfree(sbi);
-
-	unlock_kernel();
 }
 
 enum {
@@ -374,19 +364,16 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
 	if (!parse_options(data, sb, &newLVSize, &flag)) {
 		return -EINVAL;
 	}
-	lock_kernel();
+
 	if (newLVSize) {
 		if (sb->s_flags & MS_RDONLY) {
 			printk(KERN_ERR
 		  "JFS: resize requires volume to be mounted read-write\n");
-			unlock_kernel();
 			return -EROFS;
 		}
 		rc = jfs_extendfs(sb, newLVSize, 0);
-		if (rc) {
-			unlock_kernel();
+		if (rc)
 			return rc;
-		}
 	}
 
 	if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) {
@@ -402,36 +389,30 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
 		/* mark the fs r/w for quota activity */
 		sb->s_flags &= ~MS_RDONLY;
 
-		unlock_kernel();
 		dquot_resume(sb, -1);
 		return ret;
 	}
 	if ((!(sb->s_flags & MS_RDONLY)) && (*flags & MS_RDONLY)) {
 		rc = dquot_suspend(sb, -1);
 		if (rc < 0) {
-			unlock_kernel();
 			return rc;
 		}
 		rc = jfs_umount_rw(sb);
 		JFS_SBI(sb)->flag = flag;
-		unlock_kernel();
 		return rc;
 	}
 	if ((JFS_SBI(sb)->flag & JFS_NOINTEGRITY) != (flag & JFS_NOINTEGRITY))
 		if (!(sb->s_flags & MS_RDONLY)) {
 			rc = jfs_umount_rw(sb);
-			if (rc) {
-				unlock_kernel();
+			if (rc)
 				return rc;
-			}
+
 			JFS_SBI(sb)->flag = flag;
 			ret = jfs_mount_rw(sb, 1);
-			unlock_kernel();
 			return ret;
 		}
 	JFS_SBI(sb)->flag = flag;
 
-	unlock_kernel();
 	return 0;
 }
 
@@ -451,6 +432,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
 	sbi = kzalloc(sizeof (struct jfs_sb_info), GFP_KERNEL);
 	if (!sbi)
 		return -ENOMEM;
+
 	sb->s_fs_info = sbi;
 	sbi->sb = sb;
 	sbi->uid = sbi->gid = sbi->umask = -1;
@@ -601,11 +583,10 @@ static int jfs_unfreeze(struct super_block *sb)
 	return 0;
 }
 
-static int jfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *jfs_do_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, jfs_fill_super,
-			   mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, jfs_fill_super);
 }
 
 static int jfs_sync_fs(struct super_block *sb, int wait)
@@ -765,8 +746,7 @@ static const struct super_operations jfs_super_operations = {
 	.destroy_inode	= jfs_destroy_inode,
 	.dirty_inode	= jfs_dirty_inode,
 	.write_inode	= jfs_write_inode,
-	.delete_inode	= jfs_delete_inode,
-	.clear_inode	= jfs_clear_inode,
+	.evict_inode	= jfs_evict_inode,
 	.put_super	= jfs_put_super,
 	.sync_fs	= jfs_sync_fs,
 	.freeze_fs	= jfs_freeze,
@@ -789,7 +769,7 @@ static const struct export_operations jfs_export_operations = {
 static struct file_system_type jfs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "jfs",
-	.get_sb		= jfs_get_sb,
+	.mount		= jfs_do_mount,
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index fa96bbb2634..2d7f165d0f1 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -86,46 +86,25 @@ struct ea_buffer {
 #define EA_MALLOC	0x0008
 
 
+static int is_known_namespace(const char *name)
+{
+	if (strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) &&
+	    strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) &&
+	    strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) &&
+	    strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN))
+		return false;
+
+	return true;
+}
+
 /*
  * These three routines are used to recognize on-disk extended attributes
  * that are in a recognized namespace.  If the attribute is not recognized,
  * "os2." is prepended to the name
  */
-static inline int is_os2_xattr(struct jfs_ea *ea)
+static int is_os2_xattr(struct jfs_ea *ea)
 {
-	/*
-	 * Check for "system."
-	 */
-	if ((ea->namelen >= XATTR_SYSTEM_PREFIX_LEN) &&
-	    !strncmp(ea->name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
-		return false;
-	/*
-	 * Check for "user."
-	 */
-	if ((ea->namelen >= XATTR_USER_PREFIX_LEN) &&
-	    !strncmp(ea->name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN))
-		return false;
-	/*
-	 * Check for "security."
-	 */
-	if ((ea->namelen >= XATTR_SECURITY_PREFIX_LEN) &&
-	    !strncmp(ea->name, XATTR_SECURITY_PREFIX,
-		     XATTR_SECURITY_PREFIX_LEN))
-		return false;
-	/*
-	 * Check for "trusted."
-	 */
-	if ((ea->namelen >= XATTR_TRUSTED_PREFIX_LEN) &&
-	    !strncmp(ea->name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN))
-		return false;
-	/*
-	 * Add any other valid namespace prefixes here
-	 */
-
-	/*
-	 * We assume it's OS/2's flat namespace
-	 */
-	return true;
+	return !is_known_namespace(ea->name);
 }
 
 static inline int name_size(struct jfs_ea *ea)
@@ -764,13 +743,23 @@ static int can_set_xattr(struct inode *inode, const char *name,
 	if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
 		return can_set_system_xattr(inode, name, value, value_len);
 
+	if (!strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN)) {
+		/*
+		 * This makes sure that we aren't trying to set an
+		 * attribute in a different namespace by prefixing it
+		 * with "os2."
+		 */
+		if (is_known_namespace(name + XATTR_OS2_PREFIX_LEN))
+				return -EOPNOTSUPP;
+		return 0;
+	}
+
 	/*
 	 * Don't allow setting an attribute in an unknown namespace.
 	 */
 	if (strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) &&
 	    strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) &&
-	    strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) &&
-	    strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN))
+	    strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN))
 		return -EOPNOTSUPP;
 
 	return 0;
@@ -952,19 +941,8 @@ ssize_t __jfs_getxattr(struct inode *inode, const char *name, void *data,
 	int xattr_size;
 	ssize_t size;
 	int namelen = strlen(name);
-	char *os2name = NULL;
 	char *value;
 
-	if (strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) {
-		os2name = kmalloc(namelen - XATTR_OS2_PREFIX_LEN + 1,
-				  GFP_KERNEL);
-		if (!os2name)
-			return -ENOMEM;
-		strcpy(os2name, name + XATTR_OS2_PREFIX_LEN);
-		name = os2name;
-		namelen -= XATTR_OS2_PREFIX_LEN;
-	}
-
 	down_read(&JFS_IP(inode)->xattr_sem);
 
 	xattr_size = ea_get(inode, &ea_buf, 0);
@@ -1002,8 +980,6 @@ ssize_t __jfs_getxattr(struct inode *inode, const char *name, void *data,
       out:
 	up_read(&JFS_IP(inode)->xattr_sem);
 
-	kfree(os2name);
-
 	return size;
 }
 
@@ -1012,6 +988,19 @@ ssize_t jfs_getxattr(struct dentry *dentry, const char *name, void *data,
 {
 	int err;
 
+	if (strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) {
+		/*
+		 * skip past "os2." prefix
+		 */
+		name += XATTR_OS2_PREFIX_LEN;
+		/*
+		 * Don't allow retrieving properly prefixed attributes
+		 * by prepending them with "os2."
+		 */
+		if (is_known_namespace(name))
+			return -EOPNOTSUPP;
+	}
+
 	err = __jfs_getxattr(dentry->d_inode, name, data, buf_size);
 
 	return err;
diff --git a/fs/libfs.c b/fs/libfs.c
index dcaf972cbf1..a3accdf528a 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -201,9 +201,8 @@ static const struct super_operations simple_super_operations = {
  * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that
  * will never be mountable)
  */
-int get_sb_pseudo(struct file_system_type *fs_type, char *name,
-	const struct super_operations *ops, unsigned long magic,
-	struct vfsmount *mnt)
+struct dentry *mount_pseudo(struct file_system_type *fs_type, char *name,
+	const struct super_operations *ops, unsigned long magic)
 {
 	struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL);
 	struct dentry *dentry;
@@ -211,7 +210,7 @@ int get_sb_pseudo(struct file_system_type *fs_type, char *name,
 	struct qstr d_name = {.name = name, .len = strlen(name)};
 
 	if (IS_ERR(s))
-		return PTR_ERR(s);
+		return ERR_CAST(s);
 
 	s->s_flags = MS_NOUSER;
 	s->s_maxbytes = MAX_LFS_FILESIZE;
@@ -241,12 +240,11 @@ int get_sb_pseudo(struct file_system_type *fs_type, char *name,
 	d_instantiate(dentry, root);
 	s->s_root = dentry;
 	s->s_flags |= MS_ACTIVE;
-	simple_set_mnt(mnt, s);
-	return 0;
+	return dget(s->s_root);
 
 Enomem:
 	deactivate_locked_super(s);
-	return -ENOMEM;
+	return ERR_PTR(-ENOMEM);
 }
 
 int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
@@ -255,7 +253,7 @@ int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *den
 
 	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
 	inc_nlink(inode);
-	atomic_inc(&inode->i_count);
+	ihold(inode);
 	dget(dentry);
 	d_instantiate(dentry, inode);
 	return 0;
@@ -327,77 +325,35 @@ int simple_rename(struct inode *old_dir, struct dentry *old_dentry,
 }
 
 /**
- * simple_setsize - handle core mm and vfs requirements for file size change
- * @inode: inode
- * @newsize: new file size
- *
- * Returns 0 on success, -error on failure.
- *
- * simple_setsize must be called with inode_mutex held.
- *
- * simple_setsize will check that the requested new size is OK (see
- * inode_newsize_ok), and then will perform the necessary i_size update
- * and pagecache truncation (if necessary). It will be typically be called
- * from the filesystem's setattr function when ATTR_SIZE is passed in.
- *
- * The inode itself must have correct permissions and attributes to allow
- * i_size to be changed, this function then just checks that the new size
- * requested is valid.
- *
- * In the case of simple in-memory filesystems with inodes stored solely
- * in the inode cache, and file data in the pagecache, nothing more needs
- * to be done to satisfy a truncate request. Filesystems with on-disk
- * blocks for example will need to free them in the case of truncate, in
- * that case it may be easier not to use simple_setsize (but each of its
- * components will likely be required at some point to update pagecache
- * and inode etc).
- */
-int simple_setsize(struct inode *inode, loff_t newsize)
-{
-	loff_t oldsize;
-	int error;
-
-	error = inode_newsize_ok(inode, newsize);
-	if (error)
-		return error;
-
-	oldsize = inode->i_size;
-	i_size_write(inode, newsize);
-	truncate_pagecache(inode, oldsize, newsize);
-
-	return error;
-}
-EXPORT_SYMBOL(simple_setsize);
-
-/**
- * simple_setattr - setattr for simple in-memory filesystem
+ * simple_setattr - setattr for simple filesystem
  * @dentry: dentry
  * @iattr: iattr structure
  *
  * Returns 0 on success, -error on failure.
  *
- * simple_setattr implements setattr for an in-memory filesystem which
- * does not store its own file data or metadata (eg. uses the page cache
- * and inode cache as its data store).
+ * simple_setattr is a simple ->setattr implementation without a proper
+ * implementation of size changes.
+ *
+ * It can either be used for in-memory filesystems or special files
+ * on simple regular filesystems.  Anything that needs to change on-disk
+ * or wire state on size changes needs its own setattr method.
  */
 int simple_setattr(struct dentry *dentry, struct iattr *iattr)
 {
 	struct inode *inode = dentry->d_inode;
 	int error;
 
+	WARN_ON_ONCE(inode->i_op->truncate);
+
 	error = inode_change_ok(inode, iattr);
 	if (error)
 		return error;
 
-	if (iattr->ia_valid & ATTR_SIZE) {
-		error = simple_setsize(inode, iattr->ia_size);
-		if (error)
-			return error;
-	}
-
-	generic_setattr(inode, iattr);
-
-	return error;
+	if (iattr->ia_valid & ATTR_SIZE)
+		truncate_setsize(inode, iattr->ia_size);
+	setattr_copy(inode, iattr);
+	mark_inode_dirty(inode);
+	return 0;
 }
 EXPORT_SYMBOL(simple_setattr);
 
@@ -934,10 +890,6 @@ EXPORT_SYMBOL_GPL(generic_fh_to_parent);
  */
 int generic_file_fsync(struct file *file, int datasync)
 {
-	struct writeback_control wbc = {
-		.sync_mode = WB_SYNC_ALL,
-		.nr_to_write = 0, /* metadata-only; caller takes care of data */
-	};
 	struct inode *inode = file->f_mapping->host;
 	int err;
 	int ret;
@@ -948,13 +900,42 @@ int generic_file_fsync(struct file *file, int datasync)
 	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
 		return ret;
 
-	err = sync_inode(inode, &wbc);
+	err = sync_inode_metadata(inode, 1);
 	if (ret == 0)
 		ret = err;
 	return ret;
 }
 EXPORT_SYMBOL(generic_file_fsync);
 
+/**
+ * generic_check_addressable - Check addressability of file system
+ * @blocksize_bits:	log of file system block size
+ * @num_blocks:		number of blocks in file system
+ *
+ * Determine whether a file system with @num_blocks blocks (and a
+ * block size of 2**@blocksize_bits) is addressable by the sector_t
+ * and page cache of the system.  Return 0 if so and -EFBIG otherwise.
+ */
+int generic_check_addressable(unsigned blocksize_bits, u64 num_blocks)
+{
+	u64 last_fs_block = num_blocks - 1;
+	u64 last_fs_page =
+		last_fs_block >> (PAGE_CACHE_SHIFT - blocksize_bits);
+
+	if (unlikely(num_blocks == 0))
+		return 0;
+
+	if ((blocksize_bits < 9) || (blocksize_bits > PAGE_CACHE_SHIFT))
+		return -EINVAL;
+
+	if ((last_fs_block > (sector_t)(~0ULL) >> (blocksize_bits - 9)) ||
+	    (last_fs_page > (pgoff_t)(~0ULL))) {
+		return -EFBIG;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(generic_check_addressable);
+
 /*
  * No-op implementation of ->fsync for in-memory filesystems.
  */
@@ -968,7 +949,7 @@ EXPORT_SYMBOL(dcache_dir_lseek);
 EXPORT_SYMBOL(dcache_dir_open);
 EXPORT_SYMBOL(dcache_readdir);
 EXPORT_SYMBOL(generic_read_dir);
-EXPORT_SYMBOL(get_sb_pseudo);
+EXPORT_SYMBOL(mount_pseudo);
 EXPORT_SYMBOL(simple_write_begin);
 EXPORT_SYMBOL(simple_write_end);
 EXPORT_SYMBOL(simple_dir_inode_operations);
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 64fd427c993..d5bb86866e6 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -42,6 +42,7 @@ struct nlm_wait {
 };
 
 static LIST_HEAD(nlm_blocked);
+static DEFINE_SPINLOCK(nlm_blocked_lock);
 
 /**
  * nlmclnt_init - Set up per-NFS mount point lockd data structures
@@ -97,7 +98,10 @@ struct nlm_wait *nlmclnt_prepare_block(struct nlm_host *host, struct file_lock *
 		block->b_lock = fl;
 		init_waitqueue_head(&block->b_wait);
 		block->b_status = nlm_lck_blocked;
+
+		spin_lock(&nlm_blocked_lock);
 		list_add(&block->b_list, &nlm_blocked);
+		spin_unlock(&nlm_blocked_lock);
 	}
 	return block;
 }
@@ -106,7 +110,9 @@ void nlmclnt_finish_block(struct nlm_wait *block)
 {
 	if (block == NULL)
 		return;
+	spin_lock(&nlm_blocked_lock);
 	list_del(&block->b_list);
+	spin_unlock(&nlm_blocked_lock);
 	kfree(block);
 }
 
@@ -154,6 +160,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock)
 	 * Look up blocked request based on arguments. 
 	 * Warning: must not use cookie to match it!
 	 */
+	spin_lock(&nlm_blocked_lock);
 	list_for_each_entry(block, &nlm_blocked, b_list) {
 		struct file_lock *fl_blocked = block->b_lock;
 
@@ -178,6 +185,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock)
 		wake_up(&block->b_wait);
 		res = nlm_granted;
 	}
+	spin_unlock(&nlm_blocked_lock);
 	return res;
 }
 
@@ -216,10 +224,6 @@ reclaimer(void *ptr)
 	allow_signal(SIGKILL);
 
 	down_write(&host->h_rwsem);
-
-	/* This one ensures that our parent doesn't terminate while the
-	 * reclaim is in progress */
-	lock_kernel();
 	lockd_up();	/* note: this cannot fail as lockd is already running */
 
 	dprintk("lockd: reclaiming locks for host %s\n", host->h_name);
@@ -260,16 +264,17 @@ restart:
 	dprintk("NLM: done reclaiming locks for host %s\n", host->h_name);
 
 	/* Now, wake up all processes that sleep on a blocked lock */
+	spin_lock(&nlm_blocked_lock);
 	list_for_each_entry(block, &nlm_blocked, b_list) {
 		if (block->b_host == host) {
 			block->b_status = nlm_lck_denied_grace_period;
 			wake_up(&block->b_wait);
 		}
 	}
+	spin_unlock(&nlm_blocked_lock);
 
 	/* Release host handle after use */
 	nlm_release_host(host);
 	lockd_down();
-	unlock_kernel();
 	return 0;
 }
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 7932c399fab..47ea1e1925b 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -166,7 +166,6 @@ int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl)
 	/* Set up the argument struct */
 	nlmclnt_setlockargs(call, fl);
 
-	lock_kernel();
 	if (IS_SETLK(cmd) || IS_SETLKW(cmd)) {
 		if (fl->fl_type != F_UNLCK) {
 			call->a_args.block = IS_SETLKW(cmd) ? 1 : 0;
@@ -177,10 +176,8 @@ int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl)
 		status = nlmclnt_test(call, fl);
 	else
 		status = -EINVAL;
-
 	fl->fl_ops->fl_release_private(fl);
 	fl->fl_ops = NULL;
-	unlock_kernel();
 
 	dprintk("lockd: clnt proc returns %d\n", status);
 	return status;
@@ -226,9 +223,7 @@ void nlm_release_call(struct nlm_rqst *call)
 
 static void nlmclnt_rpc_release(void *data)
 {
-	lock_kernel();
 	nlm_release_call(data);
-	unlock_kernel();
 }
 
 static int nlm_wait_on_grace(wait_queue_head_t *queue)
@@ -448,14 +443,18 @@ out:
 
 static void nlmclnt_locks_copy_lock(struct file_lock *new, struct file_lock *fl)
 {
+	spin_lock(&fl->fl_u.nfs_fl.owner->host->h_lock);
 	new->fl_u.nfs_fl.state = fl->fl_u.nfs_fl.state;
 	new->fl_u.nfs_fl.owner = nlm_get_lockowner(fl->fl_u.nfs_fl.owner);
 	list_add_tail(&new->fl_u.nfs_fl.list, &fl->fl_u.nfs_fl.owner->host->h_granted);
+	spin_unlock(&fl->fl_u.nfs_fl.owner->host->h_lock);
 }
 
 static void nlmclnt_locks_release_private(struct file_lock *fl)
 {
+	spin_lock(&fl->fl_u.nfs_fl.owner->host->h_lock);
 	list_del(&fl->fl_u.nfs_fl.list);
+	spin_unlock(&fl->fl_u.nfs_fl.owner->host->h_lock);
 	nlm_put_lockowner(fl->fl_u.nfs_fl.owner);
 }
 
@@ -721,9 +720,7 @@ static void nlmclnt_unlock_callback(struct rpc_task *task, void *data)
 die:
 	return;
  retry_rebind:
-	lock_kernel();
 	nlm_rebind_host(req->a_host);
-	unlock_kernel();
  retry_unlock:
 	rpc_restart_call(task);
 }
@@ -801,9 +798,7 @@ retry_cancel:
 	/* Don't ever retry more than 3 times */
 	if (req->a_retries++ >= NLMCLNT_MAX_RETRIES)
 		goto die;
-	lock_kernel();
 	nlm_rebind_host(req->a_host);
-	unlock_kernel();
 	rpc_restart_call(task);
 	rpc_delay(task, 30 * HZ);
 }
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index bb464d12104..25e21e4023b 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -353,6 +353,7 @@ nlm_bind_host(struct nlm_host *host)
 			.to_retries	= 5U,
 		};
 		struct rpc_create_args args = {
+			.net		= &init_net,
 			.protocol	= host->h_proto,
 			.address	= nlm_addr(host),
 			.addrsize	= host->h_addrlen,
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index e3015464fba..e0c91894964 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -69,6 +69,7 @@ static struct rpc_clnt *nsm_create(void)
 		.sin_addr.s_addr	= htonl(INADDR_LOOPBACK),
 	};
 	struct rpc_create_args args = {
+		.net			= &init_net,
 		.protocol		= XPRT_TRANSPORT_UDP,
 		.address		= (struct sockaddr *)&sin,
 		.addrsize		= sizeof(sin),
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index f1bacf1a039..abfff9d7979 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -22,7 +22,6 @@
 #include <linux/in.h>
 #include <linux/uio.h>
 #include <linux/smp.h>
-#include <linux/smp_lock.h>
 #include <linux/mutex.h>
 #include <linux/kthread.h>
 #include <linux/freezer.h>
@@ -130,15 +129,6 @@ lockd(void *vrqstp)
 
 	dprintk("NFS locking service started (ver " LOCKD_VERSION ").\n");
 
-	/*
-	 * FIXME: it would be nice if lockd didn't spend its entire life
-	 * running under the BKL. At the very least, it would be good to
-	 * have someone clarify what it's intended to protect here. I've
-	 * seen some handwavy posts about posix locking needing to be
-	 * done under the BKL, but it's far from clear.
-	 */
-	lock_kernel();
-
 	if (!nlm_timeout)
 		nlm_timeout = LOCKD_DFLT_TIMEO;
 	nlmsvc_timeout = nlm_timeout * HZ;
@@ -195,7 +185,6 @@ lockd(void *vrqstp)
 	if (nlmsvc_ops)
 		nlmsvc_invalidate_all();
 	nlm_shutdown_hosts();
-	unlock_kernel();
 	return 0;
 }
 
@@ -206,7 +195,7 @@ static int create_lockd_listener(struct svc_serv *serv, const char *name,
 
 	xprt = svc_find_xprt(serv, name, family, 0);
 	if (xprt == NULL)
-		return svc_create_xprt(serv, name, family, port,
+		return svc_create_xprt(serv, name, &init_net, family, port,
 						SVC_SOCK_DEFAULTS);
 	svc_xprt_put(xprt);
 	return 0;
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index 031c6569a13..a336e832475 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -230,9 +230,7 @@ static void nlm4svc_callback_exit(struct rpc_task *task, void *data)
 
 static void nlm4svc_callback_release(void *data)
 {
-	lock_kernel();
 	nlm_release_call(data);
-	unlock_kernel();
 }
 
 static const struct rpc_call_ops nlm4svc_callback_ops = {
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 84055d31bfc..c462d346acb 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -52,12 +52,13 @@ static const struct rpc_call_ops nlmsvc_grant_ops;
  * The list of blocked locks to retry
  */
 static LIST_HEAD(nlm_blocked);
+static DEFINE_SPINLOCK(nlm_blocked_lock);
 
 /*
  * Insert a blocked lock into the global list
  */
 static void
-nlmsvc_insert_block(struct nlm_block *block, unsigned long when)
+nlmsvc_insert_block_locked(struct nlm_block *block, unsigned long when)
 {
 	struct nlm_block *b;
 	struct list_head *pos;
@@ -87,6 +88,13 @@ nlmsvc_insert_block(struct nlm_block *block, unsigned long when)
 	block->b_when = when;
 }
 
+static void nlmsvc_insert_block(struct nlm_block *block, unsigned long when)
+{
+	spin_lock(&nlm_blocked_lock);
+	nlmsvc_insert_block_locked(block, when);
+	spin_unlock(&nlm_blocked_lock);
+}
+
 /*
  * Remove a block from the global list
  */
@@ -94,7 +102,9 @@ static inline void
 nlmsvc_remove_block(struct nlm_block *block)
 {
 	if (!list_empty(&block->b_list)) {
+		spin_lock(&nlm_blocked_lock);
 		list_del_init(&block->b_list);
+		spin_unlock(&nlm_blocked_lock);
 		nlmsvc_release_block(block);
 	}
 }
@@ -651,7 +661,7 @@ static int nlmsvc_grant_deferred(struct file_lock *fl, struct file_lock *conf,
 	struct nlm_block *block;
 	int rc = -ENOENT;
 
-	lock_kernel();
+	spin_lock(&nlm_blocked_lock);
 	list_for_each_entry(block, &nlm_blocked, b_list) {
 		if (nlm_compare_locks(&block->b_call->a_args.lock.fl, fl)) {
 			dprintk("lockd: nlmsvc_notify_blocked block %p flags %d\n",
@@ -665,13 +675,13 @@ static int nlmsvc_grant_deferred(struct file_lock *fl, struct file_lock *conf,
 			} else if (result == 0)
 				block->b_granted = 1;
 
-			nlmsvc_insert_block(block, 0);
+			nlmsvc_insert_block_locked(block, 0);
 			svc_wake_up(block->b_daemon);
 			rc = 0;
 			break;
 		}
 	}
-	unlock_kernel();
+	spin_unlock(&nlm_blocked_lock);
 	if (rc == -ENOENT)
 		printk(KERN_WARNING "lockd: grant for unknown block\n");
 	return rc;
@@ -690,14 +700,16 @@ nlmsvc_notify_blocked(struct file_lock *fl)
 	struct nlm_block	*block;
 
 	dprintk("lockd: VFS unblock notification for block %p\n", fl);
+	spin_lock(&nlm_blocked_lock);
 	list_for_each_entry(block, &nlm_blocked, b_list) {
 		if (nlm_compare_locks(&block->b_call->a_args.lock.fl, fl)) {
-			nlmsvc_insert_block(block, 0);
+			nlmsvc_insert_block_locked(block, 0);
+			spin_unlock(&nlm_blocked_lock);
 			svc_wake_up(block->b_daemon);
 			return;
 		}
 	}
-
+	spin_unlock(&nlm_blocked_lock);
 	printk(KERN_WARNING "lockd: notification for unknown block!\n");
 }
 
@@ -803,7 +815,7 @@ static void nlmsvc_grant_callback(struct rpc_task *task, void *data)
 
 	dprintk("lockd: GRANT_MSG RPC callback\n");
 
-	lock_kernel();
+	spin_lock(&nlm_blocked_lock);
 	/* if the block is not on a list at this point then it has
 	 * been invalidated. Don't try to requeue it.
 	 *
@@ -825,19 +837,20 @@ static void nlmsvc_grant_callback(struct rpc_task *task, void *data)
 		/* Call was successful, now wait for client callback */
 		timeout = 60 * HZ;
 	}
-	nlmsvc_insert_block(block, timeout);
+	nlmsvc_insert_block_locked(block, timeout);
 	svc_wake_up(block->b_daemon);
 out:
-	unlock_kernel();
+	spin_unlock(&nlm_blocked_lock);
 }
 
+/*
+ * FIXME: nlmsvc_release_block() grabs a mutex.  This is not allowed for an
+ * .rpc_release rpc_call_op
+ */
 static void nlmsvc_grant_release(void *data)
 {
 	struct nlm_rqst		*call = data;
-
-	lock_kernel();
 	nlmsvc_release_block(call->a_block);
-	unlock_kernel();
 }
 
 static const struct rpc_call_ops nlmsvc_grant_ops = {
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 0f2ab741ae7..c3069f38d60 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -260,9 +260,7 @@ static void nlmsvc_callback_exit(struct rpc_task *task, void *data)
 
 static void nlmsvc_callback_release(void *data)
 {
-	lock_kernel();
 	nlm_release_call(data);
-	unlock_kernel();
 }
 
 static const struct rpc_call_ops nlmsvc_callback_ops = {
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index d0ef94cfb3d..1ca0679c80b 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -170,6 +170,7 @@ nlm_traverse_locks(struct nlm_host *host, struct nlm_file *file,
 
 again:
 	file->f_locks = 0;
+	lock_flocks(); /* protects i_flock list */
 	for (fl = inode->i_flock; fl; fl = fl->fl_next) {
 		if (fl->fl_lmops != &nlmsvc_lock_operations)
 			continue;
@@ -181,6 +182,7 @@ again:
 		if (match(lockhost, host)) {
 			struct file_lock lock = *fl;
 
+			unlock_flocks();
 			lock.fl_type  = F_UNLCK;
 			lock.fl_start = 0;
 			lock.fl_end   = OFFSET_MAX;
@@ -192,6 +194,7 @@ again:
 			goto again;
 		}
 	}
+	unlock_flocks();
 
 	return 0;
 }
@@ -226,10 +229,14 @@ nlm_file_inuse(struct nlm_file *file)
 	if (file->f_count || !list_empty(&file->f_blocks) || file->f_shares)
 		return 1;
 
+	lock_flocks();
 	for (fl = inode->i_flock; fl; fl = fl->fl_next) {
-		if (fl->fl_lmops == &nlmsvc_lock_operations)
+		if (fl->fl_lmops == &nlmsvc_lock_operations) {
+			unlock_flocks();
 			return 1;
+		}
 	}
+	unlock_flocks();
 	file->f_locks = 0;
 	return 0;
 }
diff --git a/fs/locks.c b/fs/locks.c
index ab24d49fc04..65765cb6afe 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -142,14 +142,32 @@ int lease_break_time = 45;
 
 static LIST_HEAD(file_lock_list);
 static LIST_HEAD(blocked_list);
+static DEFINE_SPINLOCK(file_lock_lock);
+
+/*
+ * Protects the two list heads above, plus the inode->i_flock list
+ * FIXME: should use a spinlock, once lockd and ceph are ready.
+ */
+void lock_flocks(void)
+{
+	spin_lock(&file_lock_lock);
+}
+EXPORT_SYMBOL_GPL(lock_flocks);
+
+void unlock_flocks(void)
+{
+	spin_unlock(&file_lock_lock);
+}
+EXPORT_SYMBOL_GPL(unlock_flocks);
 
 static struct kmem_cache *filelock_cache __read_mostly;
 
 /* Allocate an empty lock structure. */
-static struct file_lock *locks_alloc_lock(void)
+struct file_lock *locks_alloc_lock(void)
 {
 	return kmem_cache_alloc(filelock_cache, GFP_KERNEL);
 }
+EXPORT_SYMBOL_GPL(locks_alloc_lock);
 
 void locks_release_private(struct file_lock *fl)
 {
@@ -168,7 +186,7 @@ void locks_release_private(struct file_lock *fl)
 EXPORT_SYMBOL_GPL(locks_release_private);
 
 /* Free a lock which is not in use. */
-static void locks_free_lock(struct file_lock *fl)
+void locks_free_lock(struct file_lock *fl)
 {
 	BUG_ON(waitqueue_active(&fl->fl_wait));
 	BUG_ON(!list_empty(&fl->fl_block));
@@ -177,6 +195,7 @@ static void locks_free_lock(struct file_lock *fl)
 	locks_release_private(fl);
 	kmem_cache_free(filelock_cache, fl);
 }
+EXPORT_SYMBOL(locks_free_lock);
 
 void locks_init_lock(struct file_lock *fl)
 {
@@ -216,11 +235,8 @@ static void locks_copy_private(struct file_lock *new, struct file_lock *fl)
 			fl->fl_ops->fl_copy_lock(new, fl);
 		new->fl_ops = fl->fl_ops;
 	}
-	if (fl->fl_lmops) {
-		if (fl->fl_lmops->fl_copy_lock)
-			fl->fl_lmops->fl_copy_lock(new, fl);
+	if (fl->fl_lmops)
 		new->fl_lmops = fl->fl_lmops;
-	}
 }
 
 /*
@@ -511,9 +527,9 @@ static void __locks_delete_block(struct file_lock *waiter)
  */
 static void locks_delete_block(struct file_lock *waiter)
 {
-	lock_kernel();
+	lock_flocks();
 	__locks_delete_block(waiter);
-	unlock_kernel();
+	unlock_flocks();
 }
 
 /* Insert waiter into blocker's block list.
@@ -644,7 +660,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
 {
 	struct file_lock *cfl;
 
-	lock_kernel();
+	lock_flocks();
 	for (cfl = filp->f_path.dentry->d_inode->i_flock; cfl; cfl = cfl->fl_next) {
 		if (!IS_POSIX(cfl))
 			continue;
@@ -657,7 +673,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
 			fl->fl_pid = pid_vnr(cfl->fl_nspid);
 	} else
 		fl->fl_type = F_UNLCK;
-	unlock_kernel();
+	unlock_flocks();
 	return;
 }
 EXPORT_SYMBOL(posix_test_lock);
@@ -730,18 +746,16 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
 	int error = 0;
 	int found = 0;
 
-	lock_kernel();
-	if (request->fl_flags & FL_ACCESS)
-		goto find_conflict;
-
-	if (request->fl_type != F_UNLCK) {
-		error = -ENOMEM;
+	if (!(request->fl_flags & FL_ACCESS) && (request->fl_type != F_UNLCK)) {
 		new_fl = locks_alloc_lock();
-		if (new_fl == NULL)
-			goto out;
-		error = 0;
+		if (!new_fl)
+			return -ENOMEM;
 	}
 
+	lock_flocks();
+	if (request->fl_flags & FL_ACCESS)
+		goto find_conflict;
+
 	for_each_lock(inode, before) {
 		struct file_lock *fl = *before;
 		if (IS_POSIX(fl))
@@ -767,8 +781,11 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
 	 * If a higher-priority process was blocked on the old file lock,
 	 * give it the opportunity to lock the file.
 	 */
-	if (found)
+	if (found) {
+		unlock_flocks();
 		cond_resched();
+		lock_flocks();
+	}
 
 find_conflict:
 	for_each_lock(inode, before) {
@@ -794,7 +811,7 @@ find_conflict:
 	error = 0;
 
 out:
-	unlock_kernel();
+	unlock_flocks();
 	if (new_fl)
 		locks_free_lock(new_fl);
 	return error;
@@ -823,7 +840,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 		new_fl2 = locks_alloc_lock();
 	}
 
-	lock_kernel();
+	lock_flocks();
 	if (request->fl_type != F_UNLCK) {
 		for_each_lock(inode, before) {
 			fl = *before;
@@ -991,7 +1008,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 		locks_wake_up_blocks(left);
 	}
  out:
-	unlock_kernel();
+	unlock_flocks();
 	/*
 	 * Free any unused locks.
 	 */
@@ -1066,14 +1083,14 @@ int locks_mandatory_locked(struct inode *inode)
 	/*
 	 * Search the lock list for this inode for any POSIX locks.
 	 */
-	lock_kernel();
+	lock_flocks();
 	for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
 		if (!IS_POSIX(fl))
 			continue;
 		if (fl->fl_owner != owner)
 			break;
 	}
-	unlock_kernel();
+	unlock_flocks();
 	return fl ? -EAGAIN : 0;
 }
 
@@ -1186,7 +1203,7 @@ int __break_lease(struct inode *inode, unsigned int mode)
 
 	new_fl = lease_alloc(NULL, want_write ? F_WRLCK : F_RDLCK);
 
-	lock_kernel();
+	lock_flocks();
 
 	time_out_leases(inode);
 
@@ -1247,8 +1264,10 @@ restart:
 			break_time++;
 	}
 	locks_insert_block(flock, new_fl);
+	unlock_flocks();
 	error = wait_event_interruptible_timeout(new_fl->fl_wait,
 						!new_fl->fl_next, break_time);
+	lock_flocks();
 	__locks_delete_block(new_fl);
 	if (error >= 0) {
 		if (error == 0)
@@ -1263,7 +1282,7 @@ restart:
 	}
 
 out:
-	unlock_kernel();
+	unlock_flocks();
 	if (!IS_ERR(new_fl))
 		locks_free_lock(new_fl);
 	return error;
@@ -1319,7 +1338,7 @@ int fcntl_getlease(struct file *filp)
 	struct file_lock *fl;
 	int type = F_UNLCK;
 
-	lock_kernel();
+	lock_flocks();
 	time_out_leases(filp->f_path.dentry->d_inode);
 	for (fl = filp->f_path.dentry->d_inode->i_flock; fl && IS_LEASE(fl);
 			fl = fl->fl_next) {
@@ -1328,7 +1347,7 @@ int fcntl_getlease(struct file *filp)
 			break;
 		}
 	}
-	unlock_kernel();
+	unlock_flocks();
 	return type;
 }
 
@@ -1341,36 +1360,32 @@ int fcntl_getlease(struct file *filp)
  *	The (input) flp->fl_lmops->fl_break function is required
  *	by break_lease().
  *
- *	Called with kernel lock held.
+ *	Called with file_lock_lock held.
  */
 int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
 {
 	struct file_lock *fl, **before, **my_before = NULL, *lease;
-	struct file_lock *new_fl = NULL;
 	struct dentry *dentry = filp->f_path.dentry;
 	struct inode *inode = dentry->d_inode;
 	int error, rdlease_count = 0, wrlease_count = 0;
 
+	lease = *flp;
+
+	error = -EACCES;
 	if ((current_fsuid() != inode->i_uid) && !capable(CAP_LEASE))
-		return -EACCES;
+		goto out;
+	error = -EINVAL;
 	if (!S_ISREG(inode->i_mode))
-		return -EINVAL;
+		goto out;
 	error = security_file_lock(filp, arg);
 	if (error)
-		return error;
+		goto out;
 
 	time_out_leases(inode);
 
 	BUG_ON(!(*flp)->fl_lmops->fl_break);
 
-	lease = *flp;
-
 	if (arg != F_UNLCK) {
-		error = -ENOMEM;
-		new_fl = locks_alloc_lock();
-		if (new_fl == NULL)
-			goto out;
-
 		error = -EAGAIN;
 		if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0))
 			goto out;
@@ -1410,12 +1425,12 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
 		goto out;
 
 	if (my_before != NULL) {
-		*flp = *my_before;
 		error = lease->fl_lmops->fl_change(my_before, arg);
+		if (!error)
+			*flp = *my_before;
 		goto out;
 	}
 
-	error = 0;
 	if (arg == F_UNLCK)
 		goto out;
 
@@ -1423,20 +1438,23 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
 	if (!leases_enable)
 		goto out;
 
-	locks_copy_lock(new_fl, lease);
-	locks_insert_lock(before, new_fl);
-
-	*flp = new_fl;
+	locks_insert_lock(before, lease);
 	return 0;
 
 out:
-	if (new_fl != NULL)
-		locks_free_lock(new_fl);
 	return error;
 }
 EXPORT_SYMBOL(generic_setlease);
 
- /**
+static int __vfs_setlease(struct file *filp, long arg, struct file_lock **lease)
+{
+	if (filp->f_op && filp->f_op->setlease)
+		return filp->f_op->setlease(filp, arg, lease);
+	else
+		return generic_setlease(filp, arg, lease);
+}
+
+/**
  *	vfs_setlease        -       sets a lease on an open file
  *	@filp: file pointer
  *	@arg: type of lease to obtain
@@ -1467,60 +1485,90 @@ int vfs_setlease(struct file *filp, long arg, struct file_lock **lease)
 {
 	int error;
 
-	lock_kernel();
-	if (filp->f_op && filp->f_op->setlease)
-		error = filp->f_op->setlease(filp, arg, lease);
-	else
-		error = generic_setlease(filp, arg, lease);
-	unlock_kernel();
+	lock_flocks();
+	error = __vfs_setlease(filp, arg, lease);
+	unlock_flocks();
 
 	return error;
 }
 EXPORT_SYMBOL_GPL(vfs_setlease);
 
-/**
- *	fcntl_setlease	-	sets a lease on an open file
- *	@fd: open file descriptor
- *	@filp: file pointer
- *	@arg: type of lease to obtain
- *
- *	Call this fcntl to establish a lease on the file.
- *	Note that you also need to call %F_SETSIG to
- *	receive a signal when the lease is broken.
- */
-int fcntl_setlease(unsigned int fd, struct file *filp, long arg)
+static int do_fcntl_delete_lease(struct file *filp)
 {
 	struct file_lock fl, *flp = &fl;
+
+	lease_init(filp, F_UNLCK, flp);
+
+	return vfs_setlease(filp, F_UNLCK, &flp);
+}
+
+static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
+{
+	struct file_lock *fl;
+	struct fasync_struct *new;
 	struct inode *inode = filp->f_path.dentry->d_inode;
 	int error;
 
-	locks_init_lock(&fl);
-	error = lease_init(filp, arg, &fl);
-	if (error)
-		return error;
+	fl = lease_alloc(filp, arg);
+	if (IS_ERR(fl))
+		return PTR_ERR(fl);
 
-	lock_kernel();
+	new = fasync_alloc();
+	if (!new) {
+		locks_free_lock(fl);
+		return -ENOMEM;
+	}
+	lock_flocks();
+	error = __vfs_setlease(filp, arg, &fl);
+	if (error) {
+		unlock_flocks();
+		locks_free_lock(fl);
+		goto out_free_fasync;
+	}
 
-	error = vfs_setlease(filp, arg, &flp);
-	if (error || arg == F_UNLCK)
-		goto out_unlock;
+	/*
+	 * fasync_insert_entry() returns the old entry if any.
+	 * If there was no old entry, then it used 'new' and
+	 * inserted it into the fasync list. Clear new so that
+	 * we don't release it here.
+	 */
+	if (!fasync_insert_entry(fd, filp, &fl->fl_fasync, new))
+		new = NULL;
 
-	error = fasync_helper(fd, filp, 1, &flp->fl_fasync);
 	if (error < 0) {
 		/* remove lease just inserted by setlease */
-		flp->fl_type = F_UNLCK | F_INPROGRESS;
-		flp->fl_break_time = jiffies - 10;
+		fl->fl_type = F_UNLCK | F_INPROGRESS;
+		fl->fl_break_time = jiffies - 10;
 		time_out_leases(inode);
-		goto out_unlock;
+	} else {
+		error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
 	}
+	unlock_flocks();
 
-	error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
-out_unlock:
-	unlock_kernel();
+out_free_fasync:
+	if (new)
+		fasync_free(new);
 	return error;
 }
 
 /**
+ *	fcntl_setlease	-	sets a lease on an open file
+ *	@fd: open file descriptor
+ *	@filp: file pointer
+ *	@arg: type of lease to obtain
+ *
+ *	Call this fcntl to establish a lease on the file.
+ *	Note that you also need to call %F_SETSIG to
+ *	receive a signal when the lease is broken.
+ */
+int fcntl_setlease(unsigned int fd, struct file *filp, long arg)
+{
+	if (arg == F_UNLCK)
+		return do_fcntl_delete_lease(filp);
+	return do_fcntl_add_lease(fd, filp, arg);
+}
+
+/**
  * flock_lock_file_wait - Apply a FLOCK-style lock to a file
  * @filp: The file to apply the lock to
  * @fl: The lock to be applied
@@ -2020,7 +2068,7 @@ void locks_remove_flock(struct file *filp)
 			fl.fl_ops->fl_release_private(&fl);
 	}
 
-	lock_kernel();
+	lock_flocks();
 	before = &inode->i_flock;
 
 	while ((fl = *before) != NULL) {
@@ -2038,7 +2086,7 @@ void locks_remove_flock(struct file *filp)
  		}
 		before = &fl->fl_next;
 	}
-	unlock_kernel();
+	unlock_flocks();
 }
 
 /**
@@ -2053,12 +2101,12 @@ posix_unblock_lock(struct file *filp, struct file_lock *waiter)
 {
 	int status = 0;
 
-	lock_kernel();
+	lock_flocks();
 	if (waiter->fl_next)
 		__locks_delete_block(waiter);
 	else
 		status = -ENOENT;
-	unlock_kernel();
+	unlock_flocks();
 	return status;
 }
 
@@ -2085,7 +2133,7 @@ EXPORT_SYMBOL_GPL(vfs_cancel_lock);
 #include <linux/seq_file.h>
 
 static void lock_get_status(struct seq_file *f, struct file_lock *fl,
-							int id, char *pfx)
+			    loff_t id, char *pfx)
 {
 	struct inode *inode = NULL;
 	unsigned int fl_pid;
@@ -2098,7 +2146,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
 	if (fl->fl_file != NULL)
 		inode = fl->fl_file->f_path.dentry->d_inode;
 
-	seq_printf(f, "%d:%s ", id, pfx);
+	seq_printf(f, "%lld:%s ", id, pfx);
 	if (IS_POSIX(fl)) {
 		seq_printf(f, "%6s %s ",
 			     (fl->fl_flags & FL_ACCESS) ? "ACCESS" : "POSIX ",
@@ -2161,30 +2209,33 @@ static int locks_show(struct seq_file *f, void *v)
 
 	fl = list_entry(v, struct file_lock, fl_link);
 
-	lock_get_status(f, fl, (long)f->private, "");
+	lock_get_status(f, fl, *((loff_t *)f->private), "");
 
 	list_for_each_entry(bfl, &fl->fl_block, fl_block)
-		lock_get_status(f, bfl, (long)f->private, " ->");
+		lock_get_status(f, bfl, *((loff_t *)f->private), " ->");
 
-	f->private++;
 	return 0;
 }
 
 static void *locks_start(struct seq_file *f, loff_t *pos)
 {
-	lock_kernel();
-	f->private = (void *)1;
+	loff_t *p = f->private;
+
+	lock_flocks();
+	*p = (*pos + 1);
 	return seq_list_start(&file_lock_list, *pos);
 }
 
 static void *locks_next(struct seq_file *f, void *v, loff_t *pos)
 {
+	loff_t *p = f->private;
+	++*p;
 	return seq_list_next(v, &file_lock_list, pos);
 }
 
 static void locks_stop(struct seq_file *f, void *v)
 {
-	unlock_kernel();
+	unlock_flocks();
 }
 
 static const struct seq_operations locks_seq_operations = {
@@ -2196,14 +2247,14 @@ static const struct seq_operations locks_seq_operations = {
 
 static int locks_open(struct inode *inode, struct file *filp)
 {
-	return seq_open(filp, &locks_seq_operations);
+	return seq_open_private(filp, &locks_seq_operations, sizeof(loff_t));
 }
 
 static const struct file_operations proc_locks_operations = {
 	.open		= locks_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
-	.release	= seq_release,
+	.release	= seq_release_private,
 };
 
 static int __init proc_locks_init(void)
@@ -2231,7 +2282,7 @@ int lock_may_read(struct inode *inode, loff_t start, unsigned long len)
 {
 	struct file_lock *fl;
 	int result = 1;
-	lock_kernel();
+	lock_flocks();
 	for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
 		if (IS_POSIX(fl)) {
 			if (fl->fl_type == F_RDLCK)
@@ -2248,7 +2299,7 @@ int lock_may_read(struct inode *inode, loff_t start, unsigned long len)
 		result = 0;
 		break;
 	}
-	unlock_kernel();
+	unlock_flocks();
 	return result;
 }
 
@@ -2271,7 +2322,7 @@ int lock_may_write(struct inode *inode, loff_t start, unsigned long len)
 {
 	struct file_lock *fl;
 	int result = 1;
-	lock_kernel();
+	lock_flocks();
 	for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
 		if (IS_POSIX(fl)) {
 			if ((fl->fl_end < start) || (fl->fl_start > (start + len)))
@@ -2286,7 +2337,7 @@ int lock_may_write(struct inode *inode, loff_t start, unsigned long len)
 		result = 0;
 		break;
 	}
-	unlock_kernel();
+	unlock_flocks();
 	return result;
 }
 
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index 9bd2ce2a304..92ca6fbe09b 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -298,9 +298,9 @@ static int bdev_write_sb(struct super_block *sb, struct page *page)
 	return sync_request(page, bdev, WRITE);
 }
 
-static void bdev_put_device(struct super_block *sb)
+static void bdev_put_device(struct logfs_super *s)
 {
-	close_bdev_exclusive(logfs_super(sb)->s_bdev, FMODE_READ|FMODE_WRITE);
+	close_bdev_exclusive(s->s_bdev, FMODE_READ|FMODE_WRITE);
 }
 
 static int bdev_can_write_buf(struct super_block *sb, u64 ofs)
@@ -320,8 +320,8 @@ static const struct logfs_device_ops bd_devops = {
 	.put_device	= bdev_put_device,
 };
 
-int logfs_get_sb_bdev(struct file_system_type *type, int flags,
-		const char *devname, struct vfsmount *mnt)
+int logfs_get_sb_bdev(struct logfs_super *p, struct file_system_type *type,
+		const char *devname)
 {
 	struct block_device *bdev;
 
@@ -332,8 +332,11 @@ int logfs_get_sb_bdev(struct file_system_type *type, int flags,
 	if (MAJOR(bdev->bd_dev) == MTD_BLOCK_MAJOR) {
 		int mtdnr = MINOR(bdev->bd_dev);
 		close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE);
-		return logfs_get_sb_mtd(type, flags, mtdnr, mnt);
+		return logfs_get_sb_mtd(p, mtdnr);
 	}
 
-	return logfs_get_sb_device(type, flags, NULL, bdev, &bd_devops, mnt);
+	p->s_bdev = bdev;
+	p->s_mtd = NULL;
+	p->s_devops = &bd_devops;
+	return 0;
 }
diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c
index a85d47d13e4..7466e9dcc8c 100644
--- a/fs/logfs/dev_mtd.c
+++ b/fs/logfs/dev_mtd.c
@@ -230,9 +230,9 @@ static void mtd_writeseg(struct super_block *sb, u64 ofs, size_t len)
 	__mtd_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT);
 }
 
-static void mtd_put_device(struct super_block *sb)
+static void mtd_put_device(struct logfs_super *s)
 {
-	put_mtd_device(logfs_super(sb)->s_mtd);
+	put_mtd_device(s->s_mtd);
 }
 
 static int mtd_can_write_buf(struct super_block *sb, u64 ofs)
@@ -265,14 +265,14 @@ static const struct logfs_device_ops mtd_devops = {
 	.put_device	= mtd_put_device,
 };
 
-int logfs_get_sb_mtd(struct file_system_type *type, int flags,
-		int mtdnr, struct vfsmount *mnt)
+int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr)
 {
-	struct mtd_info *mtd;
-	const struct logfs_device_ops *devops = &mtd_devops;
-
-	mtd = get_mtd_device(NULL, mtdnr);
+	struct mtd_info *mtd = get_mtd_device(NULL, mtdnr);
 	if (IS_ERR(mtd))
 		return PTR_ERR(mtd);
-	return logfs_get_sb_device(type, flags, mtd, NULL, devops, mnt);
+
+	s->s_bdev = NULL;
+	s->s_mtd = mtd;
+	s->s_devops = &mtd_devops;
+	return 0;
 }
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index 72d1893ddd3..409dfd65e9a 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -434,8 +434,11 @@ static int __logfs_create(struct inode *dir, struct dentry *dentry,
 	int ret;
 
 	ta = kzalloc(sizeof(*ta), GFP_KERNEL);
-	if (!ta)
+	if (!ta) {
+		inode->i_nlink--;
+		iput(inode);
 		return -ENOMEM;
+	}
 
 	ta->state = CREATE_1;
 	ta->ino = inode->i_ino;
@@ -566,7 +569,7 @@ static int logfs_link(struct dentry *old_dentry, struct inode *dir,
 		return -EMLINK;
 
 	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
-	atomic_inc(&inode->i_count);
+	ihold(inode);
 	inode->i_nlink++;
 	mark_inode_dirty_sync(inode);
 
@@ -821,7 +824,8 @@ const struct inode_operations logfs_dir_iops = {
 };
 const struct file_operations logfs_dir_fops = {
 	.fsync		= logfs_fsync,
-	.ioctl		= logfs_ioctl,
+	.unlocked_ioctl	= logfs_ioctl,
 	.readdir	= logfs_readdir,
 	.read		= generic_read_dir,
+	.llseek		= default_llseek,
 };
diff --git a/fs/logfs/file.c b/fs/logfs/file.c
index abe1cafbd4c..e86376b87af 100644
--- a/fs/logfs/file.c
+++ b/fs/logfs/file.c
@@ -181,9 +181,9 @@ static int logfs_releasepage(struct page *page, gfp_t only_xfs_uses_this)
 }
 
 
-int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
-		unsigned long arg)
+long logfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
+	struct inode *inode = file->f_path.dentry->d_inode;
 	struct logfs_inode *li = logfs_inode(inode);
 	unsigned int oldflags, flags;
 	int err;
@@ -232,15 +232,19 @@ static int logfs_setattr(struct dentry *dentry, struct iattr *attr)
 	struct inode *inode = dentry->d_inode;
 	int err = 0;
 
-	if (attr->ia_valid & ATTR_SIZE)
+	err = inode_change_ok(inode, attr);
+	if (err)
+		return err;
+
+	if (attr->ia_valid & ATTR_SIZE) {
 		err = logfs_truncate(inode, attr->ia_size);
-	attr->ia_valid &= ~ATTR_SIZE;
+		if (err)
+			return err;
+	}
 
-	if (!err)
-		err = inode_change_ok(inode, attr);
-	if (!err)
-		err = inode_setattr(inode, attr);
-	return err;
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+	return 0;
 }
 
 const struct inode_operations logfs_reg_iops = {
@@ -251,7 +255,7 @@ const struct file_operations logfs_reg_fops = {
 	.aio_read	= generic_file_aio_read,
 	.aio_write	= generic_file_aio_write,
 	.fsync		= logfs_fsync,
-	.ioctl		= logfs_ioctl,
+	.unlocked_ioctl	= logfs_ioctl,
 	.llseek		= generic_file_llseek,
 	.mmap		= generic_file_readonly_mmap,
 	.open		= generic_file_open,
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
index f602e230e16..d8c71ece098 100644
--- a/fs/logfs/inode.c
+++ b/fs/logfs/inode.c
@@ -235,33 +235,21 @@ static struct inode *logfs_alloc_inode(struct super_block *sb)
  * purpose is to create a new inode that will not trigger the warning if such
  * an inode is still in use.  An ugly hack, no doubt.  Suggections for
  * improvement are welcome.
+ *
+ * AV: that's what ->put_super() is for...
  */
 struct inode *logfs_new_meta_inode(struct super_block *sb, u64 ino)
 {
 	struct inode *inode;
 
-	inode = logfs_alloc_inode(sb);
+	inode = new_inode(sb);
 	if (!inode)
 		return ERR_PTR(-ENOMEM);
 
 	inode->i_mode = S_IFREG;
 	inode->i_ino = ino;
-	inode->i_sb = sb;
-
-	/* This is a blatant copy of alloc_inode code.  We'd need alloc_inode
-	 * to be nonstatic, alas. */
-	{
-		struct address_space * const mapping = &inode->i_data;
-
-		mapping->a_ops = &logfs_reg_aops;
-		mapping->host = inode;
-		mapping->flags = 0;
-		mapping_set_gfp_mask(mapping, GFP_NOFS);
-		mapping->assoc_mapping = NULL;
-		mapping->backing_dev_info = &default_backing_dev_info;
-		inode->i_mapping = mapping;
-		inode->i_nlink = 1;
-	}
+	inode->i_data.a_ops = &logfs_reg_aops;
+	mapping_set_gfp_mask(&inode->i_data, GFP_NOFS);
 
 	return inode;
 }
@@ -277,7 +265,7 @@ struct inode *logfs_read_meta_inode(struct super_block *sb, u64 ino)
 
 	err = logfs_read_inode(inode);
 	if (err) {
-		destroy_meta_inode(inode);
+		iput(inode);
 		return ERR_PTR(err);
 	}
 	logfs_inode_setops(inode);
@@ -298,18 +286,8 @@ static int logfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 	return ret;
 }
 
-void destroy_meta_inode(struct inode *inode)
-{
-	if (inode) {
-		if (inode->i_data.nrpages)
-			truncate_inode_pages(&inode->i_data, 0);
-		logfs_clear_inode(inode);
-		kmem_cache_free(logfs_inode_cache, logfs_inode(inode));
-	}
-}
-
 /* called with inode_lock held */
-static void logfs_drop_inode(struct inode *inode)
+static int logfs_drop_inode(struct inode *inode)
 {
 	struct logfs_super *super = logfs_super(inode->i_sb);
 	struct logfs_inode *li = logfs_inode(inode);
@@ -317,7 +295,7 @@ static void logfs_drop_inode(struct inode *inode)
 	spin_lock(&logfs_inode_lock);
 	list_move(&li->li_freeing_list, &super->s_freeing_list);
 	spin_unlock(&logfs_inode_lock);
-	generic_drop_inode(inode);
+	return generic_drop_inode(inode);
 }
 
 static void logfs_set_ino_generation(struct super_block *sb,
@@ -384,12 +362,21 @@ static int logfs_sync_fs(struct super_block *sb, int wait)
 	return 0;
 }
 
+static void logfs_put_super(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	/* kill the meta-inodes */
+	iput(super->s_master_inode);
+	iput(super->s_segfile_inode);
+	iput(super->s_mapping_inode);
+}
+
 const struct super_operations logfs_super_operations = {
 	.alloc_inode	= logfs_alloc_inode,
-	.clear_inode	= logfs_clear_inode,
-	.delete_inode	= logfs_delete_inode,
 	.destroy_inode	= logfs_destroy_inode,
+	.evict_inode	= logfs_evict_inode,
 	.drop_inode	= logfs_drop_inode,
+	.put_super	= logfs_put_super,
 	.write_inode	= logfs_write_inode,
 	.statfs		= logfs_statfs,
 	.sync_fs	= logfs_sync_fs,
diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c
index 4b0e0616b35..f46ee8b0e13 100644
--- a/fs/logfs/journal.c
+++ b/fs/logfs/journal.c
@@ -889,8 +889,6 @@ void logfs_cleanup_journal(struct super_block *sb)
 	struct logfs_super *super = logfs_super(sb);
 
 	btree_grim_visitor32(&super->s_reserved_segments, 0, NULL);
-	destroy_meta_inode(super->s_master_inode);
-	super->s_master_inode = NULL;
 
 	kfree(super->s_compressed_je);
 	kfree(super->s_je);
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
index c838c4d7211..cd51a36b37f 100644
--- a/fs/logfs/logfs.h
+++ b/fs/logfs/logfs.h
@@ -136,6 +136,7 @@ struct logfs_area_ops {
 	int	(*erase_segment)(struct logfs_area *area);
 };
 
+struct logfs_super;	/* forward */
 /**
  * struct logfs_device_ops - device access operations
  *
@@ -156,7 +157,7 @@ struct logfs_device_ops {
 			int ensure_write);
 	int (*can_write_buf)(struct super_block *sb, u64 ofs);
 	void (*sync)(struct super_block *sb);
-	void (*put_device)(struct super_block *sb);
+	void (*put_device)(struct logfs_super *s);
 };
 
 /**
@@ -471,11 +472,13 @@ void logfs_compr_exit(void);
 
 /* dev_bdev.c */
 #ifdef CONFIG_BLOCK
-int logfs_get_sb_bdev(struct file_system_type *type, int flags,
-		const char *devname, struct vfsmount *mnt);
+int logfs_get_sb_bdev(struct logfs_super *s,
+		struct file_system_type *type,
+		const char *devname);
 #else
-static inline int logfs_get_sb_bdev(struct file_system_type *type, int flags,
-		const char *devname, struct vfsmount *mnt)
+static inline int logfs_get_sb_bdev(struct logfs_super *s,
+		struct file_system_type *type,
+		const char *devname)
 {
 	return -ENODEV;
 }
@@ -483,11 +486,9 @@ static inline int logfs_get_sb_bdev(struct file_system_type *type, int flags,
 
 /* dev_mtd.c */
 #ifdef CONFIG_MTD
-int logfs_get_sb_mtd(struct file_system_type *type, int flags,
-		int mtdnr, struct vfsmount *mnt);
+int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr)
 #else
-static inline int logfs_get_sb_mtd(struct file_system_type *type, int flags,
-		int mtdnr, struct vfsmount *mnt)
+static inline int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr)
 {
 	return -ENODEV;
 }
@@ -504,8 +505,7 @@ extern const struct inode_operations logfs_reg_iops;
 extern const struct file_operations logfs_reg_fops;
 extern const struct address_space_operations logfs_reg_aops;
 int logfs_readpage(struct file *file, struct page *page);
-int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
-		unsigned long arg);
+long logfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
 int logfs_fsync(struct file *file, int datasync);
 
 /* gc.c */
@@ -525,13 +525,11 @@ struct inode *logfs_new_meta_inode(struct super_block *sb, u64 ino);
 struct inode *logfs_read_meta_inode(struct super_block *sb, u64 ino);
 int logfs_init_inode_cache(void);
 void logfs_destroy_inode_cache(void);
-void destroy_meta_inode(struct inode *inode);
 void logfs_set_blocks(struct inode *inode, u64 no);
 /* these logically belong into inode.c but actually reside in readwrite.c */
 int logfs_read_inode(struct inode *inode);
 int __logfs_write_inode(struct inode *inode, long flags);
-void logfs_delete_inode(struct inode *inode);
-void logfs_clear_inode(struct inode *inode);
+void logfs_evict_inode(struct inode *inode);
 
 /* journal.c */
 void logfs_write_anchor(struct super_block *sb);
@@ -622,9 +620,6 @@ void emergency_read_end(struct page *page);
 void logfs_crash_dump(struct super_block *sb);
 void *memchr_inv(const void *s, int c, size_t n);
 int logfs_statfs(struct dentry *dentry, struct kstatfs *stats);
-int logfs_get_sb_device(struct file_system_type *type, int flags,
-		struct mtd_info *mtd, struct block_device *bdev,
-		const struct logfs_device_ops *devops, struct vfsmount *mnt);
 int logfs_check_ds(struct logfs_disk_super *ds);
 int logfs_write_sb(struct super_block *sb);
 
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index 0718d112a1a..6127baf0e18 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -1972,31 +1972,6 @@ static struct page *inode_to_page(struct inode *inode)
 	return page;
 }
 
-/* Cheaper version of write_inode.  All changes are concealed in
- * aliases, which are moved back.  No write to the medium happens.
- */
-void logfs_clear_inode(struct inode *inode)
-{
-	struct super_block *sb = inode->i_sb;
-	struct logfs_inode *li = logfs_inode(inode);
-	struct logfs_block *block = li->li_block;
-	struct page *page;
-
-	/* Only deleted files may be dirty at this point */
-	BUG_ON(inode->i_state & I_DIRTY && inode->i_nlink);
-	if (!block)
-		return;
-	if ((logfs_super(sb)->s_flags & LOGFS_SB_FLAG_SHUTDOWN)) {
-		block->ops->free_block(inode->i_sb, block);
-		return;
-	}
-
-	BUG_ON(inode->i_ino < LOGFS_RESERVED_INOS);
-	page = inode_to_page(inode);
-	BUG_ON(!page); /* FIXME: Use emergency page */
-	logfs_put_write_page(page);
-}
-
 static int do_write_inode(struct inode *inode)
 {
 	struct super_block *sb = inode->i_sb;
@@ -2164,18 +2139,40 @@ static int do_delete_inode(struct inode *inode)
  * ZOMBIE inodes have already been deleted before and should remain dead,
  * if it weren't for valid checking.  No need to kill them again here.
  */
-void logfs_delete_inode(struct inode *inode)
+void logfs_evict_inode(struct inode *inode)
 {
+	struct super_block *sb = inode->i_sb;
 	struct logfs_inode *li = logfs_inode(inode);
+	struct logfs_block *block = li->li_block;
+	struct page *page;
 
-	if (!(li->li_flags & LOGFS_IF_ZOMBIE)) {
-		li->li_flags |= LOGFS_IF_ZOMBIE;
-		if (i_size_read(inode) > 0)
-			logfs_truncate(inode, 0);
-		do_delete_inode(inode);
+	if (!inode->i_nlink) {
+		if (!(li->li_flags & LOGFS_IF_ZOMBIE)) {
+			li->li_flags |= LOGFS_IF_ZOMBIE;
+			if (i_size_read(inode) > 0)
+				logfs_truncate(inode, 0);
+			do_delete_inode(inode);
+		}
 	}
 	truncate_inode_pages(&inode->i_data, 0);
-	clear_inode(inode);
+	end_writeback(inode);
+
+	/* Cheaper version of write_inode.  All changes are concealed in
+	 * aliases, which are moved back.  No write to the medium happens.
+	 */
+	/* Only deleted files may be dirty at this point */
+	BUG_ON(inode->i_state & I_DIRTY && inode->i_nlink);
+	if (!block)
+		return;
+	if ((logfs_super(sb)->s_flags & LOGFS_SB_FLAG_SHUTDOWN)) {
+		block->ops->free_block(inode->i_sb, block);
+		return;
+	}
+
+	BUG_ON(inode->i_ino < LOGFS_RESERVED_INOS);
+	page = inode_to_page(inode);
+	BUG_ON(!page); /* FIXME: Use emergency page */
+	logfs_put_write_page(page);
 }
 
 void btree_write_block(struct logfs_block *block)
@@ -2272,7 +2269,6 @@ void logfs_cleanup_rw(struct super_block *sb)
 {
 	struct logfs_super *super = logfs_super(sb);
 
-	destroy_meta_inode(super->s_segfile_inode);
 	logfs_mempool_destroy(super->s_block_pool);
 	logfs_mempool_destroy(super->s_shadow_pool);
 }
diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c
index a9657afb70a..9d518735325 100644
--- a/fs/logfs/segment.c
+++ b/fs/logfs/segment.c
@@ -929,5 +929,4 @@ void logfs_cleanup_areas(struct super_block *sb)
 	for_each_area(i)
 		free_area(super->s_area[i]);
 	free_area(super->s_journal_area);
-	destroy_meta_inode(super->s_mapping_inode);
 }
diff --git a/fs/logfs/super.c b/fs/logfs/super.c
index d651e10a1e9..33435e4b14d 100644
--- a/fs/logfs/super.c
+++ b/fs/logfs/super.c
@@ -325,7 +325,7 @@ static int logfs_make_writeable(struct super_block *sb)
 	return 0;
 }
 
-static int logfs_get_sb_final(struct super_block *sb, struct vfsmount *mnt)
+static int logfs_get_sb_final(struct super_block *sb)
 {
 	struct logfs_super *super = logfs_super(sb);
 	struct inode *rootdir;
@@ -342,24 +342,26 @@ static int logfs_get_sb_final(struct super_block *sb, struct vfsmount *mnt)
 		goto fail;
 	}
 
+	/* at that point we know that ->put_super() will be called */
 	super->s_erase_page = alloc_pages(GFP_KERNEL, 0);
 	if (!super->s_erase_page)
-		goto fail;
+		return -ENOMEM;
 	memset(page_address(super->s_erase_page), 0xFF, PAGE_SIZE);
 
 	/* FIXME: check for read-only mounts */
 	err = logfs_make_writeable(sb);
-	if (err)
-		goto fail1;
+	if (err) {
+		__free_page(super->s_erase_page);
+		return err;
+	}
 
 	log_super("LogFS: Finished mounting\n");
-	simple_set_mnt(mnt, sb);
 	return 0;
 
-fail1:
-	__free_page(super->s_erase_page);
 fail:
-	iput(logfs_super(sb)->s_master_inode);
+	iput(super->s_master_inode);
+	iput(super->s_segfile_inode);
+	iput(super->s_mapping_inode);
 	return -EIO;
 }
 
@@ -526,43 +528,37 @@ static void logfs_kill_sb(struct super_block *sb)
 	logfs_cleanup_rw(sb);
 	if (super->s_erase_page)
 		__free_page(super->s_erase_page);
-	super->s_devops->put_device(sb);
+	super->s_devops->put_device(super);
 	logfs_mempool_destroy(super->s_btree_pool);
 	logfs_mempool_destroy(super->s_alias_pool);
 	kfree(super);
 	log_super("LogFS: Finished unmounting\n");
 }
 
-int logfs_get_sb_device(struct file_system_type *type, int flags,
-		struct mtd_info *mtd, struct block_device *bdev,
-		const struct logfs_device_ops *devops, struct vfsmount *mnt)
+static struct dentry *logfs_get_sb_device(struct logfs_super *super,
+		struct file_system_type *type, int flags)
 {
-	struct logfs_super *super;
 	struct super_block *sb;
 	int err = -ENOMEM;
 	static int mount_count;
 
 	log_super("LogFS: Start mount %x\n", mount_count++);
-	super = kzalloc(sizeof(*super), GFP_KERNEL);
-	if (!super)
-		goto err0;
 
-	super->s_mtd	= mtd;
-	super->s_bdev	= bdev;
 	err = -EINVAL;
 	sb = sget(type, logfs_sb_test, logfs_sb_set, super);
-	if (IS_ERR(sb))
-		goto err0;
+	if (IS_ERR(sb)) {
+		super->s_devops->put_device(super);
+		kfree(super);
+		return ERR_CAST(sb);
+	}
 
 	if (sb->s_root) {
 		/* Device is already in use */
-		err = 0;
-		simple_set_mnt(mnt, sb);
-		goto err0;
+		super->s_devops->put_device(super);
+		kfree(super);
+		return dget(sb->s_root);
 	}
 
-	super->s_devops = devops;
-
 	/*
 	 * sb->s_maxbytes is limited to 8TB.  On 32bit systems, the page cache
 	 * only covers 16TB and the upper 8TB are used for indirect blocks.
@@ -578,44 +574,58 @@ int logfs_get_sb_device(struct file_system_type *type, int flags,
 		goto err1;
 
 	sb->s_flags |= MS_ACTIVE;
-	err = logfs_get_sb_final(sb, mnt);
-	if (err)
-		goto err1;
-	return 0;
+	err = logfs_get_sb_final(sb);
+	if (err) {
+		deactivate_locked_super(sb);
+		return ERR_PTR(err);
+	}
+	return dget(sb->s_root);
 
 err1:
+	/* no ->s_root, no ->put_super() */
+	iput(super->s_master_inode);
+	iput(super->s_segfile_inode);
+	iput(super->s_mapping_inode);
 	deactivate_locked_super(sb);
-	return err;
-err0:
-	kfree(super);
-	//devops->put_device(sb);
-	return err;
+	return ERR_PTR(err);
 }
 
-static int logfs_get_sb(struct file_system_type *type, int flags,
-		const char *devname, void *data, struct vfsmount *mnt)
+static struct dentry *logfs_mount(struct file_system_type *type, int flags,
+		const char *devname, void *data)
 {
 	ulong mtdnr;
+	struct logfs_super *super;
+	int err;
 
-	if (!devname)
-		return logfs_get_sb_bdev(type, flags, devname, mnt);
-	if (strncmp(devname, "mtd", 3))
-		return logfs_get_sb_bdev(type, flags, devname, mnt);
+	super = kzalloc(sizeof(*super), GFP_KERNEL);
+	if (!super)
+		return ERR_PTR(-ENOMEM);
 
-	{
+	if (!devname)
+		err = logfs_get_sb_bdev(super, type, devname);
+	else if (strncmp(devname, "mtd", 3))
+		err = logfs_get_sb_bdev(super, type, devname);
+	else {
 		char *garbage;
 		mtdnr = simple_strtoul(devname+3, &garbage, 0);
 		if (*garbage)
-			return -EINVAL;
+			err = -EINVAL;
+		else
+			err = logfs_get_sb_mtd(super, mtdnr);
+	}
+
+	if (err) {
+		kfree(super);
+		return ERR_PTR(err);
 	}
 
-	return logfs_get_sb_mtd(type, flags, mtdnr, mnt);
+	return logfs_get_sb_device(super, type, flags);
 }
 
 static struct file_system_type logfs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "logfs",
-	.get_sb		= logfs_get_sb,
+	.mount		= logfs_mount,
 	.kill_sb	= logfs_kill_sb,
 	.fs_flags	= FS_REQUIRES_DEV,
 
diff --git a/fs/mbcache.c b/fs/mbcache.c
index e28f21b9534..93444747237 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -79,15 +79,12 @@ EXPORT_SYMBOL(mb_cache_entry_find_next);
 struct mb_cache {
 	struct list_head		c_cache_list;
 	const char			*c_name;
-	struct mb_cache_op		c_op;
 	atomic_t			c_entry_count;
+	int				c_max_entries;
 	int				c_bucket_bits;
-#ifndef MB_CACHE_INDEXES_COUNT
-	int				c_indexes_count;
-#endif
-	struct kmem_cache			*c_entry_cache;
+	struct kmem_cache		*c_entry_cache;
 	struct list_head		*c_block_hash;
-	struct list_head		*c_indexes_hash[0];
+	struct list_head		*c_index_hash;
 };
 
 
@@ -101,16 +98,6 @@ static LIST_HEAD(mb_cache_list);
 static LIST_HEAD(mb_cache_lru_list);
 static DEFINE_SPINLOCK(mb_cache_spinlock);
 
-static inline int
-mb_cache_indexes(struct mb_cache *cache)
-{
-#ifdef MB_CACHE_INDEXES_COUNT
-	return MB_CACHE_INDEXES_COUNT;
-#else
-	return cache->c_indexes_count;
-#endif
-}
-
 /*
  * What the mbcache registers as to get shrunk dynamically.
  */
@@ -132,12 +119,9 @@ __mb_cache_entry_is_hashed(struct mb_cache_entry *ce)
 static void
 __mb_cache_entry_unhash(struct mb_cache_entry *ce)
 {
-	int n;
-
 	if (__mb_cache_entry_is_hashed(ce)) {
 		list_del_init(&ce->e_block_list);
-		for (n=0; n<mb_cache_indexes(ce->e_cache); n++)
-			list_del(&ce->e_indexes[n].o_list);
+		list_del(&ce->e_index.o_list);
 	}
 }
 
@@ -148,16 +132,8 @@ __mb_cache_entry_forget(struct mb_cache_entry *ce, gfp_t gfp_mask)
 	struct mb_cache *cache = ce->e_cache;
 
 	mb_assert(!(ce->e_used || ce->e_queued));
-	if (cache->c_op.free && cache->c_op.free(ce, gfp_mask)) {
-		/* free failed -- put back on the lru list
-		   for freeing later. */
-		spin_lock(&mb_cache_spinlock);
-		list_add(&ce->e_lru_list, &mb_cache_lru_list);
-		spin_unlock(&mb_cache_spinlock);
-	} else {
-		kmem_cache_free(cache->c_entry_cache, ce);
-		atomic_dec(&cache->c_entry_count);
-	}
+	kmem_cache_free(cache->c_entry_cache, ce);
+	atomic_dec(&cache->c_entry_count);
 }
 
 
@@ -201,22 +177,12 @@ static int
 mb_cache_shrink_fn(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
 {
 	LIST_HEAD(free_list);
-	struct list_head *l, *ltmp;
+	struct mb_cache *cache;
+	struct mb_cache_entry *entry, *tmp;
 	int count = 0;
 
-	spin_lock(&mb_cache_spinlock);
-	list_for_each(l, &mb_cache_list) {
-		struct mb_cache *cache =
-			list_entry(l, struct mb_cache, c_cache_list);
-		mb_debug("cache %s (%d)", cache->c_name,
-			  atomic_read(&cache->c_entry_count));
-		count += atomic_read(&cache->c_entry_count);
-	}
 	mb_debug("trying to free %d entries", nr_to_scan);
-	if (nr_to_scan == 0) {
-		spin_unlock(&mb_cache_spinlock);
-		goto out;
-	}
+	spin_lock(&mb_cache_spinlock);
 	while (nr_to_scan-- && !list_empty(&mb_cache_lru_list)) {
 		struct mb_cache_entry *ce =
 			list_entry(mb_cache_lru_list.next,
@@ -224,12 +190,15 @@ mb_cache_shrink_fn(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
 		list_move_tail(&ce->e_lru_list, &free_list);
 		__mb_cache_entry_unhash(ce);
 	}
+	list_for_each_entry(cache, &mb_cache_list, c_cache_list) {
+		mb_debug("cache %s (%d)", cache->c_name,
+			  atomic_read(&cache->c_entry_count));
+		count += atomic_read(&cache->c_entry_count);
+	}
 	spin_unlock(&mb_cache_spinlock);
-	list_for_each_safe(l, ltmp, &free_list) {
-		__mb_cache_entry_forget(list_entry(l, struct mb_cache_entry,
-						   e_lru_list), gfp_mask);
+	list_for_each_entry_safe(entry, tmp, &free_list, e_lru_list) {
+		__mb_cache_entry_forget(entry, gfp_mask);
 	}
-out:
 	return (count / 100) * sysctl_vfs_cache_pressure;
 }
 
@@ -243,72 +212,55 @@ out:
  * memory was available.
  *
  * @name: name of the cache (informal)
- * @cache_op: contains the callback called when freeing a cache entry
- * @entry_size: The size of a cache entry, including
- *              struct mb_cache_entry
- * @indexes_count: number of additional indexes in the cache. Must equal
- *                 MB_CACHE_INDEXES_COUNT if the number of indexes is
- *                 hardwired.
  * @bucket_bits: log2(number of hash buckets)
  */
 struct mb_cache *
-mb_cache_create(const char *name, struct mb_cache_op *cache_op,
-		size_t entry_size, int indexes_count, int bucket_bits)
+mb_cache_create(const char *name, int bucket_bits)
 {
-	int m=0, n, bucket_count = 1 << bucket_bits;
+	int n, bucket_count = 1 << bucket_bits;
 	struct mb_cache *cache = NULL;
 
-	if(entry_size < sizeof(struct mb_cache_entry) +
-	   indexes_count * sizeof(((struct mb_cache_entry *) 0)->e_indexes[0]))
-		return NULL;
-
-	cache = kmalloc(sizeof(struct mb_cache) +
-	                indexes_count * sizeof(struct list_head), GFP_KERNEL);
+	cache = kmalloc(sizeof(struct mb_cache), GFP_KERNEL);
 	if (!cache)
-		goto fail;
+		return NULL;
 	cache->c_name = name;
-	cache->c_op.free = NULL;
-	if (cache_op)
-		cache->c_op.free = cache_op->free;
 	atomic_set(&cache->c_entry_count, 0);
 	cache->c_bucket_bits = bucket_bits;
-#ifdef MB_CACHE_INDEXES_COUNT
-	mb_assert(indexes_count == MB_CACHE_INDEXES_COUNT);
-#else
-	cache->c_indexes_count = indexes_count;
-#endif
 	cache->c_block_hash = kmalloc(bucket_count * sizeof(struct list_head),
 	                              GFP_KERNEL);
 	if (!cache->c_block_hash)
 		goto fail;
 	for (n=0; n<bucket_count; n++)
 		INIT_LIST_HEAD(&cache->c_block_hash[n]);
-	for (m=0; m<indexes_count; m++) {
-		cache->c_indexes_hash[m] = kmalloc(bucket_count *
-		                                 sizeof(struct list_head),
-		                                 GFP_KERNEL);
-		if (!cache->c_indexes_hash[m])
-			goto fail;
-		for (n=0; n<bucket_count; n++)
-			INIT_LIST_HEAD(&cache->c_indexes_hash[m][n]);
-	}
-	cache->c_entry_cache = kmem_cache_create(name, entry_size, 0,
+	cache->c_index_hash = kmalloc(bucket_count * sizeof(struct list_head),
+				      GFP_KERNEL);
+	if (!cache->c_index_hash)
+		goto fail;
+	for (n=0; n<bucket_count; n++)
+		INIT_LIST_HEAD(&cache->c_index_hash[n]);
+	cache->c_entry_cache = kmem_cache_create(name,
+		sizeof(struct mb_cache_entry), 0,
 		SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL);
 	if (!cache->c_entry_cache)
-		goto fail;
+		goto fail2;
+
+	/*
+	 * Set an upper limit on the number of cache entries so that the hash
+	 * chains won't grow too long.
+	 */
+	cache->c_max_entries = bucket_count << 4;
 
 	spin_lock(&mb_cache_spinlock);
 	list_add(&cache->c_cache_list, &mb_cache_list);
 	spin_unlock(&mb_cache_spinlock);
 	return cache;
 
+fail2:
+	kfree(cache->c_index_hash);
+
 fail:
-	if (cache) {
-		while (--m >= 0)
-			kfree(cache->c_indexes_hash[m]);
-		kfree(cache->c_block_hash);
-		kfree(cache);
-	}
+	kfree(cache->c_block_hash);
+	kfree(cache);
 	return NULL;
 }
 
@@ -357,7 +309,6 @@ mb_cache_destroy(struct mb_cache *cache)
 {
 	LIST_HEAD(free_list);
 	struct list_head *l, *ltmp;
-	int n;
 
 	spin_lock(&mb_cache_spinlock);
 	list_for_each_safe(l, ltmp, &mb_cache_lru_list) {
@@ -384,13 +335,11 @@ mb_cache_destroy(struct mb_cache *cache)
 
 	kmem_cache_destroy(cache->c_entry_cache);
 
-	for (n=0; n < mb_cache_indexes(cache); n++)
-		kfree(cache->c_indexes_hash[n]);
+	kfree(cache->c_index_hash);
 	kfree(cache->c_block_hash);
 	kfree(cache);
 }
 
-
 /*
  * mb_cache_entry_alloc()
  *
@@ -402,17 +351,29 @@ mb_cache_destroy(struct mb_cache *cache)
 struct mb_cache_entry *
 mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags)
 {
-	struct mb_cache_entry *ce;
+	struct mb_cache_entry *ce = NULL;
 
-	ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags);
-	if (ce) {
+	if (atomic_read(&cache->c_entry_count) >= cache->c_max_entries) {
+		spin_lock(&mb_cache_spinlock);
+		if (!list_empty(&mb_cache_lru_list)) {
+			ce = list_entry(mb_cache_lru_list.next,
+					struct mb_cache_entry, e_lru_list);
+			list_del_init(&ce->e_lru_list);
+			__mb_cache_entry_unhash(ce);
+		}
+		spin_unlock(&mb_cache_spinlock);
+	}
+	if (!ce) {
+		ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags);
+		if (!ce)
+			return NULL;
 		atomic_inc(&cache->c_entry_count);
 		INIT_LIST_HEAD(&ce->e_lru_list);
 		INIT_LIST_HEAD(&ce->e_block_list);
 		ce->e_cache = cache;
-		ce->e_used = 1 + MB_CACHE_WRITER;
 		ce->e_queued = 0;
 	}
+	ce->e_used = 1 + MB_CACHE_WRITER;
 	return ce;
 }
 
@@ -429,17 +390,16 @@ mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags)
  *
  * @bdev: device the cache entry belongs to
  * @block: block number
- * @keys: array of additional keys. There must be indexes_count entries
- *        in the array (as specified when creating the cache).
+ * @key: lookup key
  */
 int
 mb_cache_entry_insert(struct mb_cache_entry *ce, struct block_device *bdev,
-		      sector_t block, unsigned int keys[])
+		      sector_t block, unsigned int key)
 {
 	struct mb_cache *cache = ce->e_cache;
 	unsigned int bucket;
 	struct list_head *l;
-	int error = -EBUSY, n;
+	int error = -EBUSY;
 
 	bucket = hash_long((unsigned long)bdev + (block & 0xffffffff), 
 			   cache->c_bucket_bits);
@@ -454,12 +414,9 @@ mb_cache_entry_insert(struct mb_cache_entry *ce, struct block_device *bdev,
 	ce->e_bdev = bdev;
 	ce->e_block = block;
 	list_add(&ce->e_block_list, &cache->c_block_hash[bucket]);
-	for (n=0; n<mb_cache_indexes(cache); n++) {
-		ce->e_indexes[n].o_key = keys[n];
-		bucket = hash_long(keys[n], cache->c_bucket_bits);
-		list_add(&ce->e_indexes[n].o_list,
-			 &cache->c_indexes_hash[n][bucket]);
-	}
+	ce->e_index.o_key = key;
+	bucket = hash_long(key, cache->c_bucket_bits);
+	list_add(&ce->e_index.o_list, &cache->c_index_hash[bucket]);
 	error = 0;
 out:
 	spin_unlock(&mb_cache_spinlock);
@@ -555,13 +512,12 @@ cleanup:
 
 static struct mb_cache_entry *
 __mb_cache_entry_find(struct list_head *l, struct list_head *head,
-		      int index, struct block_device *bdev, unsigned int key)
+		      struct block_device *bdev, unsigned int key)
 {
 	while (l != head) {
 		struct mb_cache_entry *ce =
-			list_entry(l, struct mb_cache_entry,
-			           e_indexes[index].o_list);
-		if (ce->e_bdev == bdev && ce->e_indexes[index].o_key == key) {
+			list_entry(l, struct mb_cache_entry, e_index.o_list);
+		if (ce->e_bdev == bdev && ce->e_index.o_key == key) {
 			DEFINE_WAIT(wait);
 
 			if (!list_empty(&ce->e_lru_list))
@@ -603,23 +559,20 @@ __mb_cache_entry_find(struct list_head *l, struct list_head *head,
  * returned cache entry is locked for shared access ("multiple readers").
  *
  * @cache: the cache to search
- * @index: the number of the additonal index to search (0<=index<indexes_count)
  * @bdev: the device the cache entry should belong to
  * @key: the key in the index
  */
 struct mb_cache_entry *
-mb_cache_entry_find_first(struct mb_cache *cache, int index,
-			  struct block_device *bdev, unsigned int key)
+mb_cache_entry_find_first(struct mb_cache *cache, struct block_device *bdev,
+			  unsigned int key)
 {
 	unsigned int bucket = hash_long(key, cache->c_bucket_bits);
 	struct list_head *l;
 	struct mb_cache_entry *ce;
 
-	mb_assert(index < mb_cache_indexes(cache));
 	spin_lock(&mb_cache_spinlock);
-	l = cache->c_indexes_hash[index][bucket].next;
-	ce = __mb_cache_entry_find(l, &cache->c_indexes_hash[index][bucket],
-	                           index, bdev, key);
+	l = cache->c_index_hash[bucket].next;
+	ce = __mb_cache_entry_find(l, &cache->c_index_hash[bucket], bdev, key);
 	spin_unlock(&mb_cache_spinlock);
 	return ce;
 }
@@ -640,12 +593,11 @@ mb_cache_entry_find_first(struct mb_cache *cache, int index,
  * }
  *
  * @prev: The previous match
- * @index: the number of the additonal index to search (0<=index<indexes_count)
  * @bdev: the device the cache entry should belong to
  * @key: the key in the index
  */
 struct mb_cache_entry *
-mb_cache_entry_find_next(struct mb_cache_entry *prev, int index,
+mb_cache_entry_find_next(struct mb_cache_entry *prev,
 			 struct block_device *bdev, unsigned int key)
 {
 	struct mb_cache *cache = prev->e_cache;
@@ -653,11 +605,9 @@ mb_cache_entry_find_next(struct mb_cache_entry *prev, int index,
 	struct list_head *l;
 	struct mb_cache_entry *ce;
 
-	mb_assert(index < mb_cache_indexes(cache));
 	spin_lock(&mb_cache_spinlock);
-	l = prev->e_indexes[index].o_list.next;
-	ce = __mb_cache_entry_find(l, &cache->c_indexes_hash[index][bucket],
-	                           index, bdev, key);
+	l = prev->e_index.o_list.next;
+	ce = __mb_cache_entry_find(l, &cache->c_index_hash[bucket], bdev, key);
 	__mb_cache_entry_release_unlock(prev);
 	return ce;
 }
diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c
index 482779fe4e7..3f32bcb0d9b 100644
--- a/fs/minix/bitmap.c
+++ b/fs/minix/bitmap.c
@@ -200,13 +200,13 @@ void minix_free_inode(struct inode * inode)
 	ino = inode->i_ino;
 	if (ino < 1 || ino > sbi->s_ninodes) {
 		printk("minix_free_inode: inode 0 or nonexistent inode\n");
-		goto out;
+		return;
 	}
 	bit = ino & ((1<<k) - 1);
 	ino >>= k;
 	if (ino >= sbi->s_imap_blocks) {
 		printk("minix_free_inode: nonexistent imap in superblock\n");
-		goto out;
+		return;
 	}
 
 	minix_clear_inode(inode);	/* clear on-disk copy */
@@ -217,8 +217,6 @@ void minix_free_inode(struct inode * inode)
 		printk("minix_free_inode: bit %lu already cleared\n", bit);
 	spin_unlock(&bitmap_lock);
 	mark_buffer_dirty(bh);
- out:
-	clear_inode(inode);		/* clear in-memory copy */
 }
 
 struct inode *minix_new_inode(const struct inode *dir, int mode, int *error)
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index 1dbf921ca44..085a9262c69 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -271,8 +271,7 @@ int minix_add_link(struct dentry *dentry, struct inode *inode)
 
 got_it:
 	pos = page_offset(page) + p - (char *)page_address(page);
-	err = __minix_write_begin(NULL, page->mapping, pos, sbi->s_dirsize,
-					AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
+	err = minix_prepare_chunk(page, pos, sbi->s_dirsize);
 	if (err)
 		goto out_unlock;
 	memcpy (namx, name, namelen);
@@ -297,8 +296,7 @@ out_unlock:
 
 int minix_delete_entry(struct minix_dir_entry *de, struct page *page)
 {
-	struct address_space *mapping = page->mapping;
-	struct inode *inode = (struct inode*)mapping->host;
+	struct inode *inode = page->mapping->host;
 	char *kaddr = page_address(page);
 	loff_t pos = page_offset(page) + (char*)de - kaddr;
 	struct minix_sb_info *sbi = minix_sb(inode->i_sb);
@@ -306,8 +304,7 @@ int minix_delete_entry(struct minix_dir_entry *de, struct page *page)
 	int err;
 
 	lock_page(page);
-	err = __minix_write_begin(NULL, mapping, pos, len,
-					AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
+	err = minix_prepare_chunk(page, pos, len);
 	if (err == 0) {
 		if (sbi->s_version == MINIX_V3)
 			((minix3_dirent *) de)->inode = 0;
@@ -325,16 +322,14 @@ int minix_delete_entry(struct minix_dir_entry *de, struct page *page)
 
 int minix_make_empty(struct inode *inode, struct inode *dir)
 {
-	struct address_space *mapping = inode->i_mapping;
-	struct page *page = grab_cache_page(mapping, 0);
+	struct page *page = grab_cache_page(inode->i_mapping, 0);
 	struct minix_sb_info *sbi = minix_sb(inode->i_sb);
 	char *kaddr;
 	int err;
 
 	if (!page)
 		return -ENOMEM;
-	err = __minix_write_begin(NULL, mapping, 0, 2 * sbi->s_dirsize,
-					AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
+	err = minix_prepare_chunk(page, 0, 2 * sbi->s_dirsize);
 	if (err) {
 		unlock_page(page);
 		goto fail;
@@ -425,8 +420,7 @@ not_empty:
 void minix_set_link(struct minix_dir_entry *de, struct page *page,
 	struct inode *inode)
 {
-	struct address_space *mapping = page->mapping;
-	struct inode *dir = mapping->host;
+	struct inode *dir = page->mapping->host;
 	struct minix_sb_info *sbi = minix_sb(dir->i_sb);
 	loff_t pos = page_offset(page) +
 			(char *)de-(char*)page_address(page);
@@ -434,8 +428,7 @@ void minix_set_link(struct minix_dir_entry *de, struct page *page,
 
 	lock_page(page);
 
-	err = __minix_write_begin(NULL, mapping, pos, sbi->s_dirsize,
-					AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
+	err = minix_prepare_chunk(page, pos, sbi->s_dirsize);
 	if (err == 0) {
 		if (sbi->s_version == MINIX_V3)
 			((minix3_dirent *) de)->inode = inode->i_ino;
diff --git a/fs/minix/file.c b/fs/minix/file.c
index d5320ff23fa..4493ce695ab 100644
--- a/fs/minix/file.c
+++ b/fs/minix/file.c
@@ -23,7 +23,29 @@ const struct file_operations minix_file_operations = {
 	.splice_read	= generic_file_splice_read,
 };
 
+static int minix_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = dentry->d_inode;
+	int error;
+
+	error = inode_change_ok(inode, attr);
+	if (error)
+		return error;
+
+	if ((attr->ia_valid & ATTR_SIZE) &&
+	    attr->ia_size != i_size_read(inode)) {
+		error = vmtruncate(inode, attr->ia_size);
+		if (error)
+			return error;
+	}
+
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+	return 0;
+}
+
 const struct inode_operations minix_file_inode_operations = {
 	.truncate	= minix_truncate,
+	.setattr	= minix_setattr,
 	.getattr	= minix_getattr,
 };
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 756f8c93780..fb2020858a3 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -24,12 +24,17 @@ static int minix_write_inode(struct inode *inode,
 static int minix_statfs(struct dentry *dentry, struct kstatfs *buf);
 static int minix_remount (struct super_block * sb, int * flags, char * data);
 
-static void minix_delete_inode(struct inode *inode)
+static void minix_evict_inode(struct inode *inode)
 {
 	truncate_inode_pages(&inode->i_data, 0);
-	inode->i_size = 0;
-	minix_truncate(inode);
-	minix_free_inode(inode);
+	if (!inode->i_nlink) {
+		inode->i_size = 0;
+		minix_truncate(inode);
+	}
+	invalidate_inode_buffers(inode);
+	end_writeback(inode);
+	if (!inode->i_nlink)
+		minix_free_inode(inode);
 }
 
 static void minix_put_super(struct super_block *sb)
@@ -96,7 +101,7 @@ static const struct super_operations minix_sops = {
 	.alloc_inode	= minix_alloc_inode,
 	.destroy_inode	= minix_destroy_inode,
 	.write_inode	= minix_write_inode,
-	.delete_inode	= minix_delete_inode,
+	.evict_inode	= minix_evict_inode,
 	.put_super	= minix_put_super,
 	.statfs		= minix_statfs,
 	.remount_fs	= minix_remount,
@@ -357,20 +362,26 @@ static int minix_readpage(struct file *file, struct page *page)
 	return block_read_full_page(page,minix_get_block);
 }
 
-int __minix_write_begin(struct file *file, struct address_space *mapping,
-			loff_t pos, unsigned len, unsigned flags,
-			struct page **pagep, void **fsdata)
+int minix_prepare_chunk(struct page *page, loff_t pos, unsigned len)
 {
-	return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
-				minix_get_block);
+	return __block_write_begin(page, pos, len, minix_get_block);
 }
 
 static int minix_write_begin(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
 			struct page **pagep, void **fsdata)
 {
-	*pagep = NULL;
-	return __minix_write_begin(file, mapping, pos, len, flags, pagep, fsdata);
+	int ret;
+
+	ret = block_write_begin(mapping, pos, len, flags, pagep,
+				minix_get_block);
+	if (unlikely(ret)) {
+		loff_t isize = mapping->host->i_size;
+		if (pos + len > isize)
+			vmtruncate(mapping->host, isize);
+	}
+
+	return ret;
 }
 
 static sector_t minix_bmap(struct address_space *mapping, sector_t block)
@@ -603,17 +614,16 @@ void minix_truncate(struct inode * inode)
 		V2_minix_truncate(inode);
 }
 
-static int minix_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *minix_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, minix_fill_super,
-			   mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, minix_fill_super);
 }
 
 static struct file_system_type minix_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "minix",
-	.get_sb		= minix_get_sb,
+	.mount		= minix_mount,
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
diff --git a/fs/minix/minix.h b/fs/minix/minix.h
index 111f34ee9e3..407b1c84911 100644
--- a/fs/minix/minix.h
+++ b/fs/minix/minix.h
@@ -53,9 +53,7 @@ extern int minix_new_block(struct inode * inode);
 extern void minix_free_block(struct inode *inode, unsigned long block);
 extern unsigned long minix_count_free_blocks(struct minix_sb_info *sbi);
 extern int minix_getattr(struct vfsmount *, struct dentry *, struct kstat *);
-extern int __minix_write_begin(struct file *file, struct address_space *mapping,
-			loff_t pos, unsigned len, unsigned flags,
-			struct page **pagep, void **fsdata);
+extern int minix_prepare_chunk(struct page *page, loff_t pos, unsigned len);
 
 extern void V1_minix_truncate(struct inode *);
 extern void V2_minix_truncate(struct inode *);
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index e20ee85955d..c0d35a3acce 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -101,7 +101,7 @@ static int minix_link(struct dentry * old_dentry, struct inode * dir,
 
 	inode->i_ctime = CURRENT_TIME_SEC;
 	inode_inc_link_count(inode);
-	atomic_inc(&inode->i_count);
+	ihold(inode);
 	return add_nondir(dentry, inode);
 }
 
@@ -115,7 +115,7 @@ static int minix_mkdir(struct inode * dir, struct dentry *dentry, int mode)
 
 	inode_inc_link_count(dir);
 
-	inode = minix_new_inode(dir, mode, &err);
+	inode = minix_new_inode(dir, S_IFDIR | mode, &err);
 	if (!inode)
 		goto out_dir;
 
diff --git a/fs/namei.c b/fs/namei.c
index 868d0cb9d47..5362af9b737 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -282,8 +282,7 @@ int inode_permission(struct inode *inode, int mask)
 	if (retval)
 		return retval;
 
-	return security_inode_permission(inode,
-			mask & (MAY_READ|MAY_WRITE|MAY_EXEC|MAY_APPEND));
+	return security_inode_permission(inode, mask);
 }
 
 /**
@@ -484,13 +483,8 @@ ok:
 
 static __always_inline void set_root(struct nameidata *nd)
 {
-	if (!nd->root.mnt) {
-		struct fs_struct *fs = current->fs;
-		read_lock(&fs->lock);
-		nd->root = fs->root;
-		path_get(&nd->root);
-		read_unlock(&fs->lock);
-	}
+	if (!nd->root.mnt)
+		get_fs_root(current->fs, &nd->root);
 }
 
 static int link_path_walk(const char *, struct nameidata *);
@@ -601,15 +595,16 @@ int follow_up(struct path *path)
 {
 	struct vfsmount *parent;
 	struct dentry *mountpoint;
-	spin_lock(&vfsmount_lock);
+
+	br_read_lock(vfsmount_lock);
 	parent = path->mnt->mnt_parent;
 	if (parent == path->mnt) {
-		spin_unlock(&vfsmount_lock);
+		br_read_unlock(vfsmount_lock);
 		return 0;
 	}
 	mntget(parent);
 	mountpoint = dget(path->mnt->mnt_mountpoint);
-	spin_unlock(&vfsmount_lock);
+	br_read_unlock(vfsmount_lock);
 	dput(path->dentry);
 	path->dentry = mountpoint;
 	mntput(path->mnt);
@@ -692,6 +687,35 @@ static __always_inline void follow_dotdot(struct nameidata *nd)
 }
 
 /*
+ * Allocate a dentry with name and parent, and perform a parent
+ * directory ->lookup on it. Returns the new dentry, or ERR_PTR
+ * on error. parent->d_inode->i_mutex must be held. d_lookup must
+ * have verified that no child exists while under i_mutex.
+ */
+static struct dentry *d_alloc_and_lookup(struct dentry *parent,
+				struct qstr *name, struct nameidata *nd)
+{
+	struct inode *inode = parent->d_inode;
+	struct dentry *dentry;
+	struct dentry *old;
+
+	/* Don't create child dentry for a dead directory. */
+	if (unlikely(IS_DEADDIR(inode)))
+		return ERR_PTR(-ENOENT);
+
+	dentry = d_alloc(parent, name);
+	if (unlikely(!dentry))
+		return ERR_PTR(-ENOMEM);
+
+	old = inode->i_op->lookup(inode, dentry, nd);
+	if (unlikely(old)) {
+		dput(dentry);
+		dentry = old;
+	}
+	return dentry;
+}
+
+/*
  *  It's more convoluted than I'd like it to be, but... it's still fairly
  *  small and for now I'd prefer to have fast path as straight as possible.
  *  It _is_ time-critical.
@@ -712,9 +736,15 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
 			return err;
 	}
 
+	/*
+	 * Rename seqlock is not required here because in the off chance
+	 * of a false negative due to a concurrent rename, we're going to
+	 * do the non-racy lookup, below.
+	 */
 	dentry = __d_lookup(nd->path.dentry, name);
 	if (!dentry)
 		goto need_lookup;
+found:
 	if (dentry->d_op && dentry->d_op->d_revalidate)
 		goto need_revalidate;
 done:
@@ -730,56 +760,28 @@ need_lookup:
 	mutex_lock(&dir->i_mutex);
 	/*
 	 * First re-do the cached lookup just in case it was created
-	 * while we waited for the directory semaphore..
+	 * while we waited for the directory semaphore, or the first
+	 * lookup failed due to an unrelated rename.
 	 *
-	 * FIXME! This could use version numbering or similar to
-	 * avoid unnecessary cache lookups.
-	 *
-	 * The "dcache_lock" is purely to protect the RCU list walker
-	 * from concurrent renames at this point (we mustn't get false
-	 * negatives from the RCU list walk here, unlike the optimistic
-	 * fast walk).
-	 *
-	 * so doing d_lookup() (with seqlock), instead of lockfree __d_lookup
+	 * This could use version numbering or similar to avoid unnecessary
+	 * cache lookups, but then we'd have to do the first lookup in the
+	 * non-racy way. However in the common case here, everything should
+	 * be hot in cache, so would it be a big win?
 	 */
 	dentry = d_lookup(parent, name);
-	if (!dentry) {
-		struct dentry *new;
-
-		/* Don't create child dentry for a dead directory. */
-		dentry = ERR_PTR(-ENOENT);
-		if (IS_DEADDIR(dir))
-			goto out_unlock;
-
-		new = d_alloc(parent, name);
-		dentry = ERR_PTR(-ENOMEM);
-		if (new) {
-			dentry = dir->i_op->lookup(dir, new, nd);
-			if (dentry)
-				dput(new);
-			else
-				dentry = new;
-		}
-out_unlock:
+	if (likely(!dentry)) {
+		dentry = d_alloc_and_lookup(parent, name, nd);
 		mutex_unlock(&dir->i_mutex);
 		if (IS_ERR(dentry))
 			goto fail;
 		goto done;
 	}
-
 	/*
 	 * Uhhuh! Nasty case: the cache was re-populated while
 	 * we waited on the semaphore. Need to revalidate.
 	 */
 	mutex_unlock(&dir->i_mutex);
-	if (dentry->d_op && dentry->d_op->d_revalidate) {
-		dentry = do_revalidate(dentry, nd);
-		if (!dentry)
-			dentry = ERR_PTR(-ENOENT);
-	}
-	if (IS_ERR(dentry))
-		goto fail;
-	goto done;
+	goto found;
 
 need_revalidate:
 	dentry = do_revalidate(dentry, nd);
@@ -1016,11 +1018,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, struct namei
 		nd->path = nd->root;
 		path_get(&nd->root);
 	} else if (dfd == AT_FDCWD) {
-		struct fs_struct *fs = current->fs;
-		read_lock(&fs->lock);
-		nd->path = fs->pwd;
-		path_get(&fs->pwd);
-		read_unlock(&fs->lock);
+		get_fs_pwd(current->fs, &nd->path);
 	} else {
 		struct dentry *dentry;
 
@@ -1123,11 +1121,13 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
 static struct dentry *__lookup_hash(struct qstr *name,
 		struct dentry *base, struct nameidata *nd)
 {
+	struct inode *inode = base->d_inode;
 	struct dentry *dentry;
-	struct inode *inode;
 	int err;
 
-	inode = base->d_inode;
+	err = exec_permission(inode);
+	if (err)
+		return ERR_PTR(err);
 
 	/*
 	 * See if the low-level filesystem might want
@@ -1140,35 +1140,18 @@ static struct dentry *__lookup_hash(struct qstr *name,
 			goto out;
 	}
 
-	dentry = __d_lookup(base, name);
-
-	/* lockess __d_lookup may fail due to concurrent d_move()
-	 * in some unrelated directory, so try with d_lookup
+	/*
+	 * Don't bother with __d_lookup: callers are for creat as
+	 * well as unlink, so a lot of the time it would cost
+	 * a double lookup.
 	 */
-	if (!dentry)
-		dentry = d_lookup(base, name);
+	dentry = d_lookup(base, name);
 
 	if (dentry && dentry->d_op && dentry->d_op->d_revalidate)
 		dentry = do_revalidate(dentry, nd);
 
-	if (!dentry) {
-		struct dentry *new;
-
-		/* Don't create child dentry for a dead directory. */
-		dentry = ERR_PTR(-ENOENT);
-		if (IS_DEADDIR(inode))
-			goto out;
-
-		new = d_alloc(base, name);
-		dentry = ERR_PTR(-ENOMEM);
-		if (!new)
-			goto out;
-		dentry = inode->i_op->lookup(inode, new, nd);
-		if (!dentry)
-			dentry = new;
-		else
-			dput(new);
-	}
+	if (!dentry)
+		dentry = d_alloc_and_lookup(base, name, nd);
 out:
 	return dentry;
 }
@@ -1180,11 +1163,6 @@ out:
  */
 static struct dentry *lookup_hash(struct nameidata *nd)
 {
-	int err;
-
-	err = exec_permission(nd->path.dentry->d_inode);
-	if (err)
-		return ERR_PTR(err);
 	return __lookup_hash(&nd->last, nd->path.dentry, nd);
 }
 
@@ -1232,9 +1210,6 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
 	if (err)
 		return ERR_PTR(err);
 
-	err = exec_permission(base->d_inode);
-	if (err)
-		return ERR_PTR(err);
 	return __lookup_hash(&this, base, NULL);
 }
 
@@ -1484,8 +1459,7 @@ static int handle_truncate(struct path *path)
 	 */
 	error = locks_verify_locked(inode);
 	if (!error)
-		error = security_path_truncate(path, 0,
-				       ATTR_MTIME|ATTR_CTIME|ATTR_OPEN);
+		error = security_path_truncate(path);
 	if (!error) {
 		error = do_truncate(path->dentry, 0,
 				    ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
@@ -1600,6 +1574,7 @@ static struct file *finish_open(struct nameidata *nd,
 	 */
 	if (will_truncate)
 		mnt_drop_write(nd->path.mnt);
+	path_put(&nd->path);
 	return filp;
 
 exit:
@@ -1701,6 +1676,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 		}
 		filp = nameidata_to_filp(nd);
 		mnt_drop_write(nd->path.mnt);
+		path_put(&nd->path);
 		if (!IS_ERR(filp)) {
 			error = ima_file_check(filp, acc_mode);
 			if (error) {
@@ -2311,7 +2287,7 @@ static long do_unlinkat(int dfd, const char __user *pathname)
 			goto slashes;
 		inode = dentry->d_inode;
 		if (inode)
-			atomic_inc(&inode->i_count);
+			ihold(inode);
 		error = mnt_want_write(nd.path.mnt);
 		if (error)
 			goto exit2;
@@ -2635,7 +2611,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 {
 	int error;
 	int is_dir = S_ISDIR(old_dentry->d_inode->i_mode);
-	const char *old_name;
+	const unsigned char *old_name;
 
 	if (old_dentry->d_inode == new_dentry->d_inode)
  		return 0;
diff --git a/fs/namespace.c b/fs/namespace.c
index 88058de59c7..8a415c9c5e5 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -11,6 +11,8 @@
 #include <linux/syscalls.h>
 #include <linux/slab.h>
 #include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/percpu.h>
 #include <linux/smp_lock.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
@@ -29,6 +31,7 @@
 #include <linux/log2.h>
 #include <linux/idr.h>
 #include <linux/fs_struct.h>
+#include <linux/fsnotify.h>
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
 #include "pnode.h"
@@ -37,12 +40,10 @@
 #define HASH_SHIFT ilog2(PAGE_SIZE / sizeof(struct list_head))
 #define HASH_SIZE (1UL << HASH_SHIFT)
 
-/* spinlock for vfsmount related operations, inplace of dcache_lock */
-__cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock);
-
 static int event;
 static DEFINE_IDA(mnt_id_ida);
 static DEFINE_IDA(mnt_group_ida);
+static DEFINE_SPINLOCK(mnt_id_lock);
 static int mnt_id_start = 0;
 static int mnt_group_start = 1;
 
@@ -54,6 +55,16 @@ static struct rw_semaphore namespace_sem;
 struct kobject *fs_kobj;
 EXPORT_SYMBOL_GPL(fs_kobj);
 
+/*
+ * vfsmount lock may be taken for read to prevent changes to the
+ * vfsmount hash, ie. during mountpoint lookups or walking back
+ * up the tree.
+ *
+ * It should be taken for write in all cases where the vfsmount
+ * tree or hash is modified or when a vfsmount structure is modified.
+ */
+DEFINE_BRLOCK(vfsmount_lock);
+
 static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
 {
 	unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
@@ -64,18 +75,21 @@ static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
 
 #define MNT_WRITER_UNDERFLOW_LIMIT -(1<<16)
 
-/* allocation is serialized by namespace_sem */
+/*
+ * allocation is serialized by namespace_sem, but we need the spinlock to
+ * serialize with freeing.
+ */
 static int mnt_alloc_id(struct vfsmount *mnt)
 {
 	int res;
 
 retry:
 	ida_pre_get(&mnt_id_ida, GFP_KERNEL);
-	spin_lock(&vfsmount_lock);
+	spin_lock(&mnt_id_lock);
 	res = ida_get_new_above(&mnt_id_ida, mnt_id_start, &mnt->mnt_id);
 	if (!res)
 		mnt_id_start = mnt->mnt_id + 1;
-	spin_unlock(&vfsmount_lock);
+	spin_unlock(&mnt_id_lock);
 	if (res == -EAGAIN)
 		goto retry;
 
@@ -85,11 +99,11 @@ retry:
 static void mnt_free_id(struct vfsmount *mnt)
 {
 	int id = mnt->mnt_id;
-	spin_lock(&vfsmount_lock);
+	spin_lock(&mnt_id_lock);
 	ida_remove(&mnt_id_ida, id);
 	if (mnt_id_start > id)
 		mnt_id_start = id;
-	spin_unlock(&vfsmount_lock);
+	spin_unlock(&mnt_id_lock);
 }
 
 /*
@@ -150,6 +164,9 @@ struct vfsmount *alloc_vfsmnt(const char *name)
 		INIT_LIST_HEAD(&mnt->mnt_share);
 		INIT_LIST_HEAD(&mnt->mnt_slave_list);
 		INIT_LIST_HEAD(&mnt->mnt_slave);
+#ifdef CONFIG_FSNOTIFY
+		INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
+#endif
 #ifdef CONFIG_SMP
 		mnt->mnt_writers = alloc_percpu(int);
 		if (!mnt->mnt_writers)
@@ -344,7 +361,7 @@ static int mnt_make_readonly(struct vfsmount *mnt)
 {
 	int ret = 0;
 
-	spin_lock(&vfsmount_lock);
+	br_write_lock(vfsmount_lock);
 	mnt->mnt_flags |= MNT_WRITE_HOLD;
 	/*
 	 * After storing MNT_WRITE_HOLD, we'll read the counters. This store
@@ -378,15 +395,15 @@ static int mnt_make_readonly(struct vfsmount *mnt)
 	 */
 	smp_wmb();
 	mnt->mnt_flags &= ~MNT_WRITE_HOLD;
-	spin_unlock(&vfsmount_lock);
+	br_write_unlock(vfsmount_lock);
 	return ret;
 }
 
 static void __mnt_unmake_readonly(struct vfsmount *mnt)
 {
-	spin_lock(&vfsmount_lock);
+	br_write_lock(vfsmount_lock);
 	mnt->mnt_flags &= ~MNT_READONLY;
-	spin_unlock(&vfsmount_lock);
+	br_write_unlock(vfsmount_lock);
 }
 
 void simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
@@ -410,6 +427,7 @@ void free_vfsmnt(struct vfsmount *mnt)
 /*
  * find the first or last mount at @dentry on vfsmount @mnt depending on
  * @dir. If @dir is set return the first mount else return the last mount.
+ * vfsmount_lock must be held for read or write.
  */
 struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry,
 			      int dir)
@@ -439,10 +457,11 @@ struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry,
 struct vfsmount *lookup_mnt(struct path *path)
 {
 	struct vfsmount *child_mnt;
-	spin_lock(&vfsmount_lock);
+
+	br_read_lock(vfsmount_lock);
 	if ((child_mnt = __lookup_mnt(path->mnt, path->dentry, 1)))
 		mntget(child_mnt);
-	spin_unlock(&vfsmount_lock);
+	br_read_unlock(vfsmount_lock);
 	return child_mnt;
 }
 
@@ -451,6 +470,9 @@ static inline int check_mnt(struct vfsmount *mnt)
 	return mnt->mnt_ns == current->nsproxy->mnt_ns;
 }
 
+/*
+ * vfsmount lock must be held for write
+ */
 static void touch_mnt_namespace(struct mnt_namespace *ns)
 {
 	if (ns) {
@@ -459,6 +481,9 @@ static void touch_mnt_namespace(struct mnt_namespace *ns)
 	}
 }
 
+/*
+ * vfsmount lock must be held for write
+ */
 static void __touch_mnt_namespace(struct mnt_namespace *ns)
 {
 	if (ns && ns->event != event) {
@@ -467,6 +492,9 @@ static void __touch_mnt_namespace(struct mnt_namespace *ns)
 	}
 }
 
+/*
+ * vfsmount lock must be held for write
+ */
 static void detach_mnt(struct vfsmount *mnt, struct path *old_path)
 {
 	old_path->dentry = mnt->mnt_mountpoint;
@@ -478,6 +506,9 @@ static void detach_mnt(struct vfsmount *mnt, struct path *old_path)
 	old_path->dentry->d_mounted--;
 }
 
+/*
+ * vfsmount lock must be held for write
+ */
 void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry,
 			struct vfsmount *child_mnt)
 {
@@ -486,6 +517,9 @@ void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry,
 	dentry->d_mounted++;
 }
 
+/*
+ * vfsmount lock must be held for write
+ */
 static void attach_mnt(struct vfsmount *mnt, struct path *path)
 {
 	mnt_set_mountpoint(path->mnt, path->dentry, mnt);
@@ -495,7 +529,7 @@ static void attach_mnt(struct vfsmount *mnt, struct path *path)
 }
 
 /*
- * the caller must hold vfsmount_lock
+ * vfsmount lock must be held for write
  */
 static void commit_tree(struct vfsmount *mnt)
 {
@@ -561,7 +595,7 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
 				goto out_free;
 		}
 
-		mnt->mnt_flags = old->mnt_flags;
+		mnt->mnt_flags = old->mnt_flags & ~MNT_WRITE_HOLD;
 		atomic_inc(&sb->s_active);
 		mnt->mnt_sb = sb;
 		mnt->mnt_root = dget(root);
@@ -610,6 +644,7 @@ static inline void __mntput(struct vfsmount *mnt)
 	 * provides barriers, so count_mnt_writers() below is safe.  AV
 	 */
 	WARN_ON(count_mnt_writers(mnt));
+	fsnotify_vfsmount_delete(mnt);
 	dput(mnt->mnt_root);
 	free_vfsmnt(mnt);
 	deactivate_super(sb);
@@ -618,39 +653,43 @@ static inline void __mntput(struct vfsmount *mnt)
 void mntput_no_expire(struct vfsmount *mnt)
 {
 repeat:
-	if (atomic_dec_and_lock(&mnt->mnt_count, &vfsmount_lock)) {
-		if (likely(!mnt->mnt_pinned)) {
-			spin_unlock(&vfsmount_lock);
-			__mntput(mnt);
-			return;
-		}
-		atomic_add(mnt->mnt_pinned + 1, &mnt->mnt_count);
-		mnt->mnt_pinned = 0;
-		spin_unlock(&vfsmount_lock);
-		acct_auto_close_mnt(mnt);
-		goto repeat;
+	if (atomic_add_unless(&mnt->mnt_count, -1, 1))
+		return;
+	br_write_lock(vfsmount_lock);
+	if (!atomic_dec_and_test(&mnt->mnt_count)) {
+		br_write_unlock(vfsmount_lock);
+		return;
 	}
+	if (likely(!mnt->mnt_pinned)) {
+		br_write_unlock(vfsmount_lock);
+		__mntput(mnt);
+		return;
+	}
+	atomic_add(mnt->mnt_pinned + 1, &mnt->mnt_count);
+	mnt->mnt_pinned = 0;
+	br_write_unlock(vfsmount_lock);
+	acct_auto_close_mnt(mnt);
+	goto repeat;
 }
-
 EXPORT_SYMBOL(mntput_no_expire);
 
 void mnt_pin(struct vfsmount *mnt)
 {
-	spin_lock(&vfsmount_lock);
+	br_write_lock(vfsmount_lock);
 	mnt->mnt_pinned++;
-	spin_unlock(&vfsmount_lock);
+	br_write_unlock(vfsmount_lock);
 }
 
 EXPORT_SYMBOL(mnt_pin);
 
 void mnt_unpin(struct vfsmount *mnt)
 {
-	spin_lock(&vfsmount_lock);
+	br_write_lock(vfsmount_lock);
 	if (mnt->mnt_pinned) {
 		atomic_inc(&mnt->mnt_count);
 		mnt->mnt_pinned--;
 	}
-	spin_unlock(&vfsmount_lock);
+	br_write_unlock(vfsmount_lock);
 }
 
 EXPORT_SYMBOL(mnt_unpin);
@@ -741,12 +780,12 @@ int mnt_had_events(struct proc_mounts *p)
 	struct mnt_namespace *ns = p->ns;
 	int res = 0;
 
-	spin_lock(&vfsmount_lock);
+	br_read_lock(vfsmount_lock);
 	if (p->event != ns->event) {
 		p->event = ns->event;
 		res = 1;
 	}
-	spin_unlock(&vfsmount_lock);
+	br_read_unlock(vfsmount_lock);
 
 	return res;
 }
@@ -783,7 +822,6 @@ static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt)
 		{ MNT_NOATIME, ",noatime" },
 		{ MNT_NODIRATIME, ",nodiratime" },
 		{ MNT_RELATIME, ",relatime" },
-		{ MNT_STRICTATIME, ",strictatime" },
 		{ 0, NULL }
 	};
 	const struct proc_fs_info *fs_infop;
@@ -948,12 +986,12 @@ int may_umount_tree(struct vfsmount *mnt)
 	int minimum_refs = 0;
 	struct vfsmount *p;
 
-	spin_lock(&vfsmount_lock);
+	br_read_lock(vfsmount_lock);
 	for (p = mnt; p; p = next_mnt(p, mnt)) {
 		actual_refs += atomic_read(&p->mnt_count);
 		minimum_refs += 2;
 	}
-	spin_unlock(&vfsmount_lock);
+	br_read_unlock(vfsmount_lock);
 
 	if (actual_refs > minimum_refs)
 		return 0;
@@ -980,10 +1018,10 @@ int may_umount(struct vfsmount *mnt)
 {
 	int ret = 1;
 	down_read(&namespace_sem);
-	spin_lock(&vfsmount_lock);
+	br_read_lock(vfsmount_lock);
 	if (propagate_mount_busy(mnt, 2))
 		ret = 0;
-	spin_unlock(&vfsmount_lock);
+	br_read_unlock(vfsmount_lock);
 	up_read(&namespace_sem);
 	return ret;
 }
@@ -999,13 +1037,14 @@ void release_mounts(struct list_head *head)
 		if (mnt->mnt_parent != mnt) {
 			struct dentry *dentry;
 			struct vfsmount *m;
-			spin_lock(&vfsmount_lock);
+
+			br_write_lock(vfsmount_lock);
 			dentry = mnt->mnt_mountpoint;
 			m = mnt->mnt_parent;
 			mnt->mnt_mountpoint = mnt->mnt_root;
 			mnt->mnt_parent = mnt;
 			m->mnt_ghosts--;
-			spin_unlock(&vfsmount_lock);
+			br_write_unlock(vfsmount_lock);
 			dput(dentry);
 			mntput(m);
 		}
@@ -1013,6 +1052,10 @@ void release_mounts(struct list_head *head)
 	}
 }
 
+/*
+ * vfsmount lock must be held for write
+ * namespace_sem must be held for write
+ */
 void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill)
 {
 	struct vfsmount *p;
@@ -1103,7 +1146,7 @@ static int do_umount(struct vfsmount *mnt, int flags)
 	}
 
 	down_write(&namespace_sem);
-	spin_lock(&vfsmount_lock);
+	br_write_lock(vfsmount_lock);
 	event++;
 
 	if (!(flags & MNT_DETACH))
@@ -1115,7 +1158,7 @@ static int do_umount(struct vfsmount *mnt, int flags)
 			umount_tree(mnt, 1, &umount_list);
 		retval = 0;
 	}
-	spin_unlock(&vfsmount_lock);
+	br_write_unlock(vfsmount_lock);
 	up_write(&namespace_sem);
 	release_mounts(&umount_list);
 	return retval;
@@ -1227,19 +1270,19 @@ struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry,
 			q = clone_mnt(p, p->mnt_root, flag);
 			if (!q)
 				goto Enomem;
-			spin_lock(&vfsmount_lock);
+			br_write_lock(vfsmount_lock);
 			list_add_tail(&q->mnt_list, &res->mnt_list);
 			attach_mnt(q, &path);
-			spin_unlock(&vfsmount_lock);
+			br_write_unlock(vfsmount_lock);
 		}
 	}
 	return res;
 Enomem:
 	if (res) {
 		LIST_HEAD(umount_list);
-		spin_lock(&vfsmount_lock);
+		br_write_lock(vfsmount_lock);
 		umount_tree(res, 0, &umount_list);
-		spin_unlock(&vfsmount_lock);
+		br_write_unlock(vfsmount_lock);
 		release_mounts(&umount_list);
 	}
 	return NULL;
@@ -1258,9 +1301,9 @@ void drop_collected_mounts(struct vfsmount *mnt)
 {
 	LIST_HEAD(umount_list);
 	down_write(&namespace_sem);
-	spin_lock(&vfsmount_lock);
+	br_write_lock(vfsmount_lock);
 	umount_tree(mnt, 0, &umount_list);
-	spin_unlock(&vfsmount_lock);
+	br_write_unlock(vfsmount_lock);
 	up_write(&namespace_sem);
 	release_mounts(&umount_list);
 }
@@ -1388,7 +1431,7 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt,
 	if (err)
 		goto out_cleanup_ids;
 
-	spin_lock(&vfsmount_lock);
+	br_write_lock(vfsmount_lock);
 
 	if (IS_MNT_SHARED(dest_mnt)) {
 		for (p = source_mnt; p; p = next_mnt(p, source_mnt))
@@ -1407,7 +1450,8 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt,
 		list_del_init(&child->mnt_hash);
 		commit_tree(child);
 	}
-	spin_unlock(&vfsmount_lock);
+	br_write_unlock(vfsmount_lock);
+
 	return 0;
 
  out_cleanup_ids:
@@ -1440,13 +1484,30 @@ out_unlock:
 }
 
 /*
+ * Sanity check the flags to change_mnt_propagation.
+ */
+
+static int flags_to_propagation_type(int flags)
+{
+	int type = flags & ~MS_REC;
+
+	/* Fail if any non-propagation flags are set */
+	if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
+		return 0;
+	/* Only one propagation flag should be set */
+	if (!is_power_of_2(type))
+		return 0;
+	return type;
+}
+
+/*
  * recursively change the type of the mountpoint.
  */
 static int do_change_type(struct path *path, int flag)
 {
 	struct vfsmount *m, *mnt = path->mnt;
 	int recurse = flag & MS_REC;
-	int type = flag & ~MS_REC;
+	int type;
 	int err = 0;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -1455,6 +1516,10 @@ static int do_change_type(struct path *path, int flag)
 	if (path->dentry != path->mnt->mnt_root)
 		return -EINVAL;
 
+	type = flags_to_propagation_type(flag);
+	if (!type)
+		return -EINVAL;
+
 	down_write(&namespace_sem);
 	if (type == MS_SHARED) {
 		err = invent_group_ids(mnt, recurse);
@@ -1462,10 +1527,10 @@ static int do_change_type(struct path *path, int flag)
 			goto out_unlock;
 	}
 
-	spin_lock(&vfsmount_lock);
+	br_write_lock(vfsmount_lock);
 	for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
 		change_mnt_propagation(m, type);
-	spin_unlock(&vfsmount_lock);
+	br_write_unlock(vfsmount_lock);
 
  out_unlock:
 	up_write(&namespace_sem);
@@ -1509,9 +1574,10 @@ static int do_loopback(struct path *path, char *old_name,
 	err = graft_tree(mnt, path);
 	if (err) {
 		LIST_HEAD(umount_list);
-		spin_lock(&vfsmount_lock);
+
+		br_write_lock(vfsmount_lock);
 		umount_tree(mnt, 0, &umount_list);
-		spin_unlock(&vfsmount_lock);
+		br_write_unlock(vfsmount_lock);
 		release_mounts(&umount_list);
 	}
 
@@ -1564,16 +1630,16 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
 	else
 		err = do_remount_sb(sb, flags, data, 0);
 	if (!err) {
-		spin_lock(&vfsmount_lock);
+		br_write_lock(vfsmount_lock);
 		mnt_flags |= path->mnt->mnt_flags & MNT_PROPAGATION_MASK;
 		path->mnt->mnt_flags = mnt_flags;
-		spin_unlock(&vfsmount_lock);
+		br_write_unlock(vfsmount_lock);
 	}
 	up_write(&sb->s_umount);
 	if (!err) {
-		spin_lock(&vfsmount_lock);
+		br_write_lock(vfsmount_lock);
 		touch_mnt_namespace(path->mnt->mnt_ns);
-		spin_unlock(&vfsmount_lock);
+		br_write_unlock(vfsmount_lock);
 	}
 	return err;
 }
@@ -1678,9 +1744,7 @@ static int do_new_mount(struct path *path, char *type, int flags,
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	lock_kernel();
 	mnt = do_kern_mount(type, flags, name, data);
-	unlock_kernel();
 	if (IS_ERR(mnt))
 		return PTR_ERR(mnt);
 
@@ -1750,7 +1814,7 @@ void mark_mounts_for_expiry(struct list_head *mounts)
 		return;
 
 	down_write(&namespace_sem);
-	spin_lock(&vfsmount_lock);
+	br_write_lock(vfsmount_lock);
 
 	/* extract from the expiration list every vfsmount that matches the
 	 * following criteria:
@@ -1769,7 +1833,7 @@ void mark_mounts_for_expiry(struct list_head *mounts)
 		touch_mnt_namespace(mnt->mnt_ns);
 		umount_tree(mnt, 1, &umounts);
 	}
-	spin_unlock(&vfsmount_lock);
+	br_write_unlock(vfsmount_lock);
 	up_write(&namespace_sem);
 
 	release_mounts(&umounts);
@@ -1826,6 +1890,8 @@ resume:
 /*
  * process a list of expirable mountpoints with the intent of discarding any
  * submounts of a specific parent mountpoint
+ *
+ * vfsmount_lock must be held for write
  */
 static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts)
 {
@@ -1984,7 +2050,7 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
 	if (flags & MS_RDONLY)
 		mnt_flags |= MNT_READONLY;
 
-	flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE |
+	flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN |
 		   MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
 		   MS_STRICTATIME);
 
@@ -2044,9 +2110,9 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
 		kfree(new_ns);
 		return ERR_PTR(-ENOMEM);
 	}
-	spin_lock(&vfsmount_lock);
+	br_write_lock(vfsmount_lock);
 	list_add_tail(&new_ns->list, &new_ns->root->mnt_list);
-	spin_unlock(&vfsmount_lock);
+	br_write_unlock(vfsmount_lock);
 
 	/*
 	 * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
@@ -2208,10 +2274,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 		goto out1;
 	}
 
-	read_lock(&current->fs->lock);
-	root = current->fs->root;
-	path_get(&current->fs->root);
-	read_unlock(&current->fs->lock);
+	get_fs_root(current->fs, &root);
 	down_write(&namespace_sem);
 	mutex_lock(&old.dentry->d_inode->i_mutex);
 	error = -EINVAL;
@@ -2243,7 +2306,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 		goto out2; /* not attached */
 	/* make sure we can reach put_old from new_root */
 	tmp = old.mnt;
-	spin_lock(&vfsmount_lock);
+	br_write_lock(vfsmount_lock);
 	if (tmp != new.mnt) {
 		for (;;) {
 			if (tmp->mnt_parent == tmp)
@@ -2263,7 +2326,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 	/* mount new_root on / */
 	attach_mnt(new.mnt, &root_parent);
 	touch_mnt_namespace(current->nsproxy->mnt_ns);
-	spin_unlock(&vfsmount_lock);
+	br_write_unlock(vfsmount_lock);
 	chroot_fs_refs(&root, &new);
 	error = 0;
 	path_put(&root_parent);
@@ -2278,7 +2341,7 @@ out1:
 out0:
 	return error;
 out3:
-	spin_unlock(&vfsmount_lock);
+	br_write_unlock(vfsmount_lock);
 	goto out2;
 }
 
@@ -2325,6 +2388,8 @@ void __init mnt_init(void)
 	for (u = 0; u < HASH_SIZE; u++)
 		INIT_LIST_HEAD(&mount_hashtable[u]);
 
+	br_lock_init(vfsmount_lock);
+
 	err = sysfs_init();
 	if (err)
 		printk(KERN_WARNING "%s: sysfs_init error: %d\n",
@@ -2343,9 +2408,9 @@ void put_mnt_ns(struct mnt_namespace *ns)
 	if (!atomic_dec_and_test(&ns->count))
 		return;
 	down_write(&namespace_sem);
-	spin_lock(&vfsmount_lock);
+	br_write_lock(vfsmount_lock);
 	umount_tree(ns->root, 0, &umount_list);
-	spin_unlock(&vfsmount_lock);
+	br_write_unlock(vfsmount_lock);
 	up_write(&namespace_sem);
 	release_mounts(&umount_list);
 	kfree(ns);
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 9578cbe0cd5..aac8832e919 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -95,6 +95,34 @@ const struct dentry_operations ncp_root_dentry_operations =
 };
 
 
+#define ncp_namespace(i)	(NCP_SERVER(i)->name_space[NCP_FINFO(i)->volNumber])
+
+static inline int ncp_preserve_entry_case(struct inode *i, __u32 nscreator)
+{
+#ifdef CONFIG_NCPFS_SMALLDOS
+	int ns = ncp_namespace(i);
+
+	if ((ns == NW_NS_DOS)
+#ifdef CONFIG_NCPFS_OS2_NS
+		|| ((ns == NW_NS_OS2) && (nscreator == NW_NS_DOS))
+#endif /* CONFIG_NCPFS_OS2_NS */
+	   )
+		return 0;
+#endif /* CONFIG_NCPFS_SMALLDOS */
+	return 1;
+}
+
+#define ncp_preserve_case(i)	(ncp_namespace(i) != NW_NS_DOS)
+
+static inline int ncp_case_sensitive(struct dentry *dentry)
+{
+#ifdef CONFIG_NCPFS_NFS_NS
+	return ncp_namespace(dentry->d_inode) == NW_NS_NFS;
+#else
+	return 0;
+#endif /* CONFIG_NCPFS_NFS_NS */
+}
+
 /*
  * Note: leave the hash unchanged if the directory
  * is case-sensitive.
@@ -102,13 +130,12 @@ const struct dentry_operations ncp_root_dentry_operations =
 static int 
 ncp_hash_dentry(struct dentry *dentry, struct qstr *this)
 {
-	struct nls_table *t;
-	unsigned long hash;
-	int i;
-
-	t = NCP_IO_TABLE(dentry);
+	if (!ncp_case_sensitive(dentry)) {
+		struct nls_table *t;
+		unsigned long hash;
+		int i;
 
-	if (!ncp_case_sensitive(dentry->d_inode)) {
+		t = NCP_IO_TABLE(dentry);
 		hash = init_name_hash();
 		for (i=0; i<this->len ; i++)
 			hash = partial_name_hash(ncp_tolower(t, this->name[i]),
@@ -124,7 +151,7 @@ ncp_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b)
 	if (a->len != b->len)
 		return 1;
 
-	if (ncp_case_sensitive(dentry->d_inode))
+	if (ncp_case_sensitive(dentry))
 		return strncmp(a->name, b->name, a->len);
 
 	return ncp_strnicmp(NCP_IO_TABLE(dentry), a->name, b->name, a->len);
@@ -266,7 +293,7 @@ leave_me:;
 
 
 static int
-__ncp_lookup_validate(struct dentry *dentry)
+ncp_lookup_validate(struct dentry *dentry, struct nameidata *nd)
 {
 	struct ncp_server *server;
 	struct dentry *parent;
@@ -283,9 +310,6 @@ __ncp_lookup_validate(struct dentry *dentry)
 
 	server = NCP_SERVER(dir);
 
-	if (!ncp_conn_valid(server))
-		goto finished;
-
 	/*
 	 * Inspired by smbfs:
 	 * The default validation is based on dentry age:
@@ -304,8 +328,11 @@ __ncp_lookup_validate(struct dentry *dentry)
 	if (ncp_is_server_root(dir)) {
 		res = ncp_io2vol(server, __name, &len, dentry->d_name.name,
 				 dentry->d_name.len, 1);
-		if (!res)
+		if (!res) {
 			res = ncp_lookup_volume(server, __name, &(finfo.i));
+			if (!res)
+				ncp_update_known_namespace(server, finfo.i.volNumber, NULL);
+		}
 	} else {
 		res = ncp_io2vol(server, __name, &len, dentry->d_name.name,
 				 dentry->d_name.len, !ncp_preserve_case(dir));
@@ -320,13 +347,17 @@ __ncp_lookup_validate(struct dentry *dentry)
 	 * what we remember, it's not valid any more.
 	 */
 	if (!res) {
-		if (finfo.i.dirEntNum == NCP_FINFO(dentry->d_inode)->dirEntNum) {
+		struct inode *inode = dentry->d_inode;
+
+		mutex_lock(&inode->i_mutex);
+		if (finfo.i.dirEntNum == NCP_FINFO(inode)->dirEntNum) {
 			ncp_new_dentry(dentry);
 			val=1;
 		} else
 			DDPRINTK("ncp_lookup_validate: found, but dirEntNum changed\n");
 
-		ncp_update_inode2(dentry->d_inode, &finfo);
+		ncp_update_inode2(inode, &finfo);
+		mutex_unlock(&inode->i_mutex);
 	}
 
 finished:
@@ -335,16 +366,6 @@ finished:
 	return val;
 }
 
-static int
-ncp_lookup_validate(struct dentry * dentry, struct nameidata *nd)
-{
-	int res;
-	lock_kernel();
-	res = __ncp_lookup_validate(dentry);
-	unlock_kernel();
-	return res;
-}
-
 static struct dentry *
 ncp_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos)
 {
@@ -411,8 +432,6 @@ static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	int result, mtime_valid = 0;
 	time_t mtime = 0;
 
-	lock_kernel();
-
 	ctl.page  = NULL;
 	ctl.cache = NULL;
 
@@ -421,6 +440,7 @@ static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		(int) filp->f_pos);
 
 	result = -EIO;
+	/* Do not generate '.' and '..' when server is dead. */
 	if (!ncp_conn_valid(server))
 		goto out;
 
@@ -532,6 +552,12 @@ read_really:
 	ctl.head.end = ctl.fpos - 1;
 	ctl.head.eof = ctl.valid;
 finished:
+	if (ctl.page) {
+		kunmap(ctl.page);
+		SetPageUptodate(ctl.page);
+		unlock_page(ctl.page);
+		page_cache_release(ctl.page);
+	}
 	if (page) {
 		cache->head = ctl.head;
 		kunmap(page);
@@ -539,23 +565,17 @@ finished:
 		unlock_page(page);
 		page_cache_release(page);
 	}
-	if (ctl.page) {
-		kunmap(ctl.page);
-		SetPageUptodate(ctl.page);
-		unlock_page(ctl.page);
-		page_cache_release(ctl.page);
-	}
 out:
-	unlock_kernel();
 	return result;
 }
 
 static int
 ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
-		struct ncp_cache_control *ctrl, struct ncp_entry_info *entry)
+		struct ncp_cache_control *ctrl, struct ncp_entry_info *entry,
+		int inval_childs)
 {
 	struct dentry *newdent, *dentry = filp->f_path.dentry;
-	struct inode *newino, *inode = dentry->d_inode;
+	struct inode *dir = dentry->d_inode;
 	struct ncp_cache_control ctl = *ctrl;
 	struct qstr qname;
 	int valid = 0;
@@ -564,9 +584,9 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
 	__u8 __name[NCP_MAXPATHLEN + 1];
 
 	qname.len = sizeof(__name);
-	if (ncp_vol2io(NCP_SERVER(inode), __name, &qname.len,
+	if (ncp_vol2io(NCP_SERVER(dir), __name, &qname.len,
 			entry->i.entryName, entry->i.nameLen,
-			!ncp_preserve_entry_case(inode, entry->i.NSCreator)))
+			!ncp_preserve_entry_case(dir, entry->i.NSCreator)))
 		return 1; /* I'm not sure */
 
 	qname.name = __name;
@@ -584,22 +604,64 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
 			goto end_advance;
 	} else {
 		hashed = 1;
-		memcpy((char *) newdent->d_name.name, qname.name,
-							newdent->d_name.len);
+
+		/* If case sensitivity changed for this volume, all entries below this one
+		   should be thrown away.  This entry itself is not affected, as its case
+		   sensitivity is controlled by its own parent. */
+		if (inval_childs)
+			shrink_dcache_parent(newdent);
+
+		/*
+		 * It is not as dangerous as it looks.  NetWare's OS2 namespace is
+		 * case preserving yet case insensitive.  So we update dentry's name
+		 * as received from server.  We found dentry via d_lookup with our
+		 * hash, so we know that hash does not change, and so replacing name
+		 * should be reasonably safe.
+		 */
+		if (qname.len == newdent->d_name.len &&
+		    memcmp(newdent->d_name.name, qname.name, newdent->d_name.len)) {
+			struct inode *inode = newdent->d_inode;
+
+			/*
+			 * Inside ncpfs all uses of d_name are either for debugging,
+			 * or on functions which acquire inode mutex (mknod, creat,
+			 * lookup).  So grab i_mutex here, to be sure.  d_path
+			 * uses dcache_lock when generating path, so we should too.
+			 * And finally d_compare is protected by dentry's d_lock, so
+			 * here we go.
+			 */
+			if (inode)
+				mutex_lock(&inode->i_mutex);
+			spin_lock(&dcache_lock);
+			spin_lock(&newdent->d_lock);
+			memcpy((char *) newdent->d_name.name, qname.name,
+								newdent->d_name.len);
+			spin_unlock(&newdent->d_lock);
+			spin_unlock(&dcache_lock);
+			if (inode)
+				mutex_unlock(&inode->i_mutex);
+		}
 	}
 
 	if (!newdent->d_inode) {
+		struct inode *inode;
+
 		entry->opened = 0;
-		entry->ino = iunique(inode->i_sb, 2);
-		newino = ncp_iget(inode->i_sb, entry);
-		if (newino) {
+		entry->ino = iunique(dir->i_sb, 2);
+		inode = ncp_iget(dir->i_sb, entry);
+		if (inode) {
 			newdent->d_op = &ncp_dentry_operations;
-			d_instantiate(newdent, newino);
+			d_instantiate(newdent, inode);
 			if (!hashed)
 				d_rehash(newdent);
 		}
-	} else
-		ncp_update_inode2(newdent->d_inode, entry);
+	} else {
+		struct inode *inode = newdent->d_inode;
+
+		mutex_lock(&inode->i_mutex);
+		ncp_update_inode2(inode, entry);
+		mutex_unlock(&inode->i_mutex);
+	}
 
 	if (newdent->d_inode) {
 		ino = newdent->d_inode->i_ino;
@@ -617,7 +679,7 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
 		ctl.cache = NULL;
 		ctl.idx  -= NCP_DIRCACHE_SIZE;
 		ctl.ofs  += 1;
-		ctl.page  = grab_cache_page(&inode->i_data, ctl.ofs);
+		ctl.page  = grab_cache_page(&dir->i_data, ctl.ofs);
 		if (ctl.page)
 			ctl.cache = kmap(ctl.page);
 	}
@@ -633,7 +695,7 @@ end_advance:
 		if (!ino)
 			ino = find_inode_number(dentry, &qname);
 		if (!ino)
-			ino = iunique(inode->i_sb, 2);
+			ino = iunique(dir->i_sb, 2);
 		ctl.filled = filldir(dirent, qname.name, qname.len,
 				     filp->f_pos, ino, DT_UNKNOWN);
 		if (!ctl.filled)
@@ -660,6 +722,7 @@ ncp_read_volume_list(struct file *filp, void *dirent, filldir_t filldir,
 			(unsigned long) filp->f_pos);
 
 	for (i = 0; i < NCP_NUMBER_OF_VOLUMES; i++) {
+		int inval_dentry;
 
 		if (ncp_get_volume_info_with_number(server, i, &info) != 0)
 			return;
@@ -675,8 +738,9 @@ ncp_read_volume_list(struct file *filp, void *dirent, filldir_t filldir,
 				info.volume_name);
 			continue;
 		}
+		inval_dentry = ncp_update_known_namespace(server, entry.i.volNumber, NULL);
 		entry.volume = entry.i.volNumber;
-		if (!ncp_fill_cache(filp, dirent, filldir, ctl, &entry))
+		if (!ncp_fill_cache(filp, dirent, filldir, ctl, &entry, inval_dentry))
 			return;
 	}
 }
@@ -739,7 +803,7 @@ ncp_do_readdir(struct file *filp, void *dirent, filldir_t filldir,
 			rpl += onerpl;
 			rpls -= onerpl;
 			entry.volume = entry.i.volNumber;
-			if (!ncp_fill_cache(filp, dirent, filldir, ctl, &entry))
+			if (!ncp_fill_cache(filp, dirent, filldir, ctl, &entry, 0))
 				break;
 		}
 	} while (more);
@@ -775,17 +839,19 @@ int ncp_conn_logged_in(struct super_block *sb)
 		if (dent) {
 			struct inode* ino = dent->d_inode;
 			if (ino) {
+				ncp_update_known_namespace(server, volNumber, NULL);
 				NCP_FINFO(ino)->volNumber = volNumber;
 				NCP_FINFO(ino)->dirEntNum = dirEntNum;
 				NCP_FINFO(ino)->DosDirNum = DosDirNum;
+				result = 0;
 			} else {
 				DPRINTK("ncpfs: sb->s_root->d_inode == NULL!\n");
 			}
 		} else {
 			DPRINTK("ncpfs: sb->s_root == NULL!\n");
 		}
-	}
-	result = 0;
+	} else
+		result = 0;
 
 out:
 	return result;
@@ -799,7 +865,6 @@ static struct dentry *ncp_lookup(struct inode *dir, struct dentry *dentry, struc
 	int error, res, len;
 	__u8 __name[NCP_MAXPATHLEN + 1];
 
-	lock_kernel();
 	error = -EIO;
 	if (!ncp_conn_valid(server))
 		goto finished;
@@ -813,6 +878,8 @@ static struct dentry *ncp_lookup(struct inode *dir, struct dentry *dentry, struc
 				 dentry->d_name.len, 1);
 		if (!res)
 			res = ncp_lookup_volume(server, __name, &(finfo.i));
+			if (!res)
+				ncp_update_known_namespace(server, finfo.i.volNumber, NULL);
 	} else {
 		res = ncp_io2vol(server, __name, &len, dentry->d_name.name,
 				 dentry->d_name.len, !ncp_preserve_case(dir));
@@ -846,7 +913,6 @@ add_entry:
 
 finished:
 	PPRINTK("ncp_lookup: result=%d\n", error);
-	unlock_kernel();
 	return ERR_PTR(error);
 }
 
@@ -887,11 +953,6 @@ int ncp_create_new(struct inode *dir, struct dentry *dentry, int mode,
 	PPRINTK("ncp_create_new: creating %s/%s, mode=%x\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name, mode);
 
-	error = -EIO;
-	lock_kernel();
-	if (!ncp_conn_valid(server))
-		goto out;
-
 	ncp_age_dentry(server, dentry);
 	len = sizeof(__name);
 	error = ncp_io2vol(server, __name, &len, dentry->d_name.name,
@@ -917,6 +978,8 @@ int ncp_create_new(struct inode *dir, struct dentry *dentry, int mode,
 		if (result) {
 			if (result == 0x87)
 				error = -ENAMETOOLONG;
+			else if (result < 0)
+				error = result;
 			DPRINTK("ncp_create: %s/%s failed\n",
 				dentry->d_parent->d_name.name, dentry->d_name.name);
 			goto out;
@@ -935,7 +998,6 @@ int ncp_create_new(struct inode *dir, struct dentry *dentry, int mode,
 
 	error = ncp_instantiate(dir, dentry, &finfo);
 out:
-	unlock_kernel();
 	return error;
 }
 
@@ -955,11 +1017,6 @@ static int ncp_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	DPRINTK("ncp_mkdir: making %s/%s\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name);
 
-	error = -EIO;
-	lock_kernel();
-	if (!ncp_conn_valid(server))
-		goto out;
-
 	ncp_age_dentry(server, dentry);
 	len = sizeof(__name);
 	error = ncp_io2vol(server, __name, &len, dentry->d_name.name,
@@ -967,12 +1024,11 @@ static int ncp_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	if (error)
 		goto out;
 
-	error = -EACCES;
-	if (ncp_open_create_file_or_subdir(server, dir, __name,
+	error = ncp_open_create_file_or_subdir(server, dir, __name,
 					   OC_MODE_CREATE, aDIR,
 					   cpu_to_le16(0xffff),
-					   &finfo) == 0)
-	{
+					   &finfo);
+	if (error == 0) {
 		if (ncp_is_nfs_extras(server, finfo.volume)) {
 			mode |= S_IFDIR;
 			finfo.i.nfs.mode = mode;
@@ -983,9 +1039,10 @@ static int ncp_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 				goto out;
 		}
 		error = ncp_instantiate(dir, dentry, &finfo);
+	} else if (error > 0) {
+		error = -EACCES;
 	}
 out:
-	unlock_kernel();
 	return error;
 }
 
@@ -998,11 +1055,6 @@ static int ncp_rmdir(struct inode *dir, struct dentry *dentry)
 	DPRINTK("ncp_rmdir: removing %s/%s\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name);
 
-	error = -EIO;
-	lock_kernel();
-	if (!ncp_conn_valid(server))
-		goto out;
-
 	error = -EBUSY;
 	if (!d_unhashed(dentry))
 		goto out;
@@ -1036,11 +1088,10 @@ static int ncp_rmdir(struct inode *dir, struct dentry *dentry)
 			error = -ENOENT;
 			break;
 		default:
-			error = -EACCES;
+			error = result < 0 ? result : -EACCES;
 			break;
        	}
 out:
-	unlock_kernel();
 	return error;
 }
 
@@ -1050,15 +1101,10 @@ static int ncp_unlink(struct inode *dir, struct dentry *dentry)
 	struct ncp_server *server;
 	int error;
 
-	lock_kernel();
 	server = NCP_SERVER(dir);
 	DPRINTK("ncp_unlink: unlinking %s/%s\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name);
 	
-	error = -EIO;
-	if (!ncp_conn_valid(server))
-		goto out;
-
 	/*
 	 * Check whether to close the file ...
 	 */
@@ -1097,12 +1143,9 @@ static int ncp_unlink(struct inode *dir, struct dentry *dentry)
 			error = -ENOENT;
 			break;
 		default:
-			error = -EACCES;
+			error = error < 0 ? error : -EACCES;
 			break;
 	}
-		
-out:
-	unlock_kernel();
 	return error;
 }
 
@@ -1118,11 +1161,6 @@ static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry,
 		old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
 		new_dentry->d_parent->d_name.name, new_dentry->d_name.name);
 
-	error = -EIO;
-	lock_kernel();
-	if (!ncp_conn_valid(server))
-		goto out;
-
 	ncp_age_dentry(server, old_dentry);
 	ncp_age_dentry(server, new_dentry);
 
@@ -1161,11 +1199,10 @@ static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry,
 			error = -ENOENT;
 			break;
 		default:
-			error = -EACCES;
+			error = error < 0 ? error : -EACCES;
 			break;
 	}
 out:
-	unlock_kernel();
 	return error;
 }
 
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index 3639cc5cbda..6c754f70c52 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -113,9 +113,6 @@ ncp_file_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 	DPRINTK("ncp_file_read: enter %s/%s\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name);
 
-	if (!ncp_conn_valid(NCP_SERVER(inode)))
-		return -EIO;
-
 	pos = *ppos;
 
 	if ((ssize_t) count < 0) {
@@ -192,13 +189,11 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t *
 
 	DPRINTK("ncp_file_write: enter %s/%s\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name);
-	if (!ncp_conn_valid(NCP_SERVER(inode)))
-		return -EIO;
 	if ((ssize_t) count < 0)
 		return -EINVAL;
 	pos = *ppos;
 	if (file->f_flags & O_APPEND) {
-		pos = inode->i_size;
+		pos = i_size_read(inode);
 	}
 
 	if (pos + count > MAX_NON_LFS && !(file->f_flags&O_LARGEFILE)) {
@@ -264,8 +259,11 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t *
 
 	*ppos = pos;
 
-	if (pos > inode->i_size) {
-		inode->i_size = pos;
+	if (pos > i_size_read(inode)) {
+		mutex_lock(&inode->i_mutex);
+		if (pos > i_size_read(inode))
+			i_size_write(inode, pos);
+		mutex_unlock(&inode->i_mutex);
 	}
 	DPRINTK("ncp_file_write: exit %s/%s\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name);
@@ -281,18 +279,9 @@ static int ncp_release(struct inode *inode, struct file *file) {
 	return 0;
 }
 
-static loff_t ncp_remote_llseek(struct file *file, loff_t offset, int origin)
-{
-	loff_t ret;
-	lock_kernel();
-	ret = generic_file_llseek_unlocked(file, offset, origin);
-	unlock_kernel();
-	return ret;
-}
-
 const struct file_operations ncp_file_operations =
 {
-	.llseek 	= ncp_remote_llseek,
+	.llseek		= generic_file_llseek,
 	.read		= ncp_file_read,
 	.write		= ncp_file_write,
 	.unlocked_ioctl	= ncp_ioctl,
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index fa338515402..d290545aa0c 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -43,7 +43,7 @@
 #define NCP_DEFAULT_TIME_OUT 10
 #define NCP_DEFAULT_RETRY_COUNT 20
 
-static void ncp_delete_inode(struct inode *);
+static void ncp_evict_inode(struct inode *);
 static void ncp_put_super(struct super_block *);
 static int  ncp_statfs(struct dentry *, struct kstatfs *);
 static int  ncp_show_options(struct seq_file *, struct vfsmount *);
@@ -100,7 +100,7 @@ static const struct super_operations ncp_sops =
 	.alloc_inode	= ncp_alloc_inode,
 	.destroy_inode	= ncp_destroy_inode,
 	.drop_inode	= generic_delete_inode,
-	.delete_inode	= ncp_delete_inode,
+	.evict_inode	= ncp_evict_inode,
 	.put_super	= ncp_put_super,
 	.statfs		= ncp_statfs,
 	.remount_fs	= ncp_remount,
@@ -139,7 +139,7 @@ static void ncp_update_dates(struct inode *inode, struct nw_info_struct *nwi)
 		inode->i_mode = nwi->nfs.mode;
 	}
 
-	inode->i_blocks = (inode->i_size + NCP_BLOCK_SIZE - 1) >> NCP_BLOCK_SHIFT;
+	inode->i_blocks = (i_size_read(inode) + NCP_BLOCK_SIZE - 1) >> NCP_BLOCK_SHIFT;
 
 	inode->i_mtime.tv_sec = ncp_date_dos2unix(nwi->modifyTime, nwi->modifyDate);
 	inode->i_ctime.tv_sec = ncp_date_dos2unix(nwi->creationTime, nwi->creationDate);
@@ -158,18 +158,21 @@ static void ncp_update_attrs(struct inode *inode, struct ncp_entry_info *nwinfo)
 		inode->i_mode = server->m.dir_mode;
 		/* for directories dataStreamSize seems to be some
 		   Object ID ??? */
-		inode->i_size = NCP_BLOCK_SIZE;
+		i_size_write(inode, NCP_BLOCK_SIZE);
 	} else {
+		u32 size;
+
 		inode->i_mode = server->m.file_mode;
-		inode->i_size = le32_to_cpu(nwi->dataStreamSize);
+		size = le32_to_cpu(nwi->dataStreamSize);
+		i_size_write(inode, size);
 #ifdef CONFIG_NCPFS_EXTRAS
 		if ((server->m.flags & (NCP_MOUNT_EXTRAS|NCP_MOUNT_SYMLINKS)) 
 		 && (nwi->attributes & aSHARED)) {
 			switch (nwi->attributes & (aHIDDEN|aSYSTEM)) {
 				case aHIDDEN:
 					if (server->m.flags & NCP_MOUNT_SYMLINKS) {
-						if (/* (inode->i_size >= NCP_MIN_SYMLINK_SIZE)
-						 && */ (inode->i_size <= NCP_MAX_SYMLINK_SIZE)) {
+						if (/* (size >= NCP_MIN_SYMLINK_SIZE)
+						 && */ (size <= NCP_MAX_SYMLINK_SIZE)) {
 							inode->i_mode = (inode->i_mode & ~S_IFMT) | S_IFLNK;
 							NCP_FINFO(inode)->flags |= NCPI_KLUDGE_SYMLINK;
 							break;
@@ -208,7 +211,7 @@ void ncp_update_inode2(struct inode* inode, struct ncp_entry_info *nwinfo)
 }
 
 /*
- * Fill in the inode based on the ncp_entry_info structure.
+ * Fill in the inode based on the ncp_entry_info structure.  Used only for brand new inodes.
  */
 static void ncp_set_attr(struct inode *inode, struct ncp_entry_info *nwinfo)
 {
@@ -254,6 +257,7 @@ ncp_iget(struct super_block *sb, struct ncp_entry_info *info)
 	if (inode) {
 		atomic_set(&NCP_FINFO(inode)->opened, info->opened);
 
+		inode->i_mapping->backing_dev_info = sb->s_bdi;
 		inode->i_ino = info->ino;
 		ncp_set_attr(inode, info);
 		if (S_ISREG(inode->i_mode)) {
@@ -282,27 +286,29 @@ ncp_iget(struct super_block *sb, struct ncp_entry_info *info)
 }
 
 static void
-ncp_delete_inode(struct inode *inode)
+ncp_evict_inode(struct inode *inode)
 {
 	truncate_inode_pages(&inode->i_data, 0);
+	end_writeback(inode);
 
 	if (S_ISDIR(inode->i_mode)) {
-		DDPRINTK("ncp_delete_inode: put directory %ld\n", inode->i_ino);
+		DDPRINTK("ncp_evict_inode: put directory %ld\n", inode->i_ino);
 	}
 
 	if (ncp_make_closed(inode) != 0) {
 		/* We can't do anything but complain. */
-		printk(KERN_ERR "ncp_delete_inode: could not close\n");
+		printk(KERN_ERR "ncp_evict_inode: could not close\n");
 	}
-	clear_inode(inode);
 }
 
 static void ncp_stop_tasks(struct ncp_server *server) {
 	struct sock* sk = server->ncp_sock->sk;
-		
+
+	lock_sock(sk);
 	sk->sk_error_report = server->error_report;
 	sk->sk_data_ready   = server->data_ready;
 	sk->sk_write_space  = server->write_space;
+	release_sock(sk);
 	del_timer_sync(&server->timeout_tm);
 	flush_scheduled_work();
 }
@@ -565,10 +571,12 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
 /*	server->conn_status = 0;	*/
 /*	server->root_dentry = NULL;	*/
 /*	server->root_setuped = 0;	*/
+	mutex_init(&server->root_setup_lock);
 #ifdef CONFIG_NCPFS_PACKET_SIGNING
 /*	server->sign_wanted = 0;	*/
 /*	server->sign_active = 0;	*/
 #endif
+	init_rwsem(&server->auth_rwsem);
 	server->auth.auth_type = NCP_AUTH_NONE;
 /*	server->auth.object_name_len = 0;	*/
 /*	server->auth.object_name = NULL;	*/
@@ -593,16 +601,12 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
 	server->nls_io = load_nls_default();
 #endif /* CONFIG_NCPFS_NLS */
 
-	server->dentry_ttl = 0;	/* no caching */
+	atomic_set(&server->dentry_ttl, 0);	/* no caching */
 
 	INIT_LIST_HEAD(&server->tx.requests);
 	mutex_init(&server->rcv.creq_mutex);
 	server->tx.creq		= NULL;
 	server->rcv.creq	= NULL;
-	server->data_ready	= sock->sk->sk_data_ready;
-	server->write_space	= sock->sk->sk_write_space;
-	server->error_report	= sock->sk->sk_error_report;
-	sock->sk->sk_user_data	= server;
 
 	init_timer(&server->timeout_tm);
 #undef NCP_PACKET_SIZE
@@ -619,6 +623,11 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
 	if (server->rxbuf == NULL)
 		goto out_txbuf;
 
+	lock_sock(sock->sk);
+	server->data_ready	= sock->sk->sk_data_ready;
+	server->write_space	= sock->sk->sk_write_space;
+	server->error_report	= sock->sk->sk_error_report;
+	sock->sk->sk_user_data	= server;
 	sock->sk->sk_data_ready	  = ncp_tcp_data_ready;
 	sock->sk->sk_error_report = ncp_tcp_error_report;
 	if (sock->type == SOCK_STREAM) {
@@ -634,6 +643,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
 		server->timeout_tm.data = (unsigned long)server;
 		server->timeout_tm.function = ncpdgram_timeout_call;
 	}
+	release_sock(sock->sk);
 
 	ncp_lock_server(server);
 	error = ncp_connect(server);
@@ -658,8 +668,10 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
 				goto out_disconnect;
 			}
 		}
+		ncp_lock_server(server);
 		if (options & 2)
 			server->sign_wanted = 1;
+		ncp_unlock_server(server);
 	}
 	else 
 #endif	/* CONFIG_NCPFS_PACKET_SIGNING */
@@ -720,6 +732,9 @@ out_nls:
 	unload_nls(server->nls_io);
 	unload_nls(server->nls_vol);
 #endif
+	mutex_destroy(&server->rcv.creq_mutex);
+	mutex_destroy(&server->root_setup_lock);
+	mutex_destroy(&server->mutex);
 out_fput2:
 	if (server->info_filp)
 		fput(server->info_filp);
@@ -728,8 +743,8 @@ out_fput:
 out_bdi:
 	/* 23/12/1998 Marcin Dalecki <dalecki@cs.net.pl>:
 	 * 
-	 * The previously used put_filp(ncp_filp); was bogous, since
-	 * it doesn't proper unlocking.
+	 * The previously used put_filp(ncp_filp); was bogus, since
+	 * it doesn't perform proper unlocking.
 	 */
 	fput(ncp_filp);
 out:
@@ -743,8 +758,6 @@ static void ncp_put_super(struct super_block *sb)
 {
 	struct ncp_server *server = NCP_SBP(sb);
 
-	lock_kernel();
-
 	ncp_lock_server(server);
 	ncp_disconnect(server);
 	ncp_unlock_server(server);
@@ -756,6 +769,9 @@ static void ncp_put_super(struct super_block *sb)
 	unload_nls(server->nls_vol);
 	unload_nls(server->nls_io);
 #endif /* CONFIG_NCPFS_NLS */
+	mutex_destroy(&server->rcv.creq_mutex);
+	mutex_destroy(&server->root_setup_lock);
+	mutex_destroy(&server->mutex);
 
 	if (server->info_filp)
 		fput(server->info_filp);
@@ -771,8 +787,6 @@ static void ncp_put_super(struct super_block *sb)
 	vfree(server->packet);
 	sb->s_fs_info = NULL;
 	kfree(server);
-
-	unlock_kernel();
 }
 
 static int ncp_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -851,10 +865,8 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr)
 
 	result = -EIO;
 
-	lock_kernel();	
-
 	server = NCP_SERVER(inode);
-	if ((!server) || !ncp_conn_valid(server))
+	if (!server)	/* How this could happen? */
 		goto out;
 
 	/* ageing the dentry to force validation */
@@ -924,9 +936,8 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr)
 				tmpattr.ia_valid = ATTR_MODE;
 				tmpattr.ia_mode = attr->ia_mode;
 
-				result = inode_setattr(inode, &tmpattr);
-				if (result)
-					goto out;
+				setattr_copy(inode, &tmpattr);
+				mark_inode_dirty(inode);
 			}
 		}
 #endif
@@ -954,15 +965,12 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr)
 		result = ncp_make_closed(inode);
 		if (result)
 			goto out;
-		{
-			struct iattr tmpattr;
-			
-			tmpattr.ia_valid = ATTR_SIZE;
-			tmpattr.ia_size = attr->ia_size;
-			
-			result = inode_setattr(inode, &tmpattr);
+
+		if (attr->ia_size != i_size_read(inode)) {
+			result = vmtruncate(inode, attr->ia_size);
 			if (result)
 				goto out;
+			mark_inode_dirty(inode);
 		}
 	}
 	if ((attr->ia_valid & ATTR_CTIME) != 0) {
@@ -985,8 +993,6 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr)
 		result = ncp_modify_file_or_subdir_dos_info(NCP_SERVER(inode),
 				      inode, info_mask, &info);
 		if (result != 0) {
-			result = -EACCES;
-
 			if (info_mask == (DM_CREATE_TIME | DM_CREATE_DATE)) {
 				/* NetWare seems not to allow this. I
 				   do not know why. So, just tell the
@@ -1002,23 +1008,28 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr)
 			NCP_FINFO(inode)->nwattr = info.attributes;
 #endif
 	}
-	if (!result)
-		result = inode_setattr(inode, attr);
+	if (result)
+		goto out;
+
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+
 out:
-	unlock_kernel();
+	if (result > 0)
+		result = -EACCES;
 	return result;
 }
 
-static int ncp_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *ncp_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_nodev(fs_type, flags, data, ncp_fill_super, mnt);
+	return mount_nodev(fs_type, flags, data, ncp_fill_super);
 }
 
 static struct file_system_type ncp_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "ncpfs",
-	.get_sb		= ncp_get_sb,
+	.mount		= ncp_mount,
 	.kill_sb	= kill_anon_super,
 	.fs_flags	= FS_BINARY_MOUNTDATA,
 };
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index 023c03d0207..c2a1f9a155c 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -20,7 +20,6 @@
 #include <linux/smp_lock.h>
 #include <linux/vmalloc.h>
 #include <linux/sched.h>
-#include <linux/smp_lock.h>
 
 #include <linux/ncp_fs.h>
 
@@ -36,16 +35,11 @@
 #define NCP_PACKET_SIZE_INTERNAL 65536
 
 static int
-ncp_get_fs_info(struct ncp_server * server, struct file *file,
+ncp_get_fs_info(struct ncp_server * server, struct inode *inode,
 		struct ncp_fs_info __user *arg)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
 	struct ncp_fs_info info;
 
-	if (file_permission(file, MAY_WRITE) != 0
-	    && current_uid() != server->m.mounted_uid)
-		return -EACCES;
-
 	if (copy_from_user(&info, arg, sizeof(info)))
 		return -EFAULT;
 
@@ -66,16 +60,11 @@ ncp_get_fs_info(struct ncp_server * server, struct file *file,
 }
 
 static int
-ncp_get_fs_info_v2(struct ncp_server * server, struct file *file,
+ncp_get_fs_info_v2(struct ncp_server * server, struct inode *inode,
 		   struct ncp_fs_info_v2 __user * arg)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
 	struct ncp_fs_info_v2 info2;
 
-	if (file_permission(file, MAY_WRITE) != 0
-	    && current_uid() != server->m.mounted_uid)
-		return -EACCES;
-
 	if (copy_from_user(&info2, arg, sizeof(info2)))
 		return -EFAULT;
 
@@ -137,16 +126,11 @@ struct compat_ncp_privatedata_ioctl
 #define NCP_IOC_SETPRIVATEDATA_32	_IOR('n', 10, struct compat_ncp_privatedata_ioctl)
 
 static int
-ncp_get_compat_fs_info_v2(struct ncp_server * server, struct file *file,
+ncp_get_compat_fs_info_v2(struct ncp_server * server, struct inode *inode,
 		   struct compat_ncp_fs_info_v2 __user * arg)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
 	struct compat_ncp_fs_info_v2 info2;
 
-	if (file_permission(file, MAY_WRITE) != 0
-	    && current_uid() != server->m.mounted_uid)
-		return -EACCES;
-
 	if (copy_from_user(&info2, arg, sizeof(info2)))
 		return -EFAULT;
 
@@ -183,11 +167,8 @@ ncp_set_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg)
 	struct nls_table *iocharset;
 	struct nls_table *oldset_io;
 	struct nls_table *oldset_cp;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EACCES;
-	if (server->root_setuped)
-		return -EBUSY;
+	int utf8;
+	int err;
 
 	if (copy_from_user(&user, arg, sizeof(user)))
 		return -EFAULT;
@@ -207,28 +188,40 @@ ncp_set_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg)
 	user.iocharset[NCP_IOCSNAME_LEN] = 0;
 	if (!user.iocharset[0] || !strcmp(user.iocharset, "default")) {
 		iocharset = load_nls_default();
-		NCP_CLR_FLAG(server, NCP_FLAG_UTF8);
+		utf8 = 0;
 	} else if (!strcmp(user.iocharset, "utf8")) {
 		iocharset = load_nls_default();
-		NCP_SET_FLAG(server, NCP_FLAG_UTF8);
+		utf8 = 1;
 	} else {
 		iocharset = load_nls(user.iocharset);
 		if (!iocharset) {
 			unload_nls(codepage);
 			return -EBADRQC;
 		}
-		NCP_CLR_FLAG(server, NCP_FLAG_UTF8);
+		utf8 = 0;
 	}
 
-	oldset_cp = server->nls_vol;
-	server->nls_vol = codepage;
-	oldset_io = server->nls_io;
-	server->nls_io = iocharset;
-
+	mutex_lock(&server->root_setup_lock);
+	if (server->root_setuped) {
+		oldset_cp = codepage;
+		oldset_io = iocharset;
+		err = -EBUSY;
+	} else {
+		if (utf8)
+			NCP_SET_FLAG(server, NCP_FLAG_UTF8);
+		else
+			NCP_CLR_FLAG(server, NCP_FLAG_UTF8);
+		oldset_cp = server->nls_vol;
+		server->nls_vol = codepage;
+		oldset_io = server->nls_io;
+		server->nls_io = iocharset;
+		err = 0;
+	}
+	mutex_unlock(&server->root_setup_lock);
 	unload_nls(oldset_cp);
 	unload_nls(oldset_io);
 
-	return 0;
+	return err;
 }
 
 static int
@@ -238,6 +231,7 @@ ncp_get_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg)
 	int len;
 
 	memset(&user, 0, sizeof(user));
+	mutex_lock(&server->root_setup_lock);
 	if (server->nls_vol && server->nls_vol->charset) {
 		len = strlen(server->nls_vol->charset);
 		if (len > NCP_IOCSNAME_LEN)
@@ -255,6 +249,7 @@ ncp_get_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg)
 		strncpy(user.iocharset,	server->nls_io->charset, len);
 		user.iocharset[len] = 0;
 	}
+	mutex_unlock(&server->root_setup_lock);
 
 	if (copy_to_user(arg, &user, sizeof(user)))
 		return -EFAULT;
@@ -262,25 +257,19 @@ ncp_get_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg)
 }
 #endif /* CONFIG_NCPFS_NLS */
 
-static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg)
 {
-	struct inode *inode = filp->f_dentry->d_inode;
 	struct ncp_server *server = NCP_SERVER(inode);
 	int result;
 	struct ncp_ioctl_request request;
 	char* bouncebuffer;
 	void __user *argp = (void __user *)arg;
-	uid_t uid = current_uid();
 
 	switch (cmd) {
 #ifdef CONFIG_COMPAT
 	case NCP_IOC_NCPREQUEST_32:
 #endif
 	case NCP_IOC_NCPREQUEST:
-		if (file_permission(filp, MAY_WRITE) != 0
-		    && uid != server->m.mounted_uid)
-			return -EACCES;
-
 #ifdef CONFIG_COMPAT
 		if (cmd == NCP_IOC_NCPREQUEST_32) {
 			struct compat_ncp_ioctl_request request32;
@@ -315,7 +304,7 @@ static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		server->current_size = request.size;
 		memcpy(server->packet, bouncebuffer, request.size);
 
-		result = ncp_request2(server, request.function, 
+		result = ncp_request2(server, request.function,
 			bouncebuffer, NCP_PACKET_SIZE_INTERNAL);
 		if (result < 0)
 			result = -EIO;
@@ -332,69 +321,69 @@ static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 
 	case NCP_IOC_CONN_LOGGED_IN:
 
-		if (!capable(CAP_SYS_ADMIN))
-			return -EACCES;
 		if (!(server->m.int_flags & NCP_IMOUNT_LOGGEDIN_POSSIBLE))
 			return -EINVAL;
+		mutex_lock(&server->root_setup_lock);
 		if (server->root_setuped)
-			return -EBUSY;
-		server->root_setuped = 1;
-		return ncp_conn_logged_in(inode->i_sb);
+			result = -EBUSY;
+		else {
+			result = ncp_conn_logged_in(inode->i_sb);
+			if (result == 0)
+				server->root_setuped = 1;
+		}
+		mutex_unlock(&server->root_setup_lock);
+		return result;
 
 	case NCP_IOC_GET_FS_INFO:
-		return ncp_get_fs_info(server, filp, argp);
+		return ncp_get_fs_info(server, inode, argp);
 
 	case NCP_IOC_GET_FS_INFO_V2:
-		return ncp_get_fs_info_v2(server, filp, argp);
+		return ncp_get_fs_info_v2(server, inode, argp);
 
 #ifdef CONFIG_COMPAT
 	case NCP_IOC_GET_FS_INFO_V2_32:
-		return ncp_get_compat_fs_info_v2(server, filp, argp);
+		return ncp_get_compat_fs_info_v2(server, inode, argp);
 #endif
 	/* we have too many combinations of CONFIG_COMPAT,
 	 * CONFIG_64BIT and CONFIG_UID16, so just handle
 	 * any of the possible ioctls */
 	case NCP_IOC_GETMOUNTUID16:
-	case NCP_IOC_GETMOUNTUID32:
-	case NCP_IOC_GETMOUNTUID64:
-		if (file_permission(filp, MAY_READ) != 0
-			&& uid != server->m.mounted_uid)
-			return -EACCES;
-
-		if (cmd == NCP_IOC_GETMOUNTUID16) {
+		{
 			u16 uid;
+
 			SET_UID(uid, server->m.mounted_uid);
 			if (put_user(uid, (u16 __user *)argp))
 				return -EFAULT;
-		} else if (cmd == NCP_IOC_GETMOUNTUID32) {
-			if (put_user(server->m.mounted_uid,
-						(u32 __user *)argp))
-				return -EFAULT;
-		} else {
-			if (put_user(server->m.mounted_uid,
-						(u64 __user *)argp))
-				return -EFAULT;
+			return 0;
 		}
+	case NCP_IOC_GETMOUNTUID32:
+		if (put_user(server->m.mounted_uid,
+			     (u32 __user *)argp))
+			return -EFAULT;
+		return 0;
+	case NCP_IOC_GETMOUNTUID64:
+		if (put_user(server->m.mounted_uid,
+			     (u64 __user *)argp))
+			return -EFAULT;
 		return 0;
 
 	case NCP_IOC_GETROOT:
 		{
 			struct ncp_setroot_ioctl sr;
 
-			if (file_permission(filp, MAY_READ) != 0
-			    && uid != server->m.mounted_uid)
-				return -EACCES;
-
+			result = -EACCES;
+			mutex_lock(&server->root_setup_lock);
 			if (server->m.mounted_vol[0]) {
 				struct dentry* dentry = inode->i_sb->s_root;
 
 				if (dentry) {
 					struct inode* s_inode = dentry->d_inode;
-				
+
 					if (s_inode) {
 						sr.volNumber = NCP_FINFO(s_inode)->volNumber;
 						sr.dirEntNum = NCP_FINFO(s_inode)->dirEntNum;
 						sr.namespace = server->name_space[sr.volNumber];
+						result = 0;
 					} else
 						DPRINTK("ncpfs: s_root->d_inode==NULL\n");
 				} else
@@ -403,10 +392,12 @@ static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 				sr.volNumber = -1;
 				sr.namespace = 0;
 				sr.dirEntNum = 0;
+				result = 0;
 			}
-			if (copy_to_user(argp, &sr, sizeof(sr)))
-				return -EFAULT;
-			return 0;
+			mutex_unlock(&server->root_setup_lock);
+			if (!result && copy_to_user(argp, &sr, sizeof(sr)))
+				result = -EFAULT;
+			return result;
 		}
 
 	case NCP_IOC_SETROOT:
@@ -417,103 +408,114 @@ static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 			__le32 dosde;
 			struct dentry* dentry;
 
-			if (!capable(CAP_SYS_ADMIN))
-			{
-				return -EACCES;
-			}
-			if (server->root_setuped) return -EBUSY;
 			if (copy_from_user(&sr, argp, sizeof(sr)))
 				return -EFAULT;
-			if (sr.volNumber < 0) {
-				server->m.mounted_vol[0] = 0;
-				vnum = NCP_NUMBER_OF_VOLUMES;
-				de = 0;
-				dosde = 0;
-			} else if (sr.volNumber >= NCP_NUMBER_OF_VOLUMES) {
-				return -EINVAL;
-			} else if (ncp_mount_subdir(server, sr.volNumber,
-						sr.namespace, sr.dirEntNum,
-						&vnum, &de, &dosde)) {
-				return -ENOENT;
-			}
-			
-			dentry = inode->i_sb->s_root;
-			server->root_setuped = 1;
-			if (dentry) {
-				struct inode* s_inode = dentry->d_inode;
-				
-				if (s_inode) {
-					NCP_FINFO(s_inode)->volNumber = vnum;
-					NCP_FINFO(s_inode)->dirEntNum = de;
-					NCP_FINFO(s_inode)->DosDirNum = dosde;
+			mutex_lock(&server->root_setup_lock);
+			if (server->root_setuped)
+				result = -EBUSY;
+			else {
+				if (sr.volNumber < 0) {
+					server->m.mounted_vol[0] = 0;
+					vnum = NCP_NUMBER_OF_VOLUMES;
+					de = 0;
+					dosde = 0;
+					result = 0;
+				} else if (sr.volNumber >= NCP_NUMBER_OF_VOLUMES) {
+					result = -EINVAL;
+				} else if (ncp_mount_subdir(server, sr.volNumber,
+							sr.namespace, sr.dirEntNum,
+							&vnum, &de, &dosde)) {
+					result = -ENOENT;
 				} else
-					DPRINTK("ncpfs: s_root->d_inode==NULL\n");
-			} else
-				DPRINTK("ncpfs: s_root==NULL\n");
+					result = 0;
+
+				if (result == 0) {
+					dentry = inode->i_sb->s_root;
+					if (dentry) {
+						struct inode* s_inode = dentry->d_inode;
+
+						if (s_inode) {
+							NCP_FINFO(s_inode)->volNumber = vnum;
+							NCP_FINFO(s_inode)->dirEntNum = de;
+							NCP_FINFO(s_inode)->DosDirNum = dosde;
+							server->root_setuped = 1;
+						} else {
+							DPRINTK("ncpfs: s_root->d_inode==NULL\n");
+							result = -EIO;
+						}
+					} else {
+						DPRINTK("ncpfs: s_root==NULL\n");
+						result = -EIO;
+					}
+				}
+				result = 0;
+			}
+			mutex_unlock(&server->root_setup_lock);
 
-			return 0;
+			return result;
 		}
 
-#ifdef CONFIG_NCPFS_PACKET_SIGNING	
+#ifdef CONFIG_NCPFS_PACKET_SIGNING
 	case NCP_IOC_SIGN_INIT:
-		if (file_permission(filp, MAY_WRITE) != 0
-		    && uid != server->m.mounted_uid)
-			return -EACCES;
-
-		if (argp) {
-			if (server->sign_wanted)
-			{
-				struct ncp_sign_init sign;
+		{
+			struct ncp_sign_init sign;
 
+			if (argp)
 				if (copy_from_user(&sign, argp, sizeof(sign)))
 					return -EFAULT;
-				memcpy(server->sign_root,sign.sign_root,8);
-				memcpy(server->sign_last,sign.sign_last,16);
-				server->sign_active = 1;
+			ncp_lock_server(server);
+			mutex_lock(&server->rcv.creq_mutex);
+			if (argp) {
+				if (server->sign_wanted) {
+					memcpy(server->sign_root,sign.sign_root,8);
+					memcpy(server->sign_last,sign.sign_last,16);
+					server->sign_active = 1;
+				}
+				/* ignore when signatures not wanted */
+			} else {
+				server->sign_active = 0;
 			}
-			/* ignore when signatures not wanted */
-		} else {
-			server->sign_active = 0;
+			mutex_unlock(&server->rcv.creq_mutex);
+			ncp_unlock_server(server);
+			return 0;
 		}
-		return 0;		
-		
+
         case NCP_IOC_SIGN_WANTED:
-		if (file_permission(filp, MAY_READ) != 0
-		    && uid != server->m.mounted_uid)
-			return -EACCES;
-		
-                if (put_user(server->sign_wanted, (int __user *)argp))
-			return -EFAULT;
-                return 0;
+		{
+			int state;
+
+			ncp_lock_server(server);
+			state = server->sign_wanted;
+			ncp_unlock_server(server);
+			if (put_user(state, (int __user *)argp))
+				return -EFAULT;
+			return 0;
+		}
 
 	case NCP_IOC_SET_SIGN_WANTED:
 		{
 			int newstate;
 
-			if (file_permission(filp, MAY_WRITE) != 0
-			    && uid != server->m.mounted_uid)
-				return -EACCES;
-
 			/* get only low 8 bits... */
 			if (get_user(newstate, (unsigned char __user *)argp))
 				return -EFAULT;
+			result = 0;
+			ncp_lock_server(server);
 			if (server->sign_active) {
 				/* cannot turn signatures OFF when active */
-				if (!newstate) return -EINVAL;
+				if (!newstate)
+					result = -EINVAL;
 			} else {
 				server->sign_wanted = newstate != 0;
 			}
-			return 0;
+			ncp_unlock_server(server);
+			return result;
 		}
 
 #endif /* CONFIG_NCPFS_PACKET_SIGNING */
 
 #ifdef CONFIG_NCPFS_IOCTL_LOCKING
 	case NCP_IOC_LOCKUNLOCK:
-		if (file_permission(filp, MAY_WRITE) != 0
-		    && uid != server->m.mounted_uid)
-			return -EACCES;
-
 		{
 			struct ncp_lock_ioctl	 rqdata;
 
@@ -542,16 +544,13 @@ static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 			{
 				return result;
 			}
-			result = -EIO;
-			if (!ncp_conn_valid(server))
-				goto outrel;
 			result = -EISDIR;
 			if (!S_ISREG(inode->i_mode))
 				goto outrel;
 			if (rqdata.cmd == NCP_LOCK_CLEAR)
 			{
 				result = ncp_ClearPhysicalRecord(NCP_SERVER(inode),
-							NCP_FINFO(inode)->file_handle, 
+							NCP_FINFO(inode)->file_handle,
 							rqdata.offset,
 							rqdata.length);
 				if (result > 0) result = 0;	/* no such lock */
@@ -574,7 +573,7 @@ static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 							rqdata.timeout);
 				if (result > 0) result = -EAGAIN;
 			}
-outrel:			
+outrel:
 			ncp_inode_close(inode);
 			return result;
 		}
@@ -582,60 +581,62 @@ outrel:
 
 #ifdef CONFIG_COMPAT
 	case NCP_IOC_GETOBJECTNAME_32:
-		if (uid != server->m.mounted_uid)
-			return -EACCES;
 		{
 			struct compat_ncp_objectname_ioctl user;
 			size_t outl;
 
 			if (copy_from_user(&user, argp, sizeof(user)))
 				return -EFAULT;
+			down_read(&server->auth_rwsem);
 			user.auth_type = server->auth.auth_type;
 			outl = user.object_name_len;
 			user.object_name_len = server->auth.object_name_len;
 			if (outl > user.object_name_len)
 				outl = user.object_name_len;
+			result = 0;
 			if (outl) {
 				if (copy_to_user(compat_ptr(user.object_name),
 						 server->auth.object_name,
-						 outl)) return -EFAULT;
+						 outl))
+					result = -EFAULT;
 			}
-			if (copy_to_user(argp, &user, sizeof(user)))
-				return -EFAULT;
-			return 0;
+			up_read(&server->auth_rwsem);
+			if (!result && copy_to_user(argp, &user, sizeof(user)))
+				result = -EFAULT;
+			return result;
 		}
 #endif
 
 	case NCP_IOC_GETOBJECTNAME:
-		if (uid != server->m.mounted_uid)
-			return -EACCES;
 		{
 			struct ncp_objectname_ioctl user;
 			size_t outl;
 
 			if (copy_from_user(&user, argp, sizeof(user)))
 				return -EFAULT;
+			down_read(&server->auth_rwsem);
 			user.auth_type = server->auth.auth_type;
 			outl = user.object_name_len;
 			user.object_name_len = server->auth.object_name_len;
 			if (outl > user.object_name_len)
 				outl = user.object_name_len;
+			result = 0;
 			if (outl) {
 				if (copy_to_user(user.object_name,
 						 server->auth.object_name,
-						 outl)) return -EFAULT;
+						 outl))
+					result = -EFAULT;
 			}
-			if (copy_to_user(argp, &user, sizeof(user)))
-				return -EFAULT;
-			return 0;
+			up_read(&server->auth_rwsem);
+			if (!result && copy_to_user(argp, &user, sizeof(user)))
+				result = -EFAULT;
+			return result;
 		}
 
 #ifdef CONFIG_COMPAT
 	case NCP_IOC_SETOBJECTNAME_32:
 #endif
 	case NCP_IOC_SETOBJECTNAME:
-		if (uid != server->m.mounted_uid)
-			return -EACCES;
 		{
 			struct ncp_objectname_ioctl user;
 			void* newname;
@@ -667,9 +668,7 @@ outrel:
 			} else {
 				newname = NULL;
 			}
-			/* enter critical section */
-			/* maybe that kfree can sleep so do that this way */
-			/* it is at least more SMP friendly (in future...) */
+			down_write(&server->auth_rwsem);
 			oldname = server->auth.object_name;
 			oldnamelen = server->auth.object_name_len;
 			oldprivate = server->priv.data;
@@ -679,7 +678,7 @@ outrel:
 			server->auth.object_name = newname;
 			server->priv.len = 0;
 			server->priv.data = NULL;
-			/* leave critical section */
+			up_write(&server->auth_rwsem);
 			kfree(oldprivate);
 			kfree(oldname);
 			return 0;
@@ -689,8 +688,6 @@ outrel:
 	case NCP_IOC_GETPRIVATEDATA_32:
 #endif
 	case NCP_IOC_GETPRIVATEDATA:
-		if (uid != server->m.mounted_uid)
-			return -EACCES;
 		{
 			struct ncp_privatedata_ioctl user;
 			size_t outl;
@@ -707,14 +704,20 @@ outrel:
 			if (copy_from_user(&user, argp, sizeof(user)))
 				return -EFAULT;
 
+			down_read(&server->auth_rwsem);
 			outl = user.len;
 			user.len = server->priv.len;
 			if (outl > user.len) outl = user.len;
+			result = 0;
 			if (outl) {
 				if (copy_to_user(user.data,
 						 server->priv.data,
-						 outl)) return -EFAULT;
+						 outl))
+					result = -EFAULT;
 			}
+			up_read(&server->auth_rwsem);
+			if (result)
+				return result;
 #ifdef CONFIG_COMPAT
 			if (cmd == NCP_IOC_GETPRIVATEDATA_32) {
 				struct compat_ncp_privatedata_ioctl user32;
@@ -734,8 +737,6 @@ outrel:
 	case NCP_IOC_SETPRIVATEDATA_32:
 #endif
 	case NCP_IOC_SETPRIVATEDATA:
-		if (uid != server->m.mounted_uid)
-			return -EACCES;
 		{
 			struct ncp_privatedata_ioctl user;
 			void* new;
@@ -763,12 +764,12 @@ outrel:
 			} else {
 				new = NULL;
 			}
-			/* enter critical section */
+			down_write(&server->auth_rwsem);
 			old = server->priv.data;
 			oldlen = server->priv.len;
 			server->priv.len = user.len;
 			server->priv.data = new;
-			/* leave critical section */
+			up_write(&server->auth_rwsem);
 			kfree(old);
 			return 0;
 		}
@@ -776,17 +777,13 @@ outrel:
 #ifdef CONFIG_NCPFS_NLS
 	case NCP_IOC_SETCHARSETS:
 		return ncp_set_charsets(server, argp);
-		
+
 	case NCP_IOC_GETCHARSETS:
 		return ncp_get_charsets(server, argp);
 
 #endif /* CONFIG_NCPFS_NLS */
 
 	case NCP_IOC_SETDENTRYTTL:
-		if (file_permission(filp, MAY_WRITE) != 0 &&
-		    uid != server->m.mounted_uid)
-			return -EACCES;
-
 		{
 			u_int32_t user;
 
@@ -796,13 +793,13 @@ outrel:
 			if (user > 20000)
 				return -EINVAL;
 			user = (user * HZ) / 1000;
-			server->dentry_ttl = user;
+			atomic_set(&server->dentry_ttl, user);
 			return 0;
 		}
-		
+
 	case NCP_IOC_GETDENTRYTTL:
 		{
-			u_int32_t user = (server->dentry_ttl * 1000) / HZ;
+			u_int32_t user = (atomic_read(&server->dentry_ttl) * 1000) / HZ;
 			if (copy_to_user(argp, &user, sizeof(user)))
 				return -EFAULT;
 			return 0;
@@ -812,59 +809,103 @@ outrel:
 	return -EINVAL;
 }
 
-static int ncp_ioctl_need_write(unsigned int cmd)
+long ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
+	struct inode *inode = filp->f_dentry->d_inode;
+	struct ncp_server *server = NCP_SERVER(inode);
+	uid_t uid = current_uid();
+	int need_drop_write = 0;
+	long ret;
+
 	switch (cmd) {
-	case NCP_IOC_GET_FS_INFO:
-	case NCP_IOC_GET_FS_INFO_V2:
-	case NCP_IOC_NCPREQUEST:
-	case NCP_IOC_SETDENTRYTTL:
-	case NCP_IOC_SIGN_INIT:
-	case NCP_IOC_LOCKUNLOCK:
-	case NCP_IOC_SET_SIGN_WANTED:
-		return 1;
-	case NCP_IOC_GETOBJECTNAME:
-	case NCP_IOC_SETOBJECTNAME:
-	case NCP_IOC_GETPRIVATEDATA:
-	case NCP_IOC_SETPRIVATEDATA:
 	case NCP_IOC_SETCHARSETS:
-	case NCP_IOC_GETCHARSETS:
 	case NCP_IOC_CONN_LOGGED_IN:
-	case NCP_IOC_GETDENTRYTTL:
-	case NCP_IOC_GETMOUNTUID2:
-	case NCP_IOC_SIGN_WANTED:
-	case NCP_IOC_GETROOT:
 	case NCP_IOC_SETROOT:
-		return 0;
-	default:
-		/* unknown IOCTL command, assume write */
-		return 1;
+		if (!capable(CAP_SYS_ADMIN)) {
+			ret = -EACCES;
+			goto out;
+		}
+		break;
 	}
-}
-
-long ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
-{
-	long ret;
-
-	lock_kernel();
-	if (ncp_ioctl_need_write(cmd)) {
+	if (server->m.mounted_uid != uid) {
+		switch (cmd) {
 		/*
-		 * inside the ioctl(), any failures which
-		 * are because of file_permission() are
-		 * -EACCESS, so it seems consistent to keep
-		 *  that here.
+		 * Only mount owner can issue these ioctls.  Information
+		 * necessary to authenticate to other NDS servers are
+		 * stored here.
 		 */
-		if (mnt_want_write(filp->f_path.mnt)) {
+		case NCP_IOC_GETOBJECTNAME:
+		case NCP_IOC_SETOBJECTNAME:
+		case NCP_IOC_GETPRIVATEDATA:
+		case NCP_IOC_SETPRIVATEDATA:
+#ifdef CONFIG_COMPAT
+		case NCP_IOC_GETOBJECTNAME_32:
+		case NCP_IOC_SETOBJECTNAME_32:
+		case NCP_IOC_GETPRIVATEDATA_32:
+		case NCP_IOC_SETPRIVATEDATA_32:
+#endif
 			ret = -EACCES;
 			goto out;
+		/*
+		 * These require write access on the inode if user id
+		 * does not match.  Note that they do not write to the
+		 * file...  But old code did mnt_want_write, so I keep
+		 * it as is.  Of course not for mountpoint owner, as
+		 * that breaks read-only mounts altogether as ncpmount
+		 * needs working NCP_IOC_NCPREQUEST and
+		 * NCP_IOC_GET_FS_INFO.  Some of these codes (setdentryttl,
+		 * signinit, setsignwanted) should be probably restricted
+		 * to owner only, or even more to CAP_SYS_ADMIN).
+		 */
+		case NCP_IOC_GET_FS_INFO:
+		case NCP_IOC_GET_FS_INFO_V2:
+		case NCP_IOC_NCPREQUEST:
+		case NCP_IOC_SETDENTRYTTL:
+		case NCP_IOC_SIGN_INIT:
+		case NCP_IOC_LOCKUNLOCK:
+		case NCP_IOC_SET_SIGN_WANTED:
+#ifdef CONFIG_COMPAT
+		case NCP_IOC_GET_FS_INFO_V2_32:
+		case NCP_IOC_NCPREQUEST_32:
+#endif
+			ret = mnt_want_write_file(filp);
+			if (ret)
+				goto out;
+			need_drop_write = 1;
+			ret = inode_permission(inode, MAY_WRITE);
+			if (ret)
+				goto outDropWrite;
+			break;
+		/*
+		 * Read access required.
+		 */
+		case NCP_IOC_GETMOUNTUID16:
+		case NCP_IOC_GETMOUNTUID32:
+		case NCP_IOC_GETMOUNTUID64:
+		case NCP_IOC_GETROOT:
+		case NCP_IOC_SIGN_WANTED:
+			ret = inode_permission(inode, MAY_READ);
+			if (ret)
+				goto out;
+			break;
+		/*
+		 * Anybody can read these.
+		 */
+		case NCP_IOC_GETCHARSETS:
+		case NCP_IOC_GETDENTRYTTL:
+		default:
+		/* Three codes below are protected by CAP_SYS_ADMIN above. */
+		case NCP_IOC_SETCHARSETS:
+		case NCP_IOC_CONN_LOGGED_IN:
+		case NCP_IOC_SETROOT:
+			break;
 		}
 	}
-	ret = __ncp_ioctl(filp, cmd, arg);
-	if (ncp_ioctl_need_write(cmd))
+	ret = __ncp_ioctl(inode, cmd, arg);
+outDropWrite:
+	if (need_drop_write)
 		mnt_drop_write(filp->f_path.mnt);
-
 out:
-	unlock_kernel();
 	return ret;
 }
 
@@ -873,10 +914,8 @@ long ncp_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
 	long ret;
 
-	lock_kernel();
 	arg = (unsigned long) compat_ptr(arg);
 	ret = ncp_ioctl(file, cmd, arg);
-	unlock_kernel();
 	return ret;
 }
 #endif
diff --git a/fs/ncpfs/ncplib_kernel.c b/fs/ncpfs/ncplib_kernel.c
index 0ec6237a597..a95615a0b6a 100644
--- a/fs/ncpfs/ncplib_kernel.c
+++ b/fs/ncpfs/ncplib_kernel.c
@@ -107,17 +107,17 @@ ncp_reply_data(struct ncp_server *server, int offset)
 	return &(server->packet[sizeof(struct ncp_reply_header) + offset]);
 }
 
-static inline u8 BVAL(void *data)
+static inline u8 BVAL(const void *data)
 {
-	return *(u8 *)data;
+	return *(const u8 *)data;
 }
 
 static u8 ncp_reply_byte(struct ncp_server *server, int offset)
 {
-	return *(u8 *)ncp_reply_data(server, offset);
+	return *(const u8 *)ncp_reply_data(server, offset);
 }
 
-static inline u16 WVAL_LH(void *data)
+static inline u16 WVAL_LH(const void *data)
 {
 	return get_unaligned_le16(data);
 }
@@ -134,7 +134,7 @@ ncp_reply_be16(struct ncp_server *server, int offset)
 	return get_unaligned_be16(ncp_reply_data(server, offset));
 }
 
-static inline u32 DVAL_LH(void *data)
+static inline u32 DVAL_LH(const void *data)
 {
 	return get_unaligned_le32(data);
 }
@@ -349,9 +349,9 @@ int ncp_dirhandle_free(struct ncp_server* server, __u8 dirhandle) {
 	return result;
 }
 
-void ncp_extract_file_info(void *structure, struct nw_info_struct *target)
+void ncp_extract_file_info(const void *structure, struct nw_info_struct *target)
 {
-	__u8 *name_len;
+	const __u8 *name_len;
 	const int info_struct_size = offsetof(struct nw_info_struct, nameLen);
 
 	memcpy(target, structure, info_struct_size);
@@ -364,7 +364,7 @@ void ncp_extract_file_info(void *structure, struct nw_info_struct *target)
 }
 
 #ifdef CONFIG_NCPFS_NFS_NS
-static inline void ncp_extract_nfs_info(unsigned char *structure,
+static inline void ncp_extract_nfs_info(const unsigned char *structure,
 				 struct nw_nfs_info *target)
 {
 	target->mode = DVAL_LH(structure);
@@ -417,7 +417,7 @@ int ncp_obtain_nfs_info(struct ncp_server *server,
  * Returns information for a (one-component) name relative to
  * the specified directory.
  */
-int ncp_obtain_info(struct ncp_server *server, struct inode *dir, char *path,
+int ncp_obtain_info(struct ncp_server *server, struct inode *dir, const char *path,
 			struct nw_info_struct *target)
 {
 	__u8  volnum = NCP_FINFO(dir)->volNumber;
@@ -452,16 +452,16 @@ out:
 #ifdef CONFIG_NCPFS_NFS_NS
 static int
 ncp_obtain_DOS_dir_base(struct ncp_server *server,
-		__u8 volnum, __le32 dirent,
-		char *path, /* At most 1 component */
+		__u8 ns, __u8 volnum, __le32 dirent,
+		const char *path, /* At most 1 component */
 		__le32 *DOS_dir_base)
 {
 	int result;
 
 	ncp_init_request(server);
 	ncp_add_byte(server, 6); /* subfunction */
-	ncp_add_byte(server, server->name_space[volnum]);
-	ncp_add_byte(server, server->name_space[volnum]);
+	ncp_add_byte(server, ns);
+	ncp_add_byte(server, ns);
 	ncp_add_word(server, cpu_to_le16(0x8006)); /* get all */
 	ncp_add_dword(server, RIM_DIRECTORY);
 	ncp_add_handle_path(server, volnum, dirent, 1, path);
@@ -523,10 +523,27 @@ ncp_get_known_namespace(struct ncp_server *server, __u8 volume)
 #endif	/* defined(CONFIG_NCPFS_OS2_NS) || defined(CONFIG_NCPFS_NFS_NS) */
 }
 
+int
+ncp_update_known_namespace(struct ncp_server *server, __u8 volume, int *ret_ns)
+{
+	int ns = ncp_get_known_namespace(server, volume);
+
+	if (ret_ns)
+		*ret_ns = ns;
+
+	DPRINTK("lookup_vol: namespace[%d] = %d\n",
+		volume, server->name_space[volume]);
+
+	if (server->name_space[volume] == ns)
+		return 0;
+	server->name_space[volume] = ns;
+	return 1;
+}
+
 static int
 ncp_ObtainSpecificDirBase(struct ncp_server *server,
 		__u8 nsSrc, __u8 nsDst, __u8 vol_num, __le32 dir_base,
-		char *path, /* At most 1 component */
+		const char *path, /* At most 1 component */
 		__le32 *dirEntNum, __le32 *DosDirNum)
 {
 	int result;
@@ -560,14 +577,13 @@ ncp_mount_subdir(struct ncp_server *server,
 {
 	int dstNS;
 	int result;
-	
-	dstNS = ncp_get_known_namespace(server, volNumber);
+
+	ncp_update_known_namespace(server, volNumber, &dstNS);
 	if ((result = ncp_ObtainSpecificDirBase(server, srcNS, dstNS, volNumber, 
 				      dirEntNum, NULL, newDirEnt, newDosEnt)) != 0)
 	{
 		return result;
 	}
-	server->name_space[volNumber] = dstNS;
 	*volume = volNumber;
 	server->m.mounted_vol[1] = 0;
 	server->m.mounted_vol[0] = 'X';
@@ -575,11 +591,10 @@ ncp_mount_subdir(struct ncp_server *server,
 }
 
 int 
-ncp_get_volume_root(struct ncp_server *server, const char *volname,
-		    __u32* volume, __le32* dirent, __le32* dosdirent)
+ncp_get_volume_root(struct ncp_server *server,
+		    const char *volname, __u32* volume, __le32* dirent, __le32* dosdirent)
 {
 	int result;
-	__u8 volnum;
 
 	DPRINTK("ncp_get_volume_root: looking up vol %s\n", volname);
 
@@ -601,21 +616,14 @@ ncp_get_volume_root(struct ncp_server *server, const char *volname,
 		return result;
 	}
 	*dirent = *dosdirent = ncp_reply_dword(server, 4);
-	volnum = ncp_reply_byte(server, 8);
+	*volume = ncp_reply_byte(server, 8);
 	ncp_unlock_server(server);
-	*volume = volnum;
-
-	server->name_space[volnum] = ncp_get_known_namespace(server, volnum);
-
-	DPRINTK("lookup_vol: namespace[%d] = %d\n",
-		volnum, server->name_space[volnum]);
-
 	return 0;
 }
 
 int
-ncp_lookup_volume(struct ncp_server *server, const char *volname,
-		  struct nw_info_struct *target)
+ncp_lookup_volume(struct ncp_server *server,
+		  const char *volname, struct nw_info_struct *target)
 {
 	int result;
 
@@ -625,6 +633,7 @@ ncp_lookup_volume(struct ncp_server *server, const char *volname,
 	if (result) {
 		return result;
 	}
+	ncp_update_known_namespace(server, target->volNumber, NULL);
 	target->nameLen = strlen(volname);
 	memcpy(target->entryName, volname, target->nameLen+1);
 	target->attributes = aDIR;
@@ -676,8 +685,8 @@ int ncp_modify_nfs_info(struct ncp_server *server, __u8 volnum, __le32 dirent,
 {
 	int result = 0;
 
+	ncp_init_request(server);
 	if (server->name_space[volnum] == NW_NS_NFS) {
-		ncp_init_request(server);
 		ncp_add_byte(server, 25);	/* subfunction */
 		ncp_add_byte(server, server->name_space[volnum]);
 		ncp_add_byte(server, NW_NS_NFS);
@@ -690,8 +699,8 @@ int ncp_modify_nfs_info(struct ncp_server *server, __u8 volnum, __le32 dirent,
 		ncp_add_dword_lh(server, 1);	/* nlinks */
 		ncp_add_dword_lh(server, rdev);
 		result = ncp_request(server, 87);
-		ncp_unlock_server(server);
 	}
+	ncp_unlock_server(server);
 	return result;
 }
 #endif
@@ -700,7 +709,7 @@ int ncp_modify_nfs_info(struct ncp_server *server, __u8 volnum, __le32 dirent,
 static int
 ncp_DeleteNSEntry(struct ncp_server *server,
 		  __u8 have_dir_base, __u8 volnum, __le32 dirent,
-		  char* name, __u8 ns, __le16 attr)
+		  const char* name, __u8 ns, __le16 attr)
 {
 	int result;
 
@@ -734,23 +743,25 @@ ncp_del_file_or_subdir2(struct ncp_server *server,
 
 int
 ncp_del_file_or_subdir(struct ncp_server *server,
-		       struct inode *dir, char *name)
+		       struct inode *dir, const char *name)
 {
 	__u8  volnum = NCP_FINFO(dir)->volNumber;
 	__le32 dirent = NCP_FINFO(dir)->dirEntNum;
+	int name_space;
 
+	name_space = server->name_space[volnum];
 #ifdef CONFIG_NCPFS_NFS_NS
-	if (server->name_space[volnum]==NW_NS_NFS)
+	if (name_space == NW_NS_NFS)
  	{
  		int result;
  
- 		result=ncp_obtain_DOS_dir_base(server, volnum, dirent, name, &dirent);
+		result=ncp_obtain_DOS_dir_base(server, name_space, volnum, dirent, name, &dirent);
  		if (result) return result;
- 		return ncp_DeleteNSEntry(server, 1, volnum, dirent, NULL, NW_NS_DOS, cpu_to_le16(0x8006));
+		name = NULL;
+		name_space = NW_NS_DOS;
  	}
- 	else
 #endif	/* CONFIG_NCPFS_NFS_NS */
- 		return ncp_DeleteNSEntry(server, 1, volnum, dirent, name, server->name_space[volnum], cpu_to_le16(0x8006));
+	return ncp_DeleteNSEntry(server, 1, volnum, dirent, name, name_space, cpu_to_le16(0x8006));
 }
 
 static inline void ConvertToNWfromDWORD(__u16 v0, __u16 v1, __u8 ret[6])
@@ -765,7 +776,7 @@ static inline void ConvertToNWfromDWORD(__u16 v0, __u16 v1, __u8 ret[6])
 /* If both dir and name are NULL, then in target there's already a
    looked-up entry that wants to be opened. */
 int ncp_open_create_file_or_subdir(struct ncp_server *server,
-				   struct inode *dir, char *name,
+				   struct inode *dir, const char *name,
 				   int open_create_mode,
 				   __le32 create_attributes,
 				   __le16 desired_acc_rights,
@@ -890,8 +901,8 @@ int ncp_search_for_fileset(struct ncp_server *server,
 
 static int
 ncp_RenameNSEntry(struct ncp_server *server,
-		  struct inode *old_dir, char *old_name, __le16 old_type,
-		  struct inode *new_dir, char *new_name)
+		  struct inode *old_dir, const char *old_name, __le16 old_type,
+		  struct inode *new_dir, const char *new_name)
 {
 	int result = -EINVAL;
 
@@ -929,8 +940,8 @@ out:
 }
 
 int ncp_ren_or_mov_file_or_subdir(struct ncp_server *server,
-				struct inode *old_dir, char *old_name,
-				struct inode *new_dir, char *new_name)
+				struct inode *old_dir, const char *old_name,
+				struct inode *new_dir, const char *new_name)
 {
         int result;
         __le16 old_type = cpu_to_le16(0x06);
@@ -958,7 +969,7 @@ int
 ncp_read_kernel(struct ncp_server *server, const char *file_id,
 	     __u32 offset, __u16 to_read, char *target, int *bytes_read)
 {
-	char *source;
+	const char *source;
 	int result;
 
 	ncp_init_request(server);
diff --git a/fs/ncpfs/ncplib_kernel.h b/fs/ncpfs/ncplib_kernel.h
index 2441d1ab57d..3c57eca634c 100644
--- a/fs/ncpfs/ncplib_kernel.h
+++ b/fs/ncpfs/ncplib_kernel.h
@@ -65,10 +65,11 @@ static inline void ncp_inode_close(struct inode *inode) {
 	atomic_dec(&NCP_FINFO(inode)->opened);
 }
 
-void ncp_extract_file_info(void* src, struct nw_info_struct* target);
-int ncp_obtain_info(struct ncp_server *server, struct inode *, char *,
+void ncp_extract_file_info(const void* src, struct nw_info_struct* target);
+int ncp_obtain_info(struct ncp_server *server, struct inode *, const char *,
 		struct nw_info_struct *target);
 int ncp_obtain_nfs_info(struct ncp_server *server, struct nw_info_struct *target);
+int ncp_update_known_namespace(struct ncp_server *server, __u8 volume, int *ret_ns);
 int ncp_get_volume_root(struct ncp_server *server, const char *volname,
 			__u32 *volume, __le32 *dirent, __le32 *dosdirent);
 int ncp_lookup_volume(struct ncp_server *, const char *, struct nw_info_struct *);
@@ -80,8 +81,8 @@ int ncp_modify_nfs_info(struct ncp_server *, __u8 volnum, __le32 dirent,
 			__u32 mode, __u32 rdev);
 
 int ncp_del_file_or_subdir2(struct ncp_server *, struct dentry*);
-int ncp_del_file_or_subdir(struct ncp_server *, struct inode *, char *);
-int ncp_open_create_file_or_subdir(struct ncp_server *, struct inode *, char *,
+int ncp_del_file_or_subdir(struct ncp_server *, struct inode *, const char *);
+int ncp_open_create_file_or_subdir(struct ncp_server *, struct inode *, const char *,
 				int, __le32, __le16, struct ncp_entry_info *);
 
 int ncp_initialize_search(struct ncp_server *, struct inode *,
@@ -93,7 +94,7 @@ int ncp_search_for_fileset(struct ncp_server *server,
 			   char** rbuf, size_t* rsize);
 
 int ncp_ren_or_mov_file_or_subdir(struct ncp_server *server,
-			      struct inode *, char *, struct inode *, char *);
+			      struct inode *, const char *, struct inode *, const char *);
 
 
 int
@@ -170,13 +171,13 @@ static inline int ncp_strnicmp(struct nls_table *t, const unsigned char *s1,
 #endif /* CONFIG_NCPFS_NLS */
 
 #define NCP_GET_AGE(dentry)	(jiffies - (dentry)->d_time)
-#define NCP_MAX_AGE(server)	((server)->dentry_ttl)
+#define NCP_MAX_AGE(server)	atomic_read(&(server)->dentry_ttl)
 #define NCP_TEST_AGE(server,dentry)	(NCP_GET_AGE(dentry) < NCP_MAX_AGE(server))
 
 static inline void
 ncp_age_dentry(struct ncp_server* server, struct dentry* dentry)
 {
-	dentry->d_time = jiffies - server->dentry_ttl;
+	dentry->d_time = jiffies - NCP_MAX_AGE(server);
 }
 
 static inline void
diff --git a/fs/ncpfs/ncpsign_kernel.c b/fs/ncpfs/ncpsign_kernel.c
index 7c0b5c21e6c..d8b2d7e6910 100644
--- a/fs/ncpfs/ncpsign_kernel.c
+++ b/fs/ncpfs/ncpsign_kernel.c
@@ -15,21 +15,21 @@
 
 /* i386: 32-bit, little endian, handles mis-alignment */
 #ifdef __i386__
-#define GET_LE32(p) (*(int *)(p))
+#define GET_LE32(p) (*(const int *)(p))
 #define PUT_LE32(p,v) { *(int *)(p)=v; }
 #else
 /* from include/ncplib.h */
-#define BVAL(buf,pos) (((__u8 *)(buf))[pos])
+#define BVAL(buf,pos) (((const __u8 *)(buf))[pos])
 #define PVAL(buf,pos) ((unsigned)BVAL(buf,pos))
-#define BSET(buf,pos,val) (BVAL(buf,pos) = (val))
+#define BSET(buf,pos,val) (((__u8 *)(buf))[pos] = (val))
 
 static inline __u16
-WVAL_LH(__u8 * buf, int pos)
+WVAL_LH(const __u8 * buf, int pos)
 {
 	return PVAL(buf, pos) | PVAL(buf, pos + 1) << 8;
 }
 static inline __u32
-DVAL_LH(__u8 * buf, int pos)
+DVAL_LH(const __u8 * buf, int pos)
 {
 	return WVAL_LH(buf, pos) | WVAL_LH(buf, pos + 2) << 16;
 }
diff --git a/fs/ncpfs/sock.c b/fs/ncpfs/sock.c
index c7ff6c700a6..668bd267346 100644
--- a/fs/ncpfs/sock.c
+++ b/fs/ncpfs/sock.c
@@ -746,7 +746,6 @@ static int ncp_do_request(struct ncp_server *server, int size,
 		return -EIO;
 	}
 	if (!ncp_conn_valid(server)) {
-		printk(KERN_ERR "ncpfs: Connection invalid!\n");
 		return -EIO;
 	}
 	{
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index a43d07e7b92..ba306658a6d 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -61,9 +61,9 @@ config NFS_V3_ACL
 	  If unsure, say N.
 
 config NFS_V4
-	bool "NFS client support for NFS version 4 (EXPERIMENTAL)"
-	depends on NFS_FS && EXPERIMENTAL
-	select RPCSEC_GSS_KRB5
+	bool "NFS client support for NFS version 4"
+	depends on NFS_FS
+	select SUNRPC_GSS
 	help
 	  This option enables support for version 4 of the NFS protocol
 	  (RFC 3530) in the kernel's NFS client.
@@ -72,16 +72,20 @@ config NFS_V4
 	  space programs which can be found in the Linux nfs-utils package,
 	  available from http://linux-nfs.org/.
 
-	  If unsure, say N.
+	  If unsure, say Y.
 
 config NFS_V4_1
-	bool "NFS client support for NFSv4.1 (DEVELOPER ONLY)"
-	depends on NFS_V4 && EXPERIMENTAL
+	bool "NFS client support for NFSv4.1 (EXPERIMENTAL)"
+	depends on NFS_FS && NFS_V4 && EXPERIMENTAL
+	select PNFS_FILE_LAYOUT
 	help
 	  This option enables support for minor version 1 of the NFSv4 protocol
-	  (draft-ietf-nfsv4-minorversion1) in the kernel's NFS client.
+	  (RFC 5661) in the kernel's NFS client.
+
+	  If unsure, say N.
 
-	  Unless you're an NFS developer, say N.
+config PNFS_FILE_LAYOUT
+	tristate
 
 config ROOT_NFS
 	bool "Root file system on NFS"
@@ -100,3 +104,31 @@ config NFS_FSCACHE
 	help
 	  Say Y here if you want NFS data to be cached locally on disc through
 	  the general filesystem cache manager
+
+config NFS_USE_LEGACY_DNS
+	bool "Use the legacy NFS DNS resolver"
+	depends on NFS_V4
+	help
+	  The kernel now provides a method for translating a host name into an
+	  IP address.  Select Y here if you would rather use your own DNS
+	  resolver script.
+
+	  If unsure, say N
+
+config NFS_USE_KERNEL_DNS
+	bool
+	depends on NFS_V4 && !NFS_USE_LEGACY_DNS
+	select DNS_RESOLVER
+	select KEYS
+	default y
+
+config NFS_USE_NEW_IDMAPPER
+	bool "Use the new idmapper upcall routine"
+	depends on NFS_V4 && KEYS
+	help
+	  Say Y here if you want NFS to use the new idmapper upcall functions.
+	  You will need /sbin/request-key (usually provided by the keyutils
+	  package).  For details, read
+	  <file:Documentation/filesystems/nfs/idmapper.txt>.
+
+	  If you are unsure, say N.
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index da7fda639ea..4776ff9e381 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -15,5 +15,9 @@ nfs-$(CONFIG_NFS_V4)	+= nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \
 			   delegation.o idmap.o \
 			   callback.o callback_xdr.o callback_proc.o \
 			   nfs4namespace.o
+nfs-$(CONFIG_NFS_V4_1)	+= pnfs.o
 nfs-$(CONFIG_SYSCTL) += sysctl.o
 nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o
+
+obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o
+nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 36dfdae9512..aeec017fe81 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -45,7 +45,7 @@ unsigned short nfs_callback_tcpport;
 unsigned short nfs_callback_tcpport6;
 #define NFS_CALLBACK_MAXPORTNR (65535U)
 
-static int param_set_portnr(const char *val, struct kernel_param *kp)
+static int param_set_portnr(const char *val, const struct kernel_param *kp)
 {
 	unsigned long num;
 	int ret;
@@ -58,11 +58,10 @@ static int param_set_portnr(const char *val, struct kernel_param *kp)
 	*((unsigned int *)kp->arg) = num;
 	return 0;
 }
-
-static int param_get_portnr(char *buffer, struct kernel_param *kp)
-{
-	return param_get_uint(buffer, kp);
-}
+static struct kernel_param_ops param_ops_portnr = {
+	.set = param_set_portnr,
+	.get = param_get_uint,
+};
 #define param_check_portnr(name, p) __param_check(name, p, unsigned int);
 
 module_param_named(callback_tcpport, nfs_callback_set_tcpport, portnr, 0644);
@@ -110,7 +109,7 @@ nfs4_callback_up(struct svc_serv *serv)
 {
 	int ret;
 
-	ret = svc_create_xprt(serv, "tcp", PF_INET,
+	ret = svc_create_xprt(serv, "tcp", &init_net, PF_INET,
 				nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
 	if (ret <= 0)
 		goto out_err;
@@ -118,7 +117,7 @@ nfs4_callback_up(struct svc_serv *serv)
 	dprintk("NFS: Callback listener port = %u (af %u)\n",
 			nfs_callback_tcpport, PF_INET);
 
-	ret = svc_create_xprt(serv, "tcp", PF_INET6,
+	ret = svc_create_xprt(serv, "tcp", &init_net, PF_INET6,
 				nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
 	if (ret > 0) {
 		nfs_callback_tcpport6 = ret;
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index a08770a7e85..2950fca0c61 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -37,8 +37,8 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *
 	if (inode == NULL)
 		goto out_putclient;
 	nfsi = NFS_I(inode);
-	down_read(&nfsi->rwsem);
-	delegation = nfsi->delegation;
+	rcu_read_lock();
+	delegation = rcu_dereference(nfsi->delegation);
 	if (delegation == NULL || (delegation->type & FMODE_WRITE) == 0)
 		goto out_iput;
 	res->size = i_size_read(inode);
@@ -53,7 +53,7 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *
 		args->bitmap[1];
 	res->status = 0;
 out_iput:
-	up_read(&nfsi->rwsem);
+	rcu_read_unlock();
 	iput(inode);
 out_putclient:
 	nfs_put_client(clp);
@@ -62,16 +62,6 @@ out:
 	return res->status;
 }
 
-static int (*nfs_validate_delegation_stateid(struct nfs_client *clp))(struct nfs_delegation *, const nfs4_stateid *)
-{
-#if defined(CONFIG_NFS_V4_1)
-	if (clp->cl_minorversion > 0)
-		return nfs41_validate_delegation_stateid;
-#endif
-	return nfs4_validate_delegation_stateid;
-}
-
-
 __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy)
 {
 	struct nfs_client *clp;
@@ -92,8 +82,7 @@ __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy)
 		inode = nfs_delegation_find_inode(clp, &args->fh);
 		if (inode != NULL) {
 			/* Set up a helper thread to actually return the delegation */
-			switch (nfs_async_inode_return_delegation(inode, &args->stateid,
-								  nfs_validate_delegation_stateid(clp))) {
+			switch (nfs_async_inode_return_delegation(inode, &args->stateid)) {
 				case 0:
 					res = 0;
 					break;
@@ -129,11 +118,11 @@ int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const n
 	if (delegation == NULL)
 		return 0;
 
-	/* seqid is 4-bytes long */
-	if (((u32 *) &stateid->data)[0] != 0)
+	if (stateid->stateid.seqid != 0)
 		return 0;
-	if (memcmp(&delegation->stateid.data[4], &stateid->data[4],
-		   sizeof(stateid->data)-4))
+	if (memcmp(&delegation->stateid.stateid.other,
+		   &stateid->stateid.other,
+		   NFS4_STATEID_OTHER_SIZE))
 		return 0;
 
 	return 1;
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index d25b5257b7a..0870d0d4efc 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -48,6 +48,7 @@
 #include "iostat.h"
 #include "internal.h"
 #include "fscache.h"
+#include "pnfs.h"
 
 #define NFSDBG_FACILITY		NFSDBG_CLIENT
 
@@ -150,11 +151,14 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
 	clp->cl_boot_time = CURRENT_TIME;
 	clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED;
 	clp->cl_minorversion = cl_init->minorversion;
+	clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion];
 #endif
 	cred = rpc_lookup_machine_cred();
 	if (!IS_ERR(cred))
 		clp->cl_machine_cred = cred;
-
+#if defined(CONFIG_NFS_V4_1)
+	INIT_LIST_HEAD(&clp->cl_layouts);
+#endif
 	nfs_fscache_get_client_cookie(clp);
 
 	return clp;
@@ -178,7 +182,7 @@ static void nfs4_clear_client_minor_version(struct nfs_client *clp)
 		clp->cl_session = NULL;
 	}
 
-	clp->cl_call_sync = _nfs4_call_sync;
+	clp->cl_mvops = nfs_v4_minor_ops[0];
 #endif /* CONFIG_NFS_V4_1 */
 }
 
@@ -188,7 +192,7 @@ static void nfs4_clear_client_minor_version(struct nfs_client *clp)
 static void nfs4_destroy_callback(struct nfs_client *clp)
 {
 	if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state))
-		nfs_callback_down(clp->cl_minorversion);
+		nfs_callback_down(clp->cl_mvops->minor_version);
 }
 
 static void nfs4_shutdown_client(struct nfs_client *clp)
@@ -251,6 +255,7 @@ void nfs_put_client(struct nfs_client *clp)
 		nfs_free_client(clp);
 	}
 }
+EXPORT_SYMBOL_GPL(nfs_put_client);
 
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 /*
@@ -274,7 +279,7 @@ static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
 	    sin1->sin6_scope_id != sin2->sin6_scope_id)
 		return 0;
 
-	return ipv6_addr_equal(&sin1->sin6_addr, &sin1->sin6_addr);
+	return ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr);
 }
 #else	/* !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE) */
 static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
@@ -600,6 +605,7 @@ static int nfs_create_rpc_client(struct nfs_client *clp,
 {
 	struct rpc_clnt		*clnt = NULL;
 	struct rpc_create_args args = {
+		.net		= &init_net,
 		.protocol	= clp->cl_proto,
 		.address	= (struct sockaddr *)&clp->cl_addr,
 		.addrsize	= clp->cl_addrlen,
@@ -634,7 +640,8 @@ static int nfs_create_rpc_client(struct nfs_client *clp,
  */
 static void nfs_destroy_server(struct nfs_server *server)
 {
-	if (!(server->flags & NFS_MOUNT_NONLM))
+	if (!(server->flags & NFS_MOUNT_LOCAL_FLOCK) ||
+			!(server->flags & NFS_MOUNT_LOCAL_FCNTL))
 		nlmclnt_done(server->nlm_host);
 }
 
@@ -656,7 +663,8 @@ static int nfs_start_lockd(struct nfs_server *server)
 
 	if (nlm_init.nfs_version > 3)
 		return 0;
-	if (server->flags & NFS_MOUNT_NONLM)
+	if ((server->flags & NFS_MOUNT_LOCAL_FLOCK) &&
+			(server->flags & NFS_MOUNT_LOCAL_FCNTL))
 		return 0;
 
 	switch (clp->cl_proto) {
@@ -897,11 +905,13 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *
 	if (server->wsize > NFS_MAX_FILE_IO_SIZE)
 		server->wsize = NFS_MAX_FILE_IO_SIZE;
 	server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	set_pnfs_layoutdriver(server, fsinfo->layouttype);
+
 	server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL);
 
 	server->dtsize = nfs_block_size(fsinfo->dtpref, NULL);
-	if (server->dtsize > PAGE_CACHE_SIZE)
-		server->dtsize = PAGE_CACHE_SIZE;
+	if (server->dtsize > PAGE_CACHE_SIZE * NFS_MAX_READDIR_PAGES)
+		server->dtsize = PAGE_CACHE_SIZE * NFS_MAX_READDIR_PAGES;
 	if (server->dtsize > server->rsize)
 		server->dtsize = server->rsize;
 
@@ -912,6 +922,8 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *
 
 	server->maxfilesize = fsinfo->maxfilesize;
 
+	server->time_delta = fsinfo->time_delta;
+
 	/* We're airborne Set socket buffersize */
 	rpc_setbufsize(server->client, server->wsize + 100, server->rsize + 100);
 }
@@ -934,6 +946,7 @@ static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, str
 	}
 
 	fsinfo.fattr = fattr;
+	fsinfo.layouttype = 0;
 	error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo);
 	if (error < 0)
 		goto out_error;
@@ -1016,6 +1029,7 @@ void nfs_free_server(struct nfs_server *server)
 {
 	dprintk("--> nfs_free_server()\n");
 
+	unset_pnfs_layoutdriver(server);
 	spin_lock(&nfs_client_lock);
 	list_del(&server->client_link);
 	list_del(&server->master_link);
@@ -1126,7 +1140,7 @@ static int nfs4_init_callback(struct nfs_client *clp)
 				return error;
 		}
 
-		error = nfs_callback_up(clp->cl_minorversion,
+		error = nfs_callback_up(clp->cl_mvops->minor_version,
 					clp->cl_rpcclient->cl_xprt);
 		if (error < 0) {
 			dprintk("%s: failed to start callback. Error = %d\n",
@@ -1143,10 +1157,8 @@ static int nfs4_init_callback(struct nfs_client *clp)
  */
 static int nfs4_init_client_minor_version(struct nfs_client *clp)
 {
-	clp->cl_call_sync = _nfs4_call_sync;
-
 #if defined(CONFIG_NFS_V4_1)
-	if (clp->cl_minorversion) {
+	if (clp->cl_mvops->minor_version) {
 		struct nfs4_session *session = NULL;
 		/*
 		 * Create the session and mark it expired.
@@ -1158,7 +1170,13 @@ static int nfs4_init_client_minor_version(struct nfs_client *clp)
 			return -ENOMEM;
 
 		clp->cl_session = session;
-		clp->cl_call_sync = _nfs4_call_sync_session;
+		/*
+		 * The create session reply races with the server back
+		 * channel probe. Mark the client NFS_CS_SESSION_INITING
+		 * so that the client back channel can find the
+		 * nfs_client struct
+		 */
+		clp->cl_cons_state = NFS_CS_SESSION_INITING;
 	}
 #endif /* CONFIG_NFS_V4_1 */
 
@@ -1351,8 +1369,9 @@ static int nfs4_init_server(struct nfs_server *server,
 
 	/* Initialise the client representation from the mount data */
 	server->flags = data->flags;
-	server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR|
-		NFS_CAP_POSIX_LOCK;
+	server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR|NFS_CAP_POSIX_LOCK;
+	if (!(data->flags & NFS_MOUNT_NORDIRPLUS))
+			server->caps |= NFS_CAP_READDIRPLUS;
 	server->options = data->options;
 
 	/* Get a client record */
@@ -1454,7 +1473,7 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
 				data->authflavor,
 				parent_server->client->cl_xprt->prot,
 				parent_server->client->cl_timeout,
-				parent_client->cl_minorversion);
+				parent_client->cl_mvops->minor_version);
 	if (error < 0)
 		goto error;
 
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 30163454397..232a7eead33 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -71,20 +71,20 @@ static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_
 	if (inode->i_flock == NULL)
 		goto out;
 
-	/* Protect inode->i_flock using the BKL */
-	lock_kernel();
+	/* Protect inode->i_flock using the file locks lock */
+	lock_flocks();
 	for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
 		if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK)))
 			continue;
 		if (nfs_file_open_context(fl->fl_file) != ctx)
 			continue;
-		unlock_kernel();
+		unlock_flocks();
 		status = nfs4_lock_delegation_recall(state, fl);
 		if (status < 0)
 			goto out;
-		lock_kernel();
+		lock_flocks();
 	}
-	unlock_kernel();
+	unlock_flocks();
 out:
 	return status;
 }
@@ -268,14 +268,6 @@ out:
 	return status;
 }
 
-/* Sync all data to disk upon delegation return */
-static void nfs_msync_inode(struct inode *inode)
-{
-	filemap_fdatawrite(inode->i_mapping);
-	nfs_wb_all(inode);
-	filemap_fdatawait(inode->i_mapping);
-}
-
 /*
  * Basic procedure for returning a delegation to the server
  */
@@ -367,7 +359,7 @@ int nfs_inode_return_delegation(struct inode *inode)
 		delegation = nfs_detach_delegation_locked(nfsi, NULL, clp);
 		spin_unlock(&clp->cl_lock);
 		if (delegation != NULL) {
-			nfs_msync_inode(inode);
+			nfs_wb_all(inode);
 			err = __nfs_inode_return_delegation(inode, delegation, 1);
 		}
 	}
@@ -471,9 +463,7 @@ void nfs_expire_unreferenced_delegations(struct nfs_client *clp)
 /*
  * Asynchronous delegation recall!
  */
-int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid,
-				      int (*validate_stateid)(struct nfs_delegation *delegation,
-							      const nfs4_stateid *stateid))
+int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid)
 {
 	struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
 	struct nfs_delegation *delegation;
@@ -481,7 +471,7 @@ int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *s
 	rcu_read_lock();
 	delegation = rcu_dereference(NFS_I(inode)->delegation);
 
-	if (!validate_stateid(delegation, stateid)) {
+	if (!clp->cl_mvops->validate_stateid(delegation, stateid)) {
 		rcu_read_unlock();
 		return -ENOENT;
 	}
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 69e7b814012..2026304bda1 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -34,9 +34,7 @@ enum {
 int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res);
 void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res);
 int nfs_inode_return_delegation(struct inode *inode);
-int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid,
-				      int (*validate_stateid)(struct nfs_delegation *delegation,
-							      const nfs4_stateid *stateid));
+int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid);
 void nfs_inode_return_delegation_noreclaim(struct inode *inode);
 
 struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index e60416d3f81..07ac3847e56 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -33,11 +33,12 @@
 #include <linux/namei.h>
 #include <linux/mount.h>
 #include <linux/sched.h>
+#include <linux/vmalloc.h>
 
-#include "nfs4_fs.h"
 #include "delegation.h"
 #include "iostat.h"
 #include "internal.h"
+#include "fscache.h"
 
 /* #define NFS_DEBUG_VERBOSE 1 */
 
@@ -55,6 +56,7 @@ static int nfs_rename(struct inode *, struct dentry *,
 		      struct inode *, struct dentry *);
 static int nfs_fsync_dir(struct file *, int);
 static loff_t nfs_llseek_dir(struct file *, loff_t, int);
+static int nfs_readdir_clear_array(struct page*, gfp_t);
 
 const struct file_operations nfs_dir_operations = {
 	.llseek		= nfs_llseek_dir,
@@ -80,6 +82,10 @@ const struct inode_operations nfs_dir_inode_operations = {
 	.setattr	= nfs_setattr,
 };
 
+const struct address_space_operations nfs_dir_addr_space_ops = {
+	.releasepage = nfs_readdir_clear_array,
+};
+
 #ifdef CONFIG_NFS_V3
 const struct inode_operations nfs3_dir_inode_operations = {
 	.create		= nfs_create,
@@ -104,8 +110,9 @@ const struct inode_operations nfs3_dir_inode_operations = {
 #ifdef CONFIG_NFS_V4
 
 static struct dentry *nfs_atomic_lookup(struct inode *, struct dentry *, struct nameidata *);
+static int nfs_open_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd);
 const struct inode_operations nfs4_dir_inode_operations = {
-	.create		= nfs_create,
+	.create		= nfs_open_create,
 	.lookup		= nfs_atomic_lookup,
 	.link		= nfs_link,
 	.unlink		= nfs_unlink,
@@ -140,54 +147,207 @@ nfs_opendir(struct inode *inode, struct file *filp)
 
 	/* Call generic open code in order to cache credentials */
 	res = nfs_open(inode, filp);
+	if (filp->f_path.dentry == filp->f_path.mnt->mnt_root) {
+		/* This is a mountpoint, so d_revalidate will never
+		 * have been called, so we need to refresh the
+		 * inode (for close-open consistency) ourselves.
+		 */
+		__nfs_revalidate_inode(NFS_SERVER(inode), inode);
+	}
 	return res;
 }
 
-typedef __be32 * (*decode_dirent_t)(__be32 *, struct nfs_entry *, int);
+struct nfs_cache_array_entry {
+	u64 cookie;
+	u64 ino;
+	struct qstr string;
+};
+
+struct nfs_cache_array {
+	unsigned int size;
+	int eof_index;
+	u64 last_cookie;
+	struct nfs_cache_array_entry array[0];
+};
+
+#define MAX_READDIR_ARRAY ((PAGE_SIZE - sizeof(struct nfs_cache_array)) / sizeof(struct nfs_cache_array_entry))
+
+typedef __be32 * (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int);
 typedef struct {
 	struct file	*file;
 	struct page	*page;
 	unsigned long	page_index;
-	__be32		*ptr;
 	u64		*dir_cookie;
 	loff_t		current_index;
-	struct nfs_entry *entry;
 	decode_dirent_t	decode;
-	int		plus;
+
 	unsigned long	timestamp;
 	unsigned long	gencount;
-	int		timestamp_valid;
+	unsigned int	cache_entry_index;
+	unsigned int	plus:1;
+	unsigned int	eof:1;
 } nfs_readdir_descriptor_t;
 
-/* Now we cache directories properly, by stuffing the dirent
- * data directly in the page cache.
- *
- * Inode invalidation due to refresh etc. takes care of
- * _everything_, no sloppy entry flushing logic, no extraneous
- * copying, network direct to page cache, the way it was meant
- * to be.
- *
- * NOTE: Dirent information verification is done always by the
- *	 page-in of the RPC reply, nowhere else, this simplies
- *	 things substantially.
+/*
+ * The caller is responsible for calling nfs_readdir_release_array(page)
  */
 static
-int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
+struct nfs_cache_array *nfs_readdir_get_array(struct page *page)
+{
+	if (page == NULL)
+		return ERR_PTR(-EIO);
+	return (struct nfs_cache_array *)kmap(page);
+}
+
+static
+void nfs_readdir_release_array(struct page *page)
+{
+	kunmap(page);
+}
+
+/*
+ * we are freeing strings created by nfs_add_to_readdir_array()
+ */
+static
+int nfs_readdir_clear_array(struct page *page, gfp_t mask)
+{
+	struct nfs_cache_array *array = nfs_readdir_get_array(page);
+	int i;
+	for (i = 0; i < array->size; i++)
+		kfree(array->array[i].string.name);
+	nfs_readdir_release_array(page);
+	return 0;
+}
+
+/*
+ * the caller is responsible for freeing qstr.name
+ * when called by nfs_readdir_add_to_array, the strings will be freed in
+ * nfs_clear_readdir_array()
+ */
+static
+int nfs_readdir_make_qstr(struct qstr *string, const char *name, unsigned int len)
+{
+	string->len = len;
+	string->name = kmemdup(name, len, GFP_KERNEL);
+	if (string->name == NULL)
+		return -ENOMEM;
+	string->hash = full_name_hash(name, len);
+	return 0;
+}
+
+static
+int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page)
+{
+	struct nfs_cache_array *array = nfs_readdir_get_array(page);
+	struct nfs_cache_array_entry *cache_entry;
+	int ret;
+
+	if (IS_ERR(array))
+		return PTR_ERR(array);
+	ret = -EIO;
+	if (array->size >= MAX_READDIR_ARRAY)
+		goto out;
+
+	cache_entry = &array->array[array->size];
+	cache_entry->cookie = entry->prev_cookie;
+	cache_entry->ino = entry->ino;
+	ret = nfs_readdir_make_qstr(&cache_entry->string, entry->name, entry->len);
+	if (ret)
+		goto out;
+	array->last_cookie = entry->cookie;
+	if (entry->eof == 1)
+		array->eof_index = array->size;
+	array->size++;
+out:
+	nfs_readdir_release_array(page);
+	return ret;
+}
+
+static
+int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc)
+{
+	loff_t diff = desc->file->f_pos - desc->current_index;
+	unsigned int index;
+
+	if (diff < 0)
+		goto out_eof;
+	if (diff >= array->size) {
+		if (array->eof_index > 0)
+			goto out_eof;
+		desc->current_index += array->size;
+		return -EAGAIN;
+	}
+
+	index = (unsigned int)diff;
+	*desc->dir_cookie = array->array[index].cookie;
+	desc->cache_entry_index = index;
+	if (index == array->eof_index)
+		desc->eof = 1;
+	return 0;
+out_eof:
+	desc->eof = 1;
+	return -EBADCOOKIE;
+}
+
+static
+int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc)
+{
+	int i;
+	int status = -EAGAIN;
+
+	for (i = 0; i < array->size; i++) {
+		if (i == array->eof_index) {
+			desc->eof = 1;
+			status = -EBADCOOKIE;
+		}
+		if (array->array[i].cookie == *desc->dir_cookie) {
+			desc->cache_entry_index = i;
+			status = 0;
+			break;
+		}
+	}
+
+	return status;
+}
+
+static
+int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc)
+{
+	struct nfs_cache_array *array;
+	int status = -EBADCOOKIE;
+
+	if (desc->dir_cookie == NULL)
+		goto out;
+
+	array = nfs_readdir_get_array(desc->page);
+	if (IS_ERR(array)) {
+		status = PTR_ERR(array);
+		goto out;
+	}
+
+	if (*desc->dir_cookie == 0)
+		status = nfs_readdir_search_for_pos(array, desc);
+	else
+		status = nfs_readdir_search_for_cookie(array, desc);
+
+	nfs_readdir_release_array(desc->page);
+out:
+	return status;
+}
+
+/* Fill a page with xdr information before transferring to the cache page */
+static
+int nfs_readdir_xdr_filler(struct page **pages, nfs_readdir_descriptor_t *desc,
+			struct nfs_entry *entry, struct file *file, struct inode *inode)
 {
-	struct file	*file = desc->file;
-	struct inode	*inode = file->f_path.dentry->d_inode;
 	struct rpc_cred	*cred = nfs_file_cred(file);
 	unsigned long	timestamp, gencount;
 	int		error;
 
-	dfprintk(DIRCACHE, "NFS: %s: reading cookie %Lu into page %lu\n",
-			__func__, (long long)desc->entry->cookie,
-			page->index);
-
  again:
 	timestamp = jiffies;
 	gencount = nfs_inc_attr_generation_counter();
-	error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, desc->entry->cookie, page,
+	error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, entry->cookie, pages,
 					  NFS_SERVER(inode)->dtsize, desc->plus);
 	if (error < 0) {
 		/* We requested READDIRPLUS, but the server doesn't grok it */
@@ -201,190 +361,292 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
 	}
 	desc->timestamp = timestamp;
 	desc->gencount = gencount;
-	desc->timestamp_valid = 1;
-	SetPageUptodate(page);
-	/* Ensure consistent page alignment of the data.
-	 * Note: assumes we have exclusive access to this mapping either
-	 *	 through inode->i_mutex or some other mechanism.
-	 */
-	if (invalidate_inode_pages2_range(inode->i_mapping, page->index + 1, -1) < 0) {
-		/* Should never happen */
-		nfs_zap_mapping(inode, inode->i_mapping);
-	}
-	unlock_page(page);
-	return 0;
- error:
-	unlock_page(page);
-	return -EIO;
+error:
+	return error;
 }
 
-static inline
-int dir_decode(nfs_readdir_descriptor_t *desc)
+/* Fill in an entry based on the xdr code stored in desc->page */
+static
+int xdr_decode(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, struct xdr_stream *stream)
 {
-	__be32	*p = desc->ptr;
-	p = desc->decode(p, desc->entry, desc->plus);
+	__be32 *p = desc->decode(stream, entry, NFS_SERVER(desc->file->f_path.dentry->d_inode), desc->plus);
 	if (IS_ERR(p))
 		return PTR_ERR(p);
-	desc->ptr = p;
-	if (desc->timestamp_valid) {
-		desc->entry->fattr->time_start = desc->timestamp;
-		desc->entry->fattr->gencount = desc->gencount;
-	} else
-		desc->entry->fattr->valid &= ~NFS_ATTR_FATTR;
+
+	entry->fattr->time_start = desc->timestamp;
+	entry->fattr->gencount = desc->gencount;
 	return 0;
 }
 
-static inline
-void dir_page_release(nfs_readdir_descriptor_t *desc)
+static
+int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry)
 {
-	kunmap(desc->page);
-	page_cache_release(desc->page);
-	desc->page = NULL;
-	desc->ptr = NULL;
+	struct nfs_inode *node;
+	if (dentry->d_inode == NULL)
+		goto different;
+	node = NFS_I(dentry->d_inode);
+	if (node->fh.size != entry->fh->size)
+		goto different;
+	if (strncmp(node->fh.data, entry->fh->data, node->fh.size) != 0)
+		goto different;
+	return 1;
+different:
+	return 0;
 }
 
-/*
- * Given a pointer to a buffer that has already been filled by a call
- * to readdir, find the next entry with cookie '*desc->dir_cookie'.
- *
- * If the end of the buffer has been reached, return -EAGAIN, if not,
- * return the offset within the buffer of the next entry to be
- * read.
- */
-static inline
-int find_dirent(nfs_readdir_descriptor_t *desc)
+static
+void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
 {
-	struct nfs_entry *entry = desc->entry;
-	int		loop_count = 0,
-			status;
+	struct qstr filename = {
+		.len = entry->len,
+		.name = entry->name,
+	};
+	struct dentry *dentry;
+	struct dentry *alias;
+	struct inode *dir = parent->d_inode;
+	struct inode *inode;
 
-	while((status = dir_decode(desc)) == 0) {
-		dfprintk(DIRCACHE, "NFS: %s: examining cookie %Lu\n",
-				__func__, (unsigned long long)entry->cookie);
-		if (entry->prev_cookie == *desc->dir_cookie)
-			break;
-		if (loop_count++ > 200) {
-			loop_count = 0;
-			schedule();
+	if (filename.name[0] == '.') {
+		if (filename.len == 1)
+			return;
+		if (filename.len == 2 && filename.name[1] == '.')
+			return;
+	}
+	filename.hash = full_name_hash(filename.name, filename.len);
+
+	dentry = d_lookup(parent, &filename);
+	if (dentry != NULL) {
+		if (nfs_same_file(dentry, entry)) {
+			nfs_refresh_inode(dentry->d_inode, entry->fattr);
+			goto out;
+		} else {
+			d_drop(dentry);
+			dput(dentry);
 		}
 	}
-	return status;
+
+	dentry = d_alloc(parent, &filename);
+	if (dentry == NULL)
+		return;
+
+	dentry->d_op = NFS_PROTO(dir)->dentry_ops;
+	inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr);
+	if (IS_ERR(inode))
+		goto out;
+
+	alias = d_materialise_unique(dentry, inode);
+	if (IS_ERR(alias))
+		goto out;
+	else if (alias) {
+		nfs_set_verifier(alias, nfs_save_change_attribute(dir));
+		dput(alias);
+	} else
+		nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+
+out:
+	dput(dentry);
+}
+
+/* Perform conversion from xdr to cache array */
+static
+void nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry,
+				void *xdr_page, struct page *page, unsigned int buflen)
+{
+	struct xdr_stream stream;
+	struct xdr_buf buf;
+	__be32 *ptr = xdr_page;
+	int status;
+	struct nfs_cache_array *array;
+
+	buf.head->iov_base = xdr_page;
+	buf.head->iov_len = buflen;
+	buf.tail->iov_len = 0;
+	buf.page_base = 0;
+	buf.page_len = 0;
+	buf.buflen = buf.head->iov_len;
+	buf.len = buf.head->iov_len;
+
+	xdr_init_decode(&stream, &buf, ptr);
+
+
+	do {
+		status = xdr_decode(desc, entry, &stream);
+		if (status != 0)
+			break;
+
+		if (nfs_readdir_add_to_array(entry, page) == -1)
+			break;
+		if (desc->plus == 1)
+			nfs_prime_dcache(desc->file->f_path.dentry, entry);
+	} while (!entry->eof);
+
+	if (status == -EBADCOOKIE && entry->eof) {
+		array = nfs_readdir_get_array(page);
+		array->eof_index = array->size - 1;
+		status = 0;
+		nfs_readdir_release_array(page);
+	}
+}
+
+static
+void nfs_readdir_free_pagearray(struct page **pages, unsigned int npages)
+{
+	unsigned int i;
+	for (i = 0; i < npages; i++)
+		put_page(pages[i]);
+}
+
+static
+void nfs_readdir_free_large_page(void *ptr, struct page **pages,
+		unsigned int npages)
+{
+	vm_unmap_ram(ptr, npages);
+	nfs_readdir_free_pagearray(pages, npages);
 }
 
 /*
- * Given a pointer to a buffer that has already been filled by a call
- * to readdir, find the entry at offset 'desc->file->f_pos'.
- *
- * If the end of the buffer has been reached, return -EAGAIN, if not,
- * return the offset within the buffer of the next entry to be
- * read.
+ * nfs_readdir_large_page will allocate pages that must be freed with a call
+ * to nfs_readdir_free_large_page
  */
-static inline
-int find_dirent_index(nfs_readdir_descriptor_t *desc)
+static
+void *nfs_readdir_large_page(struct page **pages, unsigned int npages)
 {
-	struct nfs_entry *entry = desc->entry;
-	int		loop_count = 0,
-			status;
+	void *ptr;
+	unsigned int i;
+
+	for (i = 0; i < npages; i++) {
+		struct page *page = alloc_page(GFP_KERNEL);
+		if (page == NULL)
+			goto out_freepages;
+		pages[i] = page;
+	}
 
-	for(;;) {
-		status = dir_decode(desc);
-		if (status)
-			break;
+	ptr = vm_map_ram(pages, npages, 0, PAGE_KERNEL);
+	if (!IS_ERR_OR_NULL(ptr))
+		return ptr;
+out_freepages:
+	nfs_readdir_free_pagearray(pages, i);
+	return NULL;
+}
+
+static
+int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct inode *inode)
+{
+	struct page *pages[NFS_MAX_READDIR_PAGES];
+	void *pages_ptr = NULL;
+	struct nfs_entry entry;
+	struct file	*file = desc->file;
+	struct nfs_cache_array *array;
+	int status = 0;
+	unsigned int array_size = ARRAY_SIZE(pages);
+
+	entry.prev_cookie = 0;
+	entry.cookie = *desc->dir_cookie;
+	entry.eof = 0;
+	entry.fh = nfs_alloc_fhandle();
+	entry.fattr = nfs_alloc_fattr();
+	if (entry.fh == NULL || entry.fattr == NULL)
+		goto out;
+
+	array = nfs_readdir_get_array(page);
+	memset(array, 0, sizeof(struct nfs_cache_array));
+	array->eof_index = -1;
 
-		dfprintk(DIRCACHE, "NFS: found cookie %Lu at index %Ld\n",
-				(unsigned long long)entry->cookie, desc->current_index);
+	pages_ptr = nfs_readdir_large_page(pages, array_size);
+	if (!pages_ptr)
+		goto out_release_array;
+	do {
+		status = nfs_readdir_xdr_filler(pages, desc, &entry, file, inode);
 
-		if (desc->file->f_pos == desc->current_index) {
-			*desc->dir_cookie = entry->cookie;
+		if (status < 0)
 			break;
-		}
-		desc->current_index++;
-		if (loop_count++ > 200) {
-			loop_count = 0;
-			schedule();
-		}
-	}
+		nfs_readdir_page_filler(desc, &entry, pages_ptr, page, array_size * PAGE_SIZE);
+	} while (array->eof_index < 0 && array->size < MAX_READDIR_ARRAY);
+
+	nfs_readdir_free_large_page(pages_ptr, pages, array_size);
+out_release_array:
+	nfs_readdir_release_array(page);
+out:
+	nfs_free_fattr(entry.fattr);
+	nfs_free_fhandle(entry.fh);
 	return status;
 }
 
 /*
- * Find the given page, and call find_dirent() or find_dirent_index in
- * order to try to return the next entry.
+ * Now we cache directories properly, by converting xdr information
+ * to an array that can be used for lookups later.  This results in
+ * fewer cache pages, since we can store more information on each page.
+ * We only need to convert from xdr once so future lookups are much simpler
  */
-static inline
-int find_dirent_page(nfs_readdir_descriptor_t *desc)
+static
+int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page)
 {
 	struct inode	*inode = desc->file->f_path.dentry->d_inode;
-	struct page	*page;
-	int		status;
 
-	dfprintk(DIRCACHE, "NFS: %s: searching page %ld for target %Lu\n",
-			__func__, desc->page_index,
-			(long long) *desc->dir_cookie);
+	if (nfs_readdir_xdr_to_array(desc, page, inode) < 0)
+		goto error;
+	SetPageUptodate(page);
 
-	/* If we find the page in the page_cache, we cannot be sure
-	 * how fresh the data is, so we will ignore readdir_plus attributes.
-	 */
-	desc->timestamp_valid = 0;
-	page = read_cache_page(inode->i_mapping, desc->page_index,
-			       (filler_t *)nfs_readdir_filler, desc);
-	if (IS_ERR(page)) {
-		status = PTR_ERR(page);
-		goto out;
+	if (invalidate_inode_pages2_range(inode->i_mapping, page->index + 1, -1) < 0) {
+		/* Should never happen */
+		nfs_zap_mapping(inode, inode->i_mapping);
 	}
+	unlock_page(page);
+	return 0;
+ error:
+	unlock_page(page);
+	return -EIO;
+}
 
-	/* NOTE: Someone else may have changed the READDIRPLUS flag */
-	desc->page = page;
-	desc->ptr = kmap(page);		/* matching kunmap in nfs_do_filldir */
-	if (*desc->dir_cookie != 0)
-		status = find_dirent(desc);
-	else
-		status = find_dirent_index(desc);
-	if (status < 0)
-		dir_page_release(desc);
- out:
-	dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __func__, status);
-	return status;
+static
+void cache_page_release(nfs_readdir_descriptor_t *desc)
+{
+	page_cache_release(desc->page);
+	desc->page = NULL;
+}
+
+static
+struct page *get_cache_page(nfs_readdir_descriptor_t *desc)
+{
+	struct page *page;
+	page = read_cache_page(desc->file->f_path.dentry->d_inode->i_mapping,
+			desc->page_index, (filler_t *)nfs_readdir_filler, desc);
+	if (IS_ERR(page))
+		desc->eof = 1;
+	return page;
 }
 
 /*
- * Recurse through the page cache pages, and return a
- * filled nfs_entry structure of the next directory entry if possible.
- *
- * The target for the search is '*desc->dir_cookie' if non-0,
- * 'desc->file->f_pos' otherwise
+ * Returns 0 if desc->dir_cookie was found on page desc->page_index
  */
+static
+int find_cache_page(nfs_readdir_descriptor_t *desc)
+{
+	int res;
+
+	desc->page = get_cache_page(desc);
+	if (IS_ERR(desc->page))
+		return PTR_ERR(desc->page);
+
+	res = nfs_readdir_search_array(desc);
+	if (res == 0)
+		return 0;
+	cache_page_release(desc);
+	return res;
+}
+
+/* Search for desc->dir_cookie from the beginning of the page cache */
 static inline
 int readdir_search_pagecache(nfs_readdir_descriptor_t *desc)
 {
-	int		loop_count = 0;
-	int		res;
-
-	/* Always search-by-index from the beginning of the cache */
-	if (*desc->dir_cookie == 0) {
-		dfprintk(DIRCACHE, "NFS: readdir_search_pagecache() searching for offset %Ld\n",
-				(long long)desc->file->f_pos);
-		desc->page_index = 0;
-		desc->entry->cookie = desc->entry->prev_cookie = 0;
-		desc->entry->eof = 0;
-		desc->current_index = 0;
-	} else
-		dfprintk(DIRCACHE, "NFS: readdir_search_pagecache() searching for cookie %Lu\n",
-				(unsigned long long)*desc->dir_cookie);
+	int res = -EAGAIN;
 
-	for (;;) {
-		res = find_dirent_page(desc);
+	while (1) {
+		res = find_cache_page(desc);
 		if (res != -EAGAIN)
 			break;
-		/* Align to beginning of next page */
-		desc->page_index ++;
-		if (loop_count++ > 200) {
-			loop_count = 0;
-			schedule();
-		}
+		desc->page_index++;
 	}
-
-	dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __func__, res);
 	return res;
 }
 
@@ -393,8 +655,6 @@ static inline unsigned int dt_type(struct inode *inode)
 	return (inode->i_mode >> 12) & 15;
 }
 
-static struct dentry *nfs_readdir_lookup(nfs_readdir_descriptor_t *desc);
-
 /*
  * Once we've found the start of the dirent within a page: fill 'er up...
  */
@@ -403,49 +663,36 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent,
 		   filldir_t filldir)
 {
 	struct file	*file = desc->file;
-	struct nfs_entry *entry = desc->entry;
-	struct dentry	*dentry = NULL;
-	u64		fileid;
-	int		loop_count = 0,
-			res;
-
-	dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling starting @ cookie %Lu\n",
-			(unsigned long long)entry->cookie);
-
-	for(;;) {
-		unsigned d_type = DT_UNKNOWN;
-		/* Note: entry->prev_cookie contains the cookie for
-		 *	 retrieving the current dirent on the server */
-		fileid = entry->ino;
-
-		/* Get a dentry if we have one */
-		if (dentry != NULL)
-			dput(dentry);
-		dentry = nfs_readdir_lookup(desc);
+	int i = 0;
+	int res = 0;
+	struct nfs_cache_array *array = NULL;
+	unsigned int d_type = DT_UNKNOWN;
+	struct dentry *dentry = NULL;
 
-		/* Use readdirplus info */
-		if (dentry != NULL && dentry->d_inode != NULL) {
-			d_type = dt_type(dentry->d_inode);
-			fileid = NFS_FILEID(dentry->d_inode);
-		}
+	array = nfs_readdir_get_array(desc->page);
 
-		res = filldir(dirent, entry->name, entry->len, 
-			      file->f_pos, nfs_compat_user_ino64(fileid),
-			      d_type);
+	for (i = desc->cache_entry_index; i < array->size; i++) {
+		d_type = DT_UNKNOWN;
+
+		res = filldir(dirent, array->array[i].string.name,
+			array->array[i].string.len, file->f_pos,
+			nfs_compat_user_ino64(array->array[i].ino), d_type);
 		if (res < 0)
 			break;
 		file->f_pos++;
-		*desc->dir_cookie = entry->cookie;
-		if (dir_decode(desc) != 0) {
-			desc->page_index ++;
+		desc->cache_entry_index = i;
+		if (i < (array->size-1))
+			*desc->dir_cookie = array->array[i+1].cookie;
+		else
+			*desc->dir_cookie = array->last_cookie;
+		if (i == array->eof_index) {
+			desc->eof = 1;
 			break;
 		}
-		if (loop_count++ > 200) {
-			loop_count = 0;
-			schedule();
-		}
 	}
-	dir_page_release(desc);
+
+	nfs_readdir_release_array(desc->page);
+	cache_page_release(desc);
 	if (dentry != NULL)
 		dput(dentry);
 	dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n",
@@ -469,12 +716,9 @@ static inline
 int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
 		     filldir_t filldir)
 {
-	struct file	*file = desc->file;
-	struct inode	*inode = file->f_path.dentry->d_inode;
-	struct rpc_cred	*cred = nfs_file_cred(file);
 	struct page	*page = NULL;
 	int		status;
-	unsigned long	timestamp, gencount;
+	struct inode *inode = desc->file->f_path.dentry->d_inode;
 
 	dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n",
 			(unsigned long long)*desc->dir_cookie);
@@ -484,38 +728,22 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
 		status = -ENOMEM;
 		goto out;
 	}
-	timestamp = jiffies;
-	gencount = nfs_inc_attr_generation_counter();
-	status = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred,
-						*desc->dir_cookie, page,
-						NFS_SERVER(inode)->dtsize,
-						desc->plus);
-	desc->page = page;
-	desc->ptr = kmap(page);		/* matching kunmap in nfs_do_filldir */
-	if (status >= 0) {
-		desc->timestamp = timestamp;
-		desc->gencount = gencount;
-		desc->timestamp_valid = 1;
-		if ((status = dir_decode(desc)) == 0)
-			desc->entry->prev_cookie = *desc->dir_cookie;
-	} else
+
+	if (nfs_readdir_xdr_to_array(desc, page, inode) == -1) {
 		status = -EIO;
-	if (status < 0)
 		goto out_release;
+	}
 
+	desc->page_index = 0;
+	desc->page = page;
 	status = nfs_do_filldir(desc, dirent, filldir);
 
-	/* Reset read descriptor so it searches the page cache from
-	 * the start upon the next call to readdir_search_pagecache() */
-	desc->page_index = 0;
-	desc->entry->cookie = desc->entry->prev_cookie = 0;
-	desc->entry->eof = 0;
  out:
 	dfprintk(DIRCACHE, "NFS: %s: returns %d\n",
 			__func__, status);
 	return status;
  out_release:
-	dir_page_release(desc);
+	cache_page_release(desc);
 	goto out;
 }
 
@@ -529,7 +757,6 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	struct inode	*inode = dentry->d_inode;
 	nfs_readdir_descriptor_t my_desc,
 			*desc = &my_desc;
-	struct nfs_entry my_entry;
 	int res = -ENOMEM;
 
 	dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n",
@@ -550,26 +777,17 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	desc->decode = NFS_PROTO(inode)->decode_dirent;
 	desc->plus = NFS_USE_READDIRPLUS(inode);
 
-	my_entry.cookie = my_entry.prev_cookie = 0;
-	my_entry.eof = 0;
-	my_entry.fh = nfs_alloc_fhandle();
-	my_entry.fattr = nfs_alloc_fattr();
-	if (my_entry.fh == NULL || my_entry.fattr == NULL)
-		goto out_alloc_failed;
-
-	desc->entry = &my_entry;
-
 	nfs_block_sillyrename(dentry);
 	res = nfs_revalidate_mapping(inode, filp->f_mapping);
 	if (res < 0)
 		goto out;
 
-	while(!desc->entry->eof) {
+	while (desc->eof != 1) {
 		res = readdir_search_pagecache(desc);
 
 		if (res == -EBADCOOKIE) {
 			/* This means either end of directory */
-			if (*desc->dir_cookie && desc->entry->cookie != *desc->dir_cookie) {
+			if (*desc->dir_cookie && desc->eof == 0) {
 				/* Or that the server has 'lost' a cookie */
 				res = uncached_readdir(desc, dirent, filldir);
 				if (res >= 0)
@@ -581,8 +799,9 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		if (res == -ETOOSMALL && desc->plus) {
 			clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
 			nfs_zap_caches(inode);
+			desc->page_index = 0;
 			desc->plus = 0;
-			desc->entry->eof = 0;
+			desc->eof = 0;
 			continue;
 		}
 		if (res < 0)
@@ -598,9 +817,6 @@ out:
 	nfs_unblock_sillyrename(dentry);
 	if (res > 0)
 		res = 0;
-out_alloc_failed:
-	nfs_free_fattr(my_entry.fattr);
-	nfs_free_fhandle(my_entry.fh);
 	dfprintk(FILE, "NFS: readdir(%s/%s) returns %d\n",
 			dentry->d_parent->d_name.name, dentry->d_name.name,
 			res);
@@ -1022,10 +1238,63 @@ static int is_atomic_open(struct nameidata *nd)
 	return 1;
 }
 
+static struct nfs_open_context *nameidata_to_nfs_open_context(struct dentry *dentry, struct nameidata *nd)
+{
+	struct path path = {
+		.mnt = nd->path.mnt,
+		.dentry = dentry,
+	};
+	struct nfs_open_context *ctx;
+	struct rpc_cred *cred;
+	fmode_t fmode = nd->intent.open.flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
+
+	cred = rpc_lookup_cred();
+	if (IS_ERR(cred))
+		return ERR_CAST(cred);
+	ctx = alloc_nfs_open_context(&path, cred, fmode);
+	put_rpccred(cred);
+	if (ctx == NULL)
+		return ERR_PTR(-ENOMEM);
+	return ctx;
+}
+
+static int do_open(struct inode *inode, struct file *filp)
+{
+	nfs_fscache_set_inode_cookie(inode, filp);
+	return 0;
+}
+
+static int nfs_intent_set_file(struct nameidata *nd, struct nfs_open_context *ctx)
+{
+	struct file *filp;
+	int ret = 0;
+
+	/* If the open_intent is for execute, we have an extra check to make */
+	if (ctx->mode & FMODE_EXEC) {
+		ret = nfs_may_open(ctx->path.dentry->d_inode,
+				ctx->cred,
+				nd->intent.open.flags);
+		if (ret < 0)
+			goto out;
+	}
+	filp = lookup_instantiate_filp(nd, ctx->path.dentry, do_open);
+	if (IS_ERR(filp))
+		ret = PTR_ERR(filp);
+	else
+		nfs_file_set_open_context(filp, ctx);
+out:
+	put_nfs_open_context(ctx);
+	return ret;
+}
+
 static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 {
+	struct nfs_open_context *ctx;
+	struct iattr attr;
 	struct dentry *res = NULL;
-	int error;
+	struct inode *inode;
+	int open_flags;
+	int err;
 
 	dfprintk(VFS, "NFS: atomic_lookup(%s/%ld), %s\n",
 			dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
@@ -1047,13 +1316,32 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
 		goto out;
 	}
 
+	ctx = nameidata_to_nfs_open_context(dentry, nd);
+	res = ERR_CAST(ctx);
+	if (IS_ERR(ctx))
+		goto out;
+
+	open_flags = nd->intent.open.flags;
+	if (nd->flags & LOOKUP_CREATE) {
+		attr.ia_mode = nd->intent.open.create_mode;
+		attr.ia_valid = ATTR_MODE;
+		if (!IS_POSIXACL(dir))
+			attr.ia_mode &= ~current_umask();
+	} else {
+		open_flags &= ~(O_EXCL | O_CREAT);
+		attr.ia_valid = 0;
+	}
+
 	/* Open the file on the server */
-	res = nfs4_atomic_open(dir, dentry, nd);
-	if (IS_ERR(res)) {
-		error = PTR_ERR(res);
-		switch (error) {
+	nfs_block_sillyrename(dentry->d_parent);
+	inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr);
+	if (IS_ERR(inode)) {
+		nfs_unblock_sillyrename(dentry->d_parent);
+		put_nfs_open_context(ctx);
+		switch (PTR_ERR(inode)) {
 			/* Make a negative dentry */
 			case -ENOENT:
+				d_add(dentry, NULL);
 				res = NULL;
 				goto out;
 			/* This turned out not to be a regular file */
@@ -1065,11 +1353,25 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
 					goto no_open;
 			/* case -EINVAL: */
 			default:
+				res = ERR_CAST(inode);
 				goto out;
 		}
-	} else if (res != NULL)
+	}
+	res = d_add_unique(dentry, inode);
+	nfs_unblock_sillyrename(dentry->d_parent);
+	if (res != NULL) {
+		dput(ctx->path.dentry);
+		ctx->path.dentry = dget(res);
 		dentry = res;
+	}
+	err = nfs_intent_set_file(nd, ctx);
+	if (err < 0) {
+		if (res != NULL)
+			dput(res);
+		return ERR_PTR(err);
+	}
 out:
+	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
 	return res;
 no_open:
 	return nfs_lookup(dir, dentry, nd);
@@ -1080,12 +1382,15 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
 	struct dentry *parent = NULL;
 	struct inode *inode = dentry->d_inode;
 	struct inode *dir;
+	struct nfs_open_context *ctx;
 	int openflags, ret = 0;
 
 	if (!is_atomic_open(nd) || d_mountpoint(dentry))
 		goto no_open;
+
 	parent = dget_parent(dentry);
 	dir = parent->d_inode;
+
 	/* We can't create new files in nfs_open_revalidate(), so we
 	 * optimize away revalidation of negative dentries.
 	 */
@@ -1103,101 +1408,98 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
 	if ((openflags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL))
 		goto no_open_dput;
 	/* We can't create new files, or truncate existing ones here */
-	openflags &= ~(O_CREAT|O_TRUNC);
+	openflags &= ~(O_CREAT|O_EXCL|O_TRUNC);
 
+	ctx = nameidata_to_nfs_open_context(dentry, nd);
+	ret = PTR_ERR(ctx);
+	if (IS_ERR(ctx))
+		goto out;
 	/*
 	 * Note: we're not holding inode->i_mutex and so may be racing with
 	 * operations that change the directory. We therefore save the
 	 * change attribute *before* we do the RPC call.
 	 */
-	ret = nfs4_open_revalidate(dir, dentry, openflags, nd);
+	inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, NULL);
+	if (IS_ERR(inode)) {
+		ret = PTR_ERR(inode);
+		switch (ret) {
+		case -EPERM:
+		case -EACCES:
+		case -EDQUOT:
+		case -ENOSPC:
+		case -EROFS:
+			goto out_put_ctx;
+		default:
+			goto out_drop;
+		}
+	}
+	iput(inode);
+	if (inode != dentry->d_inode)
+		goto out_drop;
+
+	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+	ret = nfs_intent_set_file(nd, ctx);
+	if (ret >= 0)
+		ret = 1;
 out:
 	dput(parent);
-	if (!ret)
-		d_drop(dentry);
 	return ret;
+out_drop:
+	d_drop(dentry);
+	ret = 0;
+out_put_ctx:
+	put_nfs_open_context(ctx);
+	goto out;
+
 no_open_dput:
 	dput(parent);
 no_open:
 	return nfs_lookup_revalidate(dentry, nd);
 }
-#endif /* CONFIG_NFSV4 */
 
-static struct dentry *nfs_readdir_lookup(nfs_readdir_descriptor_t *desc)
+static int nfs_open_create(struct inode *dir, struct dentry *dentry, int mode,
+		struct nameidata *nd)
 {
-	struct dentry *parent = desc->file->f_path.dentry;
-	struct inode *dir = parent->d_inode;
-	struct nfs_entry *entry = desc->entry;
-	struct dentry *dentry, *alias;
-	struct qstr name = {
-		.name = entry->name,
-		.len = entry->len,
-	};
-	struct inode *inode;
-	unsigned long verf = nfs_save_change_attribute(dir);
+	struct nfs_open_context *ctx = NULL;
+	struct iattr attr;
+	int error;
+	int open_flags = 0;
 
-	switch (name.len) {
-		case 2:
-			if (name.name[0] == '.' && name.name[1] == '.')
-				return dget_parent(parent);
-			break;
-		case 1:
-			if (name.name[0] == '.')
-				return dget(parent);
-	}
+	dfprintk(VFS, "NFS: create(%s/%ld), %s\n",
+			dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
 
-	spin_lock(&dir->i_lock);
-	if (NFS_I(dir)->cache_validity & NFS_INO_INVALID_DATA) {
-		spin_unlock(&dir->i_lock);
-		return NULL;
-	}
-	spin_unlock(&dir->i_lock);
+	attr.ia_mode = mode;
+	attr.ia_valid = ATTR_MODE;
 
-	name.hash = full_name_hash(name.name, name.len);
-	dentry = d_lookup(parent, &name);
-	if (dentry != NULL) {
-		/* Is this a positive dentry that matches the readdir info? */
-		if (dentry->d_inode != NULL &&
-				(NFS_FILEID(dentry->d_inode) == entry->ino ||
-				d_mountpoint(dentry))) {
-			if (!desc->plus || entry->fh->size == 0)
-				return dentry;
-			if (nfs_compare_fh(NFS_FH(dentry->d_inode),
-						entry->fh) == 0)
-				goto out_renew;
-		}
-		/* No, so d_drop to allow one to be created */
-		d_drop(dentry);
-		dput(dentry);
-	}
-	if (!desc->plus || !(entry->fattr->valid & NFS_ATTR_FATTR))
-		return NULL;
-	if (name.len > NFS_SERVER(dir)->namelen)
-		return NULL;
-	/* Note: caller is already holding the dir->i_mutex! */
-	dentry = d_alloc(parent, &name);
-	if (dentry == NULL)
-		return NULL;
-	dentry->d_op = NFS_PROTO(dir)->dentry_ops;
-	inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr);
-	if (IS_ERR(inode)) {
-		dput(dentry);
-		return NULL;
-	}
+	if ((nd->flags & LOOKUP_CREATE) != 0) {
+		open_flags = nd->intent.open.flags;
 
-	alias = d_materialise_unique(dentry, inode);
-	if (alias != NULL) {
-		dput(dentry);
-		if (IS_ERR(alias))
-			return NULL;
-		dentry = alias;
+		ctx = nameidata_to_nfs_open_context(dentry, nd);
+		error = PTR_ERR(ctx);
+		if (IS_ERR(ctx))
+			goto out_err_drop;
 	}
 
-out_renew:
-	nfs_set_verifier(dentry, verf);
-	return dentry;
+	error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, ctx);
+	if (error != 0)
+		goto out_put_ctx;
+	if (ctx != NULL) {
+		error = nfs_intent_set_file(nd, ctx);
+		if (error < 0)
+			goto out_err;
+	}
+	return 0;
+out_put_ctx:
+	if (ctx != NULL)
+		put_nfs_open_context(ctx);
+out_err_drop:
+	d_drop(dentry);
+out_err:
+	return error;
 }
 
+#endif /* CONFIG_NFSV4 */
+
 /*
  * Code common to create, mkdir, and mknod.
  */
@@ -1251,7 +1553,6 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode,
 {
 	struct iattr attr;
 	int error;
-	int open_flags = 0;
 
 	dfprintk(VFS, "NFS: create(%s/%ld), %s\n",
 			dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
@@ -1259,10 +1560,7 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode,
 	attr.ia_mode = mode;
 	attr.ia_valid = ATTR_MODE;
 
-	if ((nd->flags & LOOKUP_CREATE) != 0)
-		open_flags = nd->intent.open.flags;
-
-	error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, nd);
+	error = NFS_PROTO(dir)->create(dir, dentry, &attr, 0, NULL);
 	if (error != 0)
 		goto out_err;
 	return 0;
@@ -1344,76 +1642,6 @@ static int nfs_rmdir(struct inode *dir, struct dentry *dentry)
 	return error;
 }
 
-static int nfs_sillyrename(struct inode *dir, struct dentry *dentry)
-{
-	static unsigned int sillycounter;
-	const int      fileidsize  = sizeof(NFS_FILEID(dentry->d_inode))*2;
-	const int      countersize = sizeof(sillycounter)*2;
-	const int      slen        = sizeof(".nfs")+fileidsize+countersize-1;
-	char           silly[slen+1];
-	struct qstr    qsilly;
-	struct dentry *sdentry;
-	int            error = -EIO;
-
-	dfprintk(VFS, "NFS: silly-rename(%s/%s, ct=%d)\n",
-		dentry->d_parent->d_name.name, dentry->d_name.name, 
-		atomic_read(&dentry->d_count));
-	nfs_inc_stats(dir, NFSIOS_SILLYRENAME);
-
-	/*
-	 * We don't allow a dentry to be silly-renamed twice.
-	 */
-	error = -EBUSY;
-	if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
-		goto out;
-
-	sprintf(silly, ".nfs%*.*Lx",
-		fileidsize, fileidsize,
-		(unsigned long long)NFS_FILEID(dentry->d_inode));
-
-	/* Return delegation in anticipation of the rename */
-	nfs_inode_return_delegation(dentry->d_inode);
-
-	sdentry = NULL;
-	do {
-		char *suffix = silly + slen - countersize;
-
-		dput(sdentry);
-		sillycounter++;
-		sprintf(suffix, "%*.*x", countersize, countersize, sillycounter);
-
-		dfprintk(VFS, "NFS: trying to rename %s to %s\n",
-				dentry->d_name.name, silly);
-		
-		sdentry = lookup_one_len(silly, dentry->d_parent, slen);
-		/*
-		 * N.B. Better to return EBUSY here ... it could be
-		 * dangerous to delete the file while it's in use.
-		 */
-		if (IS_ERR(sdentry))
-			goto out;
-	} while(sdentry->d_inode != NULL); /* need negative lookup */
-
-	qsilly.name = silly;
-	qsilly.len  = strlen(silly);
-	if (dentry->d_inode) {
-		error = NFS_PROTO(dir)->rename(dir, &dentry->d_name,
-				dir, &qsilly);
-		nfs_mark_for_revalidate(dentry->d_inode);
-	} else
-		error = NFS_PROTO(dir)->rename(dir, &dentry->d_name,
-				dir, &qsilly);
-	if (!error) {
-		nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
-		d_move(dentry, sdentry);
-		error = nfs_async_unlink(dir, dentry);
- 		/* If we return 0 we don't unlink */
-	}
-	dput(sdentry);
-out:
-	return error;
-}
-
 /*
  * Remove a file after making sure there are no pending writes,
  * and after checking that the file has only one user. 
@@ -1573,7 +1801,7 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
 	d_drop(dentry);
 	error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name);
 	if (error == 0) {
-		atomic_inc(&inode->i_count);
+		ihold(inode);
 		d_add(dentry, inode);
 	}
 	return error;
@@ -1652,16 +1880,7 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		}
 	}
 
-	/*
-	 * ... prune child dentries and writebacks if needed.
-	 */
-	if (atomic_read(&old_dentry->d_count) > 1) {
-		if (S_ISREG(old_inode->i_mode))
-			nfs_wb_all(old_inode);
-		shrink_dcache_parent(old_dentry);
-	}
 	nfs_inode_return_delegation(old_inode);
-
 	if (new_inode != NULL)
 		nfs_inode_return_delegation(new_inode);
 
@@ -1713,14 +1932,14 @@ static void nfs_access_free_list(struct list_head *head)
 int nfs_access_cache_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
 {
 	LIST_HEAD(head);
-	struct nfs_inode *nfsi;
+	struct nfs_inode *nfsi, *next;
 	struct nfs_access_entry *cache;
 
 	if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL)
 		return (nr_to_scan == 0) ? 0 : -1;
 
 	spin_lock(&nfs_access_lru_lock);
-	list_for_each_entry(nfsi, &nfs_access_lru_list, access_cache_inode_lru) {
+	list_for_each_entry_safe(nfsi, next, &nfs_access_lru_list, access_cache_inode_lru) {
 		struct inode *inode;
 
 		if (nr_to_scan-- == 0)
@@ -1953,7 +2172,7 @@ int nfs_permission(struct inode *inode, int mask)
 	if ((mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
 		goto out;
 	/* Is this sys_access() ? */
-	if (mask & MAY_ACCESS)
+	if (mask & (MAY_ACCESS | MAY_CHDIR))
 		goto force_lookup;
 
 	switch (inode->i_mode & S_IFMT) {
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index ad4cd31d605..84d3c8b9020 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -69,6 +69,7 @@ struct nfs_direct_req {
 
 	/* I/O parameters */
 	struct nfs_open_context	*ctx;		/* file open context info */
+	struct nfs_lock_context *l_ctx;		/* Lock context info */
 	struct kiocb *		iocb;		/* controlling i/o request */
 	struct inode *		inode;		/* target file of i/o */
 
@@ -160,6 +161,7 @@ static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
 	INIT_LIST_HEAD(&dreq->rewrite_list);
 	dreq->iocb = NULL;
 	dreq->ctx = NULL;
+	dreq->l_ctx = NULL;
 	spin_lock_init(&dreq->lock);
 	atomic_set(&dreq->io_count, 0);
 	dreq->count = 0;
@@ -173,6 +175,8 @@ static void nfs_direct_req_free(struct kref *kref)
 {
 	struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref);
 
+	if (dreq->l_ctx != NULL)
+		nfs_put_lock_context(dreq->l_ctx);
 	if (dreq->ctx != NULL)
 		put_nfs_open_context(dreq->ctx);
 	kmem_cache_free(nfs_direct_cachep, dreq);
@@ -336,6 +340,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
 		data->cred = msg.rpc_cred;
 		data->args.fh = NFS_FH(inode);
 		data->args.context = ctx;
+		data->args.lock_context = dreq->l_ctx;
 		data->args.offset = pos;
 		data->args.pgbase = pgbase;
 		data->args.pages = data->pagevec;
@@ -416,24 +421,28 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
 static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov,
 			       unsigned long nr_segs, loff_t pos)
 {
-	ssize_t result = 0;
+	ssize_t result = -ENOMEM;
 	struct inode *inode = iocb->ki_filp->f_mapping->host;
 	struct nfs_direct_req *dreq;
 
 	dreq = nfs_direct_req_alloc();
-	if (!dreq)
-		return -ENOMEM;
+	if (dreq == NULL)
+		goto out;
 
 	dreq->inode = inode;
 	dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
+	dreq->l_ctx = nfs_get_lock_context(dreq->ctx);
+	if (dreq->l_ctx == NULL)
+		goto out_release;
 	if (!is_sync_kiocb(iocb))
 		dreq->iocb = iocb;
 
 	result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos);
 	if (!result)
 		result = nfs_direct_wait(dreq);
+out_release:
 	nfs_direct_req_release(dreq);
-
+out:
 	return result;
 }
 
@@ -574,6 +583,7 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
 	data->args.offset = 0;
 	data->args.count = 0;
 	data->args.context = dreq->ctx;
+	data->args.lock_context = dreq->l_ctx;
 	data->res.count = 0;
 	data->res.fattr = &data->fattr;
 	data->res.verf = &data->verf;
@@ -761,6 +771,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
 		data->cred = msg.rpc_cred;
 		data->args.fh = NFS_FH(inode);
 		data->args.context = ctx;
+		data->args.lock_context = dreq->l_ctx;
 		data->args.offset = pos;
 		data->args.pgbase = pgbase;
 		data->args.pages = data->pagevec;
@@ -845,7 +856,7 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
 				unsigned long nr_segs, loff_t pos,
 				size_t count)
 {
-	ssize_t result = 0;
+	ssize_t result = -ENOMEM;
 	struct inode *inode = iocb->ki_filp->f_mapping->host;
 	struct nfs_direct_req *dreq;
 	size_t wsize = NFS_SERVER(inode)->wsize;
@@ -853,7 +864,7 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
 
 	dreq = nfs_direct_req_alloc();
 	if (!dreq)
-		return -ENOMEM;
+		goto out;
 	nfs_alloc_commit_data(dreq);
 
 	if (dreq->commit_data == NULL || count < wsize)
@@ -861,14 +872,18 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
 
 	dreq->inode = inode;
 	dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
+	dreq->l_ctx = nfs_get_lock_context(dreq->ctx);
+	if (dreq->l_ctx == NULL)
+		goto out_release;
 	if (!is_sync_kiocb(iocb))
 		dreq->iocb = iocb;
 
 	result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, sync);
 	if (!result)
 		result = nfs_direct_wait(dreq);
+out_release:
 	nfs_direct_req_release(dreq);
-
+out:
 	return result;
 }
 
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
index 76fd235d002..a6e711ad130 100644
--- a/fs/nfs/dns_resolve.c
+++ b/fs/nfs/dns_resolve.c
@@ -6,6 +6,29 @@
  * Resolves DNS hostnames into valid ip addresses
  */
 
+#ifdef CONFIG_NFS_USE_KERNEL_DNS
+
+#include <linux/sunrpc/clnt.h>
+#include <linux/dns_resolver.h>
+
+ssize_t nfs_dns_resolve_name(char *name, size_t namelen,
+		struct sockaddr *sa, size_t salen)
+{
+	ssize_t ret;
+	char *ip_addr = NULL;
+	int ip_len;
+
+	ip_len = dns_query(NULL, name, namelen, NULL, &ip_addr, NULL);
+	if (ip_len > 0)
+		ret = rpc_pton(ip_addr, ip_len, sa, salen);
+	else
+		ret = -ESRCH;
+	kfree(ip_addr);
+	return ret;
+}
+
+#else
+
 #include <linux/hash.h>
 #include <linux/string.h>
 #include <linux/kmod.h>
@@ -144,7 +167,7 @@ static int nfs_dns_show(struct seq_file *m, struct cache_detail *cd,
 		return 0;
 	}
 	item = container_of(h, struct nfs_dns_ent, h);
-	ttl = (long)item->h.expiry_time - (long)get_seconds();
+	ttl = item->h.expiry_time - seconds_since_boot();
 	if (ttl < 0)
 		ttl = 0;
 
@@ -216,7 +239,7 @@ static int nfs_dns_parse(struct cache_detail *cd, char *buf, int buflen)
 	ttl = get_expiry(&buf);
 	if (ttl == 0)
 		goto out;
-	key.h.expiry_time = ttl + get_seconds();
+	key.h.expiry_time = ttl + seconds_since_boot();
 
 	ret = -ENOMEM;
 	item = nfs_dns_lookup(cd, &key);
@@ -278,7 +301,7 @@ static int do_cache_lookup_nowait(struct cache_detail *cd,
 		goto out_err;
 	ret = -ETIMEDOUT;
 	if (!test_bit(CACHE_VALID, &(*item)->h.flags)
-			|| (*item)->h.expiry_time < get_seconds()
+			|| (*item)->h.expiry_time < seconds_since_boot()
 			|| cd->flush_time > (*item)->h.last_refresh)
 		goto out_put;
 	ret = -ENOENT;
@@ -346,3 +369,4 @@ void nfs_dns_resolver_destroy(void)
 	nfs_cache_unregister(&nfs_dns_resolve);
 }
 
+#endif
diff --git a/fs/nfs/dns_resolve.h b/fs/nfs/dns_resolve.h
index a3f0938babf..199bb5543a9 100644
--- a/fs/nfs/dns_resolve.h
+++ b/fs/nfs/dns_resolve.h
@@ -6,8 +6,20 @@
 
 #define NFS_DNS_HOSTNAME_MAXLEN	(128)
 
+
+#ifdef CONFIG_NFS_USE_KERNEL_DNS
+static inline int nfs_dns_resolver_init(void)
+{
+	return 0;
+}
+
+static inline void nfs_dns_resolver_destroy(void)
+{}
+#else
 extern int nfs_dns_resolver_init(void);
 extern void nfs_dns_resolver_destroy(void);
+#endif
+
 extern ssize_t nfs_dns_resolve_name(char *name, size_t namelen,
 		struct sockaddr *sa, size_t salen);
 
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index f036153d9f5..60677f9f131 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -36,6 +36,7 @@
 #include "internal.h"
 #include "iostat.h"
 #include "fscache.h"
+#include "pnfs.h"
 
 #define NFSDBG_FACILITY		NFSDBG_FILE
 
@@ -203,37 +204,11 @@ static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin)
 }
 
 /*
- * Helper for nfs_file_flush() and nfs_file_fsync()
- *
- * Notice that it clears the NFS_CONTEXT_ERROR_WRITE before synching to
- * disk, but it retrieves and clears ctx->error after synching, despite
- * the two being set at the same time in nfs_context_set_write_error().
- * This is because the former is used to notify the _next_ call to
- * nfs_file_write() that a write error occured, and hence cause it to
- * fall back to doing a synchronous write.
- */
-static int nfs_do_fsync(struct nfs_open_context *ctx, struct inode *inode)
-{
-	int have_error, status;
-	int ret = 0;
-
-	have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
-	status = nfs_wb_all(inode);
-	have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
-	if (have_error)
-		ret = xchg(&ctx->error, 0);
-	if (!ret)
-		ret = status;
-	return ret;
-}
-
-/*
  * Flush all dirty pages, and check for write errors.
  */
 static int
 nfs_file_flush(struct file *file, fl_owner_t id)
 {
-	struct nfs_open_context *ctx = nfs_file_open_context(file);
 	struct dentry	*dentry = file->f_path.dentry;
 	struct inode	*inode = dentry->d_inode;
 
@@ -246,7 +221,7 @@ nfs_file_flush(struct file *file, fl_owner_t id)
 		return 0;
 
 	/* Flush writes to the server and return any errors */
-	return nfs_do_fsync(ctx, inode);
+	return vfs_fsync(file, 0);
 }
 
 static ssize_t
@@ -321,6 +296,13 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma)
  * Flush any dirty pages for this process, and check for write errors.
  * The return status from this call provides a reliable indication of
  * whether any write errors occurred for this process.
+ *
+ * Notice that it clears the NFS_CONTEXT_ERROR_WRITE before synching to
+ * disk, but it retrieves and clears ctx->error after synching, despite
+ * the two being set at the same time in nfs_context_set_write_error().
+ * This is because the former is used to notify the _next_ call to
+ * nfs_file_write() that a write error occured, and hence cause it to
+ * fall back to doing a synchronous write.
  */
 static int
 nfs_file_fsync(struct file *file, int datasync)
@@ -328,13 +310,23 @@ nfs_file_fsync(struct file *file, int datasync)
 	struct dentry *dentry = file->f_path.dentry;
 	struct nfs_open_context *ctx = nfs_file_open_context(file);
 	struct inode *inode = dentry->d_inode;
+	int have_error, status;
+	int ret = 0;
+
 
 	dprintk("NFS: fsync file(%s/%s) datasync %d\n",
 			dentry->d_parent->d_name.name, dentry->d_name.name,
 			datasync);
 
 	nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
-	return nfs_do_fsync(ctx, inode);
+	have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
+	status = nfs_commit_inode(inode, FLUSH_SYNC);
+	have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
+	if (have_error)
+		ret = xchg(&ctx->error, 0);
+	if (!ret && status < 0)
+		ret = status;
+	return ret;
 }
 
 /*
@@ -395,6 +387,10 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
 		file->f_path.dentry->d_name.name,
 		mapping->host->i_ino, len, (long long) pos);
 
+	pnfs_update_layout(mapping->host,
+			   nfs_file_open_context(file),
+			   IOMODE_RW);
+
 start:
 	/*
 	 * Prevent starvation issues if someone is doing a consistency
@@ -560,7 +556,7 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	struct file *filp = vma->vm_file;
 	struct dentry *dentry = filp->f_path.dentry;
 	unsigned pagelen;
-	int ret = -EINVAL;
+	int ret = VM_FAULT_NOPAGE;
 	struct address_space *mapping;
 
 	dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%s/%s(%ld), offset %lld)\n",
@@ -576,21 +572,20 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	if (mapping != dentry->d_inode->i_mapping)
 		goto out_unlock;
 
-	ret = 0;
 	pagelen = nfs_page_length(page);
 	if (pagelen == 0)
 		goto out_unlock;
 
-	ret = nfs_flush_incompatible(filp, page);
-	if (ret != 0)
-		goto out_unlock;
+	ret = VM_FAULT_LOCKED;
+	if (nfs_flush_incompatible(filp, page) == 0 &&
+	    nfs_updatepage(filp, page, 0, pagelen) == 0)
+		goto out;
 
-	ret = nfs_updatepage(filp, page, 0, pagelen);
+	ret = VM_FAULT_SIGBUS;
 out_unlock:
-	if (!ret)
-		return VM_FAULT_LOCKED;
 	unlock_page(page);
-	return VM_FAULT_SIGBUS;
+out:
+	return ret;
 }
 
 static const struct vm_operations_struct nfs_file_vm_ops = {
@@ -648,7 +643,7 @@ static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
 
 	/* Return error values for O_DSYNC and IS_SYNC() */
 	if (result >= 0 && nfs_need_sync_write(iocb->ki_filp, inode)) {
-		int err = nfs_do_fsync(nfs_file_open_context(iocb->ki_filp), inode);
+		int err = vfs_fsync(iocb->ki_filp, 0);
 		if (err < 0)
 			result = err;
 	}
@@ -684,7 +679,7 @@ static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe,
 		written = ret;
 
 	if (ret >= 0 && nfs_need_sync_write(filp, inode)) {
-		int err = nfs_do_fsync(nfs_file_open_context(filp), inode);
+		int err = vfs_fsync(filp, 0);
 		if (err < 0)
 			ret = err;
 	}
@@ -693,7 +688,8 @@ static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe,
 	return ret;
 }
 
-static int do_getlk(struct file *filp, int cmd, struct file_lock *fl)
+static int
+do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
 {
 	struct inode *inode = filp->f_mapping->host;
 	int status = 0;
@@ -708,7 +704,7 @@ static int do_getlk(struct file *filp, int cmd, struct file_lock *fl)
 	if (nfs_have_delegation(inode, FMODE_READ))
 		goto out_noconflict;
 
-	if (NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM)
+	if (is_local)
 		goto out_noconflict;
 
 	status = NFS_PROTO(inode)->lock(filp, cmd, fl);
@@ -732,14 +728,11 @@ static int do_vfs_lock(struct file *file, struct file_lock *fl)
 		default:
 			BUG();
 	}
-	if (res < 0)
-		dprintk(KERN_WARNING "%s: VFS is out of sync with lock manager"
-			" - error %d!\n",
-				__func__, res);
 	return res;
 }
 
-static int do_unlk(struct file *filp, int cmd, struct file_lock *fl)
+static int
+do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
 {
 	struct inode *inode = filp->f_mapping->host;
 	int status;
@@ -754,15 +747,24 @@ static int do_unlk(struct file *filp, int cmd, struct file_lock *fl)
 	 * 	If we're signalled while cleaning up locks on process exit, we
 	 * 	still need to complete the unlock.
 	 */
-	/* Use local locking if mounted with "-onolock" */
-	if (!(NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM))
+	/*
+	 * Use local locking if mounted with "-onolock" or with appropriate
+	 * "-olocal_lock="
+	 */
+	if (!is_local)
 		status = NFS_PROTO(inode)->lock(filp, cmd, fl);
 	else
 		status = do_vfs_lock(filp, fl);
 	return status;
 }
 
-static int do_setlk(struct file *filp, int cmd, struct file_lock *fl)
+static int
+is_time_granular(struct timespec *ts) {
+	return ((ts->tv_sec == 0) && (ts->tv_nsec <= 1000));
+}
+
+static int
+do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
 {
 	struct inode *inode = filp->f_mapping->host;
 	int status;
@@ -775,20 +777,31 @@ static int do_setlk(struct file *filp, int cmd, struct file_lock *fl)
 	if (status != 0)
 		goto out;
 
-	/* Use local locking if mounted with "-onolock" */
-	if (!(NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM))
+	/*
+	 * Use local locking if mounted with "-onolock" or with appropriate
+	 * "-olocal_lock="
+	 */
+	if (!is_local)
 		status = NFS_PROTO(inode)->lock(filp, cmd, fl);
 	else
 		status = do_vfs_lock(filp, fl);
 	if (status < 0)
 		goto out;
+
 	/*
-	 * Make sure we clear the cache whenever we try to get the lock.
+	 * Revalidate the cache if the server has time stamps granular
+	 * enough to detect subsecond changes.  Otherwise, clear the
+	 * cache to prevent missing any changes.
+	 *
 	 * This makes locking act as a cache coherency point.
 	 */
 	nfs_sync_mapping(filp->f_mapping);
-	if (!nfs_have_delegation(inode, FMODE_READ))
-		nfs_zap_caches(inode);
+	if (!nfs_have_delegation(inode, FMODE_READ)) {
+		if (is_time_granular(&NFS_SERVER(inode)->time_delta))
+			__nfs_revalidate_inode(NFS_SERVER(inode), inode);
+		else
+			nfs_zap_caches(inode);
+	}
 out:
 	return status;
 }
@@ -800,6 +813,7 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl)
 {
 	struct inode *inode = filp->f_mapping->host;
 	int ret = -ENOLCK;
+	int is_local = 0;
 
 	dprintk("NFS: lock(%s/%s, t=%x, fl=%x, r=%lld:%lld)\n",
 			filp->f_path.dentry->d_parent->d_name.name,
@@ -813,6 +827,9 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl)
 	if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
 		goto out_err;
 
+	if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FCNTL)
+		is_local = 1;
+
 	if (NFS_PROTO(inode)->lock_check_bounds != NULL) {
 		ret = NFS_PROTO(inode)->lock_check_bounds(fl);
 		if (ret < 0)
@@ -820,11 +837,11 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl)
 	}
 
 	if (IS_GETLK(cmd))
-		ret = do_getlk(filp, cmd, fl);
+		ret = do_getlk(filp, cmd, fl, is_local);
 	else if (fl->fl_type == F_UNLCK)
-		ret = do_unlk(filp, cmd, fl);
+		ret = do_unlk(filp, cmd, fl, is_local);
 	else
-		ret = do_setlk(filp, cmd, fl);
+		ret = do_setlk(filp, cmd, fl, is_local);
 out_err:
 	return ret;
 }
@@ -834,6 +851,9 @@ out_err:
  */
 static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
 {
+	struct inode *inode = filp->f_mapping->host;
+	int is_local = 0;
+
 	dprintk("NFS: flock(%s/%s, t=%x, fl=%x)\n",
 			filp->f_path.dentry->d_parent->d_name.name,
 			filp->f_path.dentry->d_name.name,
@@ -842,14 +862,17 @@ static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
 	if (!(fl->fl_flags & FL_FLOCK))
 		return -ENOLCK;
 
+	if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FLOCK)
+		is_local = 1;
+
 	/* We're simulating flock() locks using posix locks on the server */
 	fl->fl_owner = (fl_owner_t)filp;
 	fl->fl_start = 0;
 	fl->fl_end = OFFSET_MAX;
 
 	if (fl->fl_type == F_UNLCK)
-		return do_unlk(filp, cmd, fl);
-	return do_setlk(filp, cmd, fl);
+		return do_unlk(filp, cmd, fl, is_local);
+	return do_setlk(filp, cmd, fl, is_local);
 }
 
 /*
@@ -861,6 +884,5 @@ static int nfs_setlease(struct file *file, long arg, struct file_lock **fl)
 	dprintk("NFS: setlease(%s/%s, arg=%ld)\n",
 			file->f_path.dentry->d_parent->d_name.name,
 			file->f_path.dentry->d_name.name, arg);
-
 	return -EINVAL;
 }
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index a70e446e160..ac7b814ce16 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -54,8 +54,7 @@ static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *i
 			iput(inode);
 			return -ENOMEM;
 		}
-		/* Circumvent igrab(): we know the inode is not being freed */
-		atomic_inc(&inode->i_count);
+		ihold(inode);
 		/*
 		 * Ensure that this dentry is invisible to d_find_alias().
 		 * Otherwise, it may be spliced into the tree by
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 21a84d45916..4e2d9b6b138 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -34,6 +34,212 @@
  *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
+#ifdef CONFIG_NFS_USE_NEW_IDMAPPER
+
+#include <linux/slab.h>
+#include <linux/cred.h>
+#include <linux/nfs_idmap.h>
+#include <linux/keyctl.h>
+#include <linux/key-type.h>
+#include <linux/rcupdate.h>
+#include <linux/kernel.h>
+#include <linux/err.h>
+
+#include <keys/user-type.h>
+
+#define NFS_UINT_MAXLEN 11
+
+const struct cred *id_resolver_cache;
+
+struct key_type key_type_id_resolver = {
+	.name		= "id_resolver",
+	.instantiate	= user_instantiate,
+	.match		= user_match,
+	.revoke		= user_revoke,
+	.destroy	= user_destroy,
+	.describe	= user_describe,
+	.read		= user_read,
+};
+
+int nfs_idmap_init(void)
+{
+	struct cred *cred;
+	struct key *keyring;
+	int ret = 0;
+
+	printk(KERN_NOTICE "Registering the %s key type\n", key_type_id_resolver.name);
+
+	cred = prepare_kernel_cred(NULL);
+	if (!cred)
+		return -ENOMEM;
+
+	keyring = key_alloc(&key_type_keyring, ".id_resolver", 0, 0, cred,
+			     (KEY_POS_ALL & ~KEY_POS_SETATTR) |
+			     KEY_USR_VIEW | KEY_USR_READ,
+			     KEY_ALLOC_NOT_IN_QUOTA);
+	if (IS_ERR(keyring)) {
+		ret = PTR_ERR(keyring);
+		goto failed_put_cred;
+	}
+
+	ret = key_instantiate_and_link(keyring, NULL, 0, NULL, NULL);
+	if (ret < 0)
+		goto failed_put_key;
+
+	ret = register_key_type(&key_type_id_resolver);
+	if (ret < 0)
+		goto failed_put_key;
+
+	cred->thread_keyring = keyring;
+	cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
+	id_resolver_cache = cred;
+	return 0;
+
+failed_put_key:
+	key_put(keyring);
+failed_put_cred:
+	put_cred(cred);
+	return ret;
+}
+
+void nfs_idmap_quit(void)
+{
+	key_revoke(id_resolver_cache->thread_keyring);
+	unregister_key_type(&key_type_id_resolver);
+	put_cred(id_resolver_cache);
+}
+
+/*
+ * Assemble the description to pass to request_key()
+ * This function will allocate a new string and update dest to point
+ * at it.  The caller is responsible for freeing dest.
+ *
+ * On error 0 is returned.  Otherwise, the length of dest is returned.
+ */
+static ssize_t nfs_idmap_get_desc(const char *name, size_t namelen,
+				const char *type, size_t typelen, char **desc)
+{
+	char *cp;
+	size_t desclen = typelen + namelen + 2;
+
+	*desc = kmalloc(desclen, GFP_KERNEL);
+	if (!*desc)
+		return -ENOMEM;
+
+	cp = *desc;
+	memcpy(cp, type, typelen);
+	cp += typelen;
+	*cp++ = ':';
+
+	memcpy(cp, name, namelen);
+	cp += namelen;
+	*cp = '\0';
+	return desclen;
+}
+
+static ssize_t nfs_idmap_request_key(const char *name, size_t namelen,
+		const char *type, void *data, size_t data_size)
+{
+	const struct cred *saved_cred;
+	struct key *rkey;
+	char *desc;
+	struct user_key_payload *payload;
+	ssize_t ret;
+
+	ret = nfs_idmap_get_desc(name, namelen, type, strlen(type), &desc);
+	if (ret <= 0)
+		goto out;
+
+	saved_cred = override_creds(id_resolver_cache);
+	rkey = request_key(&key_type_id_resolver, desc, "");
+	revert_creds(saved_cred);
+	kfree(desc);
+	if (IS_ERR(rkey)) {
+		ret = PTR_ERR(rkey);
+		goto out;
+	}
+
+	rcu_read_lock();
+	rkey->perm |= KEY_USR_VIEW;
+
+	ret = key_validate(rkey);
+	if (ret < 0)
+		goto out_up;
+
+	payload = rcu_dereference(rkey->payload.data);
+	if (IS_ERR_OR_NULL(payload)) {
+		ret = PTR_ERR(payload);
+		goto out_up;
+	}
+
+	ret = payload->datalen;
+	if (ret > 0 && ret <= data_size)
+		memcpy(data, payload->data, ret);
+	else
+		ret = -EINVAL;
+
+out_up:
+	rcu_read_unlock();
+	key_put(rkey);
+out:
+	return ret;
+}
+
+
+/* ID -> Name */
+static ssize_t nfs_idmap_lookup_name(__u32 id, const char *type, char *buf, size_t buflen)
+{
+	char id_str[NFS_UINT_MAXLEN];
+	int id_len;
+	ssize_t ret;
+
+	id_len = snprintf(id_str, sizeof(id_str), "%u", id);
+	ret = nfs_idmap_request_key(id_str, id_len, type, buf, buflen);
+	if (ret < 0)
+		return -EINVAL;
+	return ret;
+}
+
+/* Name -> ID */
+static int nfs_idmap_lookup_id(const char *name, size_t namelen,
+				const char *type, __u32 *id)
+{
+	char id_str[NFS_UINT_MAXLEN];
+	long id_long;
+	ssize_t data_size;
+	int ret = 0;
+
+	data_size = nfs_idmap_request_key(name, namelen, type, id_str, NFS_UINT_MAXLEN);
+	if (data_size <= 0) {
+		ret = -EINVAL;
+	} else {
+		ret = strict_strtol(id_str, 10, &id_long);
+		*id = (__u32)id_long;
+	}
+	return ret;
+}
+
+int nfs_map_name_to_uid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *uid)
+{
+	return nfs_idmap_lookup_id(name, namelen, "uid", uid);
+}
+
+int nfs_map_group_to_gid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *gid)
+{
+	return nfs_idmap_lookup_id(name, namelen, "gid", gid);
+}
+
+int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen)
+{
+	return nfs_idmap_lookup_name(uid, "user", buf, buflen);
+}
+int nfs_map_gid_to_group(struct nfs_client *clp, __u32 gid, char *buf, size_t buflen)
+{
+	return nfs_idmap_lookup_name(gid, "group", buf, buflen);
+}
+
+#else  /* CONFIG_NFS_USE_IDMAPPER not defined */
+
 #include <linux/module.h>
 #include <linux/mutex.h>
 #include <linux/init.h>
@@ -503,16 +709,17 @@ int nfs_map_group_to_gid(struct nfs_client *clp, const char *name, size_t namele
 	return nfs_idmap_id(idmap, &idmap->idmap_group_hash, name, namelen, uid);
 }
 
-int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf)
+int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen)
 {
 	struct idmap *idmap = clp->cl_idmap;
 
 	return nfs_idmap_name(idmap, &idmap->idmap_user_hash, uid, buf);
 }
-int nfs_map_gid_to_group(struct nfs_client *clp, __u32 uid, char *buf)
+int nfs_map_gid_to_group(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen)
 {
 	struct idmap *idmap = clp->cl_idmap;
 
 	return nfs_idmap_name(idmap, &idmap->idmap_group_hash, uid, buf);
 }
 
+#endif /* CONFIG_NFS_USE_NEW_IDMAPPER */
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 099b3518fee..314f5716460 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -48,6 +48,7 @@
 #include "internal.h"
 #include "fscache.h"
 #include "dns_resolve.h"
+#include "pnfs.h"
 
 #define NFSDBG_FACILITY		NFSDBG_VFS
 
@@ -98,7 +99,7 @@ u64 nfs_compat_user_ino64(u64 fileid)
 	return ino;
 }
 
-void nfs_clear_inode(struct inode *inode)
+static void nfs_clear_inode(struct inode *inode)
 {
 	/*
 	 * The following should never happen...
@@ -110,6 +111,13 @@ void nfs_clear_inode(struct inode *inode)
 	nfs_fscache_release_inode_cookie(inode);
 }
 
+void nfs_evict_inode(struct inode *inode)
+{
+	truncate_inode_pages(&inode->i_data, 0);
+	end_writeback(inode);
+	nfs_clear_inode(inode);
+}
+
 /**
  * nfs_sync_mapping - helper to flush all mmapped dirty data to disk
  */
@@ -227,9 +235,6 @@ nfs_init_locked(struct inode *inode, void *opaque)
 	return 0;
 }
 
-/* Don't use READDIRPLUS on directories that we believe are too large */
-#define NFS_LIMIT_READDIRPLUS (8*PAGE_SIZE)
-
 /*
  * This is our front-end to iget that looks up inodes by file handle
  * instead of inode number.
@@ -284,8 +289,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
 		} else if (S_ISDIR(inode->i_mode)) {
 			inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops;
 			inode->i_fop = &nfs_dir_operations;
-			if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS)
-			    && fattr->size <= NFS_LIMIT_READDIRPLUS)
+			if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS))
 				set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
 			/* Deal with crossing mountpoints */
 			if ((fattr->valid & NFS_ATTR_FATTR_FSID)
@@ -413,10 +417,8 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
 		return 0;
 
 	/* Write all dirty data */
-	if (S_ISREG(inode->i_mode)) {
-		filemap_write_and_wait(inode->i_mapping);
+	if (S_ISREG(inode->i_mode))
 		nfs_wb_all(inode);
-	}
 
 	fattr = nfs_alloc_fattr();
 	if (fattr == NULL)
@@ -530,6 +532,68 @@ out:
 	return err;
 }
 
+static void nfs_init_lock_context(struct nfs_lock_context *l_ctx)
+{
+	atomic_set(&l_ctx->count, 1);
+	l_ctx->lockowner = current->files;
+	l_ctx->pid = current->tgid;
+	INIT_LIST_HEAD(&l_ctx->list);
+}
+
+static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context *ctx)
+{
+	struct nfs_lock_context *pos;
+
+	list_for_each_entry(pos, &ctx->lock_context.list, list) {
+		if (pos->lockowner != current->files)
+			continue;
+		if (pos->pid != current->tgid)
+			continue;
+		atomic_inc(&pos->count);
+		return pos;
+	}
+	return NULL;
+}
+
+struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx)
+{
+	struct nfs_lock_context *res, *new = NULL;
+	struct inode *inode = ctx->path.dentry->d_inode;
+
+	spin_lock(&inode->i_lock);
+	res = __nfs_find_lock_context(ctx);
+	if (res == NULL) {
+		spin_unlock(&inode->i_lock);
+		new = kmalloc(sizeof(*new), GFP_KERNEL);
+		if (new == NULL)
+			return NULL;
+		nfs_init_lock_context(new);
+		spin_lock(&inode->i_lock);
+		res = __nfs_find_lock_context(ctx);
+		if (res == NULL) {
+			list_add_tail(&new->list, &ctx->lock_context.list);
+			new->open_context = ctx;
+			res = new;
+			new = NULL;
+		}
+	}
+	spin_unlock(&inode->i_lock);
+	kfree(new);
+	return res;
+}
+
+void nfs_put_lock_context(struct nfs_lock_context *l_ctx)
+{
+	struct nfs_open_context *ctx = l_ctx->open_context;
+	struct inode *inode = ctx->path.dentry->d_inode;
+
+	if (!atomic_dec_and_lock(&l_ctx->count, &inode->i_lock))
+		return;
+	list_del(&l_ctx->list);
+	spin_unlock(&inode->i_lock);
+	kfree(l_ctx);
+}
+
 /**
  * nfs_close_context - Common close_context() routine NFSv2/v3
  * @ctx: pointer to context
@@ -556,7 +620,7 @@ void nfs_close_context(struct nfs_open_context *ctx, int is_sync)
 	nfs_revalidate_inode(server, inode);
 }
 
-static struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct rpc_cred *cred)
+struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct rpc_cred *cred, fmode_t f_mode)
 {
 	struct nfs_open_context *ctx;
 
@@ -566,11 +630,13 @@ static struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct
 		path_get(&ctx->path);
 		ctx->cred = get_rpccred(cred);
 		ctx->state = NULL;
-		ctx->lockowner = current->files;
+		ctx->mode = f_mode;
 		ctx->flags = 0;
 		ctx->error = 0;
 		ctx->dir_cookie = 0;
-		atomic_set(&ctx->count, 1);
+		nfs_init_lock_context(&ctx->lock_context);
+		ctx->lock_context.open_context = ctx;
+		INIT_LIST_HEAD(&ctx->list);
 	}
 	return ctx;
 }
@@ -578,7 +644,7 @@ static struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct
 struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx)
 {
 	if (ctx != NULL)
-		atomic_inc(&ctx->count);
+		atomic_inc(&ctx->lock_context.count);
 	return ctx;
 }
 
@@ -586,11 +652,15 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync)
 {
 	struct inode *inode = ctx->path.dentry->d_inode;
 
-	if (!atomic_dec_and_lock(&ctx->count, &inode->i_lock))
+	if (!list_empty(&ctx->list)) {
+		if (!atomic_dec_and_lock(&ctx->lock_context.count, &inode->i_lock))
+			return;
+		list_del(&ctx->list);
+		spin_unlock(&inode->i_lock);
+	} else if (!atomic_dec_and_test(&ctx->lock_context.count))
 		return;
-	list_del(&ctx->list);
-	spin_unlock(&inode->i_lock);
-	NFS_PROTO(inode)->close_context(ctx, is_sync);
+	if (inode != NULL)
+		NFS_PROTO(inode)->close_context(ctx, is_sync);
 	if (ctx->cred != NULL)
 		put_rpccred(ctx->cred);
 	path_put(&ctx->path);
@@ -606,7 +676,7 @@ void put_nfs_open_context(struct nfs_open_context *ctx)
  * Ensure that mmap has a recent RPC credential for use when writing out
  * shared pages
  */
-static void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx)
+void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx)
 {
 	struct inode *inode = filp->f_path.dentry->d_inode;
 	struct nfs_inode *nfsi = NFS_I(inode);
@@ -663,11 +733,10 @@ int nfs_open(struct inode *inode, struct file *filp)
 	cred = rpc_lookup_cred();
 	if (IS_ERR(cred))
 		return PTR_ERR(cred);
-	ctx = alloc_nfs_open_context(&filp->f_path, cred);
+	ctx = alloc_nfs_open_context(&filp->f_path, cred, filp->f_mode);
 	put_rpccred(cred);
 	if (ctx == NULL)
 		return -ENOMEM;
-	ctx->mode = filp->f_mode;
 	nfs_file_set_open_context(filp, ctx);
 	put_nfs_open_context(ctx);
 	nfs_fscache_set_inode_cookie(inode, filp);
@@ -1338,8 +1407,11 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
  * to open() calls that passed nfs_atomic_lookup, but failed to call
  * nfs_open().
  */
-void nfs4_clear_inode(struct inode *inode)
+void nfs4_evict_inode(struct inode *inode)
 {
+	truncate_inode_pages(&inode->i_data, 0);
+	end_writeback(inode);
+	pnfs_destroy_layout(NFS_I(inode));
 	/* If we are holding a delegation, return it! */
 	nfs_inode_return_delegation_noreclaim(inode);
 	/* First call standard NFS clear_inode() code */
@@ -1377,6 +1449,7 @@ static inline void nfs4_init_once(struct nfs_inode *nfsi)
 	nfsi->delegation = NULL;
 	nfsi->delegation_state = 0;
 	init_rwsem(&nfsi->rwsem);
+	nfsi->layout = NULL;
 #endif
 }
 
@@ -1424,7 +1497,7 @@ static int nfsiod_start(void)
 {
 	struct workqueue_struct *wq;
 	dprintk("RPC:       creating workqueue nfsiod\n");
-	wq = create_singlethread_workqueue("nfsiod");
+	wq = alloc_workqueue("nfsiod", WQ_RESCUER, 0);
 	if (wq == NULL)
 		return -ENOMEM;
 	nfsiod_workqueue = wq;
@@ -1452,6 +1525,10 @@ static int __init init_nfs_fs(void)
 {
 	int err;
 
+	err = nfs_idmap_init();
+	if (err < 0)
+		goto out9;
+
 	err = nfs_dns_resolver_init();
 	if (err < 0)
 		goto out8;
@@ -1516,6 +1593,8 @@ out6:
 out7:
 	nfs_dns_resolver_destroy();
 out8:
+	nfs_idmap_quit();
+out9:
 	return err;
 }
 
@@ -1528,6 +1607,7 @@ static void __exit exit_nfs_fs(void)
 	nfs_destroy_nfspagecache();
 	nfs_fscache_unregister();
 	nfs_dns_resolver_destroy();
+	nfs_idmap_quit();
 #ifdef CONFIG_PROC_FS
 	rpc_proc_unregister("nfs");
 #endif
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index e70f44b9b3f..db08ff3ff45 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -63,6 +63,12 @@ struct nfs_clone_mount {
 #define NFS_UNSPEC_PORT		(-1)
 
 /*
+ * Maximum number of pages that readdir can use for creating
+ * a vmapped array of pages.
+ */
+#define NFS_MAX_READDIR_PAGES 8
+
+/*
  * In-kernel mount arguments
  */
 struct nfs_parsed_mount_data {
@@ -181,15 +187,15 @@ extern void nfs_destroy_directcache(void);
 /* nfs2xdr.c */
 extern int nfs_stat_to_errno(int);
 extern struct rpc_procinfo nfs_procedures[];
-extern __be32 * nfs_decode_dirent(__be32 *, struct nfs_entry *, int);
+extern __be32 *nfs_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int);
 
 /* nfs3xdr.c */
 extern struct rpc_procinfo nfs3_procedures[];
-extern __be32 *nfs3_decode_dirent(__be32 *, struct nfs_entry *, int);
+extern __be32 *nfs3_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int);
 
 /* nfs4xdr.c */
 #ifdef CONFIG_NFS_V4
-extern __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus);
+extern __be32 *nfs4_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int);
 #endif
 #ifdef CONFIG_NFS_V4_1
 extern const u32 nfs41_maxread_overhead;
@@ -213,9 +219,9 @@ extern struct workqueue_struct *nfsiod_workqueue;
 extern struct inode *nfs_alloc_inode(struct super_block *sb);
 extern void nfs_destroy_inode(struct inode *);
 extern int nfs_write_inode(struct inode *, struct writeback_control *);
-extern void nfs_clear_inode(struct inode *);
+extern void nfs_evict_inode(struct inode *);
 #ifdef CONFIG_NFS_V4
-extern void nfs4_clear_inode(struct inode *);
+extern void nfs4_evict_inode(struct inode *);
 #endif
 void nfs_zap_acl_cache(struct inode *inode);
 extern int nfs_wait_bit_killable(void *word);
@@ -370,10 +376,9 @@ unsigned int nfs_page_array_len(unsigned int base, size_t len)
  * Helper for restarting RPC calls in the possible presence of NFSv4.1
  * sessions.
  */
-static inline void nfs_restart_rpc(struct rpc_task *task, const struct nfs_client *clp)
+static inline int nfs_restart_rpc(struct rpc_task *task, const struct nfs_client *clp)
 {
 	if (nfs4_has_session(clp))
-		rpc_restart_call_prepare(task);
-	else
-		rpc_restart_call(task);
+		return rpc_restart_call_prepare(task);
+	return rpc_restart_call(task);
 }
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 59047f8d7d7..eceafe74f47 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -153,6 +153,7 @@ int nfs_mount(struct nfs_mount_request *info)
 		.rpc_resp	= &result,
 	};
 	struct rpc_create_args args = {
+		.net		= &init_net,
 		.protocol	= info->protocol,
 		.address	= info->sap,
 		.addrsize	= info->salen,
@@ -224,6 +225,7 @@ void nfs_umount(const struct nfs_mount_request *info)
 		.to_retries = 2,
 	};
 	struct rpc_create_args args = {
+		.net		= &init_net,
 		.protocol	= IPPROTO_UDP,
 		.address	= info->sap,
 		.addrsize	= info->salen,
@@ -436,7 +438,7 @@ static int decode_auth_flavors(struct xdr_stream *xdr, struct mountres *res)
 
 	for (i = 0; i < entries; i++) {
 		flavors[i] = ntohl(*p++);
-		dprintk("NFS:\tflavor %u: %d\n", i, flavors[i]);
+		dprintk("NFS:   auth flavor[%u]: %d\n", i, flavors[i]);
 	}
 	*count = i;
 
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 81cf1425791..e6bf45710cc 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -233,7 +233,7 @@ nfs_xdr_removeargs(struct rpc_rqst *req, __be32 *p, const struct nfs_removeargs
 static int
 nfs_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args)
 {
-	struct rpc_auth	*auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
+	struct rpc_auth	*auth = req->rq_cred->cr_auth;
 	unsigned int replen;
 	u32 offset = (u32)args->offset;
 	u32 count = args->count;
@@ -337,10 +337,10 @@ nfs_xdr_createargs(struct rpc_rqst *req, __be32 *p, struct nfs_createargs *args)
 static int
 nfs_xdr_renameargs(struct rpc_rqst *req, __be32 *p, struct nfs_renameargs *args)
 {
-	p = xdr_encode_fhandle(p, args->fromfh);
-	p = xdr_encode_array(p, args->fromname, args->fromlen);
-	p = xdr_encode_fhandle(p, args->tofh);
-	p = xdr_encode_array(p, args->toname, args->tolen);
+	p = xdr_encode_fhandle(p, args->old_dir);
+	p = xdr_encode_array(p, args->old_name->name, args->old_name->len);
+	p = xdr_encode_fhandle(p, args->new_dir);
+	p = xdr_encode_array(p, args->new_name->name, args->new_name->len);
 	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
 	return 0;
 }
@@ -393,8 +393,7 @@ nfs_xdr_symlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs_symlinkargs *arg
 static int
 nfs_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs_readdirargs *args)
 {
-	struct rpc_task	*task = req->rq_task;
-	struct rpc_auth	*auth = task->tk_msg.rpc_cred->cr_auth;
+	struct rpc_auth	*auth = req->rq_cred->cr_auth;
 	unsigned int replen;
 	u32 count = args->count;
 
@@ -424,9 +423,7 @@ nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy)
 	struct page **page;
 	size_t hdrlen;
 	unsigned int pglen, recvd;
-	u32 len;
 	int status, nr = 0;
-	__be32 *end, *entry, *kaddr;
 
 	if ((status = ntohl(*p++)))
 		return nfs_stat_to_errno(status);
@@ -446,80 +443,59 @@ nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy)
 	if (pglen > recvd)
 		pglen = recvd;
 	page = rcvbuf->pages;
-	kaddr = p = kmap_atomic(*page, KM_USER0);
-	end = (__be32 *)((char *)p + pglen);
-	entry = p;
-
-	/* Make sure the packet actually has a value_follows and EOF entry */
-	if ((entry + 1) > end)
-		goto short_pkt;
-
-	for (; *p++; nr++) {
-		if (p + 2 > end)
-			goto short_pkt;
-		p++; /* fileid */
-		len = ntohl(*p++);
-		p += XDR_QUADLEN(len) + 1;	/* name plus cookie */
-		if (len > NFS2_MAXNAMLEN) {
-			dprintk("NFS: giant filename in readdir (len 0x%x)!\n",
-						len);
-			goto err_unmap;
-		}
-		if (p + 2 > end)
-			goto short_pkt;
-		entry = p;
-	}
-
-	/*
-	 * Apparently some server sends responses that are a valid size, but
-	 * contain no entries, and have value_follows==0 and EOF==0. For
-	 * those, just set the EOF marker.
-	 */
-	if (!nr && entry[1] == 0) {
-		dprintk("NFS: readdir reply truncated!\n");
-		entry[1] = 1;
-	}
- out:
-	kunmap_atomic(kaddr, KM_USER0);
 	return nr;
- short_pkt:
-	/*
-	 * When we get a short packet there are 2 possibilities. We can
-	 * return an error, or fix up the response to look like a valid
-	 * response and return what we have so far. If there are no
-	 * entries and the packet was short, then return -EIO. If there
-	 * are valid entries in the response, return them and pretend that
-	 * the call was successful, but incomplete. The caller can retry the
-	 * readdir starting at the last cookie.
-	 */
-	entry[0] = entry[1] = 0;
-	if (!nr)
-		nr = -errno_NFSERR_IO;
-	goto out;
-err_unmap:
-	nr = -errno_NFSERR_IO;
-	goto out;
+}
+
+static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
+{
+	dprintk("nfs: %s: prematurely hit end of receive buffer. "
+		"Remaining buffer length is %tu words.\n",
+		func, xdr->end - xdr->p);
 }
 
 __be32 *
-nfs_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus)
+nfs_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_server *server, int plus)
 {
-	if (!*p++) {
-		if (!*p)
+	__be32 *p;
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
+	if (!ntohl(*p++)) {
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
+		if (!ntohl(*p++))
 			return ERR_PTR(-EAGAIN);
 		entry->eof = 1;
 		return ERR_PTR(-EBADCOOKIE);
 	}
 
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(!p))
+		goto out_overflow;
+
 	entry->ino	  = ntohl(*p++);
 	entry->len	  = ntohl(*p++);
+
+	p = xdr_inline_decode(xdr, entry->len + 4);
+	if (unlikely(!p))
+		goto out_overflow;
 	entry->name	  = (const char *) p;
 	p		 += XDR_QUADLEN(entry->len);
 	entry->prev_cookie	  = entry->cookie;
 	entry->cookie	  = ntohl(*p++);
-	entry->eof	  = !p[0] && p[1];
+
+	p = xdr_inline_peek(xdr, 8);
+	if (p != NULL)
+		entry->eof = !p[0] && p[1];
+	else
+		entry->eof = 0;
 
 	return p;
+
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return ERR_PTR(-EIO);
 }
 
 /*
@@ -575,7 +551,7 @@ nfs_xdr_diropres(struct rpc_rqst *req, __be32 *p, struct nfs_diropok *res)
 static int
 nfs_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs_readlinkargs *args)
 {
-	struct rpc_auth	*auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
+	struct rpc_auth	*auth = req->rq_cred->cr_auth;
 	unsigned int replen;
 
 	p = xdr_encode_fhandle(p, args->fh);
@@ -597,7 +573,6 @@ nfs_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, void *dummy)
 	struct kvec *iov = rcvbuf->head;
 	size_t hdrlen;
 	u32 len, recvd;
-	char	*kaddr;
 	int	status;
 
 	if ((status = ntohl(*p++)))
@@ -624,10 +599,7 @@ nfs_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, void *dummy)
 		return -EIO;
 	}
 
-	/* NULL terminate the string we got */
-	kaddr = (char *)kmap_atomic(rcvbuf->pages[0], KM_USER0);
-	kaddr[len+rcvbuf->page_base] = '\0';
-	kunmap_atomic(kaddr, KM_USER0);
+	xdr_terminate_string(rcvbuf, len);
 	return 0;
 }
 
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index fabb4f2849a..ce939c062a5 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -313,7 +313,7 @@ static void nfs3_free_createdata(struct nfs3_createdata *data)
  */
 static int
 nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
-		 int flags, struct nameidata *nd)
+		 int flags, struct nfs_open_context *ctx)
 {
 	struct nfs3_createdata *data;
 	mode_t mode = sattr->ia_mode;
@@ -438,19 +438,38 @@ nfs3_proc_unlink_done(struct rpc_task *task, struct inode *dir)
 	return 1;
 }
 
+static void
+nfs3_proc_rename_setup(struct rpc_message *msg, struct inode *dir)
+{
+	msg->rpc_proc = &nfs3_procedures[NFS3PROC_RENAME];
+}
+
+static int
+nfs3_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
+		      struct inode *new_dir)
+{
+	struct nfs_renameres *res;
+
+	if (nfs3_async_handle_jukebox(task, old_dir))
+		return 0;
+	res = task->tk_msg.rpc_resp;
+
+	nfs_post_op_update_inode(old_dir, res->old_fattr);
+	nfs_post_op_update_inode(new_dir, res->new_fattr);
+	return 1;
+}
+
 static int
 nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name,
 		 struct inode *new_dir, struct qstr *new_name)
 {
-	struct nfs3_renameargs	arg = {
-		.fromfh		= NFS_FH(old_dir),
-		.fromname	= old_name->name,
-		.fromlen	= old_name->len,
-		.tofh		= NFS_FH(new_dir),
-		.toname		= new_name->name,
-		.tolen		= new_name->len
+	struct nfs_renameargs	arg = {
+		.old_dir	= NFS_FH(old_dir),
+		.old_name	= old_name,
+		.new_dir	= NFS_FH(new_dir),
+		.new_name	= new_name,
 	};
-	struct nfs3_renameres res;
+	struct nfs_renameres res;
 	struct rpc_message msg = {
 		.rpc_proc	= &nfs3_procedures[NFS3PROC_RENAME],
 		.rpc_argp	= &arg,
@@ -460,17 +479,17 @@ nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name,
 
 	dprintk("NFS call  rename %s -> %s\n", old_name->name, new_name->name);
 
-	res.fromattr = nfs_alloc_fattr();
-	res.toattr = nfs_alloc_fattr();
-	if (res.fromattr == NULL || res.toattr == NULL)
+	res.old_fattr = nfs_alloc_fattr();
+	res.new_fattr = nfs_alloc_fattr();
+	if (res.old_fattr == NULL || res.new_fattr == NULL)
 		goto out;
 
 	status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0);
-	nfs_post_op_update_inode(old_dir, res.fromattr);
-	nfs_post_op_update_inode(new_dir, res.toattr);
+	nfs_post_op_update_inode(old_dir, res.old_fattr);
+	nfs_post_op_update_inode(new_dir, res.new_fattr);
 out:
-	nfs_free_fattr(res.toattr);
-	nfs_free_fattr(res.fromattr);
+	nfs_free_fattr(res.old_fattr);
+	nfs_free_fattr(res.new_fattr);
 	dprintk("NFS reply rename: %d\n", status);
 	return status;
 }
@@ -611,7 +630,7 @@ out:
  */
 static int
 nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
-		  u64 cookie, struct page *page, unsigned int count, int plus)
+		  u64 cookie, struct page **pages, unsigned int count, int plus)
 {
 	struct inode		*dir = dentry->d_inode;
 	__be32			*verf = NFS_COOKIEVERF(dir);
@@ -621,7 +640,7 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
 		.verf		= {verf[0], verf[1]},
 		.plus		= plus,
 		.count		= count,
-		.pages		= &page
+		.pages		= pages
 	};
 	struct nfs3_readdirres	res = {
 		.verf		= verf,
@@ -652,7 +671,8 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
 
 	nfs_free_fattr(res.dir_attr);
 out:
-	dprintk("NFS reply readdir: %d\n", status);
+	dprintk("NFS reply readdir%s: %d\n",
+			plus? "plus" : "", status);
 	return status;
 }
 
@@ -722,7 +742,7 @@ nfs3_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle,
 	dprintk("NFS call  fsstat\n");
 	nfs_fattr_init(stat->fattr);
 	status = rpc_call_sync(server->client, &msg, 0);
-	dprintk("NFS reply statfs: %d\n", status);
+	dprintk("NFS reply fsstat: %d\n", status);
 	return status;
 }
 
@@ -844,6 +864,8 @@ const struct nfs_rpc_ops nfs_v3_clientops = {
 	.unlink_setup	= nfs3_proc_unlink_setup,
 	.unlink_done	= nfs3_proc_unlink_done,
 	.rename		= nfs3_proc_rename,
+	.rename_setup	= nfs3_proc_rename_setup,
+	.rename_done	= nfs3_proc_rename_done,
 	.link		= nfs3_proc_link,
 	.symlink	= nfs3_proc_symlink,
 	.mkdir		= nfs3_proc_mkdir,
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 75dcfc7da36..d9a5e832c25 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -100,6 +100,13 @@ static const umode_t nfs_type2fmt[] = {
 	[NF3FIFO] = S_IFIFO,
 };
 
+static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
+{
+	dprintk("nfs: %s: prematurely hit end of receive buffer. "
+		"Remaining buffer length is %tu words.\n",
+		func, xdr->end - xdr->p);
+}
+
 /*
  * Common NFS XDR functions as inlines
  */
@@ -119,6 +126,29 @@ xdr_decode_fhandle(__be32 *p, struct nfs_fh *fh)
 	return NULL;
 }
 
+static inline __be32 *
+xdr_decode_fhandle_stream(struct xdr_stream *xdr, struct nfs_fh *fh)
+{
+	__be32 *p;
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
+	fh->size = ntohl(*p++);
+
+	if (fh->size <= NFS3_FHSIZE) {
+		p = xdr_inline_decode(xdr, fh->size);
+		if (unlikely(!p))
+			goto out_overflow;
+		memcpy(fh->data, p, fh->size);
+		return p + XDR_QUADLEN(fh->size);
+	}
+	return NULL;
+
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return ERR_PTR(-EIO);
+}
+
 /*
  * Encode/decode time.
  */
@@ -241,6 +271,26 @@ xdr_decode_post_op_attr(__be32 *p, struct nfs_fattr *fattr)
 }
 
 static inline __be32 *
+xdr_decode_post_op_attr_stream(struct xdr_stream *xdr, struct nfs_fattr *fattr)
+{
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
+	if (ntohl(*p++)) {
+		p = xdr_inline_decode(xdr, 84);
+		if (unlikely(!p))
+			goto out_overflow;
+		p = xdr_decode_fattr(p, fattr);
+	}
+	return p;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return ERR_PTR(-EIO);
+}
+
+static inline __be32 *
 xdr_decode_pre_op_attr(__be32 *p, struct nfs_fattr *fattr)
 {
 	if (*p++)
@@ -330,7 +380,7 @@ nfs3_xdr_accessargs(struct rpc_rqst *req, __be32 *p, struct nfs3_accessargs *arg
 static int
 nfs3_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args)
 {
-	struct rpc_auth	*auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
+	struct rpc_auth	*auth = req->rq_cred->cr_auth;
 	unsigned int replen;
 	u32 count = args->count;
 
@@ -442,12 +492,12 @@ nfs3_xdr_mknodargs(struct rpc_rqst *req, __be32 *p, struct nfs3_mknodargs *args)
  * Encode RENAME arguments
  */
 static int
-nfs3_xdr_renameargs(struct rpc_rqst *req, __be32 *p, struct nfs3_renameargs *args)
+nfs3_xdr_renameargs(struct rpc_rqst *req, __be32 *p, struct nfs_renameargs *args)
 {
-	p = xdr_encode_fhandle(p, args->fromfh);
-	p = xdr_encode_array(p, args->fromname, args->fromlen);
-	p = xdr_encode_fhandle(p, args->tofh);
-	p = xdr_encode_array(p, args->toname, args->tolen);
+	p = xdr_encode_fhandle(p, args->old_dir);
+	p = xdr_encode_array(p, args->old_name->name, args->old_name->len);
+	p = xdr_encode_fhandle(p, args->new_dir);
+	p = xdr_encode_array(p, args->new_name->name, args->new_name->len);
 	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
 	return 0;
 }
@@ -471,7 +521,7 @@ nfs3_xdr_linkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_linkargs *args)
 static int
 nfs3_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirargs *args)
 {
-	struct rpc_auth	*auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
+	struct rpc_auth	*auth = req->rq_cred->cr_auth;
 	unsigned int replen;
 	u32 count = args->count;
 
@@ -504,9 +554,8 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res
 	struct kvec *iov = rcvbuf->head;
 	struct page **page;
 	size_t hdrlen;
-	u32 len, recvd, pglen;
+	u32 recvd, pglen;
 	int status, nr = 0;
-	__be32 *entry, *end, *kaddr;
 
 	status = ntohl(*p++);
 	/* Decode post_op_attrs */
@@ -536,99 +585,38 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res
 	if (pglen > recvd)
 		pglen = recvd;
 	page = rcvbuf->pages;
-	kaddr = p = kmap_atomic(*page, KM_USER0);
-	end = (__be32 *)((char *)p + pglen);
-	entry = p;
-
-	/* Make sure the packet actually has a value_follows and EOF entry */
-	if ((entry + 1) > end)
-		goto short_pkt;
-
-	for (; *p++; nr++) {
-		if (p + 3 > end)
-			goto short_pkt;
-		p += 2;				/* inode # */
-		len = ntohl(*p++);		/* string length */
-		p += XDR_QUADLEN(len) + 2;	/* name + cookie */
-		if (len > NFS3_MAXNAMLEN) {
-			dprintk("NFS: giant filename in readdir (len 0x%x)!\n",
-						len);
-			goto err_unmap;
-		}
 
-		if (res->plus) {
-			/* post_op_attr */
-			if (p + 2 > end)
-				goto short_pkt;
-			if (*p++) {
-				p += 21;
-				if (p + 1 > end)
-					goto short_pkt;
-			}
-			/* post_op_fh3 */
-			if (*p++) {
-				if (p + 1 > end)
-					goto short_pkt;
-				len = ntohl(*p++);
-				if (len > NFS3_FHSIZE) {
-					dprintk("NFS: giant filehandle in "
-						"readdir (len 0x%x)!\n", len);
-					goto err_unmap;
-				}
-				p += XDR_QUADLEN(len);
-			}
-		}
-
-		if (p + 2 > end)
-			goto short_pkt;
-		entry = p;
-	}
-
-	/*
-	 * Apparently some server sends responses that are a valid size, but
-	 * contain no entries, and have value_follows==0 and EOF==0. For
-	 * those, just set the EOF marker.
-	 */
-	if (!nr && entry[1] == 0) {
-		dprintk("NFS: readdir reply truncated!\n");
-		entry[1] = 1;
-	}
- out:
-	kunmap_atomic(kaddr, KM_USER0);
 	return nr;
- short_pkt:
-	/*
-	 * When we get a short packet there are 2 possibilities. We can
-	 * return an error, or fix up the response to look like a valid
-	 * response and return what we have so far. If there are no
-	 * entries and the packet was short, then return -EIO. If there
-	 * are valid entries in the response, return them and pretend that
-	 * the call was successful, but incomplete. The caller can retry the
-	 * readdir starting at the last cookie.
-	 */
-	entry[0] = entry[1] = 0;
-	if (!nr)
-		nr = -errno_NFSERR_IO;
-	goto out;
-err_unmap:
-	nr = -errno_NFSERR_IO;
-	goto out;
 }
 
 __be32 *
-nfs3_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus)
+nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_server *server, int plus)
 {
+	__be32 *p;
 	struct nfs_entry old = *entry;
 
-	if (!*p++) {
-		if (!*p)
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
+	if (!ntohl(*p++)) {
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
+		if (!ntohl(*p++))
 			return ERR_PTR(-EAGAIN);
 		entry->eof = 1;
 		return ERR_PTR(-EBADCOOKIE);
 	}
 
+	p = xdr_inline_decode(xdr, 12);
+	if (unlikely(!p))
+		goto out_overflow;
 	p = xdr_decode_hyper(p, &entry->ino);
 	entry->len  = ntohl(*p++);
+
+	p = xdr_inline_decode(xdr, entry->len + 8);
+	if (unlikely(!p))
+		goto out_overflow;
 	entry->name = (const char *) p;
 	p += XDR_QUADLEN(entry->len);
 	entry->prev_cookie = entry->cookie;
@@ -636,10 +624,17 @@ nfs3_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus)
 
 	if (plus) {
 		entry->fattr->valid = 0;
-		p = xdr_decode_post_op_attr(p, entry->fattr);
+		p = xdr_decode_post_op_attr_stream(xdr, entry->fattr);
+		if (IS_ERR(p))
+			goto out_overflow_exit;
 		/* In fact, a post_op_fh3: */
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
 		if (*p++) {
-			p = xdr_decode_fhandle(p, entry->fh);
+			p = xdr_decode_fhandle_stream(xdr, entry->fh);
+			if (IS_ERR(p))
+				goto out_overflow_exit;
 			/* Ugh -- server reply was truncated */
 			if (p == NULL) {
 				dprintk("NFS: FH truncated\n");
@@ -650,8 +645,18 @@ nfs3_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus)
 			memset((u8*)(entry->fh), 0, sizeof(*entry->fh));
 	}
 
-	entry->eof = !p[0] && p[1];
+	p = xdr_inline_peek(xdr, 8);
+	if (p != NULL)
+		entry->eof = !p[0] && p[1];
+	else
+		entry->eof = 0;
+
 	return p;
+
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+out_overflow_exit:
+	return ERR_PTR(-EIO);
 }
 
 /*
@@ -675,7 +680,7 @@ static int
 nfs3_xdr_getaclargs(struct rpc_rqst *req, __be32 *p,
 		    struct nfs3_getaclargs *args)
 {
-	struct rpc_auth	*auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
+	struct rpc_auth	*auth = req->rq_cred->cr_auth;
 	unsigned int replen;
 
 	p = xdr_encode_fhandle(p, args->fh);
@@ -802,7 +807,7 @@ nfs3_xdr_accessres(struct rpc_rqst *req, __be32 *p, struct nfs3_accessres *res)
 static int
 nfs3_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readlinkargs *args)
 {
-	struct rpc_auth	*auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
+	struct rpc_auth	*auth = req->rq_cred->cr_auth;
 	unsigned int replen;
 
 	p = xdr_encode_fhandle(p, args->fh);
@@ -824,7 +829,6 @@ nfs3_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
 	struct kvec *iov = rcvbuf->head;
 	size_t hdrlen;
 	u32 len, recvd;
-	char	*kaddr;
 	int	status;
 
 	status = ntohl(*p++);
@@ -857,10 +861,7 @@ nfs3_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
 		return -EIO;
 	}
 
-	/* NULL terminate the string we got */
-	kaddr = (char*)kmap_atomic(rcvbuf->pages[0], KM_USER0);
-	kaddr[len+rcvbuf->page_base] = '\0';
-	kunmap_atomic(kaddr, KM_USER0);
+	xdr_terminate_string(rcvbuf, len);
 	return 0;
 }
 
@@ -970,14 +971,14 @@ nfs3_xdr_createres(struct rpc_rqst *req, __be32 *p, struct nfs3_diropres *res)
  * Decode RENAME reply
  */
 static int
-nfs3_xdr_renameres(struct rpc_rqst *req, __be32 *p, struct nfs3_renameres *res)
+nfs3_xdr_renameres(struct rpc_rqst *req, __be32 *p, struct nfs_renameres *res)
 {
 	int	status;
 
 	if ((status = ntohl(*p++)) != 0)
 		status = nfs_stat_to_errno(status);
-	p = xdr_decode_wcc_data(p, res->fromattr);
-	p = xdr_decode_wcc_data(p, res->toattr);
+	p = xdr_decode_wcc_data(p, res->old_fattr);
+	p = xdr_decode_wcc_data(p, res->new_fattr);
 	return status;
 }
 
@@ -1043,8 +1044,9 @@ nfs3_xdr_fsinfores(struct rpc_rqst *req, __be32 *p, struct nfs_fsinfo *res)
 	res->wtmult = ntohl(*p++);
 	res->dtpref = ntohl(*p++);
 	p = xdr_decode_hyper(p, &res->maxfilesize);
+	p = xdr_decode_time3(p, &res->time_delta);
 
-	/* ignore time_delta and properties */
+	/* ignore properties */
 	res->lease_time = 0;
 	return 0;
 }
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index c538c6106e1..9fa496387fd 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -45,10 +45,29 @@ enum nfs4_client_state {
 	NFS4CLNT_RECLAIM_NOGRACE,
 	NFS4CLNT_DELEGRETURN,
 	NFS4CLNT_SESSION_RESET,
-	NFS4CLNT_SESSION_DRAINING,
 	NFS4CLNT_RECALL_SLOT,
 };
 
+enum nfs4_session_state {
+	NFS4_SESSION_INITING,
+	NFS4_SESSION_DRAINING,
+};
+
+struct nfs4_minor_version_ops {
+	u32	minor_version;
+
+	int	(*call_sync)(struct nfs_server *server,
+			struct rpc_message *msg,
+			struct nfs4_sequence_args *args,
+			struct nfs4_sequence_res *res,
+			int cache_reply);
+	int	(*validate_stateid)(struct nfs_delegation *,
+			const nfs4_stateid *);
+	const struct nfs4_state_recovery_ops *reboot_recovery_ops;
+	const struct nfs4_state_recovery_ops *nograce_recovery_ops;
+	const struct nfs4_state_maintenance_ops *state_renewal_ops;
+};
+
 /*
  * struct rpc_sequence ensures that RPC calls are sent in the exact
  * order that they appear on the list.
@@ -89,7 +108,6 @@ struct nfs_unique_id {
  */
 struct nfs4_state_owner {
 	struct nfs_unique_id so_owner_id;
-	struct nfs_client    *so_client;
 	struct nfs_server    *so_server;
 	struct rb_node	     so_client_node;
 
@@ -99,7 +117,6 @@ struct nfs4_state_owner {
 	atomic_t	     so_count;
 	unsigned long	     so_flags;
 	struct list_head     so_states;
-	struct list_head     so_delegations;
 	struct nfs_seqid_counter so_seqid;
 	struct rpc_sequence  so_sequence;
 };
@@ -125,10 +142,20 @@ enum {
  * LOCK: one nfs4_state (LOCK) to hold the lock stateid nfs4_state(OPEN)
  */
 
+struct nfs4_lock_owner {
+	unsigned int lo_type;
+#define NFS4_ANY_LOCK_TYPE	(0U)
+#define NFS4_FLOCK_LOCK_TYPE	(1U << 0)
+#define NFS4_POSIX_LOCK_TYPE	(1U << 1)
+	union {
+		fl_owner_t posix_owner;
+		pid_t flock_owner;
+	} lo_u;
+};
+
 struct nfs4_lock_state {
 	struct list_head	ls_locks;	/* Other lock stateids */
 	struct nfs4_state *	ls_state;	/* Pointer to open state */
-	fl_owner_t		ls_owner;	/* POSIX lock owner */
 #define NFS_LOCK_INITIALIZED 1
 	int			ls_flags;
 	struct nfs_seqid_counter	ls_seqid;
@@ -136,6 +163,7 @@ struct nfs4_lock_state {
 	struct nfs_unique_id	ls_id;
 	nfs4_stateid		ls_stateid;
 	atomic_t		ls_count;
+	struct nfs4_lock_owner	ls_owner;
 };
 
 /* bits for nfs4_state->flags */
@@ -214,16 +242,18 @@ extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *);
 extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *);
 extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *);
 extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait);
-extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *);
-extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *);
 extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle);
 extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
 		struct nfs4_fs_locations *fs_locations, struct page *page);
+extern void nfs4_release_lockowner(const struct nfs4_lock_state *);
 
-extern struct nfs4_state_recovery_ops *nfs4_reboot_recovery_ops[];
-extern struct nfs4_state_recovery_ops *nfs4_nograce_recovery_ops[];
 #if defined(CONFIG_NFS_V4_1)
-extern int nfs4_setup_sequence(struct nfs_client *clp,
+static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server)
+{
+	return server->nfs_client->cl_session;
+}
+
+extern int nfs4_setup_sequence(const struct nfs_server *server,
 		struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
 		int cache_reply, struct rpc_task *task);
 extern void nfs4_destroy_session(struct nfs4_session *session);
@@ -234,7 +264,12 @@ extern int nfs4_init_session(struct nfs_server *server);
 extern int nfs4_proc_get_lease_time(struct nfs_client *clp,
 		struct nfs_fsinfo *fsinfo);
 #else /* CONFIG_NFS_v4_1 */
-static inline int nfs4_setup_sequence(struct nfs_client *clp,
+static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server)
+{
+	return NULL;
+}
+
+static inline int nfs4_setup_sequence(const struct nfs_server *server,
 		struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
 		int cache_reply, struct rpc_task *task)
 {
@@ -247,7 +282,7 @@ static inline int nfs4_init_session(struct nfs_server *server)
 }
 #endif /* CONFIG_NFS_V4_1 */
 
-extern struct nfs4_state_maintenance_ops *nfs4_state_renewal_ops[];
+extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[];
 
 extern const u32 nfs4_fattr_bitmap[2];
 extern const u32 nfs4_statfs_bitmap[2];
@@ -284,7 +319,7 @@ extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags)
 extern void nfs41_handle_recall_slot(struct nfs_client *clp);
 extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
 extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl);
-extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t);
+extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t, pid_t);
 
 extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask);
 extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task);
@@ -296,7 +331,7 @@ extern void nfs_free_seqid(struct nfs_seqid *seqid);
 extern const nfs4_stateid zero_stateid;
 
 /* nfs4xdr.c */
-extern __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus);
+extern __be32 *nfs4_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int);
 extern struct rpc_procinfo nfs4_procedures[];
 
 struct nfs4_mount_data;
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
new file mode 100644
index 00000000000..2e92f0d8d65
--- /dev/null
+++ b/fs/nfs/nfs4filelayout.c
@@ -0,0 +1,280 @@
+/*
+ *  Module for the pnfs nfs4 file layout driver.
+ *  Defines all I/O and Policy interface operations, plus code
+ *  to register itself with the pNFS client.
+ *
+ *  Copyright (c) 2002
+ *  The Regents of the University of Michigan
+ *  All Rights Reserved
+ *
+ *  Dean Hildebrand <dhildebz@umich.edu>
+ *
+ *  Permission is granted to use, copy, create derivative works, and
+ *  redistribute this software and such derivative works for any purpose,
+ *  so long as the name of the University of Michigan is not used in
+ *  any advertising or publicity pertaining to the use or distribution
+ *  of this software without specific, written prior authorization. If
+ *  the above copyright notice or any other identification of the
+ *  University of Michigan is included in any copy of any portion of
+ *  this software, then the disclaimer below must also be included.
+ *
+ *  This software is provided as is, without representation or warranty
+ *  of any kind either express or implied, including without limitation
+ *  the implied warranties of merchantability, fitness for a particular
+ *  purpose, or noninfringement.  The Regents of the University of
+ *  Michigan shall not be liable for any damages, including special,
+ *  indirect, incidental, or consequential damages, with respect to any
+ *  claim arising out of or in connection with the use of the software,
+ *  even if it has been or is hereafter advised of the possibility of
+ *  such damages.
+ */
+
+#include <linux/nfs_fs.h>
+
+#include "internal.h"
+#include "nfs4filelayout.h"
+
+#define NFSDBG_FACILITY         NFSDBG_PNFS_LD
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Dean Hildebrand <dhildebz@umich.edu>");
+MODULE_DESCRIPTION("The NFSv4 file layout driver");
+
+static int
+filelayout_set_layoutdriver(struct nfs_server *nfss)
+{
+	int status = pnfs_alloc_init_deviceid_cache(nfss->nfs_client,
+						nfs4_fl_free_deviceid_callback);
+	if (status) {
+		printk(KERN_WARNING "%s: deviceid cache could not be "
+			"initialized\n", __func__);
+		return status;
+	}
+	dprintk("%s: deviceid cache has been initialized successfully\n",
+		__func__);
+	return 0;
+}
+
+/* Clear out the layout by destroying its device list */
+static int
+filelayout_clear_layoutdriver(struct nfs_server *nfss)
+{
+	dprintk("--> %s\n", __func__);
+
+	if (nfss->nfs_client->cl_devid_cache)
+		pnfs_put_deviceid_cache(nfss->nfs_client);
+	return 0;
+}
+
+/*
+ * filelayout_check_layout()
+ *
+ * Make sure layout segment parameters are sane WRT the device.
+ * At this point no generic layer initialization of the lseg has occurred,
+ * and nothing has been added to the layout_hdr cache.
+ *
+ */
+static int
+filelayout_check_layout(struct pnfs_layout_hdr *lo,
+			struct nfs4_filelayout_segment *fl,
+			struct nfs4_layoutget_res *lgr,
+			struct nfs4_deviceid *id)
+{
+	struct nfs4_file_layout_dsaddr *dsaddr;
+	int status = -EINVAL;
+	struct nfs_server *nfss = NFS_SERVER(lo->inode);
+
+	dprintk("--> %s\n", __func__);
+
+	if (fl->pattern_offset > lgr->range.offset) {
+		dprintk("%s pattern_offset %lld to large\n",
+				__func__, fl->pattern_offset);
+		goto out;
+	}
+
+	if (fl->stripe_unit % PAGE_SIZE) {
+		dprintk("%s Stripe unit (%u) not page aligned\n",
+			__func__, fl->stripe_unit);
+		goto out;
+	}
+
+	/* find and reference the deviceid */
+	dsaddr = nfs4_fl_find_get_deviceid(nfss->nfs_client, id);
+	if (dsaddr == NULL) {
+		dsaddr = get_device_info(lo->inode, id);
+		if (dsaddr == NULL)
+			goto out;
+	}
+	fl->dsaddr = dsaddr;
+
+	if (fl->first_stripe_index < 0 ||
+	    fl->first_stripe_index >= dsaddr->stripe_count) {
+		dprintk("%s Bad first_stripe_index %d\n",
+				__func__, fl->first_stripe_index);
+		goto out_put;
+	}
+
+	if ((fl->stripe_type == STRIPE_SPARSE &&
+	    fl->num_fh > 1 && fl->num_fh != dsaddr->ds_num) ||
+	    (fl->stripe_type == STRIPE_DENSE &&
+	    fl->num_fh != dsaddr->stripe_count)) {
+		dprintk("%s num_fh %u not valid for given packing\n",
+			__func__, fl->num_fh);
+		goto out_put;
+	}
+
+	if (fl->stripe_unit % nfss->rsize || fl->stripe_unit % nfss->wsize) {
+		dprintk("%s Stripe unit (%u) not aligned with rsize %u "
+			"wsize %u\n", __func__, fl->stripe_unit, nfss->rsize,
+			nfss->wsize);
+	}
+
+	status = 0;
+out:
+	dprintk("--> %s returns %d\n", __func__, status);
+	return status;
+out_put:
+	pnfs_put_deviceid(nfss->nfs_client->cl_devid_cache, &dsaddr->deviceid);
+	goto out;
+}
+
+static void filelayout_free_fh_array(struct nfs4_filelayout_segment *fl)
+{
+	int i;
+
+	for (i = 0; i < fl->num_fh; i++) {
+		if (!fl->fh_array[i])
+			break;
+		kfree(fl->fh_array[i]);
+	}
+	kfree(fl->fh_array);
+	fl->fh_array = NULL;
+}
+
+static void
+_filelayout_free_lseg(struct nfs4_filelayout_segment *fl)
+{
+	filelayout_free_fh_array(fl);
+	kfree(fl);
+}
+
+static int
+filelayout_decode_layout(struct pnfs_layout_hdr *flo,
+			 struct nfs4_filelayout_segment *fl,
+			 struct nfs4_layoutget_res *lgr,
+			 struct nfs4_deviceid *id)
+{
+	uint32_t *p = (uint32_t *)lgr->layout.buf;
+	uint32_t nfl_util;
+	int i;
+
+	dprintk("%s: set_layout_map Begin\n", __func__);
+
+	memcpy(id, p, sizeof(*id));
+	p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
+	print_deviceid(id);
+
+	nfl_util = be32_to_cpup(p++);
+	if (nfl_util & NFL4_UFLG_COMMIT_THRU_MDS)
+		fl->commit_through_mds = 1;
+	if (nfl_util & NFL4_UFLG_DENSE)
+		fl->stripe_type = STRIPE_DENSE;
+	else
+		fl->stripe_type = STRIPE_SPARSE;
+	fl->stripe_unit = nfl_util & ~NFL4_UFLG_MASK;
+
+	fl->first_stripe_index = be32_to_cpup(p++);
+	p = xdr_decode_hyper(p, &fl->pattern_offset);
+	fl->num_fh = be32_to_cpup(p++);
+
+	dprintk("%s: nfl_util 0x%X num_fh %u fsi %u po %llu\n",
+		__func__, nfl_util, fl->num_fh, fl->first_stripe_index,
+		fl->pattern_offset);
+
+	fl->fh_array = kzalloc(fl->num_fh * sizeof(struct nfs_fh *),
+			       GFP_KERNEL);
+	if (!fl->fh_array)
+		return -ENOMEM;
+
+	for (i = 0; i < fl->num_fh; i++) {
+		/* Do we want to use a mempool here? */
+		fl->fh_array[i] = kmalloc(sizeof(struct nfs_fh), GFP_KERNEL);
+		if (!fl->fh_array[i]) {
+			filelayout_free_fh_array(fl);
+			return -ENOMEM;
+		}
+		fl->fh_array[i]->size = be32_to_cpup(p++);
+		if (sizeof(struct nfs_fh) < fl->fh_array[i]->size) {
+			printk(KERN_ERR "Too big fh %d received %d\n",
+			       i, fl->fh_array[i]->size);
+			filelayout_free_fh_array(fl);
+			return -EIO;
+		}
+		memcpy(fl->fh_array[i]->data, p, fl->fh_array[i]->size);
+		p += XDR_QUADLEN(fl->fh_array[i]->size);
+		dprintk("DEBUG: %s: fh len %d\n", __func__,
+			fl->fh_array[i]->size);
+	}
+
+	return 0;
+}
+
+static struct pnfs_layout_segment *
+filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
+		      struct nfs4_layoutget_res *lgr)
+{
+	struct nfs4_filelayout_segment *fl;
+	int rc;
+	struct nfs4_deviceid id;
+
+	dprintk("--> %s\n", __func__);
+	fl = kzalloc(sizeof(*fl), GFP_KERNEL);
+	if (!fl)
+		return NULL;
+
+	rc = filelayout_decode_layout(layoutid, fl, lgr, &id);
+	if (rc != 0 || filelayout_check_layout(layoutid, fl, lgr, &id)) {
+		_filelayout_free_lseg(fl);
+		return NULL;
+	}
+	return &fl->generic_hdr;
+}
+
+static void
+filelayout_free_lseg(struct pnfs_layout_segment *lseg)
+{
+	struct nfs_server *nfss = NFS_SERVER(lseg->layout->inode);
+	struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
+
+	dprintk("--> %s\n", __func__);
+	pnfs_put_deviceid(nfss->nfs_client->cl_devid_cache,
+			  &fl->dsaddr->deviceid);
+	_filelayout_free_lseg(fl);
+}
+
+static struct pnfs_layoutdriver_type filelayout_type = {
+	.id = LAYOUT_NFSV4_1_FILES,
+	.name = "LAYOUT_NFSV4_1_FILES",
+	.owner = THIS_MODULE,
+	.set_layoutdriver = filelayout_set_layoutdriver,
+	.clear_layoutdriver = filelayout_clear_layoutdriver,
+	.alloc_lseg              = filelayout_alloc_lseg,
+	.free_lseg               = filelayout_free_lseg,
+};
+
+static int __init nfs4filelayout_init(void)
+{
+	printk(KERN_INFO "%s: NFSv4 File Layout Driver Registering...\n",
+	       __func__);
+	return pnfs_register_layoutdriver(&filelayout_type);
+}
+
+static void __exit nfs4filelayout_exit(void)
+{
+	printk(KERN_INFO "%s: NFSv4 File Layout Driver Unregistering...\n",
+	       __func__);
+	pnfs_unregister_layoutdriver(&filelayout_type);
+}
+
+module_init(nfs4filelayout_init);
+module_exit(nfs4filelayout_exit);
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
new file mode 100644
index 00000000000..bbf60dd2ab9
--- /dev/null
+++ b/fs/nfs/nfs4filelayout.h
@@ -0,0 +1,94 @@
+/*
+ *  NFSv4 file layout driver data structures.
+ *
+ *  Copyright (c) 2002
+ *  The Regents of the University of Michigan
+ *  All Rights Reserved
+ *
+ *  Dean Hildebrand <dhildebz@umich.edu>
+ *
+ *  Permission is granted to use, copy, create derivative works, and
+ *  redistribute this software and such derivative works for any purpose,
+ *  so long as the name of the University of Michigan is not used in
+ *  any advertising or publicity pertaining to the use or distribution
+ *  of this software without specific, written prior authorization. If
+ *  the above copyright notice or any other identification of the
+ *  University of Michigan is included in any copy of any portion of
+ *  this software, then the disclaimer below must also be included.
+ *
+ *  This software is provided as is, without representation or warranty
+ *  of any kind either express or implied, including without limitation
+ *  the implied warranties of merchantability, fitness for a particular
+ *  purpose, or noninfringement.  The Regents of the University of
+ *  Michigan shall not be liable for any damages, including special,
+ *  indirect, incidental, or consequential damages, with respect to any
+ *  claim arising out of or in connection with the use of the software,
+ *  even if it has been or is hereafter advised of the possibility of
+ *  such damages.
+ */
+
+#ifndef FS_NFS_NFS4FILELAYOUT_H
+#define FS_NFS_NFS4FILELAYOUT_H
+
+#include "pnfs.h"
+
+/*
+ * Field testing shows we need to support upto 4096 stripe indices.
+ * We store each index as a u8 (u32 on the wire) to keep the memory footprint
+ * reasonable. This in turn means we support a maximum of 256
+ * RFC 5661 multipath_list4 structures.
+ */
+#define NFS4_PNFS_MAX_STRIPE_CNT 4096
+#define NFS4_PNFS_MAX_MULTI_CNT  256 /* 256 fit into a u8 stripe_index */
+
+enum stripetype4 {
+	STRIPE_SPARSE = 1,
+	STRIPE_DENSE = 2
+};
+
+/* Individual ip address */
+struct nfs4_pnfs_ds {
+	struct list_head	ds_node;  /* nfs4_pnfs_dev_hlist dev_dslist */
+	u32			ds_ip_addr;
+	u32			ds_port;
+	struct nfs_client	*ds_clp;
+	atomic_t		ds_count;
+};
+
+struct nfs4_file_layout_dsaddr {
+	struct pnfs_deviceid_node	deviceid;
+	u32				stripe_count;
+	u8				*stripe_indices;
+	u32				ds_num;
+	struct nfs4_pnfs_ds		*ds_list[1];
+};
+
+struct nfs4_filelayout_segment {
+	struct pnfs_layout_segment generic_hdr;
+	u32 stripe_type;
+	u32 commit_through_mds;
+	u32 stripe_unit;
+	u32 first_stripe_index;
+	u64 pattern_offset;
+	struct nfs4_file_layout_dsaddr *dsaddr; /* Point to GETDEVINFO data */
+	unsigned int num_fh;
+	struct nfs_fh **fh_array;
+};
+
+static inline struct nfs4_filelayout_segment *
+FILELAYOUT_LSEG(struct pnfs_layout_segment *lseg)
+{
+	return container_of(lseg,
+			    struct nfs4_filelayout_segment,
+			    generic_hdr);
+}
+
+extern void nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *);
+extern void print_ds(struct nfs4_pnfs_ds *ds);
+extern void print_deviceid(struct nfs4_deviceid *dev_id);
+extern struct nfs4_file_layout_dsaddr *
+nfs4_fl_find_get_deviceid(struct nfs_client *, struct nfs4_deviceid *dev_id);
+struct nfs4_file_layout_dsaddr *
+get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id);
+
+#endif /* FS_NFS_NFS4FILELAYOUT_H */
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
new file mode 100644
index 00000000000..51fe64ace55
--- /dev/null
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -0,0 +1,448 @@
+/*
+ *  Device operations for the pnfs nfs4 file layout driver.
+ *
+ *  Copyright (c) 2002
+ *  The Regents of the University of Michigan
+ *  All Rights Reserved
+ *
+ *  Dean Hildebrand <dhildebz@umich.edu>
+ *  Garth Goodson   <Garth.Goodson@netapp.com>
+ *
+ *  Permission is granted to use, copy, create derivative works, and
+ *  redistribute this software and such derivative works for any purpose,
+ *  so long as the name of the University of Michigan is not used in
+ *  any advertising or publicity pertaining to the use or distribution
+ *  of this software without specific, written prior authorization. If
+ *  the above copyright notice or any other identification of the
+ *  University of Michigan is included in any copy of any portion of
+ *  this software, then the disclaimer below must also be included.
+ *
+ *  This software is provided as is, without representation or warranty
+ *  of any kind either express or implied, including without limitation
+ *  the implied warranties of merchantability, fitness for a particular
+ *  purpose, or noninfringement.  The Regents of the University of
+ *  Michigan shall not be liable for any damages, including special,
+ *  indirect, incidental, or consequential damages, with respect to any
+ *  claim arising out of or in connection with the use of the software,
+ *  even if it has been or is hereafter advised of the possibility of
+ *  such damages.
+ */
+
+#include <linux/nfs_fs.h>
+#include <linux/vmalloc.h>
+
+#include "internal.h"
+#include "nfs4filelayout.h"
+
+#define NFSDBG_FACILITY		NFSDBG_PNFS_LD
+
+/*
+ * Data server cache
+ *
+ * Data servers can be mapped to different device ids.
+ * nfs4_pnfs_ds reference counting
+ *   - set to 1 on allocation
+ *   - incremented when a device id maps a data server already in the cache.
+ *   - decremented when deviceid is removed from the cache.
+ */
+DEFINE_SPINLOCK(nfs4_ds_cache_lock);
+static LIST_HEAD(nfs4_data_server_cache);
+
+/* Debug routines */
+void
+print_ds(struct nfs4_pnfs_ds *ds)
+{
+	if (ds == NULL) {
+		printk("%s NULL device\n", __func__);
+		return;
+	}
+	printk("        ip_addr %x port %hu\n"
+		"        ref count %d\n"
+		"        client %p\n"
+		"        cl_exchange_flags %x\n",
+		ntohl(ds->ds_ip_addr), ntohs(ds->ds_port),
+		atomic_read(&ds->ds_count), ds->ds_clp,
+		ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0);
+}
+
+void
+print_ds_list(struct nfs4_file_layout_dsaddr *dsaddr)
+{
+	int i;
+
+	ifdebug(FACILITY) {
+		printk("%s dsaddr->ds_num %d\n", __func__,
+		       dsaddr->ds_num);
+		for (i = 0; i < dsaddr->ds_num; i++)
+			print_ds(dsaddr->ds_list[i]);
+	}
+}
+
+void print_deviceid(struct nfs4_deviceid *id)
+{
+	u32 *p = (u32 *)id;
+
+	dprintk("%s: device id= [%x%x%x%x]\n", __func__,
+		p[0], p[1], p[2], p[3]);
+}
+
+/* nfs4_ds_cache_lock is held */
+static struct nfs4_pnfs_ds *
+_data_server_lookup_locked(u32 ip_addr, u32 port)
+{
+	struct nfs4_pnfs_ds *ds;
+
+	dprintk("_data_server_lookup: ip_addr=%x port=%hu\n",
+			ntohl(ip_addr), ntohs(port));
+
+	list_for_each_entry(ds, &nfs4_data_server_cache, ds_node) {
+		if (ds->ds_ip_addr == ip_addr &&
+		    ds->ds_port == port) {
+			return ds;
+		}
+	}
+	return NULL;
+}
+
+static void
+destroy_ds(struct nfs4_pnfs_ds *ds)
+{
+	dprintk("--> %s\n", __func__);
+	ifdebug(FACILITY)
+		print_ds(ds);
+
+	if (ds->ds_clp)
+		nfs_put_client(ds->ds_clp);
+	kfree(ds);
+}
+
+static void
+nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
+{
+	struct nfs4_pnfs_ds *ds;
+	int i;
+
+	print_deviceid(&dsaddr->deviceid.de_id);
+
+	for (i = 0; i < dsaddr->ds_num; i++) {
+		ds = dsaddr->ds_list[i];
+		if (ds != NULL) {
+			if (atomic_dec_and_lock(&ds->ds_count,
+						&nfs4_ds_cache_lock)) {
+				list_del_init(&ds->ds_node);
+				spin_unlock(&nfs4_ds_cache_lock);
+				destroy_ds(ds);
+			}
+		}
+	}
+	kfree(dsaddr->stripe_indices);
+	kfree(dsaddr);
+}
+
+void
+nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *device)
+{
+	struct nfs4_file_layout_dsaddr *dsaddr =
+		container_of(device, struct nfs4_file_layout_dsaddr, deviceid);
+
+	nfs4_fl_free_deviceid(dsaddr);
+}
+
+static struct nfs4_pnfs_ds *
+nfs4_pnfs_ds_add(struct inode *inode, u32 ip_addr, u32 port)
+{
+	struct nfs4_pnfs_ds *tmp_ds, *ds;
+
+	ds = kzalloc(sizeof(*tmp_ds), GFP_KERNEL);
+	if (!ds)
+		goto out;
+
+	spin_lock(&nfs4_ds_cache_lock);
+	tmp_ds = _data_server_lookup_locked(ip_addr, port);
+	if (tmp_ds == NULL) {
+		ds->ds_ip_addr = ip_addr;
+		ds->ds_port = port;
+		atomic_set(&ds->ds_count, 1);
+		INIT_LIST_HEAD(&ds->ds_node);
+		ds->ds_clp = NULL;
+		list_add(&ds->ds_node, &nfs4_data_server_cache);
+		dprintk("%s add new data server ip 0x%x\n", __func__,
+			ds->ds_ip_addr);
+	} else {
+		kfree(ds);
+		atomic_inc(&tmp_ds->ds_count);
+		dprintk("%s data server found ip 0x%x, inc'ed ds_count to %d\n",
+			__func__, tmp_ds->ds_ip_addr,
+			atomic_read(&tmp_ds->ds_count));
+		ds = tmp_ds;
+	}
+	spin_unlock(&nfs4_ds_cache_lock);
+out:
+	return ds;
+}
+
+/*
+ * Currently only support ipv4, and one multi-path address.
+ */
+static struct nfs4_pnfs_ds *
+decode_and_add_ds(__be32 **pp, struct inode *inode)
+{
+	struct nfs4_pnfs_ds *ds = NULL;
+	char *buf;
+	const char *ipend, *pstr;
+	u32 ip_addr, port;
+	int nlen, rlen, i;
+	int tmp[2];
+	__be32 *r_netid, *r_addr, *p = *pp;
+
+	/* r_netid */
+	nlen = be32_to_cpup(p++);
+	r_netid = p;
+	p += XDR_QUADLEN(nlen);
+
+	/* r_addr */
+	rlen = be32_to_cpup(p++);
+	r_addr = p;
+	p += XDR_QUADLEN(rlen);
+	*pp = p;
+
+	/* Check that netid is "tcp" */
+	if (nlen != 3 ||  memcmp((char *)r_netid, "tcp", 3)) {
+		dprintk("%s: ERROR: non ipv4 TCP r_netid\n", __func__);
+		goto out_err;
+	}
+
+	/* ipv6 length plus port is legal */
+	if (rlen > INET6_ADDRSTRLEN + 8) {
+		dprintk("%s Invalid address, length %d\n", __func__,
+			rlen);
+		goto out_err;
+	}
+	buf = kmalloc(rlen + 1, GFP_KERNEL);
+	buf[rlen] = '\0';
+	memcpy(buf, r_addr, rlen);
+
+	/* replace the port dots with dashes for the in4_pton() delimiter*/
+	for (i = 0; i < 2; i++) {
+		char *res = strrchr(buf, '.');
+		*res = '-';
+	}
+
+	/* Currently only support ipv4 address */
+	if (in4_pton(buf, rlen, (u8 *)&ip_addr, '-', &ipend) == 0) {
+		dprintk("%s: Only ipv4 addresses supported\n", __func__);
+		goto out_free;
+	}
+
+	/* port */
+	pstr = ipend;
+	sscanf(pstr, "-%d-%d", &tmp[0], &tmp[1]);
+	port = htons((tmp[0] << 8) | (tmp[1]));
+
+	ds = nfs4_pnfs_ds_add(inode, ip_addr, port);
+	dprintk("%s Decoded address and port %s\n", __func__, buf);
+out_free:
+	kfree(buf);
+out_err:
+	return ds;
+}
+
+/* Decode opaque device data and return the result */
+static struct nfs4_file_layout_dsaddr*
+decode_device(struct inode *ino, struct pnfs_device *pdev)
+{
+	int i, dummy;
+	u32 cnt, num;
+	u8 *indexp;
+	__be32 *p = (__be32 *)pdev->area, *indicesp;
+	struct nfs4_file_layout_dsaddr *dsaddr;
+
+	/* Get the stripe count (number of stripe index) */
+	cnt = be32_to_cpup(p++);
+	dprintk("%s stripe count  %d\n", __func__, cnt);
+	if (cnt > NFS4_PNFS_MAX_STRIPE_CNT) {
+		printk(KERN_WARNING "%s: stripe count %d greater than "
+		       "supported maximum %d\n", __func__,
+			cnt, NFS4_PNFS_MAX_STRIPE_CNT);
+		goto out_err;
+	}
+
+	/* Check the multipath list count */
+	indicesp = p;
+	p += XDR_QUADLEN(cnt << 2);
+	num = be32_to_cpup(p++);
+	dprintk("%s ds_num %u\n", __func__, num);
+	if (num > NFS4_PNFS_MAX_MULTI_CNT) {
+		printk(KERN_WARNING "%s: multipath count %d greater than "
+			"supported maximum %d\n", __func__,
+			num, NFS4_PNFS_MAX_MULTI_CNT);
+		goto out_err;
+	}
+	dsaddr = kzalloc(sizeof(*dsaddr) +
+			(sizeof(struct nfs4_pnfs_ds *) * (num - 1)),
+			GFP_KERNEL);
+	if (!dsaddr)
+		goto out_err;
+
+	dsaddr->stripe_indices = kzalloc(sizeof(u8) * cnt, GFP_KERNEL);
+	if (!dsaddr->stripe_indices)
+		goto out_err_free;
+
+	dsaddr->stripe_count = cnt;
+	dsaddr->ds_num = num;
+
+	memcpy(&dsaddr->deviceid.de_id, &pdev->dev_id, sizeof(pdev->dev_id));
+
+	/* Go back an read stripe indices */
+	p = indicesp;
+	indexp = &dsaddr->stripe_indices[0];
+	for (i = 0; i < dsaddr->stripe_count; i++) {
+		*indexp = be32_to_cpup(p++);
+		if (*indexp >= num)
+			goto out_err_free;
+		indexp++;
+	}
+	/* Skip already read multipath list count */
+	p++;
+
+	for (i = 0; i < dsaddr->ds_num; i++) {
+		int j;
+
+		dummy = be32_to_cpup(p++); /* multipath count */
+		if (dummy > 1) {
+			printk(KERN_WARNING
+			       "%s: Multipath count %d not supported, "
+			       "skipping all greater than 1\n", __func__,
+				dummy);
+		}
+		for (j = 0; j < dummy; j++) {
+			if (j == 0) {
+				dsaddr->ds_list[i] = decode_and_add_ds(&p, ino);
+				if (dsaddr->ds_list[i] == NULL)
+					goto out_err_free;
+			} else {
+				u32 len;
+				/* skip extra multipath */
+				len = be32_to_cpup(p++);
+				p += XDR_QUADLEN(len);
+				len = be32_to_cpup(p++);
+				p += XDR_QUADLEN(len);
+				continue;
+			}
+		}
+	}
+	return dsaddr;
+
+out_err_free:
+	nfs4_fl_free_deviceid(dsaddr);
+out_err:
+	dprintk("%s ERROR: returning NULL\n", __func__);
+	return NULL;
+}
+
+/*
+ * Decode the opaque device specified in 'dev'
+ * and add it to the list of available devices.
+ * If the deviceid is already cached, nfs4_add_deviceid will return
+ * a pointer to the cached struct and throw away the new.
+ */
+static struct nfs4_file_layout_dsaddr*
+decode_and_add_device(struct inode *inode, struct pnfs_device *dev)
+{
+	struct nfs4_file_layout_dsaddr *dsaddr;
+	struct pnfs_deviceid_node *d;
+
+	dsaddr = decode_device(inode, dev);
+	if (!dsaddr) {
+		printk(KERN_WARNING "%s: Could not decode or add device\n",
+			__func__);
+		return NULL;
+	}
+
+	d = pnfs_add_deviceid(NFS_SERVER(inode)->nfs_client->cl_devid_cache,
+			      &dsaddr->deviceid);
+
+	return container_of(d, struct nfs4_file_layout_dsaddr, deviceid);
+}
+
+/*
+ * Retrieve the information for dev_id, add it to the list
+ * of available devices, and return it.
+ */
+struct nfs4_file_layout_dsaddr *
+get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id)
+{
+	struct pnfs_device *pdev = NULL;
+	u32 max_resp_sz;
+	int max_pages;
+	struct page **pages = NULL;
+	struct nfs4_file_layout_dsaddr *dsaddr = NULL;
+	int rc, i;
+	struct nfs_server *server = NFS_SERVER(inode);
+
+	/*
+	 * Use the session max response size as the basis for setting
+	 * GETDEVICEINFO's maxcount
+	 */
+	max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
+	max_pages = max_resp_sz >> PAGE_SHIFT;
+	dprintk("%s inode %p max_resp_sz %u max_pages %d\n",
+		__func__, inode, max_resp_sz, max_pages);
+
+	pdev = kzalloc(sizeof(struct pnfs_device), GFP_KERNEL);
+	if (pdev == NULL)
+		return NULL;
+
+	pages = kzalloc(max_pages * sizeof(struct page *), GFP_KERNEL);
+	if (pages == NULL) {
+		kfree(pdev);
+		return NULL;
+	}
+	for (i = 0; i < max_pages; i++) {
+		pages[i] = alloc_page(GFP_KERNEL);
+		if (!pages[i])
+			goto out_free;
+	}
+
+	/* set pdev->area */
+	pdev->area = vmap(pages, max_pages, VM_MAP, PAGE_KERNEL);
+	if (!pdev->area)
+		goto out_free;
+
+	memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id));
+	pdev->layout_type = LAYOUT_NFSV4_1_FILES;
+	pdev->pages = pages;
+	pdev->pgbase = 0;
+	pdev->pglen = PAGE_SIZE * max_pages;
+	pdev->mincount = 0;
+
+	rc = nfs4_proc_getdeviceinfo(server, pdev);
+	dprintk("%s getdevice info returns %d\n", __func__, rc);
+	if (rc)
+		goto out_free;
+
+	/*
+	 * Found new device, need to decode it and then add it to the
+	 * list of known devices for this mountpoint.
+	 */
+	dsaddr = decode_and_add_device(inode, pdev);
+out_free:
+	if (pdev->area != NULL)
+		vunmap(pdev->area);
+	for (i = 0; i < max_pages; i++)
+		__free_page(pages[i]);
+	kfree(pages);
+	kfree(pdev);
+	dprintk("<-- %s dsaddr %p\n", __func__, dsaddr);
+	return dsaddr;
+}
+
+struct nfs4_file_layout_dsaddr *
+nfs4_fl_find_get_deviceid(struct nfs_client *clp, struct nfs4_deviceid *id)
+{
+	struct pnfs_deviceid_node *d;
+
+	d = pnfs_find_get_deviceid(clp->cl_devid_cache, id);
+	return (d == NULL) ? NULL :
+		container_of(d, struct nfs4_file_layout_dsaddr, deviceid);
+}
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 70015dd60a9..0f24cdf2cb1 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -55,6 +55,7 @@
 #include "internal.h"
 #include "iostat.h"
 #include "callback.h"
+#include "pnfs.h"
 
 #define NFSDBG_FACILITY		NFSDBG_PROC
 
@@ -129,7 +130,8 @@ const u32 nfs4_fsinfo_bitmap[2] = { FATTR4_WORD0_MAXFILESIZE
 			| FATTR4_WORD0_MAXREAD
 			| FATTR4_WORD0_MAXWRITE
 			| FATTR4_WORD0_LEASE_TIME,
-			0
+			FATTR4_WORD1_TIME_DELTA
+			| FATTR4_WORD1_FS_LAYOUT_TYPES
 };
 
 const u32 nfs4_fs_locations_bitmap[2] = {
@@ -255,9 +257,6 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode,
 			nfs4_state_mark_reclaim_nograce(clp, state);
 			goto do_state_recovery;
 		case -NFS4ERR_STALE_STATEID:
-			if (state == NULL)
-				break;
-			nfs4_state_mark_reclaim_reboot(clp, state);
 		case -NFS4ERR_STALE_CLIENTID:
 		case -NFS4ERR_EXPIRED:
 			goto do_state_recovery;
@@ -303,15 +302,19 @@ do_state_recovery:
 }
 
 
-static void renew_lease(const struct nfs_server *server, unsigned long timestamp)
+static void do_renew_lease(struct nfs_client *clp, unsigned long timestamp)
 {
-	struct nfs_client *clp = server->nfs_client;
 	spin_lock(&clp->cl_lock);
 	if (time_before(clp->cl_last_renewal,timestamp))
 		clp->cl_last_renewal = timestamp;
 	spin_unlock(&clp->cl_lock);
 }
 
+static void renew_lease(const struct nfs_server *server, unsigned long timestamp)
+{
+	do_renew_lease(server->nfs_client, timestamp);
+}
+
 #if defined(CONFIG_NFS_V4_1)
 
 /*
@@ -330,10 +333,12 @@ static void renew_lease(const struct nfs_server *server, unsigned long timestamp
  * Must be called while holding tbl->slot_tbl_lock
  */
 static void
-nfs4_free_slot(struct nfs4_slot_table *tbl, u8 free_slotid)
+nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *free_slot)
 {
+	int free_slotid = free_slot - tbl->slots;
 	int slotid = free_slotid;
 
+	BUG_ON(slotid < 0 || slotid >= NFS4_MAX_SLOT_TABLE);
 	/* clear used bit in bitmap */
 	__clear_bit(slotid, tbl->used_slots);
 
@@ -356,7 +361,7 @@ static void nfs41_check_drain_session_complete(struct nfs4_session *ses)
 {
 	struct rpc_task *task;
 
-	if (!test_bit(NFS4CLNT_SESSION_DRAINING, &ses->clp->cl_state)) {
+	if (!test_bit(NFS4_SESSION_DRAINING, &ses->session_state)) {
 		task = rpc_wake_up_next(&ses->fc_slot_table.slot_tbl_waitq);
 		if (task)
 			rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);
@@ -370,13 +375,12 @@ static void nfs41_check_drain_session_complete(struct nfs4_session *ses)
 	complete(&ses->complete);
 }
 
-static void nfs41_sequence_free_slot(const struct nfs_client *clp,
-			      struct nfs4_sequence_res *res)
+static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res)
 {
 	struct nfs4_slot_table *tbl;
 
-	tbl = &clp->cl_session->fc_slot_table;
-	if (res->sr_slotid == NFS4_MAX_SLOT_TABLE) {
+	tbl = &res->sr_session->fc_slot_table;
+	if (!res->sr_slot) {
 		/* just wake up the next guy waiting since
 		 * we may have not consumed a slot after all */
 		dprintk("%s: No slot\n", __func__);
@@ -384,19 +388,16 @@ static void nfs41_sequence_free_slot(const struct nfs_client *clp,
 	}
 
 	spin_lock(&tbl->slot_tbl_lock);
-	nfs4_free_slot(tbl, res->sr_slotid);
-	nfs41_check_drain_session_complete(clp->cl_session);
+	nfs4_free_slot(tbl, res->sr_slot);
+	nfs41_check_drain_session_complete(res->sr_session);
 	spin_unlock(&tbl->slot_tbl_lock);
-	res->sr_slotid = NFS4_MAX_SLOT_TABLE;
+	res->sr_slot = NULL;
 }
 
-static void nfs41_sequence_done(struct nfs_client *clp,
-				struct nfs4_sequence_res *res,
-				int rpc_status)
+static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res)
 {
 	unsigned long timestamp;
-	struct nfs4_slot_table *tbl;
-	struct nfs4_slot *slot;
+	struct nfs_client *clp;
 
 	/*
 	 * sr_status remains 1 if an RPC level error occurred. The server
@@ -408,28 +409,53 @@ static void nfs41_sequence_done(struct nfs_client *clp,
 		res->sr_status = NFS_OK;
 
 	/* -ERESTARTSYS can result in skipping nfs41_sequence_setup */
-	if (res->sr_slotid == NFS4_MAX_SLOT_TABLE)
+	if (!res->sr_slot)
 		goto out;
 
 	/* Check the SEQUENCE operation status */
-	if (res->sr_status == 0) {
-		tbl = &clp->cl_session->fc_slot_table;
-		slot = tbl->slots + res->sr_slotid;
+	switch (res->sr_status) {
+	case 0:
 		/* Update the slot's sequence and clientid lease timer */
-		++slot->seq_nr;
+		++res->sr_slot->seq_nr;
 		timestamp = res->sr_renewal_time;
-		spin_lock(&clp->cl_lock);
-		if (time_before(clp->cl_last_renewal, timestamp))
-			clp->cl_last_renewal = timestamp;
-		spin_unlock(&clp->cl_lock);
+		clp = res->sr_session->clp;
+		do_renew_lease(clp, timestamp);
 		/* Check sequence flags */
 		if (atomic_read(&clp->cl_count) > 1)
 			nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags);
+		break;
+	case -NFS4ERR_DELAY:
+		/* The server detected a resend of the RPC call and
+		 * returned NFS4ERR_DELAY as per Section 2.10.6.2
+		 * of RFC5661.
+		 */
+		dprintk("%s: slot=%td seq=%d: Operation in progress\n",
+			__func__,
+			res->sr_slot - res->sr_session->fc_slot_table.slots,
+			res->sr_slot->seq_nr);
+		goto out_retry;
+	default:
+		/* Just update the slot sequence no. */
+		++res->sr_slot->seq_nr;
 	}
 out:
 	/* The session may be reset by one of the error handlers. */
 	dprintk("%s: Error %d free the slot \n", __func__, res->sr_status);
-	nfs41_sequence_free_slot(clp, res);
+	nfs41_sequence_free_slot(res);
+	return 1;
+out_retry:
+	if (!rpc_restart_call(task))
+		goto out;
+	rpc_delay(task, NFS4_POLL_RETRY_MAX);
+	return 0;
+}
+
+static int nfs4_sequence_done(struct rpc_task *task,
+			       struct nfs4_sequence_res *res)
+{
+	if (res->sr_session == NULL)
+		return 1;
+	return nfs41_sequence_done(task, res);
 }
 
 /*
@@ -477,15 +503,13 @@ static int nfs41_setup_sequence(struct nfs4_session *session,
 
 	dprintk("--> %s\n", __func__);
 	/* slot already allocated? */
-	if (res->sr_slotid != NFS4_MAX_SLOT_TABLE)
+	if (res->sr_slot != NULL)
 		return 0;
 
-	memset(res, 0, sizeof(*res));
-	res->sr_slotid = NFS4_MAX_SLOT_TABLE;
 	tbl = &session->fc_slot_table;
 
 	spin_lock(&tbl->slot_tbl_lock);
-	if (test_bit(NFS4CLNT_SESSION_DRAINING, &session->clp->cl_state) &&
+	if (test_bit(NFS4_SESSION_DRAINING, &session->session_state) &&
 	    !rpc_task_has_priority(task, RPC_PRIORITY_PRIVILEGED)) {
 		/*
 		 * The state manager will wait until the slot table is empty.
@@ -523,8 +547,9 @@ static int nfs41_setup_sequence(struct nfs4_session *session,
 	dprintk("<-- %s slotid=%d seqid=%d\n", __func__, slotid, slot->seq_nr);
 
 	res->sr_session = session;
-	res->sr_slotid = slotid;
+	res->sr_slot = slot;
 	res->sr_renewal_time = jiffies;
+	res->sr_status_flags = 0;
 	/*
 	 * sr_status is only set in decode_sequence, and so will remain
 	 * set to 1 if an rpc level failure occurs.
@@ -533,33 +558,34 @@ static int nfs41_setup_sequence(struct nfs4_session *session,
 	return 0;
 }
 
-int nfs4_setup_sequence(struct nfs_client *clp,
+int nfs4_setup_sequence(const struct nfs_server *server,
 			struct nfs4_sequence_args *args,
 			struct nfs4_sequence_res *res,
 			int cache_reply,
 			struct rpc_task *task)
 {
+	struct nfs4_session *session = nfs4_get_session(server);
 	int ret = 0;
 
-	dprintk("--> %s clp %p session %p sr_slotid %d\n",
-		__func__, clp, clp->cl_session, res->sr_slotid);
-
-	if (!nfs4_has_session(clp))
+	if (session == NULL) {
+		args->sa_session = NULL;
+		res->sr_session = NULL;
 		goto out;
-	ret = nfs41_setup_sequence(clp->cl_session, args, res, cache_reply,
-				   task);
-	if (ret && ret != -EAGAIN) {
-		/* terminate rpc task */
-		task->tk_status = ret;
-		task->tk_action = NULL;
 	}
+
+	dprintk("--> %s clp %p session %p sr_slot %td\n",
+		__func__, session->clp, session, res->sr_slot ?
+			res->sr_slot - session->fc_slot_table.slots : -1);
+
+	ret = nfs41_setup_sequence(session, args, res, cache_reply,
+				   task);
 out:
 	dprintk("<-- %s status=%d\n", __func__, ret);
 	return ret;
 }
 
 struct nfs41_call_sync_data {
-	struct nfs_client *clp;
+	const struct nfs_server *seq_server;
 	struct nfs4_sequence_args *seq_args;
 	struct nfs4_sequence_res *seq_res;
 	int cache_reply;
@@ -569,9 +595,9 @@ static void nfs41_call_sync_prepare(struct rpc_task *task, void *calldata)
 {
 	struct nfs41_call_sync_data *data = calldata;
 
-	dprintk("--> %s data->clp->cl_session %p\n", __func__,
-		data->clp->cl_session);
-	if (nfs4_setup_sequence(data->clp, data->seq_args,
+	dprintk("--> %s data->seq_server %p\n", __func__, data->seq_server);
+
+	if (nfs4_setup_sequence(data->seq_server, data->seq_args,
 				data->seq_res, data->cache_reply, task))
 		return;
 	rpc_call_start(task);
@@ -587,7 +613,7 @@ static void nfs41_call_sync_done(struct rpc_task *task, void *calldata)
 {
 	struct nfs41_call_sync_data *data = calldata;
 
-	nfs41_sequence_done(data->clp, data->seq_res, task->tk_status);
+	nfs41_sequence_done(task, data->seq_res);
 }
 
 struct rpc_call_ops nfs41_call_sync_ops = {
@@ -600,8 +626,7 @@ struct rpc_call_ops nfs41_call_priv_sync_ops = {
 	.rpc_call_done = nfs41_call_sync_done,
 };
 
-static int nfs4_call_sync_sequence(struct nfs_client *clp,
-				   struct rpc_clnt *clnt,
+static int nfs4_call_sync_sequence(struct nfs_server *server,
 				   struct rpc_message *msg,
 				   struct nfs4_sequence_args *args,
 				   struct nfs4_sequence_res *res,
@@ -611,19 +636,19 @@ static int nfs4_call_sync_sequence(struct nfs_client *clp,
 	int ret;
 	struct rpc_task *task;
 	struct nfs41_call_sync_data data = {
-		.clp = clp,
+		.seq_server = server,
 		.seq_args = args,
 		.seq_res = res,
 		.cache_reply = cache_reply,
 	};
 	struct rpc_task_setup task_setup = {
-		.rpc_client = clnt,
+		.rpc_client = server->client,
 		.rpc_message = msg,
 		.callback_ops = &nfs41_call_sync_ops,
 		.callback_data = &data
 	};
 
-	res->sr_slotid = NFS4_MAX_SLOT_TABLE;
+	res->sr_slot = NULL;
 	if (privileged)
 		task_setup.callback_ops = &nfs41_call_priv_sync_ops;
 	task = rpc_run_task(&task_setup);
@@ -642,10 +667,15 @@ int _nfs4_call_sync_session(struct nfs_server *server,
 			    struct nfs4_sequence_res *res,
 			    int cache_reply)
 {
-	return nfs4_call_sync_sequence(server->nfs_client, server->client,
-				       msg, args, res, cache_reply, 0);
+	return nfs4_call_sync_sequence(server, msg, args, res, cache_reply, 0);
 }
 
+#else
+static int nfs4_sequence_done(struct rpc_task *task,
+			       struct nfs4_sequence_res *res)
+{
+	return 1;
+}
 #endif /* CONFIG_NFS_V4_1 */
 
 int _nfs4_call_sync(struct nfs_server *server,
@@ -659,18 +689,9 @@ int _nfs4_call_sync(struct nfs_server *server,
 }
 
 #define nfs4_call_sync(server, msg, args, res, cache_reply) \
-	(server)->nfs_client->cl_call_sync((server), (msg), &(args)->seq_args, \
+	(server)->nfs_client->cl_mvops->call_sync((server), (msg), &(args)->seq_args, \
 			&(res)->seq_res, (cache_reply))
 
-static void nfs4_sequence_done(const struct nfs_server *server,
-			       struct nfs4_sequence_res *res, int rpc_status)
-{
-#ifdef CONFIG_NFS_V4_1
-	if (nfs4_has_session(server->nfs_client))
-		nfs41_sequence_done(server->nfs_client, res, rpc_status);
-#endif /* CONFIG_NFS_V4_1 */
-}
-
 static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo)
 {
 	struct nfs_inode *nfsi = NFS_I(dir);
@@ -712,7 +733,6 @@ static void nfs4_init_opendata_res(struct nfs4_opendata *p)
 	p->o_res.server = p->o_arg.server;
 	nfs_fattr_init(&p->f_attr);
 	nfs_fattr_init(&p->dir_attr);
-	p->o_res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
 }
 
 static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path,
@@ -745,19 +765,14 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path,
 	p->o_arg.server = server;
 	p->o_arg.bitmask = server->attr_bitmask;
 	p->o_arg.claim = NFS4_OPEN_CLAIM_NULL;
-	if (flags & O_EXCL) {
-		if (nfs4_has_persistent_session(server->nfs_client)) {
-			/* GUARDED */
-			p->o_arg.u.attrs = &p->attrs;
-			memcpy(&p->attrs, attrs, sizeof(p->attrs));
-		} else { /* EXCLUSIVE4_1 */
-			u32 *s = (u32 *) p->o_arg.u.verifier.data;
-			s[0] = jiffies;
-			s[1] = current->pid;
-		}
-	} else if (flags & O_CREAT) {
+	if (flags & O_CREAT) {
+		u32 *s;
+
 		p->o_arg.u.attrs = &p->attrs;
 		memcpy(&p->attrs, attrs, sizeof(p->attrs));
+		s = (u32 *) p->o_arg.u.verifier.data;
+		s[0] = jiffies;
+		s[1] = current->pid;
 	}
 	p->c_arg.fh = &p->o_res.fh;
 	p->c_arg.stateid = &p->o_res.stateid;
@@ -1102,6 +1117,7 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *
 	clear_bit(NFS_DELEGATED_STATE, &state->flags);
 	smp_rmb();
 	if (state->n_rdwr != 0) {
+		clear_bit(NFS_O_RDWR_STATE, &state->flags);
 		ret = nfs4_open_recover_helper(opendata, FMODE_READ|FMODE_WRITE, &newstate);
 		if (ret != 0)
 			return ret;
@@ -1109,6 +1125,7 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *
 			return -ESTALE;
 	}
 	if (state->n_wronly != 0) {
+		clear_bit(NFS_O_WRONLY_STATE, &state->flags);
 		ret = nfs4_open_recover_helper(opendata, FMODE_WRITE, &newstate);
 		if (ret != 0)
 			return ret;
@@ -1116,6 +1133,7 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *
 			return -ESTALE;
 	}
 	if (state->n_rdonly != 0) {
+		clear_bit(NFS_O_RDONLY_STATE, &state->flags);
 		ret = nfs4_open_recover_helper(opendata, FMODE_READ, &newstate);
 		if (ret != 0)
 			return ret;
@@ -1170,7 +1188,7 @@ static int nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state
 	int err;
 	do {
 		err = _nfs4_do_open_reclaim(ctx, state);
-		if (err != -NFS4ERR_DELAY && err != -EKEYEXPIRED)
+		if (err != -NFS4ERR_DELAY)
 			break;
 		nfs4_handle_exception(server, err, &exception);
 	} while (exception.retry);
@@ -1240,6 +1258,13 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state
 			case -NFS4ERR_ADMIN_REVOKED:
 			case -NFS4ERR_BAD_STATEID:
 				nfs4_state_mark_reclaim_nograce(server->nfs_client, state);
+			case -EKEYEXPIRED:
+				/*
+				 * User RPCSEC_GSS context has expired.
+				 * We cannot recover this stateid now, so
+				 * skip it and allow recovery thread to
+				 * proceed.
+				 */
 			case -ENOMEM:
 				err = 0;
 				goto out;
@@ -1255,8 +1280,6 @@ static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata)
 	struct nfs4_opendata *data = calldata;
 
 	data->rpc_status = task->tk_status;
-	if (RPC_ASSASSINATED(task))
-		return;
 	if (data->rpc_status == 0) {
 		memcpy(data->o_res.stateid.data, data->c_res.stateid.data,
 				sizeof(data->o_res.stateid.data));
@@ -1356,13 +1379,13 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
 	}
 	/* Update sequence id. */
 	data->o_arg.id = sp->so_owner_id.id;
-	data->o_arg.clientid = sp->so_client->cl_clientid;
+	data->o_arg.clientid = sp->so_server->nfs_client->cl_clientid;
 	if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS) {
 		task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR];
 		nfs_copy_fh(&data->o_res.fh, data->o_arg.fh);
 	}
 	data->timestamp = jiffies;
-	if (nfs4_setup_sequence(data->o_arg.server->nfs_client,
+	if (nfs4_setup_sequence(data->o_arg.server,
 				&data->o_arg.seq_args,
 				&data->o_res.seq_res, 1, task))
 		return;
@@ -1385,11 +1408,9 @@ static void nfs4_open_done(struct rpc_task *task, void *calldata)
 
 	data->rpc_status = task->tk_status;
 
-	nfs4_sequence_done(data->o_arg.server, &data->o_res.seq_res,
-			task->tk_status);
-
-	if (RPC_ASSASSINATED(task))
+	if (!nfs4_sequence_done(task, &data->o_res.seq_res))
 		return;
+
 	if (task->tk_status == 0) {
 		switch (data->o_res.f_attr->mode & S_IFMT) {
 			case S_IFREG:
@@ -1591,7 +1612,6 @@ static int nfs4_do_open_expired(struct nfs_open_context *ctx, struct nfs4_state
 			goto out;
 		case -NFS4ERR_GRACE:
 		case -NFS4ERR_DELAY:
-		case -EKEYEXPIRED:
 			nfs4_handle_exception(server, err, &exception);
 			err = 0;
 		}
@@ -1773,7 +1793,7 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
 	if (nfs4_copy_delegation_stateid(&arg.stateid, inode)) {
 		/* Use that stateid */
 	} else if (state != NULL) {
-		nfs4_copy_stateid(&arg.stateid, state, current->files);
+		nfs4_copy_stateid(&arg.stateid, state, current->files, current->tgid);
 	} else
 		memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid));
 
@@ -1838,8 +1858,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
 	struct nfs4_state *state = calldata->state;
 	struct nfs_server *server = NFS_SERVER(calldata->inode);
 
-	nfs4_sequence_done(server, &calldata->res.seq_res, task->tk_status);
-	if (RPC_ASSASSINATED(task))
+	if (!nfs4_sequence_done(task, &calldata->res.seq_res))
 		return;
         /* hmm. we are done with the inode, and in the process of freeing
 	 * the state_owner. we keep this around to process errors
@@ -1903,7 +1922,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
 
 	nfs_fattr_init(calldata->res.fattr);
 	calldata->timestamp = jiffies;
-	if (nfs4_setup_sequence((NFS_SERVER(calldata->inode))->nfs_client,
+	if (nfs4_setup_sequence(NFS_SERVER(calldata->inode),
 				&calldata->arg.seq_args, &calldata->res.seq_res,
 				1, task))
 		return;
@@ -1962,7 +1981,6 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, i
 	calldata->res.fattr = &calldata->fattr;
 	calldata->res.seqid = calldata->arg.seqid;
 	calldata->res.server = server;
-	calldata->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
 	path_get(path);
 	calldata->path = *path;
 
@@ -1985,118 +2003,17 @@ out:
 	return status;
 }
 
-static int nfs4_intent_set_file(struct nameidata *nd, struct path *path, struct nfs4_state *state, fmode_t fmode)
+static struct inode *
+nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, int open_flags, struct iattr *attr)
 {
-	struct file *filp;
-	int ret;
-
-	/* If the open_intent is for execute, we have an extra check to make */
-	if (fmode & FMODE_EXEC) {
-		ret = nfs_may_open(state->inode,
-				state->owner->so_cred,
-				nd->intent.open.flags);
-		if (ret < 0)
-			goto out_close;
-	}
-	filp = lookup_instantiate_filp(nd, path->dentry, NULL);
-	if (!IS_ERR(filp)) {
-		struct nfs_open_context *ctx;
-		ctx = nfs_file_open_context(filp);
-		ctx->state = state;
-		return 0;
-	}
-	ret = PTR_ERR(filp);
-out_close:
-	nfs4_close_sync(path, state, fmode & (FMODE_READ|FMODE_WRITE));
-	return ret;
-}
-
-struct dentry *
-nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
-{
-	struct path path = {
-		.mnt = nd->path.mnt,
-		.dentry = dentry,
-	};
-	struct dentry *parent;
-	struct iattr attr;
-	struct rpc_cred *cred;
 	struct nfs4_state *state;
-	struct dentry *res;
-	fmode_t fmode = nd->intent.open.flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
-
-	if (nd->flags & LOOKUP_CREATE) {
-		attr.ia_mode = nd->intent.open.create_mode;
-		attr.ia_valid = ATTR_MODE;
-		if (!IS_POSIXACL(dir))
-			attr.ia_mode &= ~current_umask();
-	} else {
-		attr.ia_valid = 0;
-		BUG_ON(nd->intent.open.flags & O_CREAT);
-	}
 
-	cred = rpc_lookup_cred();
-	if (IS_ERR(cred))
-		return (struct dentry *)cred;
-	parent = dentry->d_parent;
 	/* Protect against concurrent sillydeletes */
-	nfs_block_sillyrename(parent);
-	state = nfs4_do_open(dir, &path, fmode, nd->intent.open.flags, &attr, cred);
-	put_rpccred(cred);
-	if (IS_ERR(state)) {
-		if (PTR_ERR(state) == -ENOENT) {
-			d_add(dentry, NULL);
-			nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
-		}
-		nfs_unblock_sillyrename(parent);
-		return (struct dentry *)state;
-	}
-	res = d_add_unique(dentry, igrab(state->inode));
-	if (res != NULL)
-		path.dentry = res;
-	nfs_set_verifier(path.dentry, nfs_save_change_attribute(dir));
-	nfs_unblock_sillyrename(parent);
-	nfs4_intent_set_file(nd, &path, state, fmode);
-	return res;
-}
-
-int
-nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, struct nameidata *nd)
-{
-	struct path path = {
-		.mnt = nd->path.mnt,
-		.dentry = dentry,
-	};
-	struct rpc_cred *cred;
-	struct nfs4_state *state;
-	fmode_t fmode = openflags & (FMODE_READ | FMODE_WRITE);
-
-	cred = rpc_lookup_cred();
-	if (IS_ERR(cred))
-		return PTR_ERR(cred);
-	state = nfs4_do_open(dir, &path, fmode, openflags, NULL, cred);
-	put_rpccred(cred);
-	if (IS_ERR(state)) {
-		switch (PTR_ERR(state)) {
-			case -EPERM:
-			case -EACCES:
-			case -EDQUOT:
-			case -ENOSPC:
-			case -EROFS:
-				return PTR_ERR(state);
-			default:
-				goto out_drop;
-		}
-	}
-	if (state->inode == dentry->d_inode) {
-		nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
-		nfs4_intent_set_file(nd, &path, state, fmode);
-		return 1;
-	}
-	nfs4_close_sync(&path, state, fmode);
-out_drop:
-	d_drop(dentry);
-	return 0;
+	state = nfs4_do_open(dir, &ctx->path, ctx->mode, open_flags, attr, ctx->cred);
+	if (IS_ERR(state))
+		return ERR_CAST(state);
+	ctx->state = state;
+	return igrab(state->inode);
 }
 
 static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync)
@@ -2260,8 +2177,7 @@ static int nfs4_get_referral(struct inode *dir, const struct qstr *name, struct
 out:
 	if (page)
 		__free_page(page);
-	if (locations)
-		kfree(locations);
+	kfree(locations);
 	return status;
 }
 
@@ -2554,36 +2470,34 @@ static int nfs4_proc_readlink(struct inode *inode, struct page *page,
 
 static int
 nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
-                 int flags, struct nameidata *nd)
+                 int flags, struct nfs_open_context *ctx)
 {
-	struct path path = {
-		.mnt = nd->path.mnt,
+	struct path my_path = {
 		.dentry = dentry,
 	};
+	struct path *path = &my_path;
 	struct nfs4_state *state;
-	struct rpc_cred *cred;
-	fmode_t fmode = flags & (FMODE_READ | FMODE_WRITE);
+	struct rpc_cred *cred = NULL;
+	fmode_t fmode = 0;
 	int status = 0;
 
-	cred = rpc_lookup_cred();
-	if (IS_ERR(cred)) {
-		status = PTR_ERR(cred);
-		goto out;
+	if (ctx != NULL) {
+		cred = ctx->cred;
+		path = &ctx->path;
+		fmode = ctx->mode;
 	}
-	state = nfs4_do_open(dir, &path, fmode, flags, sattr, cred);
+	state = nfs4_do_open(dir, path, fmode, flags, sattr, cred);
 	d_drop(dentry);
 	if (IS_ERR(state)) {
 		status = PTR_ERR(state);
-		goto out_putcred;
+		goto out;
 	}
 	d_add(dentry, igrab(state->inode));
 	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
-	if (status == 0 && (nd->flags & LOOKUP_OPEN) != 0)
-		status = nfs4_intent_set_file(nd, &path, state, fmode);
+	if (ctx != NULL)
+		ctx->state = state;
 	else
-		nfs4_close_sync(&path, state, fmode);
-out_putcred:
-	put_rpccred(cred);
+		nfs4_close_sync(path, state, fmode);
 out:
 	return status;
 }
@@ -2641,6 +2555,7 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)
 
 	args->bitmask = server->cache_consistency_bitmask;
 	res->server = server;
+	res->seq_res.sr_slot = NULL;
 	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE];
 }
 
@@ -2648,7 +2563,8 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir)
 {
 	struct nfs_removeres *res = task->tk_msg.rpc_resp;
 
-	nfs4_sequence_done(res->server, &res->seq_res, task->tk_status);
+	if (!nfs4_sequence_done(task, &res->seq_res))
+		return 0;
 	if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN)
 		return 0;
 	update_changeattr(dir, &res->cinfo);
@@ -2656,18 +2572,46 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir)
 	return 1;
 }
 
+static void nfs4_proc_rename_setup(struct rpc_message *msg, struct inode *dir)
+{
+	struct nfs_server *server = NFS_SERVER(dir);
+	struct nfs_renameargs *arg = msg->rpc_argp;
+	struct nfs_renameres *res = msg->rpc_resp;
+
+	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME];
+	arg->bitmask = server->attr_bitmask;
+	res->server = server;
+}
+
+static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
+				 struct inode *new_dir)
+{
+	struct nfs_renameres *res = task->tk_msg.rpc_resp;
+
+	if (!nfs4_sequence_done(task, &res->seq_res))
+		return 0;
+	if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN)
+		return 0;
+
+	update_changeattr(old_dir, &res->old_cinfo);
+	nfs_post_op_update_inode(old_dir, res->old_fattr);
+	update_changeattr(new_dir, &res->new_cinfo);
+	nfs_post_op_update_inode(new_dir, res->new_fattr);
+	return 1;
+}
+
 static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name,
 		struct inode *new_dir, struct qstr *new_name)
 {
 	struct nfs_server *server = NFS_SERVER(old_dir);
-	struct nfs4_rename_arg arg = {
+	struct nfs_renameargs arg = {
 		.old_dir = NFS_FH(old_dir),
 		.new_dir = NFS_FH(new_dir),
 		.old_name = old_name,
 		.new_name = new_name,
 		.bitmask = server->attr_bitmask,
 	};
-	struct nfs4_rename_res res = {
+	struct nfs_renameres res = {
 		.server = server,
 	};
 	struct rpc_message msg = {
@@ -2881,15 +2825,16 @@ static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
 }
 
 static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
-                  u64 cookie, struct page *page, unsigned int count, int plus)
+		u64 cookie, struct page **pages, unsigned int count, int plus)
 {
 	struct inode		*dir = dentry->d_inode;
 	struct nfs4_readdir_arg args = {
 		.fh = NFS_FH(dir),
-		.pages = &page,
+		.pages = pages,
 		.pgbase = 0,
 		.count = count,
 		.bitmask = NFS_SERVER(dentry->d_inode)->attr_bitmask,
+		.plus = plus,
 	};
 	struct nfs4_readdir_res res;
 	struct rpc_message msg = {
@@ -2917,14 +2862,14 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
 }
 
 static int nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
-                  u64 cookie, struct page *page, unsigned int count, int plus)
+		u64 cookie, struct page **pages, unsigned int count, int plus)
 {
 	struct nfs4_exception exception = { };
 	int err;
 	do {
 		err = nfs4_handle_exception(NFS_SERVER(dentry->d_inode),
 				_nfs4_proc_readdir(dentry, cred, cookie,
-					page, count, plus),
+					pages, count, plus),
 				&exception);
 	} while (exception.retry);
 	return err;
@@ -3093,7 +3038,8 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
 
 	dprintk("--> %s\n", __func__);
 
-	nfs4_sequence_done(server, &data->res.seq_res, task->tk_status);
+	if (!nfs4_sequence_done(task, &data->res.seq_res))
+		return -EAGAIN;
 
 	if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) {
 		nfs_restart_rpc(task, server->nfs_client);
@@ -3116,8 +3062,8 @@ static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
 {
 	struct inode *inode = data->inode;
 	
-	nfs4_sequence_done(NFS_SERVER(inode), &data->res.seq_res,
-			   task->tk_status);
+	if (!nfs4_sequence_done(task, &data->res.seq_res))
+		return -EAGAIN;
 
 	if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) {
 		nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client);
@@ -3145,8 +3091,9 @@ static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data)
 {
 	struct inode *inode = data->inode;
 	
-	nfs4_sequence_done(NFS_SERVER(inode), &data->res.seq_res,
-			   task->tk_status);
+	if (!nfs4_sequence_done(task, &data->res.seq_res))
+		return -EAGAIN;
+
 	if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) {
 		nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client);
 		return -EAGAIN;
@@ -3196,10 +3143,7 @@ static void nfs4_renew_done(struct rpc_task *task, void *calldata)
 			nfs4_schedule_state_recovery(clp);
 		return;
 	}
-	spin_lock(&clp->cl_lock);
-	if (time_before(clp->cl_last_renewal,timestamp))
-		clp->cl_last_renewal = timestamp;
-	spin_unlock(&clp->cl_lock);
+	do_renew_lease(clp, timestamp);
 }
 
 static const struct rpc_call_ops nfs4_renew_ops = {
@@ -3240,10 +3184,7 @@ int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred)
 	status = rpc_call_sync(clp->cl_rpcclient, &msg, 0);
 	if (status < 0)
 		return status;
-	spin_lock(&clp->cl_lock);
-	if (time_before(clp->cl_last_renewal,now))
-		clp->cl_last_renewal = now;
-	spin_unlock(&clp->cl_lock);
+	do_renew_lease(clp, now);
 	return 0;
 }
 
@@ -3464,9 +3405,11 @@ static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen
 }
 
 static int
-_nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs_client *clp, struct nfs4_state *state)
+nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state)
 {
-	if (!clp || task->tk_status >= 0)
+	struct nfs_client *clp = server->nfs_client;
+
+	if (task->tk_status >= 0)
 		return 0;
 	switch(task->tk_status) {
 		case -NFS4ERR_ADMIN_REVOKED:
@@ -3477,9 +3420,6 @@ _nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
 			nfs4_state_mark_reclaim_nograce(clp, state);
 			goto do_state_recovery;
 		case -NFS4ERR_STALE_STATEID:
-			if (state == NULL)
-				break;
-			nfs4_state_mark_reclaim_reboot(clp, state);
 		case -NFS4ERR_STALE_CLIENTID:
 		case -NFS4ERR_EXPIRED:
 			goto do_state_recovery;
@@ -3498,8 +3438,7 @@ _nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
 			return -EAGAIN;
 #endif /* CONFIG_NFS_V4_1 */
 		case -NFS4ERR_DELAY:
-			if (server)
-				nfs_inc_server_stats(server, NFSIOS_DELAY);
+			nfs_inc_server_stats(server, NFSIOS_DELAY);
 		case -NFS4ERR_GRACE:
 		case -EKEYEXPIRED:
 			rpc_delay(task, NFS4_POLL_RETRY_MAX);
@@ -3520,12 +3459,6 @@ do_state_recovery:
 	return -EAGAIN;
 }
 
-static int
-nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state)
-{
-	return _nfs4_async_handle_error(task, server, server->nfs_client, state);
-}
-
 int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
 		unsigned short port, struct rpc_cred *cred,
 		struct nfs4_setclientid_res *res)
@@ -3620,7 +3553,6 @@ int nfs4_proc_setclientid_confirm(struct nfs_client *clp,
 			case -NFS4ERR_RESOURCE:
 				/* The IBM lawyers misread another document! */
 			case -NFS4ERR_DELAY:
-			case -EKEYEXPIRED:
 				err = nfs4_delay(clp->cl_rpcclient, &timeout);
 		}
 	} while (err == 0);
@@ -3641,8 +3573,8 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
 {
 	struct nfs4_delegreturndata *data = calldata;
 
-	nfs4_sequence_done(data->res.server, &data->res.seq_res,
-			task->tk_status);
+	if (!nfs4_sequence_done(task, &data->res.seq_res))
+		return;
 
 	switch (task->tk_status) {
 	case -NFS4ERR_STALE_STATEID:
@@ -3672,7 +3604,7 @@ static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data)
 
 	d_data = (struct nfs4_delegreturndata *)data;
 
-	if (nfs4_setup_sequence(d_data->res.server->nfs_client,
+	if (nfs4_setup_sequence(d_data->res.server,
 				&d_data->args.seq_args,
 				&d_data->res.seq_res, 1, task))
 		return;
@@ -3715,7 +3647,6 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
 	memcpy(&data->stateid, stateid, sizeof(data->stateid));
 	data->res.fattr = &data->fattr;
 	data->res.server = server;
-	data->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
 	nfs_fattr_init(data->res.fattr);
 	data->timestamp = jiffies;
 	data->rpc_status = 0;
@@ -3868,7 +3799,6 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl,
 	p->arg.fl = &p->fl;
 	p->arg.seqid = seqid;
 	p->res.seqid = seqid;
-	p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
 	p->arg.stateid = &lsp->ls_stateid;
 	p->lsp = lsp;
 	atomic_inc(&lsp->ls_count);
@@ -3892,9 +3822,7 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
 {
 	struct nfs4_unlockdata *calldata = data;
 
-	nfs4_sequence_done(calldata->server, &calldata->res.seq_res,
-			   task->tk_status);
-	if (RPC_ASSASSINATED(task))
+	if (!nfs4_sequence_done(task, &calldata->res.seq_res))
 		return;
 	switch (task->tk_status) {
 		case 0:
@@ -3927,7 +3855,7 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data)
 		return;
 	}
 	calldata->timestamp = jiffies;
-	if (nfs4_setup_sequence(calldata->server->nfs_client,
+	if (nfs4_setup_sequence(calldata->server,
 				&calldata->arg.seq_args,
 				&calldata->res.seq_res, 1, task))
 		return;
@@ -4050,7 +3978,6 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
 	p->arg.lock_owner.clientid = server->nfs_client->cl_clientid;
 	p->arg.lock_owner.id = lsp->ls_id.id;
 	p->res.lock_seqid = p->arg.lock_seqid;
-	p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
 	p->lsp = lsp;
 	p->server = server;
 	atomic_inc(&lsp->ls_count);
@@ -4082,7 +4009,8 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata)
 	} else
 		data->arg.new_lock_owner = 0;
 	data->timestamp = jiffies;
-	if (nfs4_setup_sequence(data->server->nfs_client, &data->arg.seq_args,
+	if (nfs4_setup_sequence(data->server,
+				&data->arg.seq_args,
 				&data->res.seq_res, 1, task))
 		return;
 	rpc_call_start(task);
@@ -4101,12 +4029,10 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)
 
 	dprintk("%s: begin!\n", __func__);
 
-	nfs4_sequence_done(data->server, &data->res.seq_res,
-			task->tk_status);
+	if (!nfs4_sequence_done(task, &data->res.seq_res))
+		return;
 
 	data->rpc_status = task->tk_status;
-	if (RPC_ASSASSINATED(task))
-		goto out;
 	if (data->arg.new_lock_owner != 0) {
 		if (data->rpc_status == 0)
 			nfs_confirm_seqid(&data->lsp->ls_seqid, 0);
@@ -4238,7 +4164,7 @@ static int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request
 		if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0)
 			return 0;
 		err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_RECLAIM);
-		if (err != -NFS4ERR_DELAY && err != -EKEYEXPIRED)
+		if (err != -NFS4ERR_DELAY)
 			break;
 		nfs4_handle_exception(server, err, &exception);
 	} while (exception.retry);
@@ -4263,7 +4189,6 @@ static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request
 			goto out;
 		case -NFS4ERR_GRACE:
 		case -NFS4ERR_DELAY:
-		case -EKEYEXPIRED:
 			nfs4_handle_exception(server, err, &exception);
 			err = 0;
 		}
@@ -4409,13 +4334,21 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl)
 				nfs4_state_mark_reclaim_nograce(server->nfs_client, state);
 				err = 0;
 				goto out;
+			case -EKEYEXPIRED:
+				/*
+				 * User RPCSEC_GSS context has expired.
+				 * We cannot recover this stateid now, so
+				 * skip it and allow recovery thread to
+				 * proceed.
+				 */
+				err = 0;
+				goto out;
 			case -ENOMEM:
 			case -NFS4ERR_DENIED:
 				/* kill_proc(fl->fl_pid, SIGLOST, 1); */
 				err = 0;
 				goto out;
 			case -NFS4ERR_DELAY:
-			case -EKEYEXPIRED:
 				break;
 		}
 		err = nfs4_handle_exception(server, err, &exception);
@@ -4424,6 +4357,34 @@ out:
 	return err;
 }
 
+static void nfs4_release_lockowner_release(void *calldata)
+{
+	kfree(calldata);
+}
+
+const struct rpc_call_ops nfs4_release_lockowner_ops = {
+	.rpc_release = nfs4_release_lockowner_release,
+};
+
+void nfs4_release_lockowner(const struct nfs4_lock_state *lsp)
+{
+	struct nfs_server *server = lsp->ls_state->owner->so_server;
+	struct nfs_release_lockowner_args *args;
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RELEASE_LOCKOWNER],
+	};
+
+	if (server->nfs_client->cl_mvops->minor_version != 0)
+		return;
+	args = kmalloc(sizeof(*args), GFP_NOFS);
+	if (!args)
+		return;
+	args->lock_owner.clientid = server->nfs_client->cl_clientid;
+	args->lock_owner.id = lsp->ls_id.id;
+	msg.rpc_argp = args;
+	rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, args);
+}
+
 #define XATTR_NAME_NFSV4_ACL "system.nfs4_acl"
 
 int nfs4_setxattr(struct dentry *dentry, const char *key, const void *buf,
@@ -4611,11 +4572,11 @@ static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata)
 			(struct nfs4_get_lease_time_data *)calldata;
 
 	dprintk("--> %s\n", __func__);
-	nfs41_sequence_done(data->clp, &data->res->lr_seq_res, task->tk_status);
+	if (!nfs41_sequence_done(task, &data->res->lr_seq_res))
+		return;
 	switch (task->tk_status) {
 	case -NFS4ERR_DELAY:
 	case -NFS4ERR_GRACE:
-	case -EKEYEXPIRED:
 		dprintk("%s Retry: tk_status %d\n", __func__, task->tk_status);
 		rpc_delay(task, NFS4_POLL_RETRY_MIN);
 		task->tk_status = 0;
@@ -4655,7 +4616,6 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)
 	};
 	int status;
 
-	res.lr_seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
 	dprintk("--> %s\n", __func__);
 	task = rpc_run_task(&task_setup);
 
@@ -4805,13 +4765,6 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)
 	if (!session)
 		return NULL;
 
-	/*
-	 * The create session reply races with the server back
-	 * channel probe. Mark the client NFS_CS_SESSION_INITING
-	 * so that the client back channel can find the
-	 * nfs_client struct
-	 */
-	clp->cl_cons_state = NFS_CS_SESSION_INITING;
 	init_completion(&session->complete);
 
 	tbl = &session->fc_slot_table;
@@ -4824,6 +4777,8 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)
 	spin_lock_init(&tbl->slot_tbl_lock);
 	rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table");
 
+	session->session_state = 1<<NFS4_SESSION_INITING;
+
 	session->clp = clp;
 	return session;
 }
@@ -4887,49 +4842,56 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args)
 		args->bc_attrs.max_reqs);
 }
 
-static int _verify_channel_attr(char *chan, char *attr_name, u32 sent, u32 rcvd)
+static int nfs4_verify_fore_channel_attrs(struct nfs41_create_session_args *args, struct nfs4_session *session)
 {
-	if (rcvd <= sent)
-		return 0;
-	printk(KERN_WARNING "%s: Session INVALID: %s channel %s increased. "
-		"sent=%u rcvd=%u\n", __func__, chan, attr_name, sent, rcvd);
-	return -EINVAL;
+	struct nfs4_channel_attrs *sent = &args->fc_attrs;
+	struct nfs4_channel_attrs *rcvd = &session->fc_attrs;
+
+	if (rcvd->headerpadsz > sent->headerpadsz)
+		return -EINVAL;
+	if (rcvd->max_resp_sz > sent->max_resp_sz)
+		return -EINVAL;
+	/*
+	 * Our requested max_ops is the minimum we need; we're not
+	 * prepared to break up compounds into smaller pieces than that.
+	 * So, no point even trying to continue if the server won't
+	 * cooperate:
+	 */
+	if (rcvd->max_ops < sent->max_ops)
+		return -EINVAL;
+	if (rcvd->max_reqs == 0)
+		return -EINVAL;
+	return 0;
 }
 
-#define _verify_fore_channel_attr(_name_) \
-	_verify_channel_attr("fore", #_name_, \
-			     args->fc_attrs._name_, \
-			     session->fc_attrs._name_)
+static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args, struct nfs4_session *session)
+{
+	struct nfs4_channel_attrs *sent = &args->bc_attrs;
+	struct nfs4_channel_attrs *rcvd = &session->bc_attrs;
 
-#define _verify_back_channel_attr(_name_) \
-	_verify_channel_attr("back", #_name_, \
-			     args->bc_attrs._name_, \
-			     session->bc_attrs._name_)
+	if (rcvd->max_rqst_sz > sent->max_rqst_sz)
+		return -EINVAL;
+	if (rcvd->max_resp_sz < sent->max_resp_sz)
+		return -EINVAL;
+	if (rcvd->max_resp_sz_cached > sent->max_resp_sz_cached)
+		return -EINVAL;
+	/* These would render the backchannel useless: */
+	if (rcvd->max_ops  == 0)
+		return -EINVAL;
+	if (rcvd->max_reqs == 0)
+		return -EINVAL;
+	return 0;
+}
 
-/*
- * The server is not allowed to increase the fore channel header pad size,
- * maximum response size, or maximum number of operations.
- *
- * The back channel attributes are only negotiatied down: We send what the
- * (back channel) server insists upon.
- */
 static int nfs4_verify_channel_attrs(struct nfs41_create_session_args *args,
 				     struct nfs4_session *session)
 {
-	int ret = 0;
-
-	ret |= _verify_fore_channel_attr(headerpadsz);
-	ret |= _verify_fore_channel_attr(max_resp_sz);
-	ret |= _verify_fore_channel_attr(max_ops);
-
-	ret |= _verify_back_channel_attr(headerpadsz);
-	ret |= _verify_back_channel_attr(max_rqst_sz);
-	ret |= _verify_back_channel_attr(max_resp_sz);
-	ret |= _verify_back_channel_attr(max_resp_sz_cached);
-	ret |= _verify_back_channel_attr(max_ops);
-	ret |= _verify_back_channel_attr(max_reqs);
+	int ret;
 
-	return ret;
+	ret = nfs4_verify_fore_channel_attrs(args, session);
+	if (ret)
+		return ret;
+	return nfs4_verify_back_channel_attrs(args, session);
 }
 
 static int _nfs4_proc_create_session(struct nfs_client *clp)
@@ -5040,6 +5002,10 @@ int nfs4_init_session(struct nfs_server *server)
 	if (!nfs4_has_session(clp))
 		return 0;
 
+	session = clp->cl_session;
+	if (!test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state))
+		return 0;
+
 	rsize = server->rsize;
 	if (rsize == 0)
 		rsize = NFS_MAX_FILE_IO_SIZE;
@@ -5047,7 +5013,6 @@ int nfs4_init_session(struct nfs_server *server)
 	if (wsize == 0)
 		wsize = NFS_MAX_FILE_IO_SIZE;
 
-	session = clp->cl_session;
 	session->fc_attrs.max_rqst_sz = wsize + nfs41_maxwrite_overhead;
 	session->fc_attrs.max_resp_sz = rsize + nfs41_maxread_overhead;
 
@@ -5060,69 +5025,69 @@ int nfs4_init_session(struct nfs_server *server)
 /*
  * Renew the cl_session lease.
  */
-static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred)
-{
+struct nfs4_sequence_data {
+	struct nfs_client *clp;
 	struct nfs4_sequence_args args;
 	struct nfs4_sequence_res res;
-
-	struct rpc_message msg = {
-		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SEQUENCE],
-		.rpc_argp = &args,
-		.rpc_resp = &res,
-		.rpc_cred = cred,
-	};
-
-	args.sa_cache_this = 0;
-
-	return nfs4_call_sync_sequence(clp, clp->cl_rpcclient, &msg, &args,
-				       &res, args.sa_cache_this, 1);
-}
+};
 
 static void nfs41_sequence_release(void *data)
 {
-	struct nfs_client *clp = (struct nfs_client *)data;
+	struct nfs4_sequence_data *calldata = data;
+	struct nfs_client *clp = calldata->clp;
 
 	if (atomic_read(&clp->cl_count) > 1)
 		nfs4_schedule_state_renewal(clp);
 	nfs_put_client(clp);
+	kfree(calldata);
+}
+
+static int nfs41_sequence_handle_errors(struct rpc_task *task, struct nfs_client *clp)
+{
+	switch(task->tk_status) {
+	case -NFS4ERR_DELAY:
+		rpc_delay(task, NFS4_POLL_RETRY_MAX);
+		return -EAGAIN;
+	default:
+		nfs4_schedule_state_recovery(clp);
+	}
+	return 0;
 }
 
 static void nfs41_sequence_call_done(struct rpc_task *task, void *data)
 {
-	struct nfs_client *clp = (struct nfs_client *)data;
+	struct nfs4_sequence_data *calldata = data;
+	struct nfs_client *clp = calldata->clp;
 
-	nfs41_sequence_done(clp, task->tk_msg.rpc_resp, task->tk_status);
+	if (!nfs41_sequence_done(task, task->tk_msg.rpc_resp))
+		return;
 
 	if (task->tk_status < 0) {
 		dprintk("%s ERROR %d\n", __func__, task->tk_status);
 		if (atomic_read(&clp->cl_count) == 1)
 			goto out;
 
-		if (_nfs4_async_handle_error(task, NULL, clp, NULL)
-								== -EAGAIN) {
-			nfs_restart_rpc(task, clp);
+		if (nfs41_sequence_handle_errors(task, clp) == -EAGAIN) {
+			rpc_restart_call_prepare(task);
 			return;
 		}
 	}
 	dprintk("%s rpc_cred %p\n", __func__, task->tk_msg.rpc_cred);
 out:
-	kfree(task->tk_msg.rpc_argp);
-	kfree(task->tk_msg.rpc_resp);
-
 	dprintk("<-- %s\n", __func__);
 }
 
 static void nfs41_sequence_prepare(struct rpc_task *task, void *data)
 {
-	struct nfs_client *clp;
+	struct nfs4_sequence_data *calldata = data;
+	struct nfs_client *clp = calldata->clp;
 	struct nfs4_sequence_args *args;
 	struct nfs4_sequence_res *res;
 
-	clp = (struct nfs_client *)data;
 	args = task->tk_msg.rpc_argp;
 	res = task->tk_msg.rpc_resp;
 
-	if (nfs4_setup_sequence(clp, args, res, 0, task))
+	if (nfs41_setup_sequence(clp->cl_session, args, res, 0, task))
 		return;
 	rpc_call_start(task);
 }
@@ -5133,32 +5098,66 @@ static const struct rpc_call_ops nfs41_sequence_ops = {
 	.rpc_release = nfs41_sequence_release,
 };
 
-static int nfs41_proc_async_sequence(struct nfs_client *clp,
-				     struct rpc_cred *cred)
+static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred)
 {
-	struct nfs4_sequence_args *args;
-	struct nfs4_sequence_res *res;
+	struct nfs4_sequence_data *calldata;
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SEQUENCE],
 		.rpc_cred = cred,
 	};
+	struct rpc_task_setup task_setup_data = {
+		.rpc_client = clp->cl_rpcclient,
+		.rpc_message = &msg,
+		.callback_ops = &nfs41_sequence_ops,
+		.flags = RPC_TASK_ASYNC | RPC_TASK_SOFT,
+	};
 
 	if (!atomic_inc_not_zero(&clp->cl_count))
-		return -EIO;
-	args = kzalloc(sizeof(*args), GFP_NOFS);
-	res = kzalloc(sizeof(*res), GFP_NOFS);
-	if (!args || !res) {
-		kfree(args);
-		kfree(res);
+		return ERR_PTR(-EIO);
+	calldata = kzalloc(sizeof(*calldata), GFP_NOFS);
+	if (calldata == NULL) {
 		nfs_put_client(clp);
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
 	}
-	res->sr_slotid = NFS4_MAX_SLOT_TABLE;
-	msg.rpc_argp = args;
-	msg.rpc_resp = res;
+	msg.rpc_argp = &calldata->args;
+	msg.rpc_resp = &calldata->res;
+	calldata->clp = clp;
+	task_setup_data.callback_data = calldata;
 
-	return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_SOFT,
-			      &nfs41_sequence_ops, (void *)clp);
+	return rpc_run_task(&task_setup_data);
+}
+
+static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cred)
+{
+	struct rpc_task *task;
+	int ret = 0;
+
+	task = _nfs41_proc_sequence(clp, cred);
+	if (IS_ERR(task))
+		ret = PTR_ERR(task);
+	else
+		rpc_put_task(task);
+	dprintk("<-- %s status=%d\n", __func__, ret);
+	return ret;
+}
+
+static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred)
+{
+	struct rpc_task *task;
+	int ret;
+
+	task = _nfs41_proc_sequence(clp, cred);
+	if (IS_ERR(task)) {
+		ret = PTR_ERR(task);
+		goto out;
+	}
+	ret = rpc_wait_for_completion_task(task);
+	if (!ret)
+		ret = task->tk_status;
+	rpc_put_task(task);
+out:
+	dprintk("<-- %s status=%d\n", __func__, ret);
+	return ret;
 }
 
 struct nfs4_reclaim_complete_data {
@@ -5172,13 +5171,30 @@ static void nfs4_reclaim_complete_prepare(struct rpc_task *task, void *data)
 	struct nfs4_reclaim_complete_data *calldata = data;
 
 	rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);
-	if (nfs4_setup_sequence(calldata->clp, &calldata->arg.seq_args,
+	if (nfs41_setup_sequence(calldata->clp->cl_session,
+				&calldata->arg.seq_args,
 				&calldata->res.seq_res, 0, task))
 		return;
 
 	rpc_call_start(task);
 }
 
+static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nfs_client *clp)
+{
+	switch(task->tk_status) {
+	case 0:
+	case -NFS4ERR_COMPLETE_ALREADY:
+	case -NFS4ERR_WRONG_CRED: /* What to do here? */
+		break;
+	case -NFS4ERR_DELAY:
+		rpc_delay(task, NFS4_POLL_RETRY_MAX);
+		return -EAGAIN;
+	default:
+		nfs4_schedule_state_recovery(clp);
+	}
+	return 0;
+}
+
 static void nfs4_reclaim_complete_done(struct rpc_task *task, void *data)
 {
 	struct nfs4_reclaim_complete_data *calldata = data;
@@ -5186,32 +5202,13 @@ static void nfs4_reclaim_complete_done(struct rpc_task *task, void *data)
 	struct nfs4_sequence_res *res = &calldata->res.seq_res;
 
 	dprintk("--> %s\n", __func__);
-	nfs41_sequence_done(clp, res, task->tk_status);
-	switch (task->tk_status) {
-	case 0:
-	case -NFS4ERR_COMPLETE_ALREADY:
-		break;
-	case -NFS4ERR_BADSESSION:
-	case -NFS4ERR_DEADSESSION:
-		/*
-		 * Handle the session error, but do not retry the operation, as
-		 * we have no way of telling whether the clientid had to be
-		 * reset before we got our reply.  If reset, a new wave of
-		 * reclaim operations will follow, containing their own reclaim
-		 * complete.  We don't want our retry to get on the way of
-		 * recovery by incorrectly indicating to the server that we're
-		 * done reclaiming state since the process had to be restarted.
-		 */
-		_nfs4_async_handle_error(task, NULL, clp, NULL);
-		break;
-	default:
-		if (_nfs4_async_handle_error(
-				task, NULL, clp, NULL) == -EAGAIN) {
-			rpc_restart_call_prepare(task);
-			return;
-		}
-	}
+	if (!nfs41_sequence_done(task, res))
+		return;
 
+	if (nfs41_reclaim_complete_handle_errors(task, clp) == -EAGAIN) {
+		rpc_restart_call_prepare(task);
+		return;
+	}
 	dprintk("<-- %s\n", __func__);
 }
 
@@ -5252,7 +5249,6 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp)
 		goto out;
 	calldata->clp = clp;
 	calldata->arg.one_fs = 0;
-	calldata->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
 
 	msg.rpc_argp = &calldata->arg;
 	msg.rpc_resp = &calldata->res;
@@ -5268,6 +5264,147 @@ out:
 	dprintk("<-- %s status=%d\n", __func__, status);
 	return status;
 }
+
+static void
+nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
+{
+	struct nfs4_layoutget *lgp = calldata;
+	struct inode *ino = lgp->args.inode;
+	struct nfs_server *server = NFS_SERVER(ino);
+
+	dprintk("--> %s\n", __func__);
+	if (nfs4_setup_sequence(server, &lgp->args.seq_args,
+				&lgp->res.seq_res, 0, task))
+		return;
+	rpc_call_start(task);
+}
+
+static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
+{
+	struct nfs4_layoutget *lgp = calldata;
+	struct nfs_server *server = NFS_SERVER(lgp->args.inode);
+
+	dprintk("--> %s\n", __func__);
+
+	if (!nfs4_sequence_done(task, &lgp->res.seq_res))
+		return;
+
+	switch (task->tk_status) {
+	case 0:
+		break;
+	case -NFS4ERR_LAYOUTTRYLATER:
+	case -NFS4ERR_RECALLCONFLICT:
+		task->tk_status = -NFS4ERR_DELAY;
+		/* Fall through */
+	default:
+		if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) {
+			rpc_restart_call_prepare(task);
+			return;
+		}
+	}
+	lgp->status = task->tk_status;
+	dprintk("<-- %s\n", __func__);
+}
+
+static void nfs4_layoutget_release(void *calldata)
+{
+	struct nfs4_layoutget *lgp = calldata;
+
+	dprintk("--> %s\n", __func__);
+	put_layout_hdr(lgp->args.inode);
+	if (lgp->res.layout.buf != NULL)
+		free_page((unsigned long) lgp->res.layout.buf);
+	put_nfs_open_context(lgp->args.ctx);
+	kfree(calldata);
+	dprintk("<-- %s\n", __func__);
+}
+
+static const struct rpc_call_ops nfs4_layoutget_call_ops = {
+	.rpc_call_prepare = nfs4_layoutget_prepare,
+	.rpc_call_done = nfs4_layoutget_done,
+	.rpc_release = nfs4_layoutget_release,
+};
+
+int nfs4_proc_layoutget(struct nfs4_layoutget *lgp)
+{
+	struct nfs_server *server = NFS_SERVER(lgp->args.inode);
+	struct rpc_task *task;
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTGET],
+		.rpc_argp = &lgp->args,
+		.rpc_resp = &lgp->res,
+	};
+	struct rpc_task_setup task_setup_data = {
+		.rpc_client = server->client,
+		.rpc_message = &msg,
+		.callback_ops = &nfs4_layoutget_call_ops,
+		.callback_data = lgp,
+		.flags = RPC_TASK_ASYNC,
+	};
+	int status = 0;
+
+	dprintk("--> %s\n", __func__);
+
+	lgp->res.layout.buf = (void *)__get_free_page(GFP_NOFS);
+	if (lgp->res.layout.buf == NULL) {
+		nfs4_layoutget_release(lgp);
+		return -ENOMEM;
+	}
+
+	lgp->res.seq_res.sr_slot = NULL;
+	task = rpc_run_task(&task_setup_data);
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+	status = nfs4_wait_for_completion_rpc_task(task);
+	if (status != 0)
+		goto out;
+	status = lgp->status;
+	if (status != 0)
+		goto out;
+	status = pnfs_layout_process(lgp);
+out:
+	rpc_put_task(task);
+	dprintk("<-- %s status=%d\n", __func__, status);
+	return status;
+}
+
+static int
+_nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
+{
+	struct nfs4_getdeviceinfo_args args = {
+		.pdev = pdev,
+	};
+	struct nfs4_getdeviceinfo_res res = {
+		.pdev = pdev,
+	};
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICEINFO],
+		.rpc_argp = &args,
+		.rpc_resp = &res,
+	};
+	int status;
+
+	dprintk("--> %s\n", __func__);
+	status = nfs4_call_sync(server, &msg, &args, &res, 0);
+	dprintk("<-- %s status=%d\n", __func__, status);
+
+	return status;
+}
+
+int nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
+{
+	struct nfs4_exception exception = { };
+	int err;
+
+	do {
+		err = nfs4_handle_exception(server,
+					_nfs4_proc_getdeviceinfo(server, pdev),
+					&exception);
+	} while (exception.retry);
+	return err;
+}
+EXPORT_SYMBOL_GPL(nfs4_proc_getdeviceinfo);
+
 #endif /* CONFIG_NFS_V4_1 */
 
 struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = {
@@ -5325,28 +5462,30 @@ struct nfs4_state_maintenance_ops nfs41_state_renewal_ops = {
 };
 #endif
 
-/*
- * Per minor version reboot and network partition recovery ops
- */
-
-struct nfs4_state_recovery_ops *nfs4_reboot_recovery_ops[] = {
-	&nfs40_reboot_recovery_ops,
-#if defined(CONFIG_NFS_V4_1)
-	&nfs41_reboot_recovery_ops,
-#endif
+static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = {
+	.minor_version = 0,
+	.call_sync = _nfs4_call_sync,
+	.validate_stateid = nfs4_validate_delegation_stateid,
+	.reboot_recovery_ops = &nfs40_reboot_recovery_ops,
+	.nograce_recovery_ops = &nfs40_nograce_recovery_ops,
+	.state_renewal_ops = &nfs40_state_renewal_ops,
 };
 
-struct nfs4_state_recovery_ops *nfs4_nograce_recovery_ops[] = {
-	&nfs40_nograce_recovery_ops,
 #if defined(CONFIG_NFS_V4_1)
-	&nfs41_nograce_recovery_ops,
-#endif
+static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {
+	.minor_version = 1,
+	.call_sync = _nfs4_call_sync_session,
+	.validate_stateid = nfs41_validate_delegation_stateid,
+	.reboot_recovery_ops = &nfs41_reboot_recovery_ops,
+	.nograce_recovery_ops = &nfs41_nograce_recovery_ops,
+	.state_renewal_ops = &nfs41_state_renewal_ops,
 };
+#endif
 
-struct nfs4_state_maintenance_ops *nfs4_state_renewal_ops[] = {
-	&nfs40_state_renewal_ops,
+const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = {
+	[0] = &nfs_v4_0_minor_ops,
 #if defined(CONFIG_NFS_V4_1)
-	&nfs41_state_renewal_ops,
+	[1] = &nfs_v4_1_minor_ops,
 #endif
 };
 
@@ -5376,6 +5515,8 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
 	.unlink_setup	= nfs4_proc_unlink_setup,
 	.unlink_done	= nfs4_proc_unlink_done,
 	.rename		= nfs4_proc_rename,
+	.rename_setup	= nfs4_proc_rename_setup,
+	.rename_done	= nfs4_proc_rename_done,
 	.link		= nfs4_proc_link,
 	.symlink	= nfs4_proc_symlink,
 	.mkdir		= nfs4_proc_mkdir,
@@ -5396,6 +5537,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
 	.lock		= nfs4_proc_lock,
 	.clear_acl_cache = nfs4_zap_acl_attr,
 	.close_context  = nfs4_close_context,
+	.open_context	= nfs4_atomic_open,
 };
 
 /*
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index d87f10327b7..72b6c580af1 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -54,14 +54,14 @@
 void
 nfs4_renew_state(struct work_struct *work)
 {
-	struct nfs4_state_maintenance_ops *ops;
+	const struct nfs4_state_maintenance_ops *ops;
 	struct nfs_client *clp =
 		container_of(work, struct nfs_client, cl_renewd.work);
 	struct rpc_cred *cred;
 	long lease;
 	unsigned long last, now;
 
-	ops = nfs4_state_renewal_ops[clp->cl_minorversion];
+	ops = clp->cl_mvops->state_renewal_ops;
 	dprintk("%s: start\n", __func__);
 	/* Are there any active superblocks? */
 	if (list_empty(&clp->cl_superblocks))
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 34acf5926fd..f575a312673 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -40,12 +40,13 @@
 
 #include <linux/kernel.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
+#include <linux/fs.h>
 #include <linux/nfs_fs.h>
 #include <linux/nfs_idmap.h>
 #include <linux/kthread.h>
 #include <linux/module.h>
 #include <linux/random.h>
+#include <linux/ratelimit.h>
 #include <linux/workqueue.h>
 #include <linux/bitops.h>
 
@@ -53,6 +54,7 @@
 #include "callback.h"
 #include "delegation.h"
 #include "internal.h"
+#include "pnfs.h"
 
 #define OPENOWNER_POOL_SIZE	8
 
@@ -145,7 +147,9 @@ static void nfs4_end_drain_session(struct nfs_client *clp)
 	struct nfs4_session *ses = clp->cl_session;
 	int max_slots;
 
-	if (test_and_clear_bit(NFS4CLNT_SESSION_DRAINING, &clp->cl_state)) {
+	if (ses == NULL)
+		return;
+	if (test_and_clear_bit(NFS4_SESSION_DRAINING, &ses->session_state)) {
 		spin_lock(&ses->fc_slot_table.slot_tbl_lock);
 		max_slots = ses->fc_slot_table.max_slots;
 		while (max_slots--) {
@@ -167,7 +171,7 @@ static int nfs4_begin_drain_session(struct nfs_client *clp)
 	struct nfs4_slot_table *tbl = &ses->fc_slot_table;
 
 	spin_lock(&tbl->slot_tbl_lock);
-	set_bit(NFS4CLNT_SESSION_DRAINING, &clp->cl_state);
+	set_bit(NFS4_SESSION_DRAINING, &ses->session_state);
 	if (tbl->highest_used_slotid != -1) {
 		INIT_COMPLETION(ses->complete);
 		spin_unlock(&tbl->slot_tbl_lock);
@@ -371,7 +375,6 @@ nfs4_alloc_state_owner(void)
 		return NULL;
 	spin_lock_init(&sp->so_lock);
 	INIT_LIST_HEAD(&sp->so_states);
-	INIT_LIST_HEAD(&sp->so_delegations);
 	rpc_init_wait_queue(&sp->so_sequence.wait, "Seqid_waitqueue");
 	sp->so_seqid.sequence = &sp->so_sequence;
 	spin_lock_init(&sp->so_sequence.lock);
@@ -384,7 +387,7 @@ static void
 nfs4_drop_state_owner(struct nfs4_state_owner *sp)
 {
 	if (!RB_EMPTY_NODE(&sp->so_client_node)) {
-		struct nfs_client *clp = sp->so_client;
+		struct nfs_client *clp = sp->so_server->nfs_client;
 
 		spin_lock(&clp->cl_lock);
 		rb_erase(&sp->so_client_node, &clp->cl_state_owners);
@@ -406,7 +409,6 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct
 	new = nfs4_alloc_state_owner();
 	if (new == NULL)
 		return NULL;
-	new->so_client = clp;
 	new->so_server = server;
 	new->so_cred = cred;
 	spin_lock(&clp->cl_lock);
@@ -423,7 +425,7 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct
 
 void nfs4_put_state_owner(struct nfs4_state_owner *sp)
 {
-	struct nfs_client *clp = sp->so_client;
+	struct nfs_client *clp = sp->so_server->nfs_client;
 	struct rpc_cred *cred = sp->so_cred;
 
 	if (!atomic_dec_and_lock(&sp->so_count, &clp->cl_lock))
@@ -602,12 +604,21 @@ void nfs4_close_sync(struct path *path, struct nfs4_state *state, fmode_t fmode)
  * that is compatible with current->files
  */
 static struct nfs4_lock_state *
-__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
+__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type)
 {
 	struct nfs4_lock_state *pos;
 	list_for_each_entry(pos, &state->lock_states, ls_locks) {
-		if (pos->ls_owner != fl_owner)
+		if (type != NFS4_ANY_LOCK_TYPE && pos->ls_owner.lo_type != type)
 			continue;
+		switch (pos->ls_owner.lo_type) {
+		case NFS4_POSIX_LOCK_TYPE:
+			if (pos->ls_owner.lo_u.posix_owner != fl_owner)
+				continue;
+			break;
+		case NFS4_FLOCK_LOCK_TYPE:
+			if (pos->ls_owner.lo_u.flock_owner != fl_pid)
+				continue;
+		}
 		atomic_inc(&pos->ls_count);
 		return pos;
 	}
@@ -619,10 +630,10 @@ __nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
  * exists, return an uninitialized one.
  *
  */
-static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
+static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type)
 {
 	struct nfs4_lock_state *lsp;
-	struct nfs_client *clp = state->owner->so_client;
+	struct nfs_client *clp = state->owner->so_server->nfs_client;
 
 	lsp = kzalloc(sizeof(*lsp), GFP_NOFS);
 	if (lsp == NULL)
@@ -633,7 +644,18 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
 	lsp->ls_seqid.sequence = &lsp->ls_sequence;
 	atomic_set(&lsp->ls_count, 1);
 	lsp->ls_state = state;
-	lsp->ls_owner = fl_owner;
+	lsp->ls_owner.lo_type = type;
+	switch (lsp->ls_owner.lo_type) {
+	case NFS4_FLOCK_LOCK_TYPE:
+		lsp->ls_owner.lo_u.flock_owner = fl_pid;
+		break;
+	case NFS4_POSIX_LOCK_TYPE:
+		lsp->ls_owner.lo_u.posix_owner = fl_owner;
+		break;
+	default:
+		kfree(lsp);
+		return NULL;
+	}
 	spin_lock(&clp->cl_lock);
 	nfs_alloc_unique_id(&clp->cl_lockowner_id, &lsp->ls_id, 1, 64);
 	spin_unlock(&clp->cl_lock);
@@ -643,7 +665,7 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
 
 static void nfs4_free_lock_state(struct nfs4_lock_state *lsp)
 {
-	struct nfs_client *clp = lsp->ls_state->owner->so_client;
+	struct nfs_client *clp = lsp->ls_state->owner->so_server->nfs_client;
 
 	spin_lock(&clp->cl_lock);
 	nfs_free_unique_id(&clp->cl_lockowner_id, &lsp->ls_id);
@@ -657,13 +679,13 @@ static void nfs4_free_lock_state(struct nfs4_lock_state *lsp)
  * exists, return an uninitialized one.
  *
  */
-static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner)
+static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner, pid_t pid, unsigned int type)
 {
 	struct nfs4_lock_state *lsp, *new = NULL;
 	
 	for(;;) {
 		spin_lock(&state->state_lock);
-		lsp = __nfs4_find_lock_state(state, owner);
+		lsp = __nfs4_find_lock_state(state, owner, pid, type);
 		if (lsp != NULL)
 			break;
 		if (new != NULL) {
@@ -674,7 +696,7 @@ static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_
 			break;
 		}
 		spin_unlock(&state->state_lock);
-		new = nfs4_alloc_lock_state(state, owner);
+		new = nfs4_alloc_lock_state(state, owner, pid, type);
 		if (new == NULL)
 			return NULL;
 	}
@@ -701,6 +723,8 @@ void nfs4_put_lock_state(struct nfs4_lock_state *lsp)
 	if (list_empty(&state->lock_states))
 		clear_bit(LK_STATE_IN_USE, &state->flags);
 	spin_unlock(&state->state_lock);
+	if (lsp->ls_flags & NFS_LOCK_INITIALIZED)
+		nfs4_release_lockowner(lsp);
 	nfs4_free_lock_state(lsp);
 }
 
@@ -728,7 +752,12 @@ int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl)
 
 	if (fl->fl_ops != NULL)
 		return 0;
-	lsp = nfs4_get_lock_state(state, fl->fl_owner);
+	if (fl->fl_flags & FL_POSIX)
+		lsp = nfs4_get_lock_state(state, fl->fl_owner, 0, NFS4_POSIX_LOCK_TYPE);
+	else if (fl->fl_flags & FL_FLOCK)
+		lsp = nfs4_get_lock_state(state, 0, fl->fl_pid, NFS4_FLOCK_LOCK_TYPE);
+	else
+		return -EINVAL;
 	if (lsp == NULL)
 		return -ENOMEM;
 	fl->fl_u.nfs4_fl.owner = lsp;
@@ -740,7 +769,7 @@ int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl)
  * Byte-range lock aware utility to initialize the stateid of read/write
  * requests.
  */
-void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t fl_owner)
+void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid)
 {
 	struct nfs4_lock_state *lsp;
 	int seq;
@@ -753,7 +782,7 @@ void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t f
 		return;
 
 	spin_lock(&state->state_lock);
-	lsp = __nfs4_find_lock_state(state, fl_owner);
+	lsp = __nfs4_find_lock_state(state, fl_owner, fl_pid, NFS4_ANY_LOCK_TYPE);
 	if (lsp != NULL && (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0)
 		memcpy(dst, &lsp->ls_stateid, sizeof(*dst));
 	spin_unlock(&state->state_lock);
@@ -943,13 +972,13 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_
 	/* Guard against delegation returns and new lock/unlock calls */
 	down_write(&nfsi->rwsem);
 	/* Protect inode->i_flock using the BKL */
-	lock_kernel();
+	lock_flocks();
 	for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
 		if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK)))
 			continue;
 		if (nfs_file_open_context(fl->fl_file)->state != state)
 			continue;
-		unlock_kernel();
+		unlock_flocks();
 		status = ops->recover_lock(state, fl);
 		switch (status) {
 			case 0:
@@ -976,9 +1005,9 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_
 				/* kill_proc(fl->fl_pid, SIGLOST, 1); */
 				status = 0;
 		}
-		lock_kernel();
+		lock_flocks();
 	}
-	unlock_kernel();
+	unlock_flocks();
 out:
 	up_write(&nfsi->rwsem);
 	return status;
@@ -1036,16 +1065,24 @@ restart:
 				/* Mark the file as being 'closed' */
 				state->state = 0;
 				break;
+			case -EKEYEXPIRED:
+				/*
+				 * User RPCSEC_GSS context has expired.
+				 * We cannot recover this stateid now, so
+				 * skip it and allow recovery thread to
+				 * proceed.
+				 */
+				break;
 			case -NFS4ERR_ADMIN_REVOKED:
 			case -NFS4ERR_STALE_STATEID:
 			case -NFS4ERR_BAD_STATEID:
 			case -NFS4ERR_RECLAIM_BAD:
 			case -NFS4ERR_RECLAIM_CONFLICT:
-				nfs4_state_mark_reclaim_nograce(sp->so_client, state);
+				nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state);
 				break;
 			case -NFS4ERR_EXPIRED:
 			case -NFS4ERR_NO_GRACE:
-				nfs4_state_mark_reclaim_nograce(sp->so_client, state);
+				nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state);
 			case -NFS4ERR_STALE_CLIENTID:
 			case -NFS4ERR_BADSESSION:
 			case -NFS4ERR_BADSLOT:
@@ -1111,17 +1148,14 @@ static void nfs4_reclaim_complete(struct nfs_client *clp,
 		(void)ops->reclaim_complete(clp);
 }
 
-static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp)
+static int nfs4_state_clear_reclaim_reboot(struct nfs_client *clp)
 {
 	struct nfs4_state_owner *sp;
 	struct rb_node *pos;
 	struct nfs4_state *state;
 
 	if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state))
-		return;
-
-	nfs4_reclaim_complete(clp,
-		nfs4_reboot_recovery_ops[clp->cl_minorversion]);
+		return 0;
 
 	for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) {
 		sp = rb_entry(pos, struct nfs4_state_owner, so_client_node);
@@ -1135,6 +1169,14 @@ static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp)
 	}
 
 	nfs_delegation_reap_unclaimed(clp);
+	return 1;
+}
+
+static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp)
+{
+	if (!nfs4_state_clear_reclaim_reboot(clp))
+		return;
+	nfs4_reclaim_complete(clp, clp->cl_mvops->reboot_recovery_ops);
 }
 
 static void nfs_delegation_clear_all(struct nfs_client *clp)
@@ -1149,6 +1191,14 @@ static void nfs4_state_start_reclaim_nograce(struct nfs_client *clp)
 	nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce);
 }
 
+static void nfs4_warn_keyexpired(const char *s)
+{
+	printk_ratelimited(KERN_WARNING "Error: state manager"
+			" encountered RPCSEC_GSS session"
+			" expired against NFSv4 server %s.\n",
+			s);
+}
+
 static int nfs4_recovery_handle_error(struct nfs_client *clp, int error)
 {
 	switch (error) {
@@ -1161,7 +1211,7 @@ static int nfs4_recovery_handle_error(struct nfs_client *clp, int error)
 		case -NFS4ERR_STALE_CLIENTID:
 		case -NFS4ERR_LEASE_MOVED:
 			set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
-			nfs4_state_end_reclaim_reboot(clp);
+			nfs4_state_clear_reclaim_reboot(clp);
 			nfs4_state_start_reclaim_reboot(clp);
 			break;
 		case -NFS4ERR_EXPIRED:
@@ -1178,6 +1228,10 @@ static int nfs4_recovery_handle_error(struct nfs_client *clp, int error)
 			set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
 			/* Zero session reset errors */
 			return 0;
+		case -EKEYEXPIRED:
+			/* Nothing we can do */
+			nfs4_warn_keyexpired(clp->cl_hostname);
+			return 0;
 	}
 	return error;
 }
@@ -1211,8 +1265,8 @@ restart:
 static int nfs4_check_lease(struct nfs_client *clp)
 {
 	struct rpc_cred *cred;
-	struct nfs4_state_maintenance_ops *ops =
-		nfs4_state_renewal_ops[clp->cl_minorversion];
+	const struct nfs4_state_maintenance_ops *ops =
+		clp->cl_mvops->state_renewal_ops;
 	int status = -NFS4ERR_EXPIRED;
 
 	/* Is the client already known to have an expired lease? */
@@ -1235,8 +1289,8 @@ out:
 static int nfs4_reclaim_lease(struct nfs_client *clp)
 {
 	struct rpc_cred *cred;
-	struct nfs4_state_recovery_ops *ops =
-		nfs4_reboot_recovery_ops[clp->cl_minorversion];
+	const struct nfs4_state_recovery_ops *ops =
+		clp->cl_mvops->reboot_recovery_ops;
 	int status = -ENOENT;
 
 	cred = ops->get_clid_cred(clp);
@@ -1388,9 +1442,10 @@ static void nfs4_set_lease_expired(struct nfs_client *clp, int status)
 		case -NFS4ERR_DELAY:
 		case -NFS4ERR_CLID_INUSE:
 		case -EAGAIN:
-		case -EKEYEXPIRED:
 			break;
 
+		case -EKEYEXPIRED:
+			nfs4_warn_keyexpired(clp->cl_hostname);
 		case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery
 					 * in nfs4_exchange_id */
 		default:
@@ -1421,6 +1476,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
 			}
 			clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
 			set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state);
+			pnfs_destroy_all_layouts(clp);
 		}
 
 		if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) {
@@ -1444,7 +1500,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
 		/* First recover reboot state... */
 		if (test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) {
 			status = nfs4_do_reclaim(clp,
-				nfs4_reboot_recovery_ops[clp->cl_minorversion]);
+				clp->cl_mvops->reboot_recovery_ops);
 			if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) ||
 			    test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state))
 				continue;
@@ -1458,7 +1514,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
 		/* Now recover expired state... */
 		if (test_and_clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) {
 			status = nfs4_do_reclaim(clp,
-				nfs4_nograce_recovery_ops[clp->cl_minorversion]);
+				clp->cl_mvops->nograce_recovery_ops);
 			if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) ||
 			    test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) ||
 			    test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state))
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 65c8dae4b26..f313c4cce7e 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -52,6 +52,7 @@
 #include <linux/nfs_idmap.h>
 #include "nfs4_fs.h"
 #include "internal.h"
+#include "pnfs.h"
 
 #define NFSDBG_FACILITY		NFSDBG_XDR
 
@@ -202,14 +203,17 @@ static int nfs4_stat_to_errno(int);
 #define encode_link_maxsz	(op_encode_hdr_maxsz + \
 				nfs4_name_maxsz)
 #define decode_link_maxsz	(op_decode_hdr_maxsz + decode_change_info_maxsz)
+#define encode_lockowner_maxsz	(7)
 #define encode_lock_maxsz	(op_encode_hdr_maxsz + \
 				 7 + \
-				 1 + encode_stateid_maxsz + 8)
+				 1 + encode_stateid_maxsz + 1 + \
+				 encode_lockowner_maxsz)
 #define decode_lock_denied_maxsz \
 				(8 + decode_lockowner_maxsz)
 #define decode_lock_maxsz	(op_decode_hdr_maxsz + \
 				 decode_lock_denied_maxsz)
-#define encode_lockt_maxsz	(op_encode_hdr_maxsz + 12)
+#define encode_lockt_maxsz	(op_encode_hdr_maxsz + 5 + \
+				encode_lockowner_maxsz)
 #define decode_lockt_maxsz	(op_decode_hdr_maxsz + \
 				 decode_lock_denied_maxsz)
 #define encode_locku_maxsz	(op_encode_hdr_maxsz + 3 + \
@@ -217,6 +221,11 @@ static int nfs4_stat_to_errno(int);
 				 4)
 #define decode_locku_maxsz	(op_decode_hdr_maxsz + \
 				 decode_stateid_maxsz)
+#define encode_release_lockowner_maxsz \
+				(op_encode_hdr_maxsz + \
+				 encode_lockowner_maxsz)
+#define decode_release_lockowner_maxsz \
+				(op_decode_hdr_maxsz)
 #define encode_access_maxsz	(op_encode_hdr_maxsz + 1)
 #define decode_access_maxsz	(op_decode_hdr_maxsz + 2)
 #define encode_symlink_maxsz	(op_encode_hdr_maxsz + \
@@ -302,6 +311,19 @@ static int nfs4_stat_to_errno(int);
 				XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5)
 #define encode_reclaim_complete_maxsz	(op_encode_hdr_maxsz + 4)
 #define decode_reclaim_complete_maxsz	(op_decode_hdr_maxsz + 4)
+#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + 4 + \
+				XDR_QUADLEN(NFS4_DEVICEID4_SIZE))
+#define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \
+				1 /* layout type */ + \
+				1 /* opaque devaddr4 length */ + \
+				  /* devaddr4 payload is read into page */ \
+				1 /* notification bitmap length */ + \
+				1 /* notification bitmap */)
+#define encode_layoutget_maxsz	(op_encode_hdr_maxsz + 10 + \
+				encode_stateid_maxsz)
+#define decode_layoutget_maxsz	(op_decode_hdr_maxsz + 8 + \
+				decode_stateid_maxsz + \
+				XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE))
 #else /* CONFIG_NFS_V4_1 */
 #define encode_sequence_maxsz	0
 #define decode_sequence_maxsz	0
@@ -471,6 +493,12 @@ static int nfs4_stat_to_errno(int);
 				decode_sequence_maxsz + \
 				decode_putfh_maxsz + \
 				decode_locku_maxsz)
+#define NFS4_enc_release_lockowner_sz \
+				(compound_encode_hdr_maxsz + \
+				 encode_lockowner_maxsz)
+#define NFS4_dec_release_lockowner_sz \
+				(compound_decode_hdr_maxsz + \
+				 decode_lockowner_maxsz)
 #define NFS4_enc_access_sz	(compound_encode_hdr_maxsz + \
 				encode_sequence_maxsz + \
 				encode_putfh_maxsz + \
@@ -685,6 +713,20 @@ static int nfs4_stat_to_errno(int);
 #define NFS4_dec_reclaim_complete_sz	(compound_decode_hdr_maxsz + \
 					 decode_sequence_maxsz + \
 					 decode_reclaim_complete_maxsz)
+#define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz +    \
+				encode_sequence_maxsz +\
+				encode_getdeviceinfo_maxsz)
+#define NFS4_dec_getdeviceinfo_sz (compound_decode_hdr_maxsz +    \
+				decode_sequence_maxsz + \
+				decode_getdeviceinfo_maxsz)
+#define NFS4_enc_layoutget_sz	(compound_encode_hdr_maxsz + \
+				encode_sequence_maxsz + \
+				encode_putfh_maxsz +        \
+				encode_layoutget_maxsz)
+#define NFS4_dec_layoutget_sz	(compound_decode_hdr_maxsz + \
+				decode_sequence_maxsz + \
+				decode_putfh_maxsz +        \
+				decode_layoutget_maxsz)
 
 const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
 				      compound_encode_hdr_maxsz +
@@ -744,7 +786,7 @@ static void encode_compound_hdr(struct xdr_stream *xdr,
 				struct compound_hdr *hdr)
 {
 	__be32 *p;
-	struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
+	struct rpc_auth *auth = req->rq_cred->cr_auth;
 
 	/* initialize running count of expected bytes in reply.
 	 * NOTE: the replied tag SHOULD be the same is the one sent,
@@ -802,7 +844,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
 	if (iap->ia_valid & ATTR_MODE)
 		len += 4;
 	if (iap->ia_valid & ATTR_UID) {
-		owner_namelen = nfs_map_uid_to_name(server->nfs_client, iap->ia_uid, owner_name);
+		owner_namelen = nfs_map_uid_to_name(server->nfs_client, iap->ia_uid, owner_name, IDMAP_NAMESZ);
 		if (owner_namelen < 0) {
 			dprintk("nfs: couldn't resolve uid %d to string\n",
 					iap->ia_uid);
@@ -814,7 +856,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
 		len += 4 + (XDR_QUADLEN(owner_namelen) << 2);
 	}
 	if (iap->ia_valid & ATTR_GID) {
-		owner_grouplen = nfs_map_gid_to_group(server->nfs_client, iap->ia_gid, owner_group);
+		owner_grouplen = nfs_map_gid_to_group(server->nfs_client, iap->ia_gid, owner_group, IDMAP_NAMESZ);
 		if (owner_grouplen < 0) {
 			dprintk("nfs: couldn't resolve gid %d to string\n",
 					iap->ia_gid);
@@ -1042,6 +1084,17 @@ static inline uint64_t nfs4_lock_length(struct file_lock *fl)
 	return fl->fl_end - fl->fl_start + 1;
 }
 
+static void encode_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lowner)
+{
+	__be32 *p;
+
+	p = reserve_space(xdr, 28);
+	p = xdr_encode_hyper(p, lowner->clientid);
+	*p++ = cpu_to_be32(16);
+	p = xdr_encode_opaque_fixed(p, "lock id:", 8);
+	xdr_encode_hyper(p, lowner->id);
+}
+
 /*
  * opcode,type,reclaim,offset,length,new_lock_owner = 32
  * open_seqid,open_stateid,lock_seqid,lock_owner.clientid, lock_owner.id = 40
@@ -1058,14 +1111,11 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args
 	p = xdr_encode_hyper(p, nfs4_lock_length(args->fl));
 	*p = cpu_to_be32(args->new_lock_owner);
 	if (args->new_lock_owner){
-		p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+32);
+		p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4);
 		*p++ = cpu_to_be32(args->open_seqid->sequence->counter);
 		p = xdr_encode_opaque_fixed(p, args->open_stateid->data, NFS4_STATEID_SIZE);
 		*p++ = cpu_to_be32(args->lock_seqid->sequence->counter);
-		p = xdr_encode_hyper(p, args->lock_owner.clientid);
-		*p++ = cpu_to_be32(16);
-		p = xdr_encode_opaque_fixed(p, "lock id:", 8);
-		xdr_encode_hyper(p, args->lock_owner.id);
+		encode_lockowner(xdr, &args->lock_owner);
 	}
 	else {
 		p = reserve_space(xdr, NFS4_STATEID_SIZE+4);
@@ -1080,15 +1130,12 @@ static void encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *ar
 {
 	__be32 *p;
 
-	p = reserve_space(xdr, 52);
+	p = reserve_space(xdr, 24);
 	*p++ = cpu_to_be32(OP_LOCKT);
 	*p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0));
 	p = xdr_encode_hyper(p, args->fl->fl_start);
 	p = xdr_encode_hyper(p, nfs4_lock_length(args->fl));
-	p = xdr_encode_hyper(p, args->lock_owner.clientid);
-	*p++ = cpu_to_be32(16);
-	p = xdr_encode_opaque_fixed(p, "lock id:", 8);
-	xdr_encode_hyper(p, args->lock_owner.id);
+	encode_lockowner(xdr, &args->lock_owner);
 	hdr->nops++;
 	hdr->replen += decode_lockt_maxsz;
 }
@@ -1108,6 +1155,17 @@ static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *ar
 	hdr->replen += decode_locku_maxsz;
 }
 
+static void encode_release_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lowner, struct compound_hdr *hdr)
+{
+	__be32 *p;
+
+	p = reserve_space(xdr, 4);
+	*p = cpu_to_be32(OP_RELEASE_LOCKOWNER);
+	encode_lockowner(xdr, lowner);
+	hdr->nops++;
+	hdr->replen += decode_release_lockowner_maxsz;
+}
+
 static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr)
 {
 	int len = name->len;
@@ -1172,7 +1230,7 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op
 		break;
 	default:
 		clp = arg->server->nfs_client;
-		if (clp->cl_minorversion > 0) {
+		if (clp->cl_mvops->minor_version > 0) {
 			if (nfs4_has_persistent_session(clp)) {
 				*p = cpu_to_be32(NFS4_CREATE_GUARDED);
 				encode_attrs(xdr, arg->u.attrs, arg->server);
@@ -1324,14 +1382,14 @@ static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
 	hdr->replen += decode_putrootfh_maxsz;
 }
 
-static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx)
+static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx, const struct nfs_lock_context *l_ctx)
 {
 	nfs4_stateid stateid;
 	__be32 *p;
 
 	p = reserve_space(xdr, NFS4_STATEID_SIZE);
 	if (ctx->state != NULL) {
-		nfs4_copy_stateid(&stateid, ctx->state, ctx->lockowner);
+		nfs4_copy_stateid(&stateid, ctx->state, l_ctx->lockowner, l_ctx->pid);
 		xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE);
 	} else
 		xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE);
@@ -1344,7 +1402,7 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args,
 	p = reserve_space(xdr, 4);
 	*p = cpu_to_be32(OP_READ);
 
-	encode_stateid(xdr, args->context);
+	encode_stateid(xdr, args->context, args->lock_context);
 
 	p = reserve_space(xdr, 12);
 	p = xdr_encode_hyper(p, args->offset);
@@ -1355,24 +1413,35 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args,
 
 static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req, struct compound_hdr *hdr)
 {
-	uint32_t attrs[2] = {
-		FATTR4_WORD0_RDATTR_ERROR|FATTR4_WORD0_FILEID,
-		FATTR4_WORD1_MOUNTED_ON_FILEID,
-	};
+	uint32_t attrs[2] = {0, 0};
+	uint32_t dircount = readdir->count >> 1;
 	__be32 *p;
 
+	if (readdir->plus) {
+		attrs[0] |= FATTR4_WORD0_TYPE|FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE|
+			FATTR4_WORD0_FSID|FATTR4_WORD0_FILEHANDLE;
+		attrs[1] |= FATTR4_WORD1_MODE|FATTR4_WORD1_NUMLINKS|FATTR4_WORD1_OWNER|
+			FATTR4_WORD1_OWNER_GROUP|FATTR4_WORD1_RAWDEV|
+			FATTR4_WORD1_SPACE_USED|FATTR4_WORD1_TIME_ACCESS|
+			FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
+		dircount >>= 1;
+	}
+	attrs[0] |= FATTR4_WORD0_RDATTR_ERROR|FATTR4_WORD0_FILEID;
+	attrs[1] |= FATTR4_WORD1_MOUNTED_ON_FILEID;
+	/* Switch to mounted_on_fileid if the server supports it */
+	if (readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)
+		attrs[0] &= ~FATTR4_WORD0_FILEID;
+	else
+		attrs[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
+
 	p = reserve_space(xdr, 12+NFS4_VERIFIER_SIZE+20);
 	*p++ = cpu_to_be32(OP_READDIR);
 	p = xdr_encode_hyper(p, readdir->cookie);
 	p = xdr_encode_opaque_fixed(p, readdir->verifier.data, NFS4_VERIFIER_SIZE);
-	*p++ = cpu_to_be32(readdir->count >> 1);  /* We're not doing readdirplus */
+	*p++ = cpu_to_be32(dircount);
 	*p++ = cpu_to_be32(readdir->count);
 	*p++ = cpu_to_be32(2);
-	/* Switch to mounted_on_fileid if the server supports it */
-	if (readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)
-		attrs[0] &= ~FATTR4_WORD0_FILEID;
-	else
-		attrs[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
+
 	*p++ = cpu_to_be32(attrs[0] & readdir->bitmask[0]);
 	*p = cpu_to_be32(attrs[1] & readdir->bitmask[1]);
 	hdr->nops++;
@@ -1523,7 +1592,7 @@ static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *arg
 	p = reserve_space(xdr, 4);
 	*p = cpu_to_be32(OP_WRITE);
 
-	encode_stateid(xdr, args->context);
+	encode_stateid(xdr, args->context, args->lock_context);
 
 	p = reserve_space(xdr, 16);
 	p = xdr_encode_hyper(p, args->offset);
@@ -1696,6 +1765,58 @@ static void encode_sequence(struct xdr_stream *xdr,
 #endif /* CONFIG_NFS_V4_1 */
 }
 
+#ifdef CONFIG_NFS_V4_1
+static void
+encode_getdeviceinfo(struct xdr_stream *xdr,
+		     const struct nfs4_getdeviceinfo_args *args,
+		     struct compound_hdr *hdr)
+{
+	__be32 *p;
+
+	p = reserve_space(xdr, 16 + NFS4_DEVICEID4_SIZE);
+	*p++ = cpu_to_be32(OP_GETDEVICEINFO);
+	p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data,
+				    NFS4_DEVICEID4_SIZE);
+	*p++ = cpu_to_be32(args->pdev->layout_type);
+	*p++ = cpu_to_be32(args->pdev->pglen);		/* gdia_maxcount */
+	*p++ = cpu_to_be32(0);				/* bitmap length 0 */
+	hdr->nops++;
+	hdr->replen += decode_getdeviceinfo_maxsz;
+}
+
+static void
+encode_layoutget(struct xdr_stream *xdr,
+		      const struct nfs4_layoutget_args *args,
+		      struct compound_hdr *hdr)
+{
+	nfs4_stateid stateid;
+	__be32 *p;
+
+	p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE);
+	*p++ = cpu_to_be32(OP_LAYOUTGET);
+	*p++ = cpu_to_be32(0);     /* Signal layout available */
+	*p++ = cpu_to_be32(args->type);
+	*p++ = cpu_to_be32(args->range.iomode);
+	p = xdr_encode_hyper(p, args->range.offset);
+	p = xdr_encode_hyper(p, args->range.length);
+	p = xdr_encode_hyper(p, args->minlength);
+	pnfs_get_layout_stateid(&stateid, NFS_I(args->inode)->layout,
+				args->ctx->state);
+	p = xdr_encode_opaque_fixed(p, &stateid.data, NFS4_STATEID_SIZE);
+	*p = cpu_to_be32(args->maxcount);
+
+	dprintk("%s: 1st type:0x%x iomode:%d off:%lu len:%lu mc:%d\n",
+		__func__,
+		args->type,
+		args->range.iomode,
+		(unsigned long)args->range.offset,
+		(unsigned long)args->range.length,
+		args->maxcount);
+	hdr->nops++;
+	hdr->replen += decode_layoutget_maxsz;
+}
+#endif /* CONFIG_NFS_V4_1 */
+
 /*
  * END OF "GENERIC" ENCODE ROUTINES.
  */
@@ -1704,7 +1825,7 @@ static u32 nfs4_xdr_minorversion(const struct nfs4_sequence_args *args)
 {
 #if defined(CONFIG_NFS_V4_1)
 	if (args->sa_session)
-		return args->sa_session->clp->cl_minorversion;
+		return args->sa_session->clp->cl_mvops->minor_version;
 #endif /* CONFIG_NFS_V4_1 */
 	return 0;
 }
@@ -1793,7 +1914,7 @@ static int nfs4_xdr_enc_remove(struct rpc_rqst *req, __be32 *p, const struct nfs
 /*
  * Encode RENAME request
  */
-static int nfs4_xdr_enc_rename(struct rpc_rqst *req, __be32 *p, const struct nfs4_rename_arg *args)
+static int nfs4_xdr_enc_rename(struct rpc_rqst *req, __be32 *p, const struct nfs_renameargs *args)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
@@ -2048,6 +2169,20 @@ static int nfs4_xdr_enc_locku(struct rpc_rqst *req, __be32 *p, struct nfs_locku_
 	return 0;
 }
 
+static int nfs4_xdr_enc_release_lockowner(struct rpc_rqst *req, __be32 *p, struct nfs_release_lockowner_args *args)
+{
+	struct xdr_stream xdr;
+	struct compound_hdr hdr = {
+		.minorversion = 0,
+	};
+
+	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+	encode_compound_hdr(&xdr, req, &hdr);
+	encode_release_lockowner(&xdr, &args->lock_owner, &hdr);
+	encode_nops(&hdr);
+	return 0;
+}
+
 /*
  * Encode a READLINK request
  */
@@ -2395,7 +2530,7 @@ static int nfs4_xdr_enc_exchange_id(struct rpc_rqst *req, uint32_t *p,
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
-		.minorversion = args->client->cl_minorversion,
+		.minorversion = args->client->cl_mvops->minor_version,
 	};
 
 	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
@@ -2413,7 +2548,7 @@ static int nfs4_xdr_enc_create_session(struct rpc_rqst *req, uint32_t *p,
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
-		.minorversion = args->client->cl_minorversion,
+		.minorversion = args->client->cl_mvops->minor_version,
 	};
 
 	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
@@ -2431,7 +2566,7 @@ static int nfs4_xdr_enc_destroy_session(struct rpc_rqst *req, uint32_t *p,
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
-		.minorversion = session->clp->cl_minorversion,
+		.minorversion = session->clp->cl_mvops->minor_version,
 	};
 
 	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
@@ -2499,6 +2634,51 @@ static int nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req, uint32_t *p,
 	return 0;
 }
 
+/*
+ * Encode GETDEVICEINFO request
+ */
+static int nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, uint32_t *p,
+				      struct nfs4_getdeviceinfo_args *args)
+{
+	struct xdr_stream xdr;
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
+	};
+
+	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+	encode_compound_hdr(&xdr, req, &hdr);
+	encode_sequence(&xdr, &args->seq_args, &hdr);
+	encode_getdeviceinfo(&xdr, args, &hdr);
+
+	/* set up reply kvec. Subtract notification bitmap max size (2)
+	 * so that notification bitmap is put in xdr_buf tail */
+	xdr_inline_pages(&req->rq_rcv_buf, (hdr.replen - 2) << 2,
+			 args->pdev->pages, args->pdev->pgbase,
+			 args->pdev->pglen);
+
+	encode_nops(&hdr);
+	return 0;
+}
+
+/*
+ *  Encode LAYOUTGET request
+ */
+static int nfs4_xdr_enc_layoutget(struct rpc_rqst *req, uint32_t *p,
+				  struct nfs4_layoutget_args *args)
+{
+	struct xdr_stream xdr;
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
+	};
+
+	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+	encode_compound_hdr(&xdr, req, &hdr);
+	encode_sequence(&xdr, &args->seq_args, &hdr);
+	encode_putfh(&xdr, NFS_FH(args->inode), &hdr);
+	encode_layoutget(&xdr, args, &hdr);
+	encode_nops(&hdr);
+	return 0;
+}
 #endif /* CONFIG_NFS_V4_1 */
 
 static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
@@ -2632,7 +2812,10 @@ out_overflow:
 static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *bitmask)
 {
 	if (likely(bitmap[0] & FATTR4_WORD0_SUPPORTED_ATTRS)) {
-		decode_attr_bitmap(xdr, bitmask);
+		int ret;
+		ret = decode_attr_bitmap(xdr, bitmask);
+		if (unlikely(ret < 0))
+			return ret;
 		bitmap[0] &= ~FATTR4_WORD0_SUPPORTED_ATTRS;
 	} else
 		bitmask[0] = bitmask[1] = 0;
@@ -2804,6 +2987,56 @@ out_overflow:
 	return -EIO;
 }
 
+static int decode_attr_error(struct xdr_stream *xdr, uint32_t *bitmap)
+{
+	__be32 *p;
+
+	if (unlikely(bitmap[0] & (FATTR4_WORD0_RDATTR_ERROR - 1U)))
+		return -EIO;
+	if (likely(bitmap[0] & FATTR4_WORD0_RDATTR_ERROR)) {
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
+		bitmap[0] &= ~FATTR4_WORD0_RDATTR_ERROR;
+	}
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_attr_filehandle(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fh *fh)
+{
+	__be32 *p;
+	int len;
+
+	if (fh != NULL)
+		memset(fh, 0, sizeof(*fh));
+
+	if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEHANDLE - 1U)))
+		return -EIO;
+	if (likely(bitmap[0] & FATTR4_WORD0_FILEHANDLE)) {
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
+		len = be32_to_cpup(p);
+		if (len > NFS4_FHSIZE)
+			return -EIO;
+		p = xdr_inline_decode(xdr, len);
+		if (unlikely(!p))
+			goto out_overflow;
+		if (fh != NULL) {
+			memcpy(fh->data, p, len);
+			fh->size = len;
+		}
+		bitmap[0] &= ~FATTR4_WORD0_FILEHANDLE;
+	}
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
 static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
 {
 	__be32 *p;
@@ -3477,6 +3710,24 @@ static int decode_attr_time_metadata(struct xdr_stream *xdr, uint32_t *bitmap, s
 	return status;
 }
 
+static int decode_attr_time_delta(struct xdr_stream *xdr, uint32_t *bitmap,
+				  struct timespec *time)
+{
+	int status = 0;
+
+	time->tv_sec = 0;
+	time->tv_nsec = 0;
+	if (unlikely(bitmap[1] & (FATTR4_WORD1_TIME_DELTA - 1U)))
+		return -EIO;
+	if (likely(bitmap[1] & FATTR4_WORD1_TIME_DELTA)) {
+		status = decode_attr_time(xdr, time);
+		bitmap[1] &= ~FATTR4_WORD1_TIME_DELTA;
+	}
+	dprintk("%s: time_delta=%ld %ld\n", __func__, (long)time->tv_sec,
+		(long)time->tv_nsec);
+	return status;
+}
+
 static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time)
 {
 	int status = 0;
@@ -3700,29 +3951,14 @@ xdr_error:
 	return status;
 }
 
-static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
+static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
+		struct nfs_fattr *fattr, struct nfs_fh *fh,
 		const struct nfs_server *server, int may_sleep)
 {
-	__be32 *savep;
-	uint32_t attrlen,
-		 bitmap[2] = {0},
-		 type;
 	int status;
 	umode_t fmode = 0;
 	uint64_t fileid;
-
-	status = decode_op_hdr(xdr, OP_GETATTR);
-	if (status < 0)
-		goto xdr_error;
-
-	status = decode_attr_bitmap(xdr, bitmap);
-	if (status < 0)
-		goto xdr_error;
-
-	status = decode_attr_length(xdr, &attrlen, &savep);
-	if (status < 0)
-		goto xdr_error;
-
+	uint32_t type;
 
 	status = decode_attr_type(xdr, bitmap, &type);
 	if (status < 0)
@@ -3748,6 +3984,14 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
 		goto xdr_error;
 	fattr->valid |= status;
 
+	status = decode_attr_error(xdr, bitmap);
+	if (status < 0)
+		goto xdr_error;
+
+	status = decode_attr_filehandle(xdr, bitmap, fh);
+	if (status < 0)
+		goto xdr_error;
+
 	status = decode_attr_fileid(xdr, bitmap, &fattr->fileid);
 	if (status < 0)
 		goto xdr_error;
@@ -3818,12 +4062,101 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
 		fattr->valid |= status;
 	}
 
+xdr_error:
+	dprintk("%s: xdr returned %d\n", __func__, -status);
+	return status;
+}
+
+static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fattr,
+		struct nfs_fh *fh, const struct nfs_server *server, int may_sleep)
+{
+	__be32 *savep;
+	uint32_t attrlen,
+		 bitmap[2] = {0};
+	int status;
+
+	status = decode_op_hdr(xdr, OP_GETATTR);
+	if (status < 0)
+		goto xdr_error;
+
+	status = decode_attr_bitmap(xdr, bitmap);
+	if (status < 0)
+		goto xdr_error;
+
+	status = decode_attr_length(xdr, &attrlen, &savep);
+	if (status < 0)
+		goto xdr_error;
+
+	status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, server, may_sleep);
+	if (status < 0)
+		goto xdr_error;
+
 	status = verify_attr_len(xdr, savep, attrlen);
 xdr_error:
 	dprintk("%s: xdr returned %d\n", __func__, -status);
 	return status;
 }
 
+static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
+		const struct nfs_server *server, int may_sleep)
+{
+	return decode_getfattr_generic(xdr, fattr, NULL, server, may_sleep);
+}
+
+/*
+ * Decode potentially multiple layout types. Currently we only support
+ * one layout driver per file system.
+ */
+static int decode_first_pnfs_layout_type(struct xdr_stream *xdr,
+					 uint32_t *layouttype)
+{
+	uint32_t *p;
+	int num;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
+	num = be32_to_cpup(p);
+
+	/* pNFS is not supported by the underlying file system */
+	if (num == 0) {
+		*layouttype = 0;
+		return 0;
+	}
+	if (num > 1)
+		printk(KERN_INFO "%s: Warning: Multiple pNFS layout drivers "
+			"per filesystem not supported\n", __func__);
+
+	/* Decode and set first layout type, move xdr->p past unused types */
+	p = xdr_inline_decode(xdr, num * 4);
+	if (unlikely(!p))
+		goto out_overflow;
+	*layouttype = be32_to_cpup(p);
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+/*
+ * The type of file system exported.
+ * Note we must ensure that layouttype is set in any non-error case.
+ */
+static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap,
+				uint32_t *layouttype)
+{
+	int status = 0;
+
+	dprintk("%s: bitmap is %x\n", __func__, bitmap[1]);
+	if (unlikely(bitmap[1] & (FATTR4_WORD1_FS_LAYOUT_TYPES - 1U)))
+		return -EIO;
+	if (bitmap[1] & FATTR4_WORD1_FS_LAYOUT_TYPES) {
+		status = decode_first_pnfs_layout_type(xdr, layouttype);
+		bitmap[1] &= ~FATTR4_WORD1_FS_LAYOUT_TYPES;
+	} else
+		*layouttype = 0;
+	return status;
+}
 
 static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
 {
@@ -3850,6 +4183,12 @@ static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
 	if ((status = decode_attr_maxwrite(xdr, bitmap, &fsinfo->wtmax)) != 0)
 		goto xdr_error;
 	fsinfo->wtpref = fsinfo->wtmax;
+	status = decode_attr_time_delta(xdr, bitmap, &fsinfo->time_delta);
+	if (status != 0)
+		goto xdr_error;
+	status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype);
+	if (status != 0)
+		goto xdr_error;
 
 	status = verify_attr_len(xdr, savep, attrlen);
 xdr_error:
@@ -3906,13 +4245,13 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
 	__be32 *p;
 	uint32_t namelen, type;
 
-	p = xdr_inline_decode(xdr, 32);
+	p = xdr_inline_decode(xdr, 32); /* read 32 bytes */
 	if (unlikely(!p))
 		goto out_overflow;
-	p = xdr_decode_hyper(p, &offset);
+	p = xdr_decode_hyper(p, &offset); /* read 2 8-byte long words */
 	p = xdr_decode_hyper(p, &length);
-	type = be32_to_cpup(p++);
-	if (fl != NULL) {
+	type = be32_to_cpup(p++); /* 4 byte read */
+	if (fl != NULL) { /* manipulate file lock */
 		fl->fl_start = (loff_t)offset;
 		fl->fl_end = fl->fl_start + (loff_t)length - 1;
 		if (length == ~(uint64_t)0)
@@ -3922,9 +4261,9 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
 			fl->fl_type = F_RDLCK;
 		fl->fl_pid = 0;
 	}
-	p = xdr_decode_hyper(p, &clientid);
-	namelen = be32_to_cpup(p);
-	p = xdr_inline_decode(xdr, namelen);
+	p = xdr_decode_hyper(p, &clientid); /* read 8 bytes */
+	namelen = be32_to_cpup(p); /* read 4 bytes */  /* have read all 32 bytes now */
+	p = xdr_inline_decode(xdr, namelen); /* variable size field */
 	if (likely(p))
 		return -NFS4ERR_DENIED;
 out_overflow:
@@ -3973,6 +4312,11 @@ static int decode_locku(struct xdr_stream *xdr, struct nfs_locku_res *res)
 	return status;
 }
 
+static int decode_release_lockowner(struct xdr_stream *xdr)
+{
+	return decode_op_hdr(xdr, OP_RELEASE_LOCKOWNER);
+}
+
 static int decode_lookup(struct xdr_stream *xdr)
 {
 	return decode_op_hdr(xdr, OP_LOOKUP);
@@ -4151,12 +4495,9 @@ out_overflow:
 static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs4_readdir_res *readdir)
 {
 	struct xdr_buf	*rcvbuf = &req->rq_rcv_buf;
-	struct page	*page = *rcvbuf->pages;
 	struct kvec	*iov = rcvbuf->head;
 	size_t		hdrlen;
 	u32		recvd, pglen = rcvbuf->page_len;
-	__be32		*end, *entry, *p, *kaddr;
-	unsigned int	nr = 0;
 	int		status;
 
 	status = decode_op_hdr(xdr, OP_READDIR);
@@ -4176,71 +4517,8 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n
 		pglen = recvd;
 	xdr_read_pages(xdr, pglen);
 
-	BUG_ON(pglen + readdir->pgbase > PAGE_CACHE_SIZE);
-	kaddr = p = kmap_atomic(page, KM_USER0);
-	end = p + ((pglen + readdir->pgbase) >> 2);
-	entry = p;
-
-	/* Make sure the packet actually has a value_follows and EOF entry */
-	if ((entry + 1) > end)
-		goto short_pkt;
-
-	for (; *p++; nr++) {
-		u32 len, attrlen, xlen;
-		if (end - p < 3)
-			goto short_pkt;
-		dprintk("cookie = %Lu, ", *((unsigned long long *)p));
-		p += 2;			/* cookie */
-		len = ntohl(*p++);	/* filename length */
-		if (len > NFS4_MAXNAMLEN) {
-			dprintk("NFS: giant filename in readdir (len 0x%x)\n",
-					len);
-			goto err_unmap;
-		}
-		xlen = XDR_QUADLEN(len);
-		if (end - p < xlen + 1)
-			goto short_pkt;
-		dprintk("filename = %*s\n", len, (char *)p);
-		p += xlen;
-		len = ntohl(*p++);	/* bitmap length */
-		if (end - p < len + 1)
-			goto short_pkt;
-		p += len;
-		attrlen = XDR_QUADLEN(ntohl(*p++));
-		if (end - p < attrlen + 2)
-			goto short_pkt;
-		p += attrlen;		/* attributes */
-		entry = p;
-	}
-	/*
-	 * Apparently some server sends responses that are a valid size, but
-	 * contain no entries, and have value_follows==0 and EOF==0. For
-	 * those, just set the EOF marker.
-	 */
-	if (!nr && entry[1] == 0) {
-		dprintk("NFS: readdir reply truncated!\n");
-		entry[1] = 1;
-	}
-out:
-	kunmap_atomic(kaddr, KM_USER0);
+
 	return 0;
-short_pkt:
-	/*
-	 * When we get a short packet there are 2 possibilities. We can
-	 * return an error, or fix up the response to look like a valid
-	 * response and return what we have so far. If there are no
-	 * entries and the packet was short, then return -EIO. If there
-	 * are valid entries in the response, return them and pretend that
-	 * the call was successful, but incomplete. The caller can retry the
-	 * readdir starting at the last cookie.
-	 */
-	dprintk("%s: short packet at entry %d\n", __func__, nr);
-	entry[0] = entry[1] = 0;
-	if (nr)
-		goto out;
-err_unmap:
-	kunmap_atomic(kaddr, KM_USER0);
-	return -errno_NFSERR_IO;
 }
 
 static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
@@ -4250,7 +4528,6 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
 	size_t hdrlen;
 	u32 len, recvd;
 	__be32 *p;
-	char *kaddr;
 	int status;
 
 	status = decode_op_hdr(xdr, OP_READLINK);
@@ -4281,9 +4558,7 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
 	 * and and null-terminate the text (the VFS expects
 	 * null-termination).
 	 */
-	kaddr = (char *)kmap_atomic(rcvbuf->pages[0], KM_USER0);
-	kaddr[len+rcvbuf->page_base] = '\0';
-	kunmap_atomic(kaddr, KM_USER0);
+	xdr_terminate_string(rcvbuf, len);
 	return 0;
 out_overflow:
 	print_overflow_msg(__func__, xdr);
@@ -4619,7 +4894,6 @@ static int decode_sequence(struct xdr_stream *xdr,
 			   struct rpc_rqst *rqstp)
 {
 #if defined(CONFIG_NFS_V4_1)
-	struct nfs4_slot *slot;
 	struct nfs4_sessionid id;
 	u32 dummy;
 	int status;
@@ -4651,15 +4925,14 @@ static int decode_sequence(struct xdr_stream *xdr,
 		goto out_overflow;
 
 	/* seqid */
-	slot = &res->sr_session->fc_slot_table.slots[res->sr_slotid];
 	dummy = be32_to_cpup(p++);
-	if (dummy != slot->seq_nr) {
+	if (dummy != res->sr_slot->seq_nr) {
 		dprintk("%s Invalid sequence number\n", __func__);
 		goto out_err;
 	}
 	/* slot id */
 	dummy = be32_to_cpup(p++);
-	if (dummy != res->sr_slotid) {
+	if (dummy != res->sr_slot - res->sr_session->fc_slot_table.slots) {
 		dprintk("%s Invalid slot id\n", __func__);
 		goto out_err;
 	}
@@ -4682,6 +4955,134 @@ out_overflow:
 #endif /* CONFIG_NFS_V4_1 */
 }
 
+#if defined(CONFIG_NFS_V4_1)
+
+static int decode_getdeviceinfo(struct xdr_stream *xdr,
+				struct pnfs_device *pdev)
+{
+	__be32 *p;
+	uint32_t len, type;
+	int status;
+
+	status = decode_op_hdr(xdr, OP_GETDEVICEINFO);
+	if (status) {
+		if (status == -ETOOSMALL) {
+			p = xdr_inline_decode(xdr, 4);
+			if (unlikely(!p))
+				goto out_overflow;
+			pdev->mincount = be32_to_cpup(p);
+			dprintk("%s: Min count too small. mincnt = %u\n",
+				__func__, pdev->mincount);
+		}
+		return status;
+	}
+
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(!p))
+		goto out_overflow;
+	type = be32_to_cpup(p++);
+	if (type != pdev->layout_type) {
+		dprintk("%s: layout mismatch req: %u pdev: %u\n",
+			__func__, pdev->layout_type, type);
+		return -EINVAL;
+	}
+	/*
+	 * Get the length of the opaque device_addr4. xdr_read_pages places
+	 * the opaque device_addr4 in the xdr_buf->pages (pnfs_device->pages)
+	 * and places the remaining xdr data in xdr_buf->tail
+	 */
+	pdev->mincount = be32_to_cpup(p);
+	xdr_read_pages(xdr, pdev->mincount); /* include space for the length */
+
+	/* Parse notification bitmap, verifying that it is zero. */
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
+	len = be32_to_cpup(p);
+	if (len) {
+		int i;
+
+		p = xdr_inline_decode(xdr, 4 * len);
+		if (unlikely(!p))
+			goto out_overflow;
+		for (i = 0; i < len; i++, p++) {
+			if (be32_to_cpup(p)) {
+				dprintk("%s: notifications not supported\n",
+					__func__);
+				return -EIO;
+			}
+		}
+	}
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
+			    struct nfs4_layoutget_res *res)
+{
+	__be32 *p;
+	int status;
+	u32 layout_count;
+
+	status = decode_op_hdr(xdr, OP_LAYOUTGET);
+	if (status)
+		return status;
+	p = xdr_inline_decode(xdr, 8 + NFS4_STATEID_SIZE);
+	if (unlikely(!p))
+		goto out_overflow;
+	res->return_on_close = be32_to_cpup(p++);
+	p = xdr_decode_opaque_fixed(p, res->stateid.data, NFS4_STATEID_SIZE);
+	layout_count = be32_to_cpup(p);
+	if (!layout_count) {
+		dprintk("%s: server responded with empty layout array\n",
+			__func__);
+		return -EINVAL;
+	}
+
+	p = xdr_inline_decode(xdr, 24);
+	if (unlikely(!p))
+		goto out_overflow;
+	p = xdr_decode_hyper(p, &res->range.offset);
+	p = xdr_decode_hyper(p, &res->range.length);
+	res->range.iomode = be32_to_cpup(p++);
+	res->type = be32_to_cpup(p++);
+
+	status = decode_opaque_inline(xdr, &res->layout.len, (char **)&p);
+	if (unlikely(status))
+		return status;
+
+	dprintk("%s roff:%lu rlen:%lu riomode:%d, lo_type:0x%x, lo.len:%d\n",
+		__func__,
+		(unsigned long)res->range.offset,
+		(unsigned long)res->range.length,
+		res->range.iomode,
+		res->type,
+		res->layout.len);
+
+	/* nfs4_proc_layoutget allocated a single page */
+	if (res->layout.len > PAGE_SIZE)
+		return -ENOMEM;
+	memcpy(res->layout.buf, p, res->layout.len);
+
+	if (layout_count > 1) {
+		/* We only handle a length one array at the moment.  Any
+		 * further entries are just ignored.  Note that this means
+		 * the client may see a response that is less than the
+		 * minimum it requested.
+		 */
+		dprintk("%s: server responded with %d layouts, dropping tail\n",
+			__func__, layout_count);
+	}
+
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+#endif /* CONFIG_NFS_V4_1 */
+
 /*
  * END OF "GENERIC" DECODE ROUTINES.
  */
@@ -4824,7 +5225,7 @@ out:
 /*
  * Decode RENAME response
  */
-static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_rename_res *res)
+static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, __be32 *p, struct nfs_renameres *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -5259,6 +5660,19 @@ out:
 	return status;
 }
 
+static int nfs4_xdr_dec_release_lockowner(struct rpc_rqst *rqstp, __be32 *p, void *dummy)
+{
+	struct xdr_stream xdr;
+	struct compound_hdr hdr;
+	int status;
+
+	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+	status = decode_compound_hdr(&xdr, &hdr);
+	if (!status)
+		status = decode_release_lockowner(&xdr);
+	return status;
+}
+
 /*
  * Decode READLINK response
  */
@@ -5696,25 +6110,84 @@ static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp, uint32_t *p,
 		status = decode_reclaim_complete(&xdr, (void *)NULL);
 	return status;
 }
+
+/*
+ * Decode GETDEVINFO response
+ */
+static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp, uint32_t *p,
+				      struct nfs4_getdeviceinfo_res *res)
+{
+	struct xdr_stream xdr;
+	struct compound_hdr hdr;
+	int status;
+
+	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+	status = decode_compound_hdr(&xdr, &hdr);
+	if (status != 0)
+		goto out;
+	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	if (status != 0)
+		goto out;
+	status = decode_getdeviceinfo(&xdr, res->pdev);
+out:
+	return status;
+}
+
+/*
+ * Decode LAYOUTGET response
+ */
+static int nfs4_xdr_dec_layoutget(struct rpc_rqst *rqstp, uint32_t *p,
+				  struct nfs4_layoutget_res *res)
+{
+	struct xdr_stream xdr;
+	struct compound_hdr hdr;
+	int status;
+
+	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+	status = decode_compound_hdr(&xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
+	status = decode_putfh(&xdr);
+	if (status)
+		goto out;
+	status = decode_layoutget(&xdr, rqstp, res);
+out:
+	return status;
+}
 #endif /* CONFIG_NFS_V4_1 */
 
-__be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus)
+__be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
+			   struct nfs_server *server, int plus)
 {
 	uint32_t bitmap[2] = {0};
 	uint32_t len;
-
-	if (!*p++) {
-		if (!*p)
+	__be32 *p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
+	if (!ntohl(*p++)) {
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
+		if (!ntohl(*p++))
 			return ERR_PTR(-EAGAIN);
 		entry->eof = 1;
 		return ERR_PTR(-EBADCOOKIE);
 	}
 
+	p = xdr_inline_decode(xdr, 12);
+	if (unlikely(!p))
+		goto out_overflow;
 	entry->prev_cookie = entry->cookie;
 	p = xdr_decode_hyper(p, &entry->cookie);
 	entry->len = ntohl(*p++);
+
+	p = xdr_inline_decode(xdr, entry->len);
+	if (unlikely(!p))
+		goto out_overflow;
 	entry->name = (const char *) p;
-	p += XDR_QUADLEN(entry->len);
 
 	/*
 	 * In case the server doesn't return an inode number,
@@ -5722,32 +6195,33 @@ __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus)
 	 * since glibc seems to choke on it...)
 	 */
 	entry->ino = 1;
+	entry->fattr->valid = 0;
 
-	len = ntohl(*p++);		/* bitmap length */
-	if (len-- > 0) {
-		bitmap[0] = ntohl(*p++);
-		if (len-- > 0) {
-			bitmap[1] = ntohl(*p++);
-			p += len;
-		}
-	}
-	len = XDR_QUADLEN(ntohl(*p++));	/* attribute buffer length */
-	if (len > 0) {
-		if (bitmap[0] & FATTR4_WORD0_RDATTR_ERROR) {
-			bitmap[0] &= ~FATTR4_WORD0_RDATTR_ERROR;
-			/* Ignore the return value of rdattr_error for now */
-			p++;
-			len--;
-		}
-		if (bitmap[0] == 0 && bitmap[1] == FATTR4_WORD1_MOUNTED_ON_FILEID)
-			xdr_decode_hyper(p, &entry->ino);
-		else if (bitmap[0] == FATTR4_WORD0_FILEID)
-			xdr_decode_hyper(p, &entry->ino);
-		p += len;
-	}
+	if (decode_attr_bitmap(xdr, bitmap) < 0)
+		goto out_overflow;
+
+	if (decode_attr_length(xdr, &len, &p) < 0)
+		goto out_overflow;
+
+	if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh, server, 1) < 0)
+		goto out_overflow;
+	if (entry->fattr->valid & NFS_ATTR_FATTR_FILEID)
+		entry->ino = entry->fattr->fileid;
+
+	if (verify_attr_len(xdr, p, len) < 0)
+		goto out_overflow;
+
+	p = xdr_inline_peek(xdr, 8);
+	if (p != NULL)
+		entry->eof = !p[0] && p[1];
+	else
+		entry->eof = 0;
 
-	entry->eof = !p[0] && p[1];
 	return p;
+
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return ERR_PTR(-EIO);
 }
 
 /*
@@ -5866,6 +6340,7 @@ struct rpc_procinfo	nfs4_procedures[] = {
   PROC(GETACL,		enc_getacl,	dec_getacl),
   PROC(SETACL,		enc_setacl,	dec_setacl),
   PROC(FS_LOCATIONS,	enc_fs_locations, dec_fs_locations),
+  PROC(RELEASE_LOCKOWNER, enc_release_lockowner, dec_release_lockowner),
 #if defined(CONFIG_NFS_V4_1)
   PROC(EXCHANGE_ID,	enc_exchange_id,	dec_exchange_id),
   PROC(CREATE_SESSION,	enc_create_session,	dec_create_session),
@@ -5873,6 +6348,8 @@ struct rpc_procinfo	nfs4_procedures[] = {
   PROC(SEQUENCE,	enc_sequence,	dec_sequence),
   PROC(GET_LEASE_TIME,	enc_get_lease_time,	dec_get_lease_time),
   PROC(RECLAIM_COMPLETE, enc_reclaim_complete,  dec_reclaim_complete),
+  PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo),
+  PROC(LAYOUTGET,  enc_layoutget,     dec_layoutget),
 #endif /* CONFIG_NFS_V4_1 */
 };
 
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index df101d9f546..903908a2002 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -3,9 +3,10 @@
  *
  *  Allow an NFS filesystem to be mounted as root. The way this works is:
  *     (1) Use the IP autoconfig mechanism to set local IP addresses and routes.
- *     (2) Handle RPC negotiation with the system which replied to RARP or
- *         was reported as a boot server by BOOTP or manually.
- *     (3) The actual mounting is done later, when init() is running.
+ *     (2) Construct the device string and the options string using DHCP
+ *         option 17 and/or kernel command line options.
+ *     (3) When mount_root() sets up the root file system, pass these strings
+ *         to the NFS client's regular mount interface via sys_mount().
  *
  *
  *	Changes:
@@ -65,470 +66,245 @@
  *	Hua Qin		:	Support for mounting root file system via
  *				NFS over TCP.
  *	Fabian Frederick:	Option parser rebuilt (using parser lib)
-*/
+ *	Chuck Lever	:	Use super.c's text-based mount option parsing
+ *	Chuck Lever	:	Add "nfsrootdebug".
+ */
 
 #include <linux/types.h>
 #include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/time.h>
-#include <linux/fs.h>
 #include <linux/init.h>
-#include <linux/sunrpc/clnt.h>
-#include <linux/sunrpc/xprtsock.h>
 #include <linux/nfs.h>
 #include <linux/nfs_fs.h>
-#include <linux/nfs_mount.h>
-#include <linux/in.h>
-#include <linux/major.h>
 #include <linux/utsname.h>
-#include <linux/inet.h>
 #include <linux/root_dev.h>
 #include <net/ipconfig.h>
-#include <linux/parser.h>
 
 #include "internal.h"
 
-/* Define this to allow debugging output */
-#undef NFSROOT_DEBUG
 #define NFSDBG_FACILITY NFSDBG_ROOT
 
-/* Default port to use if server is not running a portmapper */
-#define NFS_MNT_PORT	627
-
 /* Default path we try to mount. "%s" gets replaced by our IP address */
 #define NFS_ROOT		"/tftpboot/%s"
 
 /* Parameters passed from the kernel command line */
-static char nfs_root_name[256] __initdata = "";
+static char nfs_root_parms[256] __initdata = "";
+
+/* Text-based mount options passed to super.c */
+static char nfs_root_options[256] __initdata = "";
 
 /* Address of NFS server */
-static __be32 servaddr __initdata = 0;
+static __be32 servaddr __initdata = htonl(INADDR_NONE);
 
 /* Name of directory to mount */
-static char nfs_export_path[NFS_MAXPATHLEN + 1] __initdata = { 0, };
-
-/* NFS-related data */
-static struct nfs_mount_data nfs_data __initdata = { 0, };/* NFS mount info */
-static int nfs_port __initdata = 0;		/* Port to connect to for NFS */
-static int mount_port __initdata = 0;		/* Mount daemon port number */
-
-
-/***************************************************************************
-
-			     Parsing of options
-
- ***************************************************************************/
-
-enum {
-	/* Options that take integer arguments */
-	Opt_port, Opt_rsize, Opt_wsize, Opt_timeo, Opt_retrans, Opt_acregmin,
-	Opt_acregmax, Opt_acdirmin, Opt_acdirmax,
-	/* Options that take no arguments */
-	Opt_soft, Opt_hard, Opt_intr,
-	Opt_nointr, Opt_posix, Opt_noposix, Opt_cto, Opt_nocto, Opt_ac, 
-	Opt_noac, Opt_lock, Opt_nolock, Opt_v2, Opt_v3, Opt_udp, Opt_tcp,
-	Opt_acl, Opt_noacl,
-	/* Error token */
-	Opt_err
-};
-
-static const match_table_t tokens __initconst = {
-	{Opt_port, "port=%u"},
-	{Opt_rsize, "rsize=%u"},
-	{Opt_wsize, "wsize=%u"},
-	{Opt_timeo, "timeo=%u"},
-	{Opt_retrans, "retrans=%u"},
-	{Opt_acregmin, "acregmin=%u"},
-	{Opt_acregmax, "acregmax=%u"},
-	{Opt_acdirmin, "acdirmin=%u"},
-	{Opt_acdirmax, "acdirmax=%u"},
-	{Opt_soft, "soft"},
-	{Opt_hard, "hard"},
-	{Opt_intr, "intr"},
-	{Opt_nointr, "nointr"},
-	{Opt_posix, "posix"},
-	{Opt_noposix, "noposix"},
-	{Opt_cto, "cto"},
-	{Opt_nocto, "nocto"},
-	{Opt_ac, "ac"},
-	{Opt_noac, "noac"},
-	{Opt_lock, "lock"},
-	{Opt_nolock, "nolock"},
-	{Opt_v2, "nfsvers=2"},
-	{Opt_v2, "v2"},
-	{Opt_v3, "nfsvers=3"},
-	{Opt_v3, "v3"},
-	{Opt_udp, "proto=udp"},
-	{Opt_udp, "udp"},
-	{Opt_tcp, "proto=tcp"},
-	{Opt_tcp, "tcp"},
-	{Opt_acl, "acl"},
-	{Opt_noacl, "noacl"},
-	{Opt_err, NULL}
-	
-};
+static char nfs_export_path[NFS_MAXPATHLEN + 1] __initdata = "";
 
+/* server:export path string passed to super.c */
+static char nfs_root_device[NFS_MAXPATHLEN + 1] __initdata = "";
+
+#ifdef RPC_DEBUG
 /*
- *  Parse option string.
+ * When the "nfsrootdebug" kernel command line option is specified,
+ * enable debugging messages for NFSROOT.
  */
-
-static int __init root_nfs_parse(char *name, char *buf)
+static int __init nfs_root_debug(char *__unused)
 {
-
-	char *p;
-	substring_t args[MAX_OPT_ARGS];
-	int option;
-
-	if (!name)
-		return 1;
-
-	/* Set the NFS remote path */
-	p = strsep(&name, ",");
-	if (p[0] != '\0' && strcmp(p, "default") != 0)
-		strlcpy(buf, p, NFS_MAXPATHLEN);
-
-	while ((p = strsep (&name, ",")) != NULL) {
-		int token; 
-		if (!*p)
-			continue;
-		token = match_token(p, tokens, args);
-
-		/* %u tokens only. Beware if you add new tokens! */
-		if (token < Opt_soft && match_int(&args[0], &option))
-			return 0;
-		switch (token) {
-			case Opt_port:
-				nfs_port = option;
-				break;
-			case Opt_rsize:
-				nfs_data.rsize = option;
-				break;
-			case Opt_wsize:
-				nfs_data.wsize = option;
-				break;
-			case Opt_timeo:
-				nfs_data.timeo = option;
-				break;
-			case Opt_retrans:
-				nfs_data.retrans = option;
-				break;
-			case Opt_acregmin:
-				nfs_data.acregmin = option;
-				break;
-			case Opt_acregmax:
-				nfs_data.acregmax = option;
-				break;
-			case Opt_acdirmin:
-				nfs_data.acdirmin = option;
-				break;
-			case Opt_acdirmax:
-				nfs_data.acdirmax = option;
-				break;
-			case Opt_soft:
-				nfs_data.flags |= NFS_MOUNT_SOFT;
-				break;
-			case Opt_hard:
-				nfs_data.flags &= ~NFS_MOUNT_SOFT;
-				break;
-			case Opt_intr:
-			case Opt_nointr:
-				break;
-			case Opt_posix:
-				nfs_data.flags |= NFS_MOUNT_POSIX;
-				break;
-			case Opt_noposix:
-				nfs_data.flags &= ~NFS_MOUNT_POSIX;
-				break;
-			case Opt_cto:
-				nfs_data.flags &= ~NFS_MOUNT_NOCTO;
-				break;
-			case Opt_nocto:
-				nfs_data.flags |= NFS_MOUNT_NOCTO;
-				break;
-			case Opt_ac:
-				nfs_data.flags &= ~NFS_MOUNT_NOAC;
-				break;
-			case Opt_noac:
-				nfs_data.flags |= NFS_MOUNT_NOAC;
-				break;
-			case Opt_lock:
-				nfs_data.flags &= ~NFS_MOUNT_NONLM;
-				break;
-			case Opt_nolock:
-				nfs_data.flags |= NFS_MOUNT_NONLM;
-				break;
-			case Opt_v2:
-				nfs_data.flags &= ~NFS_MOUNT_VER3;
-				break;
-			case Opt_v3:
-				nfs_data.flags |= NFS_MOUNT_VER3;
-				break;
-			case Opt_udp:
-				nfs_data.flags &= ~NFS_MOUNT_TCP;
-				break;
-			case Opt_tcp:
-				nfs_data.flags |= NFS_MOUNT_TCP;
-				break;
-			case Opt_acl:
-				nfs_data.flags &= ~NFS_MOUNT_NOACL;
-				break;
-			case Opt_noacl:
-				nfs_data.flags |= NFS_MOUNT_NOACL;
-				break;
-			default:
-				printk(KERN_WARNING "Root-NFS: unknown "
-					"option: %s\n", p);
-				return 0;
-		}
-	}
-
+	nfs_debug |= NFSDBG_ROOT | NFSDBG_MOUNT;
 	return 1;
 }
 
+__setup("nfsrootdebug", nfs_root_debug);
+#endif
+
 /*
- *  Prepare the NFS data structure and parse all options.
+ *  Parse NFS server and directory information passed on the kernel
+ *  command line.
+ *
+ *  nfsroot=[<server-ip>:]<root-dir>[,<nfs-options>]
+ *
+ *  If there is a "%s" token in the <root-dir> string, it is replaced
+ *  by the ASCII-representation of the client's IP address.
  */
-static int __init root_nfs_name(char *name)
+static int __init nfs_root_setup(char *line)
 {
-	static char buf[NFS_MAXPATHLEN] __initdata;
-	char *cp;
-
-	/* Set some default values */
-	memset(&nfs_data, 0, sizeof(nfs_data));
-	nfs_port          = -1;
-	nfs_data.version  = NFS_MOUNT_VERSION;
-	nfs_data.flags    = NFS_MOUNT_NONLM;	/* No lockd in nfs root yet */
-	nfs_data.rsize    = NFS_DEF_FILE_IO_SIZE;
-	nfs_data.wsize    = NFS_DEF_FILE_IO_SIZE;
-	nfs_data.acregmin = NFS_DEF_ACREGMIN;
-	nfs_data.acregmax = NFS_DEF_ACREGMAX;
-	nfs_data.acdirmin = NFS_DEF_ACDIRMIN;
-	nfs_data.acdirmax = NFS_DEF_ACDIRMAX;
-	strcpy(buf, NFS_ROOT);
-
-	/* Process options received from the remote server */
-	root_nfs_parse(root_server_path, buf);
-
-	/* Override them by options set on kernel command-line */
-	root_nfs_parse(name, buf);
-
-	cp = utsname()->nodename;
-	if (strlen(buf) + strlen(cp) > NFS_MAXPATHLEN) {
-		printk(KERN_ERR "Root-NFS: Pathname for remote directory too long.\n");
-		return -1;
+	ROOT_DEV = Root_NFS;
+
+	if (line[0] == '/' || line[0] == ',' || (line[0] >= '0' && line[0] <= '9')) {
+		strlcpy(nfs_root_parms, line, sizeof(nfs_root_parms));
+	} else {
+		size_t n = strlen(line) + sizeof(NFS_ROOT) - 1;
+		if (n >= sizeof(nfs_root_parms))
+			line[sizeof(nfs_root_parms) - sizeof(NFS_ROOT) - 2] = '\0';
+		sprintf(nfs_root_parms, NFS_ROOT, line);
 	}
-	sprintf(nfs_export_path, buf, cp);
+
+	/*
+	 * Extract the IP address of the NFS server containing our
+	 * root file system, if one was specified.
+	 *
+	 * Note: root_nfs_parse_addr() removes the server-ip from
+	 *	 nfs_root_parms, if it exists.
+	 */
+	root_server_addr = root_nfs_parse_addr(nfs_root_parms);
 
 	return 1;
 }
 
+__setup("nfsroot=", nfs_root_setup);
 
-/*
- *  Get NFS server address.
- */
-static int __init root_nfs_addr(void)
+static int __init root_nfs_copy(char *dest, const char *src,
+				     const size_t destlen)
 {
-	if ((servaddr = root_server_addr) == htonl(INADDR_NONE)) {
-		printk(KERN_ERR "Root-NFS: No NFS server available, giving up.\n");
+	if (strlcpy(dest, src, destlen) > destlen)
 		return -1;
-	}
+	return 0;
+}
 
-	snprintf(nfs_data.hostname, sizeof(nfs_data.hostname),
-		 "%pI4", &servaddr);
+static int __init root_nfs_cat(char *dest, const char *src,
+				  const size_t destlen)
+{
+	if (strlcat(dest, src, destlen) > destlen)
+		return -1;
 	return 0;
 }
 
 /*
- *  Tell the user what's going on.
+ * Parse out root export path and mount options from
+ * passed-in string @incoming.
+ *
+ * Copy the export path into @exppath.
  */
-#ifdef NFSROOT_DEBUG
-static void __init root_nfs_print(void)
+static int __init root_nfs_parse_options(char *incoming, char *exppath,
+					 const size_t exppathlen)
 {
-	printk(KERN_NOTICE "Root-NFS: Mounting %s on server %s as root\n",
-		nfs_export_path, nfs_data.hostname);
-	printk(KERN_NOTICE "Root-NFS:     rsize = %d, wsize = %d, timeo = %d, retrans = %d\n",
-		nfs_data.rsize, nfs_data.wsize, nfs_data.timeo, nfs_data.retrans);
-	printk(KERN_NOTICE "Root-NFS:     acreg (min,max) = (%d,%d), acdir (min,max) = (%d,%d)\n",
-		nfs_data.acregmin, nfs_data.acregmax,
-		nfs_data.acdirmin, nfs_data.acdirmax);
-	printk(KERN_NOTICE "Root-NFS:     nfsd port = %d, mountd port = %d, flags = %08x\n",
-		nfs_port, mount_port, nfs_data.flags);
-}
-#endif
-
+	char *p;
 
-static int __init root_nfs_init(void)
-{
-#ifdef NFSROOT_DEBUG
-	nfs_debug |= NFSDBG_ROOT;
-#endif
+	/*
+	 * Set the NFS remote path
+	 */
+	p = strsep(&incoming, ",");
+	if (*p != '\0' && strcmp(p, "default") != 0)
+		if (root_nfs_copy(exppath, p, exppathlen))
+			return -1;
 
 	/*
-	 * Decode the root directory path name and NFS options from
-	 * the kernel command line. This has to go here in order to
-	 * be able to use the client IP address for the remote root
-	 * directory (necessary for pure RARP booting).
+	 * @incoming now points to the rest of the string; if it
+	 * contains something, append it to our root options buffer
 	 */
-	if (root_nfs_name(nfs_root_name) < 0 ||
-	    root_nfs_addr() < 0)
-		return -1;
+	if (incoming != NULL && *incoming != '\0')
+		if (root_nfs_cat(nfs_root_options, incoming,
+						sizeof(nfs_root_options)))
+			return -1;
 
-#ifdef NFSROOT_DEBUG
-	root_nfs_print();
-#endif
+	/*
+	 * Possibly prepare for more options to be appended
+	 */
+	if (nfs_root_options[0] != '\0' &&
+	    nfs_root_options[strlen(nfs_root_options)] != ',')
+		if (root_nfs_cat(nfs_root_options, ",",
+						sizeof(nfs_root_options)))
+			return -1;
 
 	return 0;
 }
 
-
 /*
- *  Parse NFS server and directory information passed on the kernel
- *  command line.
+ *  Decode the export directory path name and NFS options from
+ *  the kernel command line.  This has to be done late in order to
+ *  use a dynamically acquired client IP address for the remote
+ *  root directory path.
+ *
+ *  Returns zero if successful; otherwise -1 is returned.
  */
-static int __init nfs_root_setup(char *line)
+static int __init root_nfs_data(char *cmdline)
 {
-	ROOT_DEV = Root_NFS;
-	if (line[0] == '/' || line[0] == ',' || (line[0] >= '0' && line[0] <= '9')) {
-		strlcpy(nfs_root_name, line, sizeof(nfs_root_name));
-	} else {
-		int n = strlen(line) + sizeof(NFS_ROOT) - 1;
-		if (n >= sizeof(nfs_root_name))
-			line[sizeof(nfs_root_name) - sizeof(NFS_ROOT) - 2] = '\0';
-		sprintf(nfs_root_name, NFS_ROOT, line);
+	char addr_option[sizeof("nolock,addr=") + INET_ADDRSTRLEN + 1];
+	int len, retval = -1;
+	char *tmp = NULL;
+	const size_t tmplen = sizeof(nfs_export_path);
+
+	tmp = kzalloc(tmplen, GFP_KERNEL);
+	if (tmp == NULL)
+		goto out_nomem;
+	strcpy(tmp, NFS_ROOT);
+
+	if (root_server_path[0] != '\0') {
+		dprintk("Root-NFS: DHCPv4 option 17: %s\n",
+			root_server_path);
+		if (root_nfs_parse_options(root_server_path, tmp, tmplen))
+			goto out_optionstoolong;
 	}
-	root_server_addr = root_nfs_parse_addr(nfs_root_name);
-	return 1;
-}
-
-__setup("nfsroot=", nfs_root_setup);
-
-/***************************************************************************
 
-	       Routines to actually mount the root directory
+	if (cmdline[0] != '\0') {
+		dprintk("Root-NFS: nfsroot=%s\n", cmdline);
+		if (root_nfs_parse_options(cmdline, tmp, tmplen))
+			goto out_optionstoolong;
+	}
 
- ***************************************************************************/
+	/*
+	 * Append mandatory options for nfsroot so they override
+	 * what has come before
+	 */
+	snprintf(addr_option, sizeof(addr_option), "nolock,addr=%pI4",
+			&servaddr);
+	if (root_nfs_cat(nfs_root_options, addr_option,
+						sizeof(nfs_root_options)))
+		goto out_optionstoolong;
 
-/*
- *  Construct sockaddr_in from address and port number.
- */
-static inline void
-set_sockaddr(struct sockaddr_in *sin, __be32 addr, __be16 port)
-{
-	sin->sin_family = AF_INET;
-	sin->sin_addr.s_addr = addr;
-	sin->sin_port = port;
-}
+	/*
+	 * Set up nfs_root_device.  For NFS mounts, this looks like
+	 *
+	 *	server:/path
+	 *
+	 * At this point, utsname()->nodename contains our local
+	 * IP address or hostname, set by ipconfig.  If "%s" exists
+	 * in tmp, substitute the nodename, then shovel the whole
+	 * mess into nfs_root_device.
+	 */
+	len = snprintf(nfs_export_path, sizeof(nfs_export_path),
+				tmp, utsname()->nodename);
+	if (len > (int)sizeof(nfs_export_path))
+		goto out_devnametoolong;
+	len = snprintf(nfs_root_device, sizeof(nfs_root_device),
+				"%pI4:%s", &servaddr, nfs_export_path);
+	if (len > (int)sizeof(nfs_root_device))
+		goto out_devnametoolong;
 
-/*
- *  Query server portmapper for the port of a daemon program.
- */
-static int __init root_nfs_getport(int program, int version, int proto)
-{
-	struct sockaddr_in sin;
+	retval = 0;
 
-	printk(KERN_NOTICE "Looking up port of RPC %d/%d on %pI4\n",
-		program, version, &servaddr);
-	set_sockaddr(&sin, servaddr, 0);
-	return rpcb_getport_sync(&sin, program, version, proto);
+out:
+	kfree(tmp);
+	return retval;
+out_nomem:
+	printk(KERN_ERR "Root-NFS: could not allocate memory\n");
+	goto out;
+out_optionstoolong:
+	printk(KERN_ERR "Root-NFS: mount options string too long\n");
+	goto out;
+out_devnametoolong:
+	printk(KERN_ERR "Root-NFS: root device name too long.\n");
+	goto out;
 }
 
-
-/*
- *  Use portmapper to find mountd and nfsd port numbers if not overriden
- *  by the user. Use defaults if portmapper is not available.
- *  XXX: Is there any nfs server with no portmapper?
+/**
+ * nfs_root_data - Return prepared 'data' for NFSROOT mount
+ * @root_device: OUT: address of string containing NFSROOT device
+ * @root_data: OUT: address of string containing NFSROOT mount options
+ *
+ * Returns zero and sets @root_device and @root_data if successful,
+ * otherwise -1 is returned.
  */
-static int __init root_nfs_ports(void)
+int __init nfs_root_data(char **root_device, char **root_data)
 {
-	int port;
-	int nfsd_ver, mountd_ver;
-	int nfsd_port, mountd_port;
-	int proto;
-
-	if (nfs_data.flags & NFS_MOUNT_VER3) {
-		nfsd_ver = NFS3_VERSION;
-		mountd_ver = NFS_MNT3_VERSION;
-		nfsd_port = NFS_PORT;
-		mountd_port = NFS_MNT_PORT;
-	} else {
-		nfsd_ver = NFS2_VERSION;
-		mountd_ver = NFS_MNT_VERSION;
-		nfsd_port = NFS_PORT;
-		mountd_port = NFS_MNT_PORT;
-	}
-
-	proto = (nfs_data.flags & NFS_MOUNT_TCP) ? IPPROTO_TCP : IPPROTO_UDP;
-
-	if (nfs_port < 0) {
-		if ((port = root_nfs_getport(NFS_PROGRAM, nfsd_ver, proto)) < 0) {
-			printk(KERN_ERR "Root-NFS: Unable to get nfsd port "
-					"number from server, using default\n");
-			port = nfsd_port;
-		}
-		nfs_port = port;
-		dprintk("Root-NFS: Portmapper on server returned %d "
-			"as nfsd port\n", port);
+	servaddr = root_server_addr;
+	if (servaddr == htonl(INADDR_NONE)) {
+		printk(KERN_ERR "Root-NFS: no NFS server address\n");
+		return -1;
 	}
 
-	if ((port = root_nfs_getport(NFS_MNT_PROGRAM, mountd_ver, proto)) < 0) {
-		printk(KERN_ERR "Root-NFS: Unable to get mountd port "
-				"number from server, using default\n");
-		port = mountd_port;
-	}
-	mount_port = port;
-	dprintk("Root-NFS: mountd port is %d\n", port);
+	if (root_nfs_data(nfs_root_parms) < 0)
+		return -1;
 
+	*root_device = nfs_root_device;
+	*root_data = nfs_root_options;
 	return 0;
 }
-
-
-/*
- *  Get a file handle from the server for the directory which is to be
- *  mounted.
- */
-static int __init root_nfs_get_handle(void)
-{
-	struct sockaddr_in sin;
-	unsigned int auth_flav_len = 0;
-	struct nfs_mount_request request = {
-		.sap		= (struct sockaddr *)&sin,
-		.salen		= sizeof(sin),
-		.dirpath	= nfs_export_path,
-		.version	= (nfs_data.flags & NFS_MOUNT_VER3) ?
-					NFS_MNT3_VERSION : NFS_MNT_VERSION,
-		.protocol	= (nfs_data.flags & NFS_MOUNT_TCP) ?
-					XPRT_TRANSPORT_TCP : XPRT_TRANSPORT_UDP,
-		.auth_flav_len	= &auth_flav_len,
-	};
-	int status = -ENOMEM;
-
-	request.fh = nfs_alloc_fhandle();
-	if (!request.fh)
-		goto out;
-	set_sockaddr(&sin, servaddr, htons(mount_port));
-	status = nfs_mount(&request);
-	if (status < 0)
-		printk(KERN_ERR "Root-NFS: Server returned error %d "
-				"while mounting %s\n", status, nfs_export_path);
-	else {
-		nfs_data.root.size = request.fh->size;
-		memcpy(&nfs_data.root.data, request.fh->data, request.fh->size);
-	}
-	nfs_free_fhandle(request.fh);
-out:
-	return status;
-}
-
-/*
- *  Get the NFS port numbers and file handle, and return the prepared 'data'
- *  argument for mount() if everything went OK. Return NULL otherwise.
- */
-void * __init nfs_root_data(void)
-{
-	if (root_nfs_init() < 0
-	 || root_nfs_ports() < 0
-	 || root_nfs_get_handle() < 0)
-		return NULL;
-	set_sockaddr((struct sockaddr_in *) &nfs_data.addr, servaddr, htons(nfs_port));
-	return (void*)&nfs_data;
-}
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index a3654e57b58..137b549e63d 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -65,6 +65,13 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
 	if (req == NULL)
 		return ERR_PTR(-ENOMEM);
 
+	/* get lock context early so we can deal with alloc failures */
+	req->wb_lock_context = nfs_get_lock_context(ctx);
+	if (req->wb_lock_context == NULL) {
+		nfs_page_free(req);
+		return ERR_PTR(-ENOMEM);
+	}
+
 	/* Initialize the request struct. Initially, we assume a
 	 * long write-back delay. This will be adjusted in
 	 * update_nfs_request below if the region is not locked. */
@@ -141,11 +148,16 @@ void nfs_clear_request(struct nfs_page *req)
 {
 	struct page *page = req->wb_page;
 	struct nfs_open_context *ctx = req->wb_context;
+	struct nfs_lock_context *l_ctx = req->wb_lock_context;
 
 	if (page != NULL) {
 		page_cache_release(page);
 		req->wb_page = NULL;
 	}
+	if (l_ctx != NULL) {
+		nfs_put_lock_context(l_ctx);
+		req->wb_lock_context = NULL;
+	}
 	if (ctx != NULL) {
 		put_nfs_open_context(ctx);
 		req->wb_context = NULL;
@@ -235,7 +247,7 @@ static int nfs_can_coalesce_requests(struct nfs_page *prev,
 {
 	if (req->wb_context->cred != prev->wb_context->cred)
 		return 0;
-	if (req->wb_context->lockowner != prev->wb_context->lockowner)
+	if (req->wb_lock_context->lockowner != prev->wb_lock_context->lockowner)
 		return 0;
 	if (req->wb_context->state != prev->wb_context->state)
 		return 0;
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
new file mode 100644
index 00000000000..db773428f95
--- /dev/null
+++ b/fs/nfs/pnfs.c
@@ -0,0 +1,783 @@
+/*
+ *  pNFS functions to call and manage layout drivers.
+ *
+ *  Copyright (c) 2002 [year of first publication]
+ *  The Regents of the University of Michigan
+ *  All Rights Reserved
+ *
+ *  Dean Hildebrand <dhildebz@umich.edu>
+ *
+ *  Permission is granted to use, copy, create derivative works, and
+ *  redistribute this software and such derivative works for any purpose,
+ *  so long as the name of the University of Michigan is not used in
+ *  any advertising or publicity pertaining to the use or distribution
+ *  of this software without specific, written prior authorization. If
+ *  the above copyright notice or any other identification of the
+ *  University of Michigan is included in any copy of any portion of
+ *  this software, then the disclaimer below must also be included.
+ *
+ *  This software is provided as is, without representation or warranty
+ *  of any kind either express or implied, including without limitation
+ *  the implied warranties of merchantability, fitness for a particular
+ *  purpose, or noninfringement.  The Regents of the University of
+ *  Michigan shall not be liable for any damages, including special,
+ *  indirect, incidental, or consequential damages, with respect to any
+ *  claim arising out of or in connection with the use of the software,
+ *  even if it has been or is hereafter advised of the possibility of
+ *  such damages.
+ */
+
+#include <linux/nfs_fs.h>
+#include "internal.h"
+#include "pnfs.h"
+
+#define NFSDBG_FACILITY		NFSDBG_PNFS
+
+/* Locking:
+ *
+ * pnfs_spinlock:
+ *      protects pnfs_modules_tbl.
+ */
+static DEFINE_SPINLOCK(pnfs_spinlock);
+
+/*
+ * pnfs_modules_tbl holds all pnfs modules
+ */
+static LIST_HEAD(pnfs_modules_tbl);
+
+/* Return the registered pnfs layout driver module matching given id */
+static struct pnfs_layoutdriver_type *
+find_pnfs_driver_locked(u32 id)
+{
+	struct pnfs_layoutdriver_type *local;
+
+	list_for_each_entry(local, &pnfs_modules_tbl, pnfs_tblid)
+		if (local->id == id)
+			goto out;
+	local = NULL;
+out:
+	dprintk("%s: Searching for id %u, found %p\n", __func__, id, local);
+	return local;
+}
+
+static struct pnfs_layoutdriver_type *
+find_pnfs_driver(u32 id)
+{
+	struct pnfs_layoutdriver_type *local;
+
+	spin_lock(&pnfs_spinlock);
+	local = find_pnfs_driver_locked(id);
+	spin_unlock(&pnfs_spinlock);
+	return local;
+}
+
+void
+unset_pnfs_layoutdriver(struct nfs_server *nfss)
+{
+	if (nfss->pnfs_curr_ld) {
+		nfss->pnfs_curr_ld->clear_layoutdriver(nfss);
+		module_put(nfss->pnfs_curr_ld->owner);
+	}
+	nfss->pnfs_curr_ld = NULL;
+}
+
+/*
+ * Try to set the server's pnfs module to the pnfs layout type specified by id.
+ * Currently only one pNFS layout driver per filesystem is supported.
+ *
+ * @id layout type. Zero (illegal layout type) indicates pNFS not in use.
+ */
+void
+set_pnfs_layoutdriver(struct nfs_server *server, u32 id)
+{
+	struct pnfs_layoutdriver_type *ld_type = NULL;
+
+	if (id == 0)
+		goto out_no_driver;
+	if (!(server->nfs_client->cl_exchange_flags &
+		 (EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_USE_PNFS_MDS))) {
+		printk(KERN_ERR "%s: id %u cl_exchange_flags 0x%x\n", __func__,
+		       id, server->nfs_client->cl_exchange_flags);
+		goto out_no_driver;
+	}
+	ld_type = find_pnfs_driver(id);
+	if (!ld_type) {
+		request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX, id);
+		ld_type = find_pnfs_driver(id);
+		if (!ld_type) {
+			dprintk("%s: No pNFS module found for %u.\n",
+				__func__, id);
+			goto out_no_driver;
+		}
+	}
+	if (!try_module_get(ld_type->owner)) {
+		dprintk("%s: Could not grab reference on module\n", __func__);
+		goto out_no_driver;
+	}
+	server->pnfs_curr_ld = ld_type;
+	if (ld_type->set_layoutdriver(server)) {
+		printk(KERN_ERR
+		       "%s: Error initializing mount point for layout driver %u.\n",
+		       __func__, id);
+		module_put(ld_type->owner);
+		goto out_no_driver;
+	}
+	dprintk("%s: pNFS module for %u set\n", __func__, id);
+	return;
+
+out_no_driver:
+	dprintk("%s: Using NFSv4 I/O\n", __func__);
+	server->pnfs_curr_ld = NULL;
+}
+
+int
+pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
+{
+	int status = -EINVAL;
+	struct pnfs_layoutdriver_type *tmp;
+
+	if (ld_type->id == 0) {
+		printk(KERN_ERR "%s id 0 is reserved\n", __func__);
+		return status;
+	}
+	if (!ld_type->alloc_lseg || !ld_type->free_lseg) {
+		printk(KERN_ERR "%s Layout driver must provide "
+		       "alloc_lseg and free_lseg.\n", __func__);
+		return status;
+	}
+
+	spin_lock(&pnfs_spinlock);
+	tmp = find_pnfs_driver_locked(ld_type->id);
+	if (!tmp) {
+		list_add(&ld_type->pnfs_tblid, &pnfs_modules_tbl);
+		status = 0;
+		dprintk("%s Registering id:%u name:%s\n", __func__, ld_type->id,
+			ld_type->name);
+	} else {
+		printk(KERN_ERR "%s Module with id %d already loaded!\n",
+			__func__, ld_type->id);
+	}
+	spin_unlock(&pnfs_spinlock);
+
+	return status;
+}
+EXPORT_SYMBOL_GPL(pnfs_register_layoutdriver);
+
+void
+pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
+{
+	dprintk("%s Deregistering id:%u\n", __func__, ld_type->id);
+	spin_lock(&pnfs_spinlock);
+	list_del(&ld_type->pnfs_tblid);
+	spin_unlock(&pnfs_spinlock);
+}
+EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver);
+
+/*
+ * pNFS client layout cache
+ */
+
+static void
+get_layout_hdr_locked(struct pnfs_layout_hdr *lo)
+{
+	assert_spin_locked(&lo->inode->i_lock);
+	lo->refcount++;
+}
+
+static void
+put_layout_hdr_locked(struct pnfs_layout_hdr *lo)
+{
+	assert_spin_locked(&lo->inode->i_lock);
+	BUG_ON(lo->refcount == 0);
+
+	lo->refcount--;
+	if (!lo->refcount) {
+		dprintk("%s: freeing layout cache %p\n", __func__, lo);
+		BUG_ON(!list_empty(&lo->layouts));
+		NFS_I(lo->inode)->layout = NULL;
+		kfree(lo);
+	}
+}
+
+void
+put_layout_hdr(struct inode *inode)
+{
+	spin_lock(&inode->i_lock);
+	put_layout_hdr_locked(NFS_I(inode)->layout);
+	spin_unlock(&inode->i_lock);
+}
+
+static void
+init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg)
+{
+	INIT_LIST_HEAD(&lseg->fi_list);
+	kref_init(&lseg->kref);
+	lseg->layout = lo;
+}
+
+/* Called without i_lock held, as the free_lseg call may sleep */
+static void
+destroy_lseg(struct kref *kref)
+{
+	struct pnfs_layout_segment *lseg =
+		container_of(kref, struct pnfs_layout_segment, kref);
+	struct inode *ino = lseg->layout->inode;
+
+	dprintk("--> %s\n", __func__);
+	NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
+	/* Matched by get_layout_hdr_locked in pnfs_insert_layout */
+	put_layout_hdr(ino);
+}
+
+static void
+put_lseg(struct pnfs_layout_segment *lseg)
+{
+	if (!lseg)
+		return;
+
+	dprintk("%s: lseg %p ref %d\n", __func__, lseg,
+		atomic_read(&lseg->kref.refcount));
+	kref_put(&lseg->kref, destroy_lseg);
+}
+
+static void
+pnfs_clear_lseg_list(struct pnfs_layout_hdr *lo, struct list_head *tmp_list)
+{
+	struct pnfs_layout_segment *lseg, *next;
+	struct nfs_client *clp;
+
+	dprintk("%s:Begin lo %p\n", __func__, lo);
+
+	assert_spin_locked(&lo->inode->i_lock);
+	list_for_each_entry_safe(lseg, next, &lo->segs, fi_list) {
+		dprintk("%s: freeing lseg %p\n", __func__, lseg);
+		list_move(&lseg->fi_list, tmp_list);
+	}
+	clp = NFS_SERVER(lo->inode)->nfs_client;
+	spin_lock(&clp->cl_lock);
+	/* List does not take a reference, so no need for put here */
+	list_del_init(&lo->layouts);
+	spin_unlock(&clp->cl_lock);
+	write_seqlock(&lo->seqlock);
+	clear_bit(NFS_LAYOUT_STATEID_SET, &lo->state);
+	write_sequnlock(&lo->seqlock);
+
+	dprintk("%s:Return\n", __func__);
+}
+
+static void
+pnfs_free_lseg_list(struct list_head *tmp_list)
+{
+	struct pnfs_layout_segment *lseg;
+
+	while (!list_empty(tmp_list)) {
+		lseg = list_entry(tmp_list->next, struct pnfs_layout_segment,
+				fi_list);
+		dprintk("%s calling put_lseg on %p\n", __func__, lseg);
+		list_del(&lseg->fi_list);
+		put_lseg(lseg);
+	}
+}
+
+void
+pnfs_destroy_layout(struct nfs_inode *nfsi)
+{
+	struct pnfs_layout_hdr *lo;
+	LIST_HEAD(tmp_list);
+
+	spin_lock(&nfsi->vfs_inode.i_lock);
+	lo = nfsi->layout;
+	if (lo) {
+		pnfs_clear_lseg_list(lo, &tmp_list);
+		/* Matched by refcount set to 1 in alloc_init_layout_hdr */
+		put_layout_hdr_locked(lo);
+	}
+	spin_unlock(&nfsi->vfs_inode.i_lock);
+	pnfs_free_lseg_list(&tmp_list);
+}
+
+/*
+ * Called by the state manger to remove all layouts established under an
+ * expired lease.
+ */
+void
+pnfs_destroy_all_layouts(struct nfs_client *clp)
+{
+	struct pnfs_layout_hdr *lo;
+	LIST_HEAD(tmp_list);
+
+	spin_lock(&clp->cl_lock);
+	list_splice_init(&clp->cl_layouts, &tmp_list);
+	spin_unlock(&clp->cl_lock);
+
+	while (!list_empty(&tmp_list)) {
+		lo = list_entry(tmp_list.next, struct pnfs_layout_hdr,
+				layouts);
+		dprintk("%s freeing layout for inode %lu\n", __func__,
+			lo->inode->i_ino);
+		pnfs_destroy_layout(NFS_I(lo->inode));
+	}
+}
+
+/* update lo->stateid with new if is more recent
+ *
+ * lo->stateid could be the open stateid, in which case we just use what given.
+ */
+static void
+pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
+			const nfs4_stateid *new)
+{
+	nfs4_stateid *old = &lo->stateid;
+	bool overwrite = false;
+
+	write_seqlock(&lo->seqlock);
+	if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->state) ||
+	    memcmp(old->stateid.other, new->stateid.other, sizeof(new->stateid.other)))
+		overwrite = true;
+	else {
+		u32 oldseq, newseq;
+
+		oldseq = be32_to_cpu(old->stateid.seqid);
+		newseq = be32_to_cpu(new->stateid.seqid);
+		if ((int)(newseq - oldseq) > 0)
+			overwrite = true;
+	}
+	if (overwrite)
+		memcpy(&old->stateid, &new->stateid, sizeof(new->stateid));
+	write_sequnlock(&lo->seqlock);
+}
+
+static void
+pnfs_layout_from_open_stateid(struct pnfs_layout_hdr *lo,
+			      struct nfs4_state *state)
+{
+	int seq;
+
+	dprintk("--> %s\n", __func__);
+	write_seqlock(&lo->seqlock);
+	do {
+		seq = read_seqbegin(&state->seqlock);
+		memcpy(lo->stateid.data, state->stateid.data,
+		       sizeof(state->stateid.data));
+	} while (read_seqretry(&state->seqlock, seq));
+	set_bit(NFS_LAYOUT_STATEID_SET, &lo->state);
+	write_sequnlock(&lo->seqlock);
+	dprintk("<-- %s\n", __func__);
+}
+
+void
+pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
+			struct nfs4_state *open_state)
+{
+	int seq;
+
+	dprintk("--> %s\n", __func__);
+	do {
+		seq = read_seqbegin(&lo->seqlock);
+		if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->state)) {
+			/* This will trigger retry of the read */
+			pnfs_layout_from_open_stateid(lo, open_state);
+		} else
+			memcpy(dst->data, lo->stateid.data,
+			       sizeof(lo->stateid.data));
+	} while (read_seqretry(&lo->seqlock, seq));
+	dprintk("<-- %s\n", __func__);
+}
+
+/*
+* Get layout from server.
+*    for now, assume that whole file layouts are requested.
+*    arg->offset: 0
+*    arg->length: all ones
+*/
+static struct pnfs_layout_segment *
+send_layoutget(struct pnfs_layout_hdr *lo,
+	   struct nfs_open_context *ctx,
+	   u32 iomode)
+{
+	struct inode *ino = lo->inode;
+	struct nfs_server *server = NFS_SERVER(ino);
+	struct nfs4_layoutget *lgp;
+	struct pnfs_layout_segment *lseg = NULL;
+
+	dprintk("--> %s\n", __func__);
+
+	BUG_ON(ctx == NULL);
+	lgp = kzalloc(sizeof(*lgp), GFP_KERNEL);
+	if (lgp == NULL) {
+		put_layout_hdr(lo->inode);
+		return NULL;
+	}
+	lgp->args.minlength = NFS4_MAX_UINT64;
+	lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
+	lgp->args.range.iomode = iomode;
+	lgp->args.range.offset = 0;
+	lgp->args.range.length = NFS4_MAX_UINT64;
+	lgp->args.type = server->pnfs_curr_ld->id;
+	lgp->args.inode = ino;
+	lgp->args.ctx = get_nfs_open_context(ctx);
+	lgp->lsegpp = &lseg;
+
+	/* Synchronously retrieve layout information from server and
+	 * store in lseg.
+	 */
+	nfs4_proc_layoutget(lgp);
+	if (!lseg) {
+		/* remember that LAYOUTGET failed and suspend trying */
+		set_bit(lo_fail_bit(iomode), &lo->state);
+	}
+	return lseg;
+}
+
+/*
+ * Compare two layout segments for sorting into layout cache.
+ * We want to preferentially return RW over RO layouts, so ensure those
+ * are seen first.
+ */
+static s64
+cmp_layout(u32 iomode1, u32 iomode2)
+{
+	/* read > read/write */
+	return (int)(iomode2 == IOMODE_READ) - (int)(iomode1 == IOMODE_READ);
+}
+
+static void
+pnfs_insert_layout(struct pnfs_layout_hdr *lo,
+		   struct pnfs_layout_segment *lseg)
+{
+	struct pnfs_layout_segment *lp;
+	int found = 0;
+
+	dprintk("%s:Begin\n", __func__);
+
+	assert_spin_locked(&lo->inode->i_lock);
+	if (list_empty(&lo->segs)) {
+		struct nfs_client *clp = NFS_SERVER(lo->inode)->nfs_client;
+
+		spin_lock(&clp->cl_lock);
+		BUG_ON(!list_empty(&lo->layouts));
+		list_add_tail(&lo->layouts, &clp->cl_layouts);
+		spin_unlock(&clp->cl_lock);
+	}
+	list_for_each_entry(lp, &lo->segs, fi_list) {
+		if (cmp_layout(lp->range.iomode, lseg->range.iomode) > 0)
+			continue;
+		list_add_tail(&lseg->fi_list, &lp->fi_list);
+		dprintk("%s: inserted lseg %p "
+			"iomode %d offset %llu length %llu before "
+			"lp %p iomode %d offset %llu length %llu\n",
+			__func__, lseg, lseg->range.iomode,
+			lseg->range.offset, lseg->range.length,
+			lp, lp->range.iomode, lp->range.offset,
+			lp->range.length);
+		found = 1;
+		break;
+	}
+	if (!found) {
+		list_add_tail(&lseg->fi_list, &lo->segs);
+		dprintk("%s: inserted lseg %p "
+			"iomode %d offset %llu length %llu at tail\n",
+			__func__, lseg, lseg->range.iomode,
+			lseg->range.offset, lseg->range.length);
+	}
+	get_layout_hdr_locked(lo);
+
+	dprintk("%s:Return\n", __func__);
+}
+
+static struct pnfs_layout_hdr *
+alloc_init_layout_hdr(struct inode *ino)
+{
+	struct pnfs_layout_hdr *lo;
+
+	lo = kzalloc(sizeof(struct pnfs_layout_hdr), GFP_KERNEL);
+	if (!lo)
+		return NULL;
+	lo->refcount = 1;
+	INIT_LIST_HEAD(&lo->layouts);
+	INIT_LIST_HEAD(&lo->segs);
+	seqlock_init(&lo->seqlock);
+	lo->inode = ino;
+	return lo;
+}
+
+static struct pnfs_layout_hdr *
+pnfs_find_alloc_layout(struct inode *ino)
+{
+	struct nfs_inode *nfsi = NFS_I(ino);
+	struct pnfs_layout_hdr *new = NULL;
+
+	dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout);
+
+	assert_spin_locked(&ino->i_lock);
+	if (nfsi->layout)
+		return nfsi->layout;
+
+	spin_unlock(&ino->i_lock);
+	new = alloc_init_layout_hdr(ino);
+	spin_lock(&ino->i_lock);
+
+	if (likely(nfsi->layout == NULL))	/* Won the race? */
+		nfsi->layout = new;
+	else
+		kfree(new);
+	return nfsi->layout;
+}
+
+/*
+ * iomode matching rules:
+ * iomode	lseg	match
+ * -----	-----	-----
+ * ANY		READ	true
+ * ANY		RW	true
+ * RW		READ	false
+ * RW		RW	true
+ * READ		READ	true
+ * READ		RW	true
+ */
+static int
+is_matching_lseg(struct pnfs_layout_segment *lseg, u32 iomode)
+{
+	return (iomode != IOMODE_RW || lseg->range.iomode == IOMODE_RW);
+}
+
+/*
+ * lookup range in layout
+ */
+static struct pnfs_layout_segment *
+pnfs_has_layout(struct pnfs_layout_hdr *lo, u32 iomode)
+{
+	struct pnfs_layout_segment *lseg, *ret = NULL;
+
+	dprintk("%s:Begin\n", __func__);
+
+	assert_spin_locked(&lo->inode->i_lock);
+	list_for_each_entry(lseg, &lo->segs, fi_list) {
+		if (is_matching_lseg(lseg, iomode)) {
+			ret = lseg;
+			break;
+		}
+		if (cmp_layout(iomode, lseg->range.iomode) > 0)
+			break;
+	}
+
+	dprintk("%s:Return lseg %p ref %d\n",
+		__func__, ret, ret ? atomic_read(&ret->kref.refcount) : 0);
+	return ret;
+}
+
+/*
+ * Layout segment is retreived from the server if not cached.
+ * The appropriate layout segment is referenced and returned to the caller.
+ */
+struct pnfs_layout_segment *
+pnfs_update_layout(struct inode *ino,
+		   struct nfs_open_context *ctx,
+		   enum pnfs_iomode iomode)
+{
+	struct nfs_inode *nfsi = NFS_I(ino);
+	struct pnfs_layout_hdr *lo;
+	struct pnfs_layout_segment *lseg = NULL;
+
+	if (!pnfs_enabled_sb(NFS_SERVER(ino)))
+		return NULL;
+	spin_lock(&ino->i_lock);
+	lo = pnfs_find_alloc_layout(ino);
+	if (lo == NULL) {
+		dprintk("%s ERROR: can't get pnfs_layout_hdr\n", __func__);
+		goto out_unlock;
+	}
+
+	/* Check to see if the layout for the given range already exists */
+	lseg = pnfs_has_layout(lo, iomode);
+	if (lseg) {
+		dprintk("%s: Using cached lseg %p for iomode %d)\n",
+			__func__, lseg, iomode);
+		goto out_unlock;
+	}
+
+	/* if LAYOUTGET already failed once we don't try again */
+	if (test_bit(lo_fail_bit(iomode), &nfsi->layout->state))
+		goto out_unlock;
+
+	get_layout_hdr_locked(lo); /* Matched in nfs4_layoutget_release */
+	spin_unlock(&ino->i_lock);
+
+	lseg = send_layoutget(lo, ctx, iomode);
+out:
+	dprintk("%s end, state 0x%lx lseg %p\n", __func__,
+		nfsi->layout->state, lseg);
+	return lseg;
+out_unlock:
+	spin_unlock(&ino->i_lock);
+	goto out;
+}
+
+int
+pnfs_layout_process(struct nfs4_layoutget *lgp)
+{
+	struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout;
+	struct nfs4_layoutget_res *res = &lgp->res;
+	struct pnfs_layout_segment *lseg;
+	struct inode *ino = lo->inode;
+	int status = 0;
+
+	/* Inject layout blob into I/O device driver */
+	lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res);
+	if (!lseg || IS_ERR(lseg)) {
+		if (!lseg)
+			status = -ENOMEM;
+		else
+			status = PTR_ERR(lseg);
+		dprintk("%s: Could not allocate layout: error %d\n",
+		       __func__, status);
+		goto out;
+	}
+
+	spin_lock(&ino->i_lock);
+	init_lseg(lo, lseg);
+	lseg->range = res->range;
+	*lgp->lsegpp = lseg;
+	pnfs_insert_layout(lo, lseg);
+
+	/* Done processing layoutget. Set the layout stateid */
+	pnfs_set_layout_stateid(lo, &res->stateid);
+	spin_unlock(&ino->i_lock);
+out:
+	return status;
+}
+
+/*
+ * Device ID cache. Currently supports one layout type per struct nfs_client.
+ * Add layout type to the lookup key to expand to support multiple types.
+ */
+int
+pnfs_alloc_init_deviceid_cache(struct nfs_client *clp,
+			 void (*free_callback)(struct pnfs_deviceid_node *))
+{
+	struct pnfs_deviceid_cache *c;
+
+	c = kzalloc(sizeof(struct pnfs_deviceid_cache), GFP_KERNEL);
+	if (!c)
+		return -ENOMEM;
+	spin_lock(&clp->cl_lock);
+	if (clp->cl_devid_cache != NULL) {
+		atomic_inc(&clp->cl_devid_cache->dc_ref);
+		dprintk("%s [kref [%d]]\n", __func__,
+			atomic_read(&clp->cl_devid_cache->dc_ref));
+		kfree(c);
+	} else {
+		/* kzalloc initializes hlists */
+		spin_lock_init(&c->dc_lock);
+		atomic_set(&c->dc_ref, 1);
+		c->dc_free_callback = free_callback;
+		clp->cl_devid_cache = c;
+		dprintk("%s [new]\n", __func__);
+	}
+	spin_unlock(&clp->cl_lock);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pnfs_alloc_init_deviceid_cache);
+
+/*
+ * Called from pnfs_layoutdriver_type->free_lseg
+ * last layout segment reference frees deviceid
+ */
+void
+pnfs_put_deviceid(struct pnfs_deviceid_cache *c,
+		  struct pnfs_deviceid_node *devid)
+{
+	struct nfs4_deviceid *id = &devid->de_id;
+	struct pnfs_deviceid_node *d;
+	struct hlist_node *n;
+	long h = nfs4_deviceid_hash(id);
+
+	dprintk("%s [%d]\n", __func__, atomic_read(&devid->de_ref));
+	if (!atomic_dec_and_lock(&devid->de_ref, &c->dc_lock))
+		return;
+
+	hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[h], de_node)
+		if (!memcmp(&d->de_id, id, sizeof(*id))) {
+			hlist_del_rcu(&d->de_node);
+			spin_unlock(&c->dc_lock);
+			synchronize_rcu();
+			c->dc_free_callback(devid);
+			return;
+		}
+	spin_unlock(&c->dc_lock);
+	/* Why wasn't it found in  the list? */
+	BUG();
+}
+EXPORT_SYMBOL_GPL(pnfs_put_deviceid);
+
+/* Find and reference a deviceid */
+struct pnfs_deviceid_node *
+pnfs_find_get_deviceid(struct pnfs_deviceid_cache *c, struct nfs4_deviceid *id)
+{
+	struct pnfs_deviceid_node *d;
+	struct hlist_node *n;
+	long hash = nfs4_deviceid_hash(id);
+
+	dprintk("--> %s hash %ld\n", __func__, hash);
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) {
+		if (!memcmp(&d->de_id, id, sizeof(*id))) {
+			if (!atomic_inc_not_zero(&d->de_ref)) {
+				goto fail;
+			} else {
+				rcu_read_unlock();
+				return d;
+			}
+		}
+	}
+fail:
+	rcu_read_unlock();
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(pnfs_find_get_deviceid);
+
+/*
+ * Add a deviceid to the cache.
+ * GETDEVICEINFOs for same deviceid can race. If deviceid is found, discard new
+ */
+struct pnfs_deviceid_node *
+pnfs_add_deviceid(struct pnfs_deviceid_cache *c, struct pnfs_deviceid_node *new)
+{
+	struct pnfs_deviceid_node *d;
+	long hash = nfs4_deviceid_hash(&new->de_id);
+
+	dprintk("--> %s hash %ld\n", __func__, hash);
+	spin_lock(&c->dc_lock);
+	d = pnfs_find_get_deviceid(c, &new->de_id);
+	if (d) {
+		spin_unlock(&c->dc_lock);
+		dprintk("%s [discard]\n", __func__);
+		c->dc_free_callback(new);
+		return d;
+	}
+	INIT_HLIST_NODE(&new->de_node);
+	atomic_set(&new->de_ref, 1);
+	hlist_add_head_rcu(&new->de_node, &c->dc_deviceids[hash]);
+	spin_unlock(&c->dc_lock);
+	dprintk("%s [new]\n", __func__);
+	return new;
+}
+EXPORT_SYMBOL_GPL(pnfs_add_deviceid);
+
+void
+pnfs_put_deviceid_cache(struct nfs_client *clp)
+{
+	struct pnfs_deviceid_cache *local = clp->cl_devid_cache;
+
+	dprintk("--> %s cl_devid_cache %p\n", __func__, clp->cl_devid_cache);
+	if (atomic_dec_and_lock(&local->dc_ref, &clp->cl_lock)) {
+		int i;
+		/* Verify cache is empty */
+		for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE; i++)
+			BUG_ON(!hlist_empty(&local->dc_deviceids[i]));
+		clp->cl_devid_cache = NULL;
+		spin_unlock(&clp->cl_lock);
+		kfree(local);
+	}
+}
+EXPORT_SYMBOL_GPL(pnfs_put_deviceid_cache);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
new file mode 100644
index 00000000000..e12367d5048
--- /dev/null
+++ b/fs/nfs/pnfs.h
@@ -0,0 +1,189 @@
+/*
+ *  pNFS client data structures.
+ *
+ *  Copyright (c) 2002
+ *  The Regents of the University of Michigan
+ *  All Rights Reserved
+ *
+ *  Dean Hildebrand <dhildebz@umich.edu>
+ *
+ *  Permission is granted to use, copy, create derivative works, and
+ *  redistribute this software and such derivative works for any purpose,
+ *  so long as the name of the University of Michigan is not used in
+ *  any advertising or publicity pertaining to the use or distribution
+ *  of this software without specific, written prior authorization. If
+ *  the above copyright notice or any other identification of the
+ *  University of Michigan is included in any copy of any portion of
+ *  this software, then the disclaimer below must also be included.
+ *
+ *  This software is provided as is, without representation or warranty
+ *  of any kind either express or implied, including without limitation
+ *  the implied warranties of merchantability, fitness for a particular
+ *  purpose, or noninfringement.  The Regents of the University of
+ *  Michigan shall not be liable for any damages, including special,
+ *  indirect, incidental, or consequential damages, with respect to any
+ *  claim arising out of or in connection with the use of the software,
+ *  even if it has been or is hereafter advised of the possibility of
+ *  such damages.
+ */
+
+#ifndef FS_NFS_PNFS_H
+#define FS_NFS_PNFS_H
+
+struct pnfs_layout_segment {
+	struct list_head fi_list;
+	struct pnfs_layout_range range;
+	struct kref kref;
+	struct pnfs_layout_hdr *layout;
+};
+
+#ifdef CONFIG_NFS_V4_1
+
+#define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4"
+
+enum {
+	NFS_LAYOUT_RO_FAILED = 0,	/* get ro layout failed stop trying */
+	NFS_LAYOUT_RW_FAILED,		/* get rw layout failed stop trying */
+	NFS_LAYOUT_STATEID_SET,		/* have a valid layout stateid */
+};
+
+/* Per-layout driver specific registration structure */
+struct pnfs_layoutdriver_type {
+	struct list_head pnfs_tblid;
+	const u32 id;
+	const char *name;
+	struct module *owner;
+	int (*set_layoutdriver) (struct nfs_server *);
+	int (*clear_layoutdriver) (struct nfs_server *);
+	struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr);
+	void (*free_lseg) (struct pnfs_layout_segment *lseg);
+};
+
+struct pnfs_layout_hdr {
+	unsigned long		refcount;
+	struct list_head	layouts;   /* other client layouts */
+	struct list_head	segs;      /* layout segments list */
+	seqlock_t		seqlock;   /* Protects the stateid */
+	nfs4_stateid		stateid;
+	unsigned long		state;
+	struct inode		*inode;
+};
+
+struct pnfs_device {
+	struct nfs4_deviceid dev_id;
+	unsigned int  layout_type;
+	unsigned int  mincount;
+	struct page **pages;
+	void          *area;
+	unsigned int  pgbase;
+	unsigned int  pglen;
+};
+
+/*
+ * Device ID RCU cache. A device ID is unique per client ID and layout type.
+ */
+#define NFS4_DEVICE_ID_HASH_BITS	5
+#define NFS4_DEVICE_ID_HASH_SIZE	(1 << NFS4_DEVICE_ID_HASH_BITS)
+#define NFS4_DEVICE_ID_HASH_MASK	(NFS4_DEVICE_ID_HASH_SIZE - 1)
+
+static inline u32
+nfs4_deviceid_hash(struct nfs4_deviceid *id)
+{
+	unsigned char *cptr = (unsigned char *)id->data;
+	unsigned int nbytes = NFS4_DEVICEID4_SIZE;
+	u32 x = 0;
+
+	while (nbytes--) {
+		x *= 37;
+		x += *cptr++;
+	}
+	return x & NFS4_DEVICE_ID_HASH_MASK;
+}
+
+struct pnfs_deviceid_node {
+	struct hlist_node	de_node;
+	struct nfs4_deviceid	de_id;
+	atomic_t		de_ref;
+};
+
+struct pnfs_deviceid_cache {
+	spinlock_t		dc_lock;
+	atomic_t		dc_ref;
+	void			(*dc_free_callback)(struct pnfs_deviceid_node *);
+	struct hlist_head	dc_deviceids[NFS4_DEVICE_ID_HASH_SIZE];
+};
+
+extern int pnfs_alloc_init_deviceid_cache(struct nfs_client *,
+			void (*free_callback)(struct pnfs_deviceid_node *));
+extern void pnfs_put_deviceid_cache(struct nfs_client *);
+extern struct pnfs_deviceid_node *pnfs_find_get_deviceid(
+				struct pnfs_deviceid_cache *,
+				struct nfs4_deviceid *);
+extern struct pnfs_deviceid_node *pnfs_add_deviceid(
+				struct pnfs_deviceid_cache *,
+				struct pnfs_deviceid_node *);
+extern void pnfs_put_deviceid(struct pnfs_deviceid_cache *c,
+			      struct pnfs_deviceid_node *devid);
+
+extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *);
+extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
+
+/* nfs4proc.c */
+extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
+				   struct pnfs_device *dev);
+extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp);
+
+/* pnfs.c */
+struct pnfs_layout_segment *
+pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
+		   enum pnfs_iomode access_type);
+void set_pnfs_layoutdriver(struct nfs_server *, u32 id);
+void unset_pnfs_layoutdriver(struct nfs_server *);
+int pnfs_layout_process(struct nfs4_layoutget *lgp);
+void pnfs_destroy_layout(struct nfs_inode *);
+void pnfs_destroy_all_layouts(struct nfs_client *);
+void put_layout_hdr(struct inode *inode);
+void pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
+			     struct nfs4_state *open_state);
+
+
+static inline int lo_fail_bit(u32 iomode)
+{
+	return iomode == IOMODE_RW ?
+			 NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED;
+}
+
+/* Return true if a layout driver is being used for this mountpoint */
+static inline int pnfs_enabled_sb(struct nfs_server *nfss)
+{
+	return nfss->pnfs_curr_ld != NULL;
+}
+
+#else  /* CONFIG_NFS_V4_1 */
+
+static inline void pnfs_destroy_all_layouts(struct nfs_client *clp)
+{
+}
+
+static inline void pnfs_destroy_layout(struct nfs_inode *nfsi)
+{
+}
+
+static inline struct pnfs_layout_segment *
+pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
+		   enum pnfs_iomode access_type)
+{
+	return NULL;
+}
+
+static inline void set_pnfs_layoutdriver(struct nfs_server *s, u32 id)
+{
+}
+
+static inline void unset_pnfs_layoutdriver(struct nfs_server *s)
+{
+}
+
+#endif /* CONFIG_NFS_V4_1 */
+
+#endif /* FS_NFS_PNFS_H */
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 611bec22f55..58e7f84fc1f 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -258,7 +258,7 @@ static void nfs_free_createdata(const struct nfs_createdata *data)
 
 static int
 nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
-		int flags, struct nameidata *nd)
+		int flags, struct nfs_open_context *ctx)
 {
 	struct nfs_createdata *data;
 	struct rpc_message msg = {
@@ -365,17 +365,32 @@ static int nfs_proc_unlink_done(struct rpc_task *task, struct inode *dir)
 	return 1;
 }
 
+static void
+nfs_proc_rename_setup(struct rpc_message *msg, struct inode *dir)
+{
+	msg->rpc_proc = &nfs_procedures[NFSPROC_RENAME];
+}
+
+static int
+nfs_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
+		     struct inode *new_dir)
+{
+	if (nfs_async_handle_expired_key(task))
+		return 0;
+	nfs_mark_for_revalidate(old_dir);
+	nfs_mark_for_revalidate(new_dir);
+	return 1;
+}
+
 static int
 nfs_proc_rename(struct inode *old_dir, struct qstr *old_name,
 		struct inode *new_dir, struct qstr *new_name)
 {
 	struct nfs_renameargs	arg = {
-		.fromfh		= NFS_FH(old_dir),
-		.fromname	= old_name->name,
-		.fromlen	= old_name->len,
-		.tofh		= NFS_FH(new_dir),
-		.toname		= new_name->name,
-		.tolen		= new_name->len
+		.old_dir	= NFS_FH(old_dir),
+		.old_name	= old_name,
+		.new_dir	= NFS_FH(new_dir),
+		.new_name	= new_name,
 	};
 	struct rpc_message msg = {
 		.rpc_proc	= &nfs_procedures[NFSPROC_RENAME],
@@ -519,14 +534,14 @@ nfs_proc_rmdir(struct inode *dir, struct qstr *name)
  */
 static int
 nfs_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
-		 u64 cookie, struct page *page, unsigned int count, int plus)
+		 u64 cookie, struct page **pages, unsigned int count, int plus)
 {
 	struct inode		*dir = dentry->d_inode;
 	struct nfs_readdirargs	arg = {
 		.fh		= NFS_FH(dir),
 		.cookie		= cookie,
 		.count		= count,
-		.pages		= &page,
+		.pages		= pages,
 	};
 	struct rpc_message	msg = {
 		.rpc_proc	= &nfs_procedures[NFSPROC_READDIR],
@@ -705,6 +720,8 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
 	.unlink_setup	= nfs_proc_unlink_setup,
 	.unlink_done	= nfs_proc_unlink_done,
 	.rename		= nfs_proc_rename,
+	.rename_setup	= nfs_proc_rename_setup,
+	.rename_done	= nfs_proc_rename_done,
 	.link		= nfs_proc_link,
 	.symlink	= nfs_proc_symlink,
 	.mkdir		= nfs_proc_mkdir,
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 6e2b06e6ca7..e4b62c6f5a6 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -25,6 +25,7 @@
 #include "internal.h"
 #include "iostat.h"
 #include "fscache.h"
+#include "pnfs.h"
 
 #define NFSDBG_FACILITY		NFSDBG_PAGECACHE
 
@@ -46,7 +47,6 @@ struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
 		memset(p, 0, sizeof(*p));
 		INIT_LIST_HEAD(&p->pages);
 		p->npages = pagecount;
-		p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
 		if (pagecount <= ARRAY_SIZE(p->page_array))
 			p->pagevec = p->page_array;
 		else {
@@ -121,6 +121,7 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
 	len = nfs_page_length(page);
 	if (len == 0)
 		return nfs_return_empty_page(page);
+	pnfs_update_layout(inode, ctx, IOMODE_READ);
 	new = nfs_create_request(ctx, inode, page, 0, len);
 	if (IS_ERR(new)) {
 		unlock_page(page);
@@ -190,6 +191,7 @@ static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
 	data->args.pages  = data->pagevec;
 	data->args.count  = count;
 	data->args.context = get_nfs_open_context(req->wb_context);
+	data->args.lock_context = req->wb_lock_context;
 
 	data->res.fattr   = &data->fattr;
 	data->res.count   = count;
@@ -410,7 +412,7 @@ void nfs_read_prepare(struct rpc_task *task, void *calldata)
 {
 	struct nfs_read_data *data = calldata;
 
-	if (nfs4_setup_sequence(NFS_SERVER(data->inode)->nfs_client,
+	if (nfs4_setup_sequence(NFS_SERVER(data->inode),
 				&data->args.seq_args, &data->res.seq_res,
 				0, task))
 		return;
@@ -624,6 +626,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
 	if (ret == 0)
 		goto read_complete; /* all pages were read */
 
+	pnfs_update_layout(inode, desc.ctx, IOMODE_READ);
 	if (rsize < PAGE_CACHE_SIZE)
 		nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0);
 	else
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index f9df16de4a5..0a42e8f4adc 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -100,6 +100,7 @@ enum {
 	Opt_addr, Opt_mountaddr, Opt_clientaddr,
 	Opt_lookupcache,
 	Opt_fscache_uniq,
+	Opt_local_lock,
 
 	/* Special mount options */
 	Opt_userspace, Opt_deprecated, Opt_sloppy,
@@ -171,6 +172,7 @@ static const match_table_t nfs_mount_option_tokens = {
 
 	{ Opt_lookupcache, "lookupcache=%s" },
 	{ Opt_fscache_uniq, "fsc=%s" },
+	{ Opt_local_lock, "local_lock=%s" },
 
 	{ Opt_err, NULL }
 };
@@ -236,14 +238,30 @@ static match_table_t nfs_lookupcache_tokens = {
 	{ Opt_lookupcache_err, NULL }
 };
 
+enum {
+	Opt_local_lock_all, Opt_local_lock_flock, Opt_local_lock_posix,
+	Opt_local_lock_none,
+
+	Opt_local_lock_err
+};
+
+static match_table_t nfs_local_lock_tokens = {
+	{ Opt_local_lock_all, "all" },
+	{ Opt_local_lock_flock, "flock" },
+	{ Opt_local_lock_posix, "posix" },
+	{ Opt_local_lock_none, "none" },
+
+	{ Opt_local_lock_err, NULL }
+};
+
 
 static void nfs_umount_begin(struct super_block *);
 static int  nfs_statfs(struct dentry *, struct kstatfs *);
 static int  nfs_show_options(struct seq_file *, struct vfsmount *);
 static int  nfs_show_stats(struct seq_file *, struct vfsmount *);
 static int nfs_get_sb(struct file_system_type *, int, const char *, void *, struct vfsmount *);
-static int nfs_xdev_get_sb(struct file_system_type *fs_type,
-		int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
+static struct dentry *nfs_xdev_mount(struct file_system_type *fs_type,
+		int flags, const char *dev_name, void *raw_data);
 static void nfs_put_super(struct super_block *);
 static void nfs_kill_super(struct super_block *);
 static int nfs_remount(struct super_block *sb, int *flags, char *raw_data);
@@ -259,7 +277,7 @@ static struct file_system_type nfs_fs_type = {
 struct file_system_type nfs_xdev_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "nfs",
-	.get_sb		= nfs_xdev_get_sb,
+	.mount		= nfs_xdev_mount,
 	.kill_sb	= nfs_kill_super,
 	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
 };
@@ -270,7 +288,7 @@ static const struct super_operations nfs_sops = {
 	.write_inode	= nfs_write_inode,
 	.put_super	= nfs_put_super,
 	.statfs		= nfs_statfs,
-	.clear_inode	= nfs_clear_inode,
+	.evict_inode	= nfs_evict_inode,
 	.umount_begin	= nfs_umount_begin,
 	.show_options	= nfs_show_options,
 	.show_stats	= nfs_show_stats,
@@ -284,14 +302,14 @@ static int nfs4_try_mount(int flags, const char *dev_name,
 	struct nfs_parsed_mount_data *data, struct vfsmount *mnt);
 static int nfs4_get_sb(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
-static int nfs4_remote_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
-static int nfs4_xdev_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
+static struct dentry *nfs4_remote_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *raw_data);
+static struct dentry *nfs4_xdev_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *raw_data);
 static int nfs4_referral_get_sb(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
-static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
+static struct dentry *nfs4_remote_referral_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *raw_data);
 static void nfs4_kill_super(struct super_block *sb);
 
 static struct file_system_type nfs4_fs_type = {
@@ -305,7 +323,7 @@ static struct file_system_type nfs4_fs_type = {
 static struct file_system_type nfs4_remote_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "nfs4",
-	.get_sb		= nfs4_remote_get_sb,
+	.mount		= nfs4_remote_mount,
 	.kill_sb	= nfs4_kill_super,
 	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
 };
@@ -313,7 +331,7 @@ static struct file_system_type nfs4_remote_fs_type = {
 struct file_system_type nfs4_xdev_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "nfs4",
-	.get_sb		= nfs4_xdev_get_sb,
+	.mount		= nfs4_xdev_mount,
 	.kill_sb	= nfs4_kill_super,
 	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
 };
@@ -321,7 +339,7 @@ struct file_system_type nfs4_xdev_fs_type = {
 static struct file_system_type nfs4_remote_referral_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "nfs4",
-	.get_sb		= nfs4_remote_referral_get_sb,
+	.mount		= nfs4_remote_referral_mount,
 	.kill_sb	= nfs4_kill_super,
 	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
 };
@@ -340,7 +358,7 @@ static const struct super_operations nfs4_sops = {
 	.write_inode	= nfs_write_inode,
 	.put_super	= nfs_put_super,
 	.statfs		= nfs_statfs,
-	.clear_inode	= nfs4_clear_inode,
+	.evict_inode	= nfs4_evict_inode,
 	.umount_begin	= nfs_umount_begin,
 	.show_options	= nfs_show_options,
 	.show_stats	= nfs_show_stats,
@@ -431,7 +449,15 @@ static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 		goto out_err;
 
 	error = server->nfs_client->rpc_ops->statfs(server, fh, &res);
+	if (unlikely(error == -ESTALE)) {
+		struct dentry *pd_dentry;
 
+		pd_dentry = dget_parent(dentry);
+		if (pd_dentry != NULL) {
+			nfs_zap_caches(pd_dentry->d_inode);
+			dput(pd_dentry);
+		}
+	}
 	nfs_free_fattr(res.fattr);
 	if (error < 0)
 		goto out_err;
@@ -546,6 +572,9 @@ static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss,
 {
 	struct sockaddr *sap = (struct sockaddr *)&nfss->mountd_address;
 
+	if (nfss->flags & NFS_MOUNT_LEGACY_INTERFACE)
+		return;
+
 	switch (sap->sa_family) {
 	case AF_INET: {
 		struct sockaddr_in *sin = (struct sockaddr_in *)sap;
@@ -611,6 +640,7 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
 	const struct proc_nfs_info *nfs_infop;
 	struct nfs_client *clp = nfss->nfs_client;
 	u32 version = clp->rpc_ops->version;
+	int local_flock, local_fcntl;
 
 	seq_printf(m, ",vers=%u", version);
 	seq_printf(m, ",rsize=%u", nfss->rsize);
@@ -652,6 +682,25 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
 
 	if (nfss->options & NFS_OPTION_FSCACHE)
 		seq_printf(m, ",fsc");
+
+	if (nfss->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG) {
+		if (nfss->flags & NFS_MOUNT_LOOKUP_CACHE_NONE)
+			seq_printf(m, ",lookupcache=none");
+		else
+			seq_printf(m, ",lookupcache=pos");
+	}
+
+	local_flock = nfss->flags & NFS_MOUNT_LOCAL_FLOCK;
+	local_fcntl = nfss->flags & NFS_MOUNT_LOCAL_FCNTL;
+
+	if (!local_flock && !local_fcntl)
+		seq_printf(m, ",local_lock=none");
+	else if (local_flock && local_fcntl)
+		seq_printf(m, ",local_lock=all");
+	else if (local_flock)
+		seq_printf(m, ",local_lock=flock");
+	else
+		seq_printf(m, ",local_lock=posix");
 }
 
 /*
@@ -999,9 +1048,13 @@ static int nfs_parse_mount_options(char *raw,
 			break;
 		case Opt_lock:
 			mnt->flags &= ~NFS_MOUNT_NONLM;
+			mnt->flags &= ~(NFS_MOUNT_LOCAL_FLOCK |
+					NFS_MOUNT_LOCAL_FCNTL);
 			break;
 		case Opt_nolock:
 			mnt->flags |= NFS_MOUNT_NONLM;
+			mnt->flags |= (NFS_MOUNT_LOCAL_FLOCK |
+				       NFS_MOUNT_LOCAL_FCNTL);
 			break;
 		case Opt_v2:
 			mnt->flags &= ~NFS_MOUNT_VER3;
@@ -1402,6 +1455,34 @@ static int nfs_parse_mount_options(char *raw,
 			mnt->fscache_uniq = string;
 			mnt->options |= NFS_OPTION_FSCACHE;
 			break;
+		case Opt_local_lock:
+			string = match_strdup(args);
+			if (string == NULL)
+				goto out_nomem;
+			token = match_token(string, nfs_local_lock_tokens,
+					args);
+			kfree(string);
+			switch (token) {
+			case Opt_local_lock_all:
+				mnt->flags |= (NFS_MOUNT_LOCAL_FLOCK |
+					       NFS_MOUNT_LOCAL_FCNTL);
+				break;
+			case Opt_local_lock_flock:
+				mnt->flags |= NFS_MOUNT_LOCAL_FLOCK;
+				break;
+			case Opt_local_lock_posix:
+				mnt->flags |= NFS_MOUNT_LOCAL_FCNTL;
+				break;
+			case Opt_local_lock_none:
+				mnt->flags &= ~(NFS_MOUNT_LOCAL_FLOCK |
+						NFS_MOUNT_LOCAL_FCNTL);
+				break;
+			default:
+				dfprintk(MOUNT, "NFS:	invalid	"
+						"local_lock argument\n");
+				return 0;
+			};
+			break;
 
 		/*
 		 * Special options
@@ -1780,6 +1861,7 @@ static int nfs_validate_mount_data(void *options,
 		 * can deal with.
 		 */
 		args->flags		= data->flags & NFS_MOUNT_FLAGMASK;
+		args->flags		|= NFS_MOUNT_LEGACY_INTERFACE;
 		args->rsize		= data->rsize;
 		args->wsize		= data->wsize;
 		args->timeo		= data->timeo;
@@ -1806,6 +1888,12 @@ static int nfs_validate_mount_data(void *options,
 		if (!args->nfs_server.hostname)
 			goto out_nomem;
 
+		if (!(data->flags & NFS_MOUNT_NONLM))
+			args->flags &= ~(NFS_MOUNT_LOCAL_FLOCK|
+					 NFS_MOUNT_LOCAL_FCNTL);
+		else
+			args->flags |= (NFS_MOUNT_LOCAL_FLOCK|
+					NFS_MOUNT_LOCAL_FCNTL);
 		/*
 		 * The legacy version 6 binary mount data from userspace has a
 		 * field used only to transport selinux information into the
@@ -2309,9 +2397,9 @@ static void nfs_kill_super(struct super_block *s)
 /*
  * Clone an NFS2/3 server record on xdev traversal (FSID-change)
  */
-static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags,
-			   const char *dev_name, void *raw_data,
-			   struct vfsmount *mnt)
+static struct dentry *
+nfs_xdev_mount(struct file_system_type *fs_type, int flags,
+		const char *dev_name, void *raw_data)
 {
 	struct nfs_clone_mount *data = raw_data;
 	struct super_block *s;
@@ -2323,7 +2411,7 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags,
 	};
 	int error;
 
-	dprintk("--> nfs_xdev_get_sb()\n");
+	dprintk("--> nfs_xdev_mount()\n");
 
 	/* create a new volume representation */
 	server = nfs_clone_server(NFS_SB(data->sb), data->fh, data->fattr);
@@ -2370,28 +2458,26 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags,
 	}
 
 	s->s_flags |= MS_ACTIVE;
-	mnt->mnt_sb = s;
-	mnt->mnt_root = mntroot;
 
 	/* clone any lsm security options from the parent to the new sb */
 	security_sb_clone_mnt_opts(data->sb, s);
 
-	dprintk("<-- nfs_xdev_get_sb() = 0\n");
-	return 0;
+	dprintk("<-- nfs_xdev_mount() = 0\n");
+	return mntroot;
 
 out_err_nosb:
 	nfs_free_server(server);
 out_err_noserver:
-	dprintk("<-- nfs_xdev_get_sb() = %d [error]\n", error);
-	return error;
+	dprintk("<-- nfs_xdev_mount() = %d [error]\n", error);
+	return ERR_PTR(error);
 
 error_splat_super:
 	if (server && !s->s_root)
 		bdi_unregister(&server->backing_dev_info);
 error_splat_bdi:
 	deactivate_locked_super(s);
-	dprintk("<-- nfs_xdev_get_sb() = %d [splat]\n", error);
-	return error;
+	dprintk("<-- nfs_xdev_mount() = %d [splat]\n", error);
+	return ERR_PTR(error);
 }
 
 #ifdef CONFIG_NFS_V4
@@ -2422,7 +2508,8 @@ static void nfs4_fill_super(struct super_block *sb)
 
 static void nfs4_validate_mount_flags(struct nfs_parsed_mount_data *args)
 {
-	args->flags &= ~(NFS_MOUNT_NONLM|NFS_MOUNT_NOACL|NFS_MOUNT_VER3);
+	args->flags &= ~(NFS_MOUNT_NONLM|NFS_MOUNT_NOACL|NFS_MOUNT_VER3|
+			 NFS_MOUNT_LOCAL_FLOCK|NFS_MOUNT_LOCAL_FCNTL);
 }
 
 static int nfs4_validate_text_mount_data(void *options,
@@ -2560,8 +2647,9 @@ out_no_address:
 /*
  * Get the superblock for the NFS4 root partition
  */
-static int nfs4_remote_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
+static struct dentry *
+nfs4_remote_mount(struct file_system_type *fs_type, int flags,
+		  const char *dev_name, void *raw_data)
 {
 	struct nfs_parsed_mount_data *data = raw_data;
 	struct super_block *s;
@@ -2625,15 +2713,16 @@ static int nfs4_remote_get_sb(struct file_system_type *fs_type,
 		goto error_splat_root;
 
 	s->s_flags |= MS_ACTIVE;
-	mnt->mnt_sb = s;
-	mnt->mnt_root = mntroot;
-	error = 0;
+
+	security_free_mnt_opts(&data->lsm_opts);
+	nfs_free_fhandle(mntfh);
+	return mntroot;
 
 out:
 	security_free_mnt_opts(&data->lsm_opts);
 out_free_fh:
 	nfs_free_fhandle(mntfh);
-	return error;
+	return ERR_PTR(error);
 
 out_free:
 	nfs_free_server(server);
@@ -2879,9 +2968,9 @@ static void nfs4_kill_super(struct super_block *sb)
 /*
  * Clone an NFS4 server record on xdev traversal (FSID-change)
  */
-static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags,
-			    const char *dev_name, void *raw_data,
-			    struct vfsmount *mnt)
+static struct dentry *
+nfs4_xdev_mount(struct file_system_type *fs_type, int flags,
+		 const char *dev_name, void *raw_data)
 {
 	struct nfs_clone_mount *data = raw_data;
 	struct super_block *s;
@@ -2893,7 +2982,7 @@ static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags,
 	};
 	int error;
 
-	dprintk("--> nfs4_xdev_get_sb()\n");
+	dprintk("--> nfs4_xdev_mount()\n");
 
 	/* create a new volume representation */
 	server = nfs_clone_server(NFS_SB(data->sb), data->fh, data->fattr);
@@ -2940,32 +3029,30 @@ static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags,
 	}
 
 	s->s_flags |= MS_ACTIVE;
-	mnt->mnt_sb = s;
-	mnt->mnt_root = mntroot;
 
 	security_sb_clone_mnt_opts(data->sb, s);
 
-	dprintk("<-- nfs4_xdev_get_sb() = 0\n");
-	return 0;
+	dprintk("<-- nfs4_xdev_mount() = 0\n");
+	return mntroot;
 
 out_err_nosb:
 	nfs_free_server(server);
 out_err_noserver:
-	dprintk("<-- nfs4_xdev_get_sb() = %d [error]\n", error);
-	return error;
+	dprintk("<-- nfs4_xdev_mount() = %d [error]\n", error);
+	return ERR_PTR(error);
 
 error_splat_super:
 	if (server && !s->s_root)
 		bdi_unregister(&server->backing_dev_info);
 error_splat_bdi:
 	deactivate_locked_super(s);
-	dprintk("<-- nfs4_xdev_get_sb() = %d [splat]\n", error);
-	return error;
+	dprintk("<-- nfs4_xdev_mount() = %d [splat]\n", error);
+	return ERR_PTR(error);
 }
 
-static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
-		int flags, const char *dev_name, void *raw_data,
-		struct vfsmount *mnt)
+static struct dentry *
+nfs4_remote_referral_mount(struct file_system_type *fs_type, int flags,
+			   const char *dev_name, void *raw_data)
 {
 	struct nfs_clone_mount *data = raw_data;
 	struct super_block *s;
@@ -3029,14 +3116,12 @@ static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
 	}
 
 	s->s_flags |= MS_ACTIVE;
-	mnt->mnt_sb = s;
-	mnt->mnt_root = mntroot;
 
 	security_sb_clone_mnt_opts(data->sb, s);
 
 	nfs_free_fhandle(mntfh);
 	dprintk("<-- nfs4_referral_get_sb() = 0\n");
-	return 0;
+	return mntroot;
 
 out_err_nosb:
 	nfs_free_server(server);
@@ -3044,7 +3129,7 @@ out_err_noserver:
 	nfs_free_fhandle(mntfh);
 out_err_nofh:
 	dprintk("<-- nfs4_referral_get_sb() = %d [error]\n", error);
-	return error;
+	return ERR_PTR(error);
 
 error_splat_super:
 	if (server && !s->s_root)
@@ -3053,7 +3138,7 @@ error_splat_bdi:
 	deactivate_locked_super(s);
 	nfs_free_fhandle(mntfh);
 	dprintk("<-- nfs4_referral_get_sb() = %d [splat]\n", error);
-	return error;
+	return ERR_PTR(error);
 }
 
 /*
diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c
index ad4d2e787b2..978aaeb8a09 100644
--- a/fs/nfs/sysctl.c
+++ b/fs/nfs/sysctl.c
@@ -32,6 +32,7 @@ static ctl_table nfs_cb_sysctls[] = {
 		.extra1 = (int *)&nfs_set_port_min,
 		.extra2 = (int *)&nfs_set_port_max,
 	},
+#ifndef CONFIG_NFS_USE_NEW_IDMAPPER
 	{
 		.procname = "idmap_cache_timeout",
 		.data = &nfs_idmap_cache_timeout,
@@ -39,6 +40,7 @@ static ctl_table nfs_cb_sysctls[] = {
 		.mode = 0644,
 		.proc_handler = proc_dointvec_jiffies,
 	},
+#endif /* CONFIG_NFS_USE_NEW_IDMAPPER */
 #endif
 	{
 		.procname	= "nfs_mountpoint_timeout",
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index a2242af6a17..7bdec853140 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -13,9 +13,12 @@
 #include <linux/nfs_fs.h>
 #include <linux/sched.h>
 #include <linux/wait.h>
+#include <linux/namei.h>
 
 #include "internal.h"
 #include "nfs4_fs.h"
+#include "iostat.h"
+#include "delegation.h"
 
 struct nfs_unlinkdata {
 	struct hlist_node list;
@@ -110,7 +113,7 @@ void nfs_unlink_prepare(struct rpc_task *task, void *calldata)
 	struct nfs_unlinkdata *data = calldata;
 	struct nfs_server *server = NFS_SERVER(data->dir);
 
-	if (nfs4_setup_sequence(server->nfs_client, &data->args.seq_args,
+	if (nfs4_setup_sequence(server, &data->args.seq_args,
 				&data->res.seq_res, 1, task))
 		return;
 	rpc_call_start(task);
@@ -244,7 +247,7 @@ void nfs_unblock_sillyrename(struct dentry *dentry)
  * @dir: parent directory of dentry
  * @dentry: dentry to unlink
  */
-int
+static int
 nfs_async_unlink(struct inode *dir, struct dentry *dentry)
 {
 	struct nfs_unlinkdata *data;
@@ -259,7 +262,6 @@ nfs_async_unlink(struct inode *dir, struct dentry *dentry)
 		status = PTR_ERR(data->cred);
 		goto out_free;
 	}
-	data->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
 	data->res.dir_attr = &data->dir_attr;
 
 	status = -EBUSY;
@@ -303,3 +305,256 @@ nfs_complete_unlink(struct dentry *dentry, struct inode *inode)
 	if (data != NULL && (NFS_STALE(inode) || !nfs_call_unlink(dentry, data)))
 		nfs_free_unlinkdata(data);
 }
+
+/* Cancel a queued async unlink. Called when a sillyrename run fails. */
+static void
+nfs_cancel_async_unlink(struct dentry *dentry)
+{
+	spin_lock(&dentry->d_lock);
+	if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
+		struct nfs_unlinkdata *data = dentry->d_fsdata;
+
+		dentry->d_flags &= ~DCACHE_NFSFS_RENAMED;
+		spin_unlock(&dentry->d_lock);
+		nfs_free_unlinkdata(data);
+		return;
+	}
+	spin_unlock(&dentry->d_lock);
+}
+
+struct nfs_renamedata {
+	struct nfs_renameargs	args;
+	struct nfs_renameres	res;
+	struct rpc_cred		*cred;
+	struct inode		*old_dir;
+	struct dentry		*old_dentry;
+	struct nfs_fattr	old_fattr;
+	struct inode		*new_dir;
+	struct dentry		*new_dentry;
+	struct nfs_fattr	new_fattr;
+};
+
+/**
+ * nfs_async_rename_done - Sillyrename post-processing
+ * @task: rpc_task of the sillyrename
+ * @calldata: nfs_renamedata for the sillyrename
+ *
+ * Do the directory attribute updates and the d_move
+ */
+static void nfs_async_rename_done(struct rpc_task *task, void *calldata)
+{
+	struct nfs_renamedata *data = calldata;
+	struct inode *old_dir = data->old_dir;
+	struct inode *new_dir = data->new_dir;
+
+	if (!NFS_PROTO(old_dir)->rename_done(task, old_dir, new_dir)) {
+		nfs_restart_rpc(task, NFS_SERVER(old_dir)->nfs_client);
+		return;
+	}
+
+	if (task->tk_status != 0) {
+		nfs_cancel_async_unlink(data->old_dentry);
+		return;
+	}
+
+	nfs_set_verifier(data->old_dentry, nfs_save_change_attribute(old_dir));
+	d_move(data->old_dentry, data->new_dentry);
+}
+
+/**
+ * nfs_async_rename_release - Release the sillyrename data.
+ * @calldata: the struct nfs_renamedata to be released
+ */
+static void nfs_async_rename_release(void *calldata)
+{
+	struct nfs_renamedata	*data = calldata;
+	struct super_block *sb = data->old_dir->i_sb;
+
+	if (data->old_dentry->d_inode)
+		nfs_mark_for_revalidate(data->old_dentry->d_inode);
+
+	dput(data->old_dentry);
+	dput(data->new_dentry);
+	iput(data->old_dir);
+	iput(data->new_dir);
+	nfs_sb_deactive(sb);
+	put_rpccred(data->cred);
+	kfree(data);
+}
+
+#if defined(CONFIG_NFS_V4_1)
+static void nfs_rename_prepare(struct rpc_task *task, void *calldata)
+{
+	struct nfs_renamedata *data = calldata;
+	struct nfs_server *server = NFS_SERVER(data->old_dir);
+
+	if (nfs4_setup_sequence(server, &data->args.seq_args,
+				&data->res.seq_res, 1, task))
+		return;
+	rpc_call_start(task);
+}
+#endif /* CONFIG_NFS_V4_1 */
+
+static const struct rpc_call_ops nfs_rename_ops = {
+	.rpc_call_done = nfs_async_rename_done,
+	.rpc_release = nfs_async_rename_release,
+#if defined(CONFIG_NFS_V4_1)
+	.rpc_call_prepare = nfs_rename_prepare,
+#endif /* CONFIG_NFS_V4_1 */
+};
+
+/**
+ * nfs_async_rename - perform an asynchronous rename operation
+ * @old_dir: directory that currently holds the dentry to be renamed
+ * @new_dir: target directory for the rename
+ * @old_dentry: original dentry to be renamed
+ * @new_dentry: dentry to which the old_dentry should be renamed
+ *
+ * It's expected that valid references to the dentries and inodes are held
+ */
+static struct rpc_task *
+nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
+		 struct dentry *old_dentry, struct dentry *new_dentry)
+{
+	struct nfs_renamedata *data;
+	struct rpc_message msg = { };
+	struct rpc_task_setup task_setup_data = {
+		.rpc_message = &msg,
+		.callback_ops = &nfs_rename_ops,
+		.workqueue = nfsiod_workqueue,
+		.rpc_client = NFS_CLIENT(old_dir),
+		.flags = RPC_TASK_ASYNC,
+	};
+
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (data == NULL)
+		return ERR_PTR(-ENOMEM);
+	task_setup_data.callback_data = data,
+
+	data->cred = rpc_lookup_cred();
+	if (IS_ERR(data->cred)) {
+		struct rpc_task *task = ERR_CAST(data->cred);
+		kfree(data);
+		return task;
+	}
+
+	msg.rpc_argp = &data->args;
+	msg.rpc_resp = &data->res;
+	msg.rpc_cred = data->cred;
+
+	/* set up nfs_renamedata */
+	data->old_dir = old_dir;
+	ihold(old_dir);
+	data->new_dir = new_dir;
+	ihold(new_dir);
+	data->old_dentry = dget(old_dentry);
+	data->new_dentry = dget(new_dentry);
+	nfs_fattr_init(&data->old_fattr);
+	nfs_fattr_init(&data->new_fattr);
+
+	/* set up nfs_renameargs */
+	data->args.old_dir = NFS_FH(old_dir);
+	data->args.old_name = &old_dentry->d_name;
+	data->args.new_dir = NFS_FH(new_dir);
+	data->args.new_name = &new_dentry->d_name;
+
+	/* set up nfs_renameres */
+	data->res.old_fattr = &data->old_fattr;
+	data->res.new_fattr = &data->new_fattr;
+
+	nfs_sb_active(old_dir->i_sb);
+
+	NFS_PROTO(data->old_dir)->rename_setup(&msg, old_dir);
+
+	return rpc_run_task(&task_setup_data);
+}
+
+/**
+ * nfs_sillyrename - Perform a silly-rename of a dentry
+ * @dir: inode of directory that contains dentry
+ * @dentry: dentry to be sillyrenamed
+ *
+ * NFSv2/3 is stateless and the server doesn't know when the client is
+ * holding a file open. To prevent application problems when a file is
+ * unlinked while it's still open, the client performs a "silly-rename".
+ * That is, it renames the file to a hidden file in the same directory,
+ * and only performs the unlink once the last reference to it is put.
+ *
+ * The final cleanup is done during dentry_iput.
+ */
+int
+nfs_sillyrename(struct inode *dir, struct dentry *dentry)
+{
+	static unsigned int sillycounter;
+	const int      fileidsize  = sizeof(NFS_FILEID(dentry->d_inode))*2;
+	const int      countersize = sizeof(sillycounter)*2;
+	const int      slen        = sizeof(".nfs")+fileidsize+countersize-1;
+	char           silly[slen+1];
+	struct dentry *sdentry;
+	struct rpc_task *task;
+	int            error = -EIO;
+
+	dfprintk(VFS, "NFS: silly-rename(%s/%s, ct=%d)\n",
+		dentry->d_parent->d_name.name, dentry->d_name.name,
+		atomic_read(&dentry->d_count));
+	nfs_inc_stats(dir, NFSIOS_SILLYRENAME);
+
+	/*
+	 * We don't allow a dentry to be silly-renamed twice.
+	 */
+	error = -EBUSY;
+	if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
+		goto out;
+
+	sprintf(silly, ".nfs%*.*Lx",
+		fileidsize, fileidsize,
+		(unsigned long long)NFS_FILEID(dentry->d_inode));
+
+	/* Return delegation in anticipation of the rename */
+	nfs_inode_return_delegation(dentry->d_inode);
+
+	sdentry = NULL;
+	do {
+		char *suffix = silly + slen - countersize;
+
+		dput(sdentry);
+		sillycounter++;
+		sprintf(suffix, "%*.*x", countersize, countersize, sillycounter);
+
+		dfprintk(VFS, "NFS: trying to rename %s to %s\n",
+				dentry->d_name.name, silly);
+
+		sdentry = lookup_one_len(silly, dentry->d_parent, slen);
+		/*
+		 * N.B. Better to return EBUSY here ... it could be
+		 * dangerous to delete the file while it's in use.
+		 */
+		if (IS_ERR(sdentry))
+			goto out;
+	} while (sdentry->d_inode != NULL); /* need negative lookup */
+
+	/* queue unlink first. Can't do this from rpc_release as it
+	 * has to allocate memory
+	 */
+	error = nfs_async_unlink(dir, dentry);
+	if (error)
+		goto out_dput;
+
+	/* run the rename task, undo unlink if it fails */
+	task = nfs_async_rename(dir, dir, dentry, sdentry);
+	if (IS_ERR(task)) {
+		error = -EBUSY;
+		nfs_cancel_async_unlink(dentry);
+		goto out_dput;
+	}
+
+	/* wait for the RPC task to complete, unless a SIGKILL intervenes */
+	error = rpc_wait_for_completion_task(task);
+	if (error == 0)
+		error = task->tk_status;
+	rpc_put_task(task);
+out_dput:
+	dput(sdentry);
+out:
+	return error;
+}
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 9f81bdd91c5..4c14c17a527 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -55,7 +55,6 @@ struct nfs_write_data *nfs_commitdata_alloc(void)
 	if (p) {
 		memset(p, 0, sizeof(*p));
 		INIT_LIST_HEAD(&p->pages);
-		p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
 	}
 	return p;
 }
@@ -75,7 +74,6 @@ struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount)
 		memset(p, 0, sizeof(*p));
 		INIT_LIST_HEAD(&p->pages);
 		p->npages = pagecount;
-		p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
 		if (pagecount <= ARRAY_SIZE(p->page_array))
 			p->pagevec = p->page_array;
 		else {
@@ -292,9 +290,7 @@ static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, st
 	nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1);
 
 	nfs_pageio_cond_complete(pgio, page->index);
-	ret = nfs_page_async_flush(pgio, page,
-			wbc->sync_mode == WB_SYNC_NONE ||
-			wbc->nonblocking != 0);
+	ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE);
 	if (ret == -EAGAIN) {
 		redirty_page_for_writepage(wbc, page);
 		ret = 0;
@@ -700,7 +696,9 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
 		req = nfs_page_find_request(page);
 		if (req == NULL)
 			return 0;
-		do_flush = req->wb_page != page || req->wb_context != ctx;
+		do_flush = req->wb_page != page || req->wb_context != ctx ||
+			req->wb_lock_context->lockowner != current->files ||
+			req->wb_lock_context->pid != current->tgid;
 		nfs_release_request(req);
 		if (!do_flush)
 			return 0;
@@ -824,6 +822,7 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
 	data->args.pages  = data->pagevec;
 	data->args.count  = count;
 	data->args.context = get_nfs_open_context(req->wb_context);
+	data->args.lock_context = req->wb_lock_context;
 	data->args.stable  = NFS_UNSTABLE;
 	if (how & FLUSH_STABLE) {
 		data->args.stable = NFS_DATA_SYNC;
@@ -1047,9 +1046,9 @@ out:
 void nfs_write_prepare(struct rpc_task *task, void *calldata)
 {
 	struct nfs_write_data *data = calldata;
-	struct nfs_client *clp = (NFS_SERVER(data->inode))->nfs_client;
 
-	if (nfs4_setup_sequence(clp, &data->args.seq_args,
+	if (nfs4_setup_sequence(NFS_SERVER(data->inode),
+				&data->args.seq_args,
 				&data->res.seq_res, 1, task))
 		return;
 	rpc_call_start(task);
@@ -1430,15 +1429,17 @@ static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_contr
 	int flags = FLUSH_SYNC;
 	int ret = 0;
 
-	/* Don't commit yet if this is a non-blocking flush and there are
-	 * lots of outstanding writes for this mapping.
-	 */
-	if (wbc->sync_mode == WB_SYNC_NONE &&
-	    nfsi->ncommit <= (nfsi->npages >> 1))
-		goto out_mark_dirty;
+	if (wbc->sync_mode == WB_SYNC_NONE) {
+		/* Don't commit yet if this is a non-blocking flush and there
+		 * are a lot of outstanding writes for this mapping.
+		 */
+		if (nfsi->ncommit <= (nfsi->npages >> 1))
+			goto out_mark_dirty;
 
-	if (wbc->nonblocking || wbc->for_background)
+		/* don't wait for the COMMIT response */
 		flags = 0;
+	}
+
 	ret = nfs_commit_inode(inode, flags);
 	if (ret >= 0) {
 		if (wbc->sync_mode == WB_SYNC_NONE) {
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 503b9da159a..18b3e8975fe 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -28,6 +28,18 @@ config NFSD
 
 	  If unsure, say N.
 
+config NFSD_DEPRECATED
+	bool "Include support for deprecated syscall interface to NFSD"
+	depends on NFSD
+	default y
+	help
+	  The syscall interface to nfsd was obsoleted in 2.6.0 by a new
+	  filesystem based interface.  The old interface is due for removal
+	  in 2.6.40.  If you wish to remove the interface before then
+	  say N.
+
+	  In unsure, say Y.
+
 config NFSD_V2_ACL
 	bool
 	depends on NFSD
@@ -69,7 +81,7 @@ config NFSD_V4
 	depends on NFSD && PROC_FS && EXPERIMENTAL
 	select NFSD_V3
 	select FS_POSIX_ACL
-	select RPCSEC_GSS_KRB5
+	select SUNRPC_GSS
 	help
 	  This option enables support in your system's NFS server for
 	  version 4 of the NFS protocol (RFC 3530).
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index c2a4f71d87d..c0fcb7ab7f6 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -28,9 +28,6 @@
 typedef struct auth_domain	svc_client;
 typedef struct svc_export	svc_export;
 
-static void		exp_do_unexport(svc_export *unexp);
-static int		exp_verify_string(char *cp, int max);
-
 /*
  * We have two caches.
  * One maps client+vfsmnt+dentry to export options - the export map
@@ -802,6 +799,7 @@ exp_find_key(svc_client *clp, int fsid_type, u32 *fsidv, struct cache_req *reqp)
 	return ek;
 }
 
+#ifdef CONFIG_NFSD_DEPRECATED
 static int exp_set_key(svc_client *clp, int fsid_type, u32 *fsidv,
 		       struct svc_export *exp)
 {
@@ -852,6 +850,7 @@ exp_get_fsid_key(svc_client *clp, int fsid)
 
 	return exp_find_key(clp, FSID_NUM, fsidv, NULL);
 }
+#endif
 
 static svc_export *exp_get_by_name(svc_client *clp, const struct path *path,
 				     struct cache_req *reqp)
@@ -893,6 +892,7 @@ static struct svc_export *exp_parent(svc_client *clp, struct path *path)
 	return exp;
 }
 
+#ifdef CONFIG_NFSD_DEPRECATED
 /*
  * Hashtable locking. Write locks are placed only by user processes
  * wanting to modify export information.
@@ -925,6 +925,19 @@ exp_writeunlock(void)
 {
 	up_write(&hash_sem);
 }
+#else
+
+/* hash_sem not needed once deprecated interface is removed */
+void exp_readlock(void) {}
+static inline void exp_writelock(void){}
+void exp_readunlock(void) {}
+static inline void exp_writeunlock(void){}
+
+#endif
+
+#ifdef CONFIG_NFSD_DEPRECATED
+static void		exp_do_unexport(svc_export *unexp);
+static int		exp_verify_string(char *cp, int max);
 
 static void exp_fsid_unhash(struct svc_export *exp)
 {
@@ -935,10 +948,9 @@ static void exp_fsid_unhash(struct svc_export *exp)
 
 	ek = exp_get_fsid_key(exp->ex_client, exp->ex_fsid);
 	if (!IS_ERR(ek)) {
-		ek->h.expiry_time = get_seconds()-1;
+		sunrpc_invalidate(&ek->h, &svc_expkey_cache);
 		cache_put(&ek->h, &svc_expkey_cache);
 	}
-	svc_expkey_cache.nextcheck = get_seconds();
 }
 
 static int exp_fsid_hash(svc_client *clp, struct svc_export *exp)
@@ -973,10 +985,9 @@ static void exp_unhash(struct svc_export *exp)
 
 	ek = exp_get_key(exp->ex_client, inode->i_sb->s_dev, inode->i_ino);
 	if (!IS_ERR(ek)) {
-		ek->h.expiry_time = get_seconds()-1;
+		sunrpc_invalidate(&ek->h, &svc_expkey_cache);
 		cache_put(&ek->h, &svc_expkey_cache);
 	}
-	svc_expkey_cache.nextcheck = get_seconds();
 }
 	
 /*
@@ -1097,8 +1108,7 @@ out:
 static void
 exp_do_unexport(svc_export *unexp)
 {
-	unexp->h.expiry_time = get_seconds()-1;
-	svc_export_cache.nextcheck = get_seconds();
+	sunrpc_invalidate(&unexp->h, &svc_export_cache);
 	exp_unhash(unexp);
 	exp_fsid_unhash(unexp);
 }
@@ -1150,6 +1160,7 @@ out_unlock:
 	exp_writeunlock();
 	return err;
 }
+#endif /* CONFIG_NFSD_DEPRECATED */
 
 /*
  * Obtain the root fh on behalf of a client.
@@ -1459,25 +1470,43 @@ static void show_secinfo_flags(struct seq_file *m, int flags)
 	show_expflags(m, flags, NFSEXP_SECINFO_FLAGS);
 }
 
+static bool secinfo_flags_equal(int f, int g)
+{
+	f &= NFSEXP_SECINFO_FLAGS;
+	g &= NFSEXP_SECINFO_FLAGS;
+	return f == g;
+}
+
+static int show_secinfo_run(struct seq_file *m, struct exp_flavor_info **fp, struct exp_flavor_info *end)
+{
+	int flags;
+
+	flags = (*fp)->flags;
+	seq_printf(m, ",sec=%d", (*fp)->pseudoflavor);
+	(*fp)++;
+	while (*fp != end && secinfo_flags_equal(flags, (*fp)->flags)) {
+		seq_printf(m, ":%d", (*fp)->pseudoflavor);
+		(*fp)++;
+	}
+	return flags;
+}
+
 static void show_secinfo(struct seq_file *m, struct svc_export *exp)
 {
 	struct exp_flavor_info *f;
 	struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors;
-	int lastflags = 0, first = 0;
+	int flags;
 
 	if (exp->ex_nflavors == 0)
 		return;
-	for (f = exp->ex_flavors; f < end; f++) {
-		if (first || f->flags != lastflags) {
-			if (!first)
-				show_secinfo_flags(m, lastflags);
-			seq_printf(m, ",sec=%d", f->pseudoflavor);
-			lastflags = f->flags;
-		} else {
-			seq_printf(m, ":%d", f->pseudoflavor);
-		}
+	f = exp->ex_flavors;
+	flags = show_secinfo_run(m, &f, end);
+	if (!secinfo_flags_equal(flags, exp->ex_flags))
+		show_secinfo_flags(m, flags);
+	while (f != end) {
+		flags = show_secinfo_run(m, &f, end);
+		show_secinfo_flags(m, flags);
 	}
-	show_secinfo_flags(m, lastflags);
 }
 
 static void exp_flags(struct seq_file *m, int flag, int fsid,
@@ -1532,6 +1561,7 @@ const struct seq_operations nfs_exports_op = {
 	.show	= e_show,
 };
 
+#ifdef CONFIG_NFSD_DEPRECATED
 /*
  * Add or modify a client.
  * Change requests may involve the list of host addresses. The list of
@@ -1563,7 +1593,7 @@ exp_addclient(struct nfsctl_client *ncp)
 	/* Insert client into hashtable. */
 	for (i = 0; i < ncp->cl_naddr; i++) {
 		ipv6_addr_set_v4mapped(ncp->cl_addrlist[i].s_addr, &addr6);
-		auth_unix_add_addr(&addr6, dom);
+		auth_unix_add_addr(&init_net, &addr6, dom);
 	}
 	auth_unix_forget_old(dom);
 	auth_domain_put(dom);
@@ -1621,6 +1651,7 @@ exp_verify_string(char *cp, int max)
 	printk(KERN_NOTICE "nfsd: couldn't validate string %s\n", cp);
 	return 0;
 }
+#endif /* CONFIG_NFSD_DEPRECATED */
 
 /*
  * Initialize the exports module.
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 3d68f45a37b..5b7e3021e06 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -168,7 +168,7 @@ nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp,
 	svc_reserve_auth(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3)<<2) + resp->count +4);
 
 	fh_copy(&resp->fh, &argp->fh);
-	nfserr = nfsd_read(rqstp, &resp->fh, NULL,
+	nfserr = nfsd_read(rqstp, &resp->fh,
 				  argp->offset,
 			   	  rqstp->rq_vec, argp->vlen,
 				  &resp->count);
@@ -271,7 +271,7 @@ nfsd3_proc_mkdir(struct svc_rqst *rqstp, struct nfsd3_createargs *argp,
 	fh_init(&resp->fh, NFS3_FHSIZE);
 	nfserr = nfsd_create(rqstp, &resp->dirfh, argp->name, argp->len,
 				    &argp->attrs, S_IFDIR, 0, &resp->fh);
-
+	fh_unlock(&resp->dirfh);
 	RETURN_STATUS(nfserr);
 }
 
@@ -327,7 +327,7 @@ nfsd3_proc_mknod(struct svc_rqst *rqstp, struct nfsd3_mknodargs *argp,
 	type = nfs3_ftypes[argp->ftype];
 	nfserr = nfsd_create(rqstp, &resp->dirfh, argp->name, argp->len,
 				    &argp->attrs, type, rdev, &resp->fh);
-
+	fh_unlock(&resp->dirfh);
 	RETURN_STATUS(nfserr);
 }
 
@@ -348,6 +348,7 @@ nfsd3_proc_remove(struct svc_rqst *rqstp, struct nfsd3_diropargs *argp,
 	/* Unlink. -S_IFDIR means file must not be a directory */
 	fh_copy(&resp->fh, &argp->fh);
 	nfserr = nfsd_unlink(rqstp, &resp->fh, -S_IFDIR, argp->name, argp->len);
+	fh_unlock(&resp->fh);
 	RETURN_STATUS(nfserr);
 }
 
@@ -367,6 +368,7 @@ nfsd3_proc_rmdir(struct svc_rqst *rqstp, struct nfsd3_diropargs *argp,
 
 	fh_copy(&resp->fh, &argp->fh);
 	nfserr = nfsd_unlink(rqstp, &resp->fh, S_IFDIR, argp->name, argp->len);
+	fh_unlock(&resp->fh);
 	RETURN_STATUS(nfserr);
 }
 
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index eb78e7e2207..143da2eecd7 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -41,7 +41,6 @@
 
 #define NFSPROC4_CB_NULL 0
 #define NFSPROC4_CB_COMPOUND 1
-#define NFS4_STATEID_SIZE 16
 
 /* Index of predefined Linux callback client operations */
 
@@ -143,8 +142,6 @@ struct nfs4_cb_compound_hdr {
 	u32		minorversion;
 	/* res */
 	int		status;
-	u32		taglen;
-	char		*tag;
 };
 
 static struct {
@@ -205,6 +202,16 @@ nfs_cb_stat_to_errno(int stat)
  */
 
 static void
+encode_stateid(struct xdr_stream *xdr, stateid_t *sid)
+{
+	__be32 *p;
+
+	RESERVE_SPACE(sizeof(stateid_t));
+	WRITE32(sid->si_generation);
+	WRITEMEM(&sid->si_opaque, sizeof(stateid_opaque_t));
+}
+
+static void
 encode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr)
 {
 	__be32 * p;
@@ -229,10 +236,10 @@ encode_cb_recall(struct xdr_stream *xdr, struct nfs4_delegation *dp,
 	__be32 *p;
 	int len = dp->dl_fh.fh_size;
 
-	RESERVE_SPACE(12+sizeof(dp->dl_stateid) + len);
+	RESERVE_SPACE(4);
 	WRITE32(OP_CB_RECALL);
-	WRITE32(dp->dl_stateid.si_generation);
-	WRITEMEM(&dp->dl_stateid.si_opaque, sizeof(stateid_opaque_t));
+	encode_stateid(xdr, &dp->dl_stateid);
+	RESERVE_SPACE(8 + (XDR_QUADLEN(len) << 2));
 	WRITE32(0); /* truncate optimization not implemented */
 	WRITE32(len);
 	WRITEMEM(&dp->dl_fh.fh_base, len);
@@ -240,10 +247,11 @@ encode_cb_recall(struct xdr_stream *xdr, struct nfs4_delegation *dp,
 }
 
 static void
-encode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_sequence *args,
+encode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_callback *cb,
 		   struct nfs4_cb_compound_hdr *hdr)
 {
 	__be32 *p;
+	struct nfsd4_session *ses = cb->cb_clp->cl_cb_session;
 
 	if (hdr->minorversion == 0)
 		return;
@@ -251,8 +259,8 @@ encode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_sequence *args,
 	RESERVE_SPACE(1 + NFS4_MAX_SESSIONID_LEN + 20);
 
 	WRITE32(OP_CB_SEQUENCE);
-	WRITEMEM(args->cbs_clp->cl_sessionid.data, NFS4_MAX_SESSIONID_LEN);
-	WRITE32(args->cbs_clp->cl_cb_seq_nr);
+	WRITEMEM(ses->se_sessionid.data, NFS4_MAX_SESSIONID_LEN);
+	WRITE32(ses->se_cb_seq_nr);
 	WRITE32(0);		/* slotid, always 0 */
 	WRITE32(0);		/* highest slotid always 0 */
 	WRITE32(0);		/* cachethis always 0 */
@@ -272,18 +280,18 @@ nfs4_xdr_enc_cb_null(struct rpc_rqst *req, __be32 *p)
 
 static int
 nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p,
-		struct nfs4_rpc_args *rpc_args)
+		struct nfsd4_callback *cb)
 {
 	struct xdr_stream xdr;
-	struct nfs4_delegation *args = rpc_args->args_op;
+	struct nfs4_delegation *args = cb->cb_op;
 	struct nfs4_cb_compound_hdr hdr = {
-		.ident = args->dl_ident,
-		.minorversion = rpc_args->args_seq.cbs_minorversion,
+		.ident = cb->cb_clp->cl_cb_ident,
+		.minorversion = cb->cb_minorversion,
 	};
 
 	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
 	encode_cb_compound_hdr(&xdr, &hdr);
-	encode_cb_sequence(&xdr, &rpc_args->args_seq, &hdr);
+	encode_cb_sequence(&xdr, cb, &hdr);
 	encode_cb_recall(&xdr, args, &hdr);
 	encode_cb_nops(&hdr);
 	return 0;
@@ -293,13 +301,14 @@ nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p,
 static int
 decode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr){
         __be32 *p;
+	u32 taglen;
 
         READ_BUF(8);
         READ32(hdr->status);
-        READ32(hdr->taglen);
-        READ_BUF(hdr->taglen + 4);
-        hdr->tag = (char *)p;
-        p += XDR_QUADLEN(hdr->taglen);
+	/* We've got no use for the tag; ignore it: */
+        READ32(taglen);
+        READ_BUF(taglen + 4);
+        p += XDR_QUADLEN(taglen);
         READ32(hdr->nops);
         return 0;
 }
@@ -330,15 +339,16 @@ decode_cb_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
  * with a single slot.
  */
 static int
-decode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_sequence *res,
+decode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_callback *cb,
 		   struct rpc_rqst *rqstp)
 {
+	struct nfsd4_session *ses = cb->cb_clp->cl_cb_session;
 	struct nfs4_sessionid id;
 	int status;
 	u32 dummy;
 	__be32 *p;
 
-	if (res->cbs_minorversion == 0)
+	if (cb->cb_minorversion == 0)
 		return 0;
 
 	status = decode_cb_op_hdr(xdr, OP_CB_SEQUENCE);
@@ -354,13 +364,12 @@ decode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_sequence *res,
 	READ_BUF(NFS4_MAX_SESSIONID_LEN + 16);
 	memcpy(id.data, p, NFS4_MAX_SESSIONID_LEN);
 	p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN);
-	if (memcmp(id.data, res->cbs_clp->cl_sessionid.data,
-		   NFS4_MAX_SESSIONID_LEN)) {
+	if (memcmp(id.data, ses->se_sessionid.data, NFS4_MAX_SESSIONID_LEN)) {
 		dprintk("%s Invalid session id\n", __func__);
 		goto out;
 	}
 	READ32(dummy);
-	if (dummy != res->cbs_clp->cl_cb_seq_nr) {
+	if (dummy != ses->se_cb_seq_nr) {
 		dprintk("%s Invalid sequence number\n", __func__);
 		goto out;
 	}
@@ -384,7 +393,7 @@ nfs4_xdr_dec_cb_null(struct rpc_rqst *req, __be32 *p)
 
 static int
 nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, __be32 *p,
-		struct nfsd4_cb_sequence *seq)
+		struct nfsd4_callback *cb)
 {
 	struct xdr_stream xdr;
 	struct nfs4_cb_compound_hdr hdr;
@@ -394,8 +403,8 @@ nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, __be32 *p,
 	status = decode_cb_compound_hdr(&xdr, &hdr);
 	if (status)
 		goto out;
-	if (seq) {
-		status = decode_cb_sequence(&xdr, seq, rqstp);
+	if (cb) {
+		status = decode_cb_sequence(&xdr, cb, rqstp);
 		if (status)
 			goto out;
 	}
@@ -464,30 +473,34 @@ static int max_cb_time(void)
 /* Reference counting, callback cleanup, etc., all look racy as heck.
  * And why is cl_cb_set an atomic? */
 
-int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *cb)
+int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
 {
 	struct rpc_timeout	timeparms = {
 		.to_initval	= max_cb_time(),
 		.to_retries	= 0,
 	};
 	struct rpc_create_args args = {
-		.protocol	= XPRT_TRANSPORT_TCP,
-		.address	= (struct sockaddr *) &cb->cb_addr,
-		.addrsize	= cb->cb_addrlen,
+		.net		= &init_net,
+		.address	= (struct sockaddr *) &conn->cb_addr,
+		.addrsize	= conn->cb_addrlen,
 		.timeout	= &timeparms,
 		.program	= &cb_program,
-		.prognumber	= cb->cb_prog,
 		.version	= 0,
 		.authflavor	= clp->cl_flavor,
 		.flags		= (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET),
-		.client_name    = clp->cl_principal,
 	};
 	struct rpc_clnt *client;
 
-	if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5))
-		return -EINVAL;
-	if (cb->cb_minorversion) {
-		args.bc_xprt = cb->cb_xprt;
+	if (clp->cl_minorversion == 0) {
+		if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5))
+			return -EINVAL;
+		args.client_name = clp->cl_principal;
+		args.prognumber	= conn->cb_prog,
+		args.protocol = XPRT_TRANSPORT_TCP;
+		clp->cl_cb_ident = conn->cb_ident;
+	} else {
+		args.bc_xprt = conn->cb_xprt;
+		args.prognumber = clp->cl_cb_session->se_cb_prog;
 		args.protocol = XPRT_TRANSPORT_BC_TCP;
 	}
 	/* Create RPC client */
@@ -497,7 +510,7 @@ int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *cb)
 			PTR_ERR(client));
 		return PTR_ERR(client);
 	}
-	nfsd4_set_callback_client(clp, client);
+	clp->cl_cb_client = client;
 	return 0;
 
 }
@@ -510,7 +523,7 @@ static void warn_no_callback_path(struct nfs4_client *clp, int reason)
 
 static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata)
 {
-	struct nfs4_client *clp = calldata;
+	struct nfs4_client *clp = container_of(calldata, struct nfs4_client, cl_cb_null);
 
 	if (task->tk_status)
 		warn_no_callback_path(clp, task->tk_status);
@@ -519,6 +532,8 @@ static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata)
 }
 
 static const struct rpc_call_ops nfsd4_cb_probe_ops = {
+	/* XXX: release method to ensure we set the cb channel down if
+	 * necessary on early failure? */
 	.rpc_call_done = nfsd4_cb_probe_done,
 };
 
@@ -534,38 +549,42 @@ int set_callback_cred(void)
 	return 0;
 }
 
+static struct workqueue_struct *callback_wq;
 
-void do_probe_callback(struct nfs4_client *clp)
+static void do_probe_callback(struct nfs4_client *clp)
 {
-	struct rpc_message msg = {
-		.rpc_proc       = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
-		.rpc_argp       = clp,
-		.rpc_cred	= callback_cred
-	};
-	int status;
+	struct nfsd4_callback *cb = &clp->cl_cb_null;
 
-	status = rpc_call_async(clp->cl_cb_client, &msg,
-				RPC_TASK_SOFT | RPC_TASK_SOFTCONN,
-				&nfsd4_cb_probe_ops, (void *)clp);
-	if (status)
-		warn_no_callback_path(clp, status);
+	cb->cb_op = NULL;
+	cb->cb_clp = clp;
+
+	cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL];
+	cb->cb_msg.rpc_argp = NULL;
+	cb->cb_msg.rpc_resp = NULL;
+	cb->cb_msg.rpc_cred = callback_cred;
+
+	cb->cb_ops = &nfsd4_cb_probe_ops;
+
+	queue_work(callback_wq, &cb->cb_work);
 }
 
 /*
- * Set up the callback client and put a NFSPROC4_CB_NULL on the wire...
+ * Poke the callback thread to process any updates to the callback
+ * parameters, and send a null probe.
  */
-void nfsd4_probe_callback(struct nfs4_client *clp, struct nfs4_cb_conn *cb)
+void nfsd4_probe_callback(struct nfs4_client *clp)
 {
-	int status;
+	set_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_cb_flags);
+	do_probe_callback(clp);
+}
 
+void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
+{
 	BUG_ON(atomic_read(&clp->cl_cb_set));
 
-	status = setup_callback_client(clp, cb);
-	if (status) {
-		warn_no_callback_path(clp, status);
-		return;
-	}
-	do_probe_callback(clp);
+	spin_lock(&clp->cl_lock);
+	memcpy(&clp->cl_cb_conn, conn, sizeof(struct nfs4_cb_conn));
+	spin_unlock(&clp->cl_lock);
 }
 
 /*
@@ -576,8 +595,7 @@ void nfsd4_probe_callback(struct nfs4_client *clp, struct nfs4_cb_conn *cb)
 static int nfsd41_cb_setup_sequence(struct nfs4_client *clp,
 		struct rpc_task *task)
 {
-	struct nfs4_rpc_args *args = task->tk_msg.rpc_argp;
-	u32 *ptr = (u32 *)clp->cl_sessionid.data;
+	u32 *ptr = (u32 *)clp->cl_cb_session->se_sessionid.data;
 	int status = 0;
 
 	dprintk("%s: %u:%u:%u:%u\n", __func__,
@@ -589,14 +607,6 @@ static int nfsd41_cb_setup_sequence(struct nfs4_client *clp,
 		status = -EAGAIN;
 		goto out;
 	}
-
-	/*
-	 * We'll need the clp during XDR encoding and decoding,
-	 * and the sequence during decoding to verify the reply
-	 */
-	args->args_seq.cbs_clp = clp;
-	task->tk_msg.rpc_resp = &args->args_seq;
-
 out:
 	dprintk("%s status=%d\n", __func__, status);
 	return status;
@@ -608,13 +618,13 @@ out:
  */
 static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
 {
-	struct nfs4_delegation *dp = calldata;
+	struct nfsd4_callback *cb = calldata;
+	struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
 	struct nfs4_client *clp = dp->dl_client;
-	struct nfs4_rpc_args *args = task->tk_msg.rpc_argp;
-	u32 minorversion = clp->cl_cb_conn.cb_minorversion;
+	u32 minorversion = clp->cl_minorversion;
 	int status = 0;
 
-	args->args_seq.cbs_minorversion = minorversion;
+	cb->cb_minorversion = minorversion;
 	if (minorversion) {
 		status = nfsd41_cb_setup_sequence(clp, task);
 		if (status) {
@@ -631,19 +641,20 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
 
 static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
 {
-	struct nfs4_delegation *dp = calldata;
+	struct nfsd4_callback *cb = calldata;
+	struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
 	struct nfs4_client *clp = dp->dl_client;
 
 	dprintk("%s: minorversion=%d\n", __func__,
-		clp->cl_cb_conn.cb_minorversion);
+		clp->cl_minorversion);
 
-	if (clp->cl_cb_conn.cb_minorversion) {
+	if (clp->cl_minorversion) {
 		/* No need for lock, access serialized in nfsd4_cb_prepare */
-		++clp->cl_cb_seq_nr;
+		++clp->cl_cb_session->se_cb_seq_nr;
 		clear_bit(0, &clp->cl_cb_slot_busy);
 		rpc_wake_up_next(&clp->cl_cb_waitq);
 		dprintk("%s: freed slot, new seqid=%d\n", __func__,
-			clp->cl_cb_seq_nr);
+			clp->cl_cb_session->se_cb_seq_nr);
 
 		/* We're done looking into the sequence information */
 		task->tk_msg.rpc_resp = NULL;
@@ -653,7 +664,8 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
 
 static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
 {
-	struct nfs4_delegation *dp = calldata;
+	struct nfsd4_callback *cb = calldata;
+	struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
 	struct nfs4_client *clp = dp->dl_client;
 	struct rpc_clnt *current_rpc_client = clp->cl_cb_client;
 
@@ -667,28 +679,28 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
 	}
 
 	switch (task->tk_status) {
-	case -EIO:
+	case 0:
+		return;
+	case -EBADHANDLE:
+	case -NFS4ERR_BAD_STATEID:
+		/* Race: client probably got cb_recall
+		 * before open reply granting delegation */
+		break;
+	default:
 		/* Network partition? */
 		atomic_set(&clp->cl_cb_set, 0);
 		warn_no_callback_path(clp, task->tk_status);
 		if (current_rpc_client != task->tk_client) {
 			/* queue a callback on the new connection: */
+			atomic_inc(&dp->dl_count);
 			nfsd4_cb_recall(dp);
 			return;
 		}
-	case -EBADHANDLE:
-	case -NFS4ERR_BAD_STATEID:
-		/* Race: client probably got cb_recall
-		 * before open reply granting delegation */
-		break;
-	default:
-		/* success, or error we can't handle */
-		return;
 	}
 	if (dp->dl_retries--) {
 		rpc_delay(task, 2*HZ);
 		task->tk_status = 0;
-		rpc_restart_call(task);
+		rpc_restart_call_prepare(task);
 		return;
 	} else {
 		atomic_set(&clp->cl_cb_set, 0);
@@ -698,7 +710,8 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
 
 static void nfsd4_cb_recall_release(void *calldata)
 {
-	struct nfs4_delegation *dp = calldata;
+	struct nfsd4_callback *cb = calldata;
+	struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
 
 	nfs4_put_delegation(dp);
 }
@@ -709,8 +722,6 @@ static const struct rpc_call_ops nfsd4_cb_recall_ops = {
 	.rpc_release = nfsd4_cb_recall_release,
 };
 
-static struct workqueue_struct *callback_wq;
-
 int nfsd4_create_callback_queue(void)
 {
 	callback_wq = create_singlethread_workqueue("nfsd4_callbacks");
@@ -725,59 +736,88 @@ void nfsd4_destroy_callback_queue(void)
 }
 
 /* must be called under the state lock */
-void nfsd4_set_callback_client(struct nfs4_client *clp, struct rpc_clnt *new)
+void nfsd4_shutdown_callback(struct nfs4_client *clp)
 {
-	struct rpc_clnt *old = clp->cl_cb_client;
-
-	clp->cl_cb_client = new;
+	set_bit(NFSD4_CLIENT_KILL, &clp->cl_cb_flags);
 	/*
-	 * After this, any work that saw the old value of cl_cb_client will
-	 * be gone:
+	 * Note this won't actually result in a null callback;
+	 * instead, nfsd4_do_callback_rpc() will detect the killed
+	 * client, destroy the rpc client, and stop:
 	 */
+	do_probe_callback(clp);
 	flush_workqueue(callback_wq);
-	/* So we can safely shut it down: */
-	if (old)
-		rpc_shutdown_client(old);
 }
 
-/*
- * called with dp->dl_count inc'ed.
- */
-static void _nfsd4_cb_recall(struct nfs4_delegation *dp)
+void nfsd4_release_cb(struct nfsd4_callback *cb)
 {
-	struct nfs4_client *clp = dp->dl_client;
-	struct rpc_clnt *clnt = clp->cl_cb_client;
-	struct nfs4_rpc_args *args = &dp->dl_recall.cb_args;
-	struct rpc_message msg = {
-		.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL],
-		.rpc_cred = callback_cred
-	};
-	int status;
+	if (cb->cb_ops->rpc_release)
+		cb->cb_ops->rpc_release(cb);
+}
 
-	if (clnt == NULL)
-		return; /* Client is shutting down; give up. */
+void nfsd4_process_cb_update(struct nfsd4_callback *cb)
+{
+	struct nfs4_cb_conn conn;
+	struct nfs4_client *clp = cb->cb_clp;
+	int err;
 
-	args->args_op = dp;
-	msg.rpc_argp = args;
-	dp->dl_retries = 1;
-	status = rpc_call_async(clnt, &msg, RPC_TASK_SOFT,
-				&nfsd4_cb_recall_ops, dp);
-	if (status)
-		nfs4_put_delegation(dp);
+	/*
+	 * This is either an update, or the client dying; in either case,
+	 * kill the old client:
+	 */
+	if (clp->cl_cb_client) {
+		rpc_shutdown_client(clp->cl_cb_client);
+		clp->cl_cb_client = NULL;
+	}
+	if (test_bit(NFSD4_CLIENT_KILL, &clp->cl_cb_flags))
+		return;
+	spin_lock(&clp->cl_lock);
+	/*
+	 * Only serialized callback code is allowed to clear these
+	 * flags; main nfsd code can only set them:
+	 */
+	BUG_ON(!clp->cl_cb_flags);
+	clear_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_cb_flags);
+	memcpy(&conn, &cb->cb_clp->cl_cb_conn, sizeof(struct nfs4_cb_conn));
+	spin_unlock(&clp->cl_lock);
+
+	err = setup_callback_client(clp, &conn);
+	if (err)
+		warn_no_callback_path(clp, err);
 }
 
 void nfsd4_do_callback_rpc(struct work_struct *w)
 {
-	/* XXX: for now, just send off delegation recall. */
-	/* In future, generalize to handle any sort of callback. */
-	struct nfsd4_callback *c = container_of(w, struct nfsd4_callback, cb_work);
-	struct nfs4_delegation *dp = container_of(c, struct nfs4_delegation, dl_recall);
+	struct nfsd4_callback *cb = container_of(w, struct nfsd4_callback, cb_work);
+	struct nfs4_client *clp = cb->cb_clp;
+	struct rpc_clnt *clnt;
 
-	_nfsd4_cb_recall(dp);
-}
+	if (clp->cl_cb_flags)
+		nfsd4_process_cb_update(cb);
 
+	clnt = clp->cl_cb_client;
+	if (!clnt) {
+		/* Callback channel broken, or client killed; give up: */
+		nfsd4_release_cb(cb);
+		return;
+	}
+	rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN,
+			cb->cb_ops, cb);
+}
 
 void nfsd4_cb_recall(struct nfs4_delegation *dp)
 {
+	struct nfsd4_callback *cb = &dp->dl_recall;
+
+	dp->dl_retries = 1;
+	cb->cb_op = dp;
+	cb->cb_clp = dp->dl_client;
+	cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL];
+	cb->cb_msg.rpc_argp = cb;
+	cb->cb_msg.rpc_resp = cb;
+	cb->cb_msg.rpc_cred = callback_cred;
+
+	cb->cb_ops = &nfsd4_cb_recall_ops;
+	dp->dl_retries = 1;
+
 	queue_work(callback_wq, &dp->dl_recall.cb_work);
 }
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index c78dbf49342..f0695e815f0 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -482,109 +482,26 @@ nfsd_idmap_shutdown(void)
 	cache_unregister(&nametoid_cache);
 }
 
-/*
- * Deferred request handling
- */
-
-struct idmap_defer_req {
-       struct cache_req		req;
-       struct cache_deferred_req deferred_req;
-       wait_queue_head_t	waitq;
-       atomic_t			count;
-};
-
-static inline void
-put_mdr(struct idmap_defer_req *mdr)
-{
-	if (atomic_dec_and_test(&mdr->count))
-		kfree(mdr);
-}
-
-static inline void
-get_mdr(struct idmap_defer_req *mdr)
-{
-	atomic_inc(&mdr->count);
-}
-
-static void
-idmap_revisit(struct cache_deferred_req *dreq, int toomany)
-{
-	struct idmap_defer_req *mdr =
-		container_of(dreq, struct idmap_defer_req, deferred_req);
-
-	wake_up(&mdr->waitq);
-	put_mdr(mdr);
-}
-
-static struct cache_deferred_req *
-idmap_defer(struct cache_req *req)
-{
-	struct idmap_defer_req *mdr =
-		container_of(req, struct idmap_defer_req, req);
-
-	mdr->deferred_req.revisit = idmap_revisit;
-	get_mdr(mdr);
-	return (&mdr->deferred_req);
-}
-
-static inline int
-do_idmap_lookup(struct ent *(*lookup_fn)(struct ent *), struct ent *key,
-		struct cache_detail *detail, struct ent **item,
-		struct idmap_defer_req *mdr)
-{
-	*item = lookup_fn(key);
-	if (!*item)
-		return -ENOMEM;
-	return cache_check(detail, &(*item)->h, &mdr->req);
-}
-
-static inline int
-do_idmap_lookup_nowait(struct ent *(*lookup_fn)(struct ent *),
-			struct ent *key, struct cache_detail *detail,
-			struct ent **item)
-{
-	int ret = -ENOMEM;
-
-	*item = lookup_fn(key);
-	if (!*item)
-		goto out_err;
-	ret = -ETIMEDOUT;
-	if (!test_bit(CACHE_VALID, &(*item)->h.flags)
-			|| (*item)->h.expiry_time < get_seconds()
-			|| detail->flush_time > (*item)->h.last_refresh)
-		goto out_put;
-	ret = -ENOENT;
-	if (test_bit(CACHE_NEGATIVE, &(*item)->h.flags))
-		goto out_put;
-	return 0;
-out_put:
-	cache_put(&(*item)->h, detail);
-out_err:
-	*item = NULL;
-	return ret;
-}
-
 static int
 idmap_lookup(struct svc_rqst *rqstp,
 		struct ent *(*lookup_fn)(struct ent *), struct ent *key,
 		struct cache_detail *detail, struct ent **item)
 {
-	struct idmap_defer_req *mdr;
 	int ret;
 
-	mdr = kzalloc(sizeof(*mdr), GFP_KERNEL);
-	if (!mdr)
+	*item = lookup_fn(key);
+	if (!*item)
 		return -ENOMEM;
-	atomic_set(&mdr->count, 1);
-	init_waitqueue_head(&mdr->waitq);
-	mdr->req.defer = idmap_defer;
-	ret = do_idmap_lookup(lookup_fn, key, detail, item, mdr);
-	if (ret == -EAGAIN) {
-		wait_event_interruptible_timeout(mdr->waitq,
-			test_bit(CACHE_VALID, &(*item)->h.flags), 1 * HZ);
-		ret = do_idmap_lookup_nowait(lookup_fn, key, detail, item);
+ retry:
+	ret = cache_check(detail, &(*item)->h, &rqstp->rq_chandle);
+
+	if (ret == -ETIMEDOUT) {
+		struct ent *prev_item = *item;
+		*item = lookup_fn(key);
+		if (*item != prev_item)
+			goto retry;
+		cache_put(&(*item)->h, detail);
 	}
-	put_mdr(mdr);
 	return ret;
 }
 
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 59ec449b0c7..0cdfd022bb7 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1031,8 +1031,11 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
 	resp->cstate.session = NULL;
 	fh_init(&resp->cstate.current_fh, NFS4_FHSIZE);
 	fh_init(&resp->cstate.save_fh, NFS4_FHSIZE);
-	/* Use the deferral mechanism only for NFSv4.0 compounds */
-	rqstp->rq_usedeferral = (args->minorversion == 0);
+	/*
+	 * Don't use the deferral mechanism for NFSv4; compounds make it
+	 * too hard to avoid non-idempotency problems.
+	 */
+	rqstp->rq_usedeferral = 0;
 
 	/*
 	 * According to RFC3010, this takes precedence over all other errors.
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 4a273475877..f1e5ec6b510 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -33,7 +33,7 @@
 */
 
 #include <linux/file.h>
-#include <linux/smp_lock.h>
+#include <linux/fs.h>
 #include <linux/slab.h>
 #include <linux/namei.h>
 #include <linux/swap.h>
@@ -51,7 +51,6 @@ static time_t boot_time;
 static u32 current_ownerid = 1;
 static u32 current_fileid = 1;
 static u32 current_delegid = 1;
-static u32 nfs4_init;
 static stateid_t zerostateid;             /* bits all 0 */
 static stateid_t onestateid;              /* bits all 1 */
 static u64 current_sessionid = 1;
@@ -163,14 +162,60 @@ static struct list_head	ownerstr_hashtbl[OWNER_HASH_SIZE];
 static struct list_head file_hashtbl[FILE_HASH_SIZE];
 static struct list_head stateid_hashtbl[STATEID_HASH_SIZE];
 
+static void __nfs4_file_get_access(struct nfs4_file *fp, int oflag)
+{
+	BUG_ON(!(fp->fi_fds[oflag] || fp->fi_fds[O_RDWR]));
+	atomic_inc(&fp->fi_access[oflag]);
+}
+
+static void nfs4_file_get_access(struct nfs4_file *fp, int oflag)
+{
+	if (oflag == O_RDWR) {
+		__nfs4_file_get_access(fp, O_RDONLY);
+		__nfs4_file_get_access(fp, O_WRONLY);
+	} else
+		__nfs4_file_get_access(fp, oflag);
+}
+
+static void nfs4_file_put_fd(struct nfs4_file *fp, int oflag)
+{
+	if (fp->fi_fds[oflag]) {
+		fput(fp->fi_fds[oflag]);
+		fp->fi_fds[oflag] = NULL;
+	}
+}
+
+static void __nfs4_file_put_access(struct nfs4_file *fp, int oflag)
+{
+	if (atomic_dec_and_test(&fp->fi_access[oflag])) {
+		nfs4_file_put_fd(fp, O_RDWR);
+		nfs4_file_put_fd(fp, oflag);
+	}
+}
+
+static void nfs4_file_put_access(struct nfs4_file *fp, int oflag)
+{
+	if (oflag == O_RDWR) {
+		__nfs4_file_put_access(fp, O_RDONLY);
+		__nfs4_file_put_access(fp, O_WRONLY);
+	} else
+		__nfs4_file_put_access(fp, oflag);
+}
+
 static struct nfs4_delegation *
 alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_fh *current_fh, u32 type)
 {
 	struct nfs4_delegation *dp;
 	struct nfs4_file *fp = stp->st_file;
-	struct nfs4_cb_conn *cb = &stp->st_stateowner->so_client->cl_cb_conn;
 
 	dprintk("NFSD alloc_init_deleg\n");
+	/*
+	 * Major work on the lease subsystem (for example, to support
+	 * calbacks on stat) will be required before we can support
+	 * write delegations properly.
+	 */
+	if (type != NFS4_OPEN_DELEGATE_READ)
+		return NULL;
 	if (fp->fi_had_conflict)
 		return NULL;
 	if (num_delegations > max_delegations)
@@ -185,11 +230,9 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
 	dp->dl_client = clp;
 	get_nfs4_file(fp);
 	dp->dl_file = fp;
+	nfs4_file_get_access(fp, O_RDONLY);
 	dp->dl_flock = NULL;
-	get_file(stp->st_vfs_file);
-	dp->dl_vfs_file = stp->st_vfs_file;
 	dp->dl_type = type;
-	dp->dl_ident = cb->cb_ident;
 	dp->dl_stateid.si_boot = boot_time;
 	dp->dl_stateid.si_stateownerid = current_delegid++;
 	dp->dl_stateid.si_fileid = 0;
@@ -222,15 +265,12 @@ nfs4_put_delegation(struct nfs4_delegation *dp)
 static void
 nfs4_close_delegation(struct nfs4_delegation *dp)
 {
-	struct file *filp = dp->dl_vfs_file;
+	struct file *filp = find_readable_file(dp->dl_file);
 
 	dprintk("NFSD: close_delegation dp %p\n",dp);
-	dp->dl_vfs_file = NULL;
-	/* The following nfsd_close may not actually close the file,
-	 * but we want to remove the lease in any case. */
 	if (dp->dl_flock)
 		vfs_setlease(filp, F_UNLCK, &dp->dl_flock);
-	nfsd_close(filp);
+	nfs4_file_put_access(dp->dl_file, O_RDONLY);
 }
 
 /* Called under the state lock. */
@@ -302,8 +342,12 @@ static void free_generic_stateid(struct nfs4_stateid *stp)
 
 static void release_lock_stateid(struct nfs4_stateid *stp)
 {
+	struct file *file;
+
 	unhash_generic_stateid(stp);
-	locks_remove_posix(stp->st_vfs_file, (fl_owner_t)stp->st_stateowner);
+	file = find_any_file(stp->st_file);
+	if (file)
+		locks_remove_posix(file, (fl_owner_t)stp->st_stateowner);
 	free_generic_stateid(stp);
 }
 
@@ -341,11 +385,85 @@ release_stateid_lockowners(struct nfs4_stateid *open_stp)
 	}
 }
 
+/*
+ * We store the NONE, READ, WRITE, and BOTH bits separately in the
+ * st_{access,deny}_bmap field of the stateid, in order to track not
+ * only what share bits are currently in force, but also what
+ * combinations of share bits previous opens have used.  This allows us
+ * to enforce the recommendation of rfc 3530 14.2.19 that the server
+ * return an error if the client attempt to downgrade to a combination
+ * of share bits not explicable by closing some of its previous opens.
+ *
+ * XXX: This enforcement is actually incomplete, since we don't keep
+ * track of access/deny bit combinations; so, e.g., we allow:
+ *
+ *	OPEN allow read, deny write
+ *	OPEN allow both, deny none
+ *	DOWNGRADE allow read, deny none
+ *
+ * which we should reject.
+ */
+static void
+set_access(unsigned int *access, unsigned long bmap) {
+	int i;
+
+	*access = 0;
+	for (i = 1; i < 4; i++) {
+		if (test_bit(i, &bmap))
+			*access |= i;
+	}
+}
+
+static void
+set_deny(unsigned int *deny, unsigned long bmap) {
+	int i;
+
+	*deny = 0;
+	for (i = 0; i < 4; i++) {
+		if (test_bit(i, &bmap))
+			*deny |= i ;
+	}
+}
+
+static int
+test_share(struct nfs4_stateid *stp, struct nfsd4_open *open) {
+	unsigned int access, deny;
+
+	set_access(&access, stp->st_access_bmap);
+	set_deny(&deny, stp->st_deny_bmap);
+	if ((access & open->op_share_deny) || (deny & open->op_share_access))
+		return 0;
+	return 1;
+}
+
+static int nfs4_access_to_omode(u32 access)
+{
+	switch (access & NFS4_SHARE_ACCESS_BOTH) {
+	case NFS4_SHARE_ACCESS_READ:
+		return O_RDONLY;
+	case NFS4_SHARE_ACCESS_WRITE:
+		return O_WRONLY;
+	case NFS4_SHARE_ACCESS_BOTH:
+		return O_RDWR;
+	}
+	BUG();
+}
+
+static int nfs4_access_bmap_to_omode(struct nfs4_stateid *stp)
+{
+	unsigned int access;
+
+	set_access(&access, stp->st_access_bmap);
+	return nfs4_access_to_omode(access);
+}
+
 static void release_open_stateid(struct nfs4_stateid *stp)
 {
+	int oflag = nfs4_access_bmap_to_omode(stp);
+
 	unhash_generic_stateid(stp);
 	release_stateid_lockowners(stp);
-	nfsd_close(stp->st_vfs_file);
+	nfs4_file_put_access(stp->st_file, oflag);
 	free_generic_stateid(stp);
 }
 
@@ -415,171 +533,258 @@ gen_sessionid(struct nfsd4_session *ses)
  */
 #define NFSD_MIN_HDR_SEQ_SZ  (24 + 12 + 44)
 
+static void
+free_session_slots(struct nfsd4_session *ses)
+{
+	int i;
+
+	for (i = 0; i < ses->se_fchannel.maxreqs; i++)
+		kfree(ses->se_slots[i]);
+}
+
 /*
- * Give the client the number of ca_maxresponsesize_cached slots it
- * requests, of size bounded by NFSD_SLOT_CACHE_SIZE,
- * NFSD_MAX_MEM_PER_SESSION, and nfsd_drc_max_mem. Do not allow more
- * than NFSD_MAX_SLOTS_PER_SESSION.
- *
- * If we run out of reserved DRC memory we should (up to a point)
+ * We don't actually need to cache the rpc and session headers, so we
+ * can allocate a little less for each slot:
+ */
+static inline int slot_bytes(struct nfsd4_channel_attrs *ca)
+{
+	return ca->maxresp_cached - NFSD_MIN_HDR_SEQ_SZ;
+}
+
+static int nfsd4_sanitize_slot_size(u32 size)
+{
+	size -= NFSD_MIN_HDR_SEQ_SZ; /* We don't cache the rpc header */
+	size = min_t(u32, size, NFSD_SLOT_CACHE_SIZE);
+
+	return size;
+}
+
+/*
+ * XXX: If we run out of reserved DRC memory we could (up to a point)
  * re-negotiate active sessions and reduce their slot usage to make
  * rooom for new connections. For now we just fail the create session.
  */
-static int set_forechannel_drc_size(struct nfsd4_channel_attrs *fchan)
+static int nfsd4_get_drc_mem(int slotsize, u32 num)
 {
-	int mem, size = fchan->maxresp_cached;
+	int avail;
 
-	if (fchan->maxreqs < 1)
-		return nfserr_inval;
+	num = min_t(u32, num, NFSD_MAX_SLOTS_PER_SESSION);
 
-	if (size < NFSD_MIN_HDR_SEQ_SZ)
-		size = NFSD_MIN_HDR_SEQ_SZ;
-	size -= NFSD_MIN_HDR_SEQ_SZ;
-	if (size > NFSD_SLOT_CACHE_SIZE)
-		size = NFSD_SLOT_CACHE_SIZE;
-
-	/* bound the maxreqs by NFSD_MAX_MEM_PER_SESSION */
-	mem = fchan->maxreqs * size;
-	if (mem > NFSD_MAX_MEM_PER_SESSION) {
-		fchan->maxreqs = NFSD_MAX_MEM_PER_SESSION / size;
-		if (fchan->maxreqs > NFSD_MAX_SLOTS_PER_SESSION)
-			fchan->maxreqs = NFSD_MAX_SLOTS_PER_SESSION;
-		mem = fchan->maxreqs * size;
-	}
+	spin_lock(&nfsd_drc_lock);
+	avail = min_t(int, NFSD_MAX_MEM_PER_SESSION,
+			nfsd_drc_max_mem - nfsd_drc_mem_used);
+	num = min_t(int, num, avail / slotsize);
+	nfsd_drc_mem_used += num * slotsize;
+	spin_unlock(&nfsd_drc_lock);
+
+	return num;
+}
 
+static void nfsd4_put_drc_mem(int slotsize, int num)
+{
 	spin_lock(&nfsd_drc_lock);
-	/* bound the total session drc memory ussage */
-	if (mem + nfsd_drc_mem_used > nfsd_drc_max_mem) {
-		fchan->maxreqs = (nfsd_drc_max_mem - nfsd_drc_mem_used) / size;
-		mem = fchan->maxreqs * size;
-	}
-	nfsd_drc_mem_used += mem;
+	nfsd_drc_mem_used -= slotsize * num;
 	spin_unlock(&nfsd_drc_lock);
+}
 
-	if (fchan->maxreqs == 0)
-		return nfserr_serverfault;
+static struct nfsd4_session *alloc_session(int slotsize, int numslots)
+{
+	struct nfsd4_session *new;
+	int mem, i;
 
-	fchan->maxresp_cached = size + NFSD_MIN_HDR_SEQ_SZ;
-	return 0;
+	BUILD_BUG_ON(NFSD_MAX_SLOTS_PER_SESSION * sizeof(struct nfsd4_slot *)
+			+ sizeof(struct nfsd4_session) > PAGE_SIZE);
+	mem = numslots * sizeof(struct nfsd4_slot *);
+
+	new = kzalloc(sizeof(*new) + mem, GFP_KERNEL);
+	if (!new)
+		return NULL;
+	/* allocate each struct nfsd4_slot and data cache in one piece */
+	for (i = 0; i < numslots; i++) {
+		mem = sizeof(struct nfsd4_slot) + slotsize;
+		new->se_slots[i] = kzalloc(mem, GFP_KERNEL);
+		if (!new->se_slots[i])
+			goto out_free;
+	}
+	return new;
+out_free:
+	while (i--)
+		kfree(new->se_slots[i]);
+	kfree(new);
+	return NULL;
 }
 
-/*
- * fchan holds the client values on input, and the server values on output
- * sv_max_mesg is the maximum payload plus one page for overhead.
- */
-static int init_forechannel_attrs(struct svc_rqst *rqstp,
-				  struct nfsd4_channel_attrs *session_fchan,
-				  struct nfsd4_channel_attrs *fchan)
+static void init_forechannel_attrs(struct nfsd4_channel_attrs *new, struct nfsd4_channel_attrs *req, int numslots, int slotsize)
 {
-	int status = 0;
-	__u32   maxcount = nfsd_serv->sv_max_mesg;
+	u32 maxrpc = nfsd_serv->sv_max_mesg;
 
-	/* headerpadsz set to zero in encode routine */
+	new->maxreqs = numslots;
+	new->maxresp_cached = slotsize + NFSD_MIN_HDR_SEQ_SZ;
+	new->maxreq_sz = min_t(u32, req->maxreq_sz, maxrpc);
+	new->maxresp_sz = min_t(u32, req->maxresp_sz, maxrpc);
+	new->maxops = min_t(u32, req->maxops, NFSD_MAX_OPS_PER_COMPOUND);
+}
 
-	/* Use the client's max request and max response size if possible */
-	if (fchan->maxreq_sz > maxcount)
-		fchan->maxreq_sz = maxcount;
-	session_fchan->maxreq_sz = fchan->maxreq_sz;
+static void free_conn(struct nfsd4_conn *c)
+{
+	svc_xprt_put(c->cn_xprt);
+	kfree(c);
+}
 
-	if (fchan->maxresp_sz > maxcount)
-		fchan->maxresp_sz = maxcount;
-	session_fchan->maxresp_sz = fchan->maxresp_sz;
+static void nfsd4_conn_lost(struct svc_xpt_user *u)
+{
+	struct nfsd4_conn *c = container_of(u, struct nfsd4_conn, cn_xpt_user);
+	struct nfs4_client *clp = c->cn_session->se_client;
 
-	/* Use the client's maxops if possible */
-	if (fchan->maxops > NFSD_MAX_OPS_PER_COMPOUND)
-		fchan->maxops = NFSD_MAX_OPS_PER_COMPOUND;
-	session_fchan->maxops = fchan->maxops;
+	spin_lock(&clp->cl_lock);
+	if (!list_empty(&c->cn_persession)) {
+		list_del(&c->cn_persession);
+		free_conn(c);
+	}
+	spin_unlock(&clp->cl_lock);
+}
 
-	/* FIXME: Error means no more DRC pages so the server should
-	 * recover pages from existing sessions. For now fail session
-	 * creation.
-	 */
-	status = set_forechannel_drc_size(fchan);
+static struct nfsd4_conn *alloc_conn(struct svc_rqst *rqstp, u32 flags)
+{
+	struct nfsd4_conn *conn;
 
-	session_fchan->maxresp_cached = fchan->maxresp_cached;
-	session_fchan->maxreqs = fchan->maxreqs;
+	conn = kmalloc(sizeof(struct nfsd4_conn), GFP_KERNEL);
+	if (!conn)
+		return NULL;
+	svc_xprt_get(rqstp->rq_xprt);
+	conn->cn_xprt = rqstp->rq_xprt;
+	conn->cn_flags = flags;
+	INIT_LIST_HEAD(&conn->cn_xpt_user.list);
+	return conn;
+}
 
-	dprintk("%s status %d\n", __func__, status);
-	return status;
+static void __nfsd4_hash_conn(struct nfsd4_conn *conn, struct nfsd4_session *ses)
+{
+	conn->cn_session = ses;
+	list_add(&conn->cn_persession, &ses->se_conns);
 }
 
-static void
-free_session_slots(struct nfsd4_session *ses)
+static void nfsd4_hash_conn(struct nfsd4_conn *conn, struct nfsd4_session *ses)
 {
-	int i;
+	struct nfs4_client *clp = ses->se_client;
 
-	for (i = 0; i < ses->se_fchannel.maxreqs; i++)
-		kfree(ses->se_slots[i]);
+	spin_lock(&clp->cl_lock);
+	__nfsd4_hash_conn(conn, ses);
+	spin_unlock(&clp->cl_lock);
 }
 
-/*
- * We don't actually need to cache the rpc and session headers, so we
- * can allocate a little less for each slot:
- */
-static inline int slot_bytes(struct nfsd4_channel_attrs *ca)
+static void nfsd4_register_conn(struct nfsd4_conn *conn)
 {
-	return ca->maxresp_cached - NFSD_MIN_HDR_SEQ_SZ;
+	conn->cn_xpt_user.callback = nfsd4_conn_lost;
+	register_xpt_user(conn->cn_xprt, &conn->cn_xpt_user);
 }
 
-static int
-alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp,
-		   struct nfsd4_create_session *cses)
+static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses)
 {
-	struct nfsd4_session *new, tmp;
-	struct nfsd4_slot *sp;
-	int idx, slotsize, cachesize, i;
-	int status;
+	struct nfsd4_conn *conn;
+	u32 flags = NFS4_CDFC4_FORE;
 
-	memset(&tmp, 0, sizeof(tmp));
+	if (ses->se_flags & SESSION4_BACK_CHAN)
+		flags |= NFS4_CDFC4_BACK;
+	conn = alloc_conn(rqstp, flags);
+	if (!conn)
+		return nfserr_jukebox;
+	nfsd4_hash_conn(conn, ses);
+	nfsd4_register_conn(conn);
+	return nfs_ok;
+}
 
-	/* FIXME: For now, we just accept the client back channel attributes. */
-	tmp.se_bchannel = cses->back_channel;
-	status = init_forechannel_attrs(rqstp, &tmp.se_fchannel,
-					&cses->fore_channel);
-	if (status)
-		goto out;
+static void nfsd4_del_conns(struct nfsd4_session *s)
+{
+	struct nfs4_client *clp = s->se_client;
+	struct nfsd4_conn *c;
 
-	BUILD_BUG_ON(NFSD_MAX_SLOTS_PER_SESSION * sizeof(struct nfsd4_slot)
-		     + sizeof(struct nfsd4_session) > PAGE_SIZE);
+	spin_lock(&clp->cl_lock);
+	while (!list_empty(&s->se_conns)) {
+		c = list_first_entry(&s->se_conns, struct nfsd4_conn, cn_persession);
+		list_del_init(&c->cn_persession);
+		spin_unlock(&clp->cl_lock);
 
-	status = nfserr_serverfault;
-	/* allocate struct nfsd4_session and slot table pointers in one piece */
-	slotsize = tmp.se_fchannel.maxreqs * sizeof(struct nfsd4_slot *);
-	new = kzalloc(sizeof(*new) + slotsize, GFP_KERNEL);
-	if (!new)
-		goto out;
+		unregister_xpt_user(c->cn_xprt, &c->cn_xpt_user);
+		free_conn(c);
 
-	memcpy(new, &tmp, sizeof(*new));
+		spin_lock(&clp->cl_lock);
+	}
+	spin_unlock(&clp->cl_lock);
+}
 
-	/* allocate each struct nfsd4_slot and data cache in one piece */
-	cachesize = slot_bytes(&new->se_fchannel);
-	for (i = 0; i < new->se_fchannel.maxreqs; i++) {
-		sp = kzalloc(sizeof(*sp) + cachesize, GFP_KERNEL);
-		if (!sp)
-			goto out_free;
-		new->se_slots[i] = sp;
+void free_session(struct kref *kref)
+{
+	struct nfsd4_session *ses;
+	int mem;
+
+	ses = container_of(kref, struct nfsd4_session, se_ref);
+	nfsd4_del_conns(ses);
+	spin_lock(&nfsd_drc_lock);
+	mem = ses->se_fchannel.maxreqs * slot_bytes(&ses->se_fchannel);
+	nfsd_drc_mem_used -= mem;
+	spin_unlock(&nfsd_drc_lock);
+	free_session_slots(ses);
+	kfree(ses);
+}
+
+static struct nfsd4_session *alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp, struct nfsd4_create_session *cses)
+{
+	struct nfsd4_session *new;
+	struct nfsd4_channel_attrs *fchan = &cses->fore_channel;
+	int numslots, slotsize;
+	int status;
+	int idx;
+
+	/*
+	 * Note decreasing slot size below client's request may
+	 * make it difficult for client to function correctly, whereas
+	 * decreasing the number of slots will (just?) affect
+	 * performance.  When short on memory we therefore prefer to
+	 * decrease number of slots instead of their size.
+	 */
+	slotsize = nfsd4_sanitize_slot_size(fchan->maxresp_cached);
+	numslots = nfsd4_get_drc_mem(slotsize, fchan->maxreqs);
+
+	new = alloc_session(slotsize, numslots);
+	if (!new) {
+		nfsd4_put_drc_mem(slotsize, fchan->maxreqs);
+		return NULL;
 	}
+	init_forechannel_attrs(&new->se_fchannel, fchan, numslots, slotsize);
 
 	new->se_client = clp;
 	gen_sessionid(new);
-	idx = hash_sessionid(&new->se_sessionid);
-	memcpy(clp->cl_sessionid.data, new->se_sessionid.data,
-	       NFS4_MAX_SESSIONID_LEN);
 
+	INIT_LIST_HEAD(&new->se_conns);
+
+	new->se_cb_seq_nr = 1;
 	new->se_flags = cses->flags;
+	new->se_cb_prog = cses->callback_prog;
 	kref_init(&new->se_ref);
+	idx = hash_sessionid(&new->se_sessionid);
 	spin_lock(&client_lock);
 	list_add(&new->se_hash, &sessionid_hashtbl[idx]);
 	list_add(&new->se_perclnt, &clp->cl_sessions);
 	spin_unlock(&client_lock);
 
-	status = nfs_ok;
-out:
-	return status;
-out_free:
-	free_session_slots(new);
-	kfree(new);
-	goto out;
+	status = nfsd4_new_conn(rqstp, new);
+	/* whoops: benny points out, status is ignored! (err, or bogus) */
+	if (status) {
+		free_session(&new->se_ref);
+		return NULL;
+	}
+	if (!clp->cl_cb_session && (cses->flags & SESSION4_BACK_CHAN)) {
+		struct sockaddr *sa = svc_addr(rqstp);
+
+		clp->cl_cb_session = new;
+		clp->cl_cb_conn.cb_xprt = rqstp->rq_xprt;
+		svc_xprt_get(rqstp->rq_xprt);
+		rpc_copy_addr((struct sockaddr *)&clp->cl_cb_conn.cb_addr, sa);
+		clp->cl_cb_conn.cb_addrlen = svc_addr_len(sa);
+		nfsd4_probe_callback(clp);
+	}
+	return new;
 }
 
 /* caller must hold client_lock */
@@ -591,10 +796,8 @@ find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid)
 
 	dump_sessionid(__func__, sessionid);
 	idx = hash_sessionid(sessionid);
-	dprintk("%s: idx is %d\n", __func__, idx);
 	/* Search in the appropriate list */
 	list_for_each_entry(elem, &sessionid_hashtbl[idx], se_hash) {
-		dump_sessionid("list traversal", &elem->se_sessionid);
 		if (!memcmp(elem->se_sessionid.data, sessionid->data,
 			    NFS4_MAX_SESSIONID_LEN)) {
 			return elem;
@@ -613,21 +816,6 @@ unhash_session(struct nfsd4_session *ses)
 	list_del(&ses->se_perclnt);
 }
 
-void
-free_session(struct kref *kref)
-{
-	struct nfsd4_session *ses;
-	int mem;
-
-	ses = container_of(kref, struct nfsd4_session, se_ref);
-	spin_lock(&nfsd_drc_lock);
-	mem = ses->se_fchannel.maxreqs * slot_bytes(&ses->se_fchannel);
-	nfsd_drc_mem_used -= mem;
-	spin_unlock(&nfsd_drc_lock);
-	free_session_slots(ses);
-	kfree(ses);
-}
-
 /* must be called under the client_lock */
 static inline void
 renew_client_locked(struct nfs4_client *clp)
@@ -694,6 +882,13 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
 static inline void
 free_client(struct nfs4_client *clp)
 {
+	while (!list_empty(&clp->cl_sessions)) {
+		struct nfsd4_session *ses;
+		ses = list_entry(clp->cl_sessions.next, struct nfsd4_session,
+				se_perclnt);
+		list_del(&ses->se_perclnt);
+		nfsd4_put_session(ses);
+	}
 	if (clp->cl_cred.cr_group_info)
 		put_group_info(clp->cl_cred.cr_group_info);
 	kfree(clp->cl_principal);
@@ -714,22 +909,18 @@ release_session_client(struct nfsd4_session *session)
 	} else
 		renew_client_locked(clp);
 	spin_unlock(&client_lock);
-	nfsd4_put_session(session);
 }
 
 /* must be called under the client_lock */
 static inline void
 unhash_client_locked(struct nfs4_client *clp)
 {
+	struct nfsd4_session *ses;
+
 	mark_client_expired(clp);
 	list_del(&clp->cl_lru);
-	while (!list_empty(&clp->cl_sessions)) {
-		struct nfsd4_session  *ses;
-		ses = list_entry(clp->cl_sessions.next, struct nfsd4_session,
-				 se_perclnt);
-		unhash_session(ses);
-		nfsd4_put_session(ses);
-	}
+	list_for_each_entry(ses, &clp->cl_sessions, se_perclnt)
+		list_del_init(&ses->se_hash);
 }
 
 static void
@@ -758,7 +949,7 @@ expire_client(struct nfs4_client *clp)
 		sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient);
 		release_openowner(sop);
 	}
-	nfsd4_set_callback_client(clp, NULL);
+	nfsd4_shutdown_callback(clp);
 	if (clp->cl_cb_conn.cb_xprt)
 		svc_xprt_put(clp->cl_cb_conn.cb_xprt);
 	list_del(&clp->cl_idhash);
@@ -843,6 +1034,8 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
 	if (clp == NULL)
 		return NULL;
 
+	INIT_LIST_HEAD(&clp->cl_sessions);
+
 	princ = svc_gss_principal(rqstp);
 	if (princ) {
 		clp->cl_principal = kstrdup(princ, GFP_KERNEL);
@@ -859,8 +1052,9 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
 	INIT_LIST_HEAD(&clp->cl_strhash);
 	INIT_LIST_HEAD(&clp->cl_openowners);
 	INIT_LIST_HEAD(&clp->cl_delegations);
-	INIT_LIST_HEAD(&clp->cl_sessions);
 	INIT_LIST_HEAD(&clp->cl_lru);
+	spin_lock_init(&clp->cl_lock);
+	INIT_WORK(&clp->cl_cb_null.cb_work, nfsd4_do_callback_rpc);
 	clp->cl_time = get_seconds();
 	clear_bit(0, &clp->cl_cb_slot_busy);
 	rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
@@ -869,7 +1063,7 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
 	clp->cl_flavor = rqstp->rq_flavor;
 	copy_cred(&clp->cl_cred, &rqstp->rq_cred);
 	gen_confirm(clp);
-
+	clp->cl_cb_session = NULL;
 	return clp;
 }
 
@@ -981,7 +1175,7 @@ find_unconfirmed_client_by_str(const char *dname, unsigned int hashval,
 static void
 gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, u32 scopeid)
 {
-	struct nfs4_cb_conn *cb = &clp->cl_cb_conn;
+	struct nfs4_cb_conn *conn = &clp->cl_cb_conn;
 	unsigned short expected_family;
 
 	/* Currently, we only support tcp and tcp6 for the callback channel */
@@ -994,24 +1188,23 @@ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, u32 scopeid)
 	else
 		goto out_err;
 
-	cb->cb_addrlen = rpc_uaddr2sockaddr(se->se_callback_addr_val,
+	conn->cb_addrlen = rpc_uaddr2sockaddr(se->se_callback_addr_val,
 					    se->se_callback_addr_len,
-					    (struct sockaddr *) &cb->cb_addr,
-					    sizeof(cb->cb_addr));
+					    (struct sockaddr *)&conn->cb_addr,
+					    sizeof(conn->cb_addr));
 
-	if (!cb->cb_addrlen || cb->cb_addr.ss_family != expected_family)
+	if (!conn->cb_addrlen || conn->cb_addr.ss_family != expected_family)
 		goto out_err;
 
-	if (cb->cb_addr.ss_family == AF_INET6)
-		((struct sockaddr_in6 *) &cb->cb_addr)->sin6_scope_id = scopeid;
+	if (conn->cb_addr.ss_family == AF_INET6)
+		((struct sockaddr_in6 *)&conn->cb_addr)->sin6_scope_id = scopeid;
 
-	cb->cb_minorversion = 0;
-	cb->cb_prog = se->se_callback_prog;
-	cb->cb_ident = se->se_callback_ident;
+	conn->cb_prog = se->se_callback_prog;
+	conn->cb_ident = se->se_callback_ident;
 	return;
 out_err:
-	cb->cb_addr.ss_family = AF_UNSPEC;
-	cb->cb_addrlen = 0;
+	conn->cb_addr.ss_family = AF_UNSPEC;
+	conn->cb_addrlen = 0;
 	dprintk(KERN_INFO "NFSD: this client (clientid %08x/%08x) "
 		"will not receive delegations\n",
 		clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id);
@@ -1220,7 +1413,7 @@ out_new:
 	/* Normal case */
 	new = create_client(exid->clname, dname, rqstp, &verf);
 	if (new == NULL) {
-		status = nfserr_serverfault;
+		status = nfserr_jukebox;
 		goto out;
 	}
 
@@ -1298,7 +1491,9 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 {
 	struct sockaddr *sa = svc_addr(rqstp);
 	struct nfs4_client *conf, *unconf;
+	struct nfsd4_session *new;
 	struct nfsd4_clid_slot *cs_slot = NULL;
+	bool confirm_me = false;
 	int status = 0;
 
 	nfs4_lock_state();
@@ -1321,7 +1516,6 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 				cs_slot->sl_seqid, cr_ses->seqid);
 			goto out;
 		}
-		cs_slot->sl_seqid++;
 	} else if (unconf) {
 		if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) ||
 		    !rpc_cmp_addr(sa, (struct sockaddr *) &unconf->cl_addr)) {
@@ -1334,25 +1528,10 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 		if (status) {
 			/* an unconfirmed replay returns misordered */
 			status = nfserr_seq_misordered;
-			goto out_cache;
+			goto out;
 		}
 
-		cs_slot->sl_seqid++; /* from 0 to 1 */
-		move_to_confirmed(unconf);
-
-		if (cr_ses->flags & SESSION4_BACK_CHAN) {
-			unconf->cl_cb_conn.cb_xprt = rqstp->rq_xprt;
-			svc_xprt_get(rqstp->rq_xprt);
-			rpc_copy_addr(
-				(struct sockaddr *)&unconf->cl_cb_conn.cb_addr,
-				sa);
-			unconf->cl_cb_conn.cb_addrlen = svc_addr_len(sa);
-			unconf->cl_cb_conn.cb_minorversion =
-				cstate->minorversion;
-			unconf->cl_cb_conn.cb_prog = cr_ses->callback_prog;
-			unconf->cl_cb_seq_nr = 1;
-			nfsd4_probe_callback(unconf, &unconf->cl_cb_conn);
-		}
+		confirm_me = true;
 		conf = unconf;
 	} else {
 		status = nfserr_stale_clientid;
@@ -1360,22 +1539,30 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 	}
 
 	/*
+	 * XXX: we should probably set this at creation time, and check
+	 * for consistent minorversion use throughout:
+	 */
+	conf->cl_minorversion = 1;
+	/*
 	 * We do not support RDMA or persistent sessions
 	 */
 	cr_ses->flags &= ~SESSION4_PERSIST;
 	cr_ses->flags &= ~SESSION4_RDMA;
 
-	status = alloc_init_session(rqstp, conf, cr_ses);
-	if (status)
+	status = nfserr_jukebox;
+	new = alloc_init_session(rqstp, conf, cr_ses);
+	if (!new)
 		goto out;
-
-	memcpy(cr_ses->sessionid.data, conf->cl_sessionid.data,
+	status = nfs_ok;
+	memcpy(cr_ses->sessionid.data, new->se_sessionid.data,
 	       NFS4_MAX_SESSIONID_LEN);
+	cs_slot->sl_seqid++;
 	cr_ses->seqid = cs_slot->sl_seqid;
 
-out_cache:
 	/* cache solo and embedded create sessions under the state lock */
 	nfsd4_cache_create_session(cr_ses, cs_slot, status);
+	if (confirm_me)
+		move_to_confirmed(conf);
 out:
 	nfs4_unlock_state();
 	dprintk("%s returns %d\n", __func__, ntohl(status));
@@ -1429,8 +1616,11 @@ nfsd4_destroy_session(struct svc_rqst *r,
 
 	nfs4_lock_state();
 	/* wait for callbacks */
-	nfsd4_set_callback_client(ses->se_client, NULL);
+	nfsd4_shutdown_callback(ses->se_client);
 	nfs4_unlock_state();
+
+	nfsd4_del_conns(ses);
+
 	nfsd4_put_session(ses);
 	status = nfs_ok;
 out:
@@ -1438,6 +1628,36 @@ out:
 	return status;
 }
 
+static struct nfsd4_conn *__nfsd4_find_conn(struct svc_xprt *xpt, struct nfsd4_session *s)
+{
+	struct nfsd4_conn *c;
+
+	list_for_each_entry(c, &s->se_conns, cn_persession) {
+		if (c->cn_xprt == xpt) {
+			return c;
+		}
+	}
+	return NULL;
+}
+
+static void nfsd4_sequence_check_conn(struct nfsd4_conn *new, struct nfsd4_session *ses)
+{
+	struct nfs4_client *clp = ses->se_client;
+	struct nfsd4_conn *c;
+
+	spin_lock(&clp->cl_lock);
+	c = __nfsd4_find_conn(new->cn_xprt, ses);
+	if (c) {
+		spin_unlock(&clp->cl_lock);
+		free_conn(new);
+		return;
+	}
+	__nfsd4_hash_conn(new, ses);
+	spin_unlock(&clp->cl_lock);
+	nfsd4_register_conn(new);
+	return;
+}
+
 __be32
 nfsd4_sequence(struct svc_rqst *rqstp,
 	       struct nfsd4_compound_state *cstate,
@@ -1446,11 +1666,20 @@ nfsd4_sequence(struct svc_rqst *rqstp,
 	struct nfsd4_compoundres *resp = rqstp->rq_resp;
 	struct nfsd4_session *session;
 	struct nfsd4_slot *slot;
+	struct nfsd4_conn *conn;
 	int status;
 
 	if (resp->opcnt != 1)
 		return nfserr_sequence_pos;
 
+	/*
+	 * Will be either used or freed by nfsd4_sequence_check_conn
+	 * below.
+	 */
+	conn = alloc_conn(rqstp, NFS4_CDFC4_FORE);
+	if (!conn)
+		return nfserr_jukebox;
+
 	spin_lock(&client_lock);
 	status = nfserr_badsession;
 	session = find_in_sessionid_hashtbl(&seq->sessionid);
@@ -1482,6 +1711,9 @@ nfsd4_sequence(struct svc_rqst *rqstp,
 	if (status)
 		goto out;
 
+	nfsd4_sequence_check_conn(conn, session);
+	conn = NULL;
+
 	/* Success! bump slot seqid */
 	slot->sl_inuse = true;
 	slot->sl_seqid = seq->seqid;
@@ -1496,6 +1728,7 @@ out:
 		nfsd4_get_session(cstate->session);
 		atomic_inc(&session->se_client->cl_refcount);
 	}
+	kfree(conn);
 	spin_unlock(&client_lock);
 	dprintk("%s: return %d\n", __func__, ntohl(status));
 	return status;
@@ -1630,6 +1863,11 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 			goto out;
 		gen_clid(new);
 	}
+	/*
+	 * XXX: we should probably set this at creation time, and check
+	 * for consistent minorversion use throughout:
+	 */
+	new->cl_minorversion = 0;
 	gen_callback(new, setclid, rpc_get_scope_id(sa));
 	add_to_unconfirmed(new, strhashval);
 	setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot;
@@ -1690,7 +1928,8 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
 			status = nfserr_clid_inuse;
 		else {
 			atomic_set(&conf->cl_cb_set, 0);
-			nfsd4_probe_callback(conf, &unconf->cl_cb_conn);
+			nfsd4_change_callback(conf, &unconf->cl_cb_conn);
+			nfsd4_probe_callback(conf);
 			expire_client(unconf);
 			status = nfs_ok;
 
@@ -1724,7 +1963,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
 			}
 			move_to_confirmed(unconf);
 			conf = unconf;
-			nfsd4_probe_callback(conf, &conf->cl_cb_conn);
+			nfsd4_probe_callback(conf);
 			status = nfs_ok;
 		}
 	} else if ((!conf || (conf && !same_verf(&conf->cl_confirm, &confirm)))
@@ -1760,6 +1999,8 @@ alloc_init_file(struct inode *ino)
 		fp->fi_inode = igrab(ino);
 		fp->fi_id = current_fileid++;
 		fp->fi_had_conflict = false;
+		memset(fp->fi_fds, 0, sizeof(fp->fi_fds));
+		memset(fp->fi_access, 0, sizeof(fp->fi_access));
 		spin_lock(&recall_lock);
 		list_add(&fp->fi_hash, &file_hashtbl[hashval]);
 		spin_unlock(&recall_lock);
@@ -1971,57 +2212,6 @@ static inline int deny_valid(u32 x)
 }
 
 /*
- * We store the NONE, READ, WRITE, and BOTH bits separately in the
- * st_{access,deny}_bmap field of the stateid, in order to track not
- * only what share bits are currently in force, but also what
- * combinations of share bits previous opens have used.  This allows us
- * to enforce the recommendation of rfc 3530 14.2.19 that the server
- * return an error if the client attempt to downgrade to a combination
- * of share bits not explicable by closing some of its previous opens.
- *
- * XXX: This enforcement is actually incomplete, since we don't keep
- * track of access/deny bit combinations; so, e.g., we allow:
- *
- *	OPEN allow read, deny write
- *	OPEN allow both, deny none
- *	DOWNGRADE allow read, deny none
- *
- * which we should reject.
- */
-static void
-set_access(unsigned int *access, unsigned long bmap) {
-	int i;
-
-	*access = 0;
-	for (i = 1; i < 4; i++) {
-		if (test_bit(i, &bmap))
-			*access |= i;
-	}
-}
-
-static void
-set_deny(unsigned int *deny, unsigned long bmap) {
-	int i;
-
-	*deny = 0;
-	for (i = 0; i < 4; i++) {
-		if (test_bit(i, &bmap))
-			*deny |= i ;
-	}
-}
-
-static int
-test_share(struct nfs4_stateid *stp, struct nfsd4_open *open) {
-	unsigned int access, deny;
-
-	set_access(&access, stp->st_access_bmap);
-	set_deny(&deny, stp->st_deny_bmap);
-	if ((access & open->op_share_deny) || (deny & open->op_share_access))
-		return 0;
-	return 1;
-}
-
-/*
  * Called to check deny when READ with all zero stateid or
  * WRITE with all zero or all one stateid
  */
@@ -2052,14 +2242,12 @@ out:
 }
 
 static inline void
-nfs4_file_downgrade(struct file *filp, unsigned int share_access)
+nfs4_file_downgrade(struct nfs4_file *fp, unsigned int share_access)
 {
-	if (share_access & NFS4_SHARE_ACCESS_WRITE) {
-		drop_file_write_access(filp);
-		spin_lock(&filp->f_lock);
-		filp->f_mode = (filp->f_mode | FMODE_READ) & ~FMODE_WRITE;
-		spin_unlock(&filp->f_lock);
-	}
+	if (share_access & NFS4_SHARE_ACCESS_WRITE)
+		nfs4_file_put_access(fp, O_WRONLY);
+	if (share_access & NFS4_SHARE_ACCESS_READ)
+		nfs4_file_put_access(fp, O_RDONLY);
 }
 
 /*
@@ -2122,22 +2310,6 @@ void nfsd_release_deleg_cb(struct file_lock *fl)
 }
 
 /*
- * Set the delegation file_lock back pointer.
- *
- * Called from setlease() with lock_kernel() held.
- */
-static
-void nfsd_copy_lock_deleg_cb(struct file_lock *new, struct file_lock *fl)
-{
-	struct nfs4_delegation *dp = (struct nfs4_delegation *)new->fl_owner;
-
-	dprintk("NFSD: nfsd_copy_lock_deleg_cb: new fl %p dp %p\n", new, dp);
-	if (!dp)
-		return;
-	dp->dl_flock = new;
-}
-
-/*
  * Called from setlease() with lock_kernel() held
  */
 static
@@ -2167,7 +2339,6 @@ int nfsd_change_deleg_cb(struct file_lock **onlist, int arg)
 static const struct lock_manager_operations nfsd_lease_mng_ops = {
 	.fl_break = nfsd_break_deleg_cb,
 	.fl_release_private = nfsd_release_deleg_cb,
-	.fl_copy_lock = nfsd_copy_lock_deleg_cb,
 	.fl_mylease = nfsd_same_client_deleg_cb,
 	.fl_change = nfsd_change_deleg_cb,
 };
@@ -2255,6 +2426,13 @@ find_delegation_file(struct nfs4_file *fp, stateid_t *stid)
 	return NULL;
 }
 
+int share_access_to_flags(u32 share_access)
+{
+	share_access &= ~NFS4_SHARE_WANT_MASK;
+
+	return share_access == NFS4_SHARE_ACCESS_READ ? RD_STATE : WR_STATE;
+}
+
 static __be32
 nfs4_check_deleg(struct nfs4_file *fp, struct nfsd4_open *open,
 		struct nfs4_delegation **dp)
@@ -2265,8 +2443,7 @@ nfs4_check_deleg(struct nfs4_file *fp, struct nfsd4_open *open,
 	*dp = find_delegation_file(fp, &open->op_delegate_stateid);
 	if (*dp == NULL)
 		goto out;
-	flags = open->op_share_access == NFS4_SHARE_ACCESS_READ ?
-						RD_STATE : WR_STATE;
+	flags = share_access_to_flags(open->op_share_access);
 	status = nfs4_check_delegmode(*dp, flags);
 	if (status)
 		*dp = NULL;
@@ -2308,30 +2485,53 @@ nfs4_alloc_stateid(void)
 	return kmem_cache_alloc(stateid_slab, GFP_KERNEL);
 }
 
+static inline int nfs4_access_to_access(u32 nfs4_access)
+{
+	int flags = 0;
+
+	if (nfs4_access & NFS4_SHARE_ACCESS_READ)
+		flags |= NFSD_MAY_READ;
+	if (nfs4_access & NFS4_SHARE_ACCESS_WRITE)
+		flags |= NFSD_MAY_WRITE;
+	return flags;
+}
+
+static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file
+*fp, struct svc_fh *cur_fh, u32 nfs4_access)
+{
+	__be32 status;
+	int oflag = nfs4_access_to_omode(nfs4_access);
+	int access = nfs4_access_to_access(nfs4_access);
+
+	if (!fp->fi_fds[oflag]) {
+		status = nfsd_open(rqstp, cur_fh, S_IFREG, access,
+			&fp->fi_fds[oflag]);
+		if (status == nfserr_dropit)
+			status = nfserr_jukebox;
+		if (status)
+			return status;
+	}
+	nfs4_file_get_access(fp, oflag);
+
+	return nfs_ok;
+}
+
 static __be32
 nfs4_new_open(struct svc_rqst *rqstp, struct nfs4_stateid **stpp,
-		struct nfs4_delegation *dp,
-		struct svc_fh *cur_fh, int flags)
+		struct nfs4_file *fp, struct svc_fh *cur_fh,
+		struct nfsd4_open *open)
 {
 	struct nfs4_stateid *stp;
+	__be32 status;
 
 	stp = nfs4_alloc_stateid();
 	if (stp == NULL)
 		return nfserr_resource;
 
-	if (dp) {
-		get_file(dp->dl_vfs_file);
-		stp->st_vfs_file = dp->dl_vfs_file;
-	} else {
-		__be32 status;
-		status = nfsd_open(rqstp, cur_fh, S_IFREG, flags,
-				&stp->st_vfs_file);
-		if (status) {
-			if (status == nfserr_dropit)
-				status = nfserr_jukebox;
-			kmem_cache_free(stateid_slab, stp);
-			return status;
-		}
+	status = nfs4_get_vfs_file(rqstp, fp, cur_fh, open->op_share_access);
+	if (status) {
+		kmem_cache_free(stateid_slab, stp);
+		return status;
 	}
 	*stpp = stp;
 	return 0;
@@ -2353,35 +2553,28 @@ nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh,
 }
 
 static __be32
-nfs4_upgrade_open(struct svc_rqst *rqstp, struct svc_fh *cur_fh, struct nfs4_stateid *stp, struct nfsd4_open *open)
+nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_stateid *stp, struct nfsd4_open *open)
 {
-	struct file *filp = stp->st_vfs_file;
-	struct inode *inode = filp->f_path.dentry->d_inode;
-	unsigned int share_access, new_writer;
+	u32 op_share_access = open->op_share_access & ~NFS4_SHARE_WANT_MASK;
+	bool new_access;
 	__be32 status;
 
-	set_access(&share_access, stp->st_access_bmap);
-	new_writer = (~share_access) & open->op_share_access
-			& NFS4_SHARE_ACCESS_WRITE;
-
-	if (new_writer) {
-		int err = get_write_access(inode);
-		if (err)
-			return nfserrno(err);
-		err = mnt_want_write(cur_fh->fh_export->ex_path.mnt);
-		if (err)
-			return nfserrno(err);
-		file_take_write(filp);
+	new_access = !test_bit(op_share_access, &stp->st_access_bmap);
+	if (new_access) {
+		status = nfs4_get_vfs_file(rqstp, fp, cur_fh, op_share_access);
+		if (status)
+			return status;
 	}
 	status = nfsd4_truncate(rqstp, cur_fh, open);
 	if (status) {
-		if (new_writer)
-			put_write_access(inode);
+		if (new_access) {
+			int oflag = nfs4_access_to_omode(new_access);
+			nfs4_file_put_access(fp, oflag);
+		}
 		return status;
 	}
 	/* remember the open */
-	filp->f_mode |= open->op_share_access;
-	__set_bit(open->op_share_access, &stp->st_access_bmap);
+	__set_bit(op_share_access, &stp->st_access_bmap);
 	__set_bit(open->op_share_deny, &stp->st_deny_bmap);
 
 	return nfs_ok;
@@ -2404,7 +2597,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
 	struct nfs4_delegation *dp;
 	struct nfs4_stateowner *sop = stp->st_stateowner;
 	int cb_up = atomic_read(&sop->so_client->cl_cb_set);
-	struct file_lock fl, *flp = &fl;
+	struct file_lock *fl;
 	int status, flag = 0;
 
 	flag = NFS4_OPEN_DELEGATE_NONE;
@@ -2438,20 +2631,28 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
 		flag = NFS4_OPEN_DELEGATE_NONE;
 		goto out;
 	}
-	locks_init_lock(&fl);
-	fl.fl_lmops = &nfsd_lease_mng_ops;
-	fl.fl_flags = FL_LEASE;
-	fl.fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK;
-	fl.fl_end = OFFSET_MAX;
-	fl.fl_owner =  (fl_owner_t)dp;
-	fl.fl_file = stp->st_vfs_file;
-	fl.fl_pid = current->tgid;
+	status = -ENOMEM;
+	fl = locks_alloc_lock();
+	if (!fl)
+		goto out;
+	locks_init_lock(fl);
+	fl->fl_lmops = &nfsd_lease_mng_ops;
+	fl->fl_flags = FL_LEASE;
+	fl->fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK;
+	fl->fl_end = OFFSET_MAX;
+	fl->fl_owner =  (fl_owner_t)dp;
+	fl->fl_file = find_readable_file(stp->st_file);
+	BUG_ON(!fl->fl_file);
+	fl->fl_pid = current->tgid;
+	dp->dl_flock = fl;
 
 	/* vfs_setlease checks to see if delegation should be handed out.
 	 * the lock_manager callbacks fl_mylease and fl_change are used
 	 */
-	if ((status = vfs_setlease(stp->st_vfs_file, fl.fl_type, &flp))) {
+	if ((status = vfs_setlease(fl->fl_file, fl->fl_type, &fl))) {
 		dprintk("NFSD: setlease failed [%d], no delegation\n", status);
+		dp->dl_flock = NULL;
+		locks_free_lock(fl);
 		unhash_delegation(dp);
 		flag = NFS4_OPEN_DELEGATE_NONE;
 		goto out;
@@ -2514,18 +2715,12 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
 	 */
 	if (stp) {
 		/* Stateid was found, this is an OPEN upgrade */
-		status = nfs4_upgrade_open(rqstp, current_fh, stp, open);
+		status = nfs4_upgrade_open(rqstp, fp, current_fh, stp, open);
 		if (status)
 			goto out;
 		update_stateid(&stp->st_stateid);
 	} else {
-		/* Stateid was not found, this is a new OPEN */
-		int flags = 0;
-		if (open->op_share_access & NFS4_SHARE_ACCESS_READ)
-			flags |= NFSD_MAY_READ;
-		if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE)
-			flags |= NFSD_MAY_WRITE;
-		status = nfs4_new_open(rqstp, &stp, dp, current_fh, flags);
+		status = nfs4_new_open(rqstp, &stp, fp, current_fh, open);
 		if (status)
 			goto out;
 		init_stateid(stp, fp, open);
@@ -2727,7 +2922,7 @@ search_close_lru(u32 st_id, int flags)
 static inline int
 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stateid *stp)
 {
-	return fhp->fh_dentry->d_inode != stp->st_vfs_file->f_path.dentry->d_inode;
+	return fhp->fh_dentry->d_inode != stp->st_file->fi_inode;
 }
 
 static int
@@ -2760,6 +2955,9 @@ __be32 nfs4_check_openmode(struct nfs4_stateid *stp, int flags)
 {
         __be32 status = nfserr_openmode;
 
+	/* For lock stateid's, we test the parent open, not the lock: */
+	if (stp->st_openstp)
+		stp = stp->st_openstp;
 	if ((flags & WR_STATE) && (!access_permit_write(stp->st_access_bmap)))
                 goto out;
 	if ((flags & RD_STATE) && (!access_permit_read(stp->st_access_bmap)))
@@ -2858,7 +3056,11 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
 	if (STALE_STATEID(stateid)) 
 		goto out;
 
-	status = nfserr_bad_stateid;
+	/*
+	 * We assume that any stateid that has the current boot time,
+	 * but that we can't find, is expired:
+	 */
+	status = nfserr_expired;
 	if (is_delegation_stateid(stateid)) {
 		dp = find_delegation_stateid(ino, stateid);
 		if (!dp)
@@ -2872,11 +3074,13 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
 			goto out;
 		renew_client(dp->dl_client);
 		if (filpp)
-			*filpp = dp->dl_vfs_file;
+			*filpp = find_readable_file(dp->dl_file);
+		BUG_ON(!*filpp);
 	} else { /* open or lock stateid */
 		stp = find_stateid(stateid, flags);
 		if (!stp)
 			goto out;
+		status = nfserr_bad_stateid;
 		if (nfs4_check_fh(current_fh, stp))
 			goto out;
 		if (!stp->st_stateowner->so_confirmed)
@@ -2889,8 +3093,12 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
 		if (status)
 			goto out;
 		renew_client(stp->st_stateowner->so_client);
-		if (filpp)
-			*filpp = stp->st_vfs_file;
+		if (filpp) {
+			if (flags & RD_STATE)
+				*filpp = find_readable_file(stp->st_file);
+			else
+				*filpp = find_writeable_file(stp->st_file);
+		}
 	}
 	status = nfs_ok;
 out:
@@ -2947,8 +3155,9 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
 		 * a replayed close:
 		 */
 		sop = search_close_lru(stateid->si_stateownerid, flags);
+		/* It's not stale; let's assume it's expired: */
 		if (sop == NULL)
-			return nfserr_bad_stateid;
+			return nfserr_expired;
 		*sopp = sop;
 		goto check_replay;
 	}
@@ -3126,8 +3335,7 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
 		goto out;
 	}
 	set_access(&share_access, stp->st_access_bmap);
-	nfs4_file_downgrade(stp->st_vfs_file,
-	                    share_access & ~od->od_share_access);
+	nfs4_file_downgrade(stp->st_file, share_access & ~od->od_share_access);
 
 	reset_union_bmap_access(od->od_share_access, &stp->st_access_bmap);
 	reset_union_bmap_deny(od->od_share_deny, &stp->st_deny_bmap);
@@ -3214,6 +3422,7 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	status = nfserr_bad_stateid;
 	if (!is_delegation_stateid(stateid))
 		goto out;
+	status = nfserr_expired;
 	dp = find_delegation_stateid(inode, stateid);
 	if (!dp)
 		goto out;
@@ -3346,11 +3555,9 @@ static inline void
 nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny)
 {
 	struct nfs4_stateowner *sop;
-	unsigned int hval;
 
 	if (fl->fl_lmops == &nfsd_posix_mng_ops) {
 		sop = (struct nfs4_stateowner *) fl->fl_owner;
-		hval = lockownerid_hashval(sop->so_id);
 		kref_get(&sop->so_ref);
 		deny->ld_sop = sop;
 		deny->ld_clientid = sop->so_client->cl_clientid;
@@ -3446,8 +3653,6 @@ alloc_init_lock_stateid(struct nfs4_stateowner *sop, struct nfs4_file *fp, struc
 	stp->st_stateid.si_stateownerid = sop->so_id;
 	stp->st_stateid.si_fileid = fp->fi_id;
 	stp->st_stateid.si_generation = 0;
-	stp->st_vfs_file = open_stp->st_vfs_file; /* FIXME refcount?? */
-	stp->st_access_bmap = open_stp->st_access_bmap;
 	stp->st_deny_bmap = open_stp->st_deny_bmap;
 	stp->st_openstp = open_stp;
 
@@ -3472,7 +3677,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	struct nfs4_stateowner *open_sop = NULL;
 	struct nfs4_stateowner *lock_sop = NULL;
 	struct nfs4_stateid *lock_stp;
-	struct file *filp;
+	struct nfs4_file *fp;
+	struct file *filp = NULL;
 	struct file_lock file_lock;
 	struct file_lock conflock;
 	__be32 status = 0;
@@ -3502,7 +3708,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		 * lock stateid.
 		 */
 		struct nfs4_stateid *open_stp = NULL;
-		struct nfs4_file *fp;
 		
 		status = nfserr_stale_clientid;
 		if (!nfsd4_has_session(cstate) &&
@@ -3545,9 +3750,9 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		if (status)
 			goto out;
 		lock_sop = lock->lk_replay_owner;
+		fp = lock_stp->st_file;
 	}
 	/* lock->lk_replay_owner and lock_stp have been created or found */
-	filp = lock_stp->st_vfs_file;
 
 	status = nfserr_grace;
 	if (locks_in_grace() && !lock->lk_reclaim)
@@ -3560,11 +3765,19 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	switch (lock->lk_type) {
 		case NFS4_READ_LT:
 		case NFS4_READW_LT:
+			if (find_readable_file(lock_stp->st_file)) {
+				nfs4_get_vfs_file(rqstp, fp, &cstate->current_fh, NFS4_SHARE_ACCESS_READ);
+				filp = find_readable_file(lock_stp->st_file);
+			}
 			file_lock.fl_type = F_RDLCK;
 			cmd = F_SETLK;
 		break;
 		case NFS4_WRITE_LT:
 		case NFS4_WRITEW_LT:
+			if (find_writeable_file(lock_stp->st_file)) {
+				nfs4_get_vfs_file(rqstp, fp, &cstate->current_fh, NFS4_SHARE_ACCESS_WRITE);
+				filp = find_writeable_file(lock_stp->st_file);
+			}
 			file_lock.fl_type = F_WRLCK;
 			cmd = F_SETLK;
 		break;
@@ -3572,6 +3785,10 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 			status = nfserr_inval;
 		goto out;
 	}
+	if (!filp) {
+		status = nfserr_openmode;
+		goto out;
+	}
 	file_lock.fl_owner = (fl_owner_t)lock_sop;
 	file_lock.fl_pid = current->tgid;
 	file_lock.fl_file = filp;
@@ -3740,7 +3957,11 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 					&locku->lu_stateowner, &stp, NULL)))
 		goto out;
 
-	filp = stp->st_vfs_file;
+	filp = find_any_file(stp->st_file);
+	if (!filp) {
+		status = nfserr_lock_range;
+		goto out;
+	}
 	BUG_ON(!filp);
 	locks_init_lock(&file_lock);
 	file_lock.fl_type = F_UNLCK;
@@ -3787,13 +4008,13 @@ out_nfserr:
  * 	0: no locks held by lockowner
  */
 static int
-check_for_locks(struct file *filp, struct nfs4_stateowner *lowner)
+check_for_locks(struct nfs4_file *filp, struct nfs4_stateowner *lowner)
 {
 	struct file_lock **flpp;
-	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct inode *inode = filp->fi_inode;
 	int status = 0;
 
-	lock_kernel();
+	lock_flocks();
 	for (flpp = &inode->i_flock; *flpp != NULL; flpp = &(*flpp)->fl_next) {
 		if ((*flpp)->fl_owner == (fl_owner_t)lowner) {
 			status = 1;
@@ -3801,7 +4022,7 @@ check_for_locks(struct file *filp, struct nfs4_stateowner *lowner)
 		}
 	}
 out:
-	unlock_kernel();
+	unlock_flocks();
 	return status;
 }
 
@@ -3841,7 +4062,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
 				continue;
 			list_for_each_entry(stp, &sop->so_stateids,
 					st_perstateowner) {
-				if (check_for_locks(stp->st_vfs_file, sop))
+				if (check_for_locks(stp->st_file, sop))
 					goto out;
 				/* Note: so_perclient unused for lockowners,
 				 * so it's OK to fool with here. */
@@ -4066,16 +4287,8 @@ out_free_laundry:
 int
 nfs4_state_start(void)
 {
-	int ret;
-
-	if (nfs4_init)
-		return 0;
 	nfsd4_load_reboot_recovery_data();
-	ret = __nfs4_state_start();
-	if (ret)
-		return ret;
-	nfs4_init = 1;
-	return 0;
+	return __nfs4_state_start();
 }
 
 static void
@@ -4110,7 +4323,6 @@ __nfs4_state_shutdown(void)
 	}
 
 	nfsd4_shutdown_recdir();
-	nfs4_init = 0;
 }
 
 void
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index ac17a708023..f35a94a0402 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1756,6 +1756,10 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
 	struct nfs4_acl *acl = NULL;
 	struct nfsd4_compoundres *resp = rqstp->rq_resp;
 	u32 minorversion = resp->cstate.minorversion;
+	struct path path = {
+		.mnt	= exp->ex_path.mnt,
+		.dentry	= dentry,
+	};
 
 	BUG_ON(bmval1 & NFSD_WRITEONLY_ATTRS_WORD1);
 	BUG_ON(bmval0 & ~nfsd_suppattrs0(minorversion));
@@ -1776,7 +1780,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
 			FATTR4_WORD0_MAXNAME)) ||
 	    (bmval1 & (FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE |
 		       FATTR4_WORD1_SPACE_TOTAL))) {
-		err = vfs_statfs(dentry, &statfs);
+		err = vfs_statfs(&path, &statfs);
 		if (err)
 			goto out_nfserr;
 	}
@@ -1801,19 +1805,23 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
 				goto out_nfserr;
 		}
 	}
-	if ((buflen -= 16) < 0)
-		goto out_resource;
 
-	if (unlikely(bmval2)) {
+	if (bmval2) {
+		if ((buflen -= 16) < 0)
+			goto out_resource;
 		WRITE32(3);
 		WRITE32(bmval0);
 		WRITE32(bmval1);
 		WRITE32(bmval2);
-	} else if (likely(bmval1)) {
+	} else if (bmval1) {
+		if ((buflen -= 12) < 0)
+			goto out_resource;
 		WRITE32(2);
 		WRITE32(bmval0);
 		WRITE32(bmval1);
 	} else {
+		if ((buflen -= 8) < 0)
+			goto out_resource;
 		WRITE32(1);
 		WRITE32(bmval0);
 	}
@@ -1824,15 +1832,17 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
 		u32 word1 = nfsd_suppattrs1(minorversion);
 		u32 word2 = nfsd_suppattrs2(minorversion);
 
-		if ((buflen -= 12) < 0)
-			goto out_resource;
 		if (!aclsupport)
 			word0 &= ~FATTR4_WORD0_ACL;
 		if (!word2) {
+			if ((buflen -= 12) < 0)
+				goto out_resource;
 			WRITE32(2);
 			WRITE32(word0);
 			WRITE32(word1);
 		} else {
+			if ((buflen -= 16) < 0)
+				goto out_resource;
 			WRITE32(3);
 			WRITE32(word0);
 			WRITE32(word1);
@@ -2630,7 +2640,7 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
 	}
 	read->rd_vlen = v;
 
-	nfserr = nfsd_read(read->rd_rqstp, read->rd_fhp, read->rd_filp,
+	nfserr = nfsd_read_file(read->rd_rqstp, read->rd_fhp, read->rd_filp,
 			read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen,
 			&maxcount);
 
@@ -3325,6 +3335,7 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo
 		}
 		/* Renew the clientid on success and on replay */
 		release_session_client(cs->session);
+		nfsd4_put_session(cs->session);
 	}
 	return 1;
 }
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 508941c23af..4514ebbee4d 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -22,6 +22,7 @@
  */
 enum {
 	NFSD_Root = 1,
+#ifdef CONFIG_NFSD_DEPRECATED
 	NFSD_Svc,
 	NFSD_Add,
 	NFSD_Del,
@@ -29,6 +30,7 @@ enum {
 	NFSD_Unexport,
 	NFSD_Getfd,
 	NFSD_Getfs,
+#endif
 	NFSD_List,
 	NFSD_Export_features,
 	NFSD_Fh,
@@ -54,6 +56,7 @@ enum {
 /*
  * write() for these nodes.
  */
+#ifdef CONFIG_NFSD_DEPRECATED
 static ssize_t write_svc(struct file *file, char *buf, size_t size);
 static ssize_t write_add(struct file *file, char *buf, size_t size);
 static ssize_t write_del(struct file *file, char *buf, size_t size);
@@ -61,6 +64,7 @@ static ssize_t write_export(struct file *file, char *buf, size_t size);
 static ssize_t write_unexport(struct file *file, char *buf, size_t size);
 static ssize_t write_getfd(struct file *file, char *buf, size_t size);
 static ssize_t write_getfs(struct file *file, char *buf, size_t size);
+#endif
 static ssize_t write_filehandle(struct file *file, char *buf, size_t size);
 static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size);
 static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size);
@@ -76,6 +80,7 @@ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size);
 #endif
 
 static ssize_t (*write_op[])(struct file *, char *, size_t) = {
+#ifdef CONFIG_NFSD_DEPRECATED
 	[NFSD_Svc] = write_svc,
 	[NFSD_Add] = write_add,
 	[NFSD_Del] = write_del,
@@ -83,6 +88,7 @@ static ssize_t (*write_op[])(struct file *, char *, size_t) = {
 	[NFSD_Unexport] = write_unexport,
 	[NFSD_Getfd] = write_getfd,
 	[NFSD_Getfs] = write_getfs,
+#endif
 	[NFSD_Fh] = write_filehandle,
 	[NFSD_FO_UnlockIP] = write_unlock_ip,
 	[NFSD_FO_UnlockFS] = write_unlock_fs,
@@ -121,6 +127,14 @@ static ssize_t nfsctl_transaction_write(struct file *file, const char __user *bu
 
 static ssize_t nfsctl_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos)
 {
+	static int warned;
+	if (file->f_dentry->d_name.name[0] == '.' && !warned) {
+		printk(KERN_INFO
+		       "Warning: \"%s\" uses deprecated NFSD interface: %s."
+		       "  This will be removed in 2.6.40\n",
+		       current->comm, file->f_dentry->d_name.name);
+		warned = 1;
+	}
 	if (! file->private_data) {
 		/* An attempt to read a transaction file without writing
 		 * causes a 0-byte write so that the file can return
@@ -137,6 +151,7 @@ static const struct file_operations transaction_ops = {
 	.write		= nfsctl_transaction_write,
 	.read		= nfsctl_transaction_read,
 	.release	= simple_transaction_release,
+	.llseek		= default_llseek,
 };
 
 static int exports_open(struct inode *inode, struct file *file)
@@ -186,6 +201,7 @@ static const struct file_operations pool_stats_operations = {
  * payload - write methods
  */
 
+#ifdef CONFIG_NFSD_DEPRECATED
 /**
  * write_svc - Start kernel's NFSD server
  *
@@ -401,7 +417,7 @@ static ssize_t write_getfs(struct file *file, char *buf, size_t size)
 
 	ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &in6);
 
-	clp = auth_unix_lookup(&in6);
+	clp = auth_unix_lookup(&init_net, &in6);
 	if (!clp)
 		err = -EPERM;
 	else {
@@ -464,7 +480,7 @@ static ssize_t write_getfd(struct file *file, char *buf, size_t size)
 
 	ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &in6);
 
-	clp = auth_unix_lookup(&in6);
+	clp = auth_unix_lookup(&init_net, &in6);
 	if (!clp)
 		err = -EPERM;
 	else {
@@ -481,6 +497,7 @@ static ssize_t write_getfd(struct file *file, char *buf, size_t size)
  out:
 	return err;
 }
+#endif /* CONFIG_NFSD_DEPRECATED */
 
 /**
  * write_unlock_ip - Release all locks used by a client
@@ -949,15 +966,12 @@ static ssize_t __write_ports_addfd(char *buf)
 	if (err != 0)
 		return err;
 
-	err = lockd_up();
-	if (err != 0)
-		goto out;
-
 	err = svc_addsock(nfsd_serv, fd, buf, SIMPLE_TRANSACTION_LIMIT);
-	if (err < 0)
-		lockd_down();
+	if (err < 0) {
+		svc_destroy(nfsd_serv);
+		return err;
+	}
 
-out:
 	/* Decrease the count, but don't shut down the service */
 	nfsd_serv->sv_nrthreads--;
 	return err;
@@ -978,9 +992,6 @@ static ssize_t __write_ports_delfd(char *buf)
 	if (nfsd_serv != NULL)
 		len = svc_sock_names(nfsd_serv, buf,
 					SIMPLE_TRANSACTION_LIMIT, toclose);
-	if (len >= 0)
-		lockd_down();
-
 	kfree(toclose);
 	return len;
 }
@@ -1005,15 +1016,18 @@ static ssize_t __write_ports_addxprt(char *buf)
 	if (err != 0)
 		return err;
 
-	err = svc_create_xprt(nfsd_serv, transport,
+	err = svc_create_xprt(nfsd_serv, transport, &init_net,
 				PF_INET, port, SVC_SOCK_ANONYMOUS);
 	if (err < 0)
 		goto out_err;
 
-	err = svc_create_xprt(nfsd_serv, transport,
+	err = svc_create_xprt(nfsd_serv, transport, &init_net,
 				PF_INET6, port, SVC_SOCK_ANONYMOUS);
 	if (err < 0 && err != -EAFNOSUPPORT)
 		goto out_close;
+
+	/* Decrease the count, but don't shut down the service */
+	nfsd_serv->sv_nrthreads--;
 	return 0;
 out_close:
 	xprt = svc_find_xprt(nfsd_serv, transport, PF_INET, port);
@@ -1022,8 +1036,7 @@ out_close:
 		svc_xprt_put(xprt);
 	}
 out_err:
-	/* Decrease the count, but don't shut down the service */
-	nfsd_serv->sv_nrthreads--;
+	svc_destroy(nfsd_serv);
 	return err;
 }
 
@@ -1194,7 +1207,7 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
 			bsize = NFSSVC_MAXBLKSIZE;
 		bsize &= ~(1024-1);
 		mutex_lock(&nfsd_mutex);
-		if (nfsd_serv && nfsd_serv->sv_nrthreads) {
+		if (nfsd_serv) {
 			mutex_unlock(&nfsd_mutex);
 			return -EBUSY;
 		}
@@ -1310,6 +1323,8 @@ static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size)
 			return -EINVAL;
 
 		status = nfs4_reset_recoverydir(recdir);
+		if (status)
+			return status;
 	}
 
 	return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%s\n",
@@ -1357,6 +1372,7 @@ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size)
 static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
 {
 	static struct tree_descr nfsd_files[] = {
+#ifdef CONFIG_NFSD_DEPRECATED
 		[NFSD_Svc] = {".svc", &transaction_ops, S_IWUSR},
 		[NFSD_Add] = {".add", &transaction_ops, S_IWUSR},
 		[NFSD_Del] = {".del", &transaction_ops, S_IWUSR},
@@ -1364,6 +1380,7 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
 		[NFSD_Unexport] = {".unexport", &transaction_ops, S_IWUSR},
 		[NFSD_Getfd] = {".getfd", &transaction_ops, S_IWUSR|S_IRUSR},
 		[NFSD_Getfs] = {".getfs", &transaction_ops, S_IWUSR|S_IRUSR},
+#endif
 		[NFSD_List] = {"exports", &exports_operations, S_IRUGO},
 		[NFSD_Export_features] = {"export_features",
 					&export_features_operations, S_IRUGO},
@@ -1388,16 +1405,16 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
 	return simple_fill_super(sb, 0x6e667364, nfsd_files);
 }
 
-static int nfsd_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *nfsd_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_single(fs_type, flags, data, nfsd_fill_super, mnt);
+	return mount_single(fs_type, flags, data, nfsd_fill_super);
 }
 
 static struct file_system_type nfsd_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "nfsd",
-	.get_sb		= nfsd_get_sb,
+	.mount		= nfsd_mount,
 	.kill_sb	= kill_litter_super,
 };
 
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 72377761270..6b641cf2c19 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -153,6 +153,7 @@ void		nfsd_lockd_shutdown(void);
 #define nfserr_bad_seqid	cpu_to_be32(NFSERR_BAD_SEQID)
 #define	nfserr_symlink		cpu_to_be32(NFSERR_SYMLINK)
 #define	nfserr_not_same		cpu_to_be32(NFSERR_NOT_SAME)
+#define nfserr_lock_range	cpu_to_be32(NFSERR_LOCK_RANGE)
 #define	nfserr_restorefh	cpu_to_be32(NFSERR_RESTOREFH)
 #define	nfserr_attrnotsupp	cpu_to_be32(NFSERR_ATTRNOTSUPP)
 #define	nfserr_bad_xdr		cpu_to_be32(NFSERR_BAD_XDR)
@@ -248,7 +249,7 @@ extern time_t nfsd4_grace;
 #define	COMPOUND_SLACK_SPACE		140    /* OP_GETFH */
 #define COMPOUND_ERR_SLACK_SPACE	12     /* OP_SETATTR */
 
-#define NFSD_LAUNDROMAT_MINTIMEOUT      10   /* seconds */
+#define NFSD_LAUNDROMAT_MINTIMEOUT      1   /* seconds */
 
 /*
  * The following attributes are currently not supported by the NFSv4 server:
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index cdfb8c6a420..c16f8d8331b 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -196,8 +196,6 @@ fh_lock(struct svc_fh *fhp)
 static inline void
 fh_unlock(struct svc_fh *fhp)
 {
-	BUG_ON(!fhp->fh_dentry);
-
 	if (fhp->fh_locked) {
 		fill_post_wcc(fhp);
 		mutex_unlock(&fhp->fh_dentry->d_inode->i_mutex);
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index a047ad6111e..08e17264784 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -144,7 +144,7 @@ nfsd_proc_read(struct svc_rqst *rqstp, struct nfsd_readargs *argp,
 	svc_reserve_auth(rqstp, (19<<2) + argp->count + 4);
 
 	resp->count = argp->count;
-	nfserr = nfsd_read(rqstp, fh_copy(&resp->fh, &argp->fh), NULL,
+	nfserr = nfsd_read(rqstp, fh_copy(&resp->fh, &argp->fh),
 				  argp->offset,
 			   	  rqstp->rq_vec, argp->vlen,
 				  &resp->count);
@@ -290,7 +290,6 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp,
 	 * gospel of sun micro
 	 */
 	if (type != S_IFREG) {
-		int	is_borc = 0;
 		if (type != S_IFBLK && type != S_IFCHR) {
 			rdev = 0;
 		} else if (type == S_IFCHR && !(attr->ia_valid & ATTR_SIZE)) {
@@ -298,7 +297,6 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp,
 			type = S_IFIFO;
 		} else {
 			/* Okay, char or block special */
-			is_borc = 1;
 			if (!rdev)
 				rdev = wanted;
 		}
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 06b2a26edfe..2bae1d86f5f 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -16,6 +16,7 @@
 #include <linux/lockd/bind.h>
 #include <linux/nfsacl.h>
 #include <linux/seq_file.h>
+#include <net/net_namespace.h>
 #include "nfsd.h"
 #include "cache.h"
 #include "vfs.h"
@@ -180,15 +181,80 @@ int nfsd_nrthreads(void)
 	return rv;
 }
 
+static int nfsd_init_socks(int port)
+{
+	int error;
+	if (!list_empty(&nfsd_serv->sv_permsocks))
+		return 0;
+
+	error = svc_create_xprt(nfsd_serv, "udp", &init_net, PF_INET, port,
+					SVC_SOCK_DEFAULTS);
+	if (error < 0)
+		return error;
+
+	error = svc_create_xprt(nfsd_serv, "tcp", &init_net, PF_INET, port,
+					SVC_SOCK_DEFAULTS);
+	if (error < 0)
+		return error;
+
+	return 0;
+}
+
+static bool nfsd_up = false;
+
+static int nfsd_startup(unsigned short port, int nrservs)
+{
+	int ret;
+
+	if (nfsd_up)
+		return 0;
+	/*
+	 * Readahead param cache - will no-op if it already exists.
+	 * (Note therefore results will be suboptimal if number of
+	 * threads is modified after nfsd start.)
+	 */
+	ret = nfsd_racache_init(2*nrservs);
+	if (ret)
+		return ret;
+	ret = nfsd_init_socks(port);
+	if (ret)
+		goto out_racache;
+	ret = lockd_up();
+	if (ret)
+		goto out_racache;
+	ret = nfs4_state_start();
+	if (ret)
+		goto out_lockd;
+	nfsd_up = true;
+	return 0;
+out_lockd:
+	lockd_down();
+out_racache:
+	nfsd_racache_shutdown();
+	return ret;
+}
+
+static void nfsd_shutdown(void)
+{
+	/*
+	 * write_ports can create the server without actually starting
+	 * any threads--if we get shut down before any threads are
+	 * started, then nfsd_last_thread will be run before any of this
+	 * other initialization has been done.
+	 */
+	if (!nfsd_up)
+		return;
+	nfs4_state_shutdown();
+	lockd_down();
+	nfsd_racache_shutdown();
+	nfsd_up = false;
+}
+
 static void nfsd_last_thread(struct svc_serv *serv)
 {
 	/* When last nfsd thread exits we need to do some clean-up */
-	struct svc_xprt *xprt;
-	list_for_each_entry(xprt, &serv->sv_permsocks, xpt_list)
-		lockd_down();
 	nfsd_serv = NULL;
-	nfsd_racache_shutdown();
-	nfs4_state_shutdown();
+	nfsd_shutdown();
 
 	printk(KERN_WARNING "nfsd: last server has exited, flushing export "
 			    "cache\n");
@@ -263,45 +329,18 @@ int nfsd_create_serv(void)
 		       nfsd_max_blksize >= 8*1024*2)
 			nfsd_max_blksize /= 2;
 	}
+	nfsd_reset_versions();
 
 	nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize,
 				      nfsd_last_thread, nfsd, THIS_MODULE);
 	if (nfsd_serv == NULL)
-		err = -ENOMEM;
-	else
-		set_max_drc();
+		return -ENOMEM;
 
+	set_max_drc();
 	do_gettimeofday(&nfssvc_boot);		/* record boot time */
 	return err;
 }
 
-static int nfsd_init_socks(int port)
-{
-	int error;
-	if (!list_empty(&nfsd_serv->sv_permsocks))
-		return 0;
-
-	error = svc_create_xprt(nfsd_serv, "udp", PF_INET, port,
-					SVC_SOCK_DEFAULTS);
-	if (error < 0)
-		return error;
-
-	error = lockd_up();
-	if (error < 0)
-		return error;
-
-	error = svc_create_xprt(nfsd_serv, "tcp", PF_INET, port,
-					SVC_SOCK_DEFAULTS);
-	if (error < 0)
-		return error;
-
-	error = lockd_up();
-	if (error < 0)
-		return error;
-
-	return 0;
-}
-
 int nfsd_nrpools(void)
 {
 	if (nfsd_serv == NULL)
@@ -376,10 +415,16 @@ int nfsd_set_nrthreads(int n, int *nthreads)
 	return err;
 }
 
+/*
+ * Adjust the number of threads and return the new number of threads.
+ * This is also the function that starts the server if necessary, if
+ * this is the first time nrservs is nonzero.
+ */
 int
 nfsd_svc(unsigned short port, int nrservs)
 {
 	int	error;
+	bool	nfsd_up_before;
 
 	mutex_lock(&nfsd_mutex);
 	dprintk("nfsd: creating service\n");
@@ -391,34 +436,29 @@ nfsd_svc(unsigned short port, int nrservs)
 	if (nrservs == 0 && nfsd_serv == NULL)
 		goto out;
 
-	/* Readahead param cache - will no-op if it already exists */
-	error =	nfsd_racache_init(2*nrservs);
-	if (error<0)
-		goto out;
-	error = nfs4_state_start();
+	error = nfsd_create_serv();
 	if (error)
 		goto out;
 
-	nfsd_reset_versions();
-
-	error = nfsd_create_serv();
+	nfsd_up_before = nfsd_up;
 
+	error = nfsd_startup(port, nrservs);
 	if (error)
-		goto out;
-	error = nfsd_init_socks(port);
-	if (error)
-		goto failure;
-
+		goto out_destroy;
 	error = svc_set_num_threads(nfsd_serv, NULL, nrservs);
-	if (error == 0)
-		/* We are holding a reference to nfsd_serv which
-		 * we don't want to count in the return value,
-		 * so subtract 1
-		 */
-		error = nfsd_serv->sv_nrthreads - 1;
- failure:
+	if (error)
+		goto out_shutdown;
+	/* We are holding a reference to nfsd_serv which
+	 * we don't want to count in the return value,
+	 * so subtract 1
+	 */
+	error = nfsd_serv->sv_nrthreads - 1;
+out_shutdown:
+	if (error < 0 && !nfsd_up_before)
+		nfsd_shutdown();
+out_destroy:
 	svc_destroy(nfsd_serv);		/* Release server */
- out:
+out:
 	mutex_unlock(&nfsd_mutex);
 	return error;
 }
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 006c84230c7..39adc27b068 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -35,6 +35,7 @@
 #ifndef _NFSD4_STATE_H
 #define _NFSD4_STATE_H
 
+#include <linux/sunrpc/svc_xprt.h>
 #include <linux/nfsd/nfsfh.h>
 #include "nfsfh.h"
 
@@ -64,19 +65,12 @@ typedef struct {
 	(s)->si_fileid, \
 	(s)->si_generation
 
-struct nfsd4_cb_sequence {
-	/* args/res */
-	u32			cbs_minorversion;
-	struct nfs4_client	*cbs_clp;
-};
-
-struct nfs4_rpc_args {
-	void				*args_op;
-	struct nfsd4_cb_sequence	args_seq;
-};
-
 struct nfsd4_callback {
-	struct nfs4_rpc_args cb_args;
+	void *cb_op;
+	struct nfs4_client *cb_clp;
+	u32 cb_minorversion;
+	struct rpc_message cb_msg;
+	const struct rpc_call_ops *cb_ops;
 	struct work_struct cb_work;
 };
 
@@ -88,11 +82,9 @@ struct nfs4_delegation {
 	struct nfs4_client	*dl_client;
 	struct nfs4_file	*dl_file;
 	struct file_lock	*dl_flock;
-	struct file		*dl_vfs_file;
 	u32			dl_type;
 	time_t			dl_time;
 /* For recall: */
-	u32			dl_ident;
 	stateid_t		dl_stateid;
 	struct knfsd_fh		dl_fh;
 	int			dl_retries;
@@ -104,8 +96,8 @@ struct nfs4_cb_conn {
 	/* SETCLIENTID info */
 	struct sockaddr_storage	cb_addr;
 	size_t			cb_addrlen;
-	u32                     cb_prog;
-	u32			cb_minorversion;
+	u32                     cb_prog; /* used only in 4.0 case;
+					    per-session otherwise */
 	u32                     cb_ident;	/* minorversion 0 only */
 	struct svc_xprt		*cb_xprt;	/* minorversion 1 only */
 };
@@ -161,6 +153,15 @@ struct nfsd4_clid_slot {
 	struct nfsd4_create_session	sl_cr_ses;
 };
 
+struct nfsd4_conn {
+	struct list_head cn_persession;
+	struct svc_xprt *cn_xprt;
+	struct svc_xpt_user cn_xpt_user;
+	struct nfsd4_session *cn_session;
+/* CDFC4_FORE, CDFC4_BACK: */
+	unsigned char cn_flags;
+};
+
 struct nfsd4_session {
 	struct kref		se_ref;
 	struct list_head	se_hash;	/* hash by sessionid */
@@ -170,6 +171,9 @@ struct nfsd4_session {
 	struct nfs4_sessionid	se_sessionid;
 	struct nfsd4_channel_attrs se_fchannel;
 	struct nfsd4_channel_attrs se_bchannel;
+	struct list_head	se_conns;
+	u32			se_cb_prog;
+	u32			se_cb_seq_nr;
 	struct nfsd4_slot	*se_slots[];	/* forward channel slots */
 };
 
@@ -222,24 +226,32 @@ struct nfs4_client {
 	clientid_t		cl_clientid;	/* generated by server */
 	nfs4_verifier		cl_confirm;	/* generated by server */
 	u32			cl_firststate;	/* recovery dir creation */
+	u32			cl_minorversion;
 
 	/* for v4.0 and v4.1 callbacks: */
 	struct nfs4_cb_conn	cl_cb_conn;
+#define NFSD4_CLIENT_CB_UPDATE	1
+#define NFSD4_CLIENT_KILL	2
+	unsigned long		cl_cb_flags;
 	struct rpc_clnt		*cl_cb_client;
+	u32			cl_cb_ident;
 	atomic_t		cl_cb_set;
+	struct nfsd4_callback	cl_cb_null;
+	struct nfsd4_session	*cl_cb_session;
+
+	/* for all client information that callback code might need: */
+	spinlock_t		cl_lock;
 
 	/* for nfs41 */
 	struct list_head	cl_sessions;
 	struct nfsd4_clid_slot	cl_cs_slot;	/* create_session slot */
 	u32			cl_exchange_flags;
-	struct nfs4_sessionid	cl_sessionid;
 	/* number of rpc's in progress over an associated session: */
 	atomic_t		cl_refcount;
 
 	/* for nfs41 callbacks */
 	/* We currently support a single back channel with a single slot */
 	unsigned long		cl_cb_slot_busy;
-	u32			cl_cb_seq_nr;
 	struct rpc_wait_queue	cl_cb_waitq;	/* backchannel callers may */
 						/* wait here for slots */
 };
@@ -342,12 +354,50 @@ struct nfs4_file {
 	struct list_head        fi_hash;    /* hash by "struct inode *" */
 	struct list_head        fi_stateids;
 	struct list_head	fi_delegations;
+	/* One each for O_RDONLY, O_WRONLY, O_RDWR: */
+	struct file *		fi_fds[3];
+	/* One each for O_RDONLY, O_WRONLY: */
+	atomic_t		fi_access[2];
+	/*
+	 * Each open stateid contributes 1 to either fi_readers or
+	 * fi_writers, or both, depending on the open mode.  A
+	 * delegation also takes an fi_readers reference.  Lock
+	 * stateid's take none.
+	 */
+	atomic_t		fi_readers;
+	atomic_t		fi_writers;
 	struct inode		*fi_inode;
 	u32                     fi_id;      /* used with stateowner->so_id 
 					     * for stateid_hashtbl hash */
 	bool			fi_had_conflict;
 };
 
+/* XXX: for first cut may fall back on returning file that doesn't work
+ * at all? */
+static inline struct file *find_writeable_file(struct nfs4_file *f)
+{
+	if (f->fi_fds[O_WRONLY])
+		return f->fi_fds[O_WRONLY];
+	return f->fi_fds[O_RDWR];
+}
+
+static inline struct file *find_readable_file(struct nfs4_file *f)
+{
+	if (f->fi_fds[O_RDONLY])
+		return f->fi_fds[O_RDONLY];
+	return f->fi_fds[O_RDWR];
+}
+
+static inline struct file *find_any_file(struct nfs4_file *f)
+{
+	if (f->fi_fds[O_RDWR])
+		return f->fi_fds[O_RDWR];
+	else if (f->fi_fds[O_WRONLY])
+		return f->fi_fds[O_WRONLY];
+	else
+		return f->fi_fds[O_RDONLY];
+}
+
 /*
 * nfs4_stateid can either be an open stateid or (eventually) a lock stateid
 *
@@ -373,7 +423,6 @@ struct nfs4_stateid {
 	struct nfs4_stateowner      * st_stateowner;
 	struct nfs4_file            * st_file;
 	stateid_t                     st_stateid;
-	struct file                 * st_vfs_file;
 	unsigned long                 st_access_bmap;
 	unsigned long                 st_deny_bmap;
 	struct nfs4_stateid         * st_openstp;
@@ -404,12 +453,13 @@ extern int nfs4_in_grace(void);
 extern __be32 nfs4_check_open_reclaim(clientid_t *clid);
 extern void nfs4_free_stateowner(struct kref *kref);
 extern int set_callback_cred(void);
-extern void nfsd4_probe_callback(struct nfs4_client *clp, struct nfs4_cb_conn *);
+extern void nfsd4_probe_callback(struct nfs4_client *clp);
+extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *);
 extern void nfsd4_do_callback_rpc(struct work_struct *);
 extern void nfsd4_cb_recall(struct nfs4_delegation *dp);
 extern int nfsd4_create_callback_queue(void);
 extern void nfsd4_destroy_callback_queue(void);
-extern void nfsd4_set_callback_client(struct nfs4_client *, struct rpc_clnt *);
+extern void nfsd4_shutdown_callback(struct nfs4_client *);
 extern void nfs4_put_delegation(struct nfs4_delegation *dp);
 extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname);
 extern void nfsd4_init_recdir(char *recdir_name);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 3c111120b61..184938fcff0 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -281,23 +281,13 @@ commit_metadata(struct svc_fh *fhp)
 {
 	struct inode *inode = fhp->fh_dentry->d_inode;
 	const struct export_operations *export_ops = inode->i_sb->s_export_op;
-	int error = 0;
 
 	if (!EX_ISSYNC(fhp->fh_export))
 		return 0;
 
-	if (export_ops->commit_metadata) {
-		error = export_ops->commit_metadata(inode);
-	} else {
-		struct writeback_control wbc = {
-			.sync_mode = WB_SYNC_ALL,
-			.nr_to_write = 0, /* metadata only */
-		};
-
-		error = sync_inode(inode, &wbc);
-	}
-
-	return error;
+	if (export_ops->commit_metadata)
+		return export_ops->commit_metadata(inode);
+	return sync_inode_metadata(inode, 1);
 }
 
 /*
@@ -604,7 +594,7 @@ nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_ac
 	return error;
 }
 
-#endif /* defined(CONFIG_NFS_V4) */
+#endif /* defined(CONFIG_NFSD_V4) */
 
 #ifdef CONFIG_NFSD_V3
 /*
@@ -903,7 +893,6 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
               loff_t offset, struct kvec *vec, int vlen, unsigned long *count)
 {
 	struct inode *inode;
-	struct raparms	*ra;
 	mm_segment_t	oldfs;
 	__be32		err;
 	int		host_err;
@@ -914,12 +903,6 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 	if (svc_msnfs(fhp) && !lock_may_read(inode, offset, *count))
 		goto out;
 
-	/* Get readahead parameters */
-	ra = nfsd_get_raparms(inode->i_sb->s_dev, inode->i_ino);
-
-	if (ra && ra->p_set)
-		file->f_ra = ra->p_ra;
-
 	if (file->f_op->splice_read && rqstp->rq_splice_ok) {
 		struct splice_desc sd = {
 			.len		= 0,
@@ -937,21 +920,11 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 		set_fs(oldfs);
 	}
 
-	/* Write back readahead params */
-	if (ra) {
-		struct raparm_hbucket *rab = &raparm_hash[ra->p_hindex];
-		spin_lock(&rab->pb_lock);
-		ra->p_ra = file->f_ra;
-		ra->p_set = 1;
-		ra->p_count--;
-		spin_unlock(&rab->pb_lock);
-	}
-
 	if (host_err >= 0) {
 		nfsdstats.io_read += host_err;
 		*count = host_err;
 		err = 0;
-		fsnotify_access(file->f_path.dentry);
+		fsnotify_access(file);
 	} else 
 		err = nfserrno(host_err);
 out:
@@ -1062,7 +1035,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 		goto out_nfserr;
 	*cnt = host_err;
 	nfsdstats.io_write += host_err;
-	fsnotify_modify(file->f_path.dentry);
+	fsnotify_modify(file);
 
 	/* clear setuid/setgid flag after write */
 	if (inode->i_mode & (S_ISUID | S_ISGID))
@@ -1086,8 +1059,45 @@ out:
  * on entry. On return, *count contains the number of bytes actually read.
  * N.B. After this call fhp needs an fh_put
  */
+__be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
+	loff_t offset, struct kvec *vec, int vlen, unsigned long *count)
+{
+	struct file *file;
+	struct inode *inode;
+	struct raparms	*ra;
+	__be32 err;
+
+	err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file);
+	if (err)
+		return err;
+
+	inode = file->f_path.dentry->d_inode;
+
+	/* Get readahead parameters */
+	ra = nfsd_get_raparms(inode->i_sb->s_dev, inode->i_ino);
+
+	if (ra && ra->p_set)
+		file->f_ra = ra->p_ra;
+
+	err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count);
+
+	/* Write back readahead params */
+	if (ra) {
+		struct raparm_hbucket *rab = &raparm_hash[ra->p_hindex];
+		spin_lock(&rab->pb_lock);
+		ra->p_ra = file->f_ra;
+		ra->p_set = 1;
+		ra->p_count--;
+		spin_unlock(&rab->pb_lock);
+	}
+
+	nfsd_close(file);
+	return err;
+}
+
+/* As above, but use the provided file descriptor. */
 __be32
-nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
+nfsd_read_file(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 		loff_t offset, struct kvec *vec, int vlen,
 		unsigned long *count)
 {
@@ -1099,13 +1109,8 @@ nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 		if (err)
 			goto out;
 		err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count);
-	} else {
-		err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file);
-		if (err)
-			goto out;
-		err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count);
-		nfsd_close(file);
-	}
+	} else /* Note file may still be NULL in NFSv4 special stateid case: */
+		err = nfsd_read(rqstp, fhp, offset, vec, vlen, count);
 out:
 	return err;
 }
@@ -1631,7 +1636,7 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
 				char *name, int len, struct svc_fh *tfhp)
 {
 	struct dentry	*ddir, *dnew, *dold;
-	struct inode	*dirp, *dest;
+	struct inode	*dirp;
 	__be32		err;
 	int		host_err;
 
@@ -1659,7 +1664,6 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
 		goto out_nfserr;
 
 	dold = tfhp->fh_dentry;
-	dest = dold->d_inode;
 
 	host_err = mnt_want_write(tfhp->fh_export->ex_path.mnt);
 	if (host_err) {
@@ -2019,9 +2023,17 @@ out:
 __be32
 nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat, int access)
 {
-	__be32 err = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP | access);
-	if (!err && vfs_statfs(fhp->fh_dentry,stat))
-		err = nfserr_io;
+	__be32 err;
+
+	err = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP | access);
+	if (!err) {
+		struct path path = {
+			.mnt	= fhp->fh_export->ex_path.mnt,
+			.dentry	= fhp->fh_dentry,
+		};
+		if (vfs_statfs(&path, stat))
+			err = nfserr_io;
+	}
 	return err;
 }
 
@@ -2038,7 +2050,6 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
 					struct dentry *dentry, int acc)
 {
 	struct inode	*inode = dentry->d_inode;
-	struct path	path;
 	int		err;
 
 	if (acc == NFSD_MAY_NOP)
@@ -2111,15 +2122,7 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
 	if (err == -EACCES && S_ISREG(inode->i_mode) &&
 	    acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE))
 		err = inode_permission(inode, MAY_EXEC);
-	if (err)
-		goto nfsd_out;
 
-	/* Do integrity (permission) checking now, but defer incrementing
-	 * IMA counts to the actual file open.
-	 */
-	path.mnt = exp->ex_path.mnt;
-	path.dentry = dentry;
-nfsd_out:
 	return err? nfserrno(err) : 0;
 }
 
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index 217a62c2a35..9a370a5e36b 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -64,7 +64,9 @@ __be32		nfsd_commit(struct svc_rqst *, struct svc_fh *,
 __be32		nfsd_open(struct svc_rqst *, struct svc_fh *, int,
 				int, struct file **);
 void		nfsd_close(struct file *);
-__be32 		nfsd_read(struct svc_rqst *, struct svc_fh *, struct file *,
+__be32 		nfsd_read(struct svc_rqst *, struct svc_fh *,
+				loff_t, struct kvec *, int, unsigned long *);
+__be32 		nfsd_read_file(struct svc_rqst *, struct svc_fh *, struct file *,
 				loff_t, struct kvec *, int, unsigned long *);
 __be32 		nfsd_write(struct svc_rqst *, struct svc_fh *,struct file *,
 				loff_t, struct kvec *,int, unsigned long *, int *);
diff --git a/fs/nilfs2/Makefile b/fs/nilfs2/Makefile
index df3e62c1ddc..85c98737a14 100644
--- a/fs/nilfs2/Makefile
+++ b/fs/nilfs2/Makefile
@@ -2,4 +2,4 @@ obj-$(CONFIG_NILFS2_FS) += nilfs2.o
 nilfs2-y := inode.o file.o dir.o super.o namei.o page.o mdt.o \
 	btnode.o bmap.o btree.o direct.o dat.o recovery.o \
 	the_nilfs.o segbuf.o segment.o cpfile.o sufile.o \
-	ifile.o alloc.o gcinode.o ioctl.o gcdat.o
+	ifile.o alloc.o gcinode.o ioctl.o
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index effdbdbe6c1..8b782b062ba 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -26,6 +26,8 @@
 #include "nilfs.h"
 #include "bmap.h"
 #include "sb.h"
+#include "btree.h"
+#include "direct.h"
 #include "btnode.h"
 #include "mdt.h"
 #include "dat.h"
@@ -531,18 +533,20 @@ void nilfs_bmap_init_gc(struct nilfs_bmap *bmap)
 	nilfs_btree_init_gc(bmap);
 }
 
-void nilfs_bmap_init_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap)
+void nilfs_bmap_save(const struct nilfs_bmap *bmap,
+		     struct nilfs_bmap_store *store)
 {
-	memcpy(gcbmap, bmap, sizeof(union nilfs_bmap_union));
-	init_rwsem(&gcbmap->b_sem);
-	lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key);
-	gcbmap->b_inode = &NILFS_BMAP_I(gcbmap)->vfs_inode;
+	memcpy(store->data, bmap->b_u.u_data, sizeof(store->data));
+	store->last_allocated_key = bmap->b_last_allocated_key;
+	store->last_allocated_ptr = bmap->b_last_allocated_ptr;
+	store->state = bmap->b_state;
 }
 
-void nilfs_bmap_commit_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap)
+void nilfs_bmap_restore(struct nilfs_bmap *bmap,
+			const struct nilfs_bmap_store *store)
 {
-	memcpy(bmap, gcbmap, sizeof(union nilfs_bmap_union));
-	init_rwsem(&bmap->b_sem);
-	lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key);
-	bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode;
+	memcpy(bmap->b_u.u_data, store->data, sizeof(store->data));
+	bmap->b_last_allocated_key = store->last_allocated_key;
+	bmap->b_last_allocated_ptr = store->last_allocated_ptr;
+	bmap->b_state = store->state;
 }
diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h
index 9980d7dbab9..bde1c0aa2e1 100644
--- a/fs/nilfs2/bmap.h
+++ b/fs/nilfs2/bmap.h
@@ -32,11 +32,6 @@
 
 #define NILFS_BMAP_INVALID_PTR	0
 
-#define nilfs_bmap_dkey_to_key(dkey)	le64_to_cpu(dkey)
-#define nilfs_bmap_key_to_dkey(key)	cpu_to_le64(key)
-#define nilfs_bmap_dptr_to_ptr(dptr)	le64_to_cpu(dptr)
-#define nilfs_bmap_ptr_to_dptr(ptr)	cpu_to_le64(ptr)
-
 #define nilfs_bmap_keydiff_abs(diff)	((diff) < 0 ? -(diff) : (diff))
 
 
@@ -71,7 +66,7 @@ struct nilfs_bmap_operations {
 	int (*bop_delete)(struct nilfs_bmap *, __u64);
 	void (*bop_clear)(struct nilfs_bmap *);
 
-	int (*bop_propagate)(const struct nilfs_bmap *, struct buffer_head *);
+	int (*bop_propagate)(struct nilfs_bmap *, struct buffer_head *);
 	void (*bop_lookup_dirty_buffers)(struct nilfs_bmap *,
 					 struct list_head *);
 
@@ -110,6 +105,7 @@ static inline int nilfs_bmap_is_new_ptr(unsigned long ptr)
  * @b_last_allocated_ptr: last allocated ptr for data block
  * @b_ptr_type: pointer type
  * @b_state: state
+ * @b_nchildren_per_block: maximum number of child nodes for non-root nodes
  */
 struct nilfs_bmap {
 	union {
@@ -123,6 +119,7 @@ struct nilfs_bmap {
 	__u64 b_last_allocated_ptr;
 	int b_ptr_type;
 	int b_state;
+	__u16 b_nchildren_per_block;
 };
 
 /* pointer type */
@@ -138,6 +135,12 @@ struct nilfs_bmap {
 /* state */
 #define NILFS_BMAP_DIRTY	0x00000001
 
+struct nilfs_bmap_store {
+	__le64 data[NILFS_BMAP_SIZE / sizeof(__le64)];
+	__u64 last_allocated_key;
+	__u64 last_allocated_ptr;
+	int state;
+};
 
 int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *);
 int nilfs_bmap_read(struct nilfs_bmap *, struct nilfs_inode *);
@@ -156,9 +159,9 @@ int nilfs_bmap_lookup_at_level(struct nilfs_bmap *, __u64, int, __u64 *);
 int nilfs_bmap_mark(struct nilfs_bmap *, __u64, int);
 
 void nilfs_bmap_init_gc(struct nilfs_bmap *);
-void nilfs_bmap_init_gcdat(struct nilfs_bmap *, struct nilfs_bmap *);
-void nilfs_bmap_commit_gcdat(struct nilfs_bmap *, struct nilfs_bmap *);
 
+void nilfs_bmap_save(const struct nilfs_bmap *, struct nilfs_bmap_store *);
+void nilfs_bmap_restore(struct nilfs_bmap *, const struct nilfs_bmap_store *);
 
 static inline int nilfs_bmap_lookup(struct nilfs_bmap *bmap, __u64 key,
 				    __u64 *ptr)
@@ -224,6 +227,13 @@ static inline void nilfs_bmap_abort_end_ptr(struct nilfs_bmap *bmap,
 		nilfs_dat_abort_end(dat, &req->bpr_req);
 }
 
+static inline void nilfs_bmap_set_target_v(struct nilfs_bmap *bmap, __u64 key,
+					   __u64 ptr)
+{
+	bmap->b_last_allocated_key = key;
+	bmap->b_last_allocated_ptr = ptr;
+}
+
 __u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *,
 			      const struct buffer_head *);
 
diff --git a/fs/nilfs2/bmap_union.h b/fs/nilfs2/bmap_union.h
deleted file mode 100644
index d41509bff47..00000000000
--- a/fs/nilfs2/bmap_union.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * bmap_union.h - NILFS block mapping.
- *
- * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- *
- * Written by Koji Sato <koji@osrg.net>.
- */
-
-#ifndef _NILFS_BMAP_UNION_H
-#define _NILFS_BMAP_UNION_H
-
-#include "bmap.h"
-#include "direct.h"
-#include "btree.h"
-
-/**
- * nilfs_bmap_union -
- * @bi_bmap: bmap structure
- * @bi_btree: direct map structure
- * @bi_direct: B-tree structure
- */
-union nilfs_bmap_union {
-	struct nilfs_bmap bi_bmap;
-	struct nilfs_direct bi_direct;
-	struct nilfs_btree bi_btree;
-};
-
-#endif	/* _NILFS_BMAP_UNION_H */
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 447ce47a330..5115814cb74 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -37,15 +37,7 @@
 
 void nilfs_btnode_cache_init_once(struct address_space *btnc)
 {
-	memset(btnc, 0, sizeof(*btnc));
-	INIT_RADIX_TREE(&btnc->page_tree, GFP_ATOMIC);
-	spin_lock_init(&btnc->tree_lock);
-	INIT_LIST_HEAD(&btnc->private_list);
-	spin_lock_init(&btnc->private_lock);
-
-	spin_lock_init(&btnc->i_mmap_lock);
-	INIT_RAW_PRIO_TREE_ROOT(&btnc->i_mmap);
-	INIT_LIST_HEAD(&btnc->i_mmap_nonlinear);
+	nilfs_mapping_init_once(btnc);
 }
 
 static const struct address_space_operations def_btnode_aops = {
@@ -55,12 +47,7 @@ static const struct address_space_operations def_btnode_aops = {
 void nilfs_btnode_cache_init(struct address_space *btnc,
 			     struct backing_dev_info *bdi)
 {
-	btnc->host = NULL;  /* can safely set to host inode ? */
-	btnc->flags = 0;
-	mapping_set_gfp_mask(btnc, GFP_NOFS);
-	btnc->assoc_mapping = NULL;
-	btnc->backing_dev_info = bdi;
-	btnc->a_ops = &def_btnode_aops;
+	nilfs_mapping_init(btnc, bdi, &def_btnode_aops);
 }
 
 void nilfs_btnode_cache_clear(struct address_space *btnc)
@@ -96,10 +83,12 @@ nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr)
 }
 
 int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
-			      sector_t pblocknr, struct buffer_head **pbh)
+			      sector_t pblocknr, int mode,
+			      struct buffer_head **pbh, sector_t *submit_ptr)
 {
 	struct buffer_head *bh;
 	struct inode *inode = NILFS_BTNC_I(btnc);
+	struct page *page;
 	int err;
 
 	bh = nilfs_grab_buffer(inode, btnc, blocknr, 1 << BH_NILFS_Node);
@@ -107,6 +96,7 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
 		return -ENOMEM;
 
 	err = -EEXIST; /* internal code */
+	page = bh->b_page;
 
 	if (buffer_uptodate(bh) || buffer_dirty(bh))
 		goto found;
@@ -125,7 +115,16 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
 			}
 		}
 	}
-	lock_buffer(bh);
+
+	if (mode == READA) {
+		if (pblocknr != *submit_ptr + 1 || !trylock_buffer(bh)) {
+			err = -EBUSY; /* internal code */
+			brelse(bh);
+			goto out_locked;
+		}
+	} else { /* mode == READ */
+		lock_buffer(bh);
+	}
 	if (buffer_uptodate(bh)) {
 		unlock_buffer(bh);
 		err = -EEXIST; /* internal code */
@@ -136,15 +135,16 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
 	bh->b_blocknr = pblocknr; /* set block address for read */
 	bh->b_end_io = end_buffer_read_sync;
 	get_bh(bh);
-	submit_bh(READ, bh);
+	submit_bh(mode, bh);
 	bh->b_blocknr = blocknr; /* set back to the given block address */
+	*submit_ptr = pblocknr;
 	err = 0;
 found:
 	*pbh = bh;
 
 out_locked:
-	unlock_page(bh->b_page);
-	page_cache_release(bh->b_page);
+	unlock_page(page);
+	page_cache_release(page);
 	return err;
 }
 
diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h
index 07da83f0771..79037494f1e 100644
--- a/fs/nilfs2/btnode.h
+++ b/fs/nilfs2/btnode.h
@@ -42,8 +42,8 @@ void nilfs_btnode_cache_init(struct address_space *, struct backing_dev_info *);
 void nilfs_btnode_cache_clear(struct address_space *);
 struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc,
 					      __u64 blocknr);
-int nilfs_btnode_submit_block(struct address_space *, __u64, sector_t,
-			      struct buffer_head **);
+int nilfs_btnode_submit_block(struct address_space *, __u64, sector_t, int,
+			      struct buffer_head **, sector_t *);
 void nilfs_btnode_delete(struct buffer_head *);
 int nilfs_btnode_prepare_change_key(struct address_space *,
 				    struct nilfs_btnode_chkey_ctxt *);
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index b27a342c5af..300c2bc00c3 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -66,30 +66,10 @@ static void nilfs_btree_free_path(struct nilfs_btree_path *path)
 /*
  * B-tree node operations
  */
-static int nilfs_btree_get_block(const struct nilfs_btree *btree, __u64 ptr,
-				 struct buffer_head **bhp)
-{
-	struct address_space *btnc =
-		&NILFS_BMAP_I((struct nilfs_bmap *)btree)->i_btnode_cache;
-	int err;
-
-	err = nilfs_btnode_submit_block(btnc, ptr, 0, bhp);
-	if (err)
-		return err == -EEXIST ? 0 : err;
-
-	wait_on_buffer(*bhp);
-	if (!buffer_uptodate(*bhp)) {
-		brelse(*bhp);
-		return -EIO;
-	}
-	return 0;
-}
-
-static int nilfs_btree_get_new_block(const struct nilfs_btree *btree,
+static int nilfs_btree_get_new_block(const struct nilfs_bmap *btree,
 				     __u64 ptr, struct buffer_head **bhp)
 {
-	struct address_space *btnc =
-		&NILFS_BMAP_I((struct nilfs_bmap *)btree)->i_btnode_cache;
+	struct address_space *btnc = &NILFS_BMAP_I(btree)->i_btnode_cache;
 	struct buffer_head *bh;
 
 	bh = nilfs_btnode_create_block(btnc, ptr);
@@ -101,71 +81,55 @@ static int nilfs_btree_get_new_block(const struct nilfs_btree *btree,
 	return 0;
 }
 
-static inline int
-nilfs_btree_node_get_flags(const struct nilfs_btree_node *node)
+static int nilfs_btree_node_get_flags(const struct nilfs_btree_node *node)
 {
 	return node->bn_flags;
 }
 
-static inline void
+static void
 nilfs_btree_node_set_flags(struct nilfs_btree_node *node, int flags)
 {
 	node->bn_flags = flags;
 }
 
-static inline int nilfs_btree_node_root(const struct nilfs_btree_node *node)
+static int nilfs_btree_node_root(const struct nilfs_btree_node *node)
 {
 	return nilfs_btree_node_get_flags(node) & NILFS_BTREE_NODE_ROOT;
 }
 
-static inline int
-nilfs_btree_node_get_level(const struct nilfs_btree_node *node)
+static int nilfs_btree_node_get_level(const struct nilfs_btree_node *node)
 {
 	return node->bn_level;
 }
 
-static inline void
+static void
 nilfs_btree_node_set_level(struct nilfs_btree_node *node, int level)
 {
 	node->bn_level = level;
 }
 
-static inline int
-nilfs_btree_node_get_nchildren(const struct nilfs_btree_node *node)
+static int nilfs_btree_node_get_nchildren(const struct nilfs_btree_node *node)
 {
 	return le16_to_cpu(node->bn_nchildren);
 }
 
-static inline void
+static void
 nilfs_btree_node_set_nchildren(struct nilfs_btree_node *node, int nchildren)
 {
 	node->bn_nchildren = cpu_to_le16(nchildren);
 }
 
-static inline int nilfs_btree_node_size(const struct nilfs_btree *btree)
+static int nilfs_btree_node_size(const struct nilfs_bmap *btree)
 {
-	return 1 << btree->bt_bmap.b_inode->i_blkbits;
+	return 1 << btree->b_inode->i_blkbits;
 }
 
-static inline int
-nilfs_btree_node_nchildren_min(const struct nilfs_btree_node *node,
-			       const struct nilfs_btree *btree)
+static int nilfs_btree_nchildren_per_block(const struct nilfs_bmap *btree)
 {
-	return nilfs_btree_node_root(node) ?
-		NILFS_BTREE_ROOT_NCHILDREN_MIN :
-		NILFS_BTREE_NODE_NCHILDREN_MIN(nilfs_btree_node_size(btree));
+	return btree->b_nchildren_per_block;
 }
 
-static inline int
-nilfs_btree_node_nchildren_max(const struct nilfs_btree_node *node,
-			       const struct nilfs_btree *btree)
-{
-	return nilfs_btree_node_root(node) ?
-		NILFS_BTREE_ROOT_NCHILDREN_MAX :
-		NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(btree));
-}
-
-static inline __le64 *
+static __le64 *
 nilfs_btree_node_dkeys(const struct nilfs_btree_node *node)
 {
 	return (__le64 *)((char *)(node + 1) +
@@ -173,45 +137,40 @@ nilfs_btree_node_dkeys(const struct nilfs_btree_node *node)
 			   0 : NILFS_BTREE_NODE_EXTRA_PAD_SIZE));
 }
 
-static inline __le64 *
-nilfs_btree_node_dptrs(const struct nilfs_btree_node *node,
-		       const struct nilfs_btree *btree)
+static __le64 *
+nilfs_btree_node_dptrs(const struct nilfs_btree_node *node, int ncmax)
 {
-	return (__le64 *)(nilfs_btree_node_dkeys(node) +
-			  nilfs_btree_node_nchildren_max(node, btree));
+	return (__le64 *)(nilfs_btree_node_dkeys(node) + ncmax);
 }
 
-static inline __u64
+static __u64
 nilfs_btree_node_get_key(const struct nilfs_btree_node *node, int index)
 {
-	return nilfs_bmap_dkey_to_key(*(nilfs_btree_node_dkeys(node) + index));
+	return le64_to_cpu(*(nilfs_btree_node_dkeys(node) + index));
 }
 
-static inline void
+static void
 nilfs_btree_node_set_key(struct nilfs_btree_node *node, int index, __u64 key)
 {
-	*(nilfs_btree_node_dkeys(node) + index) = nilfs_bmap_key_to_dkey(key);
+	*(nilfs_btree_node_dkeys(node) + index) = cpu_to_le64(key);
 }
 
-static inline __u64
-nilfs_btree_node_get_ptr(const struct nilfs_btree *btree,
-			 const struct nilfs_btree_node *node, int index)
+static __u64
+nilfs_btree_node_get_ptr(const struct nilfs_btree_node *node, int index,
+			 int ncmax)
 {
-	return nilfs_bmap_dptr_to_ptr(*(nilfs_btree_node_dptrs(node, btree) +
-					index));
+	return le64_to_cpu(*(nilfs_btree_node_dptrs(node, ncmax) + index));
 }
 
-static inline void
-nilfs_btree_node_set_ptr(struct nilfs_btree *btree,
-			 struct nilfs_btree_node *node, int index, __u64 ptr)
+static void
+nilfs_btree_node_set_ptr(struct nilfs_btree_node *node, int index, __u64 ptr,
+			 int ncmax)
 {
-	*(nilfs_btree_node_dptrs(node, btree) + index) =
-		nilfs_bmap_ptr_to_dptr(ptr);
+	*(nilfs_btree_node_dptrs(node, ncmax) + index) = cpu_to_le64(ptr);
 }
 
-static void nilfs_btree_node_init(struct nilfs_btree *btree,
-				  struct nilfs_btree_node *node,
-				  int flags, int level, int nchildren,
+static void nilfs_btree_node_init(struct nilfs_btree_node *node, int flags,
+				  int level, int nchildren, int ncmax,
 				  const __u64 *keys, const __u64 *ptrs)
 {
 	__le64 *dkeys;
@@ -223,29 +182,28 @@ static void nilfs_btree_node_init(struct nilfs_btree *btree,
 	nilfs_btree_node_set_nchildren(node, nchildren);
 
 	dkeys = nilfs_btree_node_dkeys(node);
-	dptrs = nilfs_btree_node_dptrs(node, btree);
+	dptrs = nilfs_btree_node_dptrs(node, ncmax);
 	for (i = 0; i < nchildren; i++) {
-		dkeys[i] = nilfs_bmap_key_to_dkey(keys[i]);
-		dptrs[i] = nilfs_bmap_ptr_to_dptr(ptrs[i]);
+		dkeys[i] = cpu_to_le64(keys[i]);
+		dptrs[i] = cpu_to_le64(ptrs[i]);
 	}
 }
 
 /* Assume the buffer heads corresponding to left and right are locked. */
-static void nilfs_btree_node_move_left(struct nilfs_btree *btree,
-				       struct nilfs_btree_node *left,
+static void nilfs_btree_node_move_left(struct nilfs_btree_node *left,
 				       struct nilfs_btree_node *right,
-				       int n)
+				       int n, int lncmax, int rncmax)
 {
 	__le64 *ldkeys, *rdkeys;
 	__le64 *ldptrs, *rdptrs;
 	int lnchildren, rnchildren;
 
 	ldkeys = nilfs_btree_node_dkeys(left);
-	ldptrs = nilfs_btree_node_dptrs(left, btree);
+	ldptrs = nilfs_btree_node_dptrs(left, lncmax);
 	lnchildren = nilfs_btree_node_get_nchildren(left);
 
 	rdkeys = nilfs_btree_node_dkeys(right);
-	rdptrs = nilfs_btree_node_dptrs(right, btree);
+	rdptrs = nilfs_btree_node_dptrs(right, rncmax);
 	rnchildren = nilfs_btree_node_get_nchildren(right);
 
 	memcpy(ldkeys + lnchildren, rdkeys, n * sizeof(*rdkeys));
@@ -260,21 +218,20 @@ static void nilfs_btree_node_move_left(struct nilfs_btree *btree,
 }
 
 /* Assume that the buffer heads corresponding to left and right are locked. */
-static void nilfs_btree_node_move_right(struct nilfs_btree *btree,
-					struct nilfs_btree_node *left,
+static void nilfs_btree_node_move_right(struct nilfs_btree_node *left,
 					struct nilfs_btree_node *right,
-					int n)
+					int n, int lncmax, int rncmax)
 {
 	__le64 *ldkeys, *rdkeys;
 	__le64 *ldptrs, *rdptrs;
 	int lnchildren, rnchildren;
 
 	ldkeys = nilfs_btree_node_dkeys(left);
-	ldptrs = nilfs_btree_node_dptrs(left, btree);
+	ldptrs = nilfs_btree_node_dptrs(left, lncmax);
 	lnchildren = nilfs_btree_node_get_nchildren(left);
 
 	rdkeys = nilfs_btree_node_dkeys(right);
-	rdptrs = nilfs_btree_node_dptrs(right, btree);
+	rdptrs = nilfs_btree_node_dptrs(right, rncmax);
 	rnchildren = nilfs_btree_node_get_nchildren(right);
 
 	memmove(rdkeys + n, rdkeys, rnchildren * sizeof(*rdkeys));
@@ -289,16 +246,15 @@ static void nilfs_btree_node_move_right(struct nilfs_btree *btree,
 }
 
 /* Assume that the buffer head corresponding to node is locked. */
-static void nilfs_btree_node_insert(struct nilfs_btree *btree,
-				    struct nilfs_btree_node *node,
-				    __u64 key, __u64 ptr, int index)
+static void nilfs_btree_node_insert(struct nilfs_btree_node *node, int index,
+				    __u64 key, __u64 ptr, int ncmax)
 {
 	__le64 *dkeys;
 	__le64 *dptrs;
 	int nchildren;
 
 	dkeys = nilfs_btree_node_dkeys(node);
-	dptrs = nilfs_btree_node_dptrs(node, btree);
+	dptrs = nilfs_btree_node_dptrs(node, ncmax);
 	nchildren = nilfs_btree_node_get_nchildren(node);
 	if (index < nchildren) {
 		memmove(dkeys + index + 1, dkeys + index,
@@ -306,16 +262,15 @@ static void nilfs_btree_node_insert(struct nilfs_btree *btree,
 		memmove(dptrs + index + 1, dptrs + index,
 			(nchildren - index) * sizeof(*dptrs));
 	}
-	dkeys[index] = nilfs_bmap_key_to_dkey(key);
-	dptrs[index] = nilfs_bmap_ptr_to_dptr(ptr);
+	dkeys[index] = cpu_to_le64(key);
+	dptrs[index] = cpu_to_le64(ptr);
 	nchildren++;
 	nilfs_btree_node_set_nchildren(node, nchildren);
 }
 
 /* Assume that the buffer head corresponding to node is locked. */
-static void nilfs_btree_node_delete(struct nilfs_btree *btree,
-				    struct nilfs_btree_node *node,
-				    __u64 *keyp, __u64 *ptrp, int index)
+static void nilfs_btree_node_delete(struct nilfs_btree_node *node, int index,
+				    __u64 *keyp, __u64 *ptrp, int ncmax)
 {
 	__u64 key;
 	__u64 ptr;
@@ -324,9 +279,9 @@ static void nilfs_btree_node_delete(struct nilfs_btree *btree,
 	int nchildren;
 
 	dkeys = nilfs_btree_node_dkeys(node);
-	dptrs = nilfs_btree_node_dptrs(node, btree);
-	key = nilfs_bmap_dkey_to_key(dkeys[index]);
-	ptr = nilfs_bmap_dptr_to_ptr(dptrs[index]);
+	dptrs = nilfs_btree_node_dptrs(node, ncmax);
+	key = le64_to_cpu(dkeys[index]);
+	ptr = le64_to_cpu(dptrs[index]);
 	nchildren = nilfs_btree_node_get_nchildren(node);
 	if (keyp != NULL)
 		*keyp = key;
@@ -382,40 +337,92 @@ static int nilfs_btree_node_lookup(const struct nilfs_btree_node *node,
 	return s == 0;
 }
 
-static inline struct nilfs_btree_node *
-nilfs_btree_get_root(const struct nilfs_btree *btree)
+/**
+ * nilfs_btree_node_broken - verify consistency of btree node
+ * @node: btree node block to be examined
+ * @size: node size (in bytes)
+ * @blocknr: block number
+ *
+ * Return Value: If node is broken, 1 is returned. Otherwise, 0 is returned.
+ */
+static int nilfs_btree_node_broken(const struct nilfs_btree_node *node,
+				   size_t size, sector_t blocknr)
 {
-	return (struct nilfs_btree_node *)btree->bt_bmap.b_u.u_data;
+	int level, flags, nchildren;
+	int ret = 0;
+
+	level = nilfs_btree_node_get_level(node);
+	flags = nilfs_btree_node_get_flags(node);
+	nchildren = nilfs_btree_node_get_nchildren(node);
+
+	if (unlikely(level < NILFS_BTREE_LEVEL_NODE_MIN ||
+		     level >= NILFS_BTREE_LEVEL_MAX ||
+		     (flags & NILFS_BTREE_NODE_ROOT) ||
+		     nchildren < 0 ||
+		     nchildren > NILFS_BTREE_NODE_NCHILDREN_MAX(size))) {
+		printk(KERN_CRIT "NILFS: bad btree node (blocknr=%llu): "
+		       "level = %d, flags = 0x%x, nchildren = %d\n",
+		       (unsigned long long)blocknr, level, flags, nchildren);
+		ret = 1;
+	}
+	return ret;
 }
 
-static inline struct nilfs_btree_node *
+int nilfs_btree_broken_node_block(struct buffer_head *bh)
+{
+	int ret;
+
+	if (buffer_nilfs_checked(bh))
+		return 0;
+
+	ret = nilfs_btree_node_broken((struct nilfs_btree_node *)bh->b_data,
+				       bh->b_size, bh->b_blocknr);
+	if (likely(!ret))
+		set_buffer_nilfs_checked(bh);
+	return ret;
+}
+
+static struct nilfs_btree_node *
+nilfs_btree_get_root(const struct nilfs_bmap *btree)
+{
+	return (struct nilfs_btree_node *)btree->b_u.u_data;
+}
+
+static struct nilfs_btree_node *
 nilfs_btree_get_nonroot_node(const struct nilfs_btree_path *path, int level)
 {
 	return (struct nilfs_btree_node *)path[level].bp_bh->b_data;
 }
 
-static inline struct nilfs_btree_node *
+static struct nilfs_btree_node *
 nilfs_btree_get_sib_node(const struct nilfs_btree_path *path, int level)
 {
 	return (struct nilfs_btree_node *)path[level].bp_sib_bh->b_data;
 }
 
-static inline int nilfs_btree_height(const struct nilfs_btree *btree)
+static int nilfs_btree_height(const struct nilfs_bmap *btree)
 {
 	return nilfs_btree_node_get_level(nilfs_btree_get_root(btree)) + 1;
 }
 
-static inline struct nilfs_btree_node *
-nilfs_btree_get_node(const struct nilfs_btree *btree,
+static struct nilfs_btree_node *
+nilfs_btree_get_node(const struct nilfs_bmap *btree,
 		     const struct nilfs_btree_path *path,
-		     int level)
+		     int level, int *ncmaxp)
 {
-	return (level == nilfs_btree_height(btree) - 1) ?
-		nilfs_btree_get_root(btree) :
-		nilfs_btree_get_nonroot_node(path, level);
+	struct nilfs_btree_node *node;
+
+	if (level == nilfs_btree_height(btree) - 1) {
+		node = nilfs_btree_get_root(btree);
+		*ncmaxp = NILFS_BTREE_ROOT_NCHILDREN_MAX;
+	} else {
+		node = nilfs_btree_get_nonroot_node(path, level);
+		*ncmaxp = nilfs_btree_nchildren_per_block(btree);
+	}
+	return node;
 }
 
-static inline int
+static int
 nilfs_btree_bad_node(struct nilfs_btree_node *node, int level)
 {
 	if (unlikely(nilfs_btree_node_get_level(node) != level)) {
@@ -427,13 +434,83 @@ nilfs_btree_bad_node(struct nilfs_btree_node *node, int level)
 	return 0;
 }
 
-static int nilfs_btree_do_lookup(const struct nilfs_btree *btree,
+struct nilfs_btree_readahead_info {
+	struct nilfs_btree_node *node;	/* parent node */
+	int max_ra_blocks;		/* max nof blocks to read ahead */
+	int index;			/* current index on the parent node */
+	int ncmax;			/* nof children in the parent node */
+};
+
+static int __nilfs_btree_get_block(const struct nilfs_bmap *btree, __u64 ptr,
+				   struct buffer_head **bhp,
+				   const struct nilfs_btree_readahead_info *ra)
+{
+	struct address_space *btnc = &NILFS_BMAP_I(btree)->i_btnode_cache;
+	struct buffer_head *bh, *ra_bh;
+	sector_t submit_ptr = 0;
+	int ret;
+
+	ret = nilfs_btnode_submit_block(btnc, ptr, 0, READ, &bh, &submit_ptr);
+	if (ret) {
+		if (ret != -EEXIST)
+			return ret;
+		goto out_check;
+	}
+
+	if (ra) {
+		int i, n;
+		__u64 ptr2;
+
+		/* read ahead sibling nodes */
+		for (n = ra->max_ra_blocks, i = ra->index + 1;
+		     n > 0 && i < ra->ncmax; n--, i++) {
+			ptr2 = nilfs_btree_node_get_ptr(ra->node, i, ra->ncmax);
+
+			ret = nilfs_btnode_submit_block(btnc, ptr2, 0, READA,
+							&ra_bh, &submit_ptr);
+			if (likely(!ret || ret == -EEXIST))
+				brelse(ra_bh);
+			else if (ret != -EBUSY)
+				break;
+			if (!buffer_locked(bh))
+				goto out_no_wait;
+		}
+	}
+
+	wait_on_buffer(bh);
+
+ out_no_wait:
+	if (!buffer_uptodate(bh)) {
+		brelse(bh);
+		return -EIO;
+	}
+
+ out_check:
+	if (nilfs_btree_broken_node_block(bh)) {
+		clear_buffer_uptodate(bh);
+		brelse(bh);
+		return -EINVAL;
+	}
+
+	*bhp = bh;
+	return 0;
+}
+
+static int nilfs_btree_get_block(const struct nilfs_bmap *btree, __u64 ptr,
+				   struct buffer_head **bhp)
+{
+	return __nilfs_btree_get_block(btree, ptr, bhp, NULL);
+}
+
+static int nilfs_btree_do_lookup(const struct nilfs_bmap *btree,
 				 struct nilfs_btree_path *path,
-				 __u64 key, __u64 *ptrp, int minlevel)
+				 __u64 key, __u64 *ptrp, int minlevel,
+				 int readahead)
 {
 	struct nilfs_btree_node *node;
+	struct nilfs_btree_readahead_info p, *ra;
 	__u64 ptr;
-	int level, index, found, ret;
+	int level, index, found, ncmax, ret;
 
 	node = nilfs_btree_get_root(btree);
 	level = nilfs_btree_node_get_level(node);
@@ -441,14 +518,27 @@ static int nilfs_btree_do_lookup(const struct nilfs_btree *btree,
 		return -ENOENT;
 
 	found = nilfs_btree_node_lookup(node, key, &index);
-	ptr = nilfs_btree_node_get_ptr(btree, node, index);
+	ptr = nilfs_btree_node_get_ptr(node, index,
+				       NILFS_BTREE_ROOT_NCHILDREN_MAX);
 	path[level].bp_bh = NULL;
 	path[level].bp_index = index;
 
-	for (level--; level >= minlevel; level--) {
-		ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh);
+	ncmax = nilfs_btree_nchildren_per_block(btree);
+
+	while (--level >= minlevel) {
+		ra = NULL;
+		if (level == NILFS_BTREE_LEVEL_NODE_MIN && readahead) {
+			p.node = nilfs_btree_get_node(btree, path, level + 1,
+						      &p.ncmax);
+			p.index = index;
+			p.max_ra_blocks = 7;
+			ra = &p;
+		}
+		ret = __nilfs_btree_get_block(btree, ptr, &path[level].bp_bh,
+					      ra);
 		if (ret < 0)
 			return ret;
+
 		node = nilfs_btree_get_nonroot_node(path, level);
 		if (nilfs_btree_bad_node(node, level))
 			return -EINVAL;
@@ -456,9 +546,9 @@ static int nilfs_btree_do_lookup(const struct nilfs_btree *btree,
 			found = nilfs_btree_node_lookup(node, key, &index);
 		else
 			index = 0;
-		if (index < nilfs_btree_node_nchildren_max(node, btree))
-			ptr = nilfs_btree_node_get_ptr(btree, node, index);
-		else {
+		if (index < ncmax) {
+			ptr = nilfs_btree_node_get_ptr(node, index, ncmax);
+		} else {
 			WARN_ON(found || level != NILFS_BTREE_LEVEL_NODE_MIN);
 			/* insert */
 			ptr = NILFS_BMAP_INVALID_PTR;
@@ -474,22 +564,24 @@ static int nilfs_btree_do_lookup(const struct nilfs_btree *btree,
 	return 0;
 }
 
-static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree,
+static int nilfs_btree_do_lookup_last(const struct nilfs_bmap *btree,
 				      struct nilfs_btree_path *path,
 				      __u64 *keyp, __u64 *ptrp)
 {
 	struct nilfs_btree_node *node;
 	__u64 ptr;
-	int index, level, ret;
+	int index, level, ncmax, ret;
 
 	node = nilfs_btree_get_root(btree);
 	index = nilfs_btree_node_get_nchildren(node) - 1;
 	if (index < 0)
 		return -ENOENT;
 	level = nilfs_btree_node_get_level(node);
-	ptr = nilfs_btree_node_get_ptr(btree, node, index);
+	ptr = nilfs_btree_node_get_ptr(node, index,
+				       NILFS_BTREE_ROOT_NCHILDREN_MAX);
 	path[level].bp_bh = NULL;
 	path[level].bp_index = index;
+	ncmax = nilfs_btree_nchildren_per_block(btree);
 
 	for (level--; level > 0; level--) {
 		ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh);
@@ -499,7 +591,7 @@ static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree,
 		if (nilfs_btree_bad_node(node, level))
 			return -EINVAL;
 		index = nilfs_btree_node_get_nchildren(node) - 1;
-		ptr = nilfs_btree_node_get_ptr(btree, node, index);
+		ptr = nilfs_btree_node_get_ptr(node, index, ncmax);
 		path[level].bp_index = index;
 	}
 
@@ -511,51 +603,45 @@ static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree,
 	return 0;
 }
 
-static int nilfs_btree_lookup(const struct nilfs_bmap *bmap,
+static int nilfs_btree_lookup(const struct nilfs_bmap *btree,
 			      __u64 key, int level, __u64 *ptrp)
 {
-	struct nilfs_btree *btree;
 	struct nilfs_btree_path *path;
-	__u64 ptr;
 	int ret;
 
-	btree = (struct nilfs_btree *)bmap;
 	path = nilfs_btree_alloc_path();
 	if (path == NULL)
 		return -ENOMEM;
 
-	ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level);
-
-	if (ptrp != NULL)
-		*ptrp = ptr;
+	ret = nilfs_btree_do_lookup(btree, path, key, ptrp, level, 0);
 
 	nilfs_btree_free_path(path);
 
 	return ret;
 }
 
-static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
+static int nilfs_btree_lookup_contig(const struct nilfs_bmap *btree,
 				     __u64 key, __u64 *ptrp, unsigned maxblocks)
 {
-	struct nilfs_btree *btree = (struct nilfs_btree *)bmap;
 	struct nilfs_btree_path *path;
 	struct nilfs_btree_node *node;
 	struct inode *dat = NULL;
 	__u64 ptr, ptr2;
 	sector_t blocknr;
 	int level = NILFS_BTREE_LEVEL_NODE_MIN;
-	int ret, cnt, index, maxlevel;
+	int ret, cnt, index, maxlevel, ncmax;
+	struct nilfs_btree_readahead_info p;
 
 	path = nilfs_btree_alloc_path();
 	if (path == NULL)
 		return -ENOMEM;
 
-	ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level);
+	ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level, 1);
 	if (ret < 0)
 		goto out;
 
-	if (NILFS_BMAP_USE_VBN(bmap)) {
-		dat = nilfs_bmap_get_dat(bmap);
+	if (NILFS_BMAP_USE_VBN(btree)) {
+		dat = nilfs_bmap_get_dat(btree);
 		ret = nilfs_dat_translate(dat, ptr, &blocknr);
 		if (ret < 0)
 			goto out;
@@ -566,14 +652,14 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
 		goto end;
 
 	maxlevel = nilfs_btree_height(btree) - 1;
-	node = nilfs_btree_get_node(btree, path, level);
+	node = nilfs_btree_get_node(btree, path, level, &ncmax);
 	index = path[level].bp_index + 1;
 	for (;;) {
 		while (index < nilfs_btree_node_get_nchildren(node)) {
 			if (nilfs_btree_node_get_key(node, index) !=
 			    key + cnt)
 				goto end;
-			ptr2 = nilfs_btree_node_get_ptr(btree, node, index);
+			ptr2 = nilfs_btree_node_get_ptr(node, index, ncmax);
 			if (dat) {
 				ret = nilfs_dat_translate(dat, ptr2, &blocknr);
 				if (ret < 0)
@@ -589,20 +675,24 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
 			break;
 
 		/* look-up right sibling node */
-		node = nilfs_btree_get_node(btree, path, level + 1);
-		index = path[level + 1].bp_index + 1;
-		if (index >= nilfs_btree_node_get_nchildren(node) ||
-		    nilfs_btree_node_get_key(node, index) != key + cnt)
+		p.node = nilfs_btree_get_node(btree, path, level + 1, &p.ncmax);
+		p.index = path[level + 1].bp_index + 1;
+		p.max_ra_blocks = 7;
+		if (p.index >= nilfs_btree_node_get_nchildren(p.node) ||
+		    nilfs_btree_node_get_key(p.node, p.index) != key + cnt)
 			break;
-		ptr2 = nilfs_btree_node_get_ptr(btree, node, index);
-		path[level + 1].bp_index = index;
+		ptr2 = nilfs_btree_node_get_ptr(p.node, p.index, p.ncmax);
+		path[level + 1].bp_index = p.index;
 
 		brelse(path[level].bp_bh);
 		path[level].bp_bh = NULL;
-		ret = nilfs_btree_get_block(btree, ptr2, &path[level].bp_bh);
+
+		ret = __nilfs_btree_get_block(btree, ptr2, &path[level].bp_bh,
+					      &p);
 		if (ret < 0)
 			goto out;
 		node = nilfs_btree_get_nonroot_node(path, level);
+		ncmax = nilfs_btree_nchildren_per_block(btree);
 		index = 0;
 		path[level].bp_index = index;
 	}
@@ -614,7 +704,7 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
 	return ret;
 }
 
-static void nilfs_btree_promote_key(struct nilfs_btree *btree,
+static void nilfs_btree_promote_key(struct nilfs_bmap *btree,
 				    struct nilfs_btree_path *path,
 				    int level, __u64 key)
 {
@@ -636,16 +726,18 @@ static void nilfs_btree_promote_key(struct nilfs_btree *btree,
 	}
 }
 
-static void nilfs_btree_do_insert(struct nilfs_btree *btree,
+static void nilfs_btree_do_insert(struct nilfs_bmap *btree,
 				  struct nilfs_btree_path *path,
 				  int level, __u64 *keyp, __u64 *ptrp)
 {
 	struct nilfs_btree_node *node;
+	int ncblk;
 
 	if (level < nilfs_btree_height(btree) - 1) {
 		node = nilfs_btree_get_nonroot_node(path, level);
-		nilfs_btree_node_insert(btree, node, *keyp, *ptrp,
-					path[level].bp_index);
+		ncblk = nilfs_btree_nchildren_per_block(btree);
+		nilfs_btree_node_insert(node, path[level].bp_index,
+					*keyp, *ptrp, ncblk);
 		if (!buffer_dirty(path[level].bp_bh))
 			nilfs_btnode_mark_dirty(path[level].bp_bh);
 
@@ -655,22 +747,24 @@ static void nilfs_btree_do_insert(struct nilfs_btree *btree,
 									 0));
 	} else {
 		node = nilfs_btree_get_root(btree);
-		nilfs_btree_node_insert(btree, node, *keyp, *ptrp,
-					path[level].bp_index);
+		nilfs_btree_node_insert(node, path[level].bp_index,
+					*keyp, *ptrp,
+					NILFS_BTREE_ROOT_NCHILDREN_MAX);
 	}
 }
 
-static void nilfs_btree_carry_left(struct nilfs_btree *btree,
+static void nilfs_btree_carry_left(struct nilfs_bmap *btree,
 				   struct nilfs_btree_path *path,
 				   int level, __u64 *keyp, __u64 *ptrp)
 {
 	struct nilfs_btree_node *node, *left;
-	int nchildren, lnchildren, n, move;
+	int nchildren, lnchildren, n, move, ncblk;
 
 	node = nilfs_btree_get_nonroot_node(path, level);
 	left = nilfs_btree_get_sib_node(path, level);
 	nchildren = nilfs_btree_node_get_nchildren(node);
 	lnchildren = nilfs_btree_node_get_nchildren(left);
+	ncblk = nilfs_btree_nchildren_per_block(btree);
 	move = 0;
 
 	n = (nchildren + lnchildren + 1) / 2 - lnchildren;
@@ -680,7 +774,7 @@ static void nilfs_btree_carry_left(struct nilfs_btree *btree,
 		move = 1;
 	}
 
-	nilfs_btree_node_move_left(btree, left, node, n);
+	nilfs_btree_node_move_left(left, node, n, ncblk, ncblk);
 
 	if (!buffer_dirty(path[level].bp_bh))
 		nilfs_btnode_mark_dirty(path[level].bp_bh);
@@ -705,17 +799,18 @@ static void nilfs_btree_carry_left(struct nilfs_btree *btree,
 	nilfs_btree_do_insert(btree, path, level, keyp, ptrp);
 }
 
-static void nilfs_btree_carry_right(struct nilfs_btree *btree,
+static void nilfs_btree_carry_right(struct nilfs_bmap *btree,
 				    struct nilfs_btree_path *path,
 				    int level, __u64 *keyp, __u64 *ptrp)
 {
 	struct nilfs_btree_node *node, *right;
-	int nchildren, rnchildren, n, move;
+	int nchildren, rnchildren, n, move, ncblk;
 
 	node = nilfs_btree_get_nonroot_node(path, level);
 	right = nilfs_btree_get_sib_node(path, level);
 	nchildren = nilfs_btree_node_get_nchildren(node);
 	rnchildren = nilfs_btree_node_get_nchildren(right);
+	ncblk = nilfs_btree_nchildren_per_block(btree);
 	move = 0;
 
 	n = (nchildren + rnchildren + 1) / 2 - rnchildren;
@@ -725,7 +820,7 @@ static void nilfs_btree_carry_right(struct nilfs_btree *btree,
 		move = 1;
 	}
 
-	nilfs_btree_node_move_right(btree, node, right, n);
+	nilfs_btree_node_move_right(node, right, n, ncblk, ncblk);
 
 	if (!buffer_dirty(path[level].bp_bh))
 		nilfs_btnode_mark_dirty(path[level].bp_bh);
@@ -751,18 +846,19 @@ static void nilfs_btree_carry_right(struct nilfs_btree *btree,
 	nilfs_btree_do_insert(btree, path, level, keyp, ptrp);
 }
 
-static void nilfs_btree_split(struct nilfs_btree *btree,
+static void nilfs_btree_split(struct nilfs_bmap *btree,
 			      struct nilfs_btree_path *path,
 			      int level, __u64 *keyp, __u64 *ptrp)
 {
 	struct nilfs_btree_node *node, *right;
 	__u64 newkey;
 	__u64 newptr;
-	int nchildren, n, move;
+	int nchildren, n, move, ncblk;
 
 	node = nilfs_btree_get_nonroot_node(path, level);
 	right = nilfs_btree_get_sib_node(path, level);
 	nchildren = nilfs_btree_node_get_nchildren(node);
+	ncblk = nilfs_btree_nchildren_per_block(btree);
 	move = 0;
 
 	n = (nchildren + 1) / 2;
@@ -771,7 +867,7 @@ static void nilfs_btree_split(struct nilfs_btree *btree,
 		move = 1;
 	}
 
-	nilfs_btree_node_move_right(btree, node, right, n);
+	nilfs_btree_node_move_right(node, right, n, ncblk, ncblk);
 
 	if (!buffer_dirty(path[level].bp_bh))
 		nilfs_btnode_mark_dirty(path[level].bp_bh);
@@ -783,8 +879,8 @@ static void nilfs_btree_split(struct nilfs_btree *btree,
 
 	if (move) {
 		path[level].bp_index -= nilfs_btree_node_get_nchildren(node);
-		nilfs_btree_node_insert(btree, right, *keyp, *ptrp,
-					path[level].bp_index);
+		nilfs_btree_node_insert(right, path[level].bp_index,
+					*keyp, *ptrp, ncblk);
 
 		*keyp = nilfs_btree_node_get_key(right, 0);
 		*ptrp = path[level].bp_newreq.bpr_ptr;
@@ -805,19 +901,21 @@ static void nilfs_btree_split(struct nilfs_btree *btree,
 	path[level + 1].bp_index++;
 }
 
-static void nilfs_btree_grow(struct nilfs_btree *btree,
+static void nilfs_btree_grow(struct nilfs_bmap *btree,
 			     struct nilfs_btree_path *path,
 			     int level, __u64 *keyp, __u64 *ptrp)
 {
 	struct nilfs_btree_node *root, *child;
-	int n;
+	int n, ncblk;
 
 	root = nilfs_btree_get_root(btree);
 	child = nilfs_btree_get_sib_node(path, level);
+	ncblk = nilfs_btree_nchildren_per_block(btree);
 
 	n = nilfs_btree_node_get_nchildren(root);
 
-	nilfs_btree_node_move_right(btree, root, child, n);
+	nilfs_btree_node_move_right(root, child, n,
+				    NILFS_BTREE_ROOT_NCHILDREN_MAX, ncblk);
 	nilfs_btree_node_set_level(root, level + 1);
 
 	if (!buffer_dirty(path[level].bp_sib_bh))
@@ -832,11 +930,11 @@ static void nilfs_btree_grow(struct nilfs_btree *btree,
 	*ptrp = path[level].bp_newreq.bpr_ptr;
 }
 
-static __u64 nilfs_btree_find_near(const struct nilfs_btree *btree,
+static __u64 nilfs_btree_find_near(const struct nilfs_bmap *btree,
 				   const struct nilfs_btree_path *path)
 {
 	struct nilfs_btree_node *node;
-	int level;
+	int level, ncmax;
 
 	if (path == NULL)
 		return NILFS_BMAP_INVALID_PTR;
@@ -844,29 +942,30 @@ static __u64 nilfs_btree_find_near(const struct nilfs_btree *btree,
 	/* left sibling */
 	level = NILFS_BTREE_LEVEL_NODE_MIN;
 	if (path[level].bp_index > 0) {
-		node = nilfs_btree_get_node(btree, path, level);
-		return nilfs_btree_node_get_ptr(btree, node,
-						path[level].bp_index - 1);
+		node = nilfs_btree_get_node(btree, path, level, &ncmax);
+		return nilfs_btree_node_get_ptr(node,
+						path[level].bp_index - 1,
+						ncmax);
 	}
 
 	/* parent */
 	level = NILFS_BTREE_LEVEL_NODE_MIN + 1;
 	if (level <= nilfs_btree_height(btree) - 1) {
-		node = nilfs_btree_get_node(btree, path, level);
-		return nilfs_btree_node_get_ptr(btree, node,
-						path[level].bp_index);
+		node = nilfs_btree_get_node(btree, path, level, &ncmax);
+		return nilfs_btree_node_get_ptr(node, path[level].bp_index,
+						ncmax);
 	}
 
 	return NILFS_BMAP_INVALID_PTR;
 }
 
-static __u64 nilfs_btree_find_target_v(const struct nilfs_btree *btree,
+static __u64 nilfs_btree_find_target_v(const struct nilfs_bmap *btree,
 				       const struct nilfs_btree_path *path,
 				       __u64 key)
 {
 	__u64 ptr;
 
-	ptr = nilfs_bmap_find_target_seq(&btree->bt_bmap, key);
+	ptr = nilfs_bmap_find_target_seq(btree, key);
 	if (ptr != NILFS_BMAP_INVALID_PTR)
 		/* sequential access */
 		return ptr;
@@ -877,17 +976,10 @@ static __u64 nilfs_btree_find_target_v(const struct nilfs_btree *btree,
 			return ptr;
 	}
 	/* block group */
-	return nilfs_bmap_find_target_in_group(&btree->bt_bmap);
-}
-
-static void nilfs_btree_set_target_v(struct nilfs_btree *btree, __u64 key,
-				     __u64 ptr)
-{
-	btree->bt_bmap.b_last_allocated_key = key;
-	btree->bt_bmap.b_last_allocated_ptr = ptr;
+	return nilfs_bmap_find_target_in_group(btree);
 }
 
-static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
+static int nilfs_btree_prepare_insert(struct nilfs_bmap *btree,
 				      struct nilfs_btree_path *path,
 				      int *levelp, __u64 key, __u64 ptr,
 				      struct nilfs_bmap_stats *stats)
@@ -895,79 +987,78 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
 	struct buffer_head *bh;
 	struct nilfs_btree_node *node, *parent, *sib;
 	__u64 sibptr;
-	int pindex, level, ret;
+	int pindex, level, ncmax, ncblk, ret;
 	struct inode *dat = NULL;
 
 	stats->bs_nblocks = 0;
 	level = NILFS_BTREE_LEVEL_DATA;
 
 	/* allocate a new ptr for data block */
-	if (NILFS_BMAP_USE_VBN(&btree->bt_bmap)) {
+	if (NILFS_BMAP_USE_VBN(btree)) {
 		path[level].bp_newreq.bpr_ptr =
 			nilfs_btree_find_target_v(btree, path, key);
-		dat = nilfs_bmap_get_dat(&btree->bt_bmap);
+		dat = nilfs_bmap_get_dat(btree);
 	}
 
-	ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap,
-					   &path[level].bp_newreq, dat);
+	ret = nilfs_bmap_prepare_alloc_ptr(btree, &path[level].bp_newreq, dat);
 	if (ret < 0)
 		goto err_out_data;
 
+	ncblk = nilfs_btree_nchildren_per_block(btree);
+
 	for (level = NILFS_BTREE_LEVEL_NODE_MIN;
 	     level < nilfs_btree_height(btree) - 1;
 	     level++) {
 		node = nilfs_btree_get_nonroot_node(path, level);
-		if (nilfs_btree_node_get_nchildren(node) <
-		    nilfs_btree_node_nchildren_max(node, btree)) {
+		if (nilfs_btree_node_get_nchildren(node) < ncblk) {
 			path[level].bp_op = nilfs_btree_do_insert;
 			stats->bs_nblocks++;
 			goto out;
 		}
 
-		parent = nilfs_btree_get_node(btree, path, level + 1);
+		parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax);
 		pindex = path[level + 1].bp_index;
 
 		/* left sibling */
 		if (pindex > 0) {
-			sibptr = nilfs_btree_node_get_ptr(btree, parent,
-							  pindex - 1);
+			sibptr = nilfs_btree_node_get_ptr(parent, pindex - 1,
+							  ncmax);
 			ret = nilfs_btree_get_block(btree, sibptr, &bh);
 			if (ret < 0)
 				goto err_out_child_node;
 			sib = (struct nilfs_btree_node *)bh->b_data;
-			if (nilfs_btree_node_get_nchildren(sib) <
-			    nilfs_btree_node_nchildren_max(sib, btree)) {
+			if (nilfs_btree_node_get_nchildren(sib) < ncblk) {
 				path[level].bp_sib_bh = bh;
 				path[level].bp_op = nilfs_btree_carry_left;
 				stats->bs_nblocks++;
 				goto out;
-			} else
+			} else {
 				brelse(bh);
+			}
 		}
 
 		/* right sibling */
-		if (pindex <
-		    nilfs_btree_node_get_nchildren(parent) - 1) {
-			sibptr = nilfs_btree_node_get_ptr(btree, parent,
-							  pindex + 1);
+		if (pindex < nilfs_btree_node_get_nchildren(parent) - 1) {
+			sibptr = nilfs_btree_node_get_ptr(parent, pindex + 1,
+							  ncmax);
 			ret = nilfs_btree_get_block(btree, sibptr, &bh);
 			if (ret < 0)
 				goto err_out_child_node;
 			sib = (struct nilfs_btree_node *)bh->b_data;
-			if (nilfs_btree_node_get_nchildren(sib) <
-			    nilfs_btree_node_nchildren_max(sib, btree)) {
+			if (nilfs_btree_node_get_nchildren(sib) < ncblk) {
 				path[level].bp_sib_bh = bh;
 				path[level].bp_op = nilfs_btree_carry_right;
 				stats->bs_nblocks++;
 				goto out;
-			} else
+			} else {
 				brelse(bh);
+			}
 		}
 
 		/* split */
 		path[level].bp_newreq.bpr_ptr =
 			path[level - 1].bp_newreq.bpr_ptr + 1;
-		ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap,
+		ret = nilfs_bmap_prepare_alloc_ptr(btree,
 						   &path[level].bp_newreq, dat);
 		if (ret < 0)
 			goto err_out_child_node;
@@ -979,9 +1070,8 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
 
 		stats->bs_nblocks++;
 
-		nilfs_btree_node_init(btree,
-				      (struct nilfs_btree_node *)bh->b_data,
-				      0, level, 0, NULL, NULL);
+		sib = (struct nilfs_btree_node *)bh->b_data;
+		nilfs_btree_node_init(sib, 0, level, 0, ncblk, NULL, NULL);
 		path[level].bp_sib_bh = bh;
 		path[level].bp_op = nilfs_btree_split;
 	}
@@ -989,7 +1079,7 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
 	/* root */
 	node = nilfs_btree_get_root(btree);
 	if (nilfs_btree_node_get_nchildren(node) <
-	    nilfs_btree_node_nchildren_max(node, btree)) {
+	    NILFS_BTREE_ROOT_NCHILDREN_MAX) {
 		path[level].bp_op = nilfs_btree_do_insert;
 		stats->bs_nblocks++;
 		goto out;
@@ -997,8 +1087,7 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
 
 	/* grow */
 	path[level].bp_newreq.bpr_ptr = path[level - 1].bp_newreq.bpr_ptr + 1;
-	ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap,
-					   &path[level].bp_newreq, dat);
+	ret = nilfs_bmap_prepare_alloc_ptr(btree, &path[level].bp_newreq, dat);
 	if (ret < 0)
 		goto err_out_child_node;
 	ret = nilfs_btree_get_new_block(btree, path[level].bp_newreq.bpr_ptr,
@@ -1006,8 +1095,8 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
 	if (ret < 0)
 		goto err_out_curr_node;
 
-	nilfs_btree_node_init(btree, (struct nilfs_btree_node *)bh->b_data,
-			      0, level, 0, NULL, NULL);
+	nilfs_btree_node_init((struct nilfs_btree_node *)bh->b_data,
+			      0, level, 0, ncblk, NULL, NULL);
 	path[level].bp_sib_bh = bh;
 	path[level].bp_op = nilfs_btree_grow;
 
@@ -1024,25 +1113,22 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
 
 	/* error */
  err_out_curr_node:
-	nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq,
-				   dat);
+	nilfs_bmap_abort_alloc_ptr(btree, &path[level].bp_newreq, dat);
  err_out_child_node:
 	for (level--; level > NILFS_BTREE_LEVEL_DATA; level--) {
 		nilfs_btnode_delete(path[level].bp_sib_bh);
-		nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap,
-					   &path[level].bp_newreq, dat);
+		nilfs_bmap_abort_alloc_ptr(btree, &path[level].bp_newreq, dat);
 
 	}
 
-	nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq,
-				   dat);
+	nilfs_bmap_abort_alloc_ptr(btree, &path[level].bp_newreq, dat);
  err_out_data:
 	*levelp = level;
 	stats->bs_nblocks = 0;
 	return ret;
 }
 
-static void nilfs_btree_commit_insert(struct nilfs_btree *btree,
+static void nilfs_btree_commit_insert(struct nilfs_bmap *btree,
 				      struct nilfs_btree_path *path,
 				      int maxlevel, __u64 key, __u64 ptr)
 {
@@ -1051,35 +1137,33 @@ static void nilfs_btree_commit_insert(struct nilfs_btree *btree,
 
 	set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr));
 	ptr = path[NILFS_BTREE_LEVEL_DATA].bp_newreq.bpr_ptr;
-	if (NILFS_BMAP_USE_VBN(&btree->bt_bmap)) {
-		nilfs_btree_set_target_v(btree, key, ptr);
-		dat = nilfs_bmap_get_dat(&btree->bt_bmap);
+	if (NILFS_BMAP_USE_VBN(btree)) {
+		nilfs_bmap_set_target_v(btree, key, ptr);
+		dat = nilfs_bmap_get_dat(btree);
 	}
 
 	for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) {
-		nilfs_bmap_commit_alloc_ptr(&btree->bt_bmap,
+		nilfs_bmap_commit_alloc_ptr(btree,
 					    &path[level - 1].bp_newreq, dat);
 		path[level].bp_op(btree, path, level, &key, &ptr);
 	}
 
-	if (!nilfs_bmap_dirty(&btree->bt_bmap))
-		nilfs_bmap_set_dirty(&btree->bt_bmap);
+	if (!nilfs_bmap_dirty(btree))
+		nilfs_bmap_set_dirty(btree);
 }
 
-static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
+static int nilfs_btree_insert(struct nilfs_bmap *btree, __u64 key, __u64 ptr)
 {
-	struct nilfs_btree *btree;
 	struct nilfs_btree_path *path;
 	struct nilfs_bmap_stats stats;
 	int level, ret;
 
-	btree = (struct nilfs_btree *)bmap;
 	path = nilfs_btree_alloc_path();
 	if (path == NULL)
 		return -ENOMEM;
 
 	ret = nilfs_btree_do_lookup(btree, path, key, NULL,
-				    NILFS_BTREE_LEVEL_NODE_MIN);
+				    NILFS_BTREE_LEVEL_NODE_MIN, 0);
 	if (ret != -ENOENT) {
 		if (ret == 0)
 			ret = -EEXIST;
@@ -1090,23 +1174,25 @@ static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
 	if (ret < 0)
 		goto out;
 	nilfs_btree_commit_insert(btree, path, level, key, ptr);
-	nilfs_bmap_add_blocks(bmap, stats.bs_nblocks);
+	nilfs_bmap_add_blocks(btree, stats.bs_nblocks);
 
  out:
 	nilfs_btree_free_path(path);
 	return ret;
 }
 
-static void nilfs_btree_do_delete(struct nilfs_btree *btree,
+static void nilfs_btree_do_delete(struct nilfs_bmap *btree,
 				  struct nilfs_btree_path *path,
 				  int level, __u64 *keyp, __u64 *ptrp)
 {
 	struct nilfs_btree_node *node;
+	int ncblk;
 
 	if (level < nilfs_btree_height(btree) - 1) {
 		node = nilfs_btree_get_nonroot_node(path, level);
-		nilfs_btree_node_delete(btree, node, keyp, ptrp,
-					path[level].bp_index);
+		ncblk = nilfs_btree_nchildren_per_block(btree);
+		nilfs_btree_node_delete(node, path[level].bp_index,
+					keyp, ptrp, ncblk);
 		if (!buffer_dirty(path[level].bp_bh))
 			nilfs_btnode_mark_dirty(path[level].bp_bh);
 		if (path[level].bp_index == 0)
@@ -1114,17 +1200,18 @@ static void nilfs_btree_do_delete(struct nilfs_btree *btree,
 				nilfs_btree_node_get_key(node, 0));
 	} else {
 		node = nilfs_btree_get_root(btree);
-		nilfs_btree_node_delete(btree, node, keyp, ptrp,
-					path[level].bp_index);
+		nilfs_btree_node_delete(node, path[level].bp_index,
+					keyp, ptrp,
+					NILFS_BTREE_ROOT_NCHILDREN_MAX);
 	}
 }
 
-static void nilfs_btree_borrow_left(struct nilfs_btree *btree,
+static void nilfs_btree_borrow_left(struct nilfs_bmap *btree,
 				    struct nilfs_btree_path *path,
 				    int level, __u64 *keyp, __u64 *ptrp)
 {
 	struct nilfs_btree_node *node, *left;
-	int nchildren, lnchildren, n;
+	int nchildren, lnchildren, n, ncblk;
 
 	nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
 
@@ -1132,10 +1219,11 @@ static void nilfs_btree_borrow_left(struct nilfs_btree *btree,
 	left = nilfs_btree_get_sib_node(path, level);
 	nchildren = nilfs_btree_node_get_nchildren(node);
 	lnchildren = nilfs_btree_node_get_nchildren(left);
+	ncblk = nilfs_btree_nchildren_per_block(btree);
 
 	n = (nchildren + lnchildren) / 2 - nchildren;
 
-	nilfs_btree_node_move_right(btree, left, node, n);
+	nilfs_btree_node_move_right(left, node, n, ncblk, ncblk);
 
 	if (!buffer_dirty(path[level].bp_bh))
 		nilfs_btnode_mark_dirty(path[level].bp_bh);
@@ -1150,12 +1238,12 @@ static void nilfs_btree_borrow_left(struct nilfs_btree *btree,
 	path[level].bp_index += n;
 }
 
-static void nilfs_btree_borrow_right(struct nilfs_btree *btree,
+static void nilfs_btree_borrow_right(struct nilfs_bmap *btree,
 				     struct nilfs_btree_path *path,
 				     int level, __u64 *keyp, __u64 *ptrp)
 {
 	struct nilfs_btree_node *node, *right;
-	int nchildren, rnchildren, n;
+	int nchildren, rnchildren, n, ncblk;
 
 	nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
 
@@ -1163,10 +1251,11 @@ static void nilfs_btree_borrow_right(struct nilfs_btree *btree,
 	right = nilfs_btree_get_sib_node(path, level);
 	nchildren = nilfs_btree_node_get_nchildren(node);
 	rnchildren = nilfs_btree_node_get_nchildren(right);
+	ncblk = nilfs_btree_nchildren_per_block(btree);
 
 	n = (nchildren + rnchildren) / 2 - nchildren;
 
-	nilfs_btree_node_move_left(btree, node, right, n);
+	nilfs_btree_node_move_left(node, right, n, ncblk, ncblk);
 
 	if (!buffer_dirty(path[level].bp_bh))
 		nilfs_btnode_mark_dirty(path[level].bp_bh);
@@ -1182,21 +1271,22 @@ static void nilfs_btree_borrow_right(struct nilfs_btree *btree,
 	path[level].bp_sib_bh = NULL;
 }
 
-static void nilfs_btree_concat_left(struct nilfs_btree *btree,
+static void nilfs_btree_concat_left(struct nilfs_bmap *btree,
 				    struct nilfs_btree_path *path,
 				    int level, __u64 *keyp, __u64 *ptrp)
 {
 	struct nilfs_btree_node *node, *left;
-	int n;
+	int n, ncblk;
 
 	nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
 
 	node = nilfs_btree_get_nonroot_node(path, level);
 	left = nilfs_btree_get_sib_node(path, level);
+	ncblk = nilfs_btree_nchildren_per_block(btree);
 
 	n = nilfs_btree_node_get_nchildren(node);
 
-	nilfs_btree_node_move_left(btree, left, node, n);
+	nilfs_btree_node_move_left(left, node, n, ncblk, ncblk);
 
 	if (!buffer_dirty(path[level].bp_sib_bh))
 		nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
@@ -1207,21 +1297,22 @@ static void nilfs_btree_concat_left(struct nilfs_btree *btree,
 	path[level].bp_index += nilfs_btree_node_get_nchildren(left);
 }
 
-static void nilfs_btree_concat_right(struct nilfs_btree *btree,
+static void nilfs_btree_concat_right(struct nilfs_bmap *btree,
 				     struct nilfs_btree_path *path,
 				     int level, __u64 *keyp, __u64 *ptrp)
 {
 	struct nilfs_btree_node *node, *right;
-	int n;
+	int n, ncblk;
 
 	nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
 
 	node = nilfs_btree_get_nonroot_node(path, level);
 	right = nilfs_btree_get_sib_node(path, level);
+	ncblk = nilfs_btree_nchildren_per_block(btree);
 
 	n = nilfs_btree_node_get_nchildren(right);
 
-	nilfs_btree_node_move_left(btree, node, right, n);
+	nilfs_btree_node_move_left(node, right, n, ncblk, ncblk);
 
 	if (!buffer_dirty(path[level].bp_bh))
 		nilfs_btnode_mark_dirty(path[level].bp_bh);
@@ -1231,29 +1322,32 @@ static void nilfs_btree_concat_right(struct nilfs_btree *btree,
 	path[level + 1].bp_index++;
 }
 
-static void nilfs_btree_shrink(struct nilfs_btree *btree,
+static void nilfs_btree_shrink(struct nilfs_bmap *btree,
 			       struct nilfs_btree_path *path,
 			       int level, __u64 *keyp, __u64 *ptrp)
 {
 	struct nilfs_btree_node *root, *child;
-	int n;
+	int n, ncblk;
 
 	nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
 
 	root = nilfs_btree_get_root(btree);
 	child = nilfs_btree_get_nonroot_node(path, level);
+	ncblk = nilfs_btree_nchildren_per_block(btree);
 
-	nilfs_btree_node_delete(btree, root, NULL, NULL, 0);
+	nilfs_btree_node_delete(root, 0, NULL, NULL,
+				NILFS_BTREE_ROOT_NCHILDREN_MAX);
 	nilfs_btree_node_set_level(root, level);
 	n = nilfs_btree_node_get_nchildren(child);
-	nilfs_btree_node_move_left(btree, root, child, n);
+	nilfs_btree_node_move_left(root, child, n,
+				   NILFS_BTREE_ROOT_NCHILDREN_MAX, ncblk);
 
 	nilfs_btnode_delete(path[level].bp_bh);
 	path[level].bp_bh = NULL;
 }
 
 
-static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
+static int nilfs_btree_prepare_delete(struct nilfs_bmap *btree,
 				      struct nilfs_btree_path *path,
 				      int *levelp,
 				      struct nilfs_bmap_stats *stats,
@@ -1262,42 +1356,43 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
 	struct buffer_head *bh;
 	struct nilfs_btree_node *node, *parent, *sib;
 	__u64 sibptr;
-	int pindex, level, ret;
+	int pindex, level, ncmin, ncmax, ncblk, ret;
 
 	ret = 0;
 	stats->bs_nblocks = 0;
+	ncmin = NILFS_BTREE_NODE_NCHILDREN_MIN(nilfs_btree_node_size(btree));
+	ncblk = nilfs_btree_nchildren_per_block(btree);
+
 	for (level = NILFS_BTREE_LEVEL_NODE_MIN;
 	     level < nilfs_btree_height(btree) - 1;
 	     level++) {
 		node = nilfs_btree_get_nonroot_node(path, level);
 		path[level].bp_oldreq.bpr_ptr =
-			nilfs_btree_node_get_ptr(btree, node,
-						 path[level].bp_index);
-		ret = nilfs_bmap_prepare_end_ptr(&btree->bt_bmap,
+			nilfs_btree_node_get_ptr(node, path[level].bp_index,
+						 ncblk);
+		ret = nilfs_bmap_prepare_end_ptr(btree,
 						 &path[level].bp_oldreq, dat);
 		if (ret < 0)
 			goto err_out_child_node;
 
-		if (nilfs_btree_node_get_nchildren(node) >
-		    nilfs_btree_node_nchildren_min(node, btree)) {
+		if (nilfs_btree_node_get_nchildren(node) > ncmin) {
 			path[level].bp_op = nilfs_btree_do_delete;
 			stats->bs_nblocks++;
 			goto out;
 		}
 
-		parent = nilfs_btree_get_node(btree, path, level + 1);
+		parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax);
 		pindex = path[level + 1].bp_index;
 
 		if (pindex > 0) {
 			/* left sibling */
-			sibptr = nilfs_btree_node_get_ptr(btree, parent,
-							  pindex - 1);
+			sibptr = nilfs_btree_node_get_ptr(parent, pindex - 1,
+							  ncmax);
 			ret = nilfs_btree_get_block(btree, sibptr, &bh);
 			if (ret < 0)
 				goto err_out_curr_node;
 			sib = (struct nilfs_btree_node *)bh->b_data;
-			if (nilfs_btree_node_get_nchildren(sib) >
-			    nilfs_btree_node_nchildren_min(sib, btree)) {
+			if (nilfs_btree_node_get_nchildren(sib) > ncmin) {
 				path[level].bp_sib_bh = bh;
 				path[level].bp_op = nilfs_btree_borrow_left;
 				stats->bs_nblocks++;
@@ -1311,14 +1406,13 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
 		} else if (pindex <
 			   nilfs_btree_node_get_nchildren(parent) - 1) {
 			/* right sibling */
-			sibptr = nilfs_btree_node_get_ptr(btree, parent,
-							  pindex + 1);
+			sibptr = nilfs_btree_node_get_ptr(parent, pindex + 1,
+							  ncmax);
 			ret = nilfs_btree_get_block(btree, sibptr, &bh);
 			if (ret < 0)
 				goto err_out_curr_node;
 			sib = (struct nilfs_btree_node *)bh->b_data;
-			if (nilfs_btree_node_get_nchildren(sib) >
-			    nilfs_btree_node_nchildren_min(sib, btree)) {
+			if (nilfs_btree_node_get_nchildren(sib) > ncmin) {
 				path[level].bp_sib_bh = bh;
 				path[level].bp_op = nilfs_btree_borrow_right;
 				stats->bs_nblocks++;
@@ -1349,10 +1443,10 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
 
 	node = nilfs_btree_get_root(btree);
 	path[level].bp_oldreq.bpr_ptr =
-		nilfs_btree_node_get_ptr(btree, node, path[level].bp_index);
+		nilfs_btree_node_get_ptr(node, path[level].bp_index,
+					 NILFS_BTREE_ROOT_NCHILDREN_MAX);
 
-	ret = nilfs_bmap_prepare_end_ptr(&btree->bt_bmap,
-					 &path[level].bp_oldreq, dat);
+	ret = nilfs_bmap_prepare_end_ptr(btree, &path[level].bp_oldreq, dat);
 	if (ret < 0)
 		goto err_out_child_node;
 
@@ -1367,75 +1461,68 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
 
 	/* error */
  err_out_curr_node:
-	nilfs_bmap_abort_end_ptr(&btree->bt_bmap, &path[level].bp_oldreq, dat);
+	nilfs_bmap_abort_end_ptr(btree, &path[level].bp_oldreq, dat);
  err_out_child_node:
 	for (level--; level >= NILFS_BTREE_LEVEL_NODE_MIN; level--) {
 		brelse(path[level].bp_sib_bh);
-		nilfs_bmap_abort_end_ptr(&btree->bt_bmap,
-					 &path[level].bp_oldreq, dat);
+		nilfs_bmap_abort_end_ptr(btree, &path[level].bp_oldreq, dat);
 	}
 	*levelp = level;
 	stats->bs_nblocks = 0;
 	return ret;
 }
 
-static void nilfs_btree_commit_delete(struct nilfs_btree *btree,
+static void nilfs_btree_commit_delete(struct nilfs_bmap *btree,
 				      struct nilfs_btree_path *path,
 				      int maxlevel, struct inode *dat)
 {
 	int level;
 
 	for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) {
-		nilfs_bmap_commit_end_ptr(&btree->bt_bmap,
-					  &path[level].bp_oldreq, dat);
+		nilfs_bmap_commit_end_ptr(btree, &path[level].bp_oldreq, dat);
 		path[level].bp_op(btree, path, level, NULL, NULL);
 	}
 
-	if (!nilfs_bmap_dirty(&btree->bt_bmap))
-		nilfs_bmap_set_dirty(&btree->bt_bmap);
+	if (!nilfs_bmap_dirty(btree))
+		nilfs_bmap_set_dirty(btree);
 }
 
-static int nilfs_btree_delete(struct nilfs_bmap *bmap, __u64 key)
+static int nilfs_btree_delete(struct nilfs_bmap *btree, __u64 key)
 
 {
-	struct nilfs_btree *btree;
 	struct nilfs_btree_path *path;
 	struct nilfs_bmap_stats stats;
 	struct inode *dat;
 	int level, ret;
 
-	btree = (struct nilfs_btree *)bmap;
 	path = nilfs_btree_alloc_path();
 	if (path == NULL)
 		return -ENOMEM;
 
 	ret = nilfs_btree_do_lookup(btree, path, key, NULL,
-				    NILFS_BTREE_LEVEL_NODE_MIN);
+				    NILFS_BTREE_LEVEL_NODE_MIN, 0);
 	if (ret < 0)
 		goto out;
 
 
-	dat = NILFS_BMAP_USE_VBN(&btree->bt_bmap) ?
-		nilfs_bmap_get_dat(&btree->bt_bmap) : NULL;
+	dat = NILFS_BMAP_USE_VBN(btree) ? nilfs_bmap_get_dat(btree) : NULL;
 
 	ret = nilfs_btree_prepare_delete(btree, path, &level, &stats, dat);
 	if (ret < 0)
 		goto out;
 	nilfs_btree_commit_delete(btree, path, level, dat);
-	nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks);
+	nilfs_bmap_sub_blocks(btree, stats.bs_nblocks);
 
 out:
 	nilfs_btree_free_path(path);
 	return ret;
 }
 
-static int nilfs_btree_last_key(const struct nilfs_bmap *bmap, __u64 *keyp)
+static int nilfs_btree_last_key(const struct nilfs_bmap *btree, __u64 *keyp)
 {
-	struct nilfs_btree *btree;
 	struct nilfs_btree_path *path;
 	int ret;
 
-	btree = (struct nilfs_btree *)bmap;
 	path = nilfs_btree_alloc_path();
 	if (path == NULL)
 		return -ENOMEM;
@@ -1447,16 +1534,14 @@ static int nilfs_btree_last_key(const struct nilfs_bmap *bmap, __u64 *keyp)
 	return ret;
 }
 
-static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key)
+static int nilfs_btree_check_delete(struct nilfs_bmap *btree, __u64 key)
 {
 	struct buffer_head *bh;
-	struct nilfs_btree *btree;
 	struct nilfs_btree_node *root, *node;
 	__u64 maxkey, nextmaxkey;
 	__u64 ptr;
 	int nchildren, ret;
 
-	btree = (struct nilfs_btree *)bmap;
 	root = nilfs_btree_get_root(btree);
 	switch (nilfs_btree_height(btree)) {
 	case 2:
@@ -1467,7 +1552,8 @@ static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key)
 		nchildren = nilfs_btree_node_get_nchildren(root);
 		if (nchildren > 1)
 			return 0;
-		ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1);
+		ptr = nilfs_btree_node_get_ptr(root, nchildren - 1,
+					       NILFS_BTREE_ROOT_NCHILDREN_MAX);
 		ret = nilfs_btree_get_block(btree, ptr, &bh);
 		if (ret < 0)
 			return ret;
@@ -1487,32 +1573,33 @@ static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key)
 	return (maxkey == key) && (nextmaxkey < NILFS_BMAP_LARGE_LOW);
 }
 
-static int nilfs_btree_gather_data(struct nilfs_bmap *bmap,
+static int nilfs_btree_gather_data(struct nilfs_bmap *btree,
 				   __u64 *keys, __u64 *ptrs, int nitems)
 {
 	struct buffer_head *bh;
-	struct nilfs_btree *btree;
 	struct nilfs_btree_node *node, *root;
 	__le64 *dkeys;
 	__le64 *dptrs;
 	__u64 ptr;
-	int nchildren, i, ret;
+	int nchildren, ncmax, i, ret;
 
-	btree = (struct nilfs_btree *)bmap;
 	root = nilfs_btree_get_root(btree);
 	switch (nilfs_btree_height(btree)) {
 	case 2:
 		bh = NULL;
 		node = root;
+		ncmax = NILFS_BTREE_ROOT_NCHILDREN_MAX;
 		break;
 	case 3:
 		nchildren = nilfs_btree_node_get_nchildren(root);
 		WARN_ON(nchildren > 1);
-		ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1);
+		ptr = nilfs_btree_node_get_ptr(root, nchildren - 1,
+					       NILFS_BTREE_ROOT_NCHILDREN_MAX);
 		ret = nilfs_btree_get_block(btree, ptr, &bh);
 		if (ret < 0)
 			return ret;
 		node = (struct nilfs_btree_node *)bh->b_data;
+		ncmax = nilfs_btree_nchildren_per_block(btree);
 		break;
 	default:
 		node = NULL;
@@ -1523,10 +1610,10 @@ static int nilfs_btree_gather_data(struct nilfs_bmap *bmap,
 	if (nchildren < nitems)
 		nitems = nchildren;
 	dkeys = nilfs_btree_node_dkeys(node);
-	dptrs = nilfs_btree_node_dptrs(node, btree);
+	dptrs = nilfs_btree_node_dptrs(node, ncmax);
 	for (i = 0; i < nitems; i++) {
-		keys[i] = nilfs_bmap_dkey_to_key(dkeys[i]);
-		ptrs[i] = nilfs_bmap_dptr_to_ptr(dptrs[i]);
+		keys[i] = le64_to_cpu(dkeys[i]);
+		ptrs[i] = le64_to_cpu(dptrs[i]);
 	}
 
 	if (bh != NULL)
@@ -1536,14 +1623,13 @@ static int nilfs_btree_gather_data(struct nilfs_bmap *bmap,
 }
 
 static int
-nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key,
+nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *btree, __u64 key,
 				       union nilfs_bmap_ptr_req *dreq,
 				       union nilfs_bmap_ptr_req *nreq,
 				       struct buffer_head **bhp,
 				       struct nilfs_bmap_stats *stats)
 {
 	struct buffer_head *bh;
-	struct nilfs_btree *btree = (struct nilfs_btree *)bmap;
 	struct inode *dat = NULL;
 	int ret;
 
@@ -1551,12 +1637,12 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key,
 
 	/* for data */
 	/* cannot find near ptr */
-	if (NILFS_BMAP_USE_VBN(bmap)) {
+	if (NILFS_BMAP_USE_VBN(btree)) {
 		dreq->bpr_ptr = nilfs_btree_find_target_v(btree, NULL, key);
-		dat = nilfs_bmap_get_dat(bmap);
+		dat = nilfs_bmap_get_dat(btree);
 	}
 
-	ret = nilfs_bmap_prepare_alloc_ptr(bmap, dreq, dat);
+	ret = nilfs_bmap_prepare_alloc_ptr(btree, dreq, dat);
 	if (ret < 0)
 		return ret;
 
@@ -1564,7 +1650,7 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key,
 	stats->bs_nblocks++;
 	if (nreq != NULL) {
 		nreq->bpr_ptr = dreq->bpr_ptr + 1;
-		ret = nilfs_bmap_prepare_alloc_ptr(bmap, nreq, dat);
+		ret = nilfs_bmap_prepare_alloc_ptr(btree, nreq, dat);
 		if (ret < 0)
 			goto err_out_dreq;
 
@@ -1581,16 +1667,16 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key,
 
 	/* error */
  err_out_nreq:
-	nilfs_bmap_abort_alloc_ptr(bmap, nreq, dat);
+	nilfs_bmap_abort_alloc_ptr(btree, nreq, dat);
  err_out_dreq:
-	nilfs_bmap_abort_alloc_ptr(bmap, dreq, dat);
+	nilfs_bmap_abort_alloc_ptr(btree, dreq, dat);
 	stats->bs_nblocks = 0;
 	return ret;
 
 }
 
 static void
-nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
+nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *btree,
 				      __u64 key, __u64 ptr,
 				      const __u64 *keys, const __u64 *ptrs,
 				      int n,
@@ -1598,57 +1684,59 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
 				      union nilfs_bmap_ptr_req *nreq,
 				      struct buffer_head *bh)
 {
-	struct nilfs_btree *btree = (struct nilfs_btree *)bmap;
 	struct nilfs_btree_node *node;
 	struct inode *dat;
 	__u64 tmpptr;
+	int ncblk;
 
 	/* free resources */
-	if (bmap->b_ops->bop_clear != NULL)
-		bmap->b_ops->bop_clear(bmap);
+	if (btree->b_ops->bop_clear != NULL)
+		btree->b_ops->bop_clear(btree);
 
 	/* ptr must be a pointer to a buffer head. */
 	set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr));
 
 	/* convert and insert */
-	dat = NILFS_BMAP_USE_VBN(bmap) ? nilfs_bmap_get_dat(bmap) : NULL;
-	nilfs_btree_init(bmap);
+	dat = NILFS_BMAP_USE_VBN(btree) ? nilfs_bmap_get_dat(btree) : NULL;
+	nilfs_btree_init(btree);
 	if (nreq != NULL) {
-		nilfs_bmap_commit_alloc_ptr(bmap, dreq, dat);
-		nilfs_bmap_commit_alloc_ptr(bmap, nreq, dat);
+		nilfs_bmap_commit_alloc_ptr(btree, dreq, dat);
+		nilfs_bmap_commit_alloc_ptr(btree, nreq, dat);
 
 		/* create child node at level 1 */
 		node = (struct nilfs_btree_node *)bh->b_data;
-		nilfs_btree_node_init(btree, node, 0, 1, n, keys, ptrs);
-		nilfs_btree_node_insert(btree, node,
-					key, dreq->bpr_ptr, n);
+		ncblk = nilfs_btree_nchildren_per_block(btree);
+		nilfs_btree_node_init(node, 0, 1, n, ncblk, keys, ptrs);
+		nilfs_btree_node_insert(node, n, key, dreq->bpr_ptr, ncblk);
 		if (!buffer_dirty(bh))
 			nilfs_btnode_mark_dirty(bh);
-		if (!nilfs_bmap_dirty(bmap))
-			nilfs_bmap_set_dirty(bmap);
+		if (!nilfs_bmap_dirty(btree))
+			nilfs_bmap_set_dirty(btree);
 
 		brelse(bh);
 
 		/* create root node at level 2 */
 		node = nilfs_btree_get_root(btree);
 		tmpptr = nreq->bpr_ptr;
-		nilfs_btree_node_init(btree, node, NILFS_BTREE_NODE_ROOT,
-				      2, 1, &keys[0], &tmpptr);
+		nilfs_btree_node_init(node, NILFS_BTREE_NODE_ROOT, 2, 1,
+				      NILFS_BTREE_ROOT_NCHILDREN_MAX,
+				      &keys[0], &tmpptr);
 	} else {
-		nilfs_bmap_commit_alloc_ptr(bmap, dreq, dat);
+		nilfs_bmap_commit_alloc_ptr(btree, dreq, dat);
 
 		/* create root node at level 1 */
 		node = nilfs_btree_get_root(btree);
-		nilfs_btree_node_init(btree, node, NILFS_BTREE_NODE_ROOT,
-				      1, n, keys, ptrs);
-		nilfs_btree_node_insert(btree, node,
-					key, dreq->bpr_ptr, n);
-		if (!nilfs_bmap_dirty(bmap))
-			nilfs_bmap_set_dirty(bmap);
+		nilfs_btree_node_init(node, NILFS_BTREE_NODE_ROOT, 1, n,
+				      NILFS_BTREE_ROOT_NCHILDREN_MAX,
+				      keys, ptrs);
+		nilfs_btree_node_insert(node, n, key, dreq->bpr_ptr,
+					NILFS_BTREE_ROOT_NCHILDREN_MAX);
+		if (!nilfs_bmap_dirty(btree))
+			nilfs_bmap_set_dirty(btree);
 	}
 
-	if (NILFS_BMAP_USE_VBN(bmap))
-		nilfs_btree_set_target_v(btree, key, dreq->bpr_ptr);
+	if (NILFS_BMAP_USE_VBN(btree))
+		nilfs_bmap_set_target_v(btree, key, dreq->bpr_ptr);
 }
 
 /**
@@ -1660,7 +1748,7 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
  * @ptrs:
  * @n:
  */
-int nilfs_btree_convert_and_insert(struct nilfs_bmap *bmap,
+int nilfs_btree_convert_and_insert(struct nilfs_bmap *btree,
 				   __u64 key, __u64 ptr,
 				   const __u64 *keys, const __u64 *ptrs, int n)
 {
@@ -1673,7 +1761,7 @@ int nilfs_btree_convert_and_insert(struct nilfs_bmap *bmap,
 		di = &dreq;
 		ni = NULL;
 	} else if ((n + 1) <= NILFS_BTREE_NODE_NCHILDREN_MAX(
-			   1 << bmap->b_inode->i_blkbits)) {
+			   1 << btree->b_inode->i_blkbits)) {
 		di = &dreq;
 		ni = &nreq;
 	} else {
@@ -1682,17 +1770,17 @@ int nilfs_btree_convert_and_insert(struct nilfs_bmap *bmap,
 		BUG();
 	}
 
-	ret = nilfs_btree_prepare_convert_and_insert(bmap, key, di, ni, &bh,
+	ret = nilfs_btree_prepare_convert_and_insert(btree, key, di, ni, &bh,
 						     &stats);
 	if (ret < 0)
 		return ret;
-	nilfs_btree_commit_convert_and_insert(bmap, key, ptr, keys, ptrs, n,
+	nilfs_btree_commit_convert_and_insert(btree, key, ptr, keys, ptrs, n,
 					      di, ni, bh);
-	nilfs_bmap_add_blocks(bmap, stats.bs_nblocks);
+	nilfs_bmap_add_blocks(btree, stats.bs_nblocks);
 	return 0;
 }
 
-static int nilfs_btree_propagate_p(struct nilfs_btree *btree,
+static int nilfs_btree_propagate_p(struct nilfs_bmap *btree,
 				   struct nilfs_btree_path *path,
 				   int level,
 				   struct buffer_head *bh)
@@ -1704,17 +1792,17 @@ static int nilfs_btree_propagate_p(struct nilfs_btree *btree,
 	return 0;
 }
 
-static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree,
+static int nilfs_btree_prepare_update_v(struct nilfs_bmap *btree,
 					struct nilfs_btree_path *path,
 					int level, struct inode *dat)
 {
 	struct nilfs_btree_node *parent;
-	int ret;
+	int ncmax, ret;
 
-	parent = nilfs_btree_get_node(btree, path, level + 1);
+	parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax);
 	path[level].bp_oldreq.bpr_ptr =
-		nilfs_btree_node_get_ptr(btree, parent,
-					 path[level + 1].bp_index);
+		nilfs_btree_node_get_ptr(parent, path[level + 1].bp_index,
+					 ncmax);
 	path[level].bp_newreq.bpr_ptr = path[level].bp_oldreq.bpr_ptr + 1;
 	ret = nilfs_dat_prepare_update(dat, &path[level].bp_oldreq.bpr_req,
 				       &path[level].bp_newreq.bpr_req);
@@ -1726,7 +1814,7 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree,
 		path[level].bp_ctxt.newkey = path[level].bp_newreq.bpr_ptr;
 		path[level].bp_ctxt.bh = path[level].bp_bh;
 		ret = nilfs_btnode_prepare_change_key(
-			&NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
+			&NILFS_BMAP_I(btree)->i_btnode_cache,
 			&path[level].bp_ctxt);
 		if (ret < 0) {
 			nilfs_dat_abort_update(dat,
@@ -1739,30 +1827,31 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree,
 	return 0;
 }
 
-static void nilfs_btree_commit_update_v(struct nilfs_btree *btree,
+static void nilfs_btree_commit_update_v(struct nilfs_bmap *btree,
 					struct nilfs_btree_path *path,
 					int level, struct inode *dat)
 {
 	struct nilfs_btree_node *parent;
+	int ncmax;
 
 	nilfs_dat_commit_update(dat, &path[level].bp_oldreq.bpr_req,
 				&path[level].bp_newreq.bpr_req,
-				btree->bt_bmap.b_ptr_type == NILFS_BMAP_PTR_VS);
+				btree->b_ptr_type == NILFS_BMAP_PTR_VS);
 
 	if (buffer_nilfs_node(path[level].bp_bh)) {
 		nilfs_btnode_commit_change_key(
-			&NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
+			&NILFS_BMAP_I(btree)->i_btnode_cache,
 			&path[level].bp_ctxt);
 		path[level].bp_bh = path[level].bp_ctxt.bh;
 	}
 	set_buffer_nilfs_volatile(path[level].bp_bh);
 
-	parent = nilfs_btree_get_node(btree, path, level + 1);
-	nilfs_btree_node_set_ptr(btree, parent, path[level + 1].bp_index,
-				 path[level].bp_newreq.bpr_ptr);
+	parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax);
+	nilfs_btree_node_set_ptr(parent, path[level + 1].bp_index,
+				 path[level].bp_newreq.bpr_ptr, ncmax);
 }
 
-static void nilfs_btree_abort_update_v(struct nilfs_btree *btree,
+static void nilfs_btree_abort_update_v(struct nilfs_bmap *btree,
 				       struct nilfs_btree_path *path,
 				       int level, struct inode *dat)
 {
@@ -1770,11 +1859,11 @@ static void nilfs_btree_abort_update_v(struct nilfs_btree *btree,
 			       &path[level].bp_newreq.bpr_req);
 	if (buffer_nilfs_node(path[level].bp_bh))
 		nilfs_btnode_abort_change_key(
-			&NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
+			&NILFS_BMAP_I(btree)->i_btnode_cache,
 			&path[level].bp_ctxt);
 }
 
-static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree,
+static int nilfs_btree_prepare_propagate_v(struct nilfs_bmap *btree,
 					   struct nilfs_btree_path *path,
 					   int minlevel, int *maxlevelp,
 					   struct inode *dat)
@@ -1809,7 +1898,7 @@ static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree,
 	return ret;
 }
 
-static void nilfs_btree_commit_propagate_v(struct nilfs_btree *btree,
+static void nilfs_btree_commit_propagate_v(struct nilfs_bmap *btree,
 					   struct nilfs_btree_path *path,
 					   int minlevel, int maxlevel,
 					   struct buffer_head *bh,
@@ -1824,14 +1913,15 @@ static void nilfs_btree_commit_propagate_v(struct nilfs_btree *btree,
 		nilfs_btree_commit_update_v(btree, path, level, dat);
 }
 
-static int nilfs_btree_propagate_v(struct nilfs_btree *btree,
+static int nilfs_btree_propagate_v(struct nilfs_bmap *btree,
 				   struct nilfs_btree_path *path,
 				   int level, struct buffer_head *bh)
 {
 	int maxlevel = 0, ret;
 	struct nilfs_btree_node *parent;
-	struct inode *dat = nilfs_bmap_get_dat(&btree->bt_bmap);
+	struct inode *dat = nilfs_bmap_get_dat(btree);
 	__u64 ptr;
+	int ncmax;
 
 	get_bh(bh);
 	path[level].bp_bh = bh;
@@ -1841,9 +1931,10 @@ static int nilfs_btree_propagate_v(struct nilfs_btree *btree,
 		goto out;
 
 	if (buffer_nilfs_volatile(path[level].bp_bh)) {
-		parent = nilfs_btree_get_node(btree, path, level + 1);
-		ptr = nilfs_btree_node_get_ptr(btree, parent,
-					       path[level + 1].bp_index);
+		parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax);
+		ptr = nilfs_btree_node_get_ptr(parent,
+					       path[level + 1].bp_index,
+					       ncmax);
 		ret = nilfs_dat_mark_dirty(dat, ptr);
 		if (ret < 0)
 			goto out;
@@ -1857,10 +1948,9 @@ static int nilfs_btree_propagate_v(struct nilfs_btree *btree,
 	return ret;
 }
 
-static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
+static int nilfs_btree_propagate(struct nilfs_bmap *btree,
 				 struct buffer_head *bh)
 {
-	struct nilfs_btree *btree;
 	struct nilfs_btree_path *path;
 	struct nilfs_btree_node *node;
 	__u64 key;
@@ -1868,7 +1958,6 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
 
 	WARN_ON(!buffer_dirty(bh));
 
-	btree = (struct nilfs_btree *)bmap;
 	path = nilfs_btree_alloc_path();
 	if (path == NULL)
 		return -ENOMEM;
@@ -1878,11 +1967,11 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
 		key = nilfs_btree_node_get_key(node, 0);
 		level = nilfs_btree_node_get_level(node);
 	} else {
-		key = nilfs_bmap_data_get_key(bmap, bh);
+		key = nilfs_bmap_data_get_key(btree, bh);
 		level = NILFS_BTREE_LEVEL_DATA;
 	}
 
-	ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1);
+	ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1, 0);
 	if (ret < 0) {
 		if (unlikely(ret == -ENOENT))
 			printk(KERN_CRIT "%s: key = %llu, level == %d\n",
@@ -1890,7 +1979,7 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
 		goto out;
 	}
 
-	ret = NILFS_BMAP_USE_VBN(bmap) ?
+	ret = NILFS_BMAP_USE_VBN(btree) ?
 		nilfs_btree_propagate_v(btree, path, level, bh) :
 		nilfs_btree_propagate_p(btree, path, level, bh);
 
@@ -1900,13 +1989,13 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
 	return ret;
 }
 
-static int nilfs_btree_propagate_gc(const struct nilfs_bmap *bmap,
+static int nilfs_btree_propagate_gc(struct nilfs_bmap *btree,
 				    struct buffer_head *bh)
 {
-	return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(bmap), bh->b_blocknr);
+	return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(btree), bh->b_blocknr);
 }
 
-static void nilfs_btree_add_dirty_buffer(struct nilfs_btree *btree,
+static void nilfs_btree_add_dirty_buffer(struct nilfs_bmap *btree,
 					 struct list_head *lists,
 					 struct buffer_head *bh)
 {
@@ -1920,6 +2009,18 @@ static void nilfs_btree_add_dirty_buffer(struct nilfs_btree *btree,
 	node = (struct nilfs_btree_node *)bh->b_data;
 	key = nilfs_btree_node_get_key(node, 0);
 	level = nilfs_btree_node_get_level(node);
+	if (level < NILFS_BTREE_LEVEL_NODE_MIN ||
+	    level >= NILFS_BTREE_LEVEL_MAX) {
+		dump_stack();
+		printk(KERN_WARNING
+		       "%s: invalid btree level: %d (key=%llu, ino=%lu, "
+		       "blocknr=%llu)\n",
+		       __func__, level, (unsigned long long)key,
+		       NILFS_BMAP_I(btree)->vfs_inode.i_ino,
+		       (unsigned long long)bh->b_blocknr);
+		return;
+	}
+
 	list_for_each(head, &lists[level]) {
 		cbh = list_entry(head, struct buffer_head, b_assoc_buffers);
 		cnode = (struct nilfs_btree_node *)cbh->b_data;
@@ -1930,11 +2031,10 @@ static void nilfs_btree_add_dirty_buffer(struct nilfs_btree *btree,
 	list_add_tail(&bh->b_assoc_buffers, head);
 }
 
-static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *bmap,
+static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *btree,
 					     struct list_head *listp)
 {
-	struct nilfs_btree *btree = (struct nilfs_btree *)bmap;
-	struct address_space *btcache = &NILFS_BMAP_I(bmap)->i_btnode_cache;
+	struct address_space *btcache = &NILFS_BMAP_I(btree)->i_btnode_cache;
 	struct list_head lists[NILFS_BTREE_LEVEL_MAX];
 	struct pagevec pvec;
 	struct buffer_head *bh, *head;
@@ -1968,7 +2068,7 @@ static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *bmap,
 		list_splice_tail(&lists[level], listp);
 }
 
-static int nilfs_btree_assign_p(struct nilfs_btree *btree,
+static int nilfs_btree_assign_p(struct nilfs_bmap *btree,
 				struct nilfs_btree_path *path,
 				int level,
 				struct buffer_head **bh,
@@ -1978,38 +2078,38 @@ static int nilfs_btree_assign_p(struct nilfs_btree *btree,
 	struct nilfs_btree_node *parent;
 	__u64 key;
 	__u64 ptr;
-	int ret;
+	int ncmax, ret;
 
-	parent = nilfs_btree_get_node(btree, path, level + 1);
-	ptr = nilfs_btree_node_get_ptr(btree, parent,
-				       path[level + 1].bp_index);
+	parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax);
+	ptr = nilfs_btree_node_get_ptr(parent, path[level + 1].bp_index,
+				       ncmax);
 	if (buffer_nilfs_node(*bh)) {
 		path[level].bp_ctxt.oldkey = ptr;
 		path[level].bp_ctxt.newkey = blocknr;
 		path[level].bp_ctxt.bh = *bh;
 		ret = nilfs_btnode_prepare_change_key(
-			&NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
+			&NILFS_BMAP_I(btree)->i_btnode_cache,
 			&path[level].bp_ctxt);
 		if (ret < 0)
 			return ret;
 		nilfs_btnode_commit_change_key(
-			&NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
+			&NILFS_BMAP_I(btree)->i_btnode_cache,
 			&path[level].bp_ctxt);
 		*bh = path[level].bp_ctxt.bh;
 	}
 
-	nilfs_btree_node_set_ptr(btree, parent,
-				 path[level + 1].bp_index, blocknr);
+	nilfs_btree_node_set_ptr(parent, path[level + 1].bp_index, blocknr,
+				 ncmax);
 
 	key = nilfs_btree_node_get_key(parent, path[level + 1].bp_index);
 	/* on-disk format */
-	binfo->bi_dat.bi_blkoff = nilfs_bmap_key_to_dkey(key);
+	binfo->bi_dat.bi_blkoff = cpu_to_le64(key);
 	binfo->bi_dat.bi_level = level;
 
 	return 0;
 }
 
-static int nilfs_btree_assign_v(struct nilfs_btree *btree,
+static int nilfs_btree_assign_v(struct nilfs_bmap *btree,
 				struct nilfs_btree_path *path,
 				int level,
 				struct buffer_head **bh,
@@ -2017,15 +2117,15 @@ static int nilfs_btree_assign_v(struct nilfs_btree *btree,
 				union nilfs_binfo *binfo)
 {
 	struct nilfs_btree_node *parent;
-	struct inode *dat = nilfs_bmap_get_dat(&btree->bt_bmap);
+	struct inode *dat = nilfs_bmap_get_dat(btree);
 	__u64 key;
 	__u64 ptr;
 	union nilfs_bmap_ptr_req req;
-	int ret;
+	int ncmax, ret;
 
-	parent = nilfs_btree_get_node(btree, path, level + 1);
-	ptr = nilfs_btree_node_get_ptr(btree, parent,
-				       path[level + 1].bp_index);
+	parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax);
+	ptr = nilfs_btree_node_get_ptr(parent, path[level + 1].bp_index,
+				       ncmax);
 	req.bpr_ptr = ptr;
 	ret = nilfs_dat_prepare_start(dat, &req.bpr_req);
 	if (ret < 0)
@@ -2034,24 +2134,22 @@ static int nilfs_btree_assign_v(struct nilfs_btree *btree,
 
 	key = nilfs_btree_node_get_key(parent, path[level + 1].bp_index);
 	/* on-disk format */
-	binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr);
-	binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key);
+	binfo->bi_v.bi_vblocknr = cpu_to_le64(ptr);
+	binfo->bi_v.bi_blkoff = cpu_to_le64(key);
 
 	return 0;
 }
 
-static int nilfs_btree_assign(struct nilfs_bmap *bmap,
+static int nilfs_btree_assign(struct nilfs_bmap *btree,
 			      struct buffer_head **bh,
 			      sector_t blocknr,
 			      union nilfs_binfo *binfo)
 {
-	struct nilfs_btree *btree;
 	struct nilfs_btree_path *path;
 	struct nilfs_btree_node *node;
 	__u64 key;
 	int level, ret;
 
-	btree = (struct nilfs_btree *)bmap;
 	path = nilfs_btree_alloc_path();
 	if (path == NULL)
 		return -ENOMEM;
@@ -2061,17 +2159,17 @@ static int nilfs_btree_assign(struct nilfs_bmap *bmap,
 		key = nilfs_btree_node_get_key(node, 0);
 		level = nilfs_btree_node_get_level(node);
 	} else {
-		key = nilfs_bmap_data_get_key(bmap, *bh);
+		key = nilfs_bmap_data_get_key(btree, *bh);
 		level = NILFS_BTREE_LEVEL_DATA;
 	}
 
-	ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1);
+	ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1, 0);
 	if (ret < 0) {
 		WARN_ON(ret == -ENOENT);
 		goto out;
 	}
 
-	ret = NILFS_BMAP_USE_VBN(bmap) ?
+	ret = NILFS_BMAP_USE_VBN(btree) ?
 		nilfs_btree_assign_v(btree, path, level, bh, blocknr, binfo) :
 		nilfs_btree_assign_p(btree, path, level, bh, blocknr, binfo);
 
@@ -2081,7 +2179,7 @@ static int nilfs_btree_assign(struct nilfs_bmap *bmap,
 	return ret;
 }
 
-static int nilfs_btree_assign_gc(struct nilfs_bmap *bmap,
+static int nilfs_btree_assign_gc(struct nilfs_bmap *btree,
 				 struct buffer_head **bh,
 				 sector_t blocknr,
 				 union nilfs_binfo *binfo)
@@ -2090,7 +2188,7 @@ static int nilfs_btree_assign_gc(struct nilfs_bmap *bmap,
 	__u64 key;
 	int ret;
 
-	ret = nilfs_dat_move(nilfs_bmap_get_dat(bmap), (*bh)->b_blocknr,
+	ret = nilfs_dat_move(nilfs_bmap_get_dat(btree), (*bh)->b_blocknr,
 			     blocknr);
 	if (ret < 0)
 		return ret;
@@ -2099,29 +2197,27 @@ static int nilfs_btree_assign_gc(struct nilfs_bmap *bmap,
 		node = (struct nilfs_btree_node *)(*bh)->b_data;
 		key = nilfs_btree_node_get_key(node, 0);
 	} else
-		key = nilfs_bmap_data_get_key(bmap, *bh);
+		key = nilfs_bmap_data_get_key(btree, *bh);
 
 	/* on-disk format */
 	binfo->bi_v.bi_vblocknr = cpu_to_le64((*bh)->b_blocknr);
-	binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key);
+	binfo->bi_v.bi_blkoff = cpu_to_le64(key);
 
 	return 0;
 }
 
-static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level)
+static int nilfs_btree_mark(struct nilfs_bmap *btree, __u64 key, int level)
 {
 	struct buffer_head *bh;
-	struct nilfs_btree *btree;
 	struct nilfs_btree_path *path;
 	__u64 ptr;
 	int ret;
 
-	btree = (struct nilfs_btree *)bmap;
 	path = nilfs_btree_alloc_path();
 	if (path == NULL)
 		return -ENOMEM;
 
-	ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level + 1);
+	ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level + 1, 0);
 	if (ret < 0) {
 		WARN_ON(ret == -ENOENT);
 		goto out;
@@ -2135,8 +2231,8 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level)
 	if (!buffer_dirty(bh))
 		nilfs_btnode_mark_dirty(bh);
 	brelse(bh);
-	if (!nilfs_bmap_dirty(&btree->bt_bmap))
-		nilfs_bmap_set_dirty(&btree->bt_bmap);
+	if (!nilfs_bmap_dirty(btree))
+		nilfs_bmap_set_dirty(btree);
 
  out:
 	nilfs_btree_free_path(path);
@@ -2186,10 +2282,14 @@ static const struct nilfs_bmap_operations nilfs_btree_ops_gc = {
 int nilfs_btree_init(struct nilfs_bmap *bmap)
 {
 	bmap->b_ops = &nilfs_btree_ops;
+	bmap->b_nchildren_per_block =
+		NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(bmap));
 	return 0;
 }
 
 void nilfs_btree_init_gc(struct nilfs_bmap *bmap)
 {
 	bmap->b_ops = &nilfs_btree_ops_gc;
+	bmap->b_nchildren_per_block =
+		NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(bmap));
 }
diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h
index 43c8c5b541f..22c02e35b6e 100644
--- a/fs/nilfs2/btree.h
+++ b/fs/nilfs2/btree.h
@@ -31,14 +31,6 @@
 #include "bmap.h"
 
 /**
- * struct nilfs_btree - B-tree structure
- * @bt_bmap: bmap base structure
- */
-struct nilfs_btree {
-	struct nilfs_bmap bt_bmap;
-};
-
-/**
  * struct nilfs_btree_path - A path on which B-tree operations are executed
  * @bp_bh: buffer head of node block
  * @bp_sib_bh: buffer head of sibling node block
@@ -54,7 +46,7 @@ struct nilfs_btree_path {
 	union nilfs_bmap_ptr_req bp_oldreq;
 	union nilfs_bmap_ptr_req bp_newreq;
 	struct nilfs_btnode_chkey_ctxt bp_ctxt;
-	void (*bp_op)(struct nilfs_btree *, struct nilfs_btree_path *,
+	void (*bp_op)(struct nilfs_bmap *, struct nilfs_btree_path *,
 		      int, __u64 *, __u64 *);
 };
 
@@ -80,4 +72,6 @@ int nilfs_btree_convert_and_insert(struct nilfs_bmap *, __u64, __u64,
 				   const __u64 *, const __u64 *, int);
 void nilfs_btree_init_gc(struct nilfs_bmap *);
 
+int nilfs_btree_broken_node_block(struct buffer_head *bh);
+
 #endif	/* _NILFS_BTREE_H */
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index 18737818db6..5ff15a8a102 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -863,26 +863,19 @@ int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno)
  */
 int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode)
 {
-	struct the_nilfs *nilfs;
 	int ret;
 
-	nilfs = NILFS_MDT(cpfile)->mi_nilfs;
-
 	switch (mode) {
 	case NILFS_CHECKPOINT:
-		/*
-		 * Check for protecting existing snapshot mounts:
-		 * ns_mount_mutex is used to make this operation atomic and
-		 * exclusive with a new mount job.  Though it doesn't cover
-		 * umount, it's enough for the purpose.
-		 */
-		if (nilfs_checkpoint_is_mounted(nilfs, cno, 1)) {
-			/* Current implementation does not have to protect
-			   plain read-only mounts since they are exclusive
-			   with a read/write mount and are protected from the
-			   cleaner. */
+		if (nilfs_checkpoint_is_mounted(cpfile->i_sb, cno))
+			/*
+			 * Current implementation does not have to protect
+			 * plain read-only mounts since they are exclusive
+			 * with a read/write mount and are protected from the
+			 * cleaner.
+			 */
 			ret = -EBUSY;
-		} else
+		else
 			ret = nilfs_cpfile_clear_snapshot(cpfile, cno);
 		return ret;
 	case NILFS_SNAPSHOT:
@@ -933,27 +926,40 @@ int nilfs_cpfile_get_stat(struct inode *cpfile, struct nilfs_cpstat *cpstat)
 }
 
 /**
- * nilfs_cpfile_read - read cpfile inode
- * @cpfile: cpfile inode
- * @raw_inode: on-disk cpfile inode
- */
-int nilfs_cpfile_read(struct inode *cpfile, struct nilfs_inode *raw_inode)
-{
-	return nilfs_read_inode_common(cpfile, raw_inode);
-}
-
-/**
- * nilfs_cpfile_new - create cpfile
- * @nilfs: nilfs object
+ * nilfs_cpfile_read - read or get cpfile inode
+ * @sb: super block instance
  * @cpsize: size of a checkpoint entry
+ * @raw_inode: on-disk cpfile inode
+ * @inodep: buffer to store the inode
  */
-struct inode *nilfs_cpfile_new(struct the_nilfs *nilfs, size_t cpsize)
+int nilfs_cpfile_read(struct super_block *sb, size_t cpsize,
+		      struct nilfs_inode *raw_inode, struct inode **inodep)
 {
 	struct inode *cpfile;
+	int err;
+
+	cpfile = nilfs_iget_locked(sb, NULL, NILFS_CPFILE_INO);
+	if (unlikely(!cpfile))
+		return -ENOMEM;
+	if (!(cpfile->i_state & I_NEW))
+		goto out;
+
+	err = nilfs_mdt_init(cpfile, NILFS_MDT_GFP, 0);
+	if (err)
+		goto failed;
 
-	cpfile = nilfs_mdt_new(nilfs, NULL, NILFS_CPFILE_INO, 0);
-	if (cpfile)
-		nilfs_mdt_set_entry_size(cpfile, cpsize,
-					 sizeof(struct nilfs_cpfile_header));
-	return cpfile;
+	nilfs_mdt_set_entry_size(cpfile, cpsize,
+				 sizeof(struct nilfs_cpfile_header));
+
+	err = nilfs_read_inode_common(cpfile, raw_inode);
+	if (err)
+		goto failed;
+
+	unlock_new_inode(cpfile);
+ out:
+	*inodep = cpfile;
+	return 0;
+ failed:
+	iget_failed(cpfile);
+	return err;
 }
diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h
index bc0809e0ab4..a242b9a314f 100644
--- a/fs/nilfs2/cpfile.h
+++ b/fs/nilfs2/cpfile.h
@@ -40,7 +40,7 @@ int nilfs_cpfile_get_stat(struct inode *, struct nilfs_cpstat *);
 ssize_t nilfs_cpfile_get_cpinfo(struct inode *, __u64 *, int, void *, unsigned,
 				size_t);
 
-int nilfs_cpfile_read(struct inode *cpfile, struct nilfs_inode *raw_inode);
-struct inode *nilfs_cpfile_new(struct the_nilfs *nilfs, size_t cpsize);
+int nilfs_cpfile_read(struct super_block *sb, size_t cpsize,
+		      struct nilfs_inode *raw_inode, struct inode **inodep);
 
 #endif	/* _NILFS_CPFILE_H */
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 01314675568..49c844dab33 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -36,6 +36,7 @@
 struct nilfs_dat_info {
 	struct nilfs_mdt_info mi;
 	struct nilfs_palloc_cache palloc_cache;
+	struct nilfs_shadow_map shadow;
 };
 
 static inline struct nilfs_dat_info *NILFS_DAT_I(struct inode *dat)
@@ -102,7 +103,8 @@ void nilfs_dat_abort_alloc(struct inode *dat, struct nilfs_palloc_req *req)
 	nilfs_palloc_abort_alloc_entry(dat, req);
 }
 
-void nilfs_dat_commit_free(struct inode *dat, struct nilfs_palloc_req *req)
+static void nilfs_dat_commit_free(struct inode *dat,
+				  struct nilfs_palloc_req *req)
 {
 	struct nilfs_dat_entry *entry;
 	void *kaddr;
@@ -327,6 +329,23 @@ int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr)
 	ret = nilfs_palloc_get_entry_block(dat, vblocknr, 0, &entry_bh);
 	if (ret < 0)
 		return ret;
+
+	/*
+	 * The given disk block number (blocknr) is not yet written to
+	 * the device at this point.
+	 *
+	 * To prevent nilfs_dat_translate() from returning the
+	 * uncommited block number, this makes a copy of the entry
+	 * buffer and redirects nilfs_dat_translate() to the copy.
+	 */
+	if (!buffer_nilfs_redirected(entry_bh)) {
+		ret = nilfs_mdt_freeze_buffer(dat, entry_bh);
+		if (ret) {
+			brelse(entry_bh);
+			return ret;
+		}
+	}
+
 	kaddr = kmap_atomic(entry_bh->b_page, KM_USER0);
 	entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr);
 	if (unlikely(entry->de_blocknr == cpu_to_le64(0))) {
@@ -371,7 +390,7 @@ int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr)
  */
 int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp)
 {
-	struct buffer_head *entry_bh;
+	struct buffer_head *entry_bh, *bh;
 	struct nilfs_dat_entry *entry;
 	sector_t blocknr;
 	void *kaddr;
@@ -381,6 +400,15 @@ int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp)
 	if (ret < 0)
 		return ret;
 
+	if (!nilfs_doing_gc() && buffer_nilfs_redirected(entry_bh)) {
+		bh = nilfs_mdt_get_frozen_buffer(dat, entry_bh);
+		if (bh) {
+			WARN_ON(!buffer_uptodate(bh));
+			brelse(entry_bh);
+			entry_bh = bh;
+		}
+	}
+
 	kaddr = kmap_atomic(entry_bh->b_page, KM_USER0);
 	entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr);
 	blocknr = le64_to_cpu(entry->de_blocknr);
@@ -436,38 +464,48 @@ ssize_t nilfs_dat_get_vinfo(struct inode *dat, void *buf, unsigned visz,
 }
 
 /**
- * nilfs_dat_read - read dat inode
- * @dat: dat inode
- * @raw_inode: on-disk dat inode
- */
-int nilfs_dat_read(struct inode *dat, struct nilfs_inode *raw_inode)
-{
-	return nilfs_read_inode_common(dat, raw_inode);
-}
-
-/**
- * nilfs_dat_new - create dat file
- * @nilfs: nilfs object
+ * nilfs_dat_read - read or get dat inode
+ * @sb: super block instance
  * @entry_size: size of a dat entry
+ * @raw_inode: on-disk dat inode
+ * @inodep: buffer to store the inode
  */
-struct inode *nilfs_dat_new(struct the_nilfs *nilfs, size_t entry_size)
+int nilfs_dat_read(struct super_block *sb, size_t entry_size,
+		   struct nilfs_inode *raw_inode, struct inode **inodep)
 {
 	static struct lock_class_key dat_lock_key;
 	struct inode *dat;
 	struct nilfs_dat_info *di;
 	int err;
 
-	dat = nilfs_mdt_new(nilfs, NULL, NILFS_DAT_INO, sizeof(*di));
-	if (dat) {
-		err = nilfs_palloc_init_blockgroup(dat, entry_size);
-		if (unlikely(err)) {
-			nilfs_mdt_destroy(dat);
-			return NULL;
-		}
+	dat = nilfs_iget_locked(sb, NULL, NILFS_DAT_INO);
+	if (unlikely(!dat))
+		return -ENOMEM;
+	if (!(dat->i_state & I_NEW))
+		goto out;
 
-		di = NILFS_DAT_I(dat);
-		lockdep_set_class(&di->mi.mi_sem, &dat_lock_key);
-		nilfs_palloc_setup_cache(dat, &di->palloc_cache);
-	}
-	return dat;
+	err = nilfs_mdt_init(dat, NILFS_MDT_GFP, sizeof(*di));
+	if (err)
+		goto failed;
+
+	err = nilfs_palloc_init_blockgroup(dat, entry_size);
+	if (err)
+		goto failed;
+
+	di = NILFS_DAT_I(dat);
+	lockdep_set_class(&di->mi.mi_sem, &dat_lock_key);
+	nilfs_palloc_setup_cache(dat, &di->palloc_cache);
+	nilfs_mdt_setup_shadow_map(dat, &di->shadow);
+
+	err = nilfs_read_inode_common(dat, raw_inode);
+	if (err)
+		goto failed;
+
+	unlock_new_inode(dat);
+ out:
+	*inodep = dat;
+	return 0;
+ failed:
+	iget_failed(dat);
+	return err;
 }
diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h
index d31c3aab0ef..cbd8e973250 100644
--- a/fs/nilfs2/dat.h
+++ b/fs/nilfs2/dat.h
@@ -53,7 +53,7 @@ int nilfs_dat_freev(struct inode *, __u64 *, size_t);
 int nilfs_dat_move(struct inode *, __u64, sector_t);
 ssize_t nilfs_dat_get_vinfo(struct inode *, void *, unsigned, size_t);
 
-int nilfs_dat_read(struct inode *dat, struct nilfs_inode *raw_inode);
-struct inode *nilfs_dat_new(struct the_nilfs *nilfs, size_t entry_size);
+int nilfs_dat_read(struct super_block *sb, size_t entry_size,
+		   struct nilfs_inode *raw_inode, struct inode **inodep);
 
 #endif	/* _NILFS_DAT_H */
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index 85c89dfc71f..cb003c8ee1f 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -80,23 +80,10 @@ static unsigned nilfs_last_byte(struct inode *inode, unsigned long page_nr)
 	return last_byte;
 }
 
-static int nilfs_prepare_chunk_uninterruptible(struct page *page,
-					       struct address_space *mapping,
-					       unsigned from, unsigned to)
+static int nilfs_prepare_chunk(struct page *page, unsigned from, unsigned to)
 {
 	loff_t pos = page_offset(page) + from;
-	return block_write_begin(NULL, mapping, pos, to - from,
-				 AOP_FLAG_UNINTERRUPTIBLE, &page,
-				 NULL, nilfs_get_block);
-}
-
-static int nilfs_prepare_chunk(struct page *page,
-			       struct address_space *mapping,
-			       unsigned from, unsigned to)
-{
-	loff_t pos = page_offset(page) + from;
-	return block_write_begin(NULL, mapping, pos, to - from, 0, &page,
-				 NULL, nilfs_get_block);
+	return __block_write_begin(page, pos, to - from, nilfs_get_block);
 }
 
 static void nilfs_commit_chunk(struct page *page,
@@ -141,7 +128,7 @@ static void nilfs_check_page(struct page *page)
 	}
 	for (offs = 0; offs <= limit - NILFS_DIR_REC_LEN(1); offs += rec_len) {
 		p = (struct nilfs_dir_entry *)(kaddr + offs);
-		rec_len = le16_to_cpu(p->rec_len);
+		rec_len = nilfs_rec_len_from_disk(p->rec_len);
 
 		if (rec_len < NILFS_DIR_REC_LEN(1))
 			goto Eshort;
@@ -199,13 +186,10 @@ fail:
 static struct page *nilfs_get_page(struct inode *dir, unsigned long n)
 {
 	struct address_space *mapping = dir->i_mapping;
-	struct page *page = read_cache_page(mapping, n,
-				(filler_t *)mapping->a_ops->readpage, NULL);
+	struct page *page = read_mapping_page(mapping, n, NULL);
+
 	if (!IS_ERR(page)) {
-		wait_on_page_locked(page);
 		kmap(page);
-		if (!PageUptodate(page))
-			goto fail;
 		if (!PageChecked(page))
 			nilfs_check_page(page);
 		if (PageError(page))
@@ -238,7 +222,8 @@ nilfs_match(int len, const unsigned char *name, struct nilfs_dir_entry *de)
  */
 static struct nilfs_dir_entry *nilfs_next_entry(struct nilfs_dir_entry *p)
 {
-	return (struct nilfs_dir_entry *)((char *)p + le16_to_cpu(p->rec_len));
+	return (struct nilfs_dir_entry *)((char *)p +
+					  nilfs_rec_len_from_disk(p->rec_len));
 }
 
 static unsigned char
@@ -329,7 +314,7 @@ static int nilfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 					goto success;
 				}
 			}
-			filp->f_pos += le16_to_cpu(de->rec_len);
+			filp->f_pos += nilfs_rec_len_from_disk(de->rec_len);
 		}
 		nilfs_put_page(page);
 	}
@@ -444,12 +429,12 @@ void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de,
 		    struct page *page, struct inode *inode)
 {
 	unsigned from = (char *) de - (char *) page_address(page);
-	unsigned to = from + le16_to_cpu(de->rec_len);
+	unsigned to = from + nilfs_rec_len_from_disk(de->rec_len);
 	struct address_space *mapping = page->mapping;
 	int err;
 
 	lock_page(page);
-	err = nilfs_prepare_chunk_uninterruptible(page, mapping, from, to);
+	err = nilfs_prepare_chunk(page, from, to);
 	BUG_ON(err);
 	de->inode = cpu_to_le64(inode->i_ino);
 	nilfs_set_de_type(de, inode);
@@ -500,7 +485,7 @@ int nilfs_add_link(struct dentry *dentry, struct inode *inode)
 				/* We hit i_size */
 				name_len = 0;
 				rec_len = chunk_size;
-				de->rec_len = cpu_to_le16(chunk_size);
+				de->rec_len = nilfs_rec_len_to_disk(chunk_size);
 				de->inode = 0;
 				goto got_it;
 			}
@@ -514,7 +499,7 @@ int nilfs_add_link(struct dentry *dentry, struct inode *inode)
 			if (nilfs_match(namelen, name, de))
 				goto out_unlock;
 			name_len = NILFS_DIR_REC_LEN(de->name_len);
-			rec_len = le16_to_cpu(de->rec_len);
+			rec_len = nilfs_rec_len_from_disk(de->rec_len);
 			if (!de->inode && rec_len >= reclen)
 				goto got_it;
 			if (rec_len >= name_len + reclen)
@@ -530,15 +515,15 @@ int nilfs_add_link(struct dentry *dentry, struct inode *inode)
 got_it:
 	from = (char *)de - (char *)page_address(page);
 	to = from + rec_len;
-	err = nilfs_prepare_chunk(page, page->mapping, from, to);
+	err = nilfs_prepare_chunk(page, from, to);
 	if (err)
 		goto out_unlock;
 	if (de->inode) {
 		struct nilfs_dir_entry *de1;
 
 		de1 = (struct nilfs_dir_entry *)((char *)de + name_len);
-		de1->rec_len = cpu_to_le16(rec_len - name_len);
-		de->rec_len = cpu_to_le16(name_len);
+		de1->rec_len = nilfs_rec_len_to_disk(rec_len - name_len);
+		de->rec_len = nilfs_rec_len_to_disk(name_len);
 		de = de1;
 	}
 	de->name_len = namelen;
@@ -569,7 +554,8 @@ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page)
 	struct inode *inode = mapping->host;
 	char *kaddr = page_address(page);
 	unsigned from = ((char *)dir - kaddr) & ~(nilfs_chunk_size(inode) - 1);
-	unsigned to = ((char *)dir - kaddr) + le16_to_cpu(dir->rec_len);
+	unsigned to = ((char *)dir - kaddr) +
+		nilfs_rec_len_from_disk(dir->rec_len);
 	struct nilfs_dir_entry *pde = NULL;
 	struct nilfs_dir_entry *de = (struct nilfs_dir_entry *)(kaddr + from);
 	int err;
@@ -587,10 +573,10 @@ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page)
 	if (pde)
 		from = (char *)pde - (char *)page_address(page);
 	lock_page(page);
-	err = nilfs_prepare_chunk(page, mapping, from, to);
+	err = nilfs_prepare_chunk(page, from, to);
 	BUG_ON(err);
 	if (pde)
-		pde->rec_len = cpu_to_le16(to - from);
+		pde->rec_len = nilfs_rec_len_to_disk(to - from);
 	dir->inode = 0;
 	nilfs_commit_chunk(page, mapping, from, to);
 	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
@@ -615,7 +601,7 @@ int nilfs_make_empty(struct inode *inode, struct inode *parent)
 	if (!page)
 		return -ENOMEM;
 
-	err = nilfs_prepare_chunk(page, mapping, 0, chunk_size);
+	err = nilfs_prepare_chunk(page, 0, chunk_size);
 	if (unlikely(err)) {
 		unlock_page(page);
 		goto fail;
@@ -624,14 +610,14 @@ int nilfs_make_empty(struct inode *inode, struct inode *parent)
 	memset(kaddr, 0, chunk_size);
 	de = (struct nilfs_dir_entry *)kaddr;
 	de->name_len = 1;
-	de->rec_len = cpu_to_le16(NILFS_DIR_REC_LEN(1));
+	de->rec_len = nilfs_rec_len_to_disk(NILFS_DIR_REC_LEN(1));
 	memcpy(de->name, ".\0\0", 4);
 	de->inode = cpu_to_le64(inode->i_ino);
 	nilfs_set_de_type(de, inode);
 
 	de = (struct nilfs_dir_entry *)(kaddr + NILFS_DIR_REC_LEN(1));
 	de->name_len = 2;
-	de->rec_len = cpu_to_le16(chunk_size - NILFS_DIR_REC_LEN(1));
+	de->rec_len = nilfs_rec_len_to_disk(chunk_size - NILFS_DIR_REC_LEN(1));
 	de->inode = cpu_to_le64(parent->i_ino);
 	memcpy(de->name, "..\0", 4);
 	nilfs_set_de_type(de, inode);
diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c
index 236753df5cd..324d80c5751 100644
--- a/fs/nilfs2/direct.c
+++ b/fs/nilfs2/direct.c
@@ -27,47 +27,43 @@
 #include "alloc.h"
 #include "dat.h"
 
-static inline __le64 *nilfs_direct_dptrs(const struct nilfs_direct *direct)
+static inline __le64 *nilfs_direct_dptrs(const struct nilfs_bmap *direct)
 {
 	return (__le64 *)
-		((struct nilfs_direct_node *)direct->d_bmap.b_u.u_data + 1);
+		((struct nilfs_direct_node *)direct->b_u.u_data + 1);
 }
 
 static inline __u64
-nilfs_direct_get_ptr(const struct nilfs_direct *direct, __u64 key)
+nilfs_direct_get_ptr(const struct nilfs_bmap *direct, __u64 key)
 {
-	return nilfs_bmap_dptr_to_ptr(*(nilfs_direct_dptrs(direct) + key));
+	return le64_to_cpu(*(nilfs_direct_dptrs(direct) + key));
 }
 
-static inline void nilfs_direct_set_ptr(struct nilfs_direct *direct,
+static inline void nilfs_direct_set_ptr(struct nilfs_bmap *direct,
 					__u64 key, __u64 ptr)
 {
-	*(nilfs_direct_dptrs(direct) + key) = nilfs_bmap_ptr_to_dptr(ptr);
+	*(nilfs_direct_dptrs(direct) + key) = cpu_to_le64(ptr);
 }
 
-static int nilfs_direct_lookup(const struct nilfs_bmap *bmap,
+static int nilfs_direct_lookup(const struct nilfs_bmap *direct,
 			       __u64 key, int level, __u64 *ptrp)
 {
-	struct nilfs_direct *direct;
 	__u64 ptr;
 
-	direct = (struct nilfs_direct *)bmap;  /* XXX: use macro for level 1 */
 	if (key > NILFS_DIRECT_KEY_MAX || level != 1)
 		return -ENOENT;
 	ptr = nilfs_direct_get_ptr(direct, key);
 	if (ptr == NILFS_BMAP_INVALID_PTR)
 		return -ENOENT;
 
-	if (ptrp != NULL)
-		*ptrp = ptr;
+	*ptrp = ptr;
 	return 0;
 }
 
-static int nilfs_direct_lookup_contig(const struct nilfs_bmap *bmap,
+static int nilfs_direct_lookup_contig(const struct nilfs_bmap *direct,
 				      __u64 key, __u64 *ptrp,
 				      unsigned maxblocks)
 {
-	struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
 	struct inode *dat = NULL;
 	__u64 ptr, ptr2;
 	sector_t blocknr;
@@ -79,8 +75,8 @@ static int nilfs_direct_lookup_contig(const struct nilfs_bmap *bmap,
 	if (ptr == NILFS_BMAP_INVALID_PTR)
 		return -ENOENT;
 
-	if (NILFS_BMAP_USE_VBN(bmap)) {
-		dat = nilfs_bmap_get_dat(bmap);
+	if (NILFS_BMAP_USE_VBN(direct)) {
+		dat = nilfs_bmap_get_dat(direct);
 		ret = nilfs_dat_translate(dat, ptr, &blocknr);
 		if (ret < 0)
 			return ret;
@@ -106,29 +102,21 @@ static int nilfs_direct_lookup_contig(const struct nilfs_bmap *bmap,
 }
 
 static __u64
-nilfs_direct_find_target_v(const struct nilfs_direct *direct, __u64 key)
+nilfs_direct_find_target_v(const struct nilfs_bmap *direct, __u64 key)
 {
 	__u64 ptr;
 
-	ptr = nilfs_bmap_find_target_seq(&direct->d_bmap, key);
+	ptr = nilfs_bmap_find_target_seq(direct, key);
 	if (ptr != NILFS_BMAP_INVALID_PTR)
 		/* sequential access */
 		return ptr;
 	else
 		/* block group */
-		return nilfs_bmap_find_target_in_group(&direct->d_bmap);
-}
-
-static void nilfs_direct_set_target_v(struct nilfs_direct *direct,
-				      __u64 key, __u64 ptr)
-{
-	direct->d_bmap.b_last_allocated_key = key;
-	direct->d_bmap.b_last_allocated_ptr = ptr;
+		return nilfs_bmap_find_target_in_group(direct);
 }
 
 static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
 {
-	struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
 	union nilfs_bmap_ptr_req req;
 	struct inode *dat = NULL;
 	struct buffer_head *bh;
@@ -136,11 +124,11 @@ static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
 
 	if (key > NILFS_DIRECT_KEY_MAX)
 		return -ENOENT;
-	if (nilfs_direct_get_ptr(direct, key) != NILFS_BMAP_INVALID_PTR)
+	if (nilfs_direct_get_ptr(bmap, key) != NILFS_BMAP_INVALID_PTR)
 		return -EEXIST;
 
 	if (NILFS_BMAP_USE_VBN(bmap)) {
-		req.bpr_ptr = nilfs_direct_find_target_v(direct, key);
+		req.bpr_ptr = nilfs_direct_find_target_v(bmap, key);
 		dat = nilfs_bmap_get_dat(bmap);
 	}
 	ret = nilfs_bmap_prepare_alloc_ptr(bmap, &req, dat);
@@ -150,13 +138,13 @@ static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
 		set_buffer_nilfs_volatile(bh);
 
 		nilfs_bmap_commit_alloc_ptr(bmap, &req, dat);
-		nilfs_direct_set_ptr(direct, key, req.bpr_ptr);
+		nilfs_direct_set_ptr(bmap, key, req.bpr_ptr);
 
 		if (!nilfs_bmap_dirty(bmap))
 			nilfs_bmap_set_dirty(bmap);
 
 		if (NILFS_BMAP_USE_VBN(bmap))
-			nilfs_direct_set_target_v(direct, key, req.bpr_ptr);
+			nilfs_bmap_set_target_v(bmap, key, req.bpr_ptr);
 
 		nilfs_bmap_add_blocks(bmap, 1);
 	}
@@ -165,33 +153,30 @@ static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
 
 static int nilfs_direct_delete(struct nilfs_bmap *bmap, __u64 key)
 {
-	struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
 	union nilfs_bmap_ptr_req req;
 	struct inode *dat;
 	int ret;
 
 	if (key > NILFS_DIRECT_KEY_MAX ||
-	    nilfs_direct_get_ptr(direct, key) == NILFS_BMAP_INVALID_PTR)
+	    nilfs_direct_get_ptr(bmap, key) == NILFS_BMAP_INVALID_PTR)
 		return -ENOENT;
 
 	dat = NILFS_BMAP_USE_VBN(bmap) ? nilfs_bmap_get_dat(bmap) : NULL;
-	req.bpr_ptr = nilfs_direct_get_ptr(direct, key);
+	req.bpr_ptr = nilfs_direct_get_ptr(bmap, key);
 
 	ret = nilfs_bmap_prepare_end_ptr(bmap, &req, dat);
 	if (!ret) {
 		nilfs_bmap_commit_end_ptr(bmap, &req, dat);
-		nilfs_direct_set_ptr(direct, key, NILFS_BMAP_INVALID_PTR);
+		nilfs_direct_set_ptr(bmap, key, NILFS_BMAP_INVALID_PTR);
 		nilfs_bmap_sub_blocks(bmap, 1);
 	}
 	return ret;
 }
 
-static int nilfs_direct_last_key(const struct nilfs_bmap *bmap, __u64 *keyp)
+static int nilfs_direct_last_key(const struct nilfs_bmap *direct, __u64 *keyp)
 {
-	struct nilfs_direct *direct;
 	__u64 key, lastkey;
 
-	direct = (struct nilfs_direct *)bmap;
 	lastkey = NILFS_DIRECT_KEY_MAX + 1;
 	for (key = NILFS_DIRECT_KEY_MIN; key <= NILFS_DIRECT_KEY_MAX; key++)
 		if (nilfs_direct_get_ptr(direct, key) !=
@@ -211,15 +196,13 @@ static int nilfs_direct_check_insert(const struct nilfs_bmap *bmap, __u64 key)
 	return key > NILFS_DIRECT_KEY_MAX;
 }
 
-static int nilfs_direct_gather_data(struct nilfs_bmap *bmap,
+static int nilfs_direct_gather_data(struct nilfs_bmap *direct,
 				    __u64 *keys, __u64 *ptrs, int nitems)
 {
-	struct nilfs_direct *direct;
 	__u64 key;
 	__u64 ptr;
 	int n;
 
-	direct = (struct nilfs_direct *)bmap;
 	if (nitems > NILFS_DIRECT_NBLOCKS)
 		nitems = NILFS_DIRECT_NBLOCKS;
 	n = 0;
@@ -237,7 +220,6 @@ static int nilfs_direct_gather_data(struct nilfs_bmap *bmap,
 int nilfs_direct_delete_and_convert(struct nilfs_bmap *bmap,
 				    __u64 key, __u64 *keys, __u64 *ptrs, int n)
 {
-	struct nilfs_direct *direct;
 	__le64 *dptrs;
 	int ret, i, j;
 
@@ -253,12 +235,11 @@ int nilfs_direct_delete_and_convert(struct nilfs_bmap *bmap,
 		bmap->b_ops->bop_clear(bmap);
 
 	/* convert */
-	direct = (struct nilfs_direct *)bmap;
-	dptrs = nilfs_direct_dptrs(direct);
+	dptrs = nilfs_direct_dptrs(bmap);
 	for (i = 0, j = 0; i < NILFS_DIRECT_NBLOCKS; i++) {
 		if ((j < n) && (i == keys[j])) {
 			dptrs[i] = (i != key) ?
-				nilfs_bmap_ptr_to_dptr(ptrs[j]) :
+				cpu_to_le64(ptrs[j]) :
 				NILFS_BMAP_INVALID_PTR;
 			j++;
 		} else
@@ -269,10 +250,9 @@ int nilfs_direct_delete_and_convert(struct nilfs_bmap *bmap,
 	return 0;
 }
 
-static int nilfs_direct_propagate(const struct nilfs_bmap *bmap,
+static int nilfs_direct_propagate(struct nilfs_bmap *bmap,
 				  struct buffer_head *bh)
 {
-	struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
 	struct nilfs_palloc_req oldreq, newreq;
 	struct inode *dat;
 	__u64 key;
@@ -284,7 +264,7 @@ static int nilfs_direct_propagate(const struct nilfs_bmap *bmap,
 
 	dat = nilfs_bmap_get_dat(bmap);
 	key = nilfs_bmap_data_get_key(bmap, bh);
-	ptr = nilfs_direct_get_ptr(direct, key);
+	ptr = nilfs_direct_get_ptr(bmap, key);
 	if (!buffer_nilfs_volatile(bh)) {
 		oldreq.pr_entry_nr = ptr;
 		newreq.pr_entry_nr = ptr;
@@ -294,20 +274,20 @@ static int nilfs_direct_propagate(const struct nilfs_bmap *bmap,
 		nilfs_dat_commit_update(dat, &oldreq, &newreq,
 					bmap->b_ptr_type == NILFS_BMAP_PTR_VS);
 		set_buffer_nilfs_volatile(bh);
-		nilfs_direct_set_ptr(direct, key, newreq.pr_entry_nr);
+		nilfs_direct_set_ptr(bmap, key, newreq.pr_entry_nr);
 	} else
 		ret = nilfs_dat_mark_dirty(dat, ptr);
 
 	return ret;
 }
 
-static int nilfs_direct_assign_v(struct nilfs_direct *direct,
+static int nilfs_direct_assign_v(struct nilfs_bmap *direct,
 				 __u64 key, __u64 ptr,
 				 struct buffer_head **bh,
 				 sector_t blocknr,
 				 union nilfs_binfo *binfo)
 {
-	struct inode *dat = nilfs_bmap_get_dat(&direct->d_bmap);
+	struct inode *dat = nilfs_bmap_get_dat(direct);
 	union nilfs_bmap_ptr_req req;
 	int ret;
 
@@ -315,13 +295,13 @@ static int nilfs_direct_assign_v(struct nilfs_direct *direct,
 	ret = nilfs_dat_prepare_start(dat, &req.bpr_req);
 	if (!ret) {
 		nilfs_dat_commit_start(dat, &req.bpr_req, blocknr);
-		binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr);
-		binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key);
+		binfo->bi_v.bi_vblocknr = cpu_to_le64(ptr);
+		binfo->bi_v.bi_blkoff = cpu_to_le64(key);
 	}
 	return ret;
 }
 
-static int nilfs_direct_assign_p(struct nilfs_direct *direct,
+static int nilfs_direct_assign_p(struct nilfs_bmap *direct,
 				 __u64 key, __u64 ptr,
 				 struct buffer_head **bh,
 				 sector_t blocknr,
@@ -329,7 +309,7 @@ static int nilfs_direct_assign_p(struct nilfs_direct *direct,
 {
 	nilfs_direct_set_ptr(direct, key, blocknr);
 
-	binfo->bi_dat.bi_blkoff = nilfs_bmap_key_to_dkey(key);
+	binfo->bi_dat.bi_blkoff = cpu_to_le64(key);
 	binfo->bi_dat.bi_level = 0;
 
 	return 0;
@@ -340,18 +320,16 @@ static int nilfs_direct_assign(struct nilfs_bmap *bmap,
 			       sector_t blocknr,
 			       union nilfs_binfo *binfo)
 {
-	struct nilfs_direct *direct;
 	__u64 key;
 	__u64 ptr;
 
-	direct = (struct nilfs_direct *)bmap;
 	key = nilfs_bmap_data_get_key(bmap, *bh);
 	if (unlikely(key > NILFS_DIRECT_KEY_MAX)) {
 		printk(KERN_CRIT "%s: invalid key: %llu\n", __func__,
 		       (unsigned long long)key);
 		return -EINVAL;
 	}
-	ptr = nilfs_direct_get_ptr(direct, key);
+	ptr = nilfs_direct_get_ptr(bmap, key);
 	if (unlikely(ptr == NILFS_BMAP_INVALID_PTR)) {
 		printk(KERN_CRIT "%s: invalid pointer: %llu\n", __func__,
 		       (unsigned long long)ptr);
@@ -359,8 +337,8 @@ static int nilfs_direct_assign(struct nilfs_bmap *bmap,
 	}
 
 	return NILFS_BMAP_USE_VBN(bmap) ?
-		nilfs_direct_assign_v(direct, key, ptr, bh, blocknr, binfo) :
-		nilfs_direct_assign_p(direct, key, ptr, bh, blocknr, binfo);
+		nilfs_direct_assign_v(bmap, key, ptr, bh, blocknr, binfo) :
+		nilfs_direct_assign_p(bmap, key, ptr, bh, blocknr, binfo);
 }
 
 static const struct nilfs_bmap_operations nilfs_direct_ops = {
diff --git a/fs/nilfs2/direct.h b/fs/nilfs2/direct.h
index a5ffd66e25d..dc643de20a2 100644
--- a/fs/nilfs2/direct.h
+++ b/fs/nilfs2/direct.h
@@ -28,8 +28,6 @@
 #include "bmap.h"
 
 
-struct nilfs_direct;
-
 /**
  * struct nilfs_direct_node - direct node
  * @dn_flags: flags
@@ -40,15 +38,6 @@ struct nilfs_direct_node {
 	__u8 pad[7];
 };
 
-/**
- * struct nilfs_direct - direct mapping
- * @d_bmap: bmap structure
- */
-struct nilfs_direct {
-	struct nilfs_bmap d_bmap;
-};
-
-
 #define NILFS_DIRECT_NBLOCKS	(NILFS_BMAP_SIZE / sizeof(__le64) - 1)
 #define NILFS_DIRECT_KEY_MIN	0
 #define NILFS_DIRECT_KEY_MAX	(NILFS_DIRECT_NBLOCKS - 1)
diff --git a/fs/nilfs2/export.h b/fs/nilfs2/export.h
new file mode 100644
index 00000000000..a71cc412b65
--- /dev/null
+++ b/fs/nilfs2/export.h
@@ -0,0 +1,17 @@
+#ifndef NILFS_EXPORT_H
+#define NILFS_EXPORT_H
+
+#include <linux/exportfs.h>
+
+extern const struct export_operations nilfs_export_ops;
+
+struct nilfs_fid {
+	u64 cno;
+	u64 ino;
+	u32 gen;
+
+	u32 parent_gen;
+	u64 parent_ino;
+} __attribute__ ((packed));
+
+#endif
diff --git a/fs/nilfs2/gcdat.c b/fs/nilfs2/gcdat.c
deleted file mode 100644
index dd5f7e0a95f..00000000000
--- a/fs/nilfs2/gcdat.c
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * gcdat.c - NILFS shadow DAT inode for GC
- *
- * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- *
- * Written by Seiji Kihara <kihara@osrg.net>, Amagai Yoshiji <amagai@osrg.net>,
- *            and Ryusuke Konishi <ryusuke@osrg.net>.
- *
- */
-
-#include <linux/buffer_head.h>
-#include "nilfs.h"
-#include "page.h"
-#include "mdt.h"
-
-int nilfs_init_gcdat_inode(struct the_nilfs *nilfs)
-{
-	struct inode *dat = nilfs->ns_dat, *gcdat = nilfs->ns_gc_dat;
-	struct nilfs_inode_info *dii = NILFS_I(dat), *gii = NILFS_I(gcdat);
-	int err;
-
-	gcdat->i_state = 0;
-	gcdat->i_blocks = dat->i_blocks;
-	gii->i_flags = dii->i_flags;
-	gii->i_state = dii->i_state | (1 << NILFS_I_GCDAT);
-	gii->i_cno = 0;
-	nilfs_bmap_init_gcdat(gii->i_bmap, dii->i_bmap);
-	err = nilfs_copy_dirty_pages(gcdat->i_mapping, dat->i_mapping);
-	if (unlikely(err))
-		return err;
-
-	return nilfs_copy_dirty_pages(&gii->i_btnode_cache,
-				      &dii->i_btnode_cache);
-}
-
-void nilfs_commit_gcdat_inode(struct the_nilfs *nilfs)
-{
-	struct inode *dat = nilfs->ns_dat, *gcdat = nilfs->ns_gc_dat;
-	struct nilfs_inode_info *dii = NILFS_I(dat), *gii = NILFS_I(gcdat);
-	struct address_space *mapping = dat->i_mapping;
-	struct address_space *gmapping = gcdat->i_mapping;
-
-	down_write(&NILFS_MDT(dat)->mi_sem);
-	dat->i_blocks = gcdat->i_blocks;
-	dii->i_flags = gii->i_flags;
-	dii->i_state = gii->i_state & ~(1 << NILFS_I_GCDAT);
-
-	nilfs_bmap_commit_gcdat(gii->i_bmap, dii->i_bmap);
-
-	nilfs_palloc_clear_cache(dat);
-	nilfs_palloc_clear_cache(gcdat);
-	nilfs_clear_dirty_pages(mapping);
-	nilfs_copy_back_pages(mapping, gmapping);
-	/* note: mdt dirty flags should be cleared by segctor. */
-
-	nilfs_clear_dirty_pages(&dii->i_btnode_cache);
-	nilfs_copy_back_pages(&dii->i_btnode_cache, &gii->i_btnode_cache);
-
-	up_write(&NILFS_MDT(dat)->mi_sem);
-}
-
-void nilfs_clear_gcdat_inode(struct the_nilfs *nilfs)
-{
-	struct inode *gcdat = nilfs->ns_gc_dat;
-	struct nilfs_inode_info *gii = NILFS_I(gcdat);
-
-	gcdat->i_state = I_CLEAR;
-	gii->i_flags = 0;
-
-	nilfs_palloc_clear_cache(gcdat);
-	truncate_inode_pages(gcdat->i_mapping, 0);
-	truncate_inode_pages(&gii->i_btnode_cache, 0);
-}
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index 145f03cd7d3..33ad25ddd5c 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -28,13 +28,6 @@
  * gcinodes), and this file provides lookup function of the dummy
  * inodes and their buffer read function.
  *
- * Since NILFS2 keeps up multiple checkpoints/snapshots across GC, it
- * has to treat blocks that belong to a same file but have different
- * checkpoint numbers.  To avoid interference among generations, dummy
- * inodes are managed separately from actual inodes, and their lookup
- * function (nilfs_gc_iget) is designed to be specified with a
- * checkpoint number argument as well as an inode number.
- *
  * Buffers and pages held by the dummy inodes will be released each
  * time after they are copied to a new log.  Dirty blocks made on the
  * current generation and the blocks to be moved by GC never overlap
@@ -48,6 +41,8 @@
 #include <linux/slab.h>
 #include <linux/swap.h>
 #include "nilfs.h"
+#include "btree.h"
+#include "btnode.h"
 #include "page.h"
 #include "mdt.h"
 #include "dat.h"
@@ -149,8 +144,10 @@ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff,
 int nilfs_gccache_submit_read_node(struct inode *inode, sector_t pbn,
 				   __u64 vbn, struct buffer_head **out_bh)
 {
-	int ret = nilfs_btnode_submit_block(&NILFS_I(inode)->i_btnode_cache,
-					    vbn ? : pbn, pbn, out_bh);
+	int ret;
+
+	ret = nilfs_btnode_submit_block(&NILFS_I(inode)->i_btnode_cache,
+					vbn ? : pbn, pbn, READ, out_bh, &pbn);
 	if (ret == -EEXIST) /* internal code (cache hit) */
 		ret = 0;
 	return ret;
@@ -164,127 +161,53 @@ int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *bh)
 	if (buffer_dirty(bh))
 		return -EEXIST;
 
-	if (buffer_nilfs_node(bh))
+	if (buffer_nilfs_node(bh)) {
+		if (nilfs_btree_broken_node_block(bh)) {
+			clear_buffer_uptodate(bh);
+			return -EIO;
+		}
 		nilfs_btnode_mark_dirty(bh);
-	else
-		nilfs_mdt_mark_buffer_dirty(bh);
-	return 0;
-}
-
-/*
- * nilfs_init_gccache() - allocate and initialize gc_inode hash table
- * @nilfs - the_nilfs
- *
- * Return Value: On success, 0.
- * On error, a negative error code is returned.
- */
-int nilfs_init_gccache(struct the_nilfs *nilfs)
-{
-	int loop;
-
-	BUG_ON(nilfs->ns_gc_inodes_h);
-
-	INIT_LIST_HEAD(&nilfs->ns_gc_inodes);
-
-	nilfs->ns_gc_inodes_h =
-		kmalloc(sizeof(struct hlist_head) * NILFS_GCINODE_HASH_SIZE,
-			GFP_NOFS);
-	if (nilfs->ns_gc_inodes_h == NULL)
-		return -ENOMEM;
-
-	for (loop = 0; loop < NILFS_GCINODE_HASH_SIZE; loop++)
-		INIT_HLIST_HEAD(&nilfs->ns_gc_inodes_h[loop]);
-	return 0;
-}
-
-/*
- * nilfs_destroy_gccache() - free gc_inode hash table
- * @nilfs - the nilfs
- */
-void nilfs_destroy_gccache(struct the_nilfs *nilfs)
-{
-	if (nilfs->ns_gc_inodes_h) {
-		nilfs_remove_all_gcinode(nilfs);
-		kfree(nilfs->ns_gc_inodes_h);
-		nilfs->ns_gc_inodes_h = NULL;
+	} else {
+		nilfs_mark_buffer_dirty(bh);
 	}
+	return 0;
 }
 
-static struct inode *alloc_gcinode(struct the_nilfs *nilfs, ino_t ino,
-				   __u64 cno)
+int nilfs_init_gcinode(struct inode *inode)
 {
-	struct inode *inode;
-	struct nilfs_inode_info *ii;
-
-	inode = nilfs_mdt_new_common(nilfs, NULL, ino, GFP_NOFS, 0);
-	if (!inode)
-		return NULL;
+	struct nilfs_inode_info *ii = NILFS_I(inode);
+	struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
 
-	inode->i_op = NULL;
-	inode->i_fop = NULL;
+	inode->i_mode = S_IFREG;
+	mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
 	inode->i_mapping->a_ops = &def_gcinode_aops;
+	inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi;
 
-	ii = NILFS_I(inode);
-	ii->i_cno = cno;
 	ii->i_flags = 0;
-	ii->i_state = 1 << NILFS_I_GCINODE;
-	ii->i_bh = NULL;
 	nilfs_bmap_init_gc(ii->i_bmap);
 
-	return inode;
-}
-
-static unsigned long ihash(ino_t ino, __u64 cno)
-{
-	return hash_long((unsigned long)((ino << 2) + cno),
-			 NILFS_GCINODE_HASH_BITS);
-}
-
-/*
- * nilfs_gc_iget() - find or create gc inode with specified (ino,cno)
- */
-struct inode *nilfs_gc_iget(struct the_nilfs *nilfs, ino_t ino, __u64 cno)
-{
-	struct hlist_head *head = nilfs->ns_gc_inodes_h + ihash(ino, cno);
-	struct hlist_node *node;
-	struct inode *inode;
-
-	hlist_for_each_entry(inode, node, head, i_hash) {
-		if (inode->i_ino == ino && NILFS_I(inode)->i_cno == cno)
-			return inode;
-	}
-
-	inode = alloc_gcinode(nilfs, ino, cno);
-	if (likely(inode)) {
-		hlist_add_head(&inode->i_hash, head);
-		list_add(&NILFS_I(inode)->i_dirty, &nilfs->ns_gc_inodes);
-	}
-	return inode;
-}
+	/*
+	 * Add the inode to GC inode list. Garbage Collection
+	 * is serialized and no two processes manipulate the
+	 * list simultaneously.
+	 */
+	igrab(inode);
+	list_add(&NILFS_I(inode)->i_dirty, &nilfs->ns_gc_inodes);
 
-/*
- * nilfs_clear_gcinode() - clear and free a gc inode
- */
-void nilfs_clear_gcinode(struct inode *inode)
-{
-	nilfs_mdt_destroy(inode);
+	return 0;
 }
 
-/*
- * nilfs_remove_all_gcinode() - remove all inodes from the_nilfs
+/**
+ * nilfs_remove_all_gcinodes() - remove all unprocessed gc inodes
  */
-void nilfs_remove_all_gcinode(struct the_nilfs *nilfs)
+void nilfs_remove_all_gcinodes(struct the_nilfs *nilfs)
 {
-	struct hlist_head *head = nilfs->ns_gc_inodes_h;
-	struct hlist_node *node, *n;
-	struct inode *inode;
-	int loop;
+	struct list_head *head = &nilfs->ns_gc_inodes;
+	struct nilfs_inode_info *ii;
 
-	for (loop = 0; loop < NILFS_GCINODE_HASH_SIZE; loop++, head++) {
-		hlist_for_each_entry_safe(inode, node, n, head, i_hash) {
-			hlist_del_init(&inode->i_hash);
-			list_del_init(&NILFS_I(inode)->i_dirty);
-			nilfs_clear_gcinode(inode); /* might sleep */
-		}
+	while (!list_empty(head)) {
+		ii = list_first_entry(head, struct nilfs_inode_info, i_dirty);
+		list_del_init(&ii->i_dirty);
+		iput(&ii->vfs_inode);
 	}
 }
diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c
index 922d9dd42c8..9f8a2da67f9 100644
--- a/fs/nilfs2/ifile.c
+++ b/fs/nilfs2/ifile.c
@@ -161,25 +161,46 @@ int nilfs_ifile_get_inode_block(struct inode *ifile, ino_t ino,
 }
 
 /**
- * nilfs_ifile_new - create inode file
- * @sbi: nilfs_sb_info struct
+ * nilfs_ifile_read - read or get ifile inode
+ * @sb: super block instance
+ * @root: root object
  * @inode_size: size of an inode
+ * @raw_inode: on-disk ifile inode
+ * @inodep: buffer to store the inode
  */
-struct inode *nilfs_ifile_new(struct nilfs_sb_info *sbi, size_t inode_size)
+int nilfs_ifile_read(struct super_block *sb, struct nilfs_root *root,
+		     size_t inode_size, struct nilfs_inode *raw_inode,
+		     struct inode **inodep)
 {
 	struct inode *ifile;
 	int err;
 
-	ifile = nilfs_mdt_new(sbi->s_nilfs, sbi->s_super, NILFS_IFILE_INO,
-			      sizeof(struct nilfs_ifile_info));
-	if (ifile) {
-		err = nilfs_palloc_init_blockgroup(ifile, inode_size);
-		if (unlikely(err)) {
-			nilfs_mdt_destroy(ifile);
-			return NULL;
-		}
-		nilfs_palloc_setup_cache(ifile,
-					 &NILFS_IFILE_I(ifile)->palloc_cache);
-	}
-	return ifile;
+	ifile = nilfs_iget_locked(sb, root, NILFS_IFILE_INO);
+	if (unlikely(!ifile))
+		return -ENOMEM;
+	if (!(ifile->i_state & I_NEW))
+		goto out;
+
+	err = nilfs_mdt_init(ifile, NILFS_MDT_GFP,
+			     sizeof(struct nilfs_ifile_info));
+	if (err)
+		goto failed;
+
+	err = nilfs_palloc_init_blockgroup(ifile, inode_size);
+	if (err)
+		goto failed;
+
+	nilfs_palloc_setup_cache(ifile, &NILFS_IFILE_I(ifile)->palloc_cache);
+
+	err = nilfs_read_inode_common(ifile, raw_inode);
+	if (err)
+		goto failed;
+
+	unlock_new_inode(ifile);
+ out:
+	*inodep = ifile;
+	return 0;
+ failed:
+	iget_failed(ifile);
+	return err;
 }
diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h
index cbca32e498f..59b6f2b51df 100644
--- a/fs/nilfs2/ifile.h
+++ b/fs/nilfs2/ifile.h
@@ -49,6 +49,8 @@ int nilfs_ifile_create_inode(struct inode *, ino_t *, struct buffer_head **);
 int nilfs_ifile_delete_inode(struct inode *, ino_t);
 int nilfs_ifile_get_inode_block(struct inode *, ino_t, struct buffer_head **);
 
-struct inode *nilfs_ifile_new(struct nilfs_sb_info *sbi, size_t inode_size);
+int nilfs_ifile_read(struct super_block *sb, struct nilfs_root *root,
+		     size_t inode_size, struct nilfs_inode *raw_inode,
+		     struct inode **inodep);
 
 #endif	/* _NILFS_IFILE_H */
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 39e038ac8fc..71d4bc8464e 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -27,12 +27,19 @@
 #include <linux/writeback.h>
 #include <linux/uio.h>
 #include "nilfs.h"
+#include "btnode.h"
 #include "segment.h"
 #include "page.h"
 #include "mdt.h"
 #include "cpfile.h"
 #include "ifile.h"
 
+struct nilfs_iget_args {
+	u64 ino;
+	__u64 cno;
+	struct nilfs_root *root;
+	int for_gc;
+};
 
 /**
  * nilfs_get_block() - get a file block on the filesystem (callback function)
@@ -197,11 +204,15 @@ static int nilfs_write_begin(struct file *file, struct address_space *mapping,
 	if (unlikely(err))
 		return err;
 
-	*pagep = NULL;
-	err = block_write_begin(file, mapping, pos, len, flags, pagep,
-				fsdata, nilfs_get_block);
-	if (unlikely(err))
+	err = block_write_begin(mapping, pos, len, flags, pagep,
+				nilfs_get_block);
+	if (unlikely(err)) {
+		loff_t isize = mapping->host->i_size;
+		if (pos + len > isize)
+			vmtruncate(mapping->host, isize);
+
 		nilfs_transaction_abort(inode->i_sb);
+	}
 	return err;
 }
 
@@ -237,6 +248,19 @@ nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 	/* Needs synchronization with the cleaner */
 	size = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
 				  offset, nr_segs, nilfs_get_block, NULL);
+
+	/*
+	 * In case of error extending write may have instantiated a few
+	 * blocks outside i_size. Trim these off again.
+	 */
+	if (unlikely((rw & WRITE) && size < 0)) {
+		loff_t isize = i_size_read(inode);
+		loff_t end = offset + iov_length(iov, nr_segs);
+
+		if (end > isize)
+			vmtruncate(inode, isize);
+	}
+
 	return size;
 }
 
@@ -261,6 +285,7 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode)
 	struct nilfs_sb_info *sbi = NILFS_SB(sb);
 	struct inode *inode;
 	struct nilfs_inode_info *ii;
+	struct nilfs_root *root;
 	int err = -ENOMEM;
 	ino_t ino;
 
@@ -271,15 +296,17 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode)
 	mapping_set_gfp_mask(inode->i_mapping,
 			     mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
 
+	root = NILFS_I(dir)->i_root;
 	ii = NILFS_I(inode);
 	ii->i_state = 1 << NILFS_I_NEW;
+	ii->i_root = root;
 
-	err = nilfs_ifile_create_inode(sbi->s_ifile, &ino, &ii->i_bh);
+	err = nilfs_ifile_create_inode(root->ifile, &ino, &ii->i_bh);
 	if (unlikely(err))
 		goto failed_ifile_create_inode;
 	/* reference count of i_bh inherits from nilfs_mdt_read_block() */
 
-	atomic_inc(&sbi->s_inodes_count);
+	atomic_inc(&root->inodes_count);
 	inode_init_owner(inode, dir, mode);
 	inode->i_ino = ino;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
@@ -302,7 +329,6 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode)
 	/* ii->i_file_acl = 0; */
 	/* ii->i_dir_acl = 0; */
 	ii->i_dir_start_lookup = 0;
-	ii->i_cno = 0;
 	nilfs_set_inode_flags(inode);
 	spin_lock(&sbi->s_next_gen_lock);
 	inode->i_generation = sbi->s_next_generation++;
@@ -332,17 +358,6 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode)
 	return ERR_PTR(err);
 }
 
-void nilfs_free_inode(struct inode *inode)
-{
-	struct super_block *sb = inode->i_sb;
-	struct nilfs_sb_info *sbi = NILFS_SB(sb);
-
-	clear_inode(inode);
-	/* XXX: check error code? Is there any thing I can do? */
-	(void) nilfs_ifile_delete_inode(sbi->s_ifile, inode->i_ino);
-	atomic_dec(&sbi->s_inodes_count);
-}
-
 void nilfs_set_inode_flags(struct inode *inode)
 {
 	unsigned int flags = NILFS_I(inode)->i_flags;
@@ -393,7 +408,6 @@ int nilfs_read_inode_common(struct inode *inode,
 		0 : le32_to_cpu(raw_inode->i_dir_acl);
 #endif
 	ii->i_dir_start_lookup = 0;
-	ii->i_cno = 0;
 	inode->i_generation = le32_to_cpu(raw_inode->i_generation);
 
 	if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
@@ -407,7 +421,8 @@ int nilfs_read_inode_common(struct inode *inode,
 	return 0;
 }
 
-static int __nilfs_read_inode(struct super_block *sb, unsigned long ino,
+static int __nilfs_read_inode(struct super_block *sb,
+			      struct nilfs_root *root, unsigned long ino,
 			      struct inode *inode)
 {
 	struct nilfs_sb_info *sbi = NILFS_SB(sb);
@@ -417,11 +432,11 @@ static int __nilfs_read_inode(struct super_block *sb, unsigned long ino,
 	int err;
 
 	down_read(&NILFS_MDT(dat)->mi_sem);	/* XXX */
-	err = nilfs_ifile_get_inode_block(sbi->s_ifile, ino, &bh);
+	err = nilfs_ifile_get_inode_block(root->ifile, ino, &bh);
 	if (unlikely(err))
 		goto bad_inode;
 
-	raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, bh);
+	raw_inode = nilfs_ifile_map_inode(root->ifile, ino, bh);
 
 	err = nilfs_read_inode_common(inode, raw_inode);
 	if (err)
@@ -444,14 +459,14 @@ static int __nilfs_read_inode(struct super_block *sb, unsigned long ino,
 			inode, inode->i_mode,
 			huge_decode_dev(le64_to_cpu(raw_inode->i_device_code)));
 	}
-	nilfs_ifile_unmap_inode(sbi->s_ifile, ino, bh);
+	nilfs_ifile_unmap_inode(root->ifile, ino, bh);
 	brelse(bh);
 	up_read(&NILFS_MDT(dat)->mi_sem);	/* XXX */
 	nilfs_set_inode_flags(inode);
 	return 0;
 
  failed_unmap:
-	nilfs_ifile_unmap_inode(sbi->s_ifile, ino, bh);
+	nilfs_ifile_unmap_inode(root->ifile, ino, bh);
 	brelse(bh);
 
  bad_inode:
@@ -459,18 +474,95 @@ static int __nilfs_read_inode(struct super_block *sb, unsigned long ino,
 	return err;
 }
 
-struct inode *nilfs_iget(struct super_block *sb, unsigned long ino)
+static int nilfs_iget_test(struct inode *inode, void *opaque)
+{
+	struct nilfs_iget_args *args = opaque;
+	struct nilfs_inode_info *ii;
+
+	if (args->ino != inode->i_ino || args->root != NILFS_I(inode)->i_root)
+		return 0;
+
+	ii = NILFS_I(inode);
+	if (!test_bit(NILFS_I_GCINODE, &ii->i_state))
+		return !args->for_gc;
+
+	return args->for_gc && args->cno == ii->i_cno;
+}
+
+static int nilfs_iget_set(struct inode *inode, void *opaque)
+{
+	struct nilfs_iget_args *args = opaque;
+
+	inode->i_ino = args->ino;
+	if (args->for_gc) {
+		NILFS_I(inode)->i_state = 1 << NILFS_I_GCINODE;
+		NILFS_I(inode)->i_cno = args->cno;
+		NILFS_I(inode)->i_root = NULL;
+	} else {
+		if (args->root && args->ino == NILFS_ROOT_INO)
+			nilfs_get_root(args->root);
+		NILFS_I(inode)->i_root = args->root;
+	}
+	return 0;
+}
+
+struct inode *nilfs_ilookup(struct super_block *sb, struct nilfs_root *root,
+			    unsigned long ino)
+{
+	struct nilfs_iget_args args = {
+		.ino = ino, .root = root, .cno = 0, .for_gc = 0
+	};
+
+	return ilookup5(sb, ino, nilfs_iget_test, &args);
+}
+
+struct inode *nilfs_iget_locked(struct super_block *sb, struct nilfs_root *root,
+				unsigned long ino)
+{
+	struct nilfs_iget_args args = {
+		.ino = ino, .root = root, .cno = 0, .for_gc = 0
+	};
+
+	return iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args);
+}
+
+struct inode *nilfs_iget(struct super_block *sb, struct nilfs_root *root,
+			 unsigned long ino)
+{
+	struct inode *inode;
+	int err;
+
+	inode = nilfs_iget_locked(sb, root, ino);
+	if (unlikely(!inode))
+		return ERR_PTR(-ENOMEM);
+	if (!(inode->i_state & I_NEW))
+		return inode;
+
+	err = __nilfs_read_inode(sb, root, ino, inode);
+	if (unlikely(err)) {
+		iget_failed(inode);
+		return ERR_PTR(err);
+	}
+	unlock_new_inode(inode);
+	return inode;
+}
+
+struct inode *nilfs_iget_for_gc(struct super_block *sb, unsigned long ino,
+				__u64 cno)
 {
+	struct nilfs_iget_args args = {
+		.ino = ino, .root = NULL, .cno = cno, .for_gc = 1
+	};
 	struct inode *inode;
 	int err;
 
-	inode = iget_locked(sb, ino);
+	inode = iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args);
 	if (unlikely(!inode))
 		return ERR_PTR(-ENOMEM);
 	if (!(inode->i_state & I_NEW))
 		return inode;
 
-	err = __nilfs_read_inode(sb, ino, inode);
+	err = nilfs_init_gcinode(inode);
 	if (unlikely(err)) {
 		iget_failed(inode);
 		return ERR_PTR(err);
@@ -511,21 +603,20 @@ void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh)
 {
 	ino_t ino = inode->i_ino;
 	struct nilfs_inode_info *ii = NILFS_I(inode);
-	struct super_block *sb = inode->i_sb;
-	struct nilfs_sb_info *sbi = NILFS_SB(sb);
+	struct inode *ifile = ii->i_root->ifile;
 	struct nilfs_inode *raw_inode;
 
-	raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, ibh);
+	raw_inode = nilfs_ifile_map_inode(ifile, ino, ibh);
 
 	if (test_and_clear_bit(NILFS_I_NEW, &ii->i_state))
-		memset(raw_inode, 0, NILFS_MDT(sbi->s_ifile)->mi_entry_size);
+		memset(raw_inode, 0, NILFS_MDT(ifile)->mi_entry_size);
 	set_bit(NILFS_I_INODE_DIRTY, &ii->i_state);
 
 	nilfs_write_inode_common(inode, raw_inode, 0);
 		/* XXX: call with has_bmap = 0 is a workaround to avoid
 		   deadlock of bmap. This delays update of i_bmap to just
 		   before writing */
-	nilfs_ifile_unmap_inode(sbi->s_ifile, ino, ibh);
+	nilfs_ifile_unmap_inode(ifile, ino, ibh);
 }
 
 #define NILFS_MAX_TRUNCATE_BLOCKS	16384  /* 64MB for 4KB block */
@@ -597,16 +688,41 @@ void nilfs_truncate(struct inode *inode)
 	   But truncate has no return value. */
 }
 
-void nilfs_delete_inode(struct inode *inode)
+static void nilfs_clear_inode(struct inode *inode)
+{
+	struct nilfs_inode_info *ii = NILFS_I(inode);
+	struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
+
+	/*
+	 * Free resources allocated in nilfs_read_inode(), here.
+	 */
+	BUG_ON(!list_empty(&ii->i_dirty));
+	brelse(ii->i_bh);
+	ii->i_bh = NULL;
+
+	if (mdi && mdi->mi_palloc_cache)
+		nilfs_palloc_destroy_cache(inode);
+
+	if (test_bit(NILFS_I_BMAP, &ii->i_state))
+		nilfs_bmap_clear(ii->i_bmap);
+
+	nilfs_btnode_cache_clear(&ii->i_btnode_cache);
+
+	if (ii->i_root && inode->i_ino == NILFS_ROOT_INO)
+		nilfs_put_root(ii->i_root);
+}
+
+void nilfs_evict_inode(struct inode *inode)
 {
 	struct nilfs_transaction_info ti;
 	struct super_block *sb = inode->i_sb;
 	struct nilfs_inode_info *ii = NILFS_I(inode);
 
-	if (unlikely(is_bad_inode(inode))) {
+	if (inode->i_nlink || !ii->i_root || unlikely(is_bad_inode(inode))) {
 		if (inode->i_data.nrpages)
 			truncate_inode_pages(&inode->i_data, 0);
-		clear_inode(inode);
+		end_writeback(inode);
+		nilfs_clear_inode(inode);
 		return;
 	}
 	nilfs_transaction_begin(sb, &ti, 0); /* never fails */
@@ -614,10 +730,16 @@ void nilfs_delete_inode(struct inode *inode)
 	if (inode->i_data.nrpages)
 		truncate_inode_pages(&inode->i_data, 0);
 
+	/* TODO: some of the following operations may fail.  */
 	nilfs_truncate_bmap(ii, 0);
 	nilfs_mark_inode_dirty(inode);
-	nilfs_free_inode(inode);
-	/* nilfs_free_inode() marks inode buffer dirty */
+	end_writeback(inode);
+
+	nilfs_ifile_delete_inode(ii->i_root->ifile, inode->i_ino);
+	atomic_dec(&ii->i_root->inodes_count);
+
+	nilfs_clear_inode(inode);
+
 	if (IS_SYNC(inode))
 		nilfs_set_transaction_flag(NILFS_TI_SYNC);
 	nilfs_transaction_commit(sb);
@@ -639,17 +761,41 @@ int nilfs_setattr(struct dentry *dentry, struct iattr *iattr)
 	err = nilfs_transaction_begin(sb, &ti, 0);
 	if (unlikely(err))
 		return err;
-	err = inode_setattr(inode, iattr);
-	if (!err && (iattr->ia_valid & ATTR_MODE))
+
+	if ((iattr->ia_valid & ATTR_SIZE) &&
+	    iattr->ia_size != i_size_read(inode)) {
+		err = vmtruncate(inode, iattr->ia_size);
+		if (unlikely(err))
+			goto out_err;
+	}
+
+	setattr_copy(inode, iattr);
+	mark_inode_dirty(inode);
+
+	if (iattr->ia_valid & ATTR_MODE) {
 		err = nilfs_acl_chmod(inode);
-	if (likely(!err))
-		err = nilfs_transaction_commit(sb);
-	else
-		nilfs_transaction_abort(sb);
+		if (unlikely(err))
+			goto out_err;
+	}
+
+	return nilfs_transaction_commit(sb);
 
+out_err:
+	nilfs_transaction_abort(sb);
 	return err;
 }
 
+int nilfs_permission(struct inode *inode, int mask)
+{
+	struct nilfs_root *root = NILFS_I(inode)->i_root;
+
+	if ((mask & MAY_WRITE) && root &&
+	    root->cno != NILFS_CPTREE_CURRENT_CNO)
+		return -EROFS; /* snapshot is not writable */
+
+	return generic_permission(inode, mask, NULL);
+}
+
 int nilfs_load_inode_block(struct nilfs_sb_info *sbi, struct inode *inode,
 			   struct buffer_head **pbh)
 {
@@ -659,8 +805,8 @@ int nilfs_load_inode_block(struct nilfs_sb_info *sbi, struct inode *inode,
 	spin_lock(&sbi->s_inode_lock);
 	if (ii->i_bh == NULL) {
 		spin_unlock(&sbi->s_inode_lock);
-		err = nilfs_ifile_get_inode_block(sbi->s_ifile, inode->i_ino,
-						  pbh);
+		err = nilfs_ifile_get_inode_block(ii->i_root->ifile,
+						  inode->i_ino, pbh);
 		if (unlikely(err))
 			return err;
 		spin_lock(&sbi->s_inode_lock);
@@ -740,7 +886,7 @@ int nilfs_mark_inode_dirty(struct inode *inode)
 	}
 	nilfs_update_inode(inode, ibh);
 	nilfs_mdt_mark_buffer_dirty(ibh);
-	nilfs_mdt_mark_dirty(sbi->s_ifile);
+	nilfs_mdt_mark_dirty(NILFS_I(inode)->i_root->ifile);
 	brelse(ibh);
 	return 0;
 }
@@ -758,6 +904,7 @@ int nilfs_mark_inode_dirty(struct inode *inode)
 void nilfs_dirty_inode(struct inode *inode)
 {
 	struct nilfs_transaction_info ti;
+	struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
 
 	if (is_bad_inode(inode)) {
 		nilfs_warning(inode->i_sb, __func__,
@@ -765,6 +912,10 @@ void nilfs_dirty_inode(struct inode *inode)
 		dump_stack();
 		return;
 	}
+	if (mdi) {
+		nilfs_mdt_mark_dirty(inode);
+		return;
+	}
 	nilfs_transaction_begin(inode->i_sb, &ti, 0);
 	nilfs_mark_inode_dirty(inode);
 	nilfs_transaction_commit(inode->i_sb); /* never fails */
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index f90a33d9a5b..3e90f86d5bf 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -22,7 +22,6 @@
 
 #include <linux/fs.h>
 #include <linux/wait.h>
-#include <linux/smp_lock.h>	/* lock_kernel(), unlock_kernel() */
 #include <linux/slab.h>
 #include <linux/capability.h>	/* capable() */
 #include <linux/uaccess.h>	/* copy_from_user(), copy_to_user() */
@@ -118,7 +117,7 @@ static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp,
 	if (copy_from_user(&cpmode, argp, sizeof(cpmode)))
 		goto out;
 
-	mutex_lock(&nilfs->ns_mount_mutex);
+	down_read(&inode->i_sb->s_umount);
 
 	nilfs_transaction_begin(inode->i_sb, &ti, 0);
 	ret = nilfs_cpfile_change_cpmode(
@@ -128,7 +127,7 @@ static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp,
 	else
 		nilfs_transaction_commit(inode->i_sb); /* never fails */
 
-	mutex_unlock(&nilfs->ns_mount_mutex);
+	up_read(&inode->i_sb->s_umount);
 out:
 	mnt_drop_write(filp->f_path.mnt);
 	return ret;
@@ -334,7 +333,7 @@ static int nilfs_ioctl_move_inode_block(struct inode *inode,
 	return 0;
 }
 
-static int nilfs_ioctl_move_blocks(struct the_nilfs *nilfs,
+static int nilfs_ioctl_move_blocks(struct super_block *sb,
 				   struct nilfs_argv *argv, void *buf)
 {
 	size_t nmembs = argv->v_nmembs;
@@ -349,7 +348,7 @@ static int nilfs_ioctl_move_blocks(struct the_nilfs *nilfs,
 	for (i = 0, vdesc = buf; i < nmembs; ) {
 		ino = vdesc->vd_ino;
 		cno = vdesc->vd_cno;
-		inode = nilfs_gc_iget(nilfs, ino, cno);
+		inode = nilfs_iget_for_gc(sb, ino, cno);
 		if (unlikely(inode == NULL)) {
 			ret = -ENOMEM;
 			goto failed;
@@ -357,11 +356,15 @@ static int nilfs_ioctl_move_blocks(struct the_nilfs *nilfs,
 		do {
 			ret = nilfs_ioctl_move_inode_block(inode, vdesc,
 							   &buffers);
-			if (unlikely(ret < 0))
+			if (unlikely(ret < 0)) {
+				iput(inode);
 				goto failed;
+			}
 			vdesc++;
 		} while (++i < nmembs &&
 			 vdesc->vd_ino == ino && vdesc->vd_cno == cno);
+
+		iput(inode); /* The inode still remains in GC inode list */
 	}
 
 	list_for_each_entry_safe(bh, n, &buffers, b_assoc_buffers) {
@@ -567,7 +570,7 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
 	}
 
 	/*
-	 * nilfs_ioctl_move_blocks() will call nilfs_gc_iget(),
+	 * nilfs_ioctl_move_blocks() will call nilfs_iget_for_gc(),
 	 * which will operates an inode list without blocking.
 	 * To protect the list from concurrent operations,
 	 * nilfs_ioctl_move_blocks should be atomic operation.
@@ -577,15 +580,16 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
 		goto out_free;
 	}
 
-	ret = nilfs_ioctl_move_blocks(nilfs, &argv[0], kbufs[0]);
+	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
+
+	ret = nilfs_ioctl_move_blocks(inode->i_sb, &argv[0], kbufs[0]);
 	if (ret < 0)
 		printk(KERN_ERR "NILFS: GC failed during preparation: "
 			"cannot read source blocks: err=%d\n", ret);
 	else
 		ret = nilfs_clean_segments(inode->i_sb, argv, kbufs);
 
-	if (ret < 0)
-		nilfs_remove_all_gcinode(nilfs);
+	nilfs_remove_all_gcinodes(nilfs);
 	clear_nilfs_gc_running(nilfs);
 
 out_free:
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 024be8c35bb..39a5b84e2c9 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -28,6 +28,7 @@
 #include <linux/swap.h>
 #include <linux/slab.h>
 #include "nilfs.h"
+#include "btnode.h"
 #include "segment.h"
 #include "page.h"
 #include "mdt.h"
@@ -35,7 +36,6 @@
 
 #define NILFS_MDT_MAX_RA_BLOCKS		(16 - 1)
 
-#define INIT_UNUSED_INODE_FIELDS
 
 static int
 nilfs_mdt_insert_new_block(struct inode *inode, unsigned long block,
@@ -77,25 +77,11 @@ static int nilfs_mdt_create_block(struct inode *inode, unsigned long block,
 						     struct buffer_head *,
 						     void *))
 {
-	struct the_nilfs *nilfs = NILFS_MDT(inode)->mi_nilfs;
 	struct super_block *sb = inode->i_sb;
 	struct nilfs_transaction_info ti;
 	struct buffer_head *bh;
 	int err;
 
-	if (!sb) {
-		/*
-		 * Make sure this function is not called from any
-		 * read-only context.
-		 */
-		if (!nilfs->ns_writer) {
-			WARN_ON(1);
-			err = -EROFS;
-			goto out;
-		}
-		sb = nilfs->ns_writer->s_super;
-	}
-
 	nilfs_transaction_begin(sb, &ti, 0);
 
 	err = -ENOMEM;
@@ -111,7 +97,7 @@ static int nilfs_mdt_create_block(struct inode *inode, unsigned long block,
 	if (buffer_uptodate(bh))
 		goto failed_bh;
 
-	bh->b_bdev = nilfs->ns_bdev;
+	bh->b_bdev = sb->s_bdev;
 	err = nilfs_mdt_insert_new_block(inode, block, bh, init_block);
 	if (likely(!err)) {
 		get_bh(bh);
@@ -128,7 +114,7 @@ static int nilfs_mdt_create_block(struct inode *inode, unsigned long block,
 		err = nilfs_transaction_commit(sb);
 	else
 		nilfs_transaction_abort(sb);
- out:
+
 	return err;
 }
 
@@ -166,9 +152,7 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff,
 		unlock_buffer(bh);
 		goto failed_bh;
 	}
-	bh->b_bdev = NILFS_MDT(inode)->mi_nilfs->ns_bdev;
-	bh->b_blocknr = (sector_t)blknum;
-	set_buffer_mapped(bh);
+	map_bh(bh, inode->i_sb, (sector_t)blknum);
 
 	bh->b_end_io = end_buffer_read_sync;
 	get_bh(bh);
@@ -397,35 +381,24 @@ int nilfs_mdt_fetch_dirty(struct inode *inode)
 static int
 nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
 {
-	struct inode *inode = container_of(page->mapping,
-					   struct inode, i_data);
-	struct super_block *sb = inode->i_sb;
-	struct the_nilfs *nilfs = NILFS_MDT(inode)->mi_nilfs;
-	struct nilfs_sb_info *writer = NULL;
+	struct inode *inode;
+	struct super_block *sb;
 	int err = 0;
 
 	redirty_page_for_writepage(wbc, page);
 	unlock_page(page);
 
-	if (page->mapping->assoc_mapping)
-		return 0; /* Do not request flush for shadow page cache */
-	if (!sb) {
-		down_read(&nilfs->ns_writer_sem);
-		writer = nilfs->ns_writer;
-		if (!writer) {
-			up_read(&nilfs->ns_writer_sem);
-			return -EROFS;
-		}
-		sb = writer->s_super;
-	}
+	inode = page->mapping->host;
+	if (!inode)
+		return 0;
+
+	sb = inode->i_sb;
 
 	if (wbc->sync_mode == WB_SYNC_ALL)
 		err = nilfs_construct_segment(sb);
 	else if (wbc->for_reclaim)
 		nilfs_flush_segment(sb, inode->i_ino);
 
-	if (writer)
-		up_read(&nilfs->ns_writer_sem);
 	return err;
 }
 
@@ -438,105 +411,27 @@ static const struct address_space_operations def_mdt_aops = {
 static const struct inode_operations def_mdt_iops;
 static const struct file_operations def_mdt_fops;
 
-/*
- * NILFS2 uses pseudo inodes for meta data files such as DAT, cpfile, sufile,
- * ifile, or gcinodes.  This allows the B-tree code and segment constructor
- * to treat them like regular files, and this helps to simplify the
- * implementation.
- *   On the other hand, some of the pseudo inodes have an irregular point:
- * They don't have valid inode->i_sb pointer because their lifetimes are
- * longer than those of the super block structs; they may continue for
- * several consecutive mounts/umounts.  This would need discussions.
- */
-/**
- * nilfs_mdt_new_common - allocate a pseudo inode for metadata file
- * @nilfs: nilfs object
- * @sb: super block instance the metadata file belongs to
- * @ino: inode number
- * @gfp_mask: gfp mask for data pages
- * @objsz: size of the private object attached to inode->i_private
- */
-struct inode *
-nilfs_mdt_new_common(struct the_nilfs *nilfs, struct super_block *sb,
-		     ino_t ino, gfp_t gfp_mask, size_t objsz)
+
+int nilfs_mdt_init(struct inode *inode, gfp_t gfp_mask, size_t objsz)
 {
-	struct inode *inode = nilfs_alloc_inode_common(nilfs);
+	struct nilfs_mdt_info *mi;
 
-	if (!inode)
-		return NULL;
-	else {
-		struct address_space * const mapping = &inode->i_data;
-		struct nilfs_mdt_info *mi;
-
-		mi = kzalloc(max(sizeof(*mi), objsz), GFP_NOFS);
-		if (!mi) {
-			nilfs_destroy_inode(inode);
-			return NULL;
-		}
-		mi->mi_nilfs = nilfs;
-		init_rwsem(&mi->mi_sem);
-
-		inode->i_sb = sb; /* sb may be NULL for some meta data files */
-		inode->i_blkbits = nilfs->ns_blocksize_bits;
-		inode->i_flags = 0;
-		atomic_set(&inode->i_count, 1);
-		inode->i_nlink = 1;
-		inode->i_ino = ino;
-		inode->i_mode = S_IFREG;
-		inode->i_private = mi;
-
-#ifdef INIT_UNUSED_INODE_FIELDS
-		atomic_set(&inode->i_writecount, 0);
-		inode->i_size = 0;
-		inode->i_blocks = 0;
-		inode->i_bytes = 0;
-		inode->i_generation = 0;
-#ifdef CONFIG_QUOTA
-		memset(&inode->i_dquot, 0, sizeof(inode->i_dquot));
-#endif
-		inode->i_pipe = NULL;
-		inode->i_bdev = NULL;
-		inode->i_cdev = NULL;
-		inode->i_rdev = 0;
-#ifdef CONFIG_SECURITY
-		inode->i_security = NULL;
-#endif
-		inode->dirtied_when = 0;
-
-		INIT_LIST_HEAD(&inode->i_list);
-		INIT_LIST_HEAD(&inode->i_sb_list);
-		inode->i_state = 0;
-#endif
-
-		spin_lock_init(&inode->i_lock);
-		mutex_init(&inode->i_mutex);
-		init_rwsem(&inode->i_alloc_sem);
-
-		mapping->host = NULL;  /* instead of inode */
-		mapping->flags = 0;
-		mapping_set_gfp_mask(mapping, gfp_mask);
-		mapping->assoc_mapping = NULL;
-		mapping->backing_dev_info = nilfs->ns_bdi;
-
-		inode->i_mapping = mapping;
-	}
+	mi = kzalloc(max(sizeof(*mi), objsz), GFP_NOFS);
+	if (!mi)
+		return -ENOMEM;
 
-	return inode;
-}
+	init_rwsem(&mi->mi_sem);
+	inode->i_private = mi;
 
-struct inode *nilfs_mdt_new(struct the_nilfs *nilfs, struct super_block *sb,
-			    ino_t ino, size_t objsz)
-{
-	struct inode *inode;
-
-	inode = nilfs_mdt_new_common(nilfs, sb, ino, NILFS_MDT_GFP, objsz);
-	if (!inode)
-		return NULL;
+	inode->i_mode = S_IFREG;
+	mapping_set_gfp_mask(inode->i_mapping, gfp_mask);
+	inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi;
 
 	inode->i_op = &def_mdt_iops;
 	inode->i_fop = &def_mdt_fops;
 	inode->i_mapping->a_ops = &def_mdt_aops;
-	return inode;
+
+	return 0;
 }
 
 void nilfs_mdt_set_entry_size(struct inode *inode, unsigned entry_size,
@@ -549,34 +444,159 @@ void nilfs_mdt_set_entry_size(struct inode *inode, unsigned entry_size,
 	mi->mi_first_entry_offset = DIV_ROUND_UP(header_size, entry_size);
 }
 
-void nilfs_mdt_set_shadow(struct inode *orig, struct inode *shadow)
+static const struct address_space_operations shadow_map_aops = {
+	.sync_page		= block_sync_page,
+};
+
+/**
+ * nilfs_mdt_setup_shadow_map - setup shadow map and bind it to metadata file
+ * @inode: inode of the metadata file
+ * @shadow: shadow mapping
+ */
+int nilfs_mdt_setup_shadow_map(struct inode *inode,
+			       struct nilfs_shadow_map *shadow)
 {
-	shadow->i_mapping->assoc_mapping = orig->i_mapping;
-	NILFS_I(shadow)->i_btnode_cache.assoc_mapping =
-		&NILFS_I(orig)->i_btnode_cache;
+	struct nilfs_mdt_info *mi = NILFS_MDT(inode);
+	struct backing_dev_info *bdi = inode->i_sb->s_bdi;
+
+	INIT_LIST_HEAD(&shadow->frozen_buffers);
+	nilfs_mapping_init_once(&shadow->frozen_data);
+	nilfs_mapping_init(&shadow->frozen_data, bdi, &shadow_map_aops);
+	nilfs_mapping_init_once(&shadow->frozen_btnodes);
+	nilfs_mapping_init(&shadow->frozen_btnodes, bdi, &shadow_map_aops);
+	mi->mi_shadow = shadow;
+	return 0;
 }
 
-static void nilfs_mdt_clear(struct inode *inode)
+/**
+ * nilfs_mdt_save_to_shadow_map - copy bmap and dirty pages to shadow map
+ * @inode: inode of the metadata file
+ */
+int nilfs_mdt_save_to_shadow_map(struct inode *inode)
 {
+	struct nilfs_mdt_info *mi = NILFS_MDT(inode);
 	struct nilfs_inode_info *ii = NILFS_I(inode);
+	struct nilfs_shadow_map *shadow = mi->mi_shadow;
+	int ret;
 
-	invalidate_mapping_pages(inode->i_mapping, 0, -1);
-	truncate_inode_pages(inode->i_mapping, 0);
+	ret = nilfs_copy_dirty_pages(&shadow->frozen_data, inode->i_mapping);
+	if (ret)
+		goto out;
+
+	ret = nilfs_copy_dirty_pages(&shadow->frozen_btnodes,
+				     &ii->i_btnode_cache);
+	if (ret)
+		goto out;
 
-	if (test_bit(NILFS_I_BMAP, &ii->i_state))
-		nilfs_bmap_clear(ii->i_bmap);
-	nilfs_btnode_cache_clear(&ii->i_btnode_cache);
+	nilfs_bmap_save(ii->i_bmap, &shadow->bmap_store);
+ out:
+	return ret;
 }
 
-void nilfs_mdt_destroy(struct inode *inode)
+int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh)
 {
-	struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
+	struct nilfs_shadow_map *shadow = NILFS_MDT(inode)->mi_shadow;
+	struct buffer_head *bh_frozen;
+	struct page *page;
+	int blkbits = inode->i_blkbits;
+	int ret = -ENOMEM;
+
+	page = grab_cache_page(&shadow->frozen_data, bh->b_page->index);
+	if (!page)
+		return ret;
+
+	if (!page_has_buffers(page))
+		create_empty_buffers(page, 1 << blkbits, 0);
+
+	bh_frozen = nilfs_page_get_nth_block(page, bh_offset(bh) >> blkbits);
+	if (bh_frozen) {
+		if (!buffer_uptodate(bh_frozen))
+			nilfs_copy_buffer(bh_frozen, bh);
+		if (list_empty(&bh_frozen->b_assoc_buffers)) {
+			list_add_tail(&bh_frozen->b_assoc_buffers,
+				      &shadow->frozen_buffers);
+			set_buffer_nilfs_redirected(bh);
+		} else {
+			brelse(bh_frozen); /* already frozen */
+		}
+		ret = 0;
+	}
+	unlock_page(page);
+	page_cache_release(page);
+	return ret;
+}
+
+struct buffer_head *
+nilfs_mdt_get_frozen_buffer(struct inode *inode, struct buffer_head *bh)
+{
+	struct nilfs_shadow_map *shadow = NILFS_MDT(inode)->mi_shadow;
+	struct buffer_head *bh_frozen = NULL;
+	struct page *page;
+	int n;
+
+	page = find_lock_page(&shadow->frozen_data, bh->b_page->index);
+	if (page) {
+		if (page_has_buffers(page)) {
+			n = bh_offset(bh) >> inode->i_blkbits;
+			bh_frozen = nilfs_page_get_nth_block(page, n);
+		}
+		unlock_page(page);
+		page_cache_release(page);
+	}
+	return bh_frozen;
+}
+
+static void nilfs_release_frozen_buffers(struct nilfs_shadow_map *shadow)
+{
+	struct list_head *head = &shadow->frozen_buffers;
+	struct buffer_head *bh;
+
+	while (!list_empty(head)) {
+		bh = list_first_entry(head, struct buffer_head,
+				      b_assoc_buffers);
+		list_del_init(&bh->b_assoc_buffers);
+		brelse(bh); /* drop ref-count to make it releasable */
+	}
+}
+
+/**
+ * nilfs_mdt_restore_from_shadow_map - restore dirty pages and bmap state
+ * @inode: inode of the metadata file
+ */
+void nilfs_mdt_restore_from_shadow_map(struct inode *inode)
+{
+	struct nilfs_mdt_info *mi = NILFS_MDT(inode);
+	struct nilfs_inode_info *ii = NILFS_I(inode);
+	struct nilfs_shadow_map *shadow = mi->mi_shadow;
+
+	down_write(&mi->mi_sem);
 
-	if (mdi->mi_palloc_cache)
-		nilfs_palloc_destroy_cache(inode);
-	nilfs_mdt_clear(inode);
+	if (mi->mi_palloc_cache)
+		nilfs_palloc_clear_cache(inode);
+
+	nilfs_clear_dirty_pages(inode->i_mapping);
+	nilfs_copy_back_pages(inode->i_mapping, &shadow->frozen_data);
+
+	nilfs_clear_dirty_pages(&ii->i_btnode_cache);
+	nilfs_copy_back_pages(&ii->i_btnode_cache, &shadow->frozen_btnodes);
+
+	nilfs_bmap_restore(ii->i_bmap, &shadow->bmap_store);
+
+	up_write(&mi->mi_sem);
+}
+
+/**
+ * nilfs_mdt_clear_shadow_map - truncate pages in shadow map caches
+ * @inode: inode of the metadata file
+ */
+void nilfs_mdt_clear_shadow_map(struct inode *inode)
+{
+	struct nilfs_mdt_info *mi = NILFS_MDT(inode);
+	struct nilfs_shadow_map *shadow = mi->mi_shadow;
 
-	kfree(mdi->mi_bgl); /* kfree(NULL) is safe */
-	kfree(mdi);
-	nilfs_destroy_inode(inode);
+	down_write(&mi->mi_sem);
+	nilfs_release_frozen_buffers(shadow);
+	truncate_inode_pages(&shadow->frozen_data, 0);
+	truncate_inode_pages(&shadow->frozen_btnodes, 0);
+	up_write(&mi->mi_sem);
 }
diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h
index 6c4bbb0470f..b13734bf352 100644
--- a/fs/nilfs2/mdt.h
+++ b/fs/nilfs2/mdt.h
@@ -28,26 +28,33 @@
 #include "nilfs.h"
 #include "page.h"
 
+struct nilfs_shadow_map {
+	struct nilfs_bmap_store bmap_store;
+	struct address_space frozen_data;
+	struct address_space frozen_btnodes;
+	struct list_head frozen_buffers;
+};
+
 /**
  * struct nilfs_mdt_info - on-memory private data of meta data files
- * @mi_nilfs: back pointer to the_nilfs struct
  * @mi_sem: reader/writer semaphore for meta data operations
  * @mi_bgl: per-blockgroup locking
  * @mi_entry_size: size of an entry
  * @mi_first_entry_offset: offset to the first entry
  * @mi_entries_per_block: number of entries in a block
  * @mi_palloc_cache: persistent object allocator cache
+ * @mi_shadow: shadow of bmap and page caches
  * @mi_blocks_per_group: number of blocks in a group
  * @mi_blocks_per_desc_block: number of blocks per descriptor block
  */
 struct nilfs_mdt_info {
-	struct the_nilfs       *mi_nilfs;
 	struct rw_semaphore	mi_sem;
 	struct blockgroup_lock *mi_bgl;
 	unsigned		mi_entry_size;
 	unsigned		mi_first_entry_offset;
 	unsigned long		mi_entries_per_block;
 	struct nilfs_palloc_cache *mi_palloc_cache;
+	struct nilfs_shadow_map *mi_shadow;
 	unsigned long		mi_blocks_per_group;
 	unsigned long		mi_blocks_per_desc_block;
 };
@@ -59,9 +66,7 @@ static inline struct nilfs_mdt_info *NILFS_MDT(const struct inode *inode)
 
 static inline struct the_nilfs *NILFS_I_NILFS(struct inode *inode)
 {
-	struct super_block *sb = inode->i_sb;
-
-	return sb ? NILFS_SB(sb)->s_nilfs : NILFS_MDT(inode)->mi_nilfs;
+	return NILFS_SB(inode->i_sb)->s_nilfs;
 }
 
 /* Default GFP flags using highmem */
@@ -76,14 +81,17 @@ int nilfs_mdt_forget_block(struct inode *, unsigned long);
 int nilfs_mdt_mark_block_dirty(struct inode *, unsigned long);
 int nilfs_mdt_fetch_dirty(struct inode *);
 
-struct inode *nilfs_mdt_new(struct the_nilfs *, struct super_block *, ino_t,
-			    size_t);
-struct inode *nilfs_mdt_new_common(struct the_nilfs *, struct super_block *,
-				   ino_t, gfp_t, size_t);
-void nilfs_mdt_destroy(struct inode *);
+int nilfs_mdt_init(struct inode *inode, gfp_t gfp_mask, size_t objsz);
 void nilfs_mdt_set_entry_size(struct inode *, unsigned, unsigned);
-void nilfs_mdt_set_shadow(struct inode *, struct inode *);
 
+int nilfs_mdt_setup_shadow_map(struct inode *inode,
+			       struct nilfs_shadow_map *shadow);
+int nilfs_mdt_save_to_shadow_map(struct inode *inode);
+void nilfs_mdt_restore_from_shadow_map(struct inode *inode);
+void nilfs_mdt_clear_shadow_map(struct inode *inode);
+int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh);
+struct buffer_head *nilfs_mdt_get_frozen_buffer(struct inode *inode,
+						struct buffer_head *bh);
 
 #define nilfs_mdt_mark_buffer_dirty(bh)	nilfs_mark_buffer_dirty(bh)
 
@@ -100,7 +108,7 @@ static inline void nilfs_mdt_clear_dirty(struct inode *inode)
 
 static inline __u64 nilfs_mdt_cno(struct inode *inode)
 {
-	return NILFS_MDT(inode)->mi_nilfs->ns_cno;
+	return NILFS_I_NILFS(inode)->ns_cno;
 }
 
 #define nilfs_mdt_bgl_lock(inode, bg) \
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index ad6ed2cf19b..6e9557ecf16 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -40,7 +40,11 @@
 
 #include <linux/pagemap.h>
 #include "nilfs.h"
+#include "export.h"
 
+#define NILFS_FID_SIZE_NON_CONNECTABLE \
+	(offsetof(struct nilfs_fid, parent_gen) / 4)
+#define NILFS_FID_SIZE_CONNECTABLE	(sizeof(struct nilfs_fid) / 4)
 
 static inline int nilfs_add_nondir(struct dentry *dentry, struct inode *inode)
 {
@@ -70,29 +74,13 @@ nilfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 	ino = nilfs_inode_by_name(dir, &dentry->d_name);
 	inode = NULL;
 	if (ino) {
-		inode = nilfs_iget(dir->i_sb, ino);
+		inode = nilfs_iget(dir->i_sb, NILFS_I(dir)->i_root, ino);
 		if (IS_ERR(inode))
 			return ERR_CAST(inode);
 	}
 	return d_splice_alias(inode, dentry);
 }
 
-struct dentry *nilfs_get_parent(struct dentry *child)
-{
-	unsigned long ino;
-	struct inode *inode;
-	struct qstr dotdot = {.name = "..", .len = 2};
-
-	ino = nilfs_inode_by_name(child->d_inode, &dotdot);
-	if (!ino)
-		return ERR_PTR(-ENOENT);
-
-	inode = nilfs_iget(child->d_inode->i_sb, ino);
-	if (IS_ERR(inode))
-		return ERR_CAST(inode);
-	return d_obtain_alias(inode);
-}
-
 /*
  * By the time this is called, we already have created
  * the directory cache entry for the new file, but it
@@ -219,7 +207,7 @@ static int nilfs_link(struct dentry *old_dentry, struct inode *dir,
 
 	inode->i_ctime = CURRENT_TIME;
 	inode_inc_link_count(inode);
-	atomic_inc(&inode->i_count);
+	ihold(inode);
 
 	err = nilfs_add_nondir(dentry, inode);
 	if (!err)
@@ -468,6 +456,115 @@ out:
 	return err;
 }
 
+/*
+ * Export operations
+ */
+static struct dentry *nilfs_get_parent(struct dentry *child)
+{
+	unsigned long ino;
+	struct inode *inode;
+	struct qstr dotdot = {.name = "..", .len = 2};
+	struct nilfs_root *root;
+
+	ino = nilfs_inode_by_name(child->d_inode, &dotdot);
+	if (!ino)
+		return ERR_PTR(-ENOENT);
+
+	root = NILFS_I(child->d_inode)->i_root;
+
+	inode = nilfs_iget(child->d_inode->i_sb, root, ino);
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
+
+	return d_obtain_alias(inode);
+}
+
+static struct dentry *nilfs_get_dentry(struct super_block *sb, u64 cno,
+				       u64 ino, u32 gen)
+{
+	struct nilfs_root *root;
+	struct inode *inode;
+
+	if (ino < NILFS_FIRST_INO(sb) && ino != NILFS_ROOT_INO)
+		return ERR_PTR(-ESTALE);
+
+	root = nilfs_lookup_root(NILFS_SB(sb)->s_nilfs, cno);
+	if (!root)
+		return ERR_PTR(-ESTALE);
+
+	inode = nilfs_iget(sb, root, ino);
+	nilfs_put_root(root);
+
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
+	if (gen && inode->i_generation != gen) {
+		iput(inode);
+		return ERR_PTR(-ESTALE);
+	}
+	return d_obtain_alias(inode);
+}
+
+static struct dentry *nilfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
+					 int fh_len, int fh_type)
+{
+	struct nilfs_fid *fid = (struct nilfs_fid *)fh;
+
+	if ((fh_len != NILFS_FID_SIZE_NON_CONNECTABLE &&
+	     fh_len != NILFS_FID_SIZE_CONNECTABLE) ||
+	    (fh_type != FILEID_NILFS_WITH_PARENT &&
+	     fh_type != FILEID_NILFS_WITHOUT_PARENT))
+		return NULL;
+
+	return nilfs_get_dentry(sb, fid->cno, fid->ino, fid->gen);
+}
+
+static struct dentry *nilfs_fh_to_parent(struct super_block *sb, struct fid *fh,
+					 int fh_len, int fh_type)
+{
+	struct nilfs_fid *fid = (struct nilfs_fid *)fh;
+
+	if (fh_len != NILFS_FID_SIZE_CONNECTABLE ||
+	    fh_type != FILEID_NILFS_WITH_PARENT)
+		return NULL;
+
+	return nilfs_get_dentry(sb, fid->cno, fid->parent_ino, fid->parent_gen);
+}
+
+static int nilfs_encode_fh(struct dentry *dentry, __u32 *fh, int *lenp,
+			   int connectable)
+{
+	struct nilfs_fid *fid = (struct nilfs_fid *)fh;
+	struct inode *inode = dentry->d_inode;
+	struct nilfs_root *root = NILFS_I(inode)->i_root;
+	int type;
+
+	if (*lenp < NILFS_FID_SIZE_NON_CONNECTABLE ||
+	    (connectable && *lenp < NILFS_FID_SIZE_CONNECTABLE))
+		return 255;
+
+	fid->cno = root->cno;
+	fid->ino = inode->i_ino;
+	fid->gen = inode->i_generation;
+
+	if (connectable && !S_ISDIR(inode->i_mode)) {
+		struct inode *parent;
+
+		spin_lock(&dentry->d_lock);
+		parent = dentry->d_parent->d_inode;
+		fid->parent_ino = parent->i_ino;
+		fid->parent_gen = parent->i_generation;
+		spin_unlock(&dentry->d_lock);
+
+		type = FILEID_NILFS_WITH_PARENT;
+		*lenp = NILFS_FID_SIZE_CONNECTABLE;
+	} else {
+		type = FILEID_NILFS_WITHOUT_PARENT;
+		*lenp = NILFS_FID_SIZE_NON_CONNECTABLE;
+	}
+
+	return type;
+}
+
 const struct inode_operations nilfs_dir_inode_operations = {
 	.create		= nilfs_create,
 	.lookup		= nilfs_lookup,
@@ -491,4 +588,12 @@ const struct inode_operations nilfs_symlink_inode_operations = {
 	.readlink	= generic_readlink,
 	.follow_link	= page_follow_link_light,
 	.put_link	= page_put_link,
+	.permission     = nilfs_permission,
+};
+
+const struct export_operations nilfs_export_ops = {
+	.encode_fh = nilfs_encode_fh,
+	.fh_to_dentry = nilfs_fh_to_dentry,
+	.fh_to_parent = nilfs_fh_to_parent,
+	.get_parent = nilfs_get_parent,
 };
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 47d6d792812..f7560da5a56 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -32,7 +32,6 @@
 #include "the_nilfs.h"
 #include "sb.h"
 #include "bmap.h"
-#include "bmap_union.h"
 
 /*
  * nilfs inode data in memory
@@ -41,7 +40,7 @@ struct nilfs_inode_info {
 	__u32 i_flags;
 	unsigned long  i_state;		/* Dynamic state flags */
 	struct nilfs_bmap *i_bmap;
-	union nilfs_bmap_union i_bmap_union;
+	struct nilfs_bmap i_bmap_data;
 	__u64 i_xattr;	/* sector_t ??? */
 	__u32 i_dir_start_lookup;
 	__u64 i_cno;		/* check point number for GC inode */
@@ -60,6 +59,7 @@ struct nilfs_inode_info {
 #endif
 	struct buffer_head *i_bh;	/* i_bh contains a new or dirty
 					   disk inode */
+	struct nilfs_root *i_root;
 	struct inode vfs_inode;
 };
 
@@ -71,9 +71,7 @@ static inline struct nilfs_inode_info *NILFS_I(const struct inode *inode)
 static inline struct nilfs_inode_info *
 NILFS_BMAP_I(const struct nilfs_bmap *bmap)
 {
-	return container_of((union nilfs_bmap_union *)bmap,
-			    struct nilfs_inode_info,
-			    i_bmap_union);
+	return container_of(bmap, struct nilfs_inode_info, i_bmap_data);
 }
 
 static inline struct inode *NILFS_BTNC_I(struct address_space *btnc)
@@ -103,7 +101,14 @@ enum {
 	NILFS_I_INODE_DIRTY,		/* write_inode is requested */
 	NILFS_I_BMAP,			/* has bmap and btnode_cache */
 	NILFS_I_GCINODE,		/* inode for GC, on memory only */
-	NILFS_I_GCDAT,			/* shadow DAT, on memory only */
+};
+
+/*
+ * commit flags for nilfs_commit_super and nilfs_sync_super
+ */
+enum {
+	NILFS_SB_COMMIT = 0,	/* Commit a super block alternately */
+	NILFS_SB_COMMIT_ALL	/* Commit both super blocks */
 };
 
 /*
@@ -187,7 +192,7 @@ static inline int nilfs_doing_construction(void)
 
 static inline struct inode *nilfs_dat_inode(const struct the_nilfs *nilfs)
 {
-	return nilfs_doing_gc() ? nilfs->ns_gc_dat : nilfs->ns_dat;
+	return nilfs->ns_dat;
 }
 
 /*
@@ -195,12 +200,9 @@ static inline struct inode *nilfs_dat_inode(const struct the_nilfs *nilfs)
  */
 #ifdef CONFIG_NILFS_POSIX_ACL
 #error "NILFS: not yet supported POSIX ACL"
-extern int nilfs_permission(struct inode *, int, struct nameidata *);
 extern int nilfs_acl_chmod(struct inode *);
 extern int nilfs_init_acl(struct inode *, struct inode *);
 #else
-#define nilfs_permission   NULL
-
 static inline int nilfs_acl_chmod(struct inode *inode)
 {
 	return 0;
@@ -242,11 +244,19 @@ extern int nilfs_get_block(struct inode *, sector_t, struct buffer_head *, int);
 extern void nilfs_set_inode_flags(struct inode *);
 extern int nilfs_read_inode_common(struct inode *, struct nilfs_inode *);
 extern void nilfs_write_inode_common(struct inode *, struct nilfs_inode *, int);
-extern struct inode *nilfs_iget(struct super_block *, unsigned long);
+struct inode *nilfs_ilookup(struct super_block *sb, struct nilfs_root *root,
+			    unsigned long ino);
+struct inode *nilfs_iget_locked(struct super_block *sb, struct nilfs_root *root,
+				unsigned long ino);
+struct inode *nilfs_iget(struct super_block *sb, struct nilfs_root *root,
+			 unsigned long ino);
+extern struct inode *nilfs_iget_for_gc(struct super_block *sb,
+				       unsigned long ino, __u64 cno);
 extern void nilfs_update_inode(struct inode *, struct buffer_head *);
 extern void nilfs_truncate(struct inode *);
-extern void nilfs_delete_inode(struct inode *);
+extern void nilfs_evict_inode(struct inode *);
 extern int nilfs_setattr(struct dentry *, struct iattr *);
+int nilfs_permission(struct inode *inode, int mask);
 extern int nilfs_load_inode_block(struct nilfs_sb_info *, struct inode *,
 				  struct buffer_head **);
 extern int nilfs_inode_dirty(struct inode *);
@@ -255,11 +265,7 @@ extern int nilfs_set_file_dirty(struct nilfs_sb_info *, struct inode *,
 extern int nilfs_mark_inode_dirty(struct inode *);
 extern void nilfs_dirty_inode(struct inode *);
 
-/* namei.c */
-extern struct dentry *nilfs_get_parent(struct dentry *);
-
 /* super.c */
-extern struct inode *nilfs_alloc_inode_common(struct the_nilfs *);
 extern struct inode *nilfs_alloc_inode(struct super_block *);
 extern void nilfs_destroy_inode(struct inode *);
 extern void nilfs_error(struct super_block *, const char *, const char *, ...)
@@ -270,9 +276,17 @@ extern struct nilfs_super_block *
 nilfs_read_super_block(struct super_block *, u64, int, struct buffer_head **);
 extern int nilfs_store_magic_and_option(struct super_block *,
 					struct nilfs_super_block *, char *);
+extern int nilfs_check_feature_compatibility(struct super_block *,
+					     struct nilfs_super_block *);
+extern void nilfs_set_log_cursor(struct nilfs_super_block *,
+				 struct the_nilfs *);
+extern struct nilfs_super_block **nilfs_prepare_super(struct nilfs_sb_info *,
+						      int flip);
 extern int nilfs_commit_super(struct nilfs_sb_info *, int);
-extern int nilfs_attach_checkpoint(struct nilfs_sb_info *, __u64);
-extern void nilfs_detach_checkpoint(struct nilfs_sb_info *);
+extern int nilfs_cleanup_super(struct nilfs_sb_info *);
+int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno, int curr_mnt,
+			    struct nilfs_root **root);
+int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno);
 
 /* gcinode.c */
 int nilfs_gccache_submit_read_data(struct inode *, sector_t, sector_t, __u64,
@@ -280,16 +294,8 @@ int nilfs_gccache_submit_read_data(struct inode *, sector_t, sector_t, __u64,
 int nilfs_gccache_submit_read_node(struct inode *, sector_t, __u64,
 				   struct buffer_head **);
 int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *);
-int nilfs_init_gccache(struct the_nilfs *);
-void nilfs_destroy_gccache(struct the_nilfs *);
-void nilfs_clear_gcinode(struct inode *);
-struct inode *nilfs_gc_iget(struct the_nilfs *, ino_t, __u64);
-void nilfs_remove_all_gcinode(struct the_nilfs *);
-
-/* gcdat.c */
-int nilfs_init_gcdat_inode(struct the_nilfs *);
-void nilfs_commit_gcdat_inode(struct the_nilfs *);
-void nilfs_clear_gcdat_inode(struct the_nilfs *);
+int nilfs_init_gcinode(struct inode *inode);
+void nilfs_remove_all_gcinodes(struct the_nilfs *nilfs);
 
 /*
  * Inodes and files operations
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 8de3e1e4813..a6c3c2e817f 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -37,7 +37,8 @@
 
 #define NILFS_BUFFER_INHERENT_BITS  \
 	((1UL << BH_Uptodate) | (1UL << BH_Mapped) | (1UL << BH_NILFS_Node) | \
-	 (1UL << BH_NILFS_Volatile) | (1UL << BH_NILFS_Allocated))
+	 (1UL << BH_NILFS_Volatile) | (1UL << BH_NILFS_Allocated) | \
+	 (1UL << BH_NILFS_Checked))
 
 static struct buffer_head *
 __nilfs_get_page_block(struct page *page, unsigned long block, pgoff_t index,
@@ -78,8 +79,8 @@ struct buffer_head *nilfs_grab_buffer(struct inode *inode,
 {
 	int blkbits = inode->i_blkbits;
 	pgoff_t index = blkoff >> (PAGE_CACHE_SHIFT - blkbits);
-	struct page *page, *opage;
-	struct buffer_head *bh, *obh;
+	struct page *page;
+	struct buffer_head *bh;
 
 	page = grab_cache_page(mapping, index);
 	if (unlikely(!page))
@@ -91,30 +92,6 @@ struct buffer_head *nilfs_grab_buffer(struct inode *inode,
 		page_cache_release(page);
 		return NULL;
 	}
-	if (!buffer_uptodate(bh) && mapping->assoc_mapping != NULL) {
-		/*
-		 * Shadow page cache uses assoc_mapping to point its original
-		 * page cache.  The following code tries the original cache
-		 * if the given cache is a shadow and it didn't hit.
-		 */
-		opage = find_lock_page(mapping->assoc_mapping, index);
-		if (!opage)
-			return bh;
-
-		obh = __nilfs_get_page_block(opage, blkoff, index, blkbits,
-					     b_state);
-		if (buffer_uptodate(obh)) {
-			nilfs_copy_buffer(bh, obh);
-			if (buffer_dirty(obh)) {
-				nilfs_mark_buffer_dirty(bh);
-				if (!buffer_nilfs_node(bh) && NILFS_MDT(inode))
-					nilfs_mdt_mark_dirty(inode);
-			}
-		}
-		brelse(obh);
-		unlock_page(opage);
-		page_cache_release(opage);
-	}
 	return bh;
 }
 
@@ -129,6 +106,8 @@ void nilfs_forget_buffer(struct buffer_head *bh)
 
 	lock_buffer(bh);
 	clear_buffer_nilfs_volatile(bh);
+	clear_buffer_nilfs_checked(bh);
+	clear_buffer_nilfs_redirected(bh);
 	clear_buffer_dirty(bh);
 	if (nilfs_page_buffers_clean(page))
 		__nilfs_clear_page_dirty(page);
@@ -480,6 +459,8 @@ void nilfs_clear_dirty_pages(struct address_space *mapping)
 				lock_buffer(bh);
 				clear_buffer_dirty(bh);
 				clear_buffer_nilfs_volatile(bh);
+				clear_buffer_nilfs_checked(bh);
+				clear_buffer_nilfs_redirected(bh);
 				clear_buffer_uptodate(bh);
 				clear_buffer_mapped(bh);
 				unlock_buffer(bh);
@@ -510,6 +491,31 @@ unsigned nilfs_page_count_clean_buffers(struct page *page,
 	}
 	return nc;
 }
+ 
+void nilfs_mapping_init_once(struct address_space *mapping)
+{
+	memset(mapping, 0, sizeof(*mapping));
+	INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
+	spin_lock_init(&mapping->tree_lock);
+	INIT_LIST_HEAD(&mapping->private_list);
+	spin_lock_init(&mapping->private_lock);
+
+	spin_lock_init(&mapping->i_mmap_lock);
+	INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
+	INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
+}
+
+void nilfs_mapping_init(struct address_space *mapping,
+			struct backing_dev_info *bdi,
+			const struct address_space_operations *aops)
+{
+	mapping->host = NULL;
+	mapping->flags = 0;
+	mapping_set_gfp_mask(mapping, GFP_NOFS);
+	mapping->assoc_mapping = NULL;
+	mapping->backing_dev_info = bdi;
+	mapping->a_ops = aops;
+}
 
 /*
  * NILFS2 needs clear_page_dirty() in the following two cases:
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
index 8abca4d1c1f..fb9e8a8a203 100644
--- a/fs/nilfs2/page.h
+++ b/fs/nilfs2/page.h
@@ -34,11 +34,15 @@ enum {
 	BH_NILFS_Allocated = BH_PrivateStart,
 	BH_NILFS_Node,
 	BH_NILFS_Volatile,
+	BH_NILFS_Checked,
+	BH_NILFS_Redirected,
 };
 
 BUFFER_FNS(NILFS_Allocated, nilfs_allocated)	/* nilfs private buffers */
 BUFFER_FNS(NILFS_Node, nilfs_node)		/* nilfs node buffers */
 BUFFER_FNS(NILFS_Volatile, nilfs_volatile)
+BUFFER_FNS(NILFS_Checked, nilfs_checked)	/* buffer is verified */
+BUFFER_FNS(NILFS_Redirected, nilfs_redirected)	/* redirected to a copy */
 
 
 void nilfs_mark_buffer_dirty(struct buffer_head *bh);
@@ -57,6 +61,10 @@ void nilfs_free_private_page(struct page *);
 int nilfs_copy_dirty_pages(struct address_space *, struct address_space *);
 void nilfs_copy_back_pages(struct address_space *, struct address_space *);
 void nilfs_clear_dirty_pages(struct address_space *);
+void nilfs_mapping_init_once(struct address_space *mapping);
+void nilfs_mapping_init(struct address_space *mapping,
+			struct backing_dev_info *bdi,
+			const struct address_space_operations *aops);
 unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned);
 
 #define NILFS_PAGE_BUG(page, m, a...) \
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index bae2a516b4e..5d2711c28da 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -91,27 +91,9 @@ static int nilfs_warn_segment_error(int err)
 	return -EINVAL;
 }
 
-static void store_segsum_info(struct nilfs_segsum_info *ssi,
-			      struct nilfs_segment_summary *sum,
-			      unsigned int blocksize)
-{
-	ssi->flags = le16_to_cpu(sum->ss_flags);
-	ssi->seg_seq = le64_to_cpu(sum->ss_seq);
-	ssi->ctime = le64_to_cpu(sum->ss_create);
-	ssi->next = le64_to_cpu(sum->ss_next);
-	ssi->nblocks = le32_to_cpu(sum->ss_nblocks);
-	ssi->nfinfo = le32_to_cpu(sum->ss_nfinfo);
-	ssi->sumbytes = le32_to_cpu(sum->ss_sumbytes);
-
-	ssi->nsumblk = DIV_ROUND_UP(ssi->sumbytes, blocksize);
-	ssi->nfileblk = ssi->nblocks - ssi->nsumblk - !!NILFS_SEG_HAS_SR(ssi);
-
-	/* need to verify ->ss_bytes field if read ->ss_cno */
-}
-
 /**
- * calc_crc_cont - check CRC of blocks continuously
- * @sbi: nilfs_sb_info
+ * nilfs_compute_checksum - compute checksum of blocks continuously
+ * @nilfs: nilfs object
  * @bhs: buffer head of start block
  * @sum: place to store result
  * @offset: offset bytes in the first block
@@ -119,23 +101,25 @@ static void store_segsum_info(struct nilfs_segsum_info *ssi,
  * @start: DBN of start block
  * @nblock: number of blocks to be checked
  */
-static int calc_crc_cont(struct nilfs_sb_info *sbi, struct buffer_head *bhs,
-			 u32 *sum, unsigned long offset, u64 check_bytes,
-			 sector_t start, unsigned long nblock)
+static int nilfs_compute_checksum(struct the_nilfs *nilfs,
+				  struct buffer_head *bhs, u32 *sum,
+				  unsigned long offset, u64 check_bytes,
+				  sector_t start, unsigned long nblock)
 {
-	unsigned long blocksize = sbi->s_super->s_blocksize;
+	unsigned int blocksize = nilfs->ns_blocksize;
 	unsigned long size;
 	u32 crc;
 
 	BUG_ON(offset >= blocksize);
 	check_bytes -= offset;
 	size = min_t(u64, check_bytes, blocksize - offset);
-	crc = crc32_le(sbi->s_nilfs->ns_crc_seed,
+	crc = crc32_le(nilfs->ns_crc_seed,
 		       (unsigned char *)bhs->b_data + offset, size);
 	if (--nblock > 0) {
 		do {
-			struct buffer_head *bh
-				= sb_bread(sbi->s_super, ++start);
+			struct buffer_head *bh;
+
+			bh = __bread(nilfs->ns_bdev, ++start, blocksize);
 			if (!bh)
 				return -EIO;
 			check_bytes -= size;
@@ -150,12 +134,12 @@ static int calc_crc_cont(struct nilfs_sb_info *sbi, struct buffer_head *bhs,
 
 /**
  * nilfs_read_super_root_block - read super root block
- * @sb: super_block
+ * @nilfs: nilfs object
  * @sr_block: disk block number of the super root block
  * @pbh: address of a buffer_head pointer to return super root buffer
  * @check: CRC check flag
  */
-int nilfs_read_super_root_block(struct super_block *sb, sector_t sr_block,
+int nilfs_read_super_root_block(struct the_nilfs *nilfs, sector_t sr_block,
 				struct buffer_head **pbh, int check)
 {
 	struct buffer_head *bh_sr;
@@ -164,7 +148,7 @@ int nilfs_read_super_root_block(struct super_block *sb, sector_t sr_block,
 	int ret;
 
 	*pbh = NULL;
-	bh_sr = sb_bread(sb, sr_block);
+	bh_sr = __bread(nilfs->ns_bdev, sr_block, nilfs->ns_blocksize);
 	if (unlikely(!bh_sr)) {
 		ret = NILFS_SEG_FAIL_IO;
 		goto failed;
@@ -174,12 +158,13 @@ int nilfs_read_super_root_block(struct super_block *sb, sector_t sr_block,
 	if (check) {
 		unsigned bytes = le16_to_cpu(sr->sr_bytes);
 
-		if (bytes == 0 || bytes > sb->s_blocksize) {
+		if (bytes == 0 || bytes > nilfs->ns_blocksize) {
 			ret = NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT;
 			goto failed_bh;
 		}
-		if (calc_crc_cont(NILFS_SB(sb), bh_sr, &crc,
-				  sizeof(sr->sr_sum), bytes, sr_block, 1)) {
+		if (nilfs_compute_checksum(
+			    nilfs, bh_sr, &crc, sizeof(sr->sr_sum), bytes,
+			    sr_block, 1)) {
 			ret = NILFS_SEG_FAIL_IO;
 			goto failed_bh;
 		}
@@ -199,64 +184,76 @@ int nilfs_read_super_root_block(struct super_block *sb, sector_t sr_block,
 }
 
 /**
- * load_segment_summary - read segment summary of the specified partial segment
- * @sbi: nilfs_sb_info
- * @pseg_start: start disk block number of partial segment
- * @seg_seq: sequence number requested
- * @ssi: pointer to nilfs_segsum_info struct to store information
+ * nilfs_read_log_header - read summary header of the specified log
+ * @nilfs: nilfs object
+ * @start_blocknr: start block number of the log
+ * @sum: pointer to return segment summary structure
  */
-static int
-load_segment_summary(struct nilfs_sb_info *sbi, sector_t pseg_start,
-		     u64 seg_seq, struct nilfs_segsum_info *ssi)
+static struct buffer_head *
+nilfs_read_log_header(struct the_nilfs *nilfs, sector_t start_blocknr,
+		      struct nilfs_segment_summary **sum)
 {
 	struct buffer_head *bh_sum;
-	struct nilfs_segment_summary *sum;
+
+	bh_sum = __bread(nilfs->ns_bdev, start_blocknr, nilfs->ns_blocksize);
+	if (bh_sum)
+		*sum = (struct nilfs_segment_summary *)bh_sum->b_data;
+	return bh_sum;
+}
+
+/**
+ * nilfs_validate_log - verify consistency of log
+ * @nilfs: nilfs object
+ * @seg_seq: sequence number of segment
+ * @bh_sum: buffer head of summary block
+ * @sum: segment summary struct
+ */
+static int nilfs_validate_log(struct the_nilfs *nilfs, u64 seg_seq,
+			      struct buffer_head *bh_sum,
+			      struct nilfs_segment_summary *sum)
+{
 	unsigned long nblock;
 	u32 crc;
-	int ret = NILFS_SEG_FAIL_IO;
+	int ret;
 
-	bh_sum = sb_bread(sbi->s_super, pseg_start);
-	if (!bh_sum)
+	ret = NILFS_SEG_FAIL_MAGIC;
+	if (le32_to_cpu(sum->ss_magic) != NILFS_SEGSUM_MAGIC)
 		goto out;
 
-	sum = (struct nilfs_segment_summary *)bh_sum->b_data;
-
-	/* Check consistency of segment summary */
-	if (le32_to_cpu(sum->ss_magic) != NILFS_SEGSUM_MAGIC) {
-		ret = NILFS_SEG_FAIL_MAGIC;
-		goto failed;
-	}
-	store_segsum_info(ssi, sum, sbi->s_super->s_blocksize);
-	if (seg_seq != ssi->seg_seq) {
-		ret = NILFS_SEG_FAIL_SEQ;
-		goto failed;
-	}
+	ret = NILFS_SEG_FAIL_SEQ;
+	if (le64_to_cpu(sum->ss_seq) != seg_seq)
+		goto out;
 
-	nblock = ssi->nblocks;
-	if (unlikely(nblock == 0 ||
-		     nblock > sbi->s_nilfs->ns_blocks_per_segment)) {
+	nblock = le32_to_cpu(sum->ss_nblocks);
+	ret = NILFS_SEG_FAIL_CONSISTENCY;
+	if (unlikely(nblock == 0 || nblock > nilfs->ns_blocks_per_segment))
 		/* This limits the number of blocks read in the CRC check */
-		ret = NILFS_SEG_FAIL_CONSISTENCY;
-		goto failed;
-	}
-	if (calc_crc_cont(sbi, bh_sum, &crc, sizeof(sum->ss_datasum),
-			  ((u64)nblock << sbi->s_super->s_blocksize_bits),
-			  pseg_start, nblock)) {
-		ret = NILFS_SEG_FAIL_IO;
-		goto failed;
-	}
-	if (crc == le32_to_cpu(sum->ss_datasum))
-		ret = 0;
-	else
-		ret = NILFS_SEG_FAIL_CHECKSUM_FULL;
- failed:
-	brelse(bh_sum);
- out:
+		goto out;
+
+	ret = NILFS_SEG_FAIL_IO;
+	if (nilfs_compute_checksum(nilfs, bh_sum, &crc, sizeof(sum->ss_datasum),
+				   ((u64)nblock << nilfs->ns_blocksize_bits),
+				   bh_sum->b_blocknr, nblock))
+		goto out;
+
+	ret = NILFS_SEG_FAIL_CHECKSUM_FULL;
+	if (crc != le32_to_cpu(sum->ss_datasum))
+		goto out;
+	ret = 0;
+out:
 	return ret;
 }
 
-static void *segsum_get(struct super_block *sb, struct buffer_head **pbh,
-			unsigned int *offset, unsigned int bytes)
+/**
+ * nilfs_read_summary_info - read an item on summary blocks of a log
+ * @nilfs: nilfs object
+ * @pbh: the current buffer head on summary blocks [in, out]
+ * @offset: the current byte offset on summary blocks [in, out]
+ * @bytes: byte size of the item to be read
+ */
+static void *nilfs_read_summary_info(struct the_nilfs *nilfs,
+				     struct buffer_head **pbh,
+				     unsigned int *offset, unsigned int bytes)
 {
 	void *ptr;
 	sector_t blocknr;
@@ -265,7 +262,8 @@ static void *segsum_get(struct super_block *sb, struct buffer_head **pbh,
 	if (bytes > (*pbh)->b_size - *offset) {
 		blocknr = (*pbh)->b_blocknr;
 		brelse(*pbh);
-		*pbh = sb_bread(sb, blocknr + 1);
+		*pbh = __bread(nilfs->ns_bdev, blocknr + 1,
+			       nilfs->ns_blocksize);
 		if (unlikely(!*pbh))
 			return NULL;
 		*offset = 0;
@@ -275,9 +273,18 @@ static void *segsum_get(struct super_block *sb, struct buffer_head **pbh,
 	return ptr;
 }
 
-static void segsum_skip(struct super_block *sb, struct buffer_head **pbh,
-			unsigned int *offset, unsigned int bytes,
-			unsigned long count)
+/**
+ * nilfs_skip_summary_info - skip items on summary blocks of a log
+ * @nilfs: nilfs object
+ * @pbh: the current buffer head on summary blocks [in, out]
+ * @offset: the current byte offset on summary blocks [in, out]
+ * @bytes: byte size of the item to be skipped
+ * @count: number of items to be skipped
+ */
+static void nilfs_skip_summary_info(struct the_nilfs *nilfs,
+				    struct buffer_head **pbh,
+				    unsigned int *offset, unsigned int bytes,
+				    unsigned long count)
 {
 	unsigned int rest_item_in_current_block
 		= ((*pbh)->b_size - *offset) / bytes;
@@ -294,36 +301,46 @@ static void segsum_skip(struct super_block *sb, struct buffer_head **pbh,
 		*offset = bytes * (count - (bcnt - 1) * nitem_per_block);
 
 		brelse(*pbh);
-		*pbh = sb_bread(sb, blocknr + bcnt);
+		*pbh = __bread(nilfs->ns_bdev, blocknr + bcnt,
+			       nilfs->ns_blocksize);
 	}
 }
 
-static int
-collect_blocks_from_segsum(struct nilfs_sb_info *sbi, sector_t sum_blocknr,
-			   struct nilfs_segsum_info *ssi,
-			   struct list_head *head)
+/**
+ * nilfs_scan_dsync_log - get block information of a log written for data sync
+ * @nilfs: nilfs object
+ * @start_blocknr: start block number of the log
+ * @sum: log summary information
+ * @head: list head to add nilfs_recovery_block struct
+ */
+static int nilfs_scan_dsync_log(struct the_nilfs *nilfs, sector_t start_blocknr,
+				struct nilfs_segment_summary *sum,
+				struct list_head *head)
 {
 	struct buffer_head *bh;
 	unsigned int offset;
-	unsigned long nfinfo = ssi->nfinfo;
-	sector_t blocknr = sum_blocknr + ssi->nsumblk;
+	u32 nfinfo, sumbytes;
+	sector_t blocknr;
 	ino_t ino;
 	int err = -EIO;
 
+	nfinfo = le32_to_cpu(sum->ss_nfinfo);
 	if (!nfinfo)
 		return 0;
 
-	bh = sb_bread(sbi->s_super, sum_blocknr);
+	sumbytes = le32_to_cpu(sum->ss_sumbytes);
+	blocknr = start_blocknr + DIV_ROUND_UP(sumbytes, nilfs->ns_blocksize);
+	bh = __bread(nilfs->ns_bdev, start_blocknr, nilfs->ns_blocksize);
 	if (unlikely(!bh))
 		goto out;
 
-	offset = le16_to_cpu(
-		((struct nilfs_segment_summary *)bh->b_data)->ss_bytes);
+	offset = le16_to_cpu(sum->ss_bytes);
 	for (;;) {
 		unsigned long nblocks, ndatablk, nnodeblk;
 		struct nilfs_finfo *finfo;
 
-		finfo = segsum_get(sbi->s_super, &bh, &offset, sizeof(*finfo));
+		finfo = nilfs_read_summary_info(nilfs, &bh, &offset,
+						sizeof(*finfo));
 		if (unlikely(!finfo))
 			goto out;
 
@@ -336,8 +353,8 @@ collect_blocks_from_segsum(struct nilfs_sb_info *sbi, sector_t sum_blocknr,
 			struct nilfs_recovery_block *rb;
 			struct nilfs_binfo_v *binfo;
 
-			binfo = segsum_get(sbi->s_super, &bh, &offset,
-					   sizeof(*binfo));
+			binfo = nilfs_read_summary_info(nilfs, &bh, &offset,
+							sizeof(*binfo));
 			if (unlikely(!binfo))
 				goto out;
 
@@ -355,9 +372,9 @@ collect_blocks_from_segsum(struct nilfs_sb_info *sbi, sector_t sum_blocknr,
 		}
 		if (--nfinfo == 0)
 			break;
-		blocknr += nnodeblk; /* always 0 for the data sync segments */
-		segsum_skip(sbi->s_super, &bh, &offset, sizeof(__le64),
-			    nnodeblk);
+		blocknr += nnodeblk; /* always 0 for data sync logs */
+		nilfs_skip_summary_info(nilfs, &bh, &offset, sizeof(__le64),
+					nnodeblk);
 		if (unlikely(!bh))
 			goto out;
 	}
@@ -423,7 +440,6 @@ static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs,
 	segnum[2] = ri->ri_segnum;
 	segnum[3] = ri->ri_nextnum;
 
-	nilfs_attach_writer(nilfs, sbi);
 	/*
 	 * Releasing the next segment of the latest super root.
 	 * The next segment is invalidated by this recovery.
@@ -463,18 +479,17 @@ static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs,
 
  failed:
 	/* No need to recover sufile because it will be destroyed on error */
-	nilfs_detach_writer(nilfs, sbi);
 	return err;
 }
 
-static int nilfs_recovery_copy_block(struct nilfs_sb_info *sbi,
+static int nilfs_recovery_copy_block(struct the_nilfs *nilfs,
 				     struct nilfs_recovery_block *rb,
 				     struct page *page)
 {
 	struct buffer_head *bh_org;
 	void *kaddr;
 
-	bh_org = sb_bread(sbi->s_super, rb->blocknr);
+	bh_org = __bread(nilfs->ns_bdev, rb->blocknr, nilfs->ns_blocksize);
 	if (unlikely(!bh_org))
 		return -EIO;
 
@@ -485,19 +500,21 @@ static int nilfs_recovery_copy_block(struct nilfs_sb_info *sbi,
 	return 0;
 }
 
-static int recover_dsync_blocks(struct nilfs_sb_info *sbi,
-				struct list_head *head,
-				unsigned long *nr_salvaged_blocks)
+static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
+				      struct nilfs_sb_info *sbi,
+				      struct nilfs_root *root,
+				      struct list_head *head,
+				      unsigned long *nr_salvaged_blocks)
 {
 	struct inode *inode;
 	struct nilfs_recovery_block *rb, *n;
-	unsigned blocksize = sbi->s_super->s_blocksize;
+	unsigned blocksize = nilfs->ns_blocksize;
 	struct page *page;
 	loff_t pos;
 	int err = 0, err2 = 0;
 
 	list_for_each_entry_safe(rb, n, head, list) {
-		inode = nilfs_iget(sbi->s_super, rb->ino);
+		inode = nilfs_iget(sbi->s_super, root, rb->ino);
 		if (IS_ERR(inode)) {
 			err = PTR_ERR(inode);
 			inode = NULL;
@@ -505,13 +522,16 @@ static int recover_dsync_blocks(struct nilfs_sb_info *sbi,
 		}
 
 		pos = rb->blkoff << inode->i_blkbits;
-		page = NULL;
-		err = block_write_begin(NULL, inode->i_mapping, pos, blocksize,
-					0, &page, NULL, nilfs_get_block);
-		if (unlikely(err))
+		err = block_write_begin(inode->i_mapping, pos, blocksize,
+					0, &page, nilfs_get_block);
+		if (unlikely(err)) {
+			loff_t isize = inode->i_size;
+			if (pos + blocksize > isize)
+				vmtruncate(inode, isize);
 			goto failed_inode;
+		}
 
-		err = nilfs_recovery_copy_block(sbi, rb, page);
+		err = nilfs_recovery_copy_block(nilfs, rb, page);
 		if (unlikely(err))
 			goto failed_page;
 
@@ -551,18 +571,21 @@ static int recover_dsync_blocks(struct nilfs_sb_info *sbi,
 /**
  * nilfs_do_roll_forward - salvage logical segments newer than the latest
  * checkpoint
+ * @nilfs: nilfs object
  * @sbi: nilfs_sb_info
- * @nilfs: the_nilfs
  * @ri: pointer to a nilfs_recovery_info
  */
 static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
 				 struct nilfs_sb_info *sbi,
+				 struct nilfs_root *root,
 				 struct nilfs_recovery_info *ri)
 {
-	struct nilfs_segsum_info ssi;
+	struct buffer_head *bh_sum = NULL;
+	struct nilfs_segment_summary *sum;
 	sector_t pseg_start;
 	sector_t seg_start, seg_end;  /* Starting/ending DBN of full segment */
 	unsigned long nsalvaged_blocks = 0;
+	unsigned int flags;
 	u64 seg_seq;
 	__u64 segnum, nextnum = 0;
 	int empty_seg = 0;
@@ -574,15 +597,20 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
 	};
 	int state = RF_INIT_ST;
 
-	nilfs_attach_writer(nilfs, sbi);
 	pseg_start = ri->ri_lsegs_start;
 	seg_seq = ri->ri_lsegs_start_seq;
 	segnum = nilfs_get_segnum_of_block(nilfs, pseg_start);
 	nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end);
 
 	while (segnum != ri->ri_segnum || pseg_start <= ri->ri_pseg_start) {
+		brelse(bh_sum);
+		bh_sum = nilfs_read_log_header(nilfs, pseg_start, &sum);
+		if (!bh_sum) {
+			err = -EIO;
+			goto failed;
+		}
 
-		ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi);
+		ret = nilfs_validate_log(nilfs, seg_seq, bh_sum, sum);
 		if (ret) {
 			if (ret == NILFS_SEG_FAIL_IO) {
 				err = -EIO;
@@ -590,33 +618,38 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
 			}
 			goto strayed;
 		}
-		if (unlikely(NILFS_SEG_HAS_SR(&ssi)))
+
+		flags = le16_to_cpu(sum->ss_flags);
+		if (flags & NILFS_SS_SR)
 			goto confused;
 
 		/* Found a valid partial segment; do recovery actions */
-		nextnum = nilfs_get_segnum_of_block(nilfs, ssi.next);
+		nextnum = nilfs_get_segnum_of_block(nilfs,
+						    le64_to_cpu(sum->ss_next));
 		empty_seg = 0;
-		nilfs->ns_ctime = ssi.ctime;
-		if (!(ssi.flags & NILFS_SS_GC))
-			nilfs->ns_nongc_ctime = ssi.ctime;
+		nilfs->ns_ctime = le64_to_cpu(sum->ss_create);
+		if (!(flags & NILFS_SS_GC))
+			nilfs->ns_nongc_ctime = nilfs->ns_ctime;
 
 		switch (state) {
 		case RF_INIT_ST:
-			if (!NILFS_SEG_LOGBGN(&ssi) || !NILFS_SEG_DSYNC(&ssi))
+			if (!(flags & NILFS_SS_LOGBGN) ||
+			    !(flags & NILFS_SS_SYNDT))
 				goto try_next_pseg;
 			state = RF_DSYNC_ST;
 			/* Fall through */
 		case RF_DSYNC_ST:
-			if (!NILFS_SEG_DSYNC(&ssi))
+			if (!(flags & NILFS_SS_SYNDT))
 				goto confused;
 
-			err = collect_blocks_from_segsum(
-				sbi, pseg_start, &ssi, &dsync_blocks);
+			err = nilfs_scan_dsync_log(nilfs, pseg_start, sum,
+						   &dsync_blocks);
 			if (unlikely(err))
 				goto failed;
-			if (NILFS_SEG_LOGEND(&ssi)) {
-				err = recover_dsync_blocks(
-					sbi, &dsync_blocks, &nsalvaged_blocks);
+			if (flags & NILFS_SS_LOGEND) {
+				err = nilfs_recover_dsync_blocks(
+					nilfs, sbi, root, &dsync_blocks,
+					&nsalvaged_blocks);
 				if (unlikely(err))
 					goto failed;
 				state = RF_INIT_ST;
@@ -627,7 +660,7 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
  try_next_pseg:
 		if (pseg_start == ri->ri_lsegs_end)
 			break;
-		pseg_start += ssi.nblocks;
+		pseg_start += le32_to_cpu(sum->ss_nblocks);
 		if (pseg_start < seg_end)
 			continue;
 		goto feed_segment;
@@ -652,8 +685,8 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
 		ri->ri_need_recovery = NILFS_RECOVERY_ROLLFORWARD_DONE;
 	}
  out:
+	brelse(bh_sum);
 	dispose_recovery_list(&dsync_blocks);
-	nilfs_detach_writer(sbi->s_nilfs, sbi);
 	return err;
 
  confused:
@@ -667,7 +700,6 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
 }
 
 static void nilfs_finish_roll_forward(struct the_nilfs *nilfs,
-				      struct nilfs_sb_info *sbi,
 				      struct nilfs_recovery_info *ri)
 {
 	struct buffer_head *bh;
@@ -677,7 +709,7 @@ static void nilfs_finish_roll_forward(struct the_nilfs *nilfs,
 	    nilfs_get_segnum_of_block(nilfs, ri->ri_super_root))
 		return;
 
-	bh = sb_getblk(sbi->s_super, ri->ri_lsegs_start);
+	bh = __getblk(nilfs->ns_bdev, ri->ri_lsegs_start, nilfs->ns_blocksize);
 	BUG_ON(!bh);
 	memset(bh->b_data, 0, bh->b_size);
 	set_buffer_dirty(bh);
@@ -690,9 +722,8 @@ static void nilfs_finish_roll_forward(struct the_nilfs *nilfs,
 }
 
 /**
- * nilfs_recover_logical_segments - salvage logical segments written after
- * the latest super root
- * @nilfs: the_nilfs
+ * nilfs_salvage_orphan_logs - salvage logs written after the latest checkpoint
+ * @nilfs: nilfs object
  * @sbi: nilfs_sb_info
  * @ri: pointer to a nilfs_recovery_info struct to store search results.
  *
@@ -709,23 +740,24 @@ static void nilfs_finish_roll_forward(struct the_nilfs *nilfs,
  *
  * %-ENOMEM - Insufficient memory available.
  */
-int nilfs_recover_logical_segments(struct the_nilfs *nilfs,
-				   struct nilfs_sb_info *sbi,
-				   struct nilfs_recovery_info *ri)
+int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs,
+			      struct nilfs_sb_info *sbi,
+			      struct nilfs_recovery_info *ri)
 {
+	struct nilfs_root *root;
 	int err;
 
 	if (ri->ri_lsegs_start == 0 || ri->ri_lsegs_end == 0)
 		return 0;
 
-	err = nilfs_attach_checkpoint(sbi, ri->ri_cno);
+	err = nilfs_attach_checkpoint(sbi, ri->ri_cno, true, &root);
 	if (unlikely(err)) {
 		printk(KERN_ERR
 		       "NILFS: error loading the latest checkpoint.\n");
 		return err;
 	}
 
-	err = nilfs_do_roll_forward(nilfs, sbi, ri);
+	err = nilfs_do_roll_forward(nilfs, sbi, root, ri);
 	if (unlikely(err))
 		goto failed;
 
@@ -737,7 +769,7 @@ int nilfs_recover_logical_segments(struct the_nilfs *nilfs,
 			goto failed;
 		}
 
-		err = nilfs_attach_segment_constructor(sbi);
+		err = nilfs_attach_segment_constructor(sbi, root);
 		if (unlikely(err))
 			goto failed;
 
@@ -751,18 +783,17 @@ int nilfs_recover_logical_segments(struct the_nilfs *nilfs,
 			goto failed;
 		}
 
-		nilfs_finish_roll_forward(nilfs, sbi, ri);
+		nilfs_finish_roll_forward(nilfs, ri);
 	}
 
  failed:
-	nilfs_detach_checkpoint(sbi);
+	nilfs_put_root(root);
 	return err;
 }
 
 /**
  * nilfs_search_super_root - search the latest valid super root
  * @nilfs: the_nilfs
- * @sbi: nilfs_sb_info
  * @ri: pointer to a nilfs_recovery_info struct to store search results.
  *
  * nilfs_search_super_root() looks for the latest super-root from a partial
@@ -775,14 +806,19 @@ int nilfs_recover_logical_segments(struct the_nilfs *nilfs,
  * %-EINVAL - No valid segment found
  *
  * %-EIO - I/O error
+ *
+ * %-ENOMEM - Insufficient memory available.
  */
-int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
+int nilfs_search_super_root(struct the_nilfs *nilfs,
 			    struct nilfs_recovery_info *ri)
 {
-	struct nilfs_segsum_info ssi;
+	struct buffer_head *bh_sum = NULL;
+	struct nilfs_segment_summary *sum;
 	sector_t pseg_start, pseg_end, sr_pseg_start = 0;
 	sector_t seg_start, seg_end; /* range of full segment (block number) */
 	sector_t b, end;
+	unsigned long nblocks;
+	unsigned int flags;
 	u64 seg_seq;
 	__u64 segnum, nextnum = 0;
 	__u64 cno;
@@ -801,17 +837,24 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
 	/* Read ahead segment */
 	b = seg_start;
 	while (b <= seg_end)
-		sb_breadahead(sbi->s_super, b++);
+		__breadahead(nilfs->ns_bdev, b++, nilfs->ns_blocksize);
 
 	for (;;) {
-		/* Load segment summary */
-		ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi);
+		brelse(bh_sum);
+		ret = NILFS_SEG_FAIL_IO;
+		bh_sum = nilfs_read_log_header(nilfs, pseg_start, &sum);
+		if (!bh_sum)
+			goto failed;
+
+		ret = nilfs_validate_log(nilfs, seg_seq, bh_sum, sum);
 		if (ret) {
 			if (ret == NILFS_SEG_FAIL_IO)
 				goto failed;
 			goto strayed;
 		}
-		pseg_end = pseg_start + ssi.nblocks - 1;
+
+		nblocks = le32_to_cpu(sum->ss_nblocks);
+		pseg_end = pseg_start + nblocks - 1;
 		if (unlikely(pseg_end > seg_end)) {
 			ret = NILFS_SEG_FAIL_CONSISTENCY;
 			goto strayed;
@@ -821,11 +864,13 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
 		ri->ri_pseg_start = pseg_start;
 		ri->ri_seq = seg_seq;
 		ri->ri_segnum = segnum;
-		nextnum = nilfs_get_segnum_of_block(nilfs, ssi.next);
+		nextnum = nilfs_get_segnum_of_block(nilfs,
+						    le64_to_cpu(sum->ss_next));
 		ri->ri_nextnum = nextnum;
 		empty_seg = 0;
 
-		if (!NILFS_SEG_HAS_SR(&ssi) && !scan_newer) {
+		flags = le16_to_cpu(sum->ss_flags);
+		if (!(flags & NILFS_SS_SR) && !scan_newer) {
 			/* This will never happen because a superblock
 			   (last_segment) always points to a pseg
 			   having a super root. */
@@ -836,14 +881,15 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
 		if (pseg_start == seg_start) {
 			nilfs_get_segment_range(nilfs, nextnum, &b, &end);
 			while (b <= end)
-				sb_breadahead(sbi->s_super, b++);
+				__breadahead(nilfs->ns_bdev, b++,
+					     nilfs->ns_blocksize);
 		}
-		if (!NILFS_SEG_HAS_SR(&ssi)) {
-			if (!ri->ri_lsegs_start && NILFS_SEG_LOGBGN(&ssi)) {
+		if (!(flags & NILFS_SS_SR)) {
+			if (!ri->ri_lsegs_start && (flags & NILFS_SS_LOGBGN)) {
 				ri->ri_lsegs_start = pseg_start;
 				ri->ri_lsegs_start_seq = seg_seq;
 			}
-			if (NILFS_SEG_LOGEND(&ssi))
+			if (flags & NILFS_SS_LOGEND)
 				ri->ri_lsegs_end = pseg_start;
 			goto try_next_pseg;
 		}
@@ -854,12 +900,12 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
 		ri->ri_lsegs_start = ri->ri_lsegs_end = 0;
 
 		nilfs_dispose_segment_list(&segments);
-		nilfs->ns_pseg_offset = (sr_pseg_start = pseg_start)
-			+ ssi.nblocks - seg_start;
+		sr_pseg_start = pseg_start;
+		nilfs->ns_pseg_offset = pseg_start + nblocks - seg_start;
 		nilfs->ns_seg_seq = seg_seq;
 		nilfs->ns_segnum = segnum;
 		nilfs->ns_cno = cno;  /* nilfs->ns_cno = ri->ri_cno + 1 */
-		nilfs->ns_ctime = ssi.ctime;
+		nilfs->ns_ctime = le64_to_cpu(sum->ss_create);
 		nilfs->ns_nextnum = nextnum;
 
 		if (scan_newer)
@@ -870,15 +916,9 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
 			scan_newer = 1;
 		}
 
-		/* reset region for roll-forward */
-		pseg_start += ssi.nblocks;
-		if (pseg_start < seg_end)
-			continue;
-		goto feed_segment;
-
  try_next_pseg:
 		/* Standing on a course, or met an inconsistent state */
-		pseg_start += ssi.nblocks;
+		pseg_start += nblocks;
 		if (pseg_start < seg_end)
 			continue;
 		goto feed_segment;
@@ -909,6 +949,7 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
 
  super_root_found:
 	/* Updating pointers relating to the latest checkpoint */
+	brelse(bh_sum);
 	list_splice_tail(&segments, &ri->ri_used_segments);
 	nilfs->ns_last_pseg = sr_pseg_start;
 	nilfs->ns_last_seq = nilfs->ns_seg_seq;
@@ -916,6 +957,7 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
 	return 0;
 
  failed:
+	brelse(bh_sum);
 	nilfs_dispose_segment_list(&segments);
 	return (ret < 0) ? ret : nilfs_warn_segment_error(ret);
 }
diff --git a/fs/nilfs2/sb.h b/fs/nilfs2/sb.h
index 0776ccc2504..35a07157b98 100644
--- a/fs/nilfs2/sb.h
+++ b/fs/nilfs2/sb.h
@@ -42,11 +42,6 @@ struct nilfs_sc_info;
  * NILFS super-block data in memory
  */
 struct nilfs_sb_info {
-	/* Snapshot status */
-	__u64 s_snapshot_cno;		/* Checkpoint number */
-	atomic_t s_inodes_count;
-	atomic_t s_blocks_count;	/* Reserved (might be deleted) */
-
 	/* Mount options */
 	unsigned long s_mount_opt;
 	uid_t s_resuid;
@@ -59,8 +54,6 @@ struct nilfs_sb_info {
 	/* Fundamental members */
 	struct super_block *s_super;	/* reverse pointer to super_block */
 	struct the_nilfs *s_nilfs;
-	struct list_head s_list;	/* list head for nilfs->ns_supers */
-	atomic_t s_count;		/* reference count */
 
 	/* Segment constructor */
 	struct list_head s_dirty_files;	/* dirty files list */
@@ -68,9 +61,6 @@ struct nilfs_sb_info {
 	spinlock_t s_inode_lock;	/* Lock for the nilfs inode.
 					   It covers s_dirty_files list */
 
-	/* Metadata files */
-	struct inode *s_ifile;		/* index file inode */
-
 	/* Inode allocator */
 	spinlock_t s_next_gen_lock;
 	u32 s_next_generation;
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 2e6a2723b8f..0f83e93935b 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -371,7 +371,8 @@ static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf,
 	struct bio *bio = wi->bio;
 	int err;
 
-	if (segbuf->sb_nbio > 0 && bdi_write_congested(wi->nilfs->ns_bdi)) {
+	if (segbuf->sb_nbio > 0 &&
+	    bdi_write_congested(segbuf->sb_super->s_bdi)) {
 		wait_for_completion(&segbuf->sb_bio_event);
 		segbuf->sb_nbio--;
 		if (unlikely(atomic_read(&segbuf->sb_err))) {
@@ -508,7 +509,7 @@ static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
 		 * Last BIO is always sent through the following
 		 * submission.
 		 */
-		rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
+		rw |= REQ_SYNC | REQ_UNPLUG;
 		res = nilfs_segbuf_submit_bio(segbuf, &wi, rw);
 	}
 
diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h
index 85fbb66455e..b04f08cc239 100644
--- a/fs/nilfs2/segbuf.h
+++ b/fs/nilfs2/segbuf.h
@@ -54,17 +54,6 @@ struct nilfs_segsum_info {
 	sector_t		next;
 };
 
-/* macro for the flags */
-#define NILFS_SEG_HAS_SR(sum)    ((sum)->flags & NILFS_SS_SR)
-#define NILFS_SEG_LOGBGN(sum)    ((sum)->flags & NILFS_SS_LOGBGN)
-#define NILFS_SEG_LOGEND(sum)    ((sum)->flags & NILFS_SS_LOGEND)
-#define NILFS_SEG_DSYNC(sum)     ((sum)->flags & NILFS_SS_SYNDT)
-#define NILFS_SEG_SIMPLEX(sum) \
-	(((sum)->flags & (NILFS_SS_LOGBGN | NILFS_SS_LOGEND)) == \
-	 (NILFS_SS_LOGBGN | NILFS_SS_LOGEND))
-
-#define NILFS_SEG_EMPTY(sum)	((sum)->nblocks == (sum)->nsumblk)
-
 /**
  * struct nilfs_segment_buffer - Segment buffer
  * @sb_super: back pointer to a superblock struct
@@ -141,6 +130,19 @@ int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *,
 				struct buffer_head **);
 void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *);
 
+static inline int nilfs_segbuf_simplex(struct nilfs_segment_buffer *segbuf)
+{
+	unsigned int flags = segbuf->sb_sum.flags;
+
+	return (flags & (NILFS_SS_LOGBGN | NILFS_SS_LOGEND)) ==
+		(NILFS_SS_LOGBGN | NILFS_SS_LOGEND);
+}
+
+static inline int nilfs_segbuf_empty(struct nilfs_segment_buffer *segbuf)
+{
+	return segbuf->sb_sum.nblocks == segbuf->sb_sum.nsumblk;
+}
+
 static inline void
 nilfs_segbuf_add_segsum_buffer(struct nilfs_segment_buffer *segbuf,
 			       struct buffer_head *bh)
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index c9201649cc4..687d090cea3 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -191,6 +191,8 @@ int nilfs_transaction_begin(struct super_block *sb,
 	if (ret > 0)
 		return 0;
 
+	vfs_check_frozen(sb, SB_FREEZE_WRITE);
+
 	sbi = NILFS_SB(sb);
 	nilfs = sbi->s_nilfs;
 	down_read(&nilfs->ns_segctor_sem);
@@ -366,8 +368,7 @@ static int nilfs_segctor_reset_segment_buffer(struct nilfs_sc_info *sci)
 
 	if (nilfs_doing_gc())
 		flags = NILFS_SS_GC;
-	err = nilfs_segbuf_reset(segbuf, flags, sci->sc_seg_ctime,
-				 sci->sc_sbi->s_nilfs->ns_cno);
+	err = nilfs_segbuf_reset(segbuf, flags, sci->sc_seg_ctime, sci->sc_cno);
 	if (unlikely(err))
 		return err;
 
@@ -440,17 +441,26 @@ static void nilfs_segctor_end_finfo(struct nilfs_sc_info *sci,
 	struct nilfs_finfo *finfo;
 	struct nilfs_inode_info *ii;
 	struct nilfs_segment_buffer *segbuf;
+	__u64 cno;
 
 	if (sci->sc_blk_cnt == 0)
 		return;
 
 	ii = NILFS_I(inode);
+
+	if (test_bit(NILFS_I_GCINODE, &ii->i_state))
+		cno = ii->i_cno;
+	else if (NILFS_ROOT_METADATA_FILE(inode->i_ino))
+		cno = 0;
+	else
+		cno = sci->sc_cno;
+
 	finfo = nilfs_segctor_map_segsum_entry(sci, &sci->sc_finfo_ptr,
 						 sizeof(*finfo));
 	finfo->fi_ino = cpu_to_le64(inode->i_ino);
 	finfo->fi_nblocks = cpu_to_le32(sci->sc_blk_cnt);
 	finfo->fi_ndatablk = cpu_to_le32(sci->sc_datablk_cnt);
-	finfo->fi_cno = cpu_to_le64(ii->i_cno);
+	finfo->fi_cno = cpu_to_le64(cno);
 
 	segbuf = sci->sc_curseg;
 	segbuf->sb_sum.sumbytes = sci->sc_binfo_ptr.offset +
@@ -755,12 +765,12 @@ static void nilfs_dispose_list(struct nilfs_sb_info *sbi,
 	}
 }
 
-static int nilfs_test_metadata_dirty(struct nilfs_sb_info *sbi)
+static int nilfs_test_metadata_dirty(struct the_nilfs *nilfs,
+				     struct nilfs_root *root)
 {
-	struct the_nilfs *nilfs = sbi->s_nilfs;
 	int ret = 0;
 
-	if (nilfs_mdt_fetch_dirty(sbi->s_ifile))
+	if (nilfs_mdt_fetch_dirty(root->ifile))
 		ret++;
 	if (nilfs_mdt_fetch_dirty(nilfs->ns_cpfile))
 		ret++;
@@ -785,7 +795,7 @@ static int nilfs_segctor_confirm(struct nilfs_sc_info *sci)
 	struct nilfs_sb_info *sbi = sci->sc_sbi;
 	int ret = 0;
 
-	if (nilfs_test_metadata_dirty(sbi))
+	if (nilfs_test_metadata_dirty(sbi->s_nilfs, sci->sc_root))
 		set_bit(NILFS_SC_DIRTY, &sci->sc_flags);
 
 	spin_lock(&sbi->s_inode_lock);
@@ -801,7 +811,7 @@ static void nilfs_segctor_clear_metadata_dirty(struct nilfs_sc_info *sci)
 	struct nilfs_sb_info *sbi = sci->sc_sbi;
 	struct the_nilfs *nilfs = sbi->s_nilfs;
 
-	nilfs_mdt_clear_dirty(sbi->s_ifile);
+	nilfs_mdt_clear_dirty(sci->sc_root->ifile);
 	nilfs_mdt_clear_dirty(nilfs->ns_cpfile);
 	nilfs_mdt_clear_dirty(nilfs->ns_sufile);
 	nilfs_mdt_clear_dirty(nilfs_dat_inode(nilfs));
@@ -848,9 +858,9 @@ static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci)
 	raw_cp->cp_snapshot_list.ssl_next = 0;
 	raw_cp->cp_snapshot_list.ssl_prev = 0;
 	raw_cp->cp_inodes_count =
-		cpu_to_le64(atomic_read(&sbi->s_inodes_count));
+		cpu_to_le64(atomic_read(&sci->sc_root->inodes_count));
 	raw_cp->cp_blocks_count =
-		cpu_to_le64(atomic_read(&sbi->s_blocks_count));
+		cpu_to_le64(atomic_read(&sci->sc_root->blocks_count));
 	raw_cp->cp_nblk_inc =
 		cpu_to_le64(sci->sc_nblk_inc + sci->sc_nblk_this_inc);
 	raw_cp->cp_create = cpu_to_le64(sci->sc_seg_ctime);
@@ -861,7 +871,8 @@ static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci)
 	else
 		nilfs_checkpoint_set_minor(raw_cp);
 
-	nilfs_write_inode_common(sbi->s_ifile, &raw_cp->cp_ifile_inode, 1);
+	nilfs_write_inode_common(sci->sc_root->ifile,
+				 &raw_cp->cp_ifile_inode, 1);
 	nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, bh_cp);
 	return 0;
 
@@ -886,13 +897,12 @@ static void nilfs_fill_in_file_bmap(struct inode *ifile,
 	}
 }
 
-static void nilfs_segctor_fill_in_file_bmap(struct nilfs_sc_info *sci,
-					    struct inode *ifile)
+static void nilfs_segctor_fill_in_file_bmap(struct nilfs_sc_info *sci)
 {
 	struct nilfs_inode_info *ii;
 
 	list_for_each_entry(ii, &sci->sc_dirty_files, i_dirty) {
-		nilfs_fill_in_file_bmap(ifile, ii);
+		nilfs_fill_in_file_bmap(sci->sc_root->ifile, ii);
 		set_bit(NILFS_I_COLLECTED, &ii->i_state);
 	}
 }
@@ -1135,7 +1145,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
 		sci->sc_stage.flags |= NILFS_CF_IFILE_STARTED;
 		/* Fall through */
 	case NILFS_ST_IFILE:
-		err = nilfs_segctor_scan_file(sci, sbi->s_ifile,
+		err = nilfs_segctor_scan_file(sci, sci->sc_root->ifile,
 					      &nilfs_sc_file_ops);
 		if (unlikely(err))
 			break;
@@ -1599,7 +1609,7 @@ nilfs_copy_replace_page_buffers(struct page *page, struct list_head *out)
 	kunmap_atomic(kaddr, KM_USER0);
 
 	if (!TestSetPageWriteback(clone_page))
-		inc_zone_page_state(clone_page, NR_WRITEBACK);
+		account_page_writeback(clone_page);
 	unlock_page(clone_page);
 
 	return 0;
@@ -1900,6 +1910,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
 			set_buffer_uptodate(bh);
 			clear_buffer_dirty(bh);
 			clear_buffer_nilfs_volatile(bh);
+			clear_buffer_nilfs_redirected(bh);
 			if (bh == segbuf->sb_super_root) {
 				if (bh->b_page != bd_page) {
 					end_page_writeback(bd_page);
@@ -1914,12 +1925,12 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
 			}
 		}
 
-		if (!NILFS_SEG_SIMPLEX(&segbuf->sb_sum)) {
-			if (NILFS_SEG_LOGBGN(&segbuf->sb_sum)) {
+		if (!nilfs_segbuf_simplex(segbuf)) {
+			if (segbuf->sb_sum.flags & NILFS_SS_LOGBGN) {
 				set_bit(NILFS_SC_UNCLOSED, &sci->sc_flags);
 				sci->sc_lseg_stime = jiffies;
 			}
-			if (NILFS_SEG_LOGEND(&segbuf->sb_sum))
+			if (segbuf->sb_sum.flags & NILFS_SS_LOGEND)
 				clear_bit(NILFS_SC_UNCLOSED, &sci->sc_flags);
 		}
 	}
@@ -1936,11 +1947,9 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
 
 	nilfs_drop_collected_inodes(&sci->sc_dirty_files);
 
-	if (nilfs_doing_gc()) {
+	if (nilfs_doing_gc())
 		nilfs_drop_collected_inodes(&sci->sc_gc_inodes);
-		if (update_sr)
-			nilfs_commit_gcdat_inode(nilfs);
-	} else
+	else
 		nilfs->ns_nongc_ctime = sci->sc_seg_ctime;
 
 	sci->sc_nblk_inc += sci->sc_nblk_this_inc;
@@ -1951,7 +1960,6 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
 	if (update_sr) {
 		nilfs_set_last_segment(nilfs, segbuf->sb_pseg_start,
 				       segbuf->sb_sum.seg_seq, nilfs->ns_cno++);
-		set_nilfs_sb_dirty(nilfs);
 
 		clear_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags);
 		clear_bit(NILFS_SC_DIRTY, &sci->sc_flags);
@@ -1977,7 +1985,7 @@ static int nilfs_segctor_check_in_files(struct nilfs_sc_info *sci,
 					struct nilfs_sb_info *sbi)
 {
 	struct nilfs_inode_info *ii, *n;
-	__u64 cno = sbi->s_nilfs->ns_cno;
+	struct inode *ifile = sci->sc_root->ifile;
 
 	spin_lock(&sbi->s_inode_lock);
  retry:
@@ -1988,14 +1996,14 @@ static int nilfs_segctor_check_in_files(struct nilfs_sc_info *sci,
 
 			spin_unlock(&sbi->s_inode_lock);
 			err = nilfs_ifile_get_inode_block(
-				sbi->s_ifile, ii->vfs_inode.i_ino, &ibh);
+				ifile, ii->vfs_inode.i_ino, &ibh);
 			if (unlikely(err)) {
 				nilfs_warning(sbi->s_super, __func__,
 					      "failed to get inode block.\n");
 				return err;
 			}
 			nilfs_mdt_mark_buffer_dirty(ibh);
-			nilfs_mdt_mark_dirty(sbi->s_ifile);
+			nilfs_mdt_mark_dirty(ifile);
 			spin_lock(&sbi->s_inode_lock);
 			if (likely(!ii->i_bh))
 				ii->i_bh = ibh;
@@ -2003,7 +2011,6 @@ static int nilfs_segctor_check_in_files(struct nilfs_sc_info *sci,
 				brelse(ibh);
 			goto retry;
 		}
-		ii->i_cno = cno;
 
 		clear_bit(NILFS_I_QUEUED, &ii->i_state);
 		set_bit(NILFS_I_BUSY, &ii->i_state);
@@ -2012,8 +2019,6 @@ static int nilfs_segctor_check_in_files(struct nilfs_sc_info *sci,
 	}
 	spin_unlock(&sbi->s_inode_lock);
 
-	NILFS_I(sbi->s_ifile)->i_cno = cno;
-
 	return 0;
 }
 
@@ -2022,19 +2027,13 @@ static void nilfs_segctor_check_out_files(struct nilfs_sc_info *sci,
 {
 	struct nilfs_transaction_info *ti = current->journal_info;
 	struct nilfs_inode_info *ii, *n;
-	__u64 cno = sbi->s_nilfs->ns_cno;
 
 	spin_lock(&sbi->s_inode_lock);
 	list_for_each_entry_safe(ii, n, &sci->sc_dirty_files, i_dirty) {
 		if (!test_and_clear_bit(NILFS_I_UPDATED, &ii->i_state) ||
-		    test_bit(NILFS_I_DIRTY, &ii->i_state)) {
-			/* The current checkpoint number (=nilfs->ns_cno) is
-			   changed between check-in and check-out only if the
-			   super root is written out.  So, we can update i_cno
-			   for the inodes that remain in the dirty list. */
-			ii->i_cno = cno;
+		    test_bit(NILFS_I_DIRTY, &ii->i_state))
 			continue;
-		}
+
 		clear_bit(NILFS_I_BUSY, &ii->i_state);
 		brelse(ii->i_bh);
 		ii->i_bh = NULL;
@@ -2055,12 +2054,13 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
 	int err;
 
 	sci->sc_stage.scnt = NILFS_ST_INIT;
+	sci->sc_cno = nilfs->ns_cno;
 
 	err = nilfs_segctor_check_in_files(sci, sbi);
 	if (unlikely(err))
 		goto out;
 
-	if (nilfs_test_metadata_dirty(sbi))
+	if (nilfs_test_metadata_dirty(nilfs, sci->sc_root))
 		set_bit(NILFS_SC_DIRTY, &sci->sc_flags);
 
 	if (nilfs_segctor_clean(sci))
@@ -2082,7 +2082,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
 
 		/* Avoid empty segment */
 		if (sci->sc_stage.scnt == NILFS_ST_DONE &&
-		    NILFS_SEG_EMPTY(&sci->sc_curseg->sb_sum)) {
+		    nilfs_segbuf_empty(sci->sc_curseg)) {
 			nilfs_segctor_abort_construction(sci, nilfs, 1);
 			goto out;
 		}
@@ -2092,7 +2092,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
 			goto failed;
 
 		if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED)
-			nilfs_segctor_fill_in_file_bmap(sci, sbi->s_ifile);
+			nilfs_segctor_fill_in_file_bmap(sci);
 
 		if (mode == SC_LSEG_SR &&
 		    sci->sc_stage.scnt >= NILFS_ST_CPFILE) {
@@ -2408,6 +2408,7 @@ static int nilfs_segctor_construct(struct nilfs_sc_info *sci, int mode)
 {
 	struct nilfs_sb_info *sbi = sci->sc_sbi;
 	struct the_nilfs *nilfs = sbi->s_nilfs;
+	struct nilfs_super_block **sbp;
 	int err = 0;
 
 	nilfs_segctor_accept(sci);
@@ -2423,8 +2424,13 @@ static int nilfs_segctor_construct(struct nilfs_sc_info *sci, int mode)
 		if (test_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags) &&
 		    nilfs_discontinued(nilfs)) {
 			down_write(&nilfs->ns_sem);
-			err = nilfs_commit_super(
-				sbi, nilfs_altsb_need_update(nilfs));
+			err = -EIO;
+			sbp = nilfs_prepare_super(sbi,
+						  nilfs_sb_will_flip(nilfs));
+			if (likely(sbp)) {
+				nilfs_set_log_cursor(sbp[0], nilfs);
+				err = nilfs_commit_super(sbi, NILFS_SB_COMMIT);
+			}
 			up_write(&nilfs->ns_sem);
 		}
 	}
@@ -2447,9 +2453,8 @@ nilfs_remove_written_gcinodes(struct the_nilfs *nilfs, struct list_head *head)
 	list_for_each_entry_safe(ii, n, head, i_dirty) {
 		if (!test_bit(NILFS_I_UPDATED, &ii->i_state))
 			continue;
-		hlist_del_init(&ii->vfs_inode.i_hash);
 		list_del_init(&ii->i_dirty);
-		nilfs_clear_gcinode(&ii->vfs_inode);
+		iput(&ii->vfs_inode);
 	}
 }
 
@@ -2467,13 +2472,15 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
 
 	nilfs_transaction_lock(sbi, &ti, 1);
 
-	err = nilfs_init_gcdat_inode(nilfs);
+	err = nilfs_mdt_save_to_shadow_map(nilfs->ns_dat);
 	if (unlikely(err))
 		goto out_unlock;
 
 	err = nilfs_ioctl_prepare_clean_segments(nilfs, argv, kbufs);
-	if (unlikely(err))
+	if (unlikely(err)) {
+		nilfs_mdt_restore_from_shadow_map(nilfs->ns_dat);
 		goto out_unlock;
+	}
 
 	sci->sc_freesegs = kbufs[4];
 	sci->sc_nfreesegs = argv[4].v_nmembs;
@@ -2505,7 +2512,7 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
  out_unlock:
 	sci->sc_freesegs = NULL;
 	sci->sc_nfreesegs = 0;
-	nilfs_clear_gcdat_inode(nilfs);
+	nilfs_mdt_clear_shadow_map(nilfs->ns_dat);
 	nilfs_transaction_unlock(sbi);
 	return err;
 }
@@ -2667,6 +2674,8 @@ static int nilfs_segctor_start_thread(struct nilfs_sc_info *sci)
 }
 
 static void nilfs_segctor_kill_thread(struct nilfs_sc_info *sci)
+	__acquires(&sci->sc_state_lock)
+	__releases(&sci->sc_state_lock)
 {
 	sci->sc_state |= NILFS_SEGCTOR_QUIT;
 
@@ -2681,7 +2690,8 @@ static void nilfs_segctor_kill_thread(struct nilfs_sc_info *sci)
 /*
  * Setup & clean-up functions
  */
-static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi)
+static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi,
+					       struct nilfs_root *root)
 {
 	struct nilfs_sc_info *sci;
 
@@ -2692,6 +2702,9 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi)
 	sci->sc_sbi = sbi;
 	sci->sc_super = sbi->s_super;
 
+	nilfs_get_root(root);
+	sci->sc_root = root;
+
 	init_waitqueue_head(&sci->sc_wait_request);
 	init_waitqueue_head(&sci->sc_wait_daemon);
 	init_waitqueue_head(&sci->sc_wait_task);
@@ -2766,6 +2779,8 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
 	WARN_ON(!list_empty(&sci->sc_segbufs));
 	WARN_ON(!list_empty(&sci->sc_write_logs));
 
+	nilfs_put_root(sci->sc_root);
+
 	down_write(&sbi->s_nilfs->ns_segctor_sem);
 
 	del_timer_sync(&sci->sc_timer);
@@ -2775,6 +2790,7 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
 /**
  * nilfs_attach_segment_constructor - attach a segment constructor
  * @sbi: nilfs_sb_info
+ * @root: root object of the current filesystem tree
  *
  * nilfs_attach_segment_constructor() allocates a struct nilfs_sc_info,
  * initializes it, and starts the segment constructor.
@@ -2784,9 +2800,9 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
  *
  * %-ENOMEM - Insufficient memory available.
  */
-int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi)
+int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi,
+				     struct nilfs_root *root)
 {
-	struct the_nilfs *nilfs = sbi->s_nilfs;
 	int err;
 
 	if (NILFS_SC(sbi)) {
@@ -2798,14 +2814,12 @@ int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi)
 		nilfs_detach_segment_constructor(sbi);
 	}
 
-	sbi->s_sc_info = nilfs_segctor_new(sbi);
+	sbi->s_sc_info = nilfs_segctor_new(sbi, root);
 	if (!sbi->s_sc_info)
 		return -ENOMEM;
 
-	nilfs_attach_writer(nilfs, sbi);
 	err = nilfs_segctor_start_thread(NILFS_SC(sbi));
 	if (err) {
-		nilfs_detach_writer(nilfs, sbi);
 		kfree(sbi->s_sc_info);
 		sbi->s_sc_info = NULL;
 	}
@@ -2842,5 +2856,4 @@ void nilfs_detach_segment_constructor(struct nilfs_sb_info *sbi)
 	up_write(&nilfs->ns_segctor_sem);
 
 	nilfs_dispose_list(sbi, &garbage_list, 1);
-	nilfs_detach_writer(nilfs, sbi);
 }
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index 01e20dbb217..cd8056e7cbe 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -29,6 +29,8 @@
 #include <linux/nilfs2_fs.h>
 #include "sb.h"
 
+struct nilfs_root;
+
 /**
  * struct nilfs_recovery_info - Recovery information
  * @ri_need_recovery: Recovery status
@@ -87,6 +89,7 @@ struct nilfs_segsum_pointer {
  * struct nilfs_sc_info - Segment constructor information
  * @sc_super: Back pointer to super_block struct
  * @sc_sbi: Back pointer to nilfs_sb_info struct
+ * @sc_root: root object of the current filesystem tree
  * @sc_nblk_inc: Block count of current generation
  * @sc_dirty_files: List of files to be written
  * @sc_gc_inodes: List of GC inodes having blocks to be written
@@ -107,6 +110,7 @@ struct nilfs_segsum_pointer {
  * @sc_datablk_cnt: Data block count of a file
  * @sc_nblk_this_inc: Number of blocks included in the current logical segment
  * @sc_seg_ctime: Creation time
+ * @sc_cno: checkpoint number of current log
  * @sc_flags: Internal flags
  * @sc_state_lock: spinlock for sc_state and so on
  * @sc_state: Segctord state flags
@@ -128,6 +132,7 @@ struct nilfs_segsum_pointer {
 struct nilfs_sc_info {
 	struct super_block     *sc_super;
 	struct nilfs_sb_info   *sc_sbi;
+	struct nilfs_root      *sc_root;
 
 	unsigned long		sc_nblk_inc;
 
@@ -156,7 +161,7 @@ struct nilfs_sc_info {
 	unsigned long		sc_datablk_cnt;
 	unsigned long		sc_nblk_this_inc;
 	time_t			sc_seg_ctime;
-
+	__u64			sc_cno;
 	unsigned long		sc_flags;
 
 	spinlock_t		sc_state_lock;
@@ -230,17 +235,18 @@ extern void nilfs_flush_segment(struct super_block *, ino_t);
 extern int nilfs_clean_segments(struct super_block *, struct nilfs_argv *,
 				void **);
 
-extern int nilfs_attach_segment_constructor(struct nilfs_sb_info *);
+int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi,
+				     struct nilfs_root *root);
 extern void nilfs_detach_segment_constructor(struct nilfs_sb_info *);
 
 /* recovery.c */
-extern int nilfs_read_super_root_block(struct super_block *, sector_t,
+extern int nilfs_read_super_root_block(struct the_nilfs *, sector_t,
 				       struct buffer_head **, int);
-extern int nilfs_search_super_root(struct the_nilfs *, struct nilfs_sb_info *,
+extern int nilfs_search_super_root(struct the_nilfs *,
 				   struct nilfs_recovery_info *);
-extern int nilfs_recover_logical_segments(struct the_nilfs *,
-					  struct nilfs_sb_info *,
-					  struct nilfs_recovery_info *);
+extern int nilfs_salvage_orphan_logs(struct the_nilfs *,
+				     struct nilfs_sb_info *,
+				     struct nilfs_recovery_info *);
 extern void nilfs_dispose_segment_list(struct list_head *);
 
 #endif /* _NILFS_SEGMENT_H */
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 3c6cc6005c2..1d6f488ccae 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -505,7 +505,7 @@ int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat)
 {
 	struct buffer_head *header_bh;
 	struct nilfs_sufile_header *header;
-	struct the_nilfs *nilfs = NILFS_MDT(sufile)->mi_nilfs;
+	struct the_nilfs *nilfs = NILFS_I_NILFS(sufile);
 	void *kaddr;
 	int ret;
 
@@ -583,7 +583,7 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf,
 	struct nilfs_segment_usage *su;
 	struct nilfs_suinfo *si = buf;
 	size_t susz = NILFS_MDT(sufile)->mi_entry_size;
-	struct the_nilfs *nilfs = NILFS_MDT(sufile)->mi_nilfs;
+	struct the_nilfs *nilfs = NILFS_I_NILFS(sufile);
 	void *kaddr;
 	unsigned long nsegs, segusages_per_block;
 	ssize_t n;
@@ -635,46 +635,55 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf,
 }
 
 /**
- * nilfs_sufile_read - read sufile inode
- * @sufile: sufile inode
+ * nilfs_sufile_read - read or get sufile inode
+ * @sb: super block instance
+ * @susize: size of a segment usage entry
  * @raw_inode: on-disk sufile inode
+ * @inodep: buffer to store the inode
  */
-int nilfs_sufile_read(struct inode *sufile, struct nilfs_inode *raw_inode)
+int nilfs_sufile_read(struct super_block *sb, size_t susize,
+		      struct nilfs_inode *raw_inode, struct inode **inodep)
 {
-	struct nilfs_sufile_info *sui = NILFS_SUI(sufile);
+	struct inode *sufile;
+	struct nilfs_sufile_info *sui;
 	struct buffer_head *header_bh;
 	struct nilfs_sufile_header *header;
 	void *kaddr;
-	int ret;
+	int err;
 
-	ret = nilfs_read_inode_common(sufile, raw_inode);
-	if (ret < 0)
-		return ret;
+	sufile = nilfs_iget_locked(sb, NULL, NILFS_SUFILE_INO);
+	if (unlikely(!sufile))
+		return -ENOMEM;
+	if (!(sufile->i_state & I_NEW))
+		goto out;
 
-	ret = nilfs_sufile_get_header_block(sufile, &header_bh);
-	if (!ret) {
-		kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
-		header = kaddr + bh_offset(header_bh);
-		sui->ncleansegs = le64_to_cpu(header->sh_ncleansegs);
-		kunmap_atomic(kaddr, KM_USER0);
-		brelse(header_bh);
-	}
-	return ret;
-}
+	err = nilfs_mdt_init(sufile, NILFS_MDT_GFP, sizeof(*sui));
+	if (err)
+		goto failed;
 
-/**
- * nilfs_sufile_new - create sufile
- * @nilfs: nilfs object
- * @susize: size of a segment usage entry
- */
-struct inode *nilfs_sufile_new(struct the_nilfs *nilfs, size_t susize)
-{
-	struct inode *sufile;
+	nilfs_mdt_set_entry_size(sufile, susize,
+				 sizeof(struct nilfs_sufile_header));
+
+	err = nilfs_read_inode_common(sufile, raw_inode);
+	if (err)
+		goto failed;
+
+	err = nilfs_sufile_get_header_block(sufile, &header_bh);
+	if (err)
+		goto failed;
 
-	sufile = nilfs_mdt_new(nilfs, NULL, NILFS_SUFILE_INO,
-			       sizeof(struct nilfs_sufile_info));
-	if (sufile)
-		nilfs_mdt_set_entry_size(sufile, susize,
-					 sizeof(struct nilfs_sufile_header));
-	return sufile;
+	sui = NILFS_SUI(sufile);
+	kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
+	header = kaddr + bh_offset(header_bh);
+	sui->ncleansegs = le64_to_cpu(header->sh_ncleansegs);
+	kunmap_atomic(kaddr, KM_USER0);
+	brelse(header_bh);
+
+	unlock_new_inode(sufile);
+ out:
+	*inodep = sufile;
+	return 0;
+ failed:
+	iget_failed(sufile);
+	return err;
 }
diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h
index 15163b8aff7..a943fbacb45 100644
--- a/fs/nilfs2/sufile.h
+++ b/fs/nilfs2/sufile.h
@@ -31,7 +31,7 @@
 
 static inline unsigned long nilfs_sufile_get_nsegments(struct inode *sufile)
 {
-	return NILFS_MDT(sufile)->mi_nilfs->ns_nsegments;
+	return NILFS_I_NILFS(sufile)->ns_nsegments;
 }
 
 unsigned long nilfs_sufile_get_ncleansegs(struct inode *sufile);
@@ -61,8 +61,8 @@ void nilfs_sufile_do_cancel_free(struct inode *, __u64, struct buffer_head *,
 void nilfs_sufile_do_set_error(struct inode *, __u64, struct buffer_head *,
 			       struct buffer_head *);
 
-int nilfs_sufile_read(struct inode *sufile, struct nilfs_inode *raw_inode);
-struct inode *nilfs_sufile_new(struct the_nilfs *nilfs, size_t susize);
+int nilfs_sufile_read(struct super_block *sb, size_t susize,
+		      struct nilfs_inode *raw_inode, struct inode **inodep);
 
 /**
  * nilfs_sufile_scrap - make a segment garbage
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 414ef68931c..f804d41ec9d 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -45,16 +45,17 @@
 #include <linux/parser.h>
 #include <linux/random.h>
 #include <linux/crc32.h>
-#include <linux/smp_lock.h>
 #include <linux/vfs.h>
 #include <linux/writeback.h>
 #include <linux/kobject.h>
-#include <linux/exportfs.h>
 #include <linux/seq_file.h>
 #include <linux/mount.h>
 #include "nilfs.h"
+#include "export.h"
 #include "mdt.h"
 #include "alloc.h"
+#include "btree.h"
+#include "btnode.h"
 #include "page.h"
 #include "cpfile.h"
 #include "ifile.h"
@@ -67,13 +68,33 @@ MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem "
 		   "(NILFS)");
 MODULE_LICENSE("GPL");
 
-struct kmem_cache *nilfs_inode_cachep;
+static struct kmem_cache *nilfs_inode_cachep;
 struct kmem_cache *nilfs_transaction_cachep;
 struct kmem_cache *nilfs_segbuf_cachep;
 struct kmem_cache *nilfs_btree_path_cache;
 
+static int nilfs_setup_super(struct nilfs_sb_info *sbi, int is_mount);
 static int nilfs_remount(struct super_block *sb, int *flags, char *data);
 
+static void nilfs_set_error(struct nilfs_sb_info *sbi)
+{
+	struct the_nilfs *nilfs = sbi->s_nilfs;
+	struct nilfs_super_block **sbp;
+
+	down_write(&nilfs->ns_sem);
+	if (!(nilfs->ns_mount_state & NILFS_ERROR_FS)) {
+		nilfs->ns_mount_state |= NILFS_ERROR_FS;
+		sbp = nilfs_prepare_super(sbi, 0);
+		if (likely(sbp)) {
+			sbp[0]->s_state |= cpu_to_le16(NILFS_ERROR_FS);
+			if (sbp[1])
+				sbp[1]->s_state |= cpu_to_le16(NILFS_ERROR_FS);
+			nilfs_commit_super(sbi, NILFS_SB_COMMIT_ALL);
+		}
+	}
+	up_write(&nilfs->ns_sem);
+}
+
 /**
  * nilfs_error() - report failure condition on a filesystem
  *
@@ -99,16 +120,7 @@ void nilfs_error(struct super_block *sb, const char *function,
 	va_end(args);
 
 	if (!(sb->s_flags & MS_RDONLY)) {
-		struct the_nilfs *nilfs = sbi->s_nilfs;
-
-		down_write(&nilfs->ns_sem);
-		if (!(nilfs->ns_mount_state & NILFS_ERROR_FS)) {
-			nilfs->ns_mount_state |= NILFS_ERROR_FS;
-			nilfs->ns_sbp[0]->s_state |=
-				cpu_to_le16(NILFS_ERROR_FS);
-			nilfs_commit_super(sbi, 1);
-		}
-		up_write(&nilfs->ns_sem);
+		nilfs_set_error(sbi);
 
 		if (nilfs_test_opt(sbi, ERRORS_RO)) {
 			printk(KERN_CRIT "Remounting filesystem read-only\n");
@@ -135,7 +147,7 @@ void nilfs_warning(struct super_block *sb, const char *function,
 }
 
 
-struct inode *nilfs_alloc_inode_common(struct the_nilfs *nilfs)
+struct inode *nilfs_alloc_inode(struct super_block *sb)
 {
 	struct nilfs_inode_info *ii;
 
@@ -144,70 +156,55 @@ struct inode *nilfs_alloc_inode_common(struct the_nilfs *nilfs)
 		return NULL;
 	ii->i_bh = NULL;
 	ii->i_state = 0;
+	ii->i_cno = 0;
 	ii->vfs_inode.i_version = 1;
-	nilfs_btnode_cache_init(&ii->i_btnode_cache, nilfs->ns_bdi);
+	nilfs_btnode_cache_init(&ii->i_btnode_cache, sb->s_bdi);
 	return &ii->vfs_inode;
 }
 
-struct inode *nilfs_alloc_inode(struct super_block *sb)
-{
-	return nilfs_alloc_inode_common(NILFS_SB(sb)->s_nilfs);
-}
-
 void nilfs_destroy_inode(struct inode *inode)
 {
-	kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode));
-}
-
-static void nilfs_clear_inode(struct inode *inode)
-{
-	struct nilfs_inode_info *ii = NILFS_I(inode);
-
-	/*
-	 * Free resources allocated in nilfs_read_inode(), here.
-	 */
-	BUG_ON(!list_empty(&ii->i_dirty));
-	brelse(ii->i_bh);
-	ii->i_bh = NULL;
-
-	if (test_bit(NILFS_I_BMAP, &ii->i_state))
-		nilfs_bmap_clear(ii->i_bmap);
+	struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
 
-	nilfs_btnode_cache_clear(&ii->i_btnode_cache);
+	if (mdi) {
+		kfree(mdi->mi_bgl); /* kfree(NULL) is safe */
+		kfree(mdi);
+	}
+	kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode));
 }
 
-static int nilfs_sync_super(struct nilfs_sb_info *sbi, int dupsb)
+static int nilfs_sync_super(struct nilfs_sb_info *sbi, int flag)
 {
 	struct the_nilfs *nilfs = sbi->s_nilfs;
 	int err;
-	int barrier_done = 0;
 
-	if (nilfs_test_opt(sbi, BARRIER)) {
-		set_buffer_ordered(nilfs->ns_sbh[0]);
-		barrier_done = 1;
-	}
  retry:
 	set_buffer_dirty(nilfs->ns_sbh[0]);
-	err = sync_dirty_buffer(nilfs->ns_sbh[0]);
-	if (err == -EOPNOTSUPP && barrier_done) {
-		nilfs_warning(sbi->s_super, __func__,
-			      "barrier-based sync failed. "
-			      "disabling barriers\n");
-		nilfs_clear_opt(sbi, BARRIER);
-		barrier_done = 0;
-		clear_buffer_ordered(nilfs->ns_sbh[0]);
-		goto retry;
+	if (nilfs_test_opt(sbi, BARRIER)) {
+		err = __sync_dirty_buffer(nilfs->ns_sbh[0],
+					  WRITE_SYNC | WRITE_FLUSH_FUA);
+	} else {
+		err = sync_dirty_buffer(nilfs->ns_sbh[0]);
 	}
+
 	if (unlikely(err)) {
 		printk(KERN_ERR
 		       "NILFS: unable to write superblock (err=%d)\n", err);
 		if (err == -EIO && nilfs->ns_sbh[1]) {
+			/*
+			 * sbp[0] points to newer log than sbp[1],
+			 * so copy sbp[0] to sbp[1] to take over sbp[0].
+			 */
+			memcpy(nilfs->ns_sbp[1], nilfs->ns_sbp[0],
+			       nilfs->ns_sbsize);
 			nilfs_fall_back_super_block(nilfs);
 			goto retry;
 		}
 	} else {
 		struct nilfs_super_block *sbp = nilfs->ns_sbp[0];
 
+		nilfs->ns_sbwcount++;
+
 		/*
 		 * The latest segment becomes trailable from the position
 		 * written in superblock.
@@ -216,66 +213,122 @@ static int nilfs_sync_super(struct nilfs_sb_info *sbi, int dupsb)
 
 		/* update GC protection for recent segments */
 		if (nilfs->ns_sbh[1]) {
-			sbp = NULL;
-			if (dupsb) {
+			if (flag == NILFS_SB_COMMIT_ALL) {
 				set_buffer_dirty(nilfs->ns_sbh[1]);
-				if (!sync_dirty_buffer(nilfs->ns_sbh[1]))
-					sbp = nilfs->ns_sbp[1];
+				if (sync_dirty_buffer(nilfs->ns_sbh[1]) < 0)
+					goto out;
 			}
+			if (le64_to_cpu(nilfs->ns_sbp[1]->s_last_cno) <
+			    le64_to_cpu(nilfs->ns_sbp[0]->s_last_cno))
+				sbp = nilfs->ns_sbp[1];
 		}
-		if (sbp) {
-			spin_lock(&nilfs->ns_last_segment_lock);
-			nilfs->ns_prot_seq = le64_to_cpu(sbp->s_last_seq);
-			spin_unlock(&nilfs->ns_last_segment_lock);
-		}
-	}
 
+		spin_lock(&nilfs->ns_last_segment_lock);
+		nilfs->ns_prot_seq = le64_to_cpu(sbp->s_last_seq);
+		spin_unlock(&nilfs->ns_last_segment_lock);
+	}
+ out:
 	return err;
 }
 
-int nilfs_commit_super(struct nilfs_sb_info *sbi, int dupsb)
+void nilfs_set_log_cursor(struct nilfs_super_block *sbp,
+			  struct the_nilfs *nilfs)
+{
+	sector_t nfreeblocks;
+
+	/* nilfs->ns_sem must be locked by the caller. */
+	nilfs_count_free_blocks(nilfs, &nfreeblocks);
+	sbp->s_free_blocks_count = cpu_to_le64(nfreeblocks);
+
+	spin_lock(&nilfs->ns_last_segment_lock);
+	sbp->s_last_seq = cpu_to_le64(nilfs->ns_last_seq);
+	sbp->s_last_pseg = cpu_to_le64(nilfs->ns_last_pseg);
+	sbp->s_last_cno = cpu_to_le64(nilfs->ns_last_cno);
+	spin_unlock(&nilfs->ns_last_segment_lock);
+}
+
+struct nilfs_super_block **nilfs_prepare_super(struct nilfs_sb_info *sbi,
+					       int flip)
 {
 	struct the_nilfs *nilfs = sbi->s_nilfs;
 	struct nilfs_super_block **sbp = nilfs->ns_sbp;
-	sector_t nfreeblocks;
-	time_t t;
-	int err;
 
-	/* nilfs->sem must be locked by the caller. */
+	/* nilfs->ns_sem must be locked by the caller. */
 	if (sbp[0]->s_magic != cpu_to_le16(NILFS_SUPER_MAGIC)) {
-		if (sbp[1] && sbp[1]->s_magic == cpu_to_le16(NILFS_SUPER_MAGIC))
-			nilfs_swap_super_block(nilfs);
-		else {
+		if (sbp[1] &&
+		    sbp[1]->s_magic == cpu_to_le16(NILFS_SUPER_MAGIC)) {
+			memcpy(sbp[0], sbp[1], nilfs->ns_sbsize);
+		} else {
 			printk(KERN_CRIT "NILFS: superblock broke on dev %s\n",
 			       sbi->s_super->s_id);
-			return -EIO;
+			return NULL;
 		}
+	} else if (sbp[1] &&
+		   sbp[1]->s_magic != cpu_to_le16(NILFS_SUPER_MAGIC)) {
+			memcpy(sbp[1], sbp[0], nilfs->ns_sbsize);
 	}
-	err = nilfs_count_free_blocks(nilfs, &nfreeblocks);
-	if (unlikely(err)) {
-		printk(KERN_ERR "NILFS: failed to count free blocks\n");
-		return err;
-	}
-	spin_lock(&nilfs->ns_last_segment_lock);
-	sbp[0]->s_last_seq = cpu_to_le64(nilfs->ns_last_seq);
-	sbp[0]->s_last_pseg = cpu_to_le64(nilfs->ns_last_pseg);
-	sbp[0]->s_last_cno = cpu_to_le64(nilfs->ns_last_cno);
-	spin_unlock(&nilfs->ns_last_segment_lock);
 
+	if (flip && sbp[1])
+		nilfs_swap_super_block(nilfs);
+
+	return sbp;
+}
+
+int nilfs_commit_super(struct nilfs_sb_info *sbi, int flag)
+{
+	struct the_nilfs *nilfs = sbi->s_nilfs;
+	struct nilfs_super_block **sbp = nilfs->ns_sbp;
+	time_t t;
+
+	/* nilfs->ns_sem must be locked by the caller. */
 	t = get_seconds();
-	nilfs->ns_sbwtime[0] = t;
-	sbp[0]->s_free_blocks_count = cpu_to_le64(nfreeblocks);
+	nilfs->ns_sbwtime = t;
 	sbp[0]->s_wtime = cpu_to_le64(t);
 	sbp[0]->s_sum = 0;
 	sbp[0]->s_sum = cpu_to_le32(crc32_le(nilfs->ns_crc_seed,
 					     (unsigned char *)sbp[0],
 					     nilfs->ns_sbsize));
-	if (dupsb && sbp[1]) {
-		memcpy(sbp[1], sbp[0], nilfs->ns_sbsize);
-		nilfs->ns_sbwtime[1] = t;
+	if (flag == NILFS_SB_COMMIT_ALL && sbp[1]) {
+		sbp[1]->s_wtime = sbp[0]->s_wtime;
+		sbp[1]->s_sum = 0;
+		sbp[1]->s_sum = cpu_to_le32(crc32_le(nilfs->ns_crc_seed,
+					    (unsigned char *)sbp[1],
+					    nilfs->ns_sbsize));
 	}
 	clear_nilfs_sb_dirty(nilfs);
-	return nilfs_sync_super(sbi, dupsb);
+	return nilfs_sync_super(sbi, flag);
+}
+
+/**
+ * nilfs_cleanup_super() - write filesystem state for cleanup
+ * @sbi: nilfs_sb_info to be unmounted or degraded to read-only
+ *
+ * This function restores state flags in the on-disk super block.
+ * This will set "clean" flag (i.e. NILFS_VALID_FS) unless the
+ * filesystem was not clean previously.
+ */
+int nilfs_cleanup_super(struct nilfs_sb_info *sbi)
+{
+	struct nilfs_super_block **sbp;
+	int flag = NILFS_SB_COMMIT;
+	int ret = -EIO;
+
+	sbp = nilfs_prepare_super(sbi, 0);
+	if (sbp) {
+		sbp[0]->s_state = cpu_to_le16(sbi->s_nilfs->ns_mount_state);
+		nilfs_set_log_cursor(sbp[0], sbi->s_nilfs);
+		if (sbp[1] && sbp[0]->s_last_cno == sbp[1]->s_last_cno) {
+			/*
+			 * make the "clean" flag also to the opposite
+			 * super block if both super blocks point to
+			 * the same checkpoint.
+			 */
+			sbp[1]->s_state = sbp[0]->s_state;
+			flag = NILFS_SB_COMMIT_ALL;
+		}
+		ret = nilfs_commit_super(sbi, flag);
+	}
+	return ret;
 }
 
 static void nilfs_put_super(struct super_block *sb)
@@ -283,34 +336,29 @@ static void nilfs_put_super(struct super_block *sb)
 	struct nilfs_sb_info *sbi = NILFS_SB(sb);
 	struct the_nilfs *nilfs = sbi->s_nilfs;
 
-	lock_kernel();
-
 	nilfs_detach_segment_constructor(sbi);
 
 	if (!(sb->s_flags & MS_RDONLY)) {
 		down_write(&nilfs->ns_sem);
-		nilfs->ns_sbp[0]->s_state = cpu_to_le16(nilfs->ns_mount_state);
-		nilfs_commit_super(sbi, 1);
+		nilfs_cleanup_super(sbi);
 		up_write(&nilfs->ns_sem);
 	}
-	down_write(&nilfs->ns_super_sem);
-	if (nilfs->ns_current == sbi)
-		nilfs->ns_current = NULL;
-	up_write(&nilfs->ns_super_sem);
 
-	nilfs_detach_checkpoint(sbi);
-	put_nilfs(sbi->s_nilfs);
+	iput(nilfs->ns_sufile);
+	iput(nilfs->ns_cpfile);
+	iput(nilfs->ns_dat);
+
+	destroy_nilfs(nilfs);
 	sbi->s_super = NULL;
 	sb->s_fs_info = NULL;
-	nilfs_put_sbinfo(sbi);
-
-	unlock_kernel();
+	kfree(sbi);
 }
 
 static int nilfs_sync_fs(struct super_block *sb, int wait)
 {
 	struct nilfs_sb_info *sbi = NILFS_SB(sb);
 	struct the_nilfs *nilfs = sbi->s_nilfs;
+	struct nilfs_super_block **sbp;
 	int err = 0;
 
 	/* This function is called when super block should be written back */
@@ -318,27 +366,34 @@ static int nilfs_sync_fs(struct super_block *sb, int wait)
 		err = nilfs_construct_segment(sb);
 
 	down_write(&nilfs->ns_sem);
-	if (nilfs_sb_dirty(nilfs))
-		nilfs_commit_super(sbi, 1);
+	if (nilfs_sb_dirty(nilfs)) {
+		sbp = nilfs_prepare_super(sbi, nilfs_sb_will_flip(nilfs));
+		if (likely(sbp)) {
+			nilfs_set_log_cursor(sbp[0], nilfs);
+			nilfs_commit_super(sbi, NILFS_SB_COMMIT);
+		}
+	}
 	up_write(&nilfs->ns_sem);
 
 	return err;
 }
 
-int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno)
+int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno, int curr_mnt,
+			    struct nilfs_root **rootp)
 {
 	struct the_nilfs *nilfs = sbi->s_nilfs;
+	struct nilfs_root *root;
 	struct nilfs_checkpoint *raw_cp;
 	struct buffer_head *bh_cp;
-	int err;
+	int err = -ENOMEM;
 
-	down_write(&nilfs->ns_super_sem);
-	list_add(&sbi->s_list, &nilfs->ns_supers);
-	up_write(&nilfs->ns_super_sem);
+	root = nilfs_find_or_create_root(
+		nilfs, curr_mnt ? NILFS_CPTREE_CURRENT_CNO : cno);
+	if (!root)
+		return err;
 
-	sbi->s_ifile = nilfs_ifile_new(sbi, nilfs->ns_inode_size);
-	if (!sbi->s_ifile)
-		return -ENOMEM;
+	if (root->ifile)
+		goto reuse; /* already attached checkpoint */
 
 	down_read(&nilfs->ns_segctor_sem);
 	err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp,
@@ -354,44 +409,64 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno)
 		}
 		goto failed;
 	}
-	err = nilfs_read_inode_common(sbi->s_ifile, &raw_cp->cp_ifile_inode);
-	if (unlikely(err))
+
+	err = nilfs_ifile_read(sbi->s_super, root, nilfs->ns_inode_size,
+			       &raw_cp->cp_ifile_inode, &root->ifile);
+	if (err)
 		goto failed_bh;
-	atomic_set(&sbi->s_inodes_count, le64_to_cpu(raw_cp->cp_inodes_count));
-	atomic_set(&sbi->s_blocks_count, le64_to_cpu(raw_cp->cp_blocks_count));
+
+	atomic_set(&root->inodes_count, le64_to_cpu(raw_cp->cp_inodes_count));
+	atomic_set(&root->blocks_count, le64_to_cpu(raw_cp->cp_blocks_count));
 
 	nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp);
+
+ reuse:
+	*rootp = root;
 	return 0;
 
  failed_bh:
 	nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp);
  failed:
-	nilfs_mdt_destroy(sbi->s_ifile);
-	sbi->s_ifile = NULL;
+	nilfs_put_root(root);
 
-	down_write(&nilfs->ns_super_sem);
-	list_del_init(&sbi->s_list);
-	up_write(&nilfs->ns_super_sem);
+	return err;
+}
 
+static int nilfs_freeze(struct super_block *sb)
+{
+	struct nilfs_sb_info *sbi = NILFS_SB(sb);
+	struct the_nilfs *nilfs = sbi->s_nilfs;
+	int err;
+
+	if (sb->s_flags & MS_RDONLY)
+		return 0;
+
+	/* Mark super block clean */
+	down_write(&nilfs->ns_sem);
+	err = nilfs_cleanup_super(sbi);
+	up_write(&nilfs->ns_sem);
 	return err;
 }
 
-void nilfs_detach_checkpoint(struct nilfs_sb_info *sbi)
+static int nilfs_unfreeze(struct super_block *sb)
 {
+	struct nilfs_sb_info *sbi = NILFS_SB(sb);
 	struct the_nilfs *nilfs = sbi->s_nilfs;
 
-	nilfs_mdt_destroy(sbi->s_ifile);
-	sbi->s_ifile = NULL;
-	down_write(&nilfs->ns_super_sem);
-	list_del_init(&sbi->s_list);
-	up_write(&nilfs->ns_super_sem);
+	if (sb->s_flags & MS_RDONLY)
+		return 0;
+
+	down_write(&nilfs->ns_sem);
+	nilfs_setup_super(sbi, false);
+	up_write(&nilfs->ns_sem);
+	return 0;
 }
 
 static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *sb = dentry->d_sb;
-	struct nilfs_sb_info *sbi = NILFS_SB(sb);
-	struct the_nilfs *nilfs = sbi->s_nilfs;
+	struct nilfs_root *root = NILFS_I(dentry->d_inode)->i_root;
+	struct the_nilfs *nilfs = root->nilfs;
 	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
 	unsigned long long blocks;
 	unsigned long overhead;
@@ -427,7 +502,7 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_bfree = nfreeblocks;
 	buf->f_bavail = (buf->f_bfree >= nrsvblocks) ?
 		(buf->f_bfree - nrsvblocks) : 0;
-	buf->f_files = atomic_read(&sbi->s_inodes_count);
+	buf->f_files = atomic_read(&root->inodes_count);
 	buf->f_ffree = 0; /* nilfs_count_free_inodes(sb); */
 	buf->f_namelen = NILFS_NAME_LEN;
 	buf->f_fsid.val[0] = (u32)id;
@@ -440,22 +515,22 @@ static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
 {
 	struct super_block *sb = vfs->mnt_sb;
 	struct nilfs_sb_info *sbi = NILFS_SB(sb);
+	struct nilfs_root *root = NILFS_I(vfs->mnt_root->d_inode)->i_root;
 
 	if (!nilfs_test_opt(sbi, BARRIER))
-		seq_printf(seq, ",nobarrier");
-	if (nilfs_test_opt(sbi, SNAPSHOT))
-		seq_printf(seq, ",cp=%llu",
-			   (unsigned long long int)sbi->s_snapshot_cno);
+		seq_puts(seq, ",nobarrier");
+	if (root->cno != NILFS_CPTREE_CURRENT_CNO)
+		seq_printf(seq, ",cp=%llu", (unsigned long long)root->cno);
 	if (nilfs_test_opt(sbi, ERRORS_PANIC))
-		seq_printf(seq, ",errors=panic");
+		seq_puts(seq, ",errors=panic");
 	if (nilfs_test_opt(sbi, ERRORS_CONT))
-		seq_printf(seq, ",errors=continue");
+		seq_puts(seq, ",errors=continue");
 	if (nilfs_test_opt(sbi, STRICT_ORDER))
-		seq_printf(seq, ",order=strict");
+		seq_puts(seq, ",order=strict");
 	if (nilfs_test_opt(sbi, NORECOVERY))
-		seq_printf(seq, ",norecovery");
+		seq_puts(seq, ",norecovery");
 	if (nilfs_test_opt(sbi, DISCARD))
-		seq_printf(seq, ",discard");
+		seq_puts(seq, ",discard");
 
 	return 0;
 }
@@ -467,85 +542,45 @@ static const struct super_operations nilfs_sops = {
 	/* .write_inode    = nilfs_write_inode, */
 	/* .put_inode      = nilfs_put_inode, */
 	/* .drop_inode	  = nilfs_drop_inode, */
-	.delete_inode   = nilfs_delete_inode,
+	.evict_inode    = nilfs_evict_inode,
 	.put_super      = nilfs_put_super,
 	/* .write_super    = nilfs_write_super, */
 	.sync_fs        = nilfs_sync_fs,
+	.freeze_fs	= nilfs_freeze,
+	.unfreeze_fs	= nilfs_unfreeze,
 	/* .write_super_lockfs */
 	/* .unlockfs */
 	.statfs         = nilfs_statfs,
 	.remount_fs     = nilfs_remount,
-	.clear_inode    = nilfs_clear_inode,
 	/* .umount_begin */
 	.show_options = nilfs_show_options
 };
 
-static struct inode *
-nilfs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation)
-{
-	struct inode *inode;
-
-	if (ino < NILFS_FIRST_INO(sb) && ino != NILFS_ROOT_INO &&
-	    ino != NILFS_SKETCH_INO)
-		return ERR_PTR(-ESTALE);
-
-	inode = nilfs_iget(sb, ino);
-	if (IS_ERR(inode))
-		return ERR_CAST(inode);
-	if (generation && inode->i_generation != generation) {
-		iput(inode);
-		return ERR_PTR(-ESTALE);
-	}
-
-	return inode;
-}
-
-static struct dentry *
-nilfs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len,
-		   int fh_type)
-{
-	return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
-				    nilfs_nfs_get_inode);
-}
-
-static struct dentry *
-nilfs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len,
-		   int fh_type)
-{
-	return generic_fh_to_parent(sb, fid, fh_len, fh_type,
-				    nilfs_nfs_get_inode);
-}
-
-static const struct export_operations nilfs_export_ops = {
-	.fh_to_dentry = nilfs_fh_to_dentry,
-	.fh_to_parent = nilfs_fh_to_parent,
-	.get_parent = nilfs_get_parent,
-};
-
 enum {
 	Opt_err_cont, Opt_err_panic, Opt_err_ro,
-	Opt_nobarrier, Opt_snapshot, Opt_order, Opt_norecovery,
-	Opt_discard, Opt_err,
+	Opt_barrier, Opt_nobarrier, Opt_snapshot, Opt_order, Opt_norecovery,
+	Opt_discard, Opt_nodiscard, Opt_err,
 };
 
 static match_table_t tokens = {
 	{Opt_err_cont, "errors=continue"},
 	{Opt_err_panic, "errors=panic"},
 	{Opt_err_ro, "errors=remount-ro"},
+	{Opt_barrier, "barrier"},
 	{Opt_nobarrier, "nobarrier"},
 	{Opt_snapshot, "cp=%u"},
 	{Opt_order, "order=%s"},
 	{Opt_norecovery, "norecovery"},
 	{Opt_discard, "discard"},
+	{Opt_nodiscard, "nodiscard"},
 	{Opt_err, NULL}
 };
 
-static int parse_options(char *options, struct super_block *sb)
+static int parse_options(char *options, struct super_block *sb, int is_remount)
 {
 	struct nilfs_sb_info *sbi = NILFS_SB(sb);
 	char *p;
 	substring_t args[MAX_OPT_ARGS];
-	int option;
 
 	if (!options)
 		return 1;
@@ -557,6 +592,9 @@ static int parse_options(char *options, struct super_block *sb)
 
 		token = match_token(p, tokens, args);
 		switch (token) {
+		case Opt_barrier:
+			nilfs_set_opt(sbi, BARRIER);
+			break;
 		case Opt_nobarrier:
 			nilfs_clear_opt(sbi, BARRIER);
 			break;
@@ -580,12 +618,12 @@ static int parse_options(char *options, struct super_block *sb)
 			nilfs_write_opt(sbi, ERROR_MODE, ERRORS_CONT);
 			break;
 		case Opt_snapshot:
-			if (match_int(&args[0], &option) || option <= 0)
-				return 0;
-			if (!(sb->s_flags & MS_RDONLY))
+			if (is_remount) {
+				printk(KERN_ERR
+				       "NILFS: \"%s\" option is invalid "
+				       "for remount.\n", p);
 				return 0;
-			sbi->s_snapshot_cno = option;
-			nilfs_set_opt(sbi, SNAPSHOT);
+			}
 			break;
 		case Opt_norecovery:
 			nilfs_set_opt(sbi, NORECOVERY);
@@ -593,6 +631,9 @@ static int parse_options(char *options, struct super_block *sb)
 		case Opt_discard:
 			nilfs_set_opt(sbi, DISCARD);
 			break;
+		case Opt_nodiscard:
+			nilfs_clear_opt(sbi, DISCARD);
+			break;
 		default:
 			printk(KERN_ERR
 			       "NILFS: Unrecognized mount option \"%s\"\n", p);
@@ -610,14 +651,24 @@ nilfs_set_default_options(struct nilfs_sb_info *sbi,
 		NILFS_MOUNT_ERRORS_RO | NILFS_MOUNT_BARRIER;
 }
 
-static int nilfs_setup_super(struct nilfs_sb_info *sbi)
+static int nilfs_setup_super(struct nilfs_sb_info *sbi, int is_mount)
 {
 	struct the_nilfs *nilfs = sbi->s_nilfs;
-	struct nilfs_super_block *sbp = nilfs->ns_sbp[0];
-	int max_mnt_count = le16_to_cpu(sbp->s_max_mnt_count);
-	int mnt_count = le16_to_cpu(sbp->s_mnt_count);
+	struct nilfs_super_block **sbp;
+	int max_mnt_count;
+	int mnt_count;
+
+	/* nilfs->ns_sem must be locked by the caller. */
+	sbp = nilfs_prepare_super(sbi, 0);
+	if (!sbp)
+		return -EIO;
+
+	if (!is_mount)
+		goto skip_mount_setup;
+
+	max_mnt_count = le16_to_cpu(sbp[0]->s_max_mnt_count);
+	mnt_count = le16_to_cpu(sbp[0]->s_mnt_count);
 
-	/* nilfs->sem must be locked by the caller. */
 	if (nilfs->ns_mount_state & NILFS_ERROR_FS) {
 		printk(KERN_WARNING
 		       "NILFS warning: mounting fs with errors\n");
@@ -628,12 +679,17 @@ static int nilfs_setup_super(struct nilfs_sb_info *sbi)
 #endif
 	}
 	if (!max_mnt_count)
-		sbp->s_max_mnt_count = cpu_to_le16(NILFS_DFL_MAX_MNT_COUNT);
+		sbp[0]->s_max_mnt_count = cpu_to_le16(NILFS_DFL_MAX_MNT_COUNT);
+
+	sbp[0]->s_mnt_count = cpu_to_le16(mnt_count + 1);
+	sbp[0]->s_mtime = cpu_to_le64(get_seconds());
 
-	sbp->s_mnt_count = cpu_to_le16(mnt_count + 1);
-	sbp->s_state = cpu_to_le16(le16_to_cpu(sbp->s_state) & ~NILFS_VALID_FS);
-	sbp->s_mtime = cpu_to_le64(get_seconds());
-	return nilfs_commit_super(sbi, 1);
+skip_mount_setup:
+	sbp[0]->s_state =
+		cpu_to_le16(le16_to_cpu(sbp[0]->s_state) & ~NILFS_VALID_FS);
+	/* synchronize sbp[1] with sbp[0] */
+	memcpy(sbp[1], sbp[0], nilfs->ns_sbsize);
+	return nilfs_commit_super(sbi, NILFS_SB_COMMIT_ALL);
 }
 
 struct nilfs_super_block *nilfs_read_super_block(struct super_block *sb,
@@ -670,7 +726,165 @@ int nilfs_store_magic_and_option(struct super_block *sb,
 	sbi->s_interval = le32_to_cpu(sbp->s_c_interval);
 	sbi->s_watermark = le32_to_cpu(sbp->s_c_block_max);
 
-	return !parse_options(data, sb) ? -EINVAL : 0 ;
+	return !parse_options(data, sb, 0) ? -EINVAL : 0 ;
+}
+
+int nilfs_check_feature_compatibility(struct super_block *sb,
+				      struct nilfs_super_block *sbp)
+{
+	__u64 features;
+
+	features = le64_to_cpu(sbp->s_feature_incompat) &
+		~NILFS_FEATURE_INCOMPAT_SUPP;
+	if (features) {
+		printk(KERN_ERR "NILFS: couldn't mount because of unsupported "
+		       "optional features (%llx)\n",
+		       (unsigned long long)features);
+		return -EINVAL;
+	}
+	features = le64_to_cpu(sbp->s_feature_compat_ro) &
+		~NILFS_FEATURE_COMPAT_RO_SUPP;
+	if (!(sb->s_flags & MS_RDONLY) && features) {
+		printk(KERN_ERR "NILFS: couldn't mount RDWR because of "
+		       "unsupported optional features (%llx)\n",
+		       (unsigned long long)features);
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static int nilfs_get_root_dentry(struct super_block *sb,
+				 struct nilfs_root *root,
+				 struct dentry **root_dentry)
+{
+	struct inode *inode;
+	struct dentry *dentry;
+	int ret = 0;
+
+	inode = nilfs_iget(sb, root, NILFS_ROOT_INO);
+	if (IS_ERR(inode)) {
+		printk(KERN_ERR "NILFS: get root inode failed\n");
+		ret = PTR_ERR(inode);
+		goto out;
+	}
+	if (!S_ISDIR(inode->i_mode) || !inode->i_blocks || !inode->i_size) {
+		iput(inode);
+		printk(KERN_ERR "NILFS: corrupt root inode.\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (root->cno == NILFS_CPTREE_CURRENT_CNO) {
+		dentry = d_find_alias(inode);
+		if (!dentry) {
+			dentry = d_alloc_root(inode);
+			if (!dentry) {
+				iput(inode);
+				ret = -ENOMEM;
+				goto failed_dentry;
+			}
+		} else {
+			iput(inode);
+		}
+	} else {
+		dentry = d_obtain_alias(inode);
+		if (IS_ERR(dentry)) {
+			ret = PTR_ERR(dentry);
+			goto failed_dentry;
+		}
+	}
+	*root_dentry = dentry;
+ out:
+	return ret;
+
+ failed_dentry:
+	printk(KERN_ERR "NILFS: get root dentry failed\n");
+	goto out;
+}
+
+static int nilfs_attach_snapshot(struct super_block *s, __u64 cno,
+				 struct dentry **root_dentry)
+{
+	struct the_nilfs *nilfs = NILFS_SB(s)->s_nilfs;
+	struct nilfs_root *root;
+	int ret;
+
+	down_read(&nilfs->ns_segctor_sem);
+	ret = nilfs_cpfile_is_snapshot(nilfs->ns_cpfile, cno);
+	up_read(&nilfs->ns_segctor_sem);
+	if (ret < 0) {
+		ret = (ret == -ENOENT) ? -EINVAL : ret;
+		goto out;
+	} else if (!ret) {
+		printk(KERN_ERR "NILFS: The specified checkpoint is "
+		       "not a snapshot (checkpoint number=%llu).\n",
+		       (unsigned long long)cno);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = nilfs_attach_checkpoint(NILFS_SB(s), cno, false, &root);
+	if (ret) {
+		printk(KERN_ERR "NILFS: error loading snapshot "
+		       "(checkpoint number=%llu).\n",
+	       (unsigned long long)cno);
+		goto out;
+	}
+	ret = nilfs_get_root_dentry(s, root, root_dentry);
+	nilfs_put_root(root);
+ out:
+	return ret;
+}
+
+static int nilfs_tree_was_touched(struct dentry *root_dentry)
+{
+	return atomic_read(&root_dentry->d_count) > 1;
+}
+
+/**
+ * nilfs_try_to_shrink_tree() - try to shrink dentries of a checkpoint
+ * @root_dentry: root dentry of the tree to be shrunk
+ *
+ * This function returns true if the tree was in-use.
+ */
+static int nilfs_try_to_shrink_tree(struct dentry *root_dentry)
+{
+	if (have_submounts(root_dentry))
+		return true;
+	shrink_dcache_parent(root_dentry);
+	return nilfs_tree_was_touched(root_dentry);
+}
+
+int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno)
+{
+	struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
+	struct nilfs_root *root;
+	struct inode *inode;
+	struct dentry *dentry;
+	int ret;
+
+	if (cno < 0 || cno > nilfs->ns_cno)
+		return false;
+
+	if (cno >= nilfs_last_cno(nilfs))
+		return true;	/* protect recent checkpoints */
+
+	ret = false;
+	root = nilfs_lookup_root(NILFS_SB(sb)->s_nilfs, cno);
+	if (root) {
+		inode = nilfs_ilookup(sb, root, NILFS_ROOT_INO);
+		if (inode) {
+			dentry = d_find_alias(inode);
+			if (dentry) {
+				if (nilfs_tree_was_touched(dentry))
+					ret = nilfs_try_to_shrink_tree(dentry);
+				dput(dentry);
+			}
+			iput(inode);
+		}
+		nilfs_put_root(root);
+	}
+	return ret;
 }
 
 /**
@@ -678,17 +892,17 @@ int nilfs_store_magic_and_option(struct super_block *sb,
  * @sb: super_block
  * @data: mount options
  * @silent: silent mode flag
- * @nilfs: the_nilfs struct
  *
  * This function is called exclusively by nilfs->ns_mount_mutex.
  * So, the recovery process is protected from other simultaneous mounts.
  */
 static int
-nilfs_fill_super(struct super_block *sb, void *data, int silent,
-		 struct the_nilfs *nilfs)
+nilfs_fill_super(struct super_block *sb, void *data, int silent)
 {
+	struct the_nilfs *nilfs;
 	struct nilfs_sb_info *sbi;
-	struct inode *root;
+	struct nilfs_root *fsroot;
+	struct backing_dev_info *bdi;
 	__u64 cno;
 	int err;
 
@@ -697,19 +911,21 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
 		return -ENOMEM;
 
 	sb->s_fs_info = sbi;
+	sbi->s_super = sb;
 
-	get_nilfs(nilfs);
+	nilfs = alloc_nilfs(sb->s_bdev);
+	if (!nilfs) {
+		err = -ENOMEM;
+		goto failed_sbi;
+	}
 	sbi->s_nilfs = nilfs;
-	sbi->s_super = sb;
-	atomic_set(&sbi->s_count, 1);
 
 	err = init_nilfs(nilfs, sbi, (char *)data);
 	if (err)
-		goto failed_sbi;
+		goto failed_nilfs;
 
 	spin_lock_init(&sbi->s_inode_lock);
 	INIT_LIST_HEAD(&sbi->s_dirty_files);
-	INIT_LIST_HEAD(&sbi->s_list);
 
 	/*
 	 * Following initialization is overlapped because
@@ -725,141 +941,80 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
 	sb->s_export_op = &nilfs_export_ops;
 	sb->s_root = NULL;
 	sb->s_time_gran = 1;
-	sb->s_bdi = nilfs->ns_bdi;
+
+	bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
+	sb->s_bdi = bdi ? : &default_backing_dev_info;
 
 	err = load_nilfs(nilfs, sbi);
 	if (err)
-		goto failed_sbi;
+		goto failed_nilfs;
 
 	cno = nilfs_last_cno(nilfs);
-
-	if (sb->s_flags & MS_RDONLY) {
-		if (nilfs_test_opt(sbi, SNAPSHOT)) {
-			down_read(&nilfs->ns_segctor_sem);
-			err = nilfs_cpfile_is_snapshot(nilfs->ns_cpfile,
-						       sbi->s_snapshot_cno);
-			up_read(&nilfs->ns_segctor_sem);
-			if (err < 0) {
-				if (err == -ENOENT)
-					err = -EINVAL;
-				goto failed_sbi;
-			}
-			if (!err) {
-				printk(KERN_ERR
-				       "NILFS: The specified checkpoint is "
-				       "not a snapshot "
-				       "(checkpoint number=%llu).\n",
-				       (unsigned long long)sbi->s_snapshot_cno);
-				err = -EINVAL;
-				goto failed_sbi;
-			}
-			cno = sbi->s_snapshot_cno;
-		}
-	}
-
-	err = nilfs_attach_checkpoint(sbi, cno);
+	err = nilfs_attach_checkpoint(sbi, cno, true, &fsroot);
 	if (err) {
-		printk(KERN_ERR "NILFS: error loading a checkpoint"
-		       " (checkpoint number=%llu).\n", (unsigned long long)cno);
-		goto failed_sbi;
+		printk(KERN_ERR "NILFS: error loading last checkpoint "
+		       "(checkpoint number=%llu).\n", (unsigned long long)cno);
+		goto failed_unload;
 	}
 
 	if (!(sb->s_flags & MS_RDONLY)) {
-		err = nilfs_attach_segment_constructor(sbi);
+		err = nilfs_attach_segment_constructor(sbi, fsroot);
 		if (err)
 			goto failed_checkpoint;
 	}
 
-	root = nilfs_iget(sb, NILFS_ROOT_INO);
-	if (IS_ERR(root)) {
-		printk(KERN_ERR "NILFS: get root inode failed\n");
-		err = PTR_ERR(root);
-		goto failed_segctor;
-	}
-	if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
-		iput(root);
-		printk(KERN_ERR "NILFS: corrupt root inode.\n");
-		err = -EINVAL;
-		goto failed_segctor;
-	}
-	sb->s_root = d_alloc_root(root);
-	if (!sb->s_root) {
-		iput(root);
-		printk(KERN_ERR "NILFS: get root dentry failed\n");
-		err = -ENOMEM;
+	err = nilfs_get_root_dentry(sb, fsroot, &sb->s_root);
+	if (err)
 		goto failed_segctor;
-	}
+
+	nilfs_put_root(fsroot);
 
 	if (!(sb->s_flags & MS_RDONLY)) {
 		down_write(&nilfs->ns_sem);
-		nilfs_setup_super(sbi);
+		nilfs_setup_super(sbi, true);
 		up_write(&nilfs->ns_sem);
 	}
 
-	down_write(&nilfs->ns_super_sem);
-	if (!nilfs_test_opt(sbi, SNAPSHOT))
-		nilfs->ns_current = sbi;
-	up_write(&nilfs->ns_super_sem);
-
 	return 0;
 
  failed_segctor:
 	nilfs_detach_segment_constructor(sbi);
 
  failed_checkpoint:
-	nilfs_detach_checkpoint(sbi);
+	nilfs_put_root(fsroot);
+
+ failed_unload:
+	iput(nilfs->ns_sufile);
+	iput(nilfs->ns_cpfile);
+	iput(nilfs->ns_dat);
+
+ failed_nilfs:
+	destroy_nilfs(nilfs);
 
  failed_sbi:
-	put_nilfs(nilfs);
 	sb->s_fs_info = NULL;
-	nilfs_put_sbinfo(sbi);
+	kfree(sbi);
 	return err;
 }
 
 static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 {
 	struct nilfs_sb_info *sbi = NILFS_SB(sb);
-	struct nilfs_super_block *sbp;
 	struct the_nilfs *nilfs = sbi->s_nilfs;
 	unsigned long old_sb_flags;
 	struct nilfs_mount_options old_opts;
-	int was_snapshot, err;
-
-	lock_kernel();
+	int err;
 
-	down_write(&nilfs->ns_super_sem);
 	old_sb_flags = sb->s_flags;
 	old_opts.mount_opt = sbi->s_mount_opt;
-	old_opts.snapshot_cno = sbi->s_snapshot_cno;
-	was_snapshot = nilfs_test_opt(sbi, SNAPSHOT);
 
-	if (!parse_options(data, sb)) {
+	if (!parse_options(data, sb, 1)) {
 		err = -EINVAL;
 		goto restore_opts;
 	}
 	sb->s_flags = (sb->s_flags & ~MS_POSIXACL);
 
 	err = -EINVAL;
-	if (was_snapshot) {
-		if (!(*flags & MS_RDONLY)) {
-			printk(KERN_ERR "NILFS (device %s): cannot remount "
-			       "snapshot read/write.\n",
-			       sb->s_id);
-			goto restore_opts;
-		} else if (sbi->s_snapshot_cno != old_opts.snapshot_cno) {
-			printk(KERN_ERR "NILFS (device %s): cannot "
-			       "remount to a different snapshot.\n",
-			       sb->s_id);
-			goto restore_opts;
-		}
-	} else {
-		if (nilfs_test_opt(sbi, SNAPSHOT)) {
-			printk(KERN_ERR "NILFS (device %s): cannot change "
-			       "a regular mount to a snapshot.\n",
-			       sb->s_id);
-			goto restore_opts;
-		}
-	}
 
 	if (!nilfs_valid_fs(nilfs)) {
 		printk(KERN_WARNING "NILFS (device %s): couldn't "
@@ -880,40 +1035,47 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 		 * the RDONLY flag and then mark the partition as valid again.
 		 */
 		down_write(&nilfs->ns_sem);
-		sbp = nilfs->ns_sbp[0];
-		if (!(sbp->s_state & le16_to_cpu(NILFS_VALID_FS)) &&
-		    (nilfs->ns_mount_state & NILFS_VALID_FS))
-			sbp->s_state = cpu_to_le16(nilfs->ns_mount_state);
-		sbp->s_mtime = cpu_to_le64(get_seconds());
-		nilfs_commit_super(sbi, 1);
+		nilfs_cleanup_super(sbi);
 		up_write(&nilfs->ns_sem);
 	} else {
+		__u64 features;
+		struct nilfs_root *root;
+
 		/*
 		 * Mounting a RDONLY partition read-write, so reread and
 		 * store the current valid flag.  (It may have been changed
 		 * by fsck since we originally mounted the partition.)
 		 */
+		down_read(&nilfs->ns_sem);
+		features = le64_to_cpu(nilfs->ns_sbp[0]->s_feature_compat_ro) &
+			~NILFS_FEATURE_COMPAT_RO_SUPP;
+		up_read(&nilfs->ns_sem);
+		if (features) {
+			printk(KERN_WARNING "NILFS (device %s): couldn't "
+			       "remount RDWR because of unsupported optional "
+			       "features (%llx)\n",
+			       sb->s_id, (unsigned long long)features);
+			err = -EROFS;
+			goto restore_opts;
+		}
+
 		sb->s_flags &= ~MS_RDONLY;
 
-		err = nilfs_attach_segment_constructor(sbi);
+		root = NILFS_I(sb->s_root->d_inode)->i_root;
+		err = nilfs_attach_segment_constructor(sbi, root);
 		if (err)
 			goto restore_opts;
 
 		down_write(&nilfs->ns_sem);
-		nilfs_setup_super(sbi);
+		nilfs_setup_super(sbi, true);
 		up_write(&nilfs->ns_sem);
 	}
  out:
-	up_write(&nilfs->ns_super_sem);
-	unlock_kernel();
 	return 0;
 
  restore_opts:
 	sb->s_flags = old_sb_flags;
 	sbi->s_mount_opt = old_opts.mount_opt;
-	sbi->s_snapshot_cno = old_opts.snapshot_cno;
-	up_write(&nilfs->ns_super_sem);
-	unlock_kernel();
 	return err;
 }
 
@@ -933,7 +1095,7 @@ static int nilfs_identify(char *data, struct nilfs_super_data *sd)
 {
 	char *p, *options = data;
 	substring_t args[MAX_OPT_ARGS];
-	int option, token;
+	int token;
 	int ret = 0;
 
 	do {
@@ -941,16 +1103,18 @@ static int nilfs_identify(char *data, struct nilfs_super_data *sd)
 		if (p != NULL && *p) {
 			token = match_token(p, tokens, args);
 			if (token == Opt_snapshot) {
-				if (!(sd->flags & MS_RDONLY))
+				if (!(sd->flags & MS_RDONLY)) {
 					ret++;
-				else {
-					ret = match_int(&args[0], &option);
-					if (!ret) {
-						if (option > 0)
-							sd->cno = option;
-						else
-							ret++;
-					}
+				} else {
+					sd->cno = simple_strtoull(args[0].from,
+								  NULL, 0);
+					/*
+					 * No need to see the end pointer;
+					 * match_token() has done syntax
+					 * checking.
+					 */
+					if (sd->cno == 0)
+						ret++;
 				}
 			}
 			if (ret)
@@ -967,43 +1131,33 @@ static int nilfs_identify(char *data, struct nilfs_super_data *sd)
 
 static int nilfs_set_bdev_super(struct super_block *s, void *data)
 {
-	struct nilfs_super_data *sd = data;
-
-	s->s_bdev = sd->bdev;
+	s->s_bdev = data;
 	s->s_dev = s->s_bdev->bd_dev;
 	return 0;
 }
 
 static int nilfs_test_bdev_super(struct super_block *s, void *data)
 {
-	struct nilfs_super_data *sd = data;
-
-	return sd->sbi && s->s_fs_info == (void *)sd->sbi;
+	return (void *)s->s_bdev == data;
 }
 
-static int
-nilfs_get_sb(struct file_system_type *fs_type, int flags,
-	     const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *
+nilfs_mount(struct file_system_type *fs_type, int flags,
+	     const char *dev_name, void *data)
 {
 	struct nilfs_super_data sd;
 	struct super_block *s;
 	fmode_t mode = FMODE_READ;
-	struct the_nilfs *nilfs;
-	int err, need_to_close = 1;
+	struct dentry *root_dentry;
+	int err, s_new = false;
 
 	if (!(flags & MS_RDONLY))
 		mode |= FMODE_WRITE;
 
 	sd.bdev = open_bdev_exclusive(dev_name, mode, fs_type);
 	if (IS_ERR(sd.bdev))
-		return PTR_ERR(sd.bdev);
+		return ERR_CAST(sd.bdev);
 
-	/*
-	 * To get mount instance using sget() vfs-routine, NILFS needs
-	 * much more information than normal filesystems to identify mount
-	 * instance.  For snapshot mounts, not only a mount type (ro-mount
-	 * or rw-mount) but also a checkpoint number is required.
-	 */
 	sd.cno = 0;
 	sd.flags = flags;
 	if (nilfs_identify((char *)data, &sd)) {
@@ -1011,101 +1165,91 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 		goto failed;
 	}
 
-	nilfs = find_or_create_nilfs(sd.bdev);
-	if (!nilfs) {
-		err = -ENOMEM;
-		goto failed;
-	}
-
-	mutex_lock(&nilfs->ns_mount_mutex);
-
-	if (!sd.cno) {
-		/*
-		 * Check if an exclusive mount exists or not.
-		 * Snapshot mounts coexist with a current mount
-		 * (i.e. rw-mount or ro-mount), whereas rw-mount and
-		 * ro-mount are mutually exclusive.
-		 */
-		down_read(&nilfs->ns_super_sem);
-		if (nilfs->ns_current &&
-		    ((nilfs->ns_current->s_super->s_flags ^ flags)
-		     & MS_RDONLY)) {
-			up_read(&nilfs->ns_super_sem);
-			err = -EBUSY;
-			goto failed_unlock;
-		}
-		up_read(&nilfs->ns_super_sem);
-	}
-
 	/*
-	 * Find existing nilfs_sb_info struct
+	 * once the super is inserted into the list by sget, s_umount
+	 * will protect the lockfs code from trying to start a snapshot
+	 * while we are mounting
 	 */
-	sd.sbi = nilfs_find_sbinfo(nilfs, !(flags & MS_RDONLY), sd.cno);
-
-	/*
-	 * Get super block instance holding the nilfs_sb_info struct.
-	 * A new instance is allocated if no existing mount is present or
-	 * existing instance has been unmounted.
-	 */
-	s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, &sd);
-	if (sd.sbi)
-		nilfs_put_sbinfo(sd.sbi);
-
+	mutex_lock(&sd.bdev->bd_fsfreeze_mutex);
+	if (sd.bdev->bd_fsfreeze_count > 0) {
+		mutex_unlock(&sd.bdev->bd_fsfreeze_mutex);
+		err = -EBUSY;
+		goto failed;
+	}
+	s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, sd.bdev);
+	mutex_unlock(&sd.bdev->bd_fsfreeze_mutex);
 	if (IS_ERR(s)) {
 		err = PTR_ERR(s);
-		goto failed_unlock;
+		goto failed;
 	}
 
 	if (!s->s_root) {
 		char b[BDEVNAME_SIZE];
 
+		s_new = true;
+
 		/* New superblock instance created */
 		s->s_flags = flags;
 		s->s_mode = mode;
 		strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id));
 		sb_set_blocksize(s, block_size(sd.bdev));
 
-		err = nilfs_fill_super(s, data, flags & MS_SILENT ? 1 : 0,
-				       nilfs);
+		err = nilfs_fill_super(s, data, flags & MS_SILENT ? 1 : 0);
 		if (err)
-			goto cancel_new;
+			goto failed_super;
 
 		s->s_flags |= MS_ACTIVE;
-		need_to_close = 0;
+	} else if (!sd.cno) {
+		int busy = false;
+
+		if (nilfs_tree_was_touched(s->s_root)) {
+			busy = nilfs_try_to_shrink_tree(s->s_root);
+			if (busy && (flags ^ s->s_flags) & MS_RDONLY) {
+				printk(KERN_ERR "NILFS: the device already "
+				       "has a %s mount.\n",
+				       (s->s_flags & MS_RDONLY) ?
+				       "read-only" : "read/write");
+				err = -EBUSY;
+				goto failed_super;
+			}
+		}
+		if (!busy) {
+			/*
+			 * Try remount to setup mount states if the current
+			 * tree is not mounted and only snapshots use this sb.
+			 */
+			err = nilfs_remount(s, &flags, data);
+			if (err)
+				goto failed_super;
+		}
 	}
 
-	mutex_unlock(&nilfs->ns_mount_mutex);
-	put_nilfs(nilfs);
-	if (need_to_close)
-		close_bdev_exclusive(sd.bdev, mode);
-	simple_set_mnt(mnt, s);
-	return 0;
+	if (sd.cno) {
+		err = nilfs_attach_snapshot(s, sd.cno, &root_dentry);
+		if (err)
+			goto failed_super;
+	} else {
+		root_dentry = dget(s->s_root);
+	}
 
- failed_unlock:
-	mutex_unlock(&nilfs->ns_mount_mutex);
-	put_nilfs(nilfs);
- failed:
-	close_bdev_exclusive(sd.bdev, mode);
+	if (!s_new)
+		close_bdev_exclusive(sd.bdev, mode);
 
-	return err;
+	return root_dentry;
 
- cancel_new:
-	/* Abandoning the newly allocated superblock */
-	mutex_unlock(&nilfs->ns_mount_mutex);
-	put_nilfs(nilfs);
+ failed_super:
 	deactivate_locked_super(s);
-	/*
-	 * deactivate_locked_super() invokes close_bdev_exclusive().
-	 * We must finish all post-cleaning before this call;
-	 * put_nilfs() needs the block device.
-	 */
-	return err;
+
+ failed:
+	if (!s_new)
+		close_bdev_exclusive(sd.bdev, mode);
+	return ERR_PTR(err);
 }
 
 struct file_system_type nilfs_fs_type = {
 	.owner    = THIS_MODULE,
 	.name     = "nilfs2",
-	.get_sb   = nilfs_get_sb,
+	.mount    = nilfs_mount,
 	.kill_sb  = kill_block_super,
 	.fs_flags = FS_REQUIRES_DEV,
 };
@@ -1119,7 +1263,7 @@ static void nilfs_inode_init_once(void *obj)
 	init_rwsem(&ii->xattr_sem);
 #endif
 	nilfs_btnode_cache_init_once(&ii->i_btnode_cache);
-	ii->i_bmap = (struct nilfs_bmap *)&ii->i_bmap_union;
+	ii->i_bmap = &ii->i_bmap_data;
 	inode_init_once(&ii->vfs_inode);
 }
 
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 8c1097327ab..0254be2d73c 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -35,8 +35,7 @@
 #include "segbuf.h"
 
 
-static LIST_HEAD(nilfs_objects);
-static DEFINE_SPINLOCK(nilfs_lock);
+static int nilfs_valid_sb(struct nilfs_super_block *sbp);
 
 void nilfs_set_last_segment(struct the_nilfs *nilfs,
 			    sector_t start_blocknr, u64 seq, __u64 cno)
@@ -45,20 +44,27 @@ void nilfs_set_last_segment(struct the_nilfs *nilfs,
 	nilfs->ns_last_pseg = start_blocknr;
 	nilfs->ns_last_seq = seq;
 	nilfs->ns_last_cno = cno;
+
+	if (!nilfs_sb_dirty(nilfs)) {
+		if (nilfs->ns_prev_seq == nilfs->ns_last_seq)
+			goto stay_cursor;
+
+		set_nilfs_sb_dirty(nilfs);
+	}
+	nilfs->ns_prev_seq = nilfs->ns_last_seq;
+
+ stay_cursor:
 	spin_unlock(&nilfs->ns_last_segment_lock);
 }
 
 /**
- * alloc_nilfs - allocate the_nilfs structure
+ * alloc_nilfs - allocate a nilfs object
  * @bdev: block device to which the_nilfs is related
  *
- * alloc_nilfs() allocates memory for the_nilfs and
- * initializes its reference count and locks.
- *
  * Return Value: On success, pointer to the_nilfs is returned.
  * On error, NULL is returned.
  */
-static struct the_nilfs *alloc_nilfs(struct block_device *bdev)
+struct the_nilfs *alloc_nilfs(struct block_device *bdev)
 {
 	struct the_nilfs *nilfs;
 
@@ -67,92 +73,25 @@ static struct the_nilfs *alloc_nilfs(struct block_device *bdev)
 		return NULL;
 
 	nilfs->ns_bdev = bdev;
-	atomic_set(&nilfs->ns_count, 1);
 	atomic_set(&nilfs->ns_ndirtyblks, 0);
 	init_rwsem(&nilfs->ns_sem);
-	init_rwsem(&nilfs->ns_super_sem);
-	mutex_init(&nilfs->ns_mount_mutex);
-	init_rwsem(&nilfs->ns_writer_sem);
-	INIT_LIST_HEAD(&nilfs->ns_list);
-	INIT_LIST_HEAD(&nilfs->ns_supers);
+	INIT_LIST_HEAD(&nilfs->ns_gc_inodes);
 	spin_lock_init(&nilfs->ns_last_segment_lock);
-	nilfs->ns_gc_inodes_h = NULL;
+	nilfs->ns_cptree = RB_ROOT;
+	spin_lock_init(&nilfs->ns_cptree_lock);
 	init_rwsem(&nilfs->ns_segctor_sem);
 
 	return nilfs;
 }
 
 /**
- * find_or_create_nilfs - find or create nilfs object
- * @bdev: block device to which the_nilfs is related
- *
- * find_nilfs() looks up an existent nilfs object created on the
- * device and gets the reference count of the object.  If no nilfs object
- * is found on the device, a new nilfs object is allocated.
- *
- * Return Value: On success, pointer to the nilfs object is returned.
- * On error, NULL is returned.
+ * destroy_nilfs - destroy nilfs object
+ * @nilfs: nilfs object to be released
  */
-struct the_nilfs *find_or_create_nilfs(struct block_device *bdev)
+void destroy_nilfs(struct the_nilfs *nilfs)
 {
-	struct the_nilfs *nilfs, *new = NULL;
-
- retry:
-	spin_lock(&nilfs_lock);
-	list_for_each_entry(nilfs, &nilfs_objects, ns_list) {
-		if (nilfs->ns_bdev == bdev) {
-			get_nilfs(nilfs);
-			spin_unlock(&nilfs_lock);
-			if (new)
-				put_nilfs(new);
-			return nilfs; /* existing object */
-		}
-	}
-	if (new) {
-		list_add_tail(&new->ns_list, &nilfs_objects);
-		spin_unlock(&nilfs_lock);
-		return new; /* new object */
-	}
-	spin_unlock(&nilfs_lock);
-
-	new = alloc_nilfs(bdev);
-	if (new)
-		goto retry;
-	return NULL; /* insufficient memory */
-}
-
-/**
- * put_nilfs - release a reference to the_nilfs
- * @nilfs: the_nilfs structure to be released
- *
- * put_nilfs() decrements a reference counter of the_nilfs.
- * If the reference count reaches zero, the_nilfs is freed.
- */
-void put_nilfs(struct the_nilfs *nilfs)
-{
-	spin_lock(&nilfs_lock);
-	if (!atomic_dec_and_test(&nilfs->ns_count)) {
-		spin_unlock(&nilfs_lock);
-		return;
-	}
-	list_del_init(&nilfs->ns_list);
-	spin_unlock(&nilfs_lock);
-
-	/*
-	 * Increment of ns_count never occurs below because the caller
-	 * of get_nilfs() holds at least one reference to the_nilfs.
-	 * Thus its exclusion control is not required here.
-	 */
-
 	might_sleep();
-	if (nilfs_loaded(nilfs)) {
-		nilfs_mdt_destroy(nilfs->ns_sufile);
-		nilfs_mdt_destroy(nilfs->ns_cpfile);
-		nilfs_mdt_destroy(nilfs->ns_dat);
-		nilfs_mdt_destroy(nilfs->ns_gc_dat);
-	}
 	if (nilfs_init(nilfs)) {
-		nilfs_destroy_gccache(nilfs);
 		brelse(nilfs->ns_sbh[0]);
 		brelse(nilfs->ns_sbh[1]);
 	}
@@ -160,16 +99,17 @@ void put_nilfs(struct the_nilfs *nilfs)
 }
 
 static int nilfs_load_super_root(struct the_nilfs *nilfs,
-				 struct nilfs_sb_info *sbi, sector_t sr_block)
+				 struct super_block *sb, sector_t sr_block)
 {
 	struct buffer_head *bh_sr;
 	struct nilfs_super_root *raw_sr;
 	struct nilfs_super_block **sbp = nilfs->ns_sbp;
+	struct nilfs_inode *rawi;
 	unsigned dat_entry_size, segment_usage_size, checkpoint_size;
 	unsigned inode_size;
 	int err;
 
-	err = nilfs_read_super_root_block(sbi->s_super, sr_block, &bh_sr, 1);
+	err = nilfs_read_super_root_block(nilfs, sr_block, &bh_sr, 1);
 	if (unlikely(err))
 		return err;
 
@@ -181,40 +121,22 @@ static int nilfs_load_super_root(struct the_nilfs *nilfs,
 
 	inode_size = nilfs->ns_inode_size;
 
-	err = -ENOMEM;
-	nilfs->ns_dat = nilfs_dat_new(nilfs, dat_entry_size);
-	if (unlikely(!nilfs->ns_dat))
+	rawi = (void *)bh_sr->b_data + NILFS_SR_DAT_OFFSET(inode_size);
+	err = nilfs_dat_read(sb, dat_entry_size, rawi, &nilfs->ns_dat);
+	if (err)
 		goto failed;
 
-	nilfs->ns_gc_dat = nilfs_dat_new(nilfs, dat_entry_size);
-	if (unlikely(!nilfs->ns_gc_dat))
+	rawi = (void *)bh_sr->b_data + NILFS_SR_CPFILE_OFFSET(inode_size);
+	err = nilfs_cpfile_read(sb, checkpoint_size, rawi, &nilfs->ns_cpfile);
+	if (err)
 		goto failed_dat;
 
-	nilfs->ns_cpfile = nilfs_cpfile_new(nilfs, checkpoint_size);
-	if (unlikely(!nilfs->ns_cpfile))
-		goto failed_gc_dat;
-
-	nilfs->ns_sufile = nilfs_sufile_new(nilfs, segment_usage_size);
-	if (unlikely(!nilfs->ns_sufile))
+	rawi = (void *)bh_sr->b_data + NILFS_SR_SUFILE_OFFSET(inode_size);
+	err = nilfs_sufile_read(sb, segment_usage_size, rawi,
+				&nilfs->ns_sufile);
+	if (err)
 		goto failed_cpfile;
 
-	nilfs_mdt_set_shadow(nilfs->ns_dat, nilfs->ns_gc_dat);
-
-	err = nilfs_dat_read(nilfs->ns_dat, (void *)bh_sr->b_data +
-			     NILFS_SR_DAT_OFFSET(inode_size));
-	if (unlikely(err))
-		goto failed_sufile;
-
-	err = nilfs_cpfile_read(nilfs->ns_cpfile, (void *)bh_sr->b_data +
-				NILFS_SR_CPFILE_OFFSET(inode_size));
-	if (unlikely(err))
-		goto failed_sufile;
-
-	err = nilfs_sufile_read(nilfs->ns_sufile, (void *)bh_sr->b_data +
-				NILFS_SR_SUFILE_OFFSET(inode_size));
-	if (unlikely(err))
-		goto failed_sufile;
-
 	raw_sr = (struct nilfs_super_root *)bh_sr->b_data;
 	nilfs->ns_nongc_ctime = le64_to_cpu(raw_sr->sr_nongc_ctime);
 
@@ -222,17 +144,11 @@ static int nilfs_load_super_root(struct the_nilfs *nilfs,
 	brelse(bh_sr);
 	return err;
 
- failed_sufile:
-	nilfs_mdt_destroy(nilfs->ns_sufile);
-
  failed_cpfile:
-	nilfs_mdt_destroy(nilfs->ns_cpfile);
-
- failed_gc_dat:
-	nilfs_mdt_destroy(nilfs->ns_gc_dat);
+	iput(nilfs->ns_cpfile);
 
  failed_dat:
-	nilfs_mdt_destroy(nilfs->ns_dat);
+	iput(nilfs->ns_dat);
 	goto failed;
 }
 
@@ -248,6 +164,37 @@ static void nilfs_clear_recovery_info(struct nilfs_recovery_info *ri)
 }
 
 /**
+ * nilfs_store_log_cursor - load log cursor from a super block
+ * @nilfs: nilfs object
+ * @sbp: buffer storing super block to be read
+ *
+ * nilfs_store_log_cursor() reads the last position of the log
+ * containing a super root from a given super block, and initializes
+ * relevant information on the nilfs object preparatory for log
+ * scanning and recovery.
+ */
+static int nilfs_store_log_cursor(struct the_nilfs *nilfs,
+				  struct nilfs_super_block *sbp)
+{
+	int ret = 0;
+
+	nilfs->ns_last_pseg = le64_to_cpu(sbp->s_last_pseg);
+	nilfs->ns_last_cno = le64_to_cpu(sbp->s_last_cno);
+	nilfs->ns_last_seq = le64_to_cpu(sbp->s_last_seq);
+
+	nilfs->ns_prev_seq = nilfs->ns_last_seq;
+	nilfs->ns_seg_seq = nilfs->ns_last_seq;
+	nilfs->ns_segnum =
+		nilfs_get_segnum_of_block(nilfs, nilfs->ns_last_pseg);
+	nilfs->ns_cno = nilfs->ns_last_cno + 1;
+	if (nilfs->ns_segnum >= nilfs->ns_nsegments) {
+		printk(KERN_ERR "NILFS invalid last segment number.\n");
+		ret = -EINVAL;
+	}
+	return ret;
+}
+
+/**
  * load_nilfs - load and recover the nilfs
  * @nilfs: the_nilfs structure to be released
  * @sbi: nilfs_sb_info used to recover past segment
@@ -264,15 +211,6 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
 	int valid_fs = nilfs_valid_fs(nilfs);
 	int err;
 
-	if (nilfs_loaded(nilfs)) {
-		if (valid_fs ||
-		    ((s_flags & MS_RDONLY) && nilfs_test_opt(sbi, NORECOVERY)))
-			return 0;
-		printk(KERN_ERR "NILFS: the filesystem is in an incomplete "
-		       "recovery state.\n");
-		return -EINVAL;
-	}
-
 	if (!valid_fs) {
 		printk(KERN_WARNING "NILFS warning: mounting unchecked fs\n");
 		if (s_flags & MS_RDONLY) {
@@ -285,13 +223,55 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
 
 	nilfs_init_recovery_info(&ri);
 
-	err = nilfs_search_super_root(nilfs, sbi, &ri);
+	err = nilfs_search_super_root(nilfs, &ri);
 	if (unlikely(err)) {
-		printk(KERN_ERR "NILFS: error searching super root.\n");
-		goto failed;
+		struct nilfs_super_block **sbp = nilfs->ns_sbp;
+		int blocksize;
+
+		if (err != -EINVAL)
+			goto scan_error;
+
+		if (!nilfs_valid_sb(sbp[1])) {
+			printk(KERN_WARNING
+			       "NILFS warning: unable to fall back to spare"
+			       "super block\n");
+			goto scan_error;
+		}
+		printk(KERN_INFO
+		       "NILFS: try rollback from an earlier position\n");
+
+		/*
+		 * restore super block with its spare and reconfigure
+		 * relevant states of the nilfs object.
+		 */
+		memcpy(sbp[0], sbp[1], nilfs->ns_sbsize);
+		nilfs->ns_crc_seed = le32_to_cpu(sbp[0]->s_crc_seed);
+		nilfs->ns_sbwtime = le64_to_cpu(sbp[0]->s_wtime);
+
+		/* verify consistency between two super blocks */
+		blocksize = BLOCK_SIZE << le32_to_cpu(sbp[0]->s_log_block_size);
+		if (blocksize != nilfs->ns_blocksize) {
+			printk(KERN_WARNING
+			       "NILFS warning: blocksize differs between "
+			       "two super blocks (%d != %d)\n",
+			       blocksize, nilfs->ns_blocksize);
+			goto scan_error;
+		}
+
+		err = nilfs_store_log_cursor(nilfs, sbp[0]);
+		if (err)
+			goto scan_error;
+
+		/* drop clean flag to allow roll-forward and recovery */
+		nilfs->ns_mount_state &= ~NILFS_VALID_FS;
+		valid_fs = 0;
+
+		err = nilfs_search_super_root(nilfs, &ri);
+		if (err)
+			goto scan_error;
 	}
 
-	err = nilfs_load_super_root(nilfs, sbi, ri.ri_super_root);
+	err = nilfs_load_super_root(nilfs, sbi->s_super, ri.ri_super_root);
 	if (unlikely(err)) {
 		printk(KERN_ERR "NILFS: error loading super root.\n");
 		goto failed;
@@ -301,11 +281,23 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
 		goto skip_recovery;
 
 	if (s_flags & MS_RDONLY) {
+		__u64 features;
+
 		if (nilfs_test_opt(sbi, NORECOVERY)) {
 			printk(KERN_INFO "NILFS: norecovery option specified. "
 			       "skipping roll-forward recovery\n");
 			goto skip_recovery;
 		}
+		features = le64_to_cpu(nilfs->ns_sbp[0]->s_feature_compat_ro) &
+			~NILFS_FEATURE_COMPAT_RO_SUPP;
+		if (features) {
+			printk(KERN_ERR "NILFS: couldn't proceed with "
+			       "recovery because of unsupported optional "
+			       "features (%llx)\n",
+			       (unsigned long long)features);
+			err = -EROFS;
+			goto failed_unload;
+		}
 		if (really_read_only) {
 			printk(KERN_ERR "NILFS: write access "
 			       "unavailable, cannot proceed.\n");
@@ -320,14 +312,13 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
 		goto failed_unload;
 	}
 
-	err = nilfs_recover_logical_segments(nilfs, sbi, &ri);
+	err = nilfs_salvage_orphan_logs(nilfs, sbi, &ri);
 	if (err)
 		goto failed_unload;
 
 	down_write(&nilfs->ns_sem);
-	nilfs->ns_mount_state |= NILFS_VALID_FS;
-	nilfs->ns_sbp[0]->s_state = cpu_to_le16(nilfs->ns_mount_state);
-	err = nilfs_commit_super(sbi, 1);
+	nilfs->ns_mount_state |= NILFS_VALID_FS; /* set "clean" flag */
+	err = nilfs_cleanup_super(sbi);
 	up_write(&nilfs->ns_sem);
 
 	if (err) {
@@ -343,10 +334,14 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
 	sbi->s_super->s_flags = s_flags;
 	return 0;
 
+ scan_error:
+	printk(KERN_ERR "NILFS: error searching super root.\n");
+	goto failed;
+
  failed_unload:
-	nilfs_mdt_destroy(nilfs->ns_cpfile);
-	nilfs_mdt_destroy(nilfs->ns_sufile);
-	nilfs_mdt_destroy(nilfs->ns_dat);
+	iput(nilfs->ns_cpfile);
+	iput(nilfs->ns_sufile);
+	iput(nilfs->ns_dat);
 
  failed:
 	nilfs_clear_recovery_info(&ri);
@@ -368,8 +363,8 @@ static unsigned long long nilfs_max_size(unsigned int blkbits)
 static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
 				   struct nilfs_super_block *sbp)
 {
-	if (le32_to_cpu(sbp->s_rev_level) != NILFS_CURRENT_REV) {
-		printk(KERN_ERR "NILFS: revision mismatch "
+	if (le32_to_cpu(sbp->s_rev_level) < NILFS_MIN_SUPP_REV) {
+		printk(KERN_ERR "NILFS: unsupported revision "
 		       "(superblock rev.=%d.%d, current rev.=%d.%d). "
 		       "Please check the version of mkfs.nilfs.\n",
 		       le32_to_cpu(sbp->s_rev_level),
@@ -509,14 +504,14 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
 		return -EINVAL;
 	}
 
-	if (swp) {
+	if (!valid[!swp])
 		printk(KERN_WARNING "NILFS warning: broken superblock. "
 		       "using spare superblock.\n");
+	if (swp)
 		nilfs_swap_super_block(nilfs);
-	}
 
-	nilfs->ns_sbwtime[0] = le64_to_cpu(sbp[0]->s_wtime);
-	nilfs->ns_sbwtime[1] = valid[!swp] ? le64_to_cpu(sbp[1]->s_wtime) : 0;
+	nilfs->ns_sbwcount = 0;
+	nilfs->ns_sbwtime = le64_to_cpu(sbp[0]->s_wtime);
 	nilfs->ns_prot_seq = le64_to_cpu(sbp[valid[1] & !swp]->s_last_seq);
 	*sbpp = sbp[0];
 	return 0;
@@ -531,12 +526,7 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
  *
  * init_nilfs() performs common initialization per block device (e.g.
  * reading the super block, getting disk layout information, initializing
- * shared fields in the_nilfs). It takes on some portion of the jobs
- * typically done by a fill_super() routine. This division arises from
- * the nature that multiple NILFS instances may be simultaneously
- * mounted on a device.
- * For multiple mounts on the same device, only the first mount
- * invokes these tasks.
+ * shared fields in the_nilfs).
  *
  * Return Value: On success, 0 is returned. On error, a negative error
  * code is returned.
@@ -545,30 +535,12 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
 {
 	struct super_block *sb = sbi->s_super;
 	struct nilfs_super_block *sbp;
-	struct backing_dev_info *bdi;
 	int blocksize;
 	int err;
 
 	down_write(&nilfs->ns_sem);
-	if (nilfs_init(nilfs)) {
-		/* Load values from existing the_nilfs */
-		sbp = nilfs->ns_sbp[0];
-		err = nilfs_store_magic_and_option(sb, sbp, data);
-		if (err)
-			goto out;
-
-		blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size);
-		if (sb->s_blocksize != blocksize &&
-		    !sb_set_blocksize(sb, blocksize)) {
-			printk(KERN_ERR "NILFS: blocksize %d unfit to device\n",
-			       blocksize);
-			err = -EINVAL;
-		}
-		sb->s_maxbytes = nilfs_max_size(sb->s_blocksize_bits);
-		goto out;
-	}
 
-	blocksize = sb_min_blocksize(sb, BLOCK_SIZE);
+	blocksize = sb_min_blocksize(sb, NILFS_MIN_BLOCK_SIZE);
 	if (!blocksize) {
 		printk(KERN_ERR "NILFS: unable to set blocksize\n");
 		err = -EINVAL;
@@ -582,7 +554,18 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
 	if (err)
 		goto failed_sbh;
 
+	err = nilfs_check_feature_compatibility(sb, sbp);
+	if (err)
+		goto failed_sbh;
+
 	blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size);
+	if (blocksize < NILFS_MIN_BLOCK_SIZE ||
+	    blocksize > NILFS_MAX_BLOCK_SIZE) {
+		printk(KERN_ERR "NILFS: couldn't mount because of unsupported "
+		       "filesystem blocksize %d\n", blocksize);
+		err = -EINVAL;
+		goto failed_sbh;
+	}
 	if (sb->s_blocksize != blocksize) {
 		int hw_blocksize = bdev_logical_block_size(sb->s_bdev);
 
@@ -604,6 +587,7 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
 			   when reloading fails. */
 	}
 	nilfs->ns_blocksize_bits = sb->s_blocksize_bits;
+	nilfs->ns_blocksize = blocksize;
 
 	err = nilfs_store_disk_layout(nilfs, sbp);
 	if (err)
@@ -613,29 +597,7 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
 
 	nilfs->ns_mount_state = le16_to_cpu(sbp->s_state);
 
-	bdi = nilfs->ns_bdev->bd_inode->i_mapping->backing_dev_info;
-	nilfs->ns_bdi = bdi ? : &default_backing_dev_info;
-
-	/* Finding last segment */
-	nilfs->ns_last_pseg = le64_to_cpu(sbp->s_last_pseg);
-	nilfs->ns_last_cno = le64_to_cpu(sbp->s_last_cno);
-	nilfs->ns_last_seq = le64_to_cpu(sbp->s_last_seq);
-
-	nilfs->ns_seg_seq = nilfs->ns_last_seq;
-	nilfs->ns_segnum =
-		nilfs_get_segnum_of_block(nilfs, nilfs->ns_last_pseg);
-	nilfs->ns_cno = nilfs->ns_last_cno + 1;
-	if (nilfs->ns_segnum >= nilfs->ns_nsegments) {
-		printk(KERN_ERR "NILFS invalid last segment number.\n");
-		err = -EINVAL;
-		goto failed_sbh;
-	}
-	/* Dummy values  */
-	nilfs->ns_free_segments_count =
-		nilfs->ns_nsegments - (nilfs->ns_segnum + 1);
-
-	/* Initialize gcinode cache */
-	err = nilfs_init_gccache(nilfs);
+	err = nilfs_store_log_cursor(nilfs, sbp);
 	if (err)
 		goto failed_sbh;
 
@@ -673,8 +635,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
 			ret = blkdev_issue_discard(nilfs->ns_bdev,
 						   start * sects_per_block,
 						   nblocks * sects_per_block,
-						   GFP_NOFS,
-						   BLKDEV_IFL_BARRIER);
+						   GFP_NOFS, 0);
 			if (ret < 0)
 				return ret;
 			nblocks = 0;
@@ -684,7 +645,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
 		ret = blkdev_issue_discard(nilfs->ns_bdev,
 					   start * sects_per_block,
 					   nblocks * sects_per_block,
-					   GFP_NOFS, BLKDEV_IFL_BARRIER);
+					   GFP_NOFS, 0);
 	return ret;
 }
 
@@ -711,79 +672,92 @@ int nilfs_near_disk_full(struct the_nilfs *nilfs)
 	return ncleansegs <= nilfs->ns_nrsvsegs + nincsegs;
 }
 
-/**
- * nilfs_find_sbinfo - find existing nilfs_sb_info structure
- * @nilfs: nilfs object
- * @rw_mount: mount type (non-zero value for read/write mount)
- * @cno: checkpoint number (zero for read-only mount)
- *
- * nilfs_find_sbinfo() returns the nilfs_sb_info structure which
- * @rw_mount and @cno (in case of snapshots) matched.  If no instance
- * was found, NULL is returned.  Although the super block instance can
- * be unmounted after this function returns, the nilfs_sb_info struct
- * is kept on memory until nilfs_put_sbinfo() is called.
- */
-struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *nilfs,
-					int rw_mount, __u64 cno)
+struct nilfs_root *nilfs_lookup_root(struct the_nilfs *nilfs, __u64 cno)
 {
-	struct nilfs_sb_info *sbi;
-
-	down_read(&nilfs->ns_super_sem);
-	/*
-	 * The SNAPSHOT flag and sb->s_flags are supposed to be
-	 * protected with nilfs->ns_super_sem.
-	 */
-	sbi = nilfs->ns_current;
-	if (rw_mount) {
-		if (sbi && !(sbi->s_super->s_flags & MS_RDONLY))
-			goto found; /* read/write mount */
-		else
-			goto out;
-	} else if (cno == 0) {
-		if (sbi && (sbi->s_super->s_flags & MS_RDONLY))
-			goto found; /* read-only mount */
-		else
-			goto out;
+	struct rb_node *n;
+	struct nilfs_root *root;
+
+	spin_lock(&nilfs->ns_cptree_lock);
+	n = nilfs->ns_cptree.rb_node;
+	while (n) {
+		root = rb_entry(n, struct nilfs_root, rb_node);
+
+		if (cno < root->cno) {
+			n = n->rb_left;
+		} else if (cno > root->cno) {
+			n = n->rb_right;
+		} else {
+			atomic_inc(&root->count);
+			spin_unlock(&nilfs->ns_cptree_lock);
+			return root;
+		}
 	}
+	spin_unlock(&nilfs->ns_cptree_lock);
 
-	list_for_each_entry(sbi, &nilfs->ns_supers, s_list) {
-		if (nilfs_test_opt(sbi, SNAPSHOT) &&
-		    sbi->s_snapshot_cno == cno)
-			goto found; /* snapshot mount */
-	}
- out:
-	up_read(&nilfs->ns_super_sem);
 	return NULL;
-
- found:
-	atomic_inc(&sbi->s_count);
-	up_read(&nilfs->ns_super_sem);
-	return sbi;
 }
 
-int nilfs_checkpoint_is_mounted(struct the_nilfs *nilfs, __u64 cno,
-				int snapshot_mount)
+struct nilfs_root *
+nilfs_find_or_create_root(struct the_nilfs *nilfs, __u64 cno)
 {
-	struct nilfs_sb_info *sbi;
-	int ret = 0;
+	struct rb_node **p, *parent;
+	struct nilfs_root *root, *new;
+
+	root = nilfs_lookup_root(nilfs, cno);
+	if (root)
+		return root;
+
+	new = kmalloc(sizeof(*root), GFP_KERNEL);
+	if (!new)
+		return NULL;
+
+	spin_lock(&nilfs->ns_cptree_lock);
 
-	down_read(&nilfs->ns_super_sem);
-	if (cno == 0 || cno > nilfs->ns_cno)
-		goto out_unlock;
+	p = &nilfs->ns_cptree.rb_node;
+	parent = NULL;
 
-	list_for_each_entry(sbi, &nilfs->ns_supers, s_list) {
-		if (sbi->s_snapshot_cno == cno &&
-		    (!snapshot_mount || nilfs_test_opt(sbi, SNAPSHOT))) {
-					/* exclude read-only mounts */
-			ret++;
-			break;
+	while (*p) {
+		parent = *p;
+		root = rb_entry(parent, struct nilfs_root, rb_node);
+
+		if (cno < root->cno) {
+			p = &(*p)->rb_left;
+		} else if (cno > root->cno) {
+			p = &(*p)->rb_right;
+		} else {
+			atomic_inc(&root->count);
+			spin_unlock(&nilfs->ns_cptree_lock);
+			kfree(new);
+			return root;
 		}
 	}
-	/* for protecting recent checkpoints */
-	if (cno >= nilfs_last_cno(nilfs))
-		ret++;
 
- out_unlock:
-	up_read(&nilfs->ns_super_sem);
-	return ret;
+	new->cno = cno;
+	new->ifile = NULL;
+	new->nilfs = nilfs;
+	atomic_set(&new->count, 1);
+	atomic_set(&new->inodes_count, 0);
+	atomic_set(&new->blocks_count, 0);
+
+	rb_link_node(&new->rb_node, parent, p);
+	rb_insert_color(&new->rb_node, &nilfs->ns_cptree);
+
+	spin_unlock(&nilfs->ns_cptree_lock);
+
+	return new;
+}
+
+void nilfs_put_root(struct nilfs_root *root)
+{
+	if (atomic_dec_and_test(&root->count)) {
+		struct the_nilfs *nilfs = root->nilfs;
+
+		spin_lock(&nilfs->ns_cptree_lock);
+		rb_erase(&root->rb_node, &nilfs->ns_cptree);
+		spin_unlock(&nilfs->ns_cptree_lock);
+		if (root->ifile)
+			iput(root->ifile);
+
+		kfree(root);
+	}
 }
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 1ab97453369..69226e14b74 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -26,6 +26,7 @@
 
 #include <linux/types.h>
 #include <linux/buffer_head.h>
+#include <linux/rbtree.h>
 #include <linux/fs.h>
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
@@ -45,21 +46,13 @@ enum {
 /**
  * struct the_nilfs - struct to supervise multiple nilfs mount points
  * @ns_flags: flags
- * @ns_count: reference count
- * @ns_list: list head for nilfs_list
  * @ns_bdev: block device
- * @ns_bdi: backing dev info
- * @ns_writer: back pointer to writable nilfs_sb_info
  * @ns_sem: semaphore for shared states
- * @ns_super_sem: semaphore for global operations across super block instances
- * @ns_mount_mutex: mutex protecting mount process of nilfs
- * @ns_writer_sem: semaphore protecting ns_writer attach/detach
- * @ns_current: back pointer to current mount
  * @ns_sbh: buffer heads of on-disk super blocks
  * @ns_sbp: pointers to super block data
- * @ns_sbwtime: previous write time of super blocks
+ * @ns_sbwtime: previous write time of super block
+ * @ns_sbwcount: write count of super block
  * @ns_sbsize: size of valid data in super block
- * @ns_supers: list of nilfs super block structs
  * @ns_seg_seq: segment sequence counter
  * @ns_segnum: index number of the latest full segment.
  * @ns_nextnum: index number of the full segment index to be used next
@@ -73,15 +66,16 @@ enum {
  * @ns_last_seq: sequence value of the latest segment
  * @ns_last_cno: checkpoint number of the latest segment
  * @ns_prot_seq: least sequence number of segments which must not be reclaimed
- * @ns_free_segments_count: counter of free segments
+ * @ns_prev_seq: base sequence number used to decide if advance log cursor
  * @ns_segctor_sem: segment constructor semaphore
  * @ns_dat: DAT file inode
  * @ns_cpfile: checkpoint file inode
  * @ns_sufile: segusage file inode
- * @ns_gc_dat: shadow inode of the DAT file inode for GC
+ * @ns_cptree: rb-tree of all mounted checkpoints (nilfs_root)
+ * @ns_cptree_lock: lock protecting @ns_cptree
  * @ns_gc_inodes: dummy inodes to keep live blocks
- * @ns_gc_inodes_h: hash list to keep dummy inode holding live blocks
  * @ns_blocksize_bits: bit length of block size
+ * @ns_blocksize: block size
  * @ns_nsegments: number of segments in filesystem
  * @ns_blocks_per_segment: number of blocks per segment
  * @ns_r_segments_percentage: reserved segments percentage
@@ -93,22 +87,9 @@ enum {
  */
 struct the_nilfs {
 	unsigned long		ns_flags;
-	atomic_t		ns_count;
-	struct list_head	ns_list;
 
 	struct block_device    *ns_bdev;
-	struct backing_dev_info *ns_bdi;
-	struct nilfs_sb_info   *ns_writer;
 	struct rw_semaphore	ns_sem;
-	struct rw_semaphore	ns_super_sem;
-	struct mutex		ns_mount_mutex;
-	struct rw_semaphore	ns_writer_sem;
-
-	/*
-	 * components protected by ns_super_sem
-	 */
-	struct nilfs_sb_info   *ns_current;
-	struct list_head	ns_supers;
 
 	/*
 	 * used for
@@ -119,7 +100,8 @@ struct the_nilfs {
 	 */
 	struct buffer_head     *ns_sbh[2];
 	struct nilfs_super_block *ns_sbp[2];
-	time_t			ns_sbwtime[2];
+	time_t			ns_sbwtime;
+	unsigned		ns_sbwcount;
 	unsigned		ns_sbsize;
 	unsigned		ns_mount_state;
 
@@ -149,7 +131,7 @@ struct the_nilfs {
 	u64			ns_last_seq;
 	__u64			ns_last_cno;
 	u64			ns_prot_seq;
-	unsigned long		ns_free_segments_count;
+	u64			ns_prev_seq;
 
 	struct rw_semaphore	ns_segctor_sem;
 
@@ -160,14 +142,17 @@ struct the_nilfs {
 	struct inode	       *ns_dat;
 	struct inode	       *ns_cpfile;
 	struct inode	       *ns_sufile;
-	struct inode	       *ns_gc_dat;
 
-	/* GC inode list and hash table head */
+	/* Checkpoint tree */
+	struct rb_root		ns_cptree;
+	spinlock_t		ns_cptree_lock;
+
+	/* GC inode list */
 	struct list_head	ns_gc_inodes;
-	struct hlist_head      *ns_gc_inodes_h;
 
 	/* Disk layout information (static) */
 	unsigned int		ns_blocksize_bits;
+	unsigned int		ns_blocksize;
 	unsigned long		ns_nsegments;
 	unsigned long		ns_blocks_per_segment;
 	unsigned long		ns_r_segments_percentage;
@@ -178,9 +163,6 @@ struct the_nilfs {
 	u32			ns_crc_seed;
 };
 
-#define NILFS_GCINODE_HASH_BITS		8
-#define NILFS_GCINODE_HASH_SIZE		(1<<NILFS_GCINODE_HASH_BITS)
-
 #define THE_NILFS_FNS(bit, name)					\
 static inline void set_nilfs_##name(struct the_nilfs *nilfs)		\
 {									\
@@ -201,65 +183,67 @@ THE_NILFS_FNS(DISCONTINUED, discontinued)
 THE_NILFS_FNS(GC_RUNNING, gc_running)
 THE_NILFS_FNS(SB_DIRTY, sb_dirty)
 
+/**
+ * struct nilfs_root - nilfs root object
+ * @cno: checkpoint number
+ * @rb_node: red-black tree node
+ * @count: refcount of this structure
+ * @nilfs: nilfs object
+ * @ifile: inode file
+ * @root: root inode
+ * @inodes_count: number of inodes
+ * @blocks_count: number of blocks (Reserved)
+ */
+struct nilfs_root {
+	__u64 cno;
+	struct rb_node rb_node;
+
+	atomic_t count;
+	struct the_nilfs *nilfs;
+	struct inode *ifile;
+
+	atomic_t inodes_count;
+	atomic_t blocks_count;
+};
+
+/* Special checkpoint number */
+#define NILFS_CPTREE_CURRENT_CNO	0
+
 /* Minimum interval of periodical update of superblocks (in seconds) */
 #define NILFS_SB_FREQ		10
-#define NILFS_ALTSB_FREQ	60  /* spare superblock */
 
 static inline int nilfs_sb_need_update(struct the_nilfs *nilfs)
 {
 	u64 t = get_seconds();
-	return t < nilfs->ns_sbwtime[0] ||
-		 t > nilfs->ns_sbwtime[0] + NILFS_SB_FREQ;
+	return t < nilfs->ns_sbwtime || t > nilfs->ns_sbwtime + NILFS_SB_FREQ;
 }
 
-static inline int nilfs_altsb_need_update(struct the_nilfs *nilfs)
+static inline int nilfs_sb_will_flip(struct the_nilfs *nilfs)
 {
-	u64 t = get_seconds();
-	struct nilfs_super_block **sbp = nilfs->ns_sbp;
-	return sbp[1] && t > nilfs->ns_sbwtime[1] + NILFS_ALTSB_FREQ;
+	int flip_bits = nilfs->ns_sbwcount & 0x0FL;
+	return (flip_bits != 0x08 && flip_bits != 0x0F);
 }
 
 void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64);
-struct the_nilfs *find_or_create_nilfs(struct block_device *);
-void put_nilfs(struct the_nilfs *);
+struct the_nilfs *alloc_nilfs(struct block_device *bdev);
+void destroy_nilfs(struct the_nilfs *nilfs);
 int init_nilfs(struct the_nilfs *, struct nilfs_sb_info *, char *);
 int load_nilfs(struct the_nilfs *, struct nilfs_sb_info *);
 int nilfs_discard_segments(struct the_nilfs *, __u64 *, size_t);
 int nilfs_count_free_blocks(struct the_nilfs *, sector_t *);
+struct nilfs_root *nilfs_lookup_root(struct the_nilfs *nilfs, __u64 cno);
+struct nilfs_root *nilfs_find_or_create_root(struct the_nilfs *nilfs,
+					     __u64 cno);
+void nilfs_put_root(struct nilfs_root *root);
 struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *, int, __u64);
-int nilfs_checkpoint_is_mounted(struct the_nilfs *, __u64, int);
 int nilfs_near_disk_full(struct the_nilfs *);
 void nilfs_fall_back_super_block(struct the_nilfs *);
 void nilfs_swap_super_block(struct the_nilfs *);
 
 
-static inline void get_nilfs(struct the_nilfs *nilfs)
-{
-	/* Caller must have at least one reference of the_nilfs. */
-	atomic_inc(&nilfs->ns_count);
-}
-
-static inline void
-nilfs_attach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
-{
-	down_write(&nilfs->ns_writer_sem);
-	nilfs->ns_writer = sbi;
-	up_write(&nilfs->ns_writer_sem);
-}
-
-static inline void
-nilfs_detach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
-{
-	down_write(&nilfs->ns_writer_sem);
-	if (sbi == nilfs->ns_writer)
-		nilfs->ns_writer = NULL;
-	up_write(&nilfs->ns_writer_sem);
-}
-
-static inline void nilfs_put_sbinfo(struct nilfs_sb_info *sbi)
+static inline void nilfs_get_root(struct nilfs_root *root)
 {
-	if (atomic_dec_and_test(&sbi->s_count))
-		kfree(sbi);
+	atomic_inc(&root->count);
 }
 
 static inline int nilfs_valid_fs(struct the_nilfs *nilfs)
diff --git a/fs/no-block.c b/fs/no-block.c
index d269a93d346..6e40e42a43d 100644
--- a/fs/no-block.c
+++ b/fs/no-block.c
@@ -19,4 +19,5 @@ static int no_blkdev_open(struct inode * inode, struct file * filp)
 
 const struct file_operations def_blk_fops = {
 	.open		= no_blkdev_open,
+	.llseek		= noop_llseek,
 };
diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig
index dffbb0911d0..22c629eedd8 100644
--- a/fs/notify/Kconfig
+++ b/fs/notify/Kconfig
@@ -3,3 +3,4 @@ config FSNOTIFY
 
 source "fs/notify/dnotify/Kconfig"
 source "fs/notify/inotify/Kconfig"
+source "fs/notify/fanotify/Kconfig"
diff --git a/fs/notify/Makefile b/fs/notify/Makefile
index 0922cc826c4..ae5f33a6d86 100644
--- a/fs/notify/Makefile
+++ b/fs/notify/Makefile
@@ -1,4 +1,6 @@
-obj-$(CONFIG_FSNOTIFY)		+= fsnotify.o notification.o group.o inode_mark.o
+obj-$(CONFIG_FSNOTIFY)		+= fsnotify.o notification.o group.o inode_mark.o \
+				   mark.o vfsmount_mark.o
 
 obj-y			+= dnotify/
 obj-y			+= inotify/
+obj-y			+= fanotify/
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 7e54e52964d..3344bdd5506 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -29,17 +29,17 @@
 int dir_notify_enable __read_mostly = 1;
 
 static struct kmem_cache *dnotify_struct_cache __read_mostly;
-static struct kmem_cache *dnotify_mark_entry_cache __read_mostly;
+static struct kmem_cache *dnotify_mark_cache __read_mostly;
 static struct fsnotify_group *dnotify_group __read_mostly;
 static DEFINE_MUTEX(dnotify_mark_mutex);
 
 /*
- * dnotify will attach one of these to each inode (i_fsnotify_mark_entries) which
+ * dnotify will attach one of these to each inode (i_fsnotify_marks) which
  * is being watched by dnotify.  If multiple userspace applications are watching
  * the same directory with dnotify their information is chained in dn
  */
-struct dnotify_mark_entry {
-	struct fsnotify_mark_entry fsn_entry;
+struct dnotify_mark {
+	struct fsnotify_mark fsn_mark;
 	struct dnotify_struct *dn;
 };
 
@@ -51,27 +51,27 @@ struct dnotify_mark_entry {
  * it calls the fsnotify function so it can update the set of all events relevant
  * to this inode.
  */
-static void dnotify_recalc_inode_mask(struct fsnotify_mark_entry *entry)
+static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark)
 {
 	__u32 new_mask, old_mask;
 	struct dnotify_struct *dn;
-	struct dnotify_mark_entry *dnentry  = container_of(entry,
-							   struct dnotify_mark_entry,
-							   fsn_entry);
+	struct dnotify_mark *dn_mark  = container_of(fsn_mark,
+						     struct dnotify_mark,
+						     fsn_mark);
 
-	assert_spin_locked(&entry->lock);
+	assert_spin_locked(&fsn_mark->lock);
 
-	old_mask = entry->mask;
+	old_mask = fsn_mark->mask;
 	new_mask = 0;
-	for (dn = dnentry->dn; dn != NULL; dn = dn->dn_next)
+	for (dn = dn_mark->dn; dn != NULL; dn = dn->dn_next)
 		new_mask |= (dn->dn_mask & ~FS_DN_MULTISHOT);
-	entry->mask = new_mask;
+	fsnotify_set_mark_mask_locked(fsn_mark, new_mask);
 
 	if (old_mask == new_mask)
 		return;
 
-	if (entry->inode)
-		fsnotify_recalc_inode_mask(entry->inode);
+	if (fsn_mark->i.inode)
+		fsnotify_recalc_inode_mask(fsn_mark->i.inode);
 }
 
 /*
@@ -83,29 +83,25 @@ static void dnotify_recalc_inode_mask(struct fsnotify_mark_entry *entry)
  * events.
  */
 static int dnotify_handle_event(struct fsnotify_group *group,
+				struct fsnotify_mark *inode_mark,
+				struct fsnotify_mark *vfsmount_mark,
 				struct fsnotify_event *event)
 {
-	struct fsnotify_mark_entry *entry = NULL;
-	struct dnotify_mark_entry *dnentry;
+	struct dnotify_mark *dn_mark;
 	struct inode *to_tell;
 	struct dnotify_struct *dn;
 	struct dnotify_struct **prev;
 	struct fown_struct *fown;
 	__u32 test_mask = event->mask & ~FS_EVENT_ON_CHILD;
 
-	to_tell = event->to_tell;
+	BUG_ON(vfsmount_mark);
 
-	spin_lock(&to_tell->i_lock);
-	entry = fsnotify_find_mark_entry(group, to_tell);
-	spin_unlock(&to_tell->i_lock);
+	to_tell = event->to_tell;
 
-	/* unlikely since we alreay passed dnotify_should_send_event() */
-	if (unlikely(!entry))
-		return 0;
-	dnentry = container_of(entry, struct dnotify_mark_entry, fsn_entry);
+	dn_mark = container_of(inode_mark, struct dnotify_mark, fsn_mark);
 
-	spin_lock(&entry->lock);
-	prev = &dnentry->dn;
+	spin_lock(&inode_mark->lock);
+	prev = &dn_mark->dn;
 	while ((dn = *prev) != NULL) {
 		if ((dn->dn_mask & test_mask) == 0) {
 			prev = &dn->dn_next;
@@ -118,12 +114,11 @@ static int dnotify_handle_event(struct fsnotify_group *group,
 		else {
 			*prev = dn->dn_next;
 			kmem_cache_free(dnotify_struct_cache, dn);
-			dnotify_recalc_inode_mask(entry);
+			dnotify_recalc_inode_mask(inode_mark);
 		}
 	}
 
-	spin_unlock(&entry->lock);
-	fsnotify_put_mark(entry);
+	spin_unlock(&inode_mark->lock);
 
 	return 0;
 }
@@ -133,44 +128,27 @@ static int dnotify_handle_event(struct fsnotify_group *group,
  * userspace notification for that pair.
  */
 static bool dnotify_should_send_event(struct fsnotify_group *group,
-				      struct inode *inode, __u32 mask)
+				      struct inode *inode,
+				      struct fsnotify_mark *inode_mark,
+				      struct fsnotify_mark *vfsmount_mark,
+				      __u32 mask, void *data, int data_type)
 {
-	struct fsnotify_mark_entry *entry;
-	bool send;
-
-	/* !dir_notify_enable should never get here, don't waste time checking
-	if (!dir_notify_enable)
-		return 0; */
-
 	/* not a dir, dnotify doesn't care */
 	if (!S_ISDIR(inode->i_mode))
 		return false;
 
-	spin_lock(&inode->i_lock);
-	entry = fsnotify_find_mark_entry(group, inode);
-	spin_unlock(&inode->i_lock);
-
-	/* no mark means no dnotify watch */
-	if (!entry)
-		return false;
-
-	mask = (mask & ~FS_EVENT_ON_CHILD);
-	send = (mask & entry->mask);
-
-	fsnotify_put_mark(entry); /* matches fsnotify_find_mark_entry */
-
-	return send;
+	return true;
 }
 
-static void dnotify_free_mark(struct fsnotify_mark_entry *entry)
+static void dnotify_free_mark(struct fsnotify_mark *fsn_mark)
 {
-	struct dnotify_mark_entry *dnentry = container_of(entry,
-							  struct dnotify_mark_entry,
-							  fsn_entry);
+	struct dnotify_mark *dn_mark = container_of(fsn_mark,
+						    struct dnotify_mark,
+						    fsn_mark);
 
-	BUG_ON(dnentry->dn);
+	BUG_ON(dn_mark->dn);
 
-	kmem_cache_free(dnotify_mark_entry_cache, dnentry);
+	kmem_cache_free(dnotify_mark_cache, dn_mark);
 }
 
 static struct fsnotify_ops dnotify_fsnotify_ops = {
@@ -183,15 +161,15 @@ static struct fsnotify_ops dnotify_fsnotify_ops = {
 
 /*
  * Called every time a file is closed.  Looks first for a dnotify mark on the
- * inode.  If one is found run all of the ->dn entries attached to that
+ * inode.  If one is found run all of the ->dn structures attached to that
  * mark for one relevant to this process closing the file and remove that
  * dnotify_struct.  If that was the last dnotify_struct also remove the
- * fsnotify_mark_entry.
+ * fsnotify_mark.
  */
 void dnotify_flush(struct file *filp, fl_owner_t id)
 {
-	struct fsnotify_mark_entry *entry;
-	struct dnotify_mark_entry *dnentry;
+	struct fsnotify_mark *fsn_mark;
+	struct dnotify_mark *dn_mark;
 	struct dnotify_struct *dn;
 	struct dnotify_struct **prev;
 	struct inode *inode;
@@ -200,38 +178,34 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
 	if (!S_ISDIR(inode->i_mode))
 		return;
 
-	spin_lock(&inode->i_lock);
-	entry = fsnotify_find_mark_entry(dnotify_group, inode);
-	spin_unlock(&inode->i_lock);
-	if (!entry)
+	fsn_mark = fsnotify_find_inode_mark(dnotify_group, inode);
+	if (!fsn_mark)
 		return;
-	dnentry = container_of(entry, struct dnotify_mark_entry, fsn_entry);
+	dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark);
 
 	mutex_lock(&dnotify_mark_mutex);
 
-	spin_lock(&entry->lock);
-	prev = &dnentry->dn;
+	spin_lock(&fsn_mark->lock);
+	prev = &dn_mark->dn;
 	while ((dn = *prev) != NULL) {
 		if ((dn->dn_owner == id) && (dn->dn_filp == filp)) {
 			*prev = dn->dn_next;
 			kmem_cache_free(dnotify_struct_cache, dn);
-			dnotify_recalc_inode_mask(entry);
+			dnotify_recalc_inode_mask(fsn_mark);
 			break;
 		}
 		prev = &dn->dn_next;
 	}
 
-	spin_unlock(&entry->lock);
+	spin_unlock(&fsn_mark->lock);
 
 	/* nothing else could have found us thanks to the dnotify_mark_mutex */
-	if (dnentry->dn == NULL)
-		fsnotify_destroy_mark_by_entry(entry);
-
-	fsnotify_recalc_group_mask(dnotify_group);
+	if (dn_mark->dn == NULL)
+		fsnotify_destroy_mark(fsn_mark);
 
 	mutex_unlock(&dnotify_mark_mutex);
 
-	fsnotify_put_mark(entry);
+	fsnotify_put_mark(fsn_mark);
 }
 
 /* this conversion is done only at watch creation */
@@ -259,16 +233,16 @@ static __u32 convert_arg(unsigned long arg)
 
 /*
  * If multiple processes watch the same inode with dnotify there is only one
- * dnotify mark in inode->i_fsnotify_mark_entries but we chain a dnotify_struct
+ * dnotify mark in inode->i_fsnotify_marks but we chain a dnotify_struct
  * onto that mark.  This function either attaches the new dnotify_struct onto
  * that list, or it |= the mask onto an existing dnofiy_struct.
  */
-static int attach_dn(struct dnotify_struct *dn, struct dnotify_mark_entry *dnentry,
+static int attach_dn(struct dnotify_struct *dn, struct dnotify_mark *dn_mark,
 		     fl_owner_t id, int fd, struct file *filp, __u32 mask)
 {
 	struct dnotify_struct *odn;
 
-	odn = dnentry->dn;
+	odn = dn_mark->dn;
 	while (odn != NULL) {
 		/* adding more events to existing dnofiy_struct? */
 		if ((odn->dn_owner == id) && (odn->dn_filp == filp)) {
@@ -283,8 +257,8 @@ static int attach_dn(struct dnotify_struct *dn, struct dnotify_mark_entry *dnent
 	dn->dn_fd = fd;
 	dn->dn_filp = filp;
 	dn->dn_owner = id;
-	dn->dn_next = dnentry->dn;
-	dnentry->dn = dn;
+	dn->dn_next = dn_mark->dn;
+	dn_mark->dn = dn;
 
 	return 0;
 }
@@ -296,8 +270,8 @@ static int attach_dn(struct dnotify_struct *dn, struct dnotify_mark_entry *dnent
  */
 int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
 {
-	struct dnotify_mark_entry *new_dnentry, *dnentry;
-	struct fsnotify_mark_entry *new_entry, *entry;
+	struct dnotify_mark *new_dn_mark, *dn_mark;
+	struct fsnotify_mark *new_fsn_mark, *fsn_mark;
 	struct dnotify_struct *dn;
 	struct inode *inode;
 	fl_owner_t id = current->files;
@@ -306,7 +280,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
 	__u32 mask;
 
 	/* we use these to tell if we need to kfree */
-	new_entry = NULL;
+	new_fsn_mark = NULL;
 	dn = NULL;
 
 	if (!dir_notify_enable) {
@@ -336,8 +310,8 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
 	}
 
 	/* new fsnotify mark, we expect most fcntl calls to add a new mark */
-	new_dnentry = kmem_cache_alloc(dnotify_mark_entry_cache, GFP_KERNEL);
-	if (!new_dnentry) {
+	new_dn_mark = kmem_cache_alloc(dnotify_mark_cache, GFP_KERNEL);
+	if (!new_dn_mark) {
 		error = -ENOMEM;
 		goto out_err;
 	}
@@ -345,29 +319,27 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
 	/* convert the userspace DN_* "arg" to the internal FS_* defines in fsnotify */
 	mask = convert_arg(arg);
 
-	/* set up the new_entry and new_dnentry */
-	new_entry = &new_dnentry->fsn_entry;
-	fsnotify_init_mark(new_entry, dnotify_free_mark);
-	new_entry->mask = mask;
-	new_dnentry->dn = NULL;
+	/* set up the new_fsn_mark and new_dn_mark */
+	new_fsn_mark = &new_dn_mark->fsn_mark;
+	fsnotify_init_mark(new_fsn_mark, dnotify_free_mark);
+	new_fsn_mark->mask = mask;
+	new_dn_mark->dn = NULL;
 
 	/* this is needed to prevent the fcntl/close race described below */
 	mutex_lock(&dnotify_mark_mutex);
 
-	/* add the new_entry or find an old one. */
-	spin_lock(&inode->i_lock);
-	entry = fsnotify_find_mark_entry(dnotify_group, inode);
-	spin_unlock(&inode->i_lock);
-	if (entry) {
-		dnentry = container_of(entry, struct dnotify_mark_entry, fsn_entry);
-		spin_lock(&entry->lock);
+	/* add the new_fsn_mark or find an old one. */
+	fsn_mark = fsnotify_find_inode_mark(dnotify_group, inode);
+	if (fsn_mark) {
+		dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark);
+		spin_lock(&fsn_mark->lock);
 	} else {
-		fsnotify_add_mark(new_entry, dnotify_group, inode);
-		spin_lock(&new_entry->lock);
-		entry = new_entry;
-		dnentry = new_dnentry;
-		/* we used new_entry, so don't free it */
-		new_entry = NULL;
+		fsnotify_add_mark(new_fsn_mark, dnotify_group, inode, NULL, 0);
+		spin_lock(&new_fsn_mark->lock);
+		fsn_mark = new_fsn_mark;
+		dn_mark = new_dn_mark;
+		/* we used new_fsn_mark, so don't free it */
+		new_fsn_mark = NULL;
 	}
 
 	rcu_read_lock();
@@ -376,17 +348,17 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
 
 	/* if (f != filp) means that we lost a race and another task/thread
 	 * actually closed the fd we are still playing with before we grabbed
-	 * the dnotify_mark_mutex and entry->lock.  Since closing the fd is the
-	 * only time we clean up the mark entries we need to get our mark off
+	 * the dnotify_mark_mutex and fsn_mark->lock.  Since closing the fd is the
+	 * only time we clean up the marks we need to get our mark off
 	 * the list. */
 	if (f != filp) {
 		/* if we added ourselves, shoot ourselves, it's possible that
-		 * the flush actually did shoot this entry.  That's fine too
+		 * the flush actually did shoot this fsn_mark.  That's fine too
 		 * since multiple calls to destroy_mark is perfectly safe, if
-		 * we found a dnentry already attached to the inode, just sod
+		 * we found a dn_mark already attached to the inode, just sod
 		 * off silently as the flush at close time dealt with it.
 		 */
-		if (dnentry == new_dnentry)
+		if (dn_mark == new_dn_mark)
 			destroy = 1;
 		goto out;
 	}
@@ -394,13 +366,13 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
 	error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
 	if (error) {
 		/* if we added, we must shoot */
-		if (dnentry == new_dnentry)
+		if (dn_mark == new_dn_mark)
 			destroy = 1;
 		goto out;
 	}
 
-	error = attach_dn(dn, dnentry, id, fd, filp, mask);
-	/* !error means that we attached the dn to the dnentry, so don't free it */
+	error = attach_dn(dn, dn_mark, id, fd, filp, mask);
+	/* !error means that we attached the dn to the dn_mark, so don't free it */
 	if (!error)
 		dn = NULL;
 	/* -EEXIST means that we didn't add this new dn and used an old one.
@@ -408,20 +380,18 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
 	else if (error == -EEXIST)
 		error = 0;
 
-	dnotify_recalc_inode_mask(entry);
+	dnotify_recalc_inode_mask(fsn_mark);
 out:
-	spin_unlock(&entry->lock);
+	spin_unlock(&fsn_mark->lock);
 
 	if (destroy)
-		fsnotify_destroy_mark_by_entry(entry);
-
-	fsnotify_recalc_group_mask(dnotify_group);
+		fsnotify_destroy_mark(fsn_mark);
 
 	mutex_unlock(&dnotify_mark_mutex);
-	fsnotify_put_mark(entry);
+	fsnotify_put_mark(fsn_mark);
 out_err:
-	if (new_entry)
-		fsnotify_put_mark(new_entry);
+	if (new_fsn_mark)
+		fsnotify_put_mark(new_fsn_mark);
 	if (dn)
 		kmem_cache_free(dnotify_struct_cache, dn);
 	return error;
@@ -430,10 +400,9 @@ out_err:
 static int __init dnotify_init(void)
 {
 	dnotify_struct_cache = KMEM_CACHE(dnotify_struct, SLAB_PANIC);
-	dnotify_mark_entry_cache = KMEM_CACHE(dnotify_mark_entry, SLAB_PANIC);
+	dnotify_mark_cache = KMEM_CACHE(dnotify_mark, SLAB_PANIC);
 
-	dnotify_group = fsnotify_obtain_group(DNOTIFY_GROUP_NUM,
-					      0, &dnotify_fsnotify_ops);
+	dnotify_group = fsnotify_alloc_group(&dnotify_fsnotify_ops);
 	if (IS_ERR(dnotify_group))
 		panic("unable to allocate fsnotify group for dnotify\n");
 	return 0;
diff --git a/fs/notify/fanotify/Kconfig b/fs/notify/fanotify/Kconfig
new file mode 100644
index 00000000000..3ac36b7bf6b
--- /dev/null
+++ b/fs/notify/fanotify/Kconfig
@@ -0,0 +1,26 @@
+config FANOTIFY
+	bool "Filesystem wide access notification"
+	select FSNOTIFY
+	select ANON_INODES
+	default n
+	---help---
+	   Say Y here to enable fanotify suport.  fanotify is a file access
+	   notification system which differs from inotify in that it sends
+	   and open file descriptor to the userspace listener along with
+	   the event.
+
+	   If unsure, say Y.
+
+config FANOTIFY_ACCESS_PERMISSIONS
+	bool "fanotify permissions checking"
+	depends on FANOTIFY
+	depends on SECURITY
+	default n
+	---help---
+	   Say Y here is you want fanotify listeners to be able to make permissions
+	   decisions concerning filesystem events.  This is used by some fanotify
+	   listeners which need to scan files before allowing the system access to
+	   use those files.  This is used by some anti-malware vendors and by some
+	   hierarchical storage managent systems.
+
+	   If unsure, say N.
diff --git a/fs/notify/fanotify/Makefile b/fs/notify/fanotify/Makefile
new file mode 100644
index 00000000000..0999213e7e6
--- /dev/null
+++ b/fs/notify/fanotify/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_FANOTIFY)		+= fanotify.o fanotify_user.o
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
new file mode 100644
index 00000000000..b04f88eed09
--- /dev/null
+++ b/fs/notify/fanotify/fanotify.c
@@ -0,0 +1,224 @@
+#include <linux/fanotify.h>
+#include <linux/fdtable.h>
+#include <linux/fsnotify_backend.h>
+#include <linux/init.h>
+#include <linux/jiffies.h>
+#include <linux/kernel.h> /* UINT_MAX */
+#include <linux/mount.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <linux/wait.h>
+
+static bool should_merge(struct fsnotify_event *old, struct fsnotify_event *new)
+{
+	pr_debug("%s: old=%p new=%p\n", __func__, old, new);
+
+	if (old->to_tell == new->to_tell &&
+	    old->data_type == new->data_type &&
+	    old->tgid == new->tgid) {
+		switch (old->data_type) {
+		case (FSNOTIFY_EVENT_PATH):
+			if ((old->path.mnt == new->path.mnt) &&
+			    (old->path.dentry == new->path.dentry))
+				return true;
+		case (FSNOTIFY_EVENT_NONE):
+			return true;
+		default:
+			BUG();
+		};
+	}
+	return false;
+}
+
+/* and the list better be locked by something too! */
+static struct fsnotify_event *fanotify_merge(struct list_head *list,
+					     struct fsnotify_event *event)
+{
+	struct fsnotify_event_holder *test_holder;
+	struct fsnotify_event *test_event = NULL;
+	struct fsnotify_event *new_event;
+
+	pr_debug("%s: list=%p event=%p\n", __func__, list, event);
+
+
+	list_for_each_entry_reverse(test_holder, list, event_list) {
+		if (should_merge(test_holder->event, event)) {
+			test_event = test_holder->event;
+			break;
+		}
+	}
+
+	if (!test_event)
+		return NULL;
+
+	fsnotify_get_event(test_event);
+
+	/* if they are exactly the same we are done */
+	if (test_event->mask == event->mask)
+		return test_event;
+
+	/*
+	 * if the refcnt == 2 this is the only queue
+	 * for this event and so we can update the mask
+	 * in place.
+	 */
+	if (atomic_read(&test_event->refcnt) == 2) {
+		test_event->mask |= event->mask;
+		return test_event;
+	}
+
+	new_event = fsnotify_clone_event(test_event);
+
+	/* done with test_event */
+	fsnotify_put_event(test_event);
+
+	/* couldn't allocate memory, merge was not possible */
+	if (unlikely(!new_event))
+		return ERR_PTR(-ENOMEM);
+
+	/* build new event and replace it on the list */
+	new_event->mask = (test_event->mask | event->mask);
+	fsnotify_replace_event(test_holder, new_event);
+
+	/* we hold a reference on new_event from clone_event */
+	return new_event;
+}
+
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+static int fanotify_get_response_from_access(struct fsnotify_group *group,
+					     struct fsnotify_event *event)
+{
+	int ret;
+
+	pr_debug("%s: group=%p event=%p\n", __func__, group, event);
+
+	wait_event(group->fanotify_data.access_waitq, event->response);
+
+	/* userspace responded, convert to something usable */
+	spin_lock(&event->lock);
+	switch (event->response) {
+	case FAN_ALLOW:
+		ret = 0;
+		break;
+	case FAN_DENY:
+	default:
+		ret = -EPERM;
+	}
+	event->response = 0;
+	spin_unlock(&event->lock);
+
+	pr_debug("%s: group=%p event=%p about to return ret=%d\n", __func__,
+		 group, event, ret);
+	
+	return ret;
+}
+#endif
+
+static int fanotify_handle_event(struct fsnotify_group *group,
+				 struct fsnotify_mark *inode_mark,
+				 struct fsnotify_mark *fanotify_mark,
+				 struct fsnotify_event *event)
+{
+	int ret = 0;
+	struct fsnotify_event *notify_event = NULL;
+
+	BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS);
+	BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY);
+	BUILD_BUG_ON(FAN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE);
+	BUILD_BUG_ON(FAN_CLOSE_WRITE != FS_CLOSE_WRITE);
+	BUILD_BUG_ON(FAN_OPEN != FS_OPEN);
+	BUILD_BUG_ON(FAN_EVENT_ON_CHILD != FS_EVENT_ON_CHILD);
+	BUILD_BUG_ON(FAN_Q_OVERFLOW != FS_Q_OVERFLOW);
+	BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM);
+	BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM);
+	BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR);
+
+	pr_debug("%s: group=%p event=%p\n", __func__, group, event);
+
+	notify_event = fsnotify_add_notify_event(group, event, NULL, fanotify_merge);
+	if (IS_ERR(notify_event))
+		return PTR_ERR(notify_event);
+
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+	if (event->mask & FAN_ALL_PERM_EVENTS) {
+		/* if we merged we need to wait on the new event */
+		if (notify_event)
+			event = notify_event;
+		ret = fanotify_get_response_from_access(group, event);
+	}
+#endif
+
+	if (notify_event)
+		fsnotify_put_event(notify_event);
+
+	return ret;
+}
+
+static bool fanotify_should_send_event(struct fsnotify_group *group,
+				       struct inode *to_tell,
+				       struct fsnotify_mark *inode_mark,
+				       struct fsnotify_mark *vfsmnt_mark,
+				       __u32 event_mask, void *data, int data_type)
+{
+	__u32 marks_mask, marks_ignored_mask;
+	struct path *path = data;
+
+	pr_debug("%s: group=%p to_tell=%p inode_mark=%p vfsmnt_mark=%p "
+		 "mask=%x data=%p data_type=%d\n", __func__, group, to_tell,
+		 inode_mark, vfsmnt_mark, event_mask, data, data_type);
+
+	/* if we don't have enough info to send an event to userspace say no */
+	if (data_type != FSNOTIFY_EVENT_PATH)
+		return false;
+
+	/* sorry, fanotify only gives a damn about files and dirs */
+	if (!S_ISREG(path->dentry->d_inode->i_mode) &&
+	    !S_ISDIR(path->dentry->d_inode->i_mode))
+		return false;
+
+	if (inode_mark && vfsmnt_mark) {
+		marks_mask = (vfsmnt_mark->mask | inode_mark->mask);
+		marks_ignored_mask = (vfsmnt_mark->ignored_mask | inode_mark->ignored_mask);
+	} else if (inode_mark) {
+		/*
+		 * if the event is for a child and this inode doesn't care about
+		 * events on the child, don't send it!
+		 */
+		if ((event_mask & FS_EVENT_ON_CHILD) &&
+		    !(inode_mark->mask & FS_EVENT_ON_CHILD))
+			return false;
+		marks_mask = inode_mark->mask;
+		marks_ignored_mask = inode_mark->ignored_mask;
+	} else if (vfsmnt_mark) {
+		marks_mask = vfsmnt_mark->mask;
+		marks_ignored_mask = vfsmnt_mark->ignored_mask;
+	} else {
+		BUG();
+	}
+
+	if (S_ISDIR(path->dentry->d_inode->i_mode) &&
+	    (marks_ignored_mask & FS_ISDIR))
+		return false;
+
+	if (event_mask & marks_mask & ~marks_ignored_mask)
+		return true;
+
+	return false;
+}
+
+static void fanotify_free_group_priv(struct fsnotify_group *group)
+{
+	struct user_struct *user;
+
+	user = group->fanotify_data.user;
+	atomic_dec(&user->fanotify_listeners);
+	free_uid(user);
+}
+
+const struct fsnotify_ops fanotify_fsnotify_ops = {
+	.handle_event = fanotify_handle_event,
+	.should_send_event = fanotify_should_send_event,
+	.free_group_priv = fanotify_free_group_priv,
+	.free_event_priv = NULL,
+	.freeing_mark = NULL,
+};
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
new file mode 100644
index 00000000000..063224812b7
--- /dev/null
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -0,0 +1,870 @@
+#include <linux/fanotify.h>
+#include <linux/fcntl.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/anon_inodes.h>
+#include <linux/fsnotify_backend.h>
+#include <linux/init.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/poll.h>
+#include <linux/security.h>
+#include <linux/syscalls.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+
+#include <asm/ioctls.h>
+
+#define FANOTIFY_DEFAULT_MAX_EVENTS	16384
+#define FANOTIFY_DEFAULT_MAX_MARKS	8192
+#define FANOTIFY_DEFAULT_MAX_LISTENERS	128
+
+extern const struct fsnotify_ops fanotify_fsnotify_ops;
+
+static struct kmem_cache *fanotify_mark_cache __read_mostly;
+static struct kmem_cache *fanotify_response_event_cache __read_mostly;
+
+struct fanotify_response_event {
+	struct list_head list;
+	__s32 fd;
+	struct fsnotify_event *event;
+};
+
+/*
+ * Get an fsnotify notification event if one exists and is small
+ * enough to fit in "count". Return an error pointer if the count
+ * is not large enough.
+ *
+ * Called with the group->notification_mutex held.
+ */
+static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
+					    size_t count)
+{
+	BUG_ON(!mutex_is_locked(&group->notification_mutex));
+
+	pr_debug("%s: group=%p count=%zd\n", __func__, group, count);
+
+	if (fsnotify_notify_queue_is_empty(group))
+		return NULL;
+
+	if (FAN_EVENT_METADATA_LEN > count)
+		return ERR_PTR(-EINVAL);
+
+	/* held the notification_mutex the whole time, so this is the
+	 * same event we peeked above */
+	return fsnotify_remove_notify_event(group);
+}
+
+static int create_fd(struct fsnotify_group *group, struct fsnotify_event *event)
+{
+	int client_fd;
+	struct dentry *dentry;
+	struct vfsmount *mnt;
+	struct file *new_file;
+
+	pr_debug("%s: group=%p event=%p\n", __func__, group, event);
+
+	client_fd = get_unused_fd();
+	if (client_fd < 0)
+		return client_fd;
+
+	if (event->data_type != FSNOTIFY_EVENT_PATH) {
+		WARN_ON(1);
+		put_unused_fd(client_fd);
+		return -EINVAL;
+	}
+
+	/*
+	 * we need a new file handle for the userspace program so it can read even if it was
+	 * originally opened O_WRONLY.
+	 */
+	dentry = dget(event->path.dentry);
+	mnt = mntget(event->path.mnt);
+	/* it's possible this event was an overflow event.  in that case dentry and mnt
+	 * are NULL;  That's fine, just don't call dentry open */
+	if (dentry && mnt)
+		new_file = dentry_open(dentry, mnt,
+				       group->fanotify_data.f_flags | FMODE_NONOTIFY,
+				       current_cred());
+	else
+		new_file = ERR_PTR(-EOVERFLOW);
+	if (IS_ERR(new_file)) {
+		/*
+		 * we still send an event even if we can't open the file.  this
+		 * can happen when say tasks are gone and we try to open their
+		 * /proc files or we try to open a WRONLY file like in sysfs
+		 * we just send the errno to userspace since there isn't much
+		 * else we can do.
+		 */
+		put_unused_fd(client_fd);
+		client_fd = PTR_ERR(new_file);
+	} else {
+		fd_install(client_fd, new_file);
+	}
+
+	return client_fd;
+}
+
+static ssize_t fill_event_metadata(struct fsnotify_group *group,
+				   struct fanotify_event_metadata *metadata,
+				   struct fsnotify_event *event)
+{
+	pr_debug("%s: group=%p metadata=%p event=%p\n", __func__,
+		 group, metadata, event);
+
+	metadata->event_len = FAN_EVENT_METADATA_LEN;
+	metadata->vers = FANOTIFY_METADATA_VERSION;
+	metadata->mask = event->mask & FAN_ALL_OUTGOING_EVENTS;
+	metadata->pid = pid_vnr(event->tgid);
+	metadata->fd = create_fd(group, event);
+
+	return metadata->fd;
+}
+
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+static struct fanotify_response_event *dequeue_re(struct fsnotify_group *group,
+						  __s32 fd)
+{
+	struct fanotify_response_event *re, *return_re = NULL;
+
+	mutex_lock(&group->fanotify_data.access_mutex);
+	list_for_each_entry(re, &group->fanotify_data.access_list, list) {
+		if (re->fd != fd)
+			continue;
+
+		list_del_init(&re->list);
+		return_re = re;
+		break;
+	}
+	mutex_unlock(&group->fanotify_data.access_mutex);
+
+	pr_debug("%s: found return_re=%p\n", __func__, return_re);
+
+	return return_re;
+}
+
+static int process_access_response(struct fsnotify_group *group,
+				   struct fanotify_response *response_struct)
+{
+	struct fanotify_response_event *re;
+	__s32 fd = response_struct->fd;
+	__u32 response = response_struct->response;
+
+	pr_debug("%s: group=%p fd=%d response=%d\n", __func__, group,
+		 fd, response);
+	/*
+	 * make sure the response is valid, if invalid we do nothing and either
+	 * userspace can send a valid responce or we will clean it up after the
+	 * timeout
+	 */
+	switch (response) {
+	case FAN_ALLOW:
+	case FAN_DENY:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (fd < 0)
+		return -EINVAL;
+
+	re = dequeue_re(group, fd);
+	if (!re)
+		return -ENOENT;
+
+	re->event->response = response;
+
+	wake_up(&group->fanotify_data.access_waitq);
+
+	kmem_cache_free(fanotify_response_event_cache, re);
+
+	return 0;
+}
+
+static int prepare_for_access_response(struct fsnotify_group *group,
+				       struct fsnotify_event *event,
+				       __s32 fd)
+{
+	struct fanotify_response_event *re;
+
+	if (!(event->mask & FAN_ALL_PERM_EVENTS))
+		return 0;
+
+	re = kmem_cache_alloc(fanotify_response_event_cache, GFP_KERNEL);
+	if (!re)
+		return -ENOMEM;
+
+	re->event = event;
+	re->fd = fd;
+
+	mutex_lock(&group->fanotify_data.access_mutex);
+
+	if (group->fanotify_data.bypass_perm) {
+		mutex_unlock(&group->fanotify_data.access_mutex);
+		kmem_cache_free(fanotify_response_event_cache, re);
+		event->response = FAN_ALLOW;
+		return 0;
+	}
+		
+	list_add_tail(&re->list, &group->fanotify_data.access_list);
+	mutex_unlock(&group->fanotify_data.access_mutex);
+
+	return 0;
+}
+
+static void remove_access_response(struct fsnotify_group *group,
+				   struct fsnotify_event *event,
+				   __s32 fd)
+{
+	struct fanotify_response_event *re;
+
+	if (!(event->mask & FAN_ALL_PERM_EVENTS))
+		return;
+
+	re = dequeue_re(group, fd);
+	if (!re)
+		return;
+
+	BUG_ON(re->event != event);
+
+	kmem_cache_free(fanotify_response_event_cache, re);
+
+	return;
+}
+#else
+static int prepare_for_access_response(struct fsnotify_group *group,
+				       struct fsnotify_event *event,
+				       __s32 fd)
+{
+	return 0;
+}
+
+static void remove_access_response(struct fsnotify_group *group,
+				   struct fsnotify_event *event,
+				   __s32 fd)
+{
+	return;
+}
+#endif
+
+static ssize_t copy_event_to_user(struct fsnotify_group *group,
+				  struct fsnotify_event *event,
+				  char __user *buf)
+{
+	struct fanotify_event_metadata fanotify_event_metadata;
+	int fd, ret;
+
+	pr_debug("%s: group=%p event=%p\n", __func__, group, event);
+
+	fd = fill_event_metadata(group, &fanotify_event_metadata, event);
+	if (fd < 0)
+		return fd;
+
+	ret = prepare_for_access_response(group, event, fd);
+	if (ret)
+		goto out_close_fd;
+
+	ret = -EFAULT;
+	if (copy_to_user(buf, &fanotify_event_metadata, FAN_EVENT_METADATA_LEN))
+		goto out_kill_access_response;
+
+	return FAN_EVENT_METADATA_LEN;
+
+out_kill_access_response:
+	remove_access_response(group, event, fd);
+out_close_fd:
+	sys_close(fd);
+	return ret;
+}
+
+/* intofiy userspace file descriptor functions */
+static unsigned int fanotify_poll(struct file *file, poll_table *wait)
+{
+	struct fsnotify_group *group = file->private_data;
+	int ret = 0;
+
+	poll_wait(file, &group->notification_waitq, wait);
+	mutex_lock(&group->notification_mutex);
+	if (!fsnotify_notify_queue_is_empty(group))
+		ret = POLLIN | POLLRDNORM;
+	mutex_unlock(&group->notification_mutex);
+
+	return ret;
+}
+
+static ssize_t fanotify_read(struct file *file, char __user *buf,
+			     size_t count, loff_t *pos)
+{
+	struct fsnotify_group *group;
+	struct fsnotify_event *kevent;
+	char __user *start;
+	int ret;
+	DEFINE_WAIT(wait);
+
+	start = buf;
+	group = file->private_data;
+
+	pr_debug("%s: group=%p\n", __func__, group);
+
+	while (1) {
+		prepare_to_wait(&group->notification_waitq, &wait, TASK_INTERRUPTIBLE);
+
+		mutex_lock(&group->notification_mutex);
+		kevent = get_one_event(group, count);
+		mutex_unlock(&group->notification_mutex);
+
+		if (kevent) {
+			ret = PTR_ERR(kevent);
+			if (IS_ERR(kevent))
+				break;
+			ret = copy_event_to_user(group, kevent, buf);
+			fsnotify_put_event(kevent);
+			if (ret < 0)
+				break;
+			buf += ret;
+			count -= ret;
+			continue;
+		}
+
+		ret = -EAGAIN;
+		if (file->f_flags & O_NONBLOCK)
+			break;
+		ret = -ERESTARTSYS;
+		if (signal_pending(current))
+			break;
+
+		if (start != buf)
+			break;
+
+		schedule();
+	}
+
+	finish_wait(&group->notification_waitq, &wait);
+	if (start != buf && ret != -EFAULT)
+		ret = buf - start;
+	return ret;
+}
+
+static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
+{
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+	struct fanotify_response response = { .fd = -1, .response = -1 };
+	struct fsnotify_group *group;
+	int ret;
+
+	group = file->private_data;
+
+	if (count > sizeof(response))
+		count = sizeof(response);
+
+	pr_debug("%s: group=%p count=%zu\n", __func__, group, count);
+
+	if (copy_from_user(&response, buf, count))
+		return -EFAULT;
+
+	ret = process_access_response(group, &response);
+	if (ret < 0)
+		count = ret;
+
+	return count;
+#else
+	return -EINVAL;
+#endif
+}
+
+static int fanotify_release(struct inode *ignored, struct file *file)
+{
+	struct fsnotify_group *group = file->private_data;
+
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+	struct fanotify_response_event *re, *lre;
+
+	mutex_lock(&group->fanotify_data.access_mutex);
+
+	group->fanotify_data.bypass_perm = true;
+
+	list_for_each_entry_safe(re, lre, &group->fanotify_data.access_list, list) {
+		pr_debug("%s: found group=%p re=%p event=%p\n", __func__, group,
+			 re, re->event);
+
+		list_del_init(&re->list);
+		re->event->response = FAN_ALLOW;
+
+		kmem_cache_free(fanotify_response_event_cache, re);
+	}
+	mutex_unlock(&group->fanotify_data.access_mutex);
+
+	wake_up(&group->fanotify_data.access_waitq);
+#endif
+	/* matches the fanotify_init->fsnotify_alloc_group */
+	fsnotify_put_group(group);
+
+	return 0;
+}
+
+static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct fsnotify_group *group;
+	struct fsnotify_event_holder *holder;
+	void __user *p;
+	int ret = -ENOTTY;
+	size_t send_len = 0;
+
+	group = file->private_data;
+
+	p = (void __user *) arg;
+
+	switch (cmd) {
+	case FIONREAD:
+		mutex_lock(&group->notification_mutex);
+		list_for_each_entry(holder, &group->notification_list, event_list)
+			send_len += FAN_EVENT_METADATA_LEN;
+		mutex_unlock(&group->notification_mutex);
+		ret = put_user(send_len, (int __user *) p);
+		break;
+	}
+
+	return ret;
+}
+
+static const struct file_operations fanotify_fops = {
+	.poll		= fanotify_poll,
+	.read		= fanotify_read,
+	.write		= fanotify_write,
+	.fasync		= NULL,
+	.release	= fanotify_release,
+	.unlocked_ioctl	= fanotify_ioctl,
+	.compat_ioctl	= fanotify_ioctl,
+	.llseek		= noop_llseek,
+};
+
+static void fanotify_free_mark(struct fsnotify_mark *fsn_mark)
+{
+	kmem_cache_free(fanotify_mark_cache, fsn_mark);
+}
+
+static int fanotify_find_path(int dfd, const char __user *filename,
+			      struct path *path, unsigned int flags)
+{
+	int ret;
+
+	pr_debug("%s: dfd=%d filename=%p flags=%x\n", __func__,
+		 dfd, filename, flags);
+
+	if (filename == NULL) {
+		struct file *file;
+		int fput_needed;
+
+		ret = -EBADF;
+		file = fget_light(dfd, &fput_needed);
+		if (!file)
+			goto out;
+
+		ret = -ENOTDIR;
+		if ((flags & FAN_MARK_ONLYDIR) &&
+		    !(S_ISDIR(file->f_path.dentry->d_inode->i_mode))) {
+			fput_light(file, fput_needed);
+			goto out;
+		}
+
+		*path = file->f_path;
+		path_get(path);
+		fput_light(file, fput_needed);
+	} else {
+		unsigned int lookup_flags = 0;
+
+		if (!(flags & FAN_MARK_DONT_FOLLOW))
+			lookup_flags |= LOOKUP_FOLLOW;
+		if (flags & FAN_MARK_ONLYDIR)
+			lookup_flags |= LOOKUP_DIRECTORY;
+
+		ret = user_path_at(dfd, filename, lookup_flags, path);
+		if (ret)
+			goto out;
+	}
+
+	/* you can only watch an inode if you have read permissions on it */
+	ret = inode_permission(path->dentry->d_inode, MAY_READ);
+	if (ret)
+		path_put(path);
+out:
+	return ret;
+}
+
+static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark,
+					    __u32 mask,
+					    unsigned int flags)
+{
+	__u32 oldmask;
+
+	spin_lock(&fsn_mark->lock);
+	if (!(flags & FAN_MARK_IGNORED_MASK)) {
+		oldmask = fsn_mark->mask;
+		fsnotify_set_mark_mask_locked(fsn_mark, (oldmask & ~mask));
+	} else {
+		oldmask = fsn_mark->ignored_mask;
+		fsnotify_set_mark_ignored_mask_locked(fsn_mark, (oldmask & ~mask));
+	}
+	spin_unlock(&fsn_mark->lock);
+
+	if (!(oldmask & ~mask))
+		fsnotify_destroy_mark(fsn_mark);
+
+	return mask & oldmask;
+}
+
+static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group,
+					 struct vfsmount *mnt, __u32 mask,
+					 unsigned int flags)
+{
+	struct fsnotify_mark *fsn_mark = NULL;
+	__u32 removed;
+
+	fsn_mark = fsnotify_find_vfsmount_mark(group, mnt);
+	if (!fsn_mark)
+		return -ENOENT;
+
+	removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags);
+	fsnotify_put_mark(fsn_mark);
+	if (removed & mnt->mnt_fsnotify_mask)
+		fsnotify_recalc_vfsmount_mask(mnt);
+
+	return 0;
+}
+
+static int fanotify_remove_inode_mark(struct fsnotify_group *group,
+				      struct inode *inode, __u32 mask,
+				      unsigned int flags)
+{
+	struct fsnotify_mark *fsn_mark = NULL;
+	__u32 removed;
+
+	fsn_mark = fsnotify_find_inode_mark(group, inode);
+	if (!fsn_mark)
+		return -ENOENT;
+
+	removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags);
+	/* matches the fsnotify_find_inode_mark() */
+	fsnotify_put_mark(fsn_mark);
+	if (removed & inode->i_fsnotify_mask)
+		fsnotify_recalc_inode_mask(inode);
+
+	return 0;
+}
+
+static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
+				       __u32 mask,
+				       unsigned int flags)
+{
+	__u32 oldmask = -1;
+
+	spin_lock(&fsn_mark->lock);
+	if (!(flags & FAN_MARK_IGNORED_MASK)) {
+		oldmask = fsn_mark->mask;
+		fsnotify_set_mark_mask_locked(fsn_mark, (oldmask | mask));
+	} else {
+		__u32 tmask = fsn_mark->ignored_mask | mask;
+		fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask);
+		if (flags & FAN_MARK_IGNORED_SURV_MODIFY)
+			fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY;
+	}
+
+	if (!(flags & FAN_MARK_ONDIR)) {
+		__u32 tmask = fsn_mark->ignored_mask | FAN_ONDIR;
+		fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask);
+	}
+
+	spin_unlock(&fsn_mark->lock);
+
+	return mask & ~oldmask;
+}
+
+static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
+				      struct vfsmount *mnt, __u32 mask,
+				      unsigned int flags)
+{
+	struct fsnotify_mark *fsn_mark;
+	__u32 added;
+
+	fsn_mark = fsnotify_find_vfsmount_mark(group, mnt);
+	if (!fsn_mark) {
+		int ret;
+
+		if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks)
+			return -ENOSPC;
+
+		fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
+		if (!fsn_mark)
+			return -ENOMEM;
+
+		fsnotify_init_mark(fsn_mark, fanotify_free_mark);
+		ret = fsnotify_add_mark(fsn_mark, group, NULL, mnt, 0);
+		if (ret) {
+			fanotify_free_mark(fsn_mark);
+			return ret;
+		}
+	}
+	added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
+	fsnotify_put_mark(fsn_mark);
+	if (added & ~mnt->mnt_fsnotify_mask)
+		fsnotify_recalc_vfsmount_mask(mnt);
+
+	return 0;
+}
+
+static int fanotify_add_inode_mark(struct fsnotify_group *group,
+				   struct inode *inode, __u32 mask,
+				   unsigned int flags)
+{
+	struct fsnotify_mark *fsn_mark;
+	__u32 added;
+
+	pr_debug("%s: group=%p inode=%p\n", __func__, group, inode);
+
+	/*
+	 * If some other task has this inode open for write we should not add
+	 * an ignored mark, unless that ignored mark is supposed to survive
+	 * modification changes anyway.
+	 */
+	if ((flags & FAN_MARK_IGNORED_MASK) &&
+	    !(flags & FAN_MARK_IGNORED_SURV_MODIFY) &&
+	    (atomic_read(&inode->i_writecount) > 0))
+		return 0;
+
+	fsn_mark = fsnotify_find_inode_mark(group, inode);
+	if (!fsn_mark) {
+		int ret;
+
+		if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks)
+			return -ENOSPC;
+
+		fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
+		if (!fsn_mark)
+			return -ENOMEM;
+
+		fsnotify_init_mark(fsn_mark, fanotify_free_mark);
+		ret = fsnotify_add_mark(fsn_mark, group, inode, NULL, 0);
+		if (ret) {
+			fanotify_free_mark(fsn_mark);
+			return ret;
+		}
+	}
+	added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
+	fsnotify_put_mark(fsn_mark);
+	if (added & ~inode->i_fsnotify_mask)
+		fsnotify_recalc_inode_mask(inode);
+	return 0;
+}
+
+/* fanotify syscalls */
+SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
+{
+	struct fsnotify_group *group;
+	int f_flags, fd;
+	struct user_struct *user;
+
+	pr_debug("%s: flags=%d event_f_flags=%d\n",
+		__func__, flags, event_f_flags);
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (flags & ~FAN_ALL_INIT_FLAGS)
+		return -EINVAL;
+
+	user = get_current_user();
+	if (atomic_read(&user->fanotify_listeners) > FANOTIFY_DEFAULT_MAX_LISTENERS) {
+		free_uid(user);
+		return -EMFILE;
+	}
+
+	f_flags = O_RDWR | FMODE_NONOTIFY;
+	if (flags & FAN_CLOEXEC)
+		f_flags |= O_CLOEXEC;
+	if (flags & FAN_NONBLOCK)
+		f_flags |= O_NONBLOCK;
+
+	/* fsnotify_alloc_group takes a ref.  Dropped in fanotify_release */
+	group = fsnotify_alloc_group(&fanotify_fsnotify_ops);
+	if (IS_ERR(group))
+		return PTR_ERR(group);
+
+	group->fanotify_data.user = user;
+	atomic_inc(&user->fanotify_listeners);
+
+	group->fanotify_data.f_flags = event_f_flags;
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+	mutex_init(&group->fanotify_data.access_mutex);
+	init_waitqueue_head(&group->fanotify_data.access_waitq);
+	INIT_LIST_HEAD(&group->fanotify_data.access_list);
+#endif
+	switch (flags & FAN_ALL_CLASS_BITS) {
+	case FAN_CLASS_NOTIF:
+		group->priority = FS_PRIO_0;
+		break;
+	case FAN_CLASS_CONTENT:
+		group->priority = FS_PRIO_1;
+		break;
+	case FAN_CLASS_PRE_CONTENT:
+		group->priority = FS_PRIO_2;
+		break;
+	default:
+		fd = -EINVAL;
+		goto out_put_group;
+	}
+
+	if (flags & FAN_UNLIMITED_QUEUE) {
+		fd = -EPERM;
+		if (!capable(CAP_SYS_ADMIN))
+			goto out_put_group;
+		group->max_events = UINT_MAX;
+	} else {
+		group->max_events = FANOTIFY_DEFAULT_MAX_EVENTS;
+	}
+
+	if (flags & FAN_UNLIMITED_MARKS) {
+		fd = -EPERM;
+		if (!capable(CAP_SYS_ADMIN))
+			goto out_put_group;
+		group->fanotify_data.max_marks = UINT_MAX;
+	} else {
+		group->fanotify_data.max_marks = FANOTIFY_DEFAULT_MAX_MARKS;
+	}
+
+	fd = anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags);
+	if (fd < 0)
+		goto out_put_group;
+
+	return fd;
+
+out_put_group:
+	fsnotify_put_group(group);
+	return fd;
+}
+
+SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags,
+			      __u64 mask, int dfd,
+			      const char  __user * pathname)
+{
+	struct inode *inode = NULL;
+	struct vfsmount *mnt = NULL;
+	struct fsnotify_group *group;
+	struct file *filp;
+	struct path path;
+	int ret, fput_needed;
+
+	pr_debug("%s: fanotify_fd=%d flags=%x dfd=%d pathname=%p mask=%llx\n",
+		 __func__, fanotify_fd, flags, dfd, pathname, mask);
+
+	/* we only use the lower 32 bits as of right now. */
+	if (mask & ((__u64)0xffffffff << 32))
+		return -EINVAL;
+
+	if (flags & ~FAN_ALL_MARK_FLAGS)
+		return -EINVAL;
+	switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) {
+	case FAN_MARK_ADD:
+	case FAN_MARK_REMOVE:
+	case FAN_MARK_FLUSH:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (mask & FAN_ONDIR) {
+		flags |= FAN_MARK_ONDIR;
+		mask &= ~FAN_ONDIR;
+	}
+
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+	if (mask & ~(FAN_ALL_EVENTS | FAN_ALL_PERM_EVENTS | FAN_EVENT_ON_CHILD))
+#else
+	if (mask & ~(FAN_ALL_EVENTS | FAN_EVENT_ON_CHILD))
+#endif
+		return -EINVAL;
+
+	filp = fget_light(fanotify_fd, &fput_needed);
+	if (unlikely(!filp))
+		return -EBADF;
+
+	/* verify that this is indeed an fanotify instance */
+	ret = -EINVAL;
+	if (unlikely(filp->f_op != &fanotify_fops))
+		goto fput_and_out;
+	group = filp->private_data;
+
+	/*
+	 * group->priority == FS_PRIO_0 == FAN_CLASS_NOTIF.  These are not
+	 * allowed to set permissions events.
+	 */
+	ret = -EINVAL;
+	if (mask & FAN_ALL_PERM_EVENTS &&
+	    group->priority == FS_PRIO_0)
+		goto fput_and_out;
+
+	ret = fanotify_find_path(dfd, pathname, &path, flags);
+	if (ret)
+		goto fput_and_out;
+
+	/* inode held in place by reference to path; group by fget on fd */
+	if (!(flags & FAN_MARK_MOUNT))
+		inode = path.dentry->d_inode;
+	else
+		mnt = path.mnt;
+
+	/* create/update an inode mark */
+	switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) {
+	case FAN_MARK_ADD:
+		if (flags & FAN_MARK_MOUNT)
+			ret = fanotify_add_vfsmount_mark(group, mnt, mask, flags);
+		else
+			ret = fanotify_add_inode_mark(group, inode, mask, flags);
+		break;
+	case FAN_MARK_REMOVE:
+		if (flags & FAN_MARK_MOUNT)
+			ret = fanotify_remove_vfsmount_mark(group, mnt, mask, flags);
+		else
+			ret = fanotify_remove_inode_mark(group, inode, mask, flags);
+		break;
+	case FAN_MARK_FLUSH:
+		if (flags & FAN_MARK_MOUNT)
+			fsnotify_clear_vfsmount_marks_by_group(group);
+		else
+			fsnotify_clear_inode_marks_by_group(group);
+		break;
+	default:
+		ret = -EINVAL;
+	}
+
+	path_put(&path);
+fput_and_out:
+	fput_light(filp, fput_needed);
+	return ret;
+}
+
+#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
+asmlinkage long SyS_fanotify_mark(long fanotify_fd, long flags, __u64 mask,
+				  long dfd, long pathname)
+{
+	return SYSC_fanotify_mark((int) fanotify_fd, (unsigned int) flags,
+				  mask, (int) dfd,
+				  (const char  __user *) pathname);
+}
+SYSCALL_ALIAS(sys_fanotify_mark, SyS_fanotify_mark);
+#endif
+
+/*
+ * fanotify_user_setup - Our initialization function.  Note that we cannnot return
+ * error because we have compiled-in VFS hooks.  So an (unlikely) failure here
+ * must result in panic().
+ */
+static int __init fanotify_user_setup(void)
+{
+	fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC);
+	fanotify_response_event_cache = KMEM_CACHE(fanotify_response_event,
+						   SLAB_PANIC);
+
+	return 0;
+}
+device_initcall(fanotify_user_setup);
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index fcc2f064af8..20dc218707c 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -21,6 +21,7 @@
 #include <linux/gfp.h>
 #include <linux/init.h>
 #include <linux/module.h>
+#include <linux/mount.h>
 #include <linux/srcu.h>
 
 #include <linux/fsnotify_backend.h>
@@ -35,6 +36,11 @@ void __fsnotify_inode_delete(struct inode *inode)
 }
 EXPORT_SYMBOL_GPL(__fsnotify_inode_delete);
 
+void __fsnotify_vfsmount_delete(struct vfsmount *mnt)
+{
+	fsnotify_clear_marks_by_mount(mnt);
+}
+
 /*
  * Given an inode, first check if we care what happens to our children.  Inotify
  * and dnotify both tell their parents about events.  If we care about any event
@@ -78,112 +84,225 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode)
 }
 
 /* Notify this dentry's parent about a child's events. */
-void __fsnotify_parent(struct dentry *dentry, __u32 mask)
+int __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask)
 {
 	struct dentry *parent;
 	struct inode *p_inode;
-	bool send = false;
-	bool should_update_children = false;
+	int ret = 0;
+
+	if (!dentry)
+		dentry = path->dentry;
 
 	if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED))
-		return;
+		return 0;
 
-	spin_lock(&dentry->d_lock);
-	parent = dentry->d_parent;
+	parent = dget_parent(dentry);
 	p_inode = parent->d_inode;
 
-	if (fsnotify_inode_watches_children(p_inode)) {
-		if (p_inode->i_fsnotify_mask & mask) {
-			dget(parent);
-			send = true;
-		}
-	} else {
-		/*
-		 * The parent doesn't care about events on it's children but
-		 * at least one child thought it did.  We need to run all the
-		 * children and update their d_flags to let them know p_inode
-		 * doesn't care about them any more.
-		 */
-		dget(parent);
-		should_update_children = true;
-	}
-
-	spin_unlock(&dentry->d_lock);
-
-	if (send) {
+	if (unlikely(!fsnotify_inode_watches_children(p_inode)))
+		__fsnotify_update_child_dentry_flags(p_inode);
+	else if (p_inode->i_fsnotify_mask & mask) {
 		/* we are notifying a parent so come up with the new mask which
 		 * specifies these are events which came from a child. */
 		mask |= FS_EVENT_ON_CHILD;
 
-		fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE,
-			 dentry->d_name.name, 0);
-		dput(parent);
+		if (path)
+			ret = fsnotify(p_inode, mask, path, FSNOTIFY_EVENT_PATH,
+				       dentry->d_name.name, 0);
+		else
+			ret = fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE,
+				       dentry->d_name.name, 0);
 	}
 
-	if (unlikely(should_update_children)) {
-		__fsnotify_update_child_dentry_flags(p_inode);
-		dput(parent);
-	}
+	dput(parent);
+
+	return ret;
 }
 EXPORT_SYMBOL_GPL(__fsnotify_parent);
 
+static int send_to_group(struct inode *to_tell, struct vfsmount *mnt,
+			 struct fsnotify_mark *inode_mark,
+			 struct fsnotify_mark *vfsmount_mark,
+			 __u32 mask, void *data,
+			 int data_is, u32 cookie,
+			 const unsigned char *file_name,
+			 struct fsnotify_event **event)
+{
+	struct fsnotify_group *group = NULL;
+	__u32 inode_test_mask = 0;
+	__u32 vfsmount_test_mask = 0;
+
+	if (unlikely(!inode_mark && !vfsmount_mark)) {
+		BUG();
+		return 0;
+	}
+
+	/* clear ignored on inode modification */
+	if (mask & FS_MODIFY) {
+		if (inode_mark &&
+		    !(inode_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY))
+			inode_mark->ignored_mask = 0;
+		if (vfsmount_mark &&
+		    !(vfsmount_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY))
+			vfsmount_mark->ignored_mask = 0;
+	}
+
+	/* does the inode mark tell us to do something? */
+	if (inode_mark) {
+		group = inode_mark->group;
+		inode_test_mask = (mask & ~FS_EVENT_ON_CHILD);
+		inode_test_mask &= inode_mark->mask;
+		inode_test_mask &= ~inode_mark->ignored_mask;
+	}
+
+	/* does the vfsmount_mark tell us to do something? */
+	if (vfsmount_mark) {
+		vfsmount_test_mask = (mask & ~FS_EVENT_ON_CHILD);
+		group = vfsmount_mark->group;
+		vfsmount_test_mask &= vfsmount_mark->mask;
+		vfsmount_test_mask &= ~vfsmount_mark->ignored_mask;
+		if (inode_mark)
+			vfsmount_test_mask &= ~inode_mark->ignored_mask;
+	}
+
+	pr_debug("%s: group=%p to_tell=%p mnt=%p mask=%x inode_mark=%p"
+		 " inode_test_mask=%x vfsmount_mark=%p vfsmount_test_mask=%x"
+		 " data=%p data_is=%d cookie=%d event=%p\n",
+		 __func__, group, to_tell, mnt, mask, inode_mark,
+		 inode_test_mask, vfsmount_mark, vfsmount_test_mask, data,
+		 data_is, cookie, *event);
+
+	if (!inode_test_mask && !vfsmount_test_mask)
+		return 0;
+
+	if (group->ops->should_send_event(group, to_tell, inode_mark,
+					  vfsmount_mark, mask, data,
+					  data_is) == false)
+		return 0;
+
+	if (!*event) {
+		*event = fsnotify_create_event(to_tell, mask, data,
+						data_is, file_name,
+						cookie, GFP_KERNEL);
+		if (!*event)
+			return -ENOMEM;
+	}
+	return group->ops->handle_event(group, inode_mark, vfsmount_mark, *event);
+}
+
 /*
  * This is the main call to fsnotify.  The VFS calls into hook specific functions
  * in linux/fsnotify.h.  Those functions then in turn call here.  Here will call
  * out to all of the registered fsnotify_group.  Those groups can then use the
  * notification event in whatever means they feel necessary.
  */
-void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const char *file_name, u32 cookie)
+int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
+	     const unsigned char *file_name, u32 cookie)
 {
-	struct fsnotify_group *group;
+	struct hlist_node *inode_node = NULL, *vfsmount_node = NULL;
+	struct fsnotify_mark *inode_mark = NULL, *vfsmount_mark = NULL;
+	struct fsnotify_group *inode_group, *vfsmount_group;
 	struct fsnotify_event *event = NULL;
-	int idx;
+	struct vfsmount *mnt;
+	int idx, ret = 0;
 	/* global tests shouldn't care about events on child only the specific event */
 	__u32 test_mask = (mask & ~FS_EVENT_ON_CHILD);
 
-	if (list_empty(&fsnotify_groups))
-		return;
-
-	if (!(test_mask & fsnotify_mask))
-		return;
+	if (data_is == FSNOTIFY_EVENT_PATH)
+		mnt = ((struct path *)data)->mnt;
+	else
+		mnt = NULL;
 
-	if (!(test_mask & to_tell->i_fsnotify_mask))
-		return;
 	/*
-	 * SRCU!!  the groups list is very very much read only and the path is
-	 * very hot.  The VAST majority of events are not going to need to do
-	 * anything other than walk the list so it's crazy to pre-allocate.
+	 * if this is a modify event we may need to clear the ignored masks
+	 * otherwise return if neither the inode nor the vfsmount care about
+	 * this type of event.
 	 */
-	idx = srcu_read_lock(&fsnotify_grp_srcu);
-	list_for_each_entry_rcu(group, &fsnotify_groups, group_list) {
-		if (test_mask & group->mask) {
-			if (!group->ops->should_send_event(group, to_tell, mask))
-				continue;
-			if (!event) {
-				event = fsnotify_create_event(to_tell, mask, data,
-							      data_is, file_name, cookie,
-							      GFP_KERNEL);
-				/* shit, we OOM'd and now we can't tell, maybe
-				 * someday someone else will want to do something
-				 * here */
-				if (!event)
-					break;
-			}
-			group->ops->handle_event(group, event);
+	if (!(mask & FS_MODIFY) &&
+	    !(test_mask & to_tell->i_fsnotify_mask) &&
+	    !(mnt && test_mask & mnt->mnt_fsnotify_mask))
+		return 0;
+
+	idx = srcu_read_lock(&fsnotify_mark_srcu);
+
+	if ((mask & FS_MODIFY) ||
+	    (test_mask & to_tell->i_fsnotify_mask))
+		inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first,
+					      &fsnotify_mark_srcu);
+
+	if (mnt && ((mask & FS_MODIFY) ||
+		    (test_mask & mnt->mnt_fsnotify_mask))) {
+		vfsmount_node = srcu_dereference(mnt->mnt_fsnotify_marks.first,
+						 &fsnotify_mark_srcu);
+		inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first,
+					      &fsnotify_mark_srcu);
+	}
+
+	while (inode_node || vfsmount_node) {
+		inode_group = vfsmount_group = NULL;
+
+		if (inode_node) {
+			inode_mark = hlist_entry(srcu_dereference(inode_node, &fsnotify_mark_srcu),
+						 struct fsnotify_mark, i.i_list);
+			inode_group = inode_mark->group;
+		}
+
+		if (vfsmount_node) {
+			vfsmount_mark = hlist_entry(srcu_dereference(vfsmount_node, &fsnotify_mark_srcu),
+							struct fsnotify_mark, m.m_list);
+			vfsmount_group = vfsmount_mark->group;
+		}
+
+		if (inode_group > vfsmount_group) {
+			/* handle inode */
+			ret = send_to_group(to_tell, NULL, inode_mark, NULL, mask, data,
+					    data_is, cookie, file_name, &event);
+			/* we didn't use the vfsmount_mark */
+			vfsmount_group = NULL;
+		} else if (vfsmount_group > inode_group) {
+			ret = send_to_group(to_tell, mnt, NULL, vfsmount_mark, mask, data,
+					    data_is, cookie, file_name, &event);
+			inode_group = NULL;
+		} else {
+			ret = send_to_group(to_tell, mnt, inode_mark, vfsmount_mark,
+					    mask, data, data_is, cookie, file_name,
+					    &event);
 		}
+
+		if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS))
+			goto out;
+
+		if (inode_group)
+			inode_node = srcu_dereference(inode_node->next,
+						      &fsnotify_mark_srcu);
+		if (vfsmount_group)
+			vfsmount_node = srcu_dereference(vfsmount_node->next,
+							 &fsnotify_mark_srcu);
 	}
-	srcu_read_unlock(&fsnotify_grp_srcu, idx);
+	ret = 0;
+out:
+	srcu_read_unlock(&fsnotify_mark_srcu, idx);
 	/*
 	 * fsnotify_create_event() took a reference so the event can't be cleaned
 	 * up while we are still trying to add it to lists, drop that one.
 	 */
 	if (event)
 		fsnotify_put_event(event);
+
+	return ret;
 }
 EXPORT_SYMBOL_GPL(fsnotify);
 
 static __init int fsnotify_init(void)
 {
-	return init_srcu_struct(&fsnotify_grp_srcu);
+	int ret;
+
+	BUG_ON(hweight32(ALL_FSNOTIFY_EVENTS) != 23);
+
+	ret = init_srcu_struct(&fsnotify_mark_srcu);
+	if (ret)
+		panic("initializing fsnotify_mark_srcu");
+
+	return 0;
 }
-subsys_initcall(fsnotify_init);
+core_initcall(fsnotify_init);
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index 4dc240824b2..85e7d2b431d 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -6,21 +6,34 @@
 #include <linux/srcu.h>
 #include <linux/types.h>
 
-/* protects reads of fsnotify_groups */
-extern struct srcu_struct fsnotify_grp_srcu;
-/* all groups which receive fsnotify events */
-extern struct list_head fsnotify_groups;
-/* all bitwise OR of all event types (FS_*) for all fsnotify_groups */
-extern __u32 fsnotify_mask;
-
 /* destroy all events sitting in this groups notification queue */
 extern void fsnotify_flush_notify(struct fsnotify_group *group);
 
+/* protects reads of inode and vfsmount marks list */
+extern struct srcu_struct fsnotify_mark_srcu;
+
+extern void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *fsn_mark,
+						__u32 mask);
+/* add a mark to an inode */
+extern int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
+				   struct fsnotify_group *group, struct inode *inode,
+				   int allow_dups);
+/* add a mark to a vfsmount */
+extern int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
+				      struct fsnotify_group *group, struct vfsmount *mnt,
+				      int allow_dups);
+
 /* final kfree of a group */
 extern void fsnotify_final_destroy_group(struct fsnotify_group *group);
 
+/* vfsmount specific destruction of a mark */
+extern void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark);
+/* inode specific destruction of a mark */
+extern void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark);
 /* run the list of all marks associated with inode and flag them to be freed */
 extern void fsnotify_clear_marks_by_inode(struct inode *inode);
+/* run the list of all marks associated with vfsmount and flag them to be freed */
+extern void fsnotify_clear_marks_by_mount(struct vfsmount *mnt);
 /*
  * update the dentry->d_flags of all of inode's children to indicate if inode cares
  * about events that happen to its children.
diff --git a/fs/notify/group.c b/fs/notify/group.c
index 0e1677144bc..d309f38449c 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -28,64 +28,6 @@
 
 #include <asm/atomic.h>
 
-/* protects writes to fsnotify_groups and fsnotify_mask */
-static DEFINE_MUTEX(fsnotify_grp_mutex);
-/* protects reads while running the fsnotify_groups list */
-struct srcu_struct fsnotify_grp_srcu;
-/* all groups registered to receive filesystem notifications */
-LIST_HEAD(fsnotify_groups);
-/* bitwise OR of all events (FS_*) interesting to some group on this system */
-__u32 fsnotify_mask;
-
-/*
- * When a new group registers or changes it's set of interesting events
- * this function updates the fsnotify_mask to contain all interesting events
- */
-void fsnotify_recalc_global_mask(void)
-{
-	struct fsnotify_group *group;
-	__u32 mask = 0;
-	int idx;
-
-	idx = srcu_read_lock(&fsnotify_grp_srcu);
-	list_for_each_entry_rcu(group, &fsnotify_groups, group_list)
-		mask |= group->mask;
-	srcu_read_unlock(&fsnotify_grp_srcu, idx);
-	fsnotify_mask = mask;
-}
-
-/*
- * Update the group->mask by running all of the marks associated with this
- * group and finding the bitwise | of all of the mark->mask.  If we change
- * the group->mask we need to update the global mask of events interesting
- * to the system.
- */
-void fsnotify_recalc_group_mask(struct fsnotify_group *group)
-{
-	__u32 mask = 0;
-	__u32 old_mask = group->mask;
-	struct fsnotify_mark_entry *entry;
-
-	spin_lock(&group->mark_lock);
-	list_for_each_entry(entry, &group->mark_entries, g_list)
-		mask |= entry->mask;
-	spin_unlock(&group->mark_lock);
-
-	group->mask = mask;
-
-	if (old_mask != mask)
-		fsnotify_recalc_global_mask();
-}
-
-/*
- * Take a reference to a group so things found under the fsnotify_grp_mutex
- * can't get freed under us
- */
-static void fsnotify_get_group(struct fsnotify_group *group)
-{
-	atomic_inc(&group->refcnt);
-}
-
 /*
  * Final freeing of a group
  */
@@ -110,145 +52,53 @@ void fsnotify_final_destroy_group(struct fsnotify_group *group)
  */
 static void fsnotify_destroy_group(struct fsnotify_group *group)
 {
-	/* clear all inode mark entries for this group */
+	/* clear all inode marks for this group */
 	fsnotify_clear_marks_by_group(group);
 
+	synchronize_srcu(&fsnotify_mark_srcu);
+
 	/* past the point of no return, matches the initial value of 1 */
 	if (atomic_dec_and_test(&group->num_marks))
 		fsnotify_final_destroy_group(group);
 }
 
 /*
- * Remove this group from the global list of groups that will get events
- * this can be done even if there are still references and things still using
- * this group.  This just stops the group from getting new events.
- */
-static void __fsnotify_evict_group(struct fsnotify_group *group)
-{
-	BUG_ON(!mutex_is_locked(&fsnotify_grp_mutex));
-
-	if (group->on_group_list)
-		list_del_rcu(&group->group_list);
-	group->on_group_list = 0;
-}
-
-/*
- * Called when a group is no longer interested in getting events.  This can be
- * used if a group is misbehaving or if for some reason a group should no longer
- * get any filesystem events.
- */
-void fsnotify_evict_group(struct fsnotify_group *group)
-{
-	mutex_lock(&fsnotify_grp_mutex);
-	__fsnotify_evict_group(group);
-	mutex_unlock(&fsnotify_grp_mutex);
-}
-
-/*
  * Drop a reference to a group.  Free it if it's through.
  */
 void fsnotify_put_group(struct fsnotify_group *group)
 {
-	if (!atomic_dec_and_mutex_lock(&group->refcnt, &fsnotify_grp_mutex))
-		return;
-
-	/*
-	 * OK, now we know that there's no other users *and* we hold mutex,
-	 * so no new references will appear
-	 */
-	__fsnotify_evict_group(group);
-
-	/*
-	 * now it's off the list, so the only thing we might care about is
-	 * srcu access....
-	 */
-	mutex_unlock(&fsnotify_grp_mutex);
-	synchronize_srcu(&fsnotify_grp_srcu);
-
-	/* and now it is really dead. _Nothing_ could be seeing it */
-	fsnotify_recalc_global_mask();
-	fsnotify_destroy_group(group);
-}
-
-/*
- * Simply run the fsnotify_groups list and find a group which matches
- * the given parameters.  If a group is found we take a reference to that
- * group.
- */
-static struct fsnotify_group *fsnotify_find_group(unsigned int group_num, __u32 mask,
-						  const struct fsnotify_ops *ops)
-{
-	struct fsnotify_group *group_iter;
-	struct fsnotify_group *group = NULL;
-
-	BUG_ON(!mutex_is_locked(&fsnotify_grp_mutex));
-
-	list_for_each_entry_rcu(group_iter, &fsnotify_groups, group_list) {
-		if (group_iter->group_num == group_num) {
-			if ((group_iter->mask == mask) &&
-			    (group_iter->ops == ops)) {
-				fsnotify_get_group(group_iter);
-				group = group_iter;
-			} else
-				group = ERR_PTR(-EEXIST);
-		}
-	}
-	return group;
+	if (atomic_dec_and_test(&group->refcnt))
+		fsnotify_destroy_group(group);
 }
 
 /*
- * Either finds an existing group which matches the group_num, mask, and ops or
- * creates a new group and adds it to the global group list.  In either case we
- * take a reference for the group returned.
+ * Create a new fsnotify_group and hold a reference for the group returned.
  */
-struct fsnotify_group *fsnotify_obtain_group(unsigned int group_num, __u32 mask,
-					     const struct fsnotify_ops *ops)
+struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops)
 {
-	struct fsnotify_group *group, *tgroup;
+	struct fsnotify_group *group;
 
-	/* very low use, simpler locking if we just always alloc */
-	group = kmalloc(sizeof(struct fsnotify_group), GFP_KERNEL);
+	group = kzalloc(sizeof(struct fsnotify_group), GFP_KERNEL);
 	if (!group)
 		return ERR_PTR(-ENOMEM);
 
+	/* set to 0 when there a no external references to this group */
 	atomic_set(&group->refcnt, 1);
-
-	group->on_group_list = 0;
-	group->group_num = group_num;
-	group->mask = mask;
+	/*
+	 * hits 0 when there are no external references AND no marks for
+	 * this group
+	 */
+	atomic_set(&group->num_marks, 1);
 
 	mutex_init(&group->notification_mutex);
 	INIT_LIST_HEAD(&group->notification_list);
 	init_waitqueue_head(&group->notification_waitq);
-	group->q_len = 0;
 	group->max_events = UINT_MAX;
 
 	spin_lock_init(&group->mark_lock);
-	atomic_set(&group->num_marks, 0);
-	INIT_LIST_HEAD(&group->mark_entries);
+	INIT_LIST_HEAD(&group->marks_list);
 
 	group->ops = ops;
 
-	mutex_lock(&fsnotify_grp_mutex);
-	tgroup = fsnotify_find_group(group_num, mask, ops);
-	if (tgroup) {
-		/* group already exists */
-		mutex_unlock(&fsnotify_grp_mutex);
-		/* destroy the new one we made */
-		fsnotify_put_group(group);
-		return tgroup;
-	}
-
-	/* group not found, add a new one */
-	list_add_rcu(&group->group_list, &fsnotify_groups);
-	group->on_group_list = 1;
-	/* being on the fsnotify_groups list holds one num_marks */
-	atomic_inc(&group->num_marks);
-
-	mutex_unlock(&fsnotify_grp_mutex);
-
-	if (mask)
-		fsnotify_recalc_global_mask();
-
 	return group;
 }
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 0399bcbe09c..4c29fcf557d 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -16,72 +16,6 @@
  *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
-/*
- * fsnotify inode mark locking/lifetime/and refcnting
- *
- * REFCNT:
- * The mark->refcnt tells how many "things" in the kernel currently are
- * referencing this object.  The object typically will live inside the kernel
- * with a refcnt of 2, one for each list it is on (i_list, g_list).  Any task
- * which can find this object holding the appropriete locks, can take a reference
- * and the object itself is guarenteed to survive until the reference is dropped.
- *
- * LOCKING:
- * There are 3 spinlocks involved with fsnotify inode marks and they MUST
- * be taken in order as follows:
- *
- * entry->lock
- * group->mark_lock
- * inode->i_lock
- *
- * entry->lock protects 2 things, entry->group and entry->inode.  You must hold
- * that lock to dereference either of these things (they could be NULL even with
- * the lock)
- *
- * group->mark_lock protects the mark_entries list anchored inside a given group
- * and each entry is hooked via the g_list.  It also sorta protects the
- * free_g_list, which when used is anchored by a private list on the stack of the
- * task which held the group->mark_lock.
- *
- * inode->i_lock protects the i_fsnotify_mark_entries list anchored inside a
- * given inode and each entry is hooked via the i_list. (and sorta the
- * free_i_list)
- *
- *
- * LIFETIME:
- * Inode marks survive between when they are added to an inode and when their
- * refcnt==0.
- *
- * The inode mark can be cleared for a number of different reasons including:
- * - The inode is unlinked for the last time.  (fsnotify_inode_remove)
- * - The inode is being evicted from cache. (fsnotify_inode_delete)
- * - The fs the inode is on is unmounted.  (fsnotify_inode_delete/fsnotify_unmount_inodes)
- * - Something explicitly requests that it be removed.  (fsnotify_destroy_mark_by_entry)
- * - The fsnotify_group associated with the mark is going away and all such marks
- *   need to be cleaned up. (fsnotify_clear_marks_by_group)
- *
- * Worst case we are given an inode and need to clean up all the marks on that
- * inode.  We take i_lock and walk the i_fsnotify_mark_entries safely.  For each
- * mark on the list we take a reference (so the mark can't disappear under us).
- * We remove that mark form the inode's list of marks and we add this mark to a
- * private list anchored on the stack using i_free_list;  At this point we no
- * longer fear anything finding the mark using the inode's list of marks.
- *
- * We can safely and locklessly run the private list on the stack of everything
- * we just unattached from the original inode.  For each mark on the private list
- * we grab the mark-> and can thus dereference mark->group and mark->inode.  If
- * we see the group and inode are not NULL we take those locks.  Now holding all
- * 3 locks we can completely remove the mark from other tasks finding it in the
- * future.  Remember, 10 things might already be referencing this mark, but they
- * better be holding a ref.  We drop our reference we took before we unhooked it
- * from the inode.  When the ref hits 0 we can free the mark.
- *
- * Very similarly for freeing by group, except we use free_g_list.
- *
- * This has the very interesting property of being able to run concurrently with
- * any (or all) other directions.
- */
-
 #include <linux/fs.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
@@ -95,30 +29,19 @@
 #include <linux/fsnotify_backend.h>
 #include "fsnotify.h"
 
-void fsnotify_get_mark(struct fsnotify_mark_entry *entry)
-{
-	atomic_inc(&entry->refcnt);
-}
-
-void fsnotify_put_mark(struct fsnotify_mark_entry *entry)
-{
-	if (atomic_dec_and_test(&entry->refcnt))
-		entry->free_mark(entry);
-}
-
 /*
  * Recalculate the mask of events relevant to a given inode locked.
  */
 static void fsnotify_recalc_inode_mask_locked(struct inode *inode)
 {
-	struct fsnotify_mark_entry *entry;
+	struct fsnotify_mark *mark;
 	struct hlist_node *pos;
 	__u32 new_mask = 0;
 
 	assert_spin_locked(&inode->i_lock);
 
-	hlist_for_each_entry(entry, pos, &inode->i_fsnotify_mark_entries, i_list)
-		new_mask |= entry->mask;
+	hlist_for_each_entry(mark, pos, &inode->i_fsnotify_marks, i.i_list)
+		new_mask |= mark->mask;
 	inode->i_fsnotify_mask = new_mask;
 }
 
@@ -135,107 +58,26 @@ void fsnotify_recalc_inode_mask(struct inode *inode)
 	__fsnotify_update_child_dentry_flags(inode);
 }
 
-/*
- * Any time a mark is getting freed we end up here.
- * The caller had better be holding a reference to this mark so we don't actually
- * do the final put under the entry->lock
- */
-void fsnotify_destroy_mark_by_entry(struct fsnotify_mark_entry *entry)
+void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark)
 {
-	struct fsnotify_group *group;
-	struct inode *inode;
+	struct inode *inode = mark->i.inode;
 
-	spin_lock(&entry->lock);
+	assert_spin_locked(&mark->lock);
+	assert_spin_locked(&mark->group->mark_lock);
 
-	group = entry->group;
-	inode = entry->inode;
-
-	BUG_ON(group && !inode);
-	BUG_ON(!group && inode);
-
-	/* if !group something else already marked this to die */
-	if (!group) {
-		spin_unlock(&entry->lock);
-		return;
-	}
-
-	/* 1 from caller and 1 for being on i_list/g_list */
-	BUG_ON(atomic_read(&entry->refcnt) < 2);
-
-	spin_lock(&group->mark_lock);
 	spin_lock(&inode->i_lock);
 
-	hlist_del_init(&entry->i_list);
-	entry->inode = NULL;
-
-	list_del_init(&entry->g_list);
-	entry->group = NULL;
-
-	fsnotify_put_mark(entry); /* for i_list and g_list */
+	hlist_del_init_rcu(&mark->i.i_list);
+	mark->i.inode = NULL;
 
 	/*
-	 * this mark is now off the inode->i_fsnotify_mark_entries list and we
+	 * this mark is now off the inode->i_fsnotify_marks list and we
 	 * hold the inode->i_lock, so this is the perfect time to update the
 	 * inode->i_fsnotify_mask
 	 */
 	fsnotify_recalc_inode_mask_locked(inode);
 
 	spin_unlock(&inode->i_lock);
-	spin_unlock(&group->mark_lock);
-	spin_unlock(&entry->lock);
-
-	/*
-	 * Some groups like to know that marks are being freed.  This is a
-	 * callback to the group function to let it know that this entry
-	 * is being freed.
-	 */
-	if (group->ops->freeing_mark)
-		group->ops->freeing_mark(entry, group);
-
-	/*
-	 * __fsnotify_update_child_dentry_flags(inode);
-	 *
-	 * I really want to call that, but we can't, we have no idea if the inode
-	 * still exists the second we drop the entry->lock.
-	 *
-	 * The next time an event arrive to this inode from one of it's children
-	 * __fsnotify_parent will see that the inode doesn't care about it's
-	 * children and will update all of these flags then.  So really this
-	 * is just a lazy update (and could be a perf win...)
-	 */
-
-
-	iput(inode);
-
-	/*
-	 * it's possible that this group tried to destroy itself, but this
-	 * this mark was simultaneously being freed by inode.  If that's the
-	 * case, we finish freeing the group here.
-	 */
-	if (unlikely(atomic_dec_and_test(&group->num_marks)))
-		fsnotify_final_destroy_group(group);
-}
-
-/*
- * Given a group, destroy all of the marks associated with that group.
- */
-void fsnotify_clear_marks_by_group(struct fsnotify_group *group)
-{
-	struct fsnotify_mark_entry *lentry, *entry;
-	LIST_HEAD(free_list);
-
-	spin_lock(&group->mark_lock);
-	list_for_each_entry_safe(entry, lentry, &group->mark_entries, g_list) {
-		list_add(&entry->free_g_list, &free_list);
-		list_del_init(&entry->g_list);
-		fsnotify_get_mark(entry);
-	}
-	spin_unlock(&group->mark_lock);
-
-	list_for_each_entry_safe(entry, lentry, &free_list, free_g_list) {
-		fsnotify_destroy_mark_by_entry(entry);
-		fsnotify_put_mark(entry);
-	}
 }
 
 /*
@@ -243,113 +85,151 @@ void fsnotify_clear_marks_by_group(struct fsnotify_group *group)
  */
 void fsnotify_clear_marks_by_inode(struct inode *inode)
 {
-	struct fsnotify_mark_entry *entry, *lentry;
+	struct fsnotify_mark *mark, *lmark;
 	struct hlist_node *pos, *n;
 	LIST_HEAD(free_list);
 
 	spin_lock(&inode->i_lock);
-	hlist_for_each_entry_safe(entry, pos, n, &inode->i_fsnotify_mark_entries, i_list) {
-		list_add(&entry->free_i_list, &free_list);
-		hlist_del_init(&entry->i_list);
-		fsnotify_get_mark(entry);
+	hlist_for_each_entry_safe(mark, pos, n, &inode->i_fsnotify_marks, i.i_list) {
+		list_add(&mark->i.free_i_list, &free_list);
+		hlist_del_init_rcu(&mark->i.i_list);
+		fsnotify_get_mark(mark);
 	}
 	spin_unlock(&inode->i_lock);
 
-	list_for_each_entry_safe(entry, lentry, &free_list, free_i_list) {
-		fsnotify_destroy_mark_by_entry(entry);
-		fsnotify_put_mark(entry);
+	list_for_each_entry_safe(mark, lmark, &free_list, i.free_i_list) {
+		fsnotify_destroy_mark(mark);
+		fsnotify_put_mark(mark);
 	}
 }
 
 /*
+ * Given a group clear all of the inode marks associated with that group.
+ */
+void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group)
+{
+	fsnotify_clear_marks_by_group_flags(group, FSNOTIFY_MARK_FLAG_INODE);
+}
+
+/*
  * given a group and inode, find the mark associated with that combination.
  * if found take a reference to that mark and return it, else return NULL
  */
-struct fsnotify_mark_entry *fsnotify_find_mark_entry(struct fsnotify_group *group,
-						     struct inode *inode)
+struct fsnotify_mark *fsnotify_find_inode_mark_locked(struct fsnotify_group *group,
+						      struct inode *inode)
 {
-	struct fsnotify_mark_entry *entry;
+	struct fsnotify_mark *mark;
 	struct hlist_node *pos;
 
 	assert_spin_locked(&inode->i_lock);
 
-	hlist_for_each_entry(entry, pos, &inode->i_fsnotify_mark_entries, i_list) {
-		if (entry->group == group) {
-			fsnotify_get_mark(entry);
-			return entry;
+	hlist_for_each_entry(mark, pos, &inode->i_fsnotify_marks, i.i_list) {
+		if (mark->group == group) {
+			fsnotify_get_mark(mark);
+			return mark;
 		}
 	}
 	return NULL;
 }
 
 /*
- * Nothing fancy, just initialize lists and locks and counters.
+ * given a group and inode, find the mark associated with that combination.
+ * if found take a reference to that mark and return it, else return NULL
  */
-void fsnotify_init_mark(struct fsnotify_mark_entry *entry,
-			void (*free_mark)(struct fsnotify_mark_entry *entry))
+struct fsnotify_mark *fsnotify_find_inode_mark(struct fsnotify_group *group,
+					       struct inode *inode)
+{
+	struct fsnotify_mark *mark;
+
+	spin_lock(&inode->i_lock);
+	mark = fsnotify_find_inode_mark_locked(group, inode);
+	spin_unlock(&inode->i_lock);
+
+	return mark;
+}
 
+/*
+ * If we are setting a mark mask on an inode mark we should pin the inode
+ * in memory.
+ */
+void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *mark,
+					 __u32 mask)
 {
-	spin_lock_init(&entry->lock);
-	atomic_set(&entry->refcnt, 1);
-	INIT_HLIST_NODE(&entry->i_list);
-	entry->group = NULL;
-	entry->mask = 0;
-	entry->inode = NULL;
-	entry->free_mark = free_mark;
+	struct inode *inode;
+
+	assert_spin_locked(&mark->lock);
+
+	if (mask &&
+	    mark->i.inode &&
+	    !(mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED)) {
+		mark->flags |= FSNOTIFY_MARK_FLAG_OBJECT_PINNED;
+		inode = igrab(mark->i.inode);
+		/*
+		 * we shouldn't be able to get here if the inode wasn't
+		 * already safely held in memory.  But bug in case it
+		 * ever is wrong.
+		 */
+		BUG_ON(!inode);
+	}
 }
 
 /*
- * Attach an initialized mark entry to a given group and inode.
+ * Attach an initialized mark to a given inode.
  * These marks may be used for the fsnotify backend to determine which
- * event types should be delivered to which group and for which inodes.
+ * event types should be delivered to which group and for which inodes.  These
+ * marks are ordered according to priority, highest number first, and then by
+ * the group's location in memory.
  */
-int fsnotify_add_mark(struct fsnotify_mark_entry *entry,
-		      struct fsnotify_group *group, struct inode *inode)
+int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
+			    struct fsnotify_group *group, struct inode *inode,
+			    int allow_dups)
 {
-	struct fsnotify_mark_entry *lentry;
+	struct fsnotify_mark *lmark;
+	struct hlist_node *node, *last = NULL;
 	int ret = 0;
 
-	inode = igrab(inode);
-	if (unlikely(!inode))
-		return -EINVAL;
+	mark->flags |= FSNOTIFY_MARK_FLAG_INODE;
+
+	assert_spin_locked(&mark->lock);
+	assert_spin_locked(&group->mark_lock);
 
-	/*
-	 * LOCKING ORDER!!!!
-	 * entry->lock
-	 * group->mark_lock
-	 * inode->i_lock
-	 */
-	spin_lock(&entry->lock);
-	spin_lock(&group->mark_lock);
 	spin_lock(&inode->i_lock);
 
-	lentry = fsnotify_find_mark_entry(group, inode);
-	if (!lentry) {
-		entry->group = group;
-		entry->inode = inode;
+	mark->i.inode = inode;
 
-		hlist_add_head(&entry->i_list, &inode->i_fsnotify_mark_entries);
-		list_add(&entry->g_list, &group->mark_entries);
+	/* is mark the first mark? */
+	if (hlist_empty(&inode->i_fsnotify_marks)) {
+		hlist_add_head_rcu(&mark->i.i_list, &inode->i_fsnotify_marks);
+		goto out;
+	}
 
-		fsnotify_get_mark(entry); /* for i_list and g_list */
+	/* should mark be in the middle of the current list? */
+	hlist_for_each_entry(lmark, node, &inode->i_fsnotify_marks, i.i_list) {
+		last = node;
 
-		atomic_inc(&group->num_marks);
+		if ((lmark->group == group) && !allow_dups) {
+			ret = -EEXIST;
+			goto out;
+		}
 
-		fsnotify_recalc_inode_mask_locked(inode);
-	}
+		if (mark->group->priority < lmark->group->priority)
+			continue;
 
-	spin_unlock(&inode->i_lock);
-	spin_unlock(&group->mark_lock);
-	spin_unlock(&entry->lock);
+		if ((mark->group->priority == lmark->group->priority) &&
+		    (mark->group < lmark->group))
+			continue;
 
-	if (lentry) {
-		ret = -EEXIST;
-		iput(inode);
-		fsnotify_put_mark(lentry);
-	} else {
-		__fsnotify_update_child_dentry_flags(inode);
+		hlist_add_before_rcu(&mark->i.i_list, &lmark->i.i_list);
+		goto out;
 	}
 
+	BUG_ON(last == NULL);
+	/* mark should be the last entry.  last is the current last entry */
+	hlist_add_after_rcu(last, &mark->i.i_list);
+out:
+	fsnotify_recalc_inode_mask_locked(inode);
+	spin_unlock(&inode->i_lock);
+
 	return ret;
 }
 
@@ -365,15 +245,16 @@ void fsnotify_unmount_inodes(struct list_head *list)
 {
 	struct inode *inode, *next_i, *need_iput = NULL;
 
+	spin_lock(&inode_lock);
 	list_for_each_entry_safe(inode, next_i, list, i_sb_list) {
 		struct inode *need_iput_tmp;
 
 		/*
-		 * We cannot __iget() an inode in state I_CLEAR, I_FREEING,
+		 * We cannot __iget() an inode in state I_FREEING,
 		 * I_WILL_FREE, or I_NEW which is fine because by that point
 		 * the inode cannot have any associated watches.
 		 */
-		if (inode->i_state & (I_CLEAR|I_FREEING|I_WILL_FREE|I_NEW))
+		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW))
 			continue;
 
 		/*
@@ -397,7 +278,7 @@ void fsnotify_unmount_inodes(struct list_head *list)
 		/* In case the dropping of a reference would nuke next_i. */
 		if ((&next_i->i_sb_list != list) &&
 		    atomic_read(&next_i->i_count) &&
-		    !(next_i->i_state & (I_CLEAR | I_FREEING | I_WILL_FREE))) {
+		    !(next_i->i_state & (I_FREEING | I_WILL_FREE))) {
 			__iget(next_i);
 			need_iput = next_i;
 		}
@@ -422,4 +303,5 @@ void fsnotify_unmount_inodes(struct list_head *list)
 
 		spin_lock(&inode_lock);
 	}
+	spin_unlock(&inode_lock);
 }
diff --git a/fs/notify/inotify/Kconfig b/fs/notify/inotify/Kconfig
index b3a159b21cf..b981fc0c837 100644
--- a/fs/notify/inotify/Kconfig
+++ b/fs/notify/inotify/Kconfig
@@ -1,18 +1,3 @@
-config INOTIFY
-	bool "Inotify file change notification support"
-	default n
-	---help---
-	  Say Y here to enable legacy in kernel inotify support.  Inotify is a
-	  file change notification system.  It is a replacement for dnotify.
-	  This option only provides the legacy inotify in kernel API.  There
-	  are no in tree kernel users of this interface since it is deprecated.
-	  You only need this if you are loading an out of tree kernel module
-	  that uses inotify.
-
-	  For more information, see <file:Documentation/filesystems/inotify.txt>
-
-	  If unsure, say N.
-
 config INOTIFY_USER
 	bool "Inotify support for userspace"
 	select ANON_INODES
diff --git a/fs/notify/inotify/Makefile b/fs/notify/inotify/Makefile
index 94382817136..a380dabe09d 100644
--- a/fs/notify/inotify/Makefile
+++ b/fs/notify/inotify/Makefile
@@ -1,2 +1 @@
-obj-$(CONFIG_INOTIFY)		+= inotify.o
 obj-$(CONFIG_INOTIFY_USER)	+= inotify_fsnotify.o inotify_user.o
diff --git a/fs/notify/inotify/inotify.c b/fs/notify/inotify/inotify.c
deleted file mode 100644
index 27b75ebc746..00000000000
--- a/fs/notify/inotify/inotify.c
+++ /dev/null
@@ -1,873 +0,0 @@
-/*
- * fs/inotify.c - inode-based file event notifications
- *
- * Authors:
- *	John McCutchan	<ttb@tentacle.dhs.org>
- *	Robert Love	<rml@novell.com>
- *
- * Kernel API added by: Amy Griffis <amy.griffis@hp.com>
- *
- * Copyright (C) 2005 John McCutchan
- * Copyright 2006 Hewlett-Packard Development Company, L.P.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the
- * Free Software Foundation; either version 2, or (at your option) any
- * later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/spinlock.h>
-#include <linux/idr.h>
-#include <linux/slab.h>
-#include <linux/fs.h>
-#include <linux/sched.h>
-#include <linux/init.h>
-#include <linux/list.h>
-#include <linux/writeback.h>
-#include <linux/inotify.h>
-#include <linux/fsnotify_backend.h>
-
-static atomic_t inotify_cookie;
-
-/*
- * Lock ordering:
- *
- * dentry->d_lock (used to keep d_move() away from dentry->d_parent)
- * iprune_mutex (synchronize shrink_icache_memory())
- * 	inode_lock (protects the super_block->s_inodes list)
- * 	inode->inotify_mutex (protects inode->inotify_watches and watches->i_list)
- * 		inotify_handle->mutex (protects inotify_handle and watches->h_list)
- *
- * The inode->inotify_mutex and inotify_handle->mutex and held during execution
- * of a caller's event handler.  Thus, the caller must not hold any locks
- * taken in their event handler while calling any of the published inotify
- * interfaces.
- */
-
-/*
- * Lifetimes of the three main data structures--inotify_handle, inode, and
- * inotify_watch--are managed by reference count.
- *
- * inotify_handle: Lifetime is from inotify_init() to inotify_destroy().
- * Additional references can bump the count via get_inotify_handle() and drop
- * the count via put_inotify_handle().
- *
- * inotify_watch: for inotify's purposes, lifetime is from inotify_add_watch()
- * to remove_watch_no_event().  Additional references can bump the count via
- * get_inotify_watch() and drop the count via put_inotify_watch().  The caller
- * is reponsible for the final put after receiving IN_IGNORED, or when using
- * IN_ONESHOT after receiving the first event.  Inotify does the final put if
- * inotify_destroy() is called.
- *
- * inode: Pinned so long as the inode is associated with a watch, from
- * inotify_add_watch() to the final put_inotify_watch().
- */
-
-/*
- * struct inotify_handle - represents an inotify instance
- *
- * This structure is protected by the mutex 'mutex'.
- */
-struct inotify_handle {
-	struct idr		idr;		/* idr mapping wd -> watch */
-	struct mutex		mutex;		/* protects this bad boy */
-	struct list_head	watches;	/* list of watches */
-	atomic_t		count;		/* reference count */
-	u32			last_wd;	/* the last wd allocated */
-	const struct inotify_operations *in_ops; /* inotify caller operations */
-};
-
-static inline void get_inotify_handle(struct inotify_handle *ih)
-{
-	atomic_inc(&ih->count);
-}
-
-static inline void put_inotify_handle(struct inotify_handle *ih)
-{
-	if (atomic_dec_and_test(&ih->count)) {
-		idr_destroy(&ih->idr);
-		kfree(ih);
-	}
-}
-
-/**
- * get_inotify_watch - grab a reference to an inotify_watch
- * @watch: watch to grab
- */
-void get_inotify_watch(struct inotify_watch *watch)
-{
-	atomic_inc(&watch->count);
-}
-EXPORT_SYMBOL_GPL(get_inotify_watch);
-
-int pin_inotify_watch(struct inotify_watch *watch)
-{
-	struct super_block *sb = watch->inode->i_sb;
-	if (atomic_inc_not_zero(&sb->s_active)) {
-		atomic_inc(&watch->count);
-		return 1;
-	}
-	return 0;
-}
-
-/**
- * put_inotify_watch - decrements the ref count on a given watch.  cleans up
- * watch references if the count reaches zero.  inotify_watch is freed by
- * inotify callers via the destroy_watch() op.
- * @watch: watch to release
- */
-void put_inotify_watch(struct inotify_watch *watch)
-{
-	if (atomic_dec_and_test(&watch->count)) {
-		struct inotify_handle *ih = watch->ih;
-
-		iput(watch->inode);
-		ih->in_ops->destroy_watch(watch);
-		put_inotify_handle(ih);
-	}
-}
-EXPORT_SYMBOL_GPL(put_inotify_watch);
-
-void unpin_inotify_watch(struct inotify_watch *watch)
-{
-	struct super_block *sb = watch->inode->i_sb;
-	put_inotify_watch(watch);
-	deactivate_super(sb);
-}
-
-/*
- * inotify_handle_get_wd - returns the next WD for use by the given handle
- *
- * Callers must hold ih->mutex.  This function can sleep.
- */
-static int inotify_handle_get_wd(struct inotify_handle *ih,
-				 struct inotify_watch *watch)
-{
-	int ret;
-
-	do {
-		if (unlikely(!idr_pre_get(&ih->idr, GFP_NOFS)))
-			return -ENOSPC;
-		ret = idr_get_new_above(&ih->idr, watch, ih->last_wd+1, &watch->wd);
-	} while (ret == -EAGAIN);
-
-	if (likely(!ret))
-		ih->last_wd = watch->wd;
-
-	return ret;
-}
-
-/*
- * inotify_inode_watched - returns nonzero if there are watches on this inode
- * and zero otherwise.  We call this lockless, we do not care if we race.
- */
-static inline int inotify_inode_watched(struct inode *inode)
-{
-	return !list_empty(&inode->inotify_watches);
-}
-
-/*
- * Get child dentry flag into synch with parent inode.
- * Flag should always be clear for negative dentrys.
- */
-static void set_dentry_child_flags(struct inode *inode, int watched)
-{
-	struct dentry *alias;
-
-	spin_lock(&dcache_lock);
-	list_for_each_entry(alias, &inode->i_dentry, d_alias) {
-		struct dentry *child;
-
-		list_for_each_entry(child, &alias->d_subdirs, d_u.d_child) {
-			if (!child->d_inode)
-				continue;
-
-			spin_lock(&child->d_lock);
-			if (watched)
-				child->d_flags |= DCACHE_INOTIFY_PARENT_WATCHED;
-			else
-				child->d_flags &=~DCACHE_INOTIFY_PARENT_WATCHED;
-			spin_unlock(&child->d_lock);
-		}
-	}
-	spin_unlock(&dcache_lock);
-}
-
-/*
- * inotify_find_handle - find the watch associated with the given inode and
- * handle
- *
- * Callers must hold inode->inotify_mutex.
- */
-static struct inotify_watch *inode_find_handle(struct inode *inode,
-					       struct inotify_handle *ih)
-{
-	struct inotify_watch *watch;
-
-	list_for_each_entry(watch, &inode->inotify_watches, i_list) {
-		if (watch->ih == ih)
-			return watch;
-	}
-
-	return NULL;
-}
-
-/*
- * remove_watch_no_event - remove watch without the IN_IGNORED event.
- *
- * Callers must hold both inode->inotify_mutex and ih->mutex.
- */
-static void remove_watch_no_event(struct inotify_watch *watch,
-				  struct inotify_handle *ih)
-{
-	list_del(&watch->i_list);
-	list_del(&watch->h_list);
-
-	if (!inotify_inode_watched(watch->inode))
-		set_dentry_child_flags(watch->inode, 0);
-
-	idr_remove(&ih->idr, watch->wd);
-}
-
-/**
- * inotify_remove_watch_locked - Remove a watch from both the handle and the
- * inode.  Sends the IN_IGNORED event signifying that the inode is no longer
- * watched.  May be invoked from a caller's event handler.
- * @ih: inotify handle associated with watch
- * @watch: watch to remove
- *
- * Callers must hold both inode->inotify_mutex and ih->mutex.
- */
-void inotify_remove_watch_locked(struct inotify_handle *ih,
-				 struct inotify_watch *watch)
-{
-	remove_watch_no_event(watch, ih);
-	ih->in_ops->handle_event(watch, watch->wd, IN_IGNORED, 0, NULL, NULL);
-}
-EXPORT_SYMBOL_GPL(inotify_remove_watch_locked);
-
-/* Kernel API for producing events */
-
-/*
- * inotify_d_instantiate - instantiate dcache entry for inode
- */
-void inotify_d_instantiate(struct dentry *entry, struct inode *inode)
-{
-	struct dentry *parent;
-
-	if (!inode)
-		return;
-
-	spin_lock(&entry->d_lock);
-	parent = entry->d_parent;
-	if (parent->d_inode && inotify_inode_watched(parent->d_inode))
-		entry->d_flags |= DCACHE_INOTIFY_PARENT_WATCHED;
-	spin_unlock(&entry->d_lock);
-}
-
-/*
- * inotify_d_move - dcache entry has been moved
- */
-void inotify_d_move(struct dentry *entry)
-{
-	struct dentry *parent;
-
-	parent = entry->d_parent;
-	if (inotify_inode_watched(parent->d_inode))
-		entry->d_flags |= DCACHE_INOTIFY_PARENT_WATCHED;
-	else
-		entry->d_flags &= ~DCACHE_INOTIFY_PARENT_WATCHED;
-}
-
-/**
- * inotify_inode_queue_event - queue an event to all watches on this inode
- * @inode: inode event is originating from
- * @mask: event mask describing this event
- * @cookie: cookie for synchronization, or zero
- * @name: filename, if any
- * @n_inode: inode associated with name
- */
-void inotify_inode_queue_event(struct inode *inode, u32 mask, u32 cookie,
-			       const char *name, struct inode *n_inode)
-{
-	struct inotify_watch *watch, *next;
-
-	if (!inotify_inode_watched(inode))
-		return;
-
-	mutex_lock(&inode->inotify_mutex);
-	list_for_each_entry_safe(watch, next, &inode->inotify_watches, i_list) {
-		u32 watch_mask = watch->mask;
-		if (watch_mask & mask) {
-			struct inotify_handle *ih= watch->ih;
-			mutex_lock(&ih->mutex);
-			if (watch_mask & IN_ONESHOT)
-				remove_watch_no_event(watch, ih);
-			ih->in_ops->handle_event(watch, watch->wd, mask, cookie,
-						 name, n_inode);
-			mutex_unlock(&ih->mutex);
-		}
-	}
-	mutex_unlock(&inode->inotify_mutex);
-}
-EXPORT_SYMBOL_GPL(inotify_inode_queue_event);
-
-/**
- * inotify_dentry_parent_queue_event - queue an event to a dentry's parent
- * @dentry: the dentry in question, we queue against this dentry's parent
- * @mask: event mask describing this event
- * @cookie: cookie for synchronization, or zero
- * @name: filename, if any
- */
-void inotify_dentry_parent_queue_event(struct dentry *dentry, u32 mask,
-				       u32 cookie, const char *name)
-{
-	struct dentry *parent;
-	struct inode *inode;
-
-	if (!(dentry->d_flags & DCACHE_INOTIFY_PARENT_WATCHED))
-		return;
-
-	spin_lock(&dentry->d_lock);
-	parent = dentry->d_parent;
-	inode = parent->d_inode;
-
-	if (inotify_inode_watched(inode)) {
-		dget(parent);
-		spin_unlock(&dentry->d_lock);
-		inotify_inode_queue_event(inode, mask, cookie, name,
-					  dentry->d_inode);
-		dput(parent);
-	} else
-		spin_unlock(&dentry->d_lock);
-}
-EXPORT_SYMBOL_GPL(inotify_dentry_parent_queue_event);
-
-/**
- * inotify_get_cookie - return a unique cookie for use in synchronizing events.
- */
-u32 inotify_get_cookie(void)
-{
-	return atomic_inc_return(&inotify_cookie);
-}
-EXPORT_SYMBOL_GPL(inotify_get_cookie);
-
-/**
- * inotify_unmount_inodes - an sb is unmounting.  handle any watched inodes.
- * @list: list of inodes being unmounted (sb->s_inodes)
- *
- * Called with inode_lock held, protecting the unmounting super block's list
- * of inodes, and with iprune_mutex held, keeping shrink_icache_memory() at bay.
- * We temporarily drop inode_lock, however, and CAN block.
- */
-void inotify_unmount_inodes(struct list_head *list)
-{
-	struct inode *inode, *next_i, *need_iput = NULL;
-
-	list_for_each_entry_safe(inode, next_i, list, i_sb_list) {
-		struct inotify_watch *watch, *next_w;
-		struct inode *need_iput_tmp;
-		struct list_head *watches;
-
-		/*
-		 * We cannot __iget() an inode in state I_CLEAR, I_FREEING,
-		 * I_WILL_FREE, or I_NEW which is fine because by that point
-		 * the inode cannot have any associated watches.
-		 */
-		if (inode->i_state & (I_CLEAR|I_FREEING|I_WILL_FREE|I_NEW))
-			continue;
-
-		/*
-		 * If i_count is zero, the inode cannot have any watches and
-		 * doing an __iget/iput with MS_ACTIVE clear would actually
-		 * evict all inodes with zero i_count from icache which is
-		 * unnecessarily violent and may in fact be illegal to do.
-		 */
-		if (!atomic_read(&inode->i_count))
-			continue;
-
-		need_iput_tmp = need_iput;
-		need_iput = NULL;
-		/* In case inotify_remove_watch_locked() drops a reference. */
-		if (inode != need_iput_tmp)
-			__iget(inode);
-		else
-			need_iput_tmp = NULL;
-		/* In case the dropping of a reference would nuke next_i. */
-		if ((&next_i->i_sb_list != list) &&
-				atomic_read(&next_i->i_count) &&
-				!(next_i->i_state & (I_CLEAR | I_FREEING |
-					I_WILL_FREE))) {
-			__iget(next_i);
-			need_iput = next_i;
-		}
-
-		/*
-		 * We can safely drop inode_lock here because we hold
-		 * references on both inode and next_i.  Also no new inodes
-		 * will be added since the umount has begun.  Finally,
-		 * iprune_mutex keeps shrink_icache_memory() away.
-		 */
-		spin_unlock(&inode_lock);
-
-		if (need_iput_tmp)
-			iput(need_iput_tmp);
-
-		/* for each watch, send IN_UNMOUNT and then remove it */
-		mutex_lock(&inode->inotify_mutex);
-		watches = &inode->inotify_watches;
-		list_for_each_entry_safe(watch, next_w, watches, i_list) {
-			struct inotify_handle *ih= watch->ih;
-			get_inotify_watch(watch);
-			mutex_lock(&ih->mutex);
-			ih->in_ops->handle_event(watch, watch->wd, IN_UNMOUNT, 0,
-						 NULL, NULL);
-			inotify_remove_watch_locked(ih, watch);
-			mutex_unlock(&ih->mutex);
-			put_inotify_watch(watch);
-		}
-		mutex_unlock(&inode->inotify_mutex);
-		iput(inode);		
-
-		spin_lock(&inode_lock);
-	}
-}
-EXPORT_SYMBOL_GPL(inotify_unmount_inodes);
-
-/**
- * inotify_inode_is_dead - an inode has been deleted, cleanup any watches
- * @inode: inode that is about to be removed
- */
-void inotify_inode_is_dead(struct inode *inode)
-{
-	struct inotify_watch *watch, *next;
-
-	mutex_lock(&inode->inotify_mutex);
-	list_for_each_entry_safe(watch, next, &inode->inotify_watches, i_list) {
-		struct inotify_handle *ih = watch->ih;
-		mutex_lock(&ih->mutex);
-		inotify_remove_watch_locked(ih, watch);
-		mutex_unlock(&ih->mutex);
-	}
-	mutex_unlock(&inode->inotify_mutex);
-}
-EXPORT_SYMBOL_GPL(inotify_inode_is_dead);
-
-/* Kernel Consumer API */
-
-/**
- * inotify_init - allocate and initialize an inotify instance
- * @ops: caller's inotify operations
- */
-struct inotify_handle *inotify_init(const struct inotify_operations *ops)
-{
-	struct inotify_handle *ih;
-
-	ih = kmalloc(sizeof(struct inotify_handle), GFP_KERNEL);
-	if (unlikely(!ih))
-		return ERR_PTR(-ENOMEM);
-
-	idr_init(&ih->idr);
-	INIT_LIST_HEAD(&ih->watches);
-	mutex_init(&ih->mutex);
-	ih->last_wd = 0;
-	ih->in_ops = ops;
-	atomic_set(&ih->count, 0);
-	get_inotify_handle(ih);
-
-	return ih;
-}
-EXPORT_SYMBOL_GPL(inotify_init);
-
-/**
- * inotify_init_watch - initialize an inotify watch
- * @watch: watch to initialize
- */
-void inotify_init_watch(struct inotify_watch *watch)
-{
-	INIT_LIST_HEAD(&watch->h_list);
-	INIT_LIST_HEAD(&watch->i_list);
-	atomic_set(&watch->count, 0);
-	get_inotify_watch(watch); /* initial get */
-}
-EXPORT_SYMBOL_GPL(inotify_init_watch);
-
-/*
- * Watch removals suck violently.  To kick the watch out we need (in this
- * order) inode->inotify_mutex and ih->mutex.  That's fine if we have
- * a hold on inode; however, for all other cases we need to make damn sure
- * we don't race with umount.  We can *NOT* just grab a reference to a
- * watch - inotify_unmount_inodes() will happily sail past it and we'll end
- * with reference to inode potentially outliving its superblock.  Ideally
- * we just want to grab an active reference to superblock if we can; that
- * will make sure we won't go into inotify_umount_inodes() until we are
- * done.  Cleanup is just deactivate_super().  However, that leaves a messy
- * case - what if we *are* racing with umount() and active references to
- * superblock can't be acquired anymore?  We can bump ->s_count, grab
- * ->s_umount, which will wait until the superblock is shut down and the
- * watch in question is pining for fjords.
- *
- * And yes, this is far beyond mere "not very pretty"; so's the entire
- * concept of inotify to start with.
- */
-
-/**
- * pin_to_kill - pin the watch down for removal
- * @ih: inotify handle
- * @watch: watch to kill
- *
- * Called with ih->mutex held, drops it.  Possible return values:
- * 0 - nothing to do, it has died
- * 1 - remove it, drop the reference and deactivate_super()
- */
-static int pin_to_kill(struct inotify_handle *ih, struct inotify_watch *watch)
-{
-	struct super_block *sb = watch->inode->i_sb;
-
-	if (atomic_inc_not_zero(&sb->s_active)) {
-		get_inotify_watch(watch);
-		mutex_unlock(&ih->mutex);
-		return 1;	/* the best outcome */
-	}
-	spin_lock(&sb_lock);
-	sb->s_count++;
-	spin_unlock(&sb_lock);
-	mutex_unlock(&ih->mutex); /* can't grab ->s_umount under it */
-	down_read(&sb->s_umount);
-	/* fs is already shut down; the watch is dead */
-	drop_super(sb);
-	return 0;
-}
-
-static void unpin_and_kill(struct inotify_watch *watch)
-{
-	struct super_block *sb = watch->inode->i_sb;
-	put_inotify_watch(watch);
-	deactivate_super(sb);
-}
-
-/**
- * inotify_destroy - clean up and destroy an inotify instance
- * @ih: inotify handle
- */
-void inotify_destroy(struct inotify_handle *ih)
-{
-	/*
-	 * Destroy all of the watches for this handle. Unfortunately, not very
-	 * pretty.  We cannot do a simple iteration over the list, because we
-	 * do not know the inode until we iterate to the watch.  But we need to
-	 * hold inode->inotify_mutex before ih->mutex.  The following works.
-	 *
-	 * AV: it had to become even uglier to start working ;-/
-	 */
-	while (1) {
-		struct inotify_watch *watch;
-		struct list_head *watches;
-		struct super_block *sb;
-		struct inode *inode;
-
-		mutex_lock(&ih->mutex);
-		watches = &ih->watches;
-		if (list_empty(watches)) {
-			mutex_unlock(&ih->mutex);
-			break;
-		}
-		watch = list_first_entry(watches, struct inotify_watch, h_list);
-		sb = watch->inode->i_sb;
-		if (!pin_to_kill(ih, watch))
-			continue;
-
-		inode = watch->inode;
-		mutex_lock(&inode->inotify_mutex);
-		mutex_lock(&ih->mutex);
-
-		/* make sure we didn't race with another list removal */
-		if (likely(idr_find(&ih->idr, watch->wd))) {
-			remove_watch_no_event(watch, ih);
-			put_inotify_watch(watch);
-		}
-
-		mutex_unlock(&ih->mutex);
-		mutex_unlock(&inode->inotify_mutex);
-		unpin_and_kill(watch);
-	}
-
-	/* free this handle: the put matching the get in inotify_init() */
-	put_inotify_handle(ih);
-}
-EXPORT_SYMBOL_GPL(inotify_destroy);
-
-/**
- * inotify_find_watch - find an existing watch for an (ih,inode) pair
- * @ih: inotify handle
- * @inode: inode to watch
- * @watchp: pointer to existing inotify_watch
- *
- * Caller must pin given inode (via nameidata).
- */
-s32 inotify_find_watch(struct inotify_handle *ih, struct inode *inode,
-		       struct inotify_watch **watchp)
-{
-	struct inotify_watch *old;
-	int ret = -ENOENT;
-
-	mutex_lock(&inode->inotify_mutex);
-	mutex_lock(&ih->mutex);
-
-	old = inode_find_handle(inode, ih);
-	if (unlikely(old)) {
-		get_inotify_watch(old); /* caller must put watch */
-		*watchp = old;
-		ret = old->wd;
-	}
-
-	mutex_unlock(&ih->mutex);
-	mutex_unlock(&inode->inotify_mutex);
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(inotify_find_watch);
-
-/**
- * inotify_find_update_watch - find and update the mask of an existing watch
- * @ih: inotify handle
- * @inode: inode's watch to update
- * @mask: mask of events to watch
- *
- * Caller must pin given inode (via nameidata).
- */
-s32 inotify_find_update_watch(struct inotify_handle *ih, struct inode *inode,
-			      u32 mask)
-{
-	struct inotify_watch *old;
-	int mask_add = 0;
-	int ret;
-
-	if (mask & IN_MASK_ADD)
-		mask_add = 1;
-
-	/* don't allow invalid bits: we don't want flags set */
-	mask &= IN_ALL_EVENTS | IN_ONESHOT;
-	if (unlikely(!mask))
-		return -EINVAL;
-
-	mutex_lock(&inode->inotify_mutex);
-	mutex_lock(&ih->mutex);
-
-	/*
-	 * Handle the case of re-adding a watch on an (inode,ih) pair that we
-	 * are already watching.  We just update the mask and return its wd.
-	 */
-	old = inode_find_handle(inode, ih);
-	if (unlikely(!old)) {
-		ret = -ENOENT;
-		goto out;
-	}
-
-	if (mask_add)
-		old->mask |= mask;
-	else
-		old->mask = mask;
-	ret = old->wd;
-out:
-	mutex_unlock(&ih->mutex);
-	mutex_unlock(&inode->inotify_mutex);
-	return ret;
-}
-EXPORT_SYMBOL_GPL(inotify_find_update_watch);
-
-/**
- * inotify_add_watch - add a watch to an inotify instance
- * @ih: inotify handle
- * @watch: caller allocated watch structure
- * @inode: inode to watch
- * @mask: mask of events to watch
- *
- * Caller must pin given inode (via nameidata).
- * Caller must ensure it only calls inotify_add_watch() once per watch.
- * Calls inotify_handle_get_wd() so may sleep.
- */
-s32 inotify_add_watch(struct inotify_handle *ih, struct inotify_watch *watch,
-		      struct inode *inode, u32 mask)
-{
-	int ret = 0;
-	int newly_watched;
-
-	/* don't allow invalid bits: we don't want flags set */
-	mask &= IN_ALL_EVENTS | IN_ONESHOT;
-	if (unlikely(!mask))
-		return -EINVAL;
-	watch->mask = mask;
-
-	mutex_lock(&inode->inotify_mutex);
-	mutex_lock(&ih->mutex);
-
-	/* Initialize a new watch */
-	ret = inotify_handle_get_wd(ih, watch);
-	if (unlikely(ret))
-		goto out;
-	ret = watch->wd;
-
-	/* save a reference to handle and bump the count to make it official */
-	get_inotify_handle(ih);
-	watch->ih = ih;
-
-	/*
-	 * Save a reference to the inode and bump the ref count to make it
-	 * official.  We hold a reference to nameidata, which makes this safe.
-	 */
-	watch->inode = igrab(inode);
-
-	/* Add the watch to the handle's and the inode's list */
-	newly_watched = !inotify_inode_watched(inode);
-	list_add(&watch->h_list, &ih->watches);
-	list_add(&watch->i_list, &inode->inotify_watches);
-	/*
-	 * Set child flags _after_ adding the watch, so there is no race
-	 * windows where newly instantiated children could miss their parent's
-	 * watched flag.
-	 */
-	if (newly_watched)
-		set_dentry_child_flags(inode, 1);
-
-out:
-	mutex_unlock(&ih->mutex);
-	mutex_unlock(&inode->inotify_mutex);
-	return ret;
-}
-EXPORT_SYMBOL_GPL(inotify_add_watch);
-
-/**
- * inotify_clone_watch - put the watch next to existing one
- * @old: already installed watch
- * @new: new watch
- *
- * Caller must hold the inotify_mutex of inode we are dealing with;
- * it is expected to remove the old watch before unlocking the inode.
- */
-s32 inotify_clone_watch(struct inotify_watch *old, struct inotify_watch *new)
-{
-	struct inotify_handle *ih = old->ih;
-	int ret = 0;
-
-	new->mask = old->mask;
-	new->ih = ih;
-
-	mutex_lock(&ih->mutex);
-
-	/* Initialize a new watch */
-	ret = inotify_handle_get_wd(ih, new);
-	if (unlikely(ret))
-		goto out;
-	ret = new->wd;
-
-	get_inotify_handle(ih);
-
-	new->inode = igrab(old->inode);
-
-	list_add(&new->h_list, &ih->watches);
-	list_add(&new->i_list, &old->inode->inotify_watches);
-out:
-	mutex_unlock(&ih->mutex);
-	return ret;
-}
-
-void inotify_evict_watch(struct inotify_watch *watch)
-{
-	get_inotify_watch(watch);
-	mutex_lock(&watch->ih->mutex);
-	inotify_remove_watch_locked(watch->ih, watch);
-	mutex_unlock(&watch->ih->mutex);
-}
-
-/**
- * inotify_rm_wd - remove a watch from an inotify instance
- * @ih: inotify handle
- * @wd: watch descriptor to remove
- *
- * Can sleep.
- */
-int inotify_rm_wd(struct inotify_handle *ih, u32 wd)
-{
-	struct inotify_watch *watch;
-	struct super_block *sb;
-	struct inode *inode;
-
-	mutex_lock(&ih->mutex);
-	watch = idr_find(&ih->idr, wd);
-	if (unlikely(!watch)) {
-		mutex_unlock(&ih->mutex);
-		return -EINVAL;
-	}
-	sb = watch->inode->i_sb;
-	if (!pin_to_kill(ih, watch))
-		return 0;
-
-	inode = watch->inode;
-
-	mutex_lock(&inode->inotify_mutex);
-	mutex_lock(&ih->mutex);
-
-	/* make sure that we did not race */
-	if (likely(idr_find(&ih->idr, wd) == watch))
-		inotify_remove_watch_locked(ih, watch);
-
-	mutex_unlock(&ih->mutex);
-	mutex_unlock(&inode->inotify_mutex);
-	unpin_and_kill(watch);
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(inotify_rm_wd);
-
-/**
- * inotify_rm_watch - remove a watch from an inotify instance
- * @ih: inotify handle
- * @watch: watch to remove
- *
- * Can sleep.
- */
-int inotify_rm_watch(struct inotify_handle *ih,
-		     struct inotify_watch *watch)
-{
-	return inotify_rm_wd(ih, watch->wd);
-}
-EXPORT_SYMBOL_GPL(inotify_rm_watch);
-
-/*
- * inotify_setup - core initialization function
- */
-static int __init inotify_setup(void)
-{
-	BUILD_BUG_ON(IN_ACCESS != FS_ACCESS);
-	BUILD_BUG_ON(IN_MODIFY != FS_MODIFY);
-	BUILD_BUG_ON(IN_ATTRIB != FS_ATTRIB);
-	BUILD_BUG_ON(IN_CLOSE_WRITE != FS_CLOSE_WRITE);
-	BUILD_BUG_ON(IN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE);
-	BUILD_BUG_ON(IN_OPEN != FS_OPEN);
-	BUILD_BUG_ON(IN_MOVED_FROM != FS_MOVED_FROM);
-	BUILD_BUG_ON(IN_MOVED_TO != FS_MOVED_TO);
-	BUILD_BUG_ON(IN_CREATE != FS_CREATE);
-	BUILD_BUG_ON(IN_DELETE != FS_DELETE);
-	BUILD_BUG_ON(IN_DELETE_SELF != FS_DELETE_SELF);
-	BUILD_BUG_ON(IN_MOVE_SELF != FS_MOVE_SELF);
-	BUILD_BUG_ON(IN_Q_OVERFLOW != FS_Q_OVERFLOW);
-
-	BUILD_BUG_ON(IN_UNMOUNT != FS_UNMOUNT);
-	BUILD_BUG_ON(IN_ISDIR != FS_IN_ISDIR);
-	BUILD_BUG_ON(IN_IGNORED != FS_IN_IGNORED);
-	BUILD_BUG_ON(IN_ONESHOT != FS_IN_ONESHOT);
-
-	atomic_set(&inotify_cookie, 0);
-
-	return 0;
-}
-
-module_init(inotify_setup);
diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h
index f234f3a4c8c..b6642e4de4b 100644
--- a/fs/notify/inotify/inotify.h
+++ b/fs/notify/inotify/inotify.h
@@ -9,13 +9,12 @@ struct inotify_event_private_data {
 	int wd;
 };
 
-struct inotify_inode_mark_entry {
-	/* fsnotify_mark_entry MUST be the first thing */
-	struct fsnotify_mark_entry fsn_entry;
+struct inotify_inode_mark {
+	struct fsnotify_mark fsn_mark;
 	int wd;
 };
 
-extern void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry,
+extern void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,
 					   struct fsnotify_group *group);
 extern void inotify_free_event_priv(struct fsnotify_event_private_data *event_priv);
 
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index e27960cd76a..a91b69a6a29 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -22,6 +22,7 @@
  * General Public License for more details.
  */
 
+#include <linux/dcache.h> /* d_unlinked */
 #include <linux/fs.h> /* struct inode */
 #include <linux/fsnotify_backend.h>
 #include <linux/inotify.h>
@@ -32,26 +33,84 @@
 
 #include "inotify.h"
 
-static int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_event *event)
+/*
+ * Check if 2 events contain the same information.  We do not compare private data
+ * but at this moment that isn't a problem for any know fsnotify listeners.
+ */
+static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new)
+{
+	if ((old->mask == new->mask) &&
+	    (old->to_tell == new->to_tell) &&
+	    (old->data_type == new->data_type) &&
+	    (old->name_len == new->name_len)) {
+		switch (old->data_type) {
+		case (FSNOTIFY_EVENT_INODE):
+			/* remember, after old was put on the wait_q we aren't
+			 * allowed to look at the inode any more, only thing
+			 * left to check was if the file_name is the same */
+			if (!old->name_len ||
+			    !strcmp(old->file_name, new->file_name))
+				return true;
+			break;
+		case (FSNOTIFY_EVENT_PATH):
+			if ((old->path.mnt == new->path.mnt) &&
+			    (old->path.dentry == new->path.dentry))
+				return true;
+			break;
+		case (FSNOTIFY_EVENT_NONE):
+			if (old->mask & FS_Q_OVERFLOW)
+				return true;
+			else if (old->mask & FS_IN_IGNORED)
+				return false;
+			return true;
+		};
+	}
+	return false;
+}
+
+static struct fsnotify_event *inotify_merge(struct list_head *list,
+					    struct fsnotify_event *event)
 {
-	struct fsnotify_mark_entry *entry;
-	struct inotify_inode_mark_entry *ientry;
+	struct fsnotify_event_holder *last_holder;
+	struct fsnotify_event *last_event;
+
+	/* and the list better be locked by something too */
+	spin_lock(&event->lock);
+
+	last_holder = list_entry(list->prev, struct fsnotify_event_holder, event_list);
+	last_event = last_holder->event;
+	if (event_compare(last_event, event))
+		fsnotify_get_event(last_event);
+	else
+		last_event = NULL;
+
+	spin_unlock(&event->lock);
+
+	return last_event;
+}
+
+static int inotify_handle_event(struct fsnotify_group *group,
+				struct fsnotify_mark *inode_mark,
+				struct fsnotify_mark *vfsmount_mark,
+				struct fsnotify_event *event)
+{
+	struct inotify_inode_mark *i_mark;
 	struct inode *to_tell;
 	struct inotify_event_private_data *event_priv;
 	struct fsnotify_event_private_data *fsn_event_priv;
-	int wd, ret;
+	struct fsnotify_event *added_event;
+	int wd, ret = 0;
+
+	BUG_ON(vfsmount_mark);
+
+	pr_debug("%s: group=%p event=%p to_tell=%p mask=%x\n", __func__, group,
+		 event, event->to_tell, event->mask);
 
 	to_tell = event->to_tell;
 
-	spin_lock(&to_tell->i_lock);
-	entry = fsnotify_find_mark_entry(group, to_tell);
-	spin_unlock(&to_tell->i_lock);
-	/* race with watch removal?  We already passes should_send */
-	if (unlikely(!entry))
-		return 0;
-	ientry = container_of(entry, struct inotify_inode_mark_entry,
-			      fsn_entry);
-	wd = ientry->wd;
+	i_mark = container_of(inode_mark, struct inotify_inode_mark,
+			      fsn_mark);
+	wd = i_mark->wd;
 
 	event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL);
 	if (unlikely(!event_priv))
@@ -62,48 +121,40 @@ static int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_ev
 	fsn_event_priv->group = group;
 	event_priv->wd = wd;
 
-	ret = fsnotify_add_notify_event(group, event, fsn_event_priv);
-	if (ret) {
+	added_event = fsnotify_add_notify_event(group, event, fsn_event_priv, inotify_merge);
+	if (added_event) {
 		inotify_free_event_priv(fsn_event_priv);
-		/* EEXIST says we tail matched, EOVERFLOW isn't something
-		 * to report up the stack. */
-		if ((ret == -EEXIST) ||
-		    (ret == -EOVERFLOW))
-			ret = 0;
+		if (!IS_ERR(added_event))
+			fsnotify_put_event(added_event);
+		else
+			ret = PTR_ERR(added_event);
 	}
 
-	/*
-	 * If we hold the entry until after the event is on the queue
-	 * IN_IGNORED won't be able to pass this event in the queue
-	 */
-	fsnotify_put_mark(entry);
+	if (inode_mark->mask & IN_ONESHOT)
+		fsnotify_destroy_mark(inode_mark);
 
 	return ret;
 }
 
-static void inotify_freeing_mark(struct fsnotify_mark_entry *entry, struct fsnotify_group *group)
+static void inotify_freeing_mark(struct fsnotify_mark *fsn_mark, struct fsnotify_group *group)
 {
-	inotify_ignored_and_remove_idr(entry, group);
+	inotify_ignored_and_remove_idr(fsn_mark, group);
 }
 
-static bool inotify_should_send_event(struct fsnotify_group *group, struct inode *inode, __u32 mask)
+static bool inotify_should_send_event(struct fsnotify_group *group, struct inode *inode,
+				      struct fsnotify_mark *inode_mark,
+				      struct fsnotify_mark *vfsmount_mark,
+				      __u32 mask, void *data, int data_type)
 {
-	struct fsnotify_mark_entry *entry;
-	bool send;
-
-	spin_lock(&inode->i_lock);
-	entry = fsnotify_find_mark_entry(group, inode);
-	spin_unlock(&inode->i_lock);
-	if (!entry)
-		return false;
+	if ((inode_mark->mask & FS_EXCL_UNLINK) &&
+	    (data_type == FSNOTIFY_EVENT_PATH)) {
+		struct path *path = data;
 
-	mask = (mask & ~FS_EVENT_ON_CHILD);
-	send = (entry->mask & mask);
-
-	/* find took a reference */
-	fsnotify_put_mark(entry);
+		if (d_unlinked(path->dentry))
+			return false;
+	}
 
-	return send;
+	return true;
 }
 
 /*
@@ -115,18 +166,18 @@ static bool inotify_should_send_event(struct fsnotify_group *group, struct inode
  */
 static int idr_callback(int id, void *p, void *data)
 {
-	struct fsnotify_mark_entry *entry;
-	struct inotify_inode_mark_entry *ientry;
+	struct fsnotify_mark *fsn_mark;
+	struct inotify_inode_mark *i_mark;
 	static bool warned = false;
 
 	if (warned)
 		return 0;
 
 	warned = true;
-	entry = p;
-	ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
+	fsn_mark = p;
+	i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
 
-	WARN(1, "inotify closing but id=%d for entry=%p in group=%p still in "
+	WARN(1, "inotify closing but id=%d for fsn_mark=%p in group=%p still in "
 		"idr.  Probably leaking memory\n", id, p, data);
 
 	/*
@@ -135,9 +186,9 @@ static int idr_callback(int id, void *p, void *data)
 	 * out why we got here and the panic is no worse than the original
 	 * BUG() that was here.
 	 */
-	if (entry)
-		printk(KERN_WARNING "entry->group=%p inode=%p wd=%d\n",
-			entry->group, entry->inode, ientry->wd);
+	if (fsn_mark)
+		printk(KERN_WARNING "fsn_mark->group=%p inode=%p wd=%d\n",
+			fsn_mark->group, fsn_mark->i.inode, i_mark->wd);
 	return 0;
 }
 
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index e46ca685b9b..444c305a468 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -46,17 +46,11 @@
 /* these are configurable via /proc/sys/fs/inotify/ */
 static int inotify_max_user_instances __read_mostly;
 static int inotify_max_queued_events __read_mostly;
-int inotify_max_user_watches __read_mostly;
+static int inotify_max_user_watches __read_mostly;
 
 static struct kmem_cache *inotify_inode_mark_cachep __read_mostly;
 struct kmem_cache *event_priv_cachep __read_mostly;
 
-/*
- * When inotify registers a new group it increments this and uses that
- * value as an offset to set the fsnotify group "name" and priority.
- */
-static atomic_t inotify_grp_num;
-
 #ifdef CONFIG_SYSCTL
 
 #include <linux/sysctl.h>
@@ -96,11 +90,14 @@ static inline __u32 inotify_arg_to_mask(u32 arg)
 {
 	__u32 mask;
 
-	/* everything should accept their own ignored and cares about children */
-	mask = (FS_IN_IGNORED | FS_EVENT_ON_CHILD);
+	/*
+	 * everything should accept their own ignored, cares about children,
+	 * and should receive events when the inode is unmounted
+	 */
+	mask = (FS_IN_IGNORED | FS_EVENT_ON_CHILD | FS_UNMOUNT);
 
 	/* mask off the flags used to open the fd */
-	mask |= (arg & (IN_ALL_EVENTS | IN_ONESHOT));
+	mask |= (arg & (IN_ALL_EVENTS | IN_ONESHOT | IN_EXCL_UNLINK));
 
 	return mask;
 }
@@ -144,6 +141,8 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
 
 	event = fsnotify_peek_notify_event(group);
 
+	pr_debug("%s: group=%p event=%p\n", __func__, group, event);
+
 	if (event->name_len)
 		event_size += roundup(event->name_len + 1, event_size);
 
@@ -173,6 +172,8 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
 	size_t event_size = sizeof(struct inotify_event);
 	size_t name_len = 0;
 
+	pr_debug("%s: group=%p event=%p\n", __func__, group, event);
+
 	/* we get the inotify watch descriptor from the event private data */
 	spin_lock(&event->lock);
 	fsn_priv = fsnotify_remove_priv_from_event(group, event);
@@ -245,6 +246,8 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
 		kevent = get_one_event(group, count);
 		mutex_unlock(&group->notification_mutex);
 
+		pr_debug("%s: group=%p kevent=%p\n", __func__, group, kevent);
+
 		if (kevent) {
 			ret = PTR_ERR(kevent);
 			if (IS_ERR(kevent))
@@ -289,6 +292,8 @@ static int inotify_release(struct inode *ignored, struct file *file)
 	struct fsnotify_group *group = file->private_data;
 	struct user_struct *user = group->inotify_data.user;
 
+	pr_debug("%s: group=%p\n", __func__, group);
+
 	fsnotify_clear_marks_by_group(group);
 
 	/* free this group, matching get was inotify_init->fsnotify_obtain_group */
@@ -312,6 +317,8 @@ static long inotify_ioctl(struct file *file, unsigned int cmd,
 	group = file->private_data;
 	p = (void __user *) arg;
 
+	pr_debug("%s: group=%p cmd=%u\n", __func__, group, cmd);
+
 	switch (cmd) {
 	case FIONREAD:
 		mutex_lock(&group->notification_mutex);
@@ -337,6 +344,7 @@ static const struct file_operations inotify_fops = {
 	.release	= inotify_release,
 	.unlocked_ioctl	= inotify_ioctl,
 	.compat_ioctl	= inotify_ioctl,
+	.llseek		= noop_llseek,
 };
 
 
@@ -357,59 +365,159 @@ static int inotify_find_inode(const char __user *dirname, struct path *path, uns
 	return error;
 }
 
+static int inotify_add_to_idr(struct idr *idr, spinlock_t *idr_lock,
+			      int *last_wd,
+			      struct inotify_inode_mark *i_mark)
+{
+	int ret;
+
+	do {
+		if (unlikely(!idr_pre_get(idr, GFP_KERNEL)))
+			return -ENOMEM;
+
+		spin_lock(idr_lock);
+		ret = idr_get_new_above(idr, i_mark, *last_wd + 1,
+					&i_mark->wd);
+		/* we added the mark to the idr, take a reference */
+		if (!ret) {
+			*last_wd = i_mark->wd;
+			fsnotify_get_mark(&i_mark->fsn_mark);
+		}
+		spin_unlock(idr_lock);
+	} while (ret == -EAGAIN);
+
+	return ret;
+}
+
+static struct inotify_inode_mark *inotify_idr_find_locked(struct fsnotify_group *group,
+								int wd)
+{
+	struct idr *idr = &group->inotify_data.idr;
+	spinlock_t *idr_lock = &group->inotify_data.idr_lock;
+	struct inotify_inode_mark *i_mark;
+
+	assert_spin_locked(idr_lock);
+
+	i_mark = idr_find(idr, wd);
+	if (i_mark) {
+		struct fsnotify_mark *fsn_mark = &i_mark->fsn_mark;
+
+		fsnotify_get_mark(fsn_mark);
+		/* One ref for being in the idr, one ref we just took */
+		BUG_ON(atomic_read(&fsn_mark->refcnt) < 2);
+	}
+
+	return i_mark;
+}
+
+static struct inotify_inode_mark *inotify_idr_find(struct fsnotify_group *group,
+							 int wd)
+{
+	struct inotify_inode_mark *i_mark;
+	spinlock_t *idr_lock = &group->inotify_data.idr_lock;
+
+	spin_lock(idr_lock);
+	i_mark = inotify_idr_find_locked(group, wd);
+	spin_unlock(idr_lock);
+
+	return i_mark;
+}
+
+static void do_inotify_remove_from_idr(struct fsnotify_group *group,
+				       struct inotify_inode_mark *i_mark)
+{
+	struct idr *idr = &group->inotify_data.idr;
+	spinlock_t *idr_lock = &group->inotify_data.idr_lock;
+	int wd = i_mark->wd;
+
+	assert_spin_locked(idr_lock);
+
+	idr_remove(idr, wd);
+
+	/* removed from the idr, drop that ref */
+	fsnotify_put_mark(&i_mark->fsn_mark);
+}
+
 /*
  * Remove the mark from the idr (if present) and drop the reference
  * on the mark because it was in the idr.
  */
 static void inotify_remove_from_idr(struct fsnotify_group *group,
-				    struct inotify_inode_mark_entry *ientry)
+				    struct inotify_inode_mark *i_mark)
 {
-	struct idr *idr;
-	struct fsnotify_mark_entry *entry;
-	struct inotify_inode_mark_entry *found_ientry;
+	spinlock_t *idr_lock = &group->inotify_data.idr_lock;
+	struct inotify_inode_mark *found_i_mark = NULL;
 	int wd;
 
-	spin_lock(&group->inotify_data.idr_lock);
-	idr = &group->inotify_data.idr;
-	wd = ientry->wd;
+	spin_lock(idr_lock);
+	wd = i_mark->wd;
 
-	if (wd == -1)
+	/*
+	 * does this i_mark think it is in the idr?  we shouldn't get called
+	 * if it wasn't....
+	 */
+	if (wd == -1) {
+		WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p"
+			" i_mark->inode=%p\n", __func__, i_mark, i_mark->wd,
+			i_mark->fsn_mark.group, i_mark->fsn_mark.i.inode);
 		goto out;
+	}
 
-	entry = idr_find(&group->inotify_data.idr, wd);
-	if (unlikely(!entry))
+	/* Lets look in the idr to see if we find it */
+	found_i_mark = inotify_idr_find_locked(group, wd);
+	if (unlikely(!found_i_mark)) {
+		WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p"
+			" i_mark->inode=%p\n", __func__, i_mark, i_mark->wd,
+			i_mark->fsn_mark.group, i_mark->fsn_mark.i.inode);
 		goto out;
+	}
 
-	found_ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
-	if (unlikely(found_ientry != ientry)) {
-		/* We found an entry in the idr with the right wd, but it's
-		 * not the entry we were told to remove.  eparis seriously
-		 * fucked up somewhere. */
-		WARN_ON(1);
-		ientry->wd = -1;
+	/*
+	 * We found an mark in the idr at the right wd, but it's
+	 * not the mark we were told to remove.  eparis seriously
+	 * fucked up somewhere.
+	 */
+	if (unlikely(found_i_mark != i_mark)) {
+		WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p "
+			"mark->inode=%p found_i_mark=%p found_i_mark->wd=%d "
+			"found_i_mark->group=%p found_i_mark->inode=%p\n",
+			__func__, i_mark, i_mark->wd, i_mark->fsn_mark.group,
+			i_mark->fsn_mark.i.inode, found_i_mark, found_i_mark->wd,
+			found_i_mark->fsn_mark.group,
+			found_i_mark->fsn_mark.i.inode);
 		goto out;
 	}
 
-	/* One ref for being in the idr, one ref held by the caller */
-	BUG_ON(atomic_read(&entry->refcnt) < 2);
-
-	idr_remove(idr, wd);
-	ientry->wd = -1;
+	/*
+	 * One ref for being in the idr
+	 * one ref held by the caller trying to kill us
+	 * one ref grabbed by inotify_idr_find
+	 */
+	if (unlikely(atomic_read(&i_mark->fsn_mark.refcnt) < 3)) {
+		printk(KERN_ERR "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p"
+			" i_mark->inode=%p\n", __func__, i_mark, i_mark->wd,
+			i_mark->fsn_mark.group, i_mark->fsn_mark.i.inode);
+		/* we can't really recover with bad ref cnting.. */
+		BUG();
+	}
 
-	/* removed from the idr, drop that ref */
-	fsnotify_put_mark(entry);
+	do_inotify_remove_from_idr(group, i_mark);
 out:
-	spin_unlock(&group->inotify_data.idr_lock);
+	/* match the ref taken by inotify_idr_find_locked() */
+	if (found_i_mark)
+		fsnotify_put_mark(&found_i_mark->fsn_mark);
+	i_mark->wd = -1;
+	spin_unlock(idr_lock);
 }
 
 /*
  * Send IN_IGNORED for this wd, remove this wd from the idr.
  */
-void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry,
+void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,
 				    struct fsnotify_group *group)
 {
-	struct inotify_inode_mark_entry *ientry;
-	struct fsnotify_event *ignored_event;
+	struct inotify_inode_mark *i_mark;
+	struct fsnotify_event *ignored_event, *notify_event;
 	struct inotify_event_private_data *event_priv;
 	struct fsnotify_event_private_data *fsn_event_priv;
 	int ret;
@@ -420,7 +528,7 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry,
 	if (!ignored_event)
 		return;
 
-	ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
+	i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
 
 	event_priv = kmem_cache_alloc(event_priv_cachep, GFP_NOFS);
 	if (unlikely(!event_priv))
@@ -429,37 +537,44 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry,
 	fsn_event_priv = &event_priv->fsnotify_event_priv_data;
 
 	fsn_event_priv->group = group;
-	event_priv->wd = ientry->wd;
-
-	ret = fsnotify_add_notify_event(group, ignored_event, fsn_event_priv);
-	if (ret)
+	event_priv->wd = i_mark->wd;
+
+	notify_event = fsnotify_add_notify_event(group, ignored_event, fsn_event_priv, NULL);
+	if (notify_event) {
+		if (IS_ERR(notify_event))
+			ret = PTR_ERR(notify_event);
+		else
+			fsnotify_put_event(notify_event);
 		inotify_free_event_priv(fsn_event_priv);
+	}
 
 skip_send_ignore:
 
 	/* matches the reference taken when the event was created */
 	fsnotify_put_event(ignored_event);
 
-	/* remove this entry from the idr */
-	inotify_remove_from_idr(group, ientry);
+	/* remove this mark from the idr */
+	inotify_remove_from_idr(group, i_mark);
 
 	atomic_dec(&group->inotify_data.user->inotify_watches);
 }
 
 /* ding dong the mark is dead */
-static void inotify_free_mark(struct fsnotify_mark_entry *entry)
+static void inotify_free_mark(struct fsnotify_mark *fsn_mark)
 {
-	struct inotify_inode_mark_entry *ientry = (struct inotify_inode_mark_entry *)entry;
+	struct inotify_inode_mark *i_mark;
+
+	i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
 
-	kmem_cache_free(inotify_inode_mark_cachep, ientry);
+	kmem_cache_free(inotify_inode_mark_cachep, i_mark);
 }
 
 static int inotify_update_existing_watch(struct fsnotify_group *group,
 					 struct inode *inode,
 					 u32 arg)
 {
-	struct fsnotify_mark_entry *entry;
-	struct inotify_inode_mark_entry *ientry;
+	struct fsnotify_mark *fsn_mark;
+	struct inotify_inode_mark *i_mark;
 	__u32 old_mask, new_mask;
 	__u32 mask;
 	int add = (arg & IN_MASK_ADD);
@@ -467,52 +582,43 @@ static int inotify_update_existing_watch(struct fsnotify_group *group,
 
 	/* don't allow invalid bits: we don't want flags set */
 	mask = inotify_arg_to_mask(arg);
-	if (unlikely(!mask))
+	if (unlikely(!(mask & IN_ALL_EVENTS)))
 		return -EINVAL;
 
-	spin_lock(&inode->i_lock);
-	entry = fsnotify_find_mark_entry(group, inode);
-	spin_unlock(&inode->i_lock);
-	if (!entry)
+	fsn_mark = fsnotify_find_inode_mark(group, inode);
+	if (!fsn_mark)
 		return -ENOENT;
 
-	ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
+	i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
 
-	spin_lock(&entry->lock);
+	spin_lock(&fsn_mark->lock);
 
-	old_mask = entry->mask;
-	if (add) {
-		entry->mask |= mask;
-		new_mask = entry->mask;
-	} else {
-		entry->mask = mask;
-		new_mask = entry->mask;
-	}
+	old_mask = fsn_mark->mask;
+	if (add)
+		fsnotify_set_mark_mask_locked(fsn_mark, (fsn_mark->mask | mask));
+	else
+		fsnotify_set_mark_mask_locked(fsn_mark, mask);
+	new_mask = fsn_mark->mask;
 
-	spin_unlock(&entry->lock);
+	spin_unlock(&fsn_mark->lock);
 
 	if (old_mask != new_mask) {
 		/* more bits in old than in new? */
 		int dropped = (old_mask & ~new_mask);
-		/* more bits in this entry than the inode's mask? */
+		/* more bits in this fsn_mark than the inode's mask? */
 		int do_inode = (new_mask & ~inode->i_fsnotify_mask);
-		/* more bits in this entry than the group? */
-		int do_group = (new_mask & ~group->mask);
 
-		/* update the inode with this new entry */
+		/* update the inode with this new fsn_mark */
 		if (dropped || do_inode)
 			fsnotify_recalc_inode_mask(inode);
 
-		/* update the group mask with the new mask */
-		if (dropped || do_group)
-			fsnotify_recalc_group_mask(group);
 	}
 
 	/* return the wd */
-	ret = ientry->wd;
+	ret = i_mark->wd;
 
-	/* match the get from fsnotify_find_mark_entry() */
-	fsnotify_put_mark(entry);
+	/* match the get from fsnotify_find_mark() */
+	fsnotify_put_mark(fsn_mark);
 
 	return ret;
 }
@@ -521,73 +627,51 @@ static int inotify_new_watch(struct fsnotify_group *group,
 			     struct inode *inode,
 			     u32 arg)
 {
-	struct inotify_inode_mark_entry *tmp_ientry;
+	struct inotify_inode_mark *tmp_i_mark;
 	__u32 mask;
 	int ret;
+	struct idr *idr = &group->inotify_data.idr;
+	spinlock_t *idr_lock = &group->inotify_data.idr_lock;
 
 	/* don't allow invalid bits: we don't want flags set */
 	mask = inotify_arg_to_mask(arg);
-	if (unlikely(!mask))
+	if (unlikely(!(mask & IN_ALL_EVENTS)))
 		return -EINVAL;
 
-	tmp_ientry = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL);
-	if (unlikely(!tmp_ientry))
+	tmp_i_mark = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL);
+	if (unlikely(!tmp_i_mark))
 		return -ENOMEM;
 
-	fsnotify_init_mark(&tmp_ientry->fsn_entry, inotify_free_mark);
-	tmp_ientry->fsn_entry.mask = mask;
-	tmp_ientry->wd = -1;
+	fsnotify_init_mark(&tmp_i_mark->fsn_mark, inotify_free_mark);
+	tmp_i_mark->fsn_mark.mask = mask;
+	tmp_i_mark->wd = -1;
 
 	ret = -ENOSPC;
 	if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches)
 		goto out_err;
-retry:
-	ret = -ENOMEM;
-	if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL)))
-		goto out_err;
 
-	/* we are putting the mark on the idr, take a reference */
-	fsnotify_get_mark(&tmp_ientry->fsn_entry);
-
-	spin_lock(&group->inotify_data.idr_lock);
-	ret = idr_get_new_above(&group->inotify_data.idr, &tmp_ientry->fsn_entry,
-				group->inotify_data.last_wd+1,
-				&tmp_ientry->wd);
-	spin_unlock(&group->inotify_data.idr_lock);
-	if (ret) {
-		/* we didn't get on the idr, drop the idr reference */
-		fsnotify_put_mark(&tmp_ientry->fsn_entry);
-
-		/* idr was out of memory allocate and try again */
-		if (ret == -EAGAIN)
-			goto retry;
+	ret = inotify_add_to_idr(idr, idr_lock, &group->inotify_data.last_wd,
+				 tmp_i_mark);
+	if (ret)
 		goto out_err;
-	}
 
 	/* we are on the idr, now get on the inode */
-	ret = fsnotify_add_mark(&tmp_ientry->fsn_entry, group, inode);
+	ret = fsnotify_add_mark(&tmp_i_mark->fsn_mark, group, inode, NULL, 0);
 	if (ret) {
 		/* we failed to get on the inode, get off the idr */
-		inotify_remove_from_idr(group, tmp_ientry);
+		inotify_remove_from_idr(group, tmp_i_mark);
 		goto out_err;
 	}
 
-	/* update the idr hint, who cares about races, it's just a hint */
-	group->inotify_data.last_wd = tmp_ientry->wd;
-
 	/* increment the number of watches the user has */
 	atomic_inc(&group->inotify_data.user->inotify_watches);
 
-	/* return the watch descriptor for this new entry */
-	ret = tmp_ientry->wd;
-
-	/* if this mark added a new event update the group mask */
-	if (mask & ~group->mask)
-		fsnotify_recalc_group_mask(group);
+	/* return the watch descriptor for this new mark */
+	ret = tmp_i_mark->wd;
 
 out_err:
-	/* match the ref from fsnotify_init_markentry() */
-	fsnotify_put_mark(&tmp_ientry->fsn_entry);
+	/* match the ref from fsnotify_init_mark() */
+	fsnotify_put_mark(&tmp_i_mark->fsn_mark);
 
 	return ret;
 }
@@ -616,11 +700,8 @@ retry:
 static struct fsnotify_group *inotify_new_group(struct user_struct *user, unsigned int max_events)
 {
 	struct fsnotify_group *group;
-	unsigned int grp_num;
 
-	/* fsnotify_obtain_group took a reference to group, we put this when we kill the file in the end */
-	grp_num = (INOTIFY_GROUP_NUM - atomic_inc_return(&inotify_grp_num));
-	group = fsnotify_obtain_group(grp_num, 0, &inotify_fsnotify_ops);
+	group = fsnotify_alloc_group(&inotify_fsnotify_ops);
 	if (IS_ERR(group))
 		return group;
 
@@ -726,7 +807,7 @@ fput_and_out:
 SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd)
 {
 	struct fsnotify_group *group;
-	struct fsnotify_mark_entry *entry;
+	struct inotify_inode_mark *i_mark;
 	struct file *filp;
 	int ret = 0, fput_needed;
 
@@ -735,25 +816,23 @@ SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd)
 		return -EBADF;
 
 	/* verify that this is indeed an inotify instance */
-	if (unlikely(filp->f_op != &inotify_fops)) {
-		ret = -EINVAL;
+	ret = -EINVAL;
+	if (unlikely(filp->f_op != &inotify_fops))
 		goto out;
-	}
 
 	group = filp->private_data;
 
-	spin_lock(&group->inotify_data.idr_lock);
-	entry = idr_find(&group->inotify_data.idr, wd);
-	if (unlikely(!entry)) {
-		spin_unlock(&group->inotify_data.idr_lock);
-		ret = -EINVAL;
+	ret = -EINVAL;
+	i_mark = inotify_idr_find(group, wd);
+	if (unlikely(!i_mark))
 		goto out;
-	}
-	fsnotify_get_mark(entry);
-	spin_unlock(&group->inotify_data.idr_lock);
 
-	fsnotify_destroy_mark_by_entry(entry);
-	fsnotify_put_mark(entry);
+	ret = 0;
+
+	fsnotify_destroy_mark(&i_mark->fsn_mark);
+
+	/* match ref taken by inotify_idr_find */
+	fsnotify_put_mark(&i_mark->fsn_mark);
 
 out:
 	fput_light(filp, fput_needed);
@@ -767,7 +846,28 @@ out:
  */
 static int __init inotify_user_setup(void)
 {
-	inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark_entry, SLAB_PANIC);
+	BUILD_BUG_ON(IN_ACCESS != FS_ACCESS);
+	BUILD_BUG_ON(IN_MODIFY != FS_MODIFY);
+	BUILD_BUG_ON(IN_ATTRIB != FS_ATTRIB);
+	BUILD_BUG_ON(IN_CLOSE_WRITE != FS_CLOSE_WRITE);
+	BUILD_BUG_ON(IN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE);
+	BUILD_BUG_ON(IN_OPEN != FS_OPEN);
+	BUILD_BUG_ON(IN_MOVED_FROM != FS_MOVED_FROM);
+	BUILD_BUG_ON(IN_MOVED_TO != FS_MOVED_TO);
+	BUILD_BUG_ON(IN_CREATE != FS_CREATE);
+	BUILD_BUG_ON(IN_DELETE != FS_DELETE);
+	BUILD_BUG_ON(IN_DELETE_SELF != FS_DELETE_SELF);
+	BUILD_BUG_ON(IN_MOVE_SELF != FS_MOVE_SELF);
+	BUILD_BUG_ON(IN_UNMOUNT != FS_UNMOUNT);
+	BUILD_BUG_ON(IN_Q_OVERFLOW != FS_Q_OVERFLOW);
+	BUILD_BUG_ON(IN_IGNORED != FS_IN_IGNORED);
+	BUILD_BUG_ON(IN_EXCL_UNLINK != FS_EXCL_UNLINK);
+	BUILD_BUG_ON(IN_ISDIR != FS_ISDIR);
+	BUILD_BUG_ON(IN_ONESHOT != FS_IN_ONESHOT);
+
+	BUG_ON(hweight32(ALL_INOTIFY_BITS) != 21);
+
+	inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC);
 	event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC);
 
 	inotify_max_queued_events = 16384;
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
new file mode 100644
index 00000000000..325185e514b
--- /dev/null
+++ b/fs/notify/mark.c
@@ -0,0 +1,371 @@
+/*
+ *  Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/*
+ * fsnotify inode mark locking/lifetime/and refcnting
+ *
+ * REFCNT:
+ * The mark->refcnt tells how many "things" in the kernel currently are
+ * referencing this object.  The object typically will live inside the kernel
+ * with a refcnt of 2, one for each list it is on (i_list, g_list).  Any task
+ * which can find this object holding the appropriete locks, can take a reference
+ * and the object itself is guarenteed to survive until the reference is dropped.
+ *
+ * LOCKING:
+ * There are 3 spinlocks involved with fsnotify inode marks and they MUST
+ * be taken in order as follows:
+ *
+ * mark->lock
+ * group->mark_lock
+ * inode->i_lock
+ *
+ * mark->lock protects 2 things, mark->group and mark->inode.  You must hold
+ * that lock to dereference either of these things (they could be NULL even with
+ * the lock)
+ *
+ * group->mark_lock protects the marks_list anchored inside a given group
+ * and each mark is hooked via the g_list.  It also sorta protects the
+ * free_g_list, which when used is anchored by a private list on the stack of the
+ * task which held the group->mark_lock.
+ *
+ * inode->i_lock protects the i_fsnotify_marks list anchored inside a
+ * given inode and each mark is hooked via the i_list. (and sorta the
+ * free_i_list)
+ *
+ *
+ * LIFETIME:
+ * Inode marks survive between when they are added to an inode and when their
+ * refcnt==0.
+ *
+ * The inode mark can be cleared for a number of different reasons including:
+ * - The inode is unlinked for the last time.  (fsnotify_inode_remove)
+ * - The inode is being evicted from cache. (fsnotify_inode_delete)
+ * - The fs the inode is on is unmounted.  (fsnotify_inode_delete/fsnotify_unmount_inodes)
+ * - Something explicitly requests that it be removed.  (fsnotify_destroy_mark)
+ * - The fsnotify_group associated with the mark is going away and all such marks
+ *   need to be cleaned up. (fsnotify_clear_marks_by_group)
+ *
+ * Worst case we are given an inode and need to clean up all the marks on that
+ * inode.  We take i_lock and walk the i_fsnotify_marks safely.  For each
+ * mark on the list we take a reference (so the mark can't disappear under us).
+ * We remove that mark form the inode's list of marks and we add this mark to a
+ * private list anchored on the stack using i_free_list;  At this point we no
+ * longer fear anything finding the mark using the inode's list of marks.
+ *
+ * We can safely and locklessly run the private list on the stack of everything
+ * we just unattached from the original inode.  For each mark on the private list
+ * we grab the mark-> and can thus dereference mark->group and mark->inode.  If
+ * we see the group and inode are not NULL we take those locks.  Now holding all
+ * 3 locks we can completely remove the mark from other tasks finding it in the
+ * future.  Remember, 10 things might already be referencing this mark, but they
+ * better be holding a ref.  We drop our reference we took before we unhooked it
+ * from the inode.  When the ref hits 0 we can free the mark.
+ *
+ * Very similarly for freeing by group, except we use free_g_list.
+ *
+ * This has the very interesting property of being able to run concurrently with
+ * any (or all) other directions.
+ */
+
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/kthread.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/srcu.h>
+#include <linux/writeback.h> /* for inode_lock */
+
+#include <asm/atomic.h>
+
+#include <linux/fsnotify_backend.h>
+#include "fsnotify.h"
+
+struct srcu_struct fsnotify_mark_srcu;
+static DEFINE_SPINLOCK(destroy_lock);
+static LIST_HEAD(destroy_list);
+static DECLARE_WAIT_QUEUE_HEAD(destroy_waitq);
+
+void fsnotify_get_mark(struct fsnotify_mark *mark)
+{
+	atomic_inc(&mark->refcnt);
+}
+
+void fsnotify_put_mark(struct fsnotify_mark *mark)
+{
+	if (atomic_dec_and_test(&mark->refcnt))
+		mark->free_mark(mark);
+}
+
+/*
+ * Any time a mark is getting freed we end up here.
+ * The caller had better be holding a reference to this mark so we don't actually
+ * do the final put under the mark->lock
+ */
+void fsnotify_destroy_mark(struct fsnotify_mark *mark)
+{
+	struct fsnotify_group *group;
+	struct inode *inode = NULL;
+
+	spin_lock(&mark->lock);
+
+	group = mark->group;
+
+	/* something else already called this function on this mark */
+	if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) {
+		spin_unlock(&mark->lock);
+		return;
+	}
+
+	mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
+
+	/* 1 from caller and 1 for being on i_list/g_list */
+	BUG_ON(atomic_read(&mark->refcnt) < 2);
+
+	spin_lock(&group->mark_lock);
+
+	if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) {
+		inode = mark->i.inode;
+		fsnotify_destroy_inode_mark(mark);
+	} else if (mark->flags & FSNOTIFY_MARK_FLAG_VFSMOUNT)
+		fsnotify_destroy_vfsmount_mark(mark);
+	else
+		BUG();
+
+	list_del_init(&mark->g_list);
+
+	spin_unlock(&group->mark_lock);
+	spin_unlock(&mark->lock);
+
+	spin_lock(&destroy_lock);
+	list_add(&mark->destroy_list, &destroy_list);
+	spin_unlock(&destroy_lock);
+	wake_up(&destroy_waitq);
+
+	/*
+	 * Some groups like to know that marks are being freed.  This is a
+	 * callback to the group function to let it know that this mark
+	 * is being freed.
+	 */
+	if (group->ops->freeing_mark)
+		group->ops->freeing_mark(mark, group);
+
+	/*
+	 * __fsnotify_update_child_dentry_flags(inode);
+	 *
+	 * I really want to call that, but we can't, we have no idea if the inode
+	 * still exists the second we drop the mark->lock.
+	 *
+	 * The next time an event arrive to this inode from one of it's children
+	 * __fsnotify_parent will see that the inode doesn't care about it's
+	 * children and will update all of these flags then.  So really this
+	 * is just a lazy update (and could be a perf win...)
+	 */
+
+	if (inode && (mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED))
+		iput(inode);
+
+	/*
+	 * it's possible that this group tried to destroy itself, but this
+	 * this mark was simultaneously being freed by inode.  If that's the
+	 * case, we finish freeing the group here.
+	 */
+	if (unlikely(atomic_dec_and_test(&group->num_marks)))
+		fsnotify_final_destroy_group(group);
+}
+
+void fsnotify_set_mark_mask_locked(struct fsnotify_mark *mark, __u32 mask)
+{
+	assert_spin_locked(&mark->lock);
+
+	mark->mask = mask;
+
+	if (mark->flags & FSNOTIFY_MARK_FLAG_INODE)
+		fsnotify_set_inode_mark_mask_locked(mark, mask);
+}
+
+void fsnotify_set_mark_ignored_mask_locked(struct fsnotify_mark *mark, __u32 mask)
+{
+	assert_spin_locked(&mark->lock);
+
+	mark->ignored_mask = mask;
+}
+
+/*
+ * Attach an initialized mark to a given group and fs object.
+ * These marks may be used for the fsnotify backend to determine which
+ * event types should be delivered to which group.
+ */
+int fsnotify_add_mark(struct fsnotify_mark *mark,
+		      struct fsnotify_group *group, struct inode *inode,
+		      struct vfsmount *mnt, int allow_dups)
+{
+	int ret = 0;
+
+	BUG_ON(inode && mnt);
+	BUG_ON(!inode && !mnt);
+
+	/*
+	 * LOCKING ORDER!!!!
+	 * mark->lock
+	 * group->mark_lock
+	 * inode->i_lock
+	 */
+	spin_lock(&mark->lock);
+	spin_lock(&group->mark_lock);
+
+	mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE;
+
+	mark->group = group;
+	list_add(&mark->g_list, &group->marks_list);
+	atomic_inc(&group->num_marks);
+	fsnotify_get_mark(mark); /* for i_list and g_list */
+
+	if (inode) {
+		ret = fsnotify_add_inode_mark(mark, group, inode, allow_dups);
+		if (ret)
+			goto err;
+	} else if (mnt) {
+		ret = fsnotify_add_vfsmount_mark(mark, group, mnt, allow_dups);
+		if (ret)
+			goto err;
+	} else {
+		BUG();
+	}
+
+	spin_unlock(&group->mark_lock);
+
+	/* this will pin the object if appropriate */
+	fsnotify_set_mark_mask_locked(mark, mark->mask);
+
+	spin_unlock(&mark->lock);
+
+	if (inode)
+		__fsnotify_update_child_dentry_flags(inode);
+
+	return ret;
+err:
+	mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
+	list_del_init(&mark->g_list);
+	mark->group = NULL;
+	atomic_dec(&group->num_marks);
+
+	spin_unlock(&group->mark_lock);
+	spin_unlock(&mark->lock);
+
+	spin_lock(&destroy_lock);
+	list_add(&mark->destroy_list, &destroy_list);
+	spin_unlock(&destroy_lock);
+	wake_up(&destroy_waitq);
+
+	return ret;
+}
+
+/*
+ * clear any marks in a group in which mark->flags & flags is true
+ */
+void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group,
+					 unsigned int flags)
+{
+	struct fsnotify_mark *lmark, *mark;
+	LIST_HEAD(free_list);
+
+	spin_lock(&group->mark_lock);
+	list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) {
+		if (mark->flags & flags) {
+			list_add(&mark->free_g_list, &free_list);
+			list_del_init(&mark->g_list);
+			fsnotify_get_mark(mark);
+		}
+	}
+	spin_unlock(&group->mark_lock);
+
+	list_for_each_entry_safe(mark, lmark, &free_list, free_g_list) {
+		fsnotify_destroy_mark(mark);
+		fsnotify_put_mark(mark);
+	}
+}
+
+/*
+ * Given a group, destroy all of the marks associated with that group.
+ */
+void fsnotify_clear_marks_by_group(struct fsnotify_group *group)
+{
+	fsnotify_clear_marks_by_group_flags(group, (unsigned int)-1);
+}
+
+void fsnotify_duplicate_mark(struct fsnotify_mark *new, struct fsnotify_mark *old)
+{
+	assert_spin_locked(&old->lock);
+	new->i.inode = old->i.inode;
+	new->m.mnt = old->m.mnt;
+	new->group = old->group;
+	new->mask = old->mask;
+	new->free_mark = old->free_mark;
+}
+
+/*
+ * Nothing fancy, just initialize lists and locks and counters.
+ */
+void fsnotify_init_mark(struct fsnotify_mark *mark,
+			void (*free_mark)(struct fsnotify_mark *mark))
+{
+	memset(mark, 0, sizeof(*mark));
+	spin_lock_init(&mark->lock);
+	atomic_set(&mark->refcnt, 1);
+	mark->free_mark = free_mark;
+}
+
+static int fsnotify_mark_destroy(void *ignored)
+{
+	struct fsnotify_mark *mark, *next;
+	LIST_HEAD(private_destroy_list);
+
+	for (;;) {
+		spin_lock(&destroy_lock);
+		/* exchange the list head */
+		list_replace_init(&destroy_list, &private_destroy_list);
+		spin_unlock(&destroy_lock);
+
+		synchronize_srcu(&fsnotify_mark_srcu);
+
+		list_for_each_entry_safe(mark, next, &private_destroy_list, destroy_list) {
+			list_del_init(&mark->destroy_list);
+			fsnotify_put_mark(mark);
+		}
+
+		wait_event_interruptible(destroy_waitq, !list_empty(&destroy_list));
+	}
+
+	return 0;
+}
+
+static int __init fsnotify_mark_init(void)
+{
+	struct task_struct *thread;
+
+	thread = kthread_run(fsnotify_mark_destroy, NULL,
+			     "fsnotify_mark");
+	if (IS_ERR(thread))
+		panic("unable to start fsnotify mark destruction thread.");
+
+	return 0;
+}
+device_initcall(fsnotify_mark_init);
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index b8bf53b4c10..f39260f8f86 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -56,7 +56,7 @@ static struct kmem_cache *fsnotify_event_holder_cachep;
  * it is needed.  It's refcnt is set 1 at kernel init time and will never
  * get set to 0 so it will never get 'freed'
  */
-static struct fsnotify_event q_overflow_event;
+static struct fsnotify_event *q_overflow_event;
 static atomic_t fsnotify_sync_cookie = ATOMIC_INIT(0);
 
 /**
@@ -87,12 +87,15 @@ void fsnotify_put_event(struct fsnotify_event *event)
 		return;
 
 	if (atomic_dec_and_test(&event->refcnt)) {
+		pr_debug("%s: event=%p\n", __func__, event);
+
 		if (event->data_type == FSNOTIFY_EVENT_PATH)
 			path_put(&event->path);
 
 		BUG_ON(!list_empty(&event->private_data_list));
 
 		kfree(event->file_name);
+		put_pid(event->tgid);
 		kmem_cache_free(fsnotify_event_cachep, event);
 	}
 }
@@ -104,7 +107,8 @@ struct fsnotify_event_holder *fsnotify_alloc_event_holder(void)
 
 void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder)
 {
-	kmem_cache_free(fsnotify_event_holder_cachep, holder);
+	if (holder)
+		kmem_cache_free(fsnotify_event_holder_cachep, holder);
 }
 
 /*
@@ -129,53 +133,20 @@ struct fsnotify_event_private_data *fsnotify_remove_priv_from_event(struct fsnot
 }
 
 /*
- * Check if 2 events contain the same information.  We do not compare private data
- * but at this moment that isn't a problem for any know fsnotify listeners.
- */
-static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new)
-{
-	if ((old->mask == new->mask) &&
-	    (old->to_tell == new->to_tell) &&
-	    (old->data_type == new->data_type) &&
-	    (old->name_len == new->name_len)) {
-		switch (old->data_type) {
-		case (FSNOTIFY_EVENT_INODE):
-			/* remember, after old was put on the wait_q we aren't
-			 * allowed to look at the inode any more, only thing
-			 * left to check was if the file_name is the same */
-			if (!old->name_len ||
-			    !strcmp(old->file_name, new->file_name))
-				return true;
-			break;
-		case (FSNOTIFY_EVENT_PATH):
-			if ((old->path.mnt == new->path.mnt) &&
-			    (old->path.dentry == new->path.dentry))
-				return true;
-			break;
-		case (FSNOTIFY_EVENT_NONE):
-			if (old->mask & FS_Q_OVERFLOW)
-				return true;
-			else if (old->mask & FS_IN_IGNORED)
-				return false;
-			return false;
-		};
-	}
-	return false;
-}
-
-/*
  * Add an event to the group notification queue.  The group can later pull this
  * event off the queue to deal with.  If the event is successfully added to the
  * group's notification queue, a reference is taken on event.
  */
-int fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event,
-			      struct fsnotify_event_private_data *priv)
+struct fsnotify_event *fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event,
+						 struct fsnotify_event_private_data *priv,
+						 struct fsnotify_event *(*merge)(struct list_head *,
+										 struct fsnotify_event *))
 {
+	struct fsnotify_event *return_event = NULL;
 	struct fsnotify_event_holder *holder = NULL;
 	struct list_head *list = &group->notification_list;
-	struct fsnotify_event_holder *last_holder;
-	struct fsnotify_event *last_event;
-	int ret = 0;
+
+	pr_debug("%s: group=%p event=%p priv=%p\n", __func__, group, event, priv);
 
 	/*
 	 * There is one fsnotify_event_holder embedded inside each fsnotify_event.
@@ -189,18 +160,40 @@ int fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_even
 alloc_holder:
 		holder = fsnotify_alloc_event_holder();
 		if (!holder)
-			return -ENOMEM;
+			return ERR_PTR(-ENOMEM);
 	}
 
 	mutex_lock(&group->notification_mutex);
 
 	if (group->q_len >= group->max_events) {
-		event = &q_overflow_event;
-		ret = -EOVERFLOW;
+		event = q_overflow_event;
+
+		/*
+		 * we need to return the overflow event
+		 * which means we need a ref
+		 */
+		fsnotify_get_event(event);
+		return_event = event;
+
 		/* sorry, no private data on the overflow event */
 		priv = NULL;
 	}
 
+	if (!list_empty(list) && merge) {
+		struct fsnotify_event *tmp;
+
+		tmp = merge(list, event);
+		if (tmp) {
+			mutex_unlock(&group->notification_mutex);
+
+			if (return_event)
+				fsnotify_put_event(return_event);
+			if (holder != &event->holder)
+				fsnotify_destroy_event_holder(holder);
+			return tmp;
+		}
+	}
+
 	spin_lock(&event->lock);
 
 	if (list_empty(&event->holder.event_list)) {
@@ -212,19 +205,13 @@ alloc_holder:
 		 * event holder was used, go back and get a new one */
 		spin_unlock(&event->lock);
 		mutex_unlock(&group->notification_mutex);
-		goto alloc_holder;
-	}
 
-	if (!list_empty(list)) {
-		last_holder = list_entry(list->prev, struct fsnotify_event_holder, event_list);
-		last_event = last_holder->event;
-		if (event_compare(last_event, event)) {
-			spin_unlock(&event->lock);
-			mutex_unlock(&group->notification_mutex);
-			if (holder != &event->holder)
-				fsnotify_destroy_event_holder(holder);
-			return -EEXIST;
+		if (return_event) {
+			fsnotify_put_event(return_event);
+			return_event = NULL;
 		}
+
+		goto alloc_holder;
 	}
 
 	group->q_len++;
@@ -238,7 +225,7 @@ alloc_holder:
 	mutex_unlock(&group->notification_mutex);
 
 	wake_up(&group->notification_waitq);
-	return ret;
+	return return_event;
 }
 
 /*
@@ -253,6 +240,8 @@ struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group
 
 	BUG_ON(!mutex_is_locked(&group->notification_mutex));
 
+	pr_debug("%s: group=%p\n", __func__, group);
+
 	holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list);
 
 	event = holder->event;
@@ -314,25 +303,82 @@ void fsnotify_flush_notify(struct fsnotify_group *group)
 
 static void initialize_event(struct fsnotify_event *event)
 {
-	event->holder.event = NULL;
 	INIT_LIST_HEAD(&event->holder.event_list);
 	atomic_set(&event->refcnt, 1);
 
 	spin_lock_init(&event->lock);
 
-	event->path.dentry = NULL;
-	event->path.mnt = NULL;
-	event->inode = NULL;
-	event->data_type = FSNOTIFY_EVENT_NONE;
-
 	INIT_LIST_HEAD(&event->private_data_list);
+}
+
+/*
+ * Caller damn well better be holding whatever mutex is protecting the
+ * old_holder->event_list and the new_event must be a clean event which
+ * cannot be found anywhere else in the kernel.
+ */
+int fsnotify_replace_event(struct fsnotify_event_holder *old_holder,
+			   struct fsnotify_event *new_event)
+{
+	struct fsnotify_event *old_event = old_holder->event;
+	struct fsnotify_event_holder *new_holder = &new_event->holder;
 
-	event->to_tell = NULL;
+	enum event_spinlock_class {
+		SPINLOCK_OLD,
+		SPINLOCK_NEW,
+	};
 
-	event->file_name = NULL;
-	event->name_len = 0;
+	pr_debug("%s: old_event=%p new_event=%p\n", __func__, old_event, new_event);
 
-	event->sync_cookie = 0;
+	/*
+	 * if the new_event's embedded holder is in use someone
+	 * screwed up and didn't give us a clean new event.
+	 */
+	BUG_ON(!list_empty(&new_holder->event_list));
+
+	spin_lock_nested(&old_event->lock, SPINLOCK_OLD);
+	spin_lock_nested(&new_event->lock, SPINLOCK_NEW);
+
+	new_holder->event = new_event;
+	list_replace_init(&old_holder->event_list, &new_holder->event_list);
+
+	spin_unlock(&new_event->lock);
+	spin_unlock(&old_event->lock);
+
+	/* event == holder means we are referenced through the in event holder */
+	if (old_holder != &old_event->holder)
+		fsnotify_destroy_event_holder(old_holder);
+
+	fsnotify_get_event(new_event); /* on the list take reference */
+	fsnotify_put_event(old_event); /* off the list, drop reference */
+
+	return 0;
+}
+
+struct fsnotify_event *fsnotify_clone_event(struct fsnotify_event *old_event)
+{
+	struct fsnotify_event *event;
+
+	event = kmem_cache_alloc(fsnotify_event_cachep, GFP_KERNEL);
+	if (!event)
+		return NULL;
+
+	pr_debug("%s: old_event=%p new_event=%p\n", __func__, old_event, event);
+
+	memcpy(event, old_event, sizeof(*event));
+	initialize_event(event);
+
+	if (event->name_len) {
+		event->file_name = kstrdup(old_event->file_name, GFP_KERNEL);
+		if (!event->file_name) {
+			kmem_cache_free(fsnotify_event_cachep, event);
+			return NULL;
+		}
+	}
+	event->tgid = get_pid(old_event->tgid);
+	if (event->data_type == FSNOTIFY_EVENT_PATH)
+		path_get(&event->path);
+
+	return event;
 }
 
 /*
@@ -348,15 +394,18 @@ static void initialize_event(struct fsnotify_event *event)
  * @name the filename, if available
  */
 struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, void *data,
-					     int data_type, const char *name, u32 cookie,
-					     gfp_t gfp)
+					     int data_type, const unsigned char *name,
+					     u32 cookie, gfp_t gfp)
 {
 	struct fsnotify_event *event;
 
-	event = kmem_cache_alloc(fsnotify_event_cachep, gfp);
+	event = kmem_cache_zalloc(fsnotify_event_cachep, gfp);
 	if (!event)
 		return NULL;
 
+	pr_debug("%s: event=%p to_tell=%p mask=%x data=%p data_type=%d\n",
+		 __func__, event, to_tell, mask, data, data_type);
+
 	initialize_event(event);
 
 	if (name) {
@@ -368,30 +417,21 @@ struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask,
 		event->name_len = strlen(event->file_name);
 	}
 
+	event->tgid = get_pid(task_tgid(current));
 	event->sync_cookie = cookie;
 	event->to_tell = to_tell;
+	event->data_type = data_type;
 
 	switch (data_type) {
-	case FSNOTIFY_EVENT_FILE: {
-		struct file *file = data;
-		struct path *path = &file->f_path;
-		event->path.dentry = path->dentry;
-		event->path.mnt = path->mnt;
-		path_get(&event->path);
-		event->data_type = FSNOTIFY_EVENT_PATH;
-		break;
-	}
 	case FSNOTIFY_EVENT_PATH: {
 		struct path *path = data;
 		event->path.dentry = path->dentry;
 		event->path.mnt = path->mnt;
 		path_get(&event->path);
-		event->data_type = FSNOTIFY_EVENT_PATH;
 		break;
 	}
 	case FSNOTIFY_EVENT_INODE:
 		event->inode = data;
-		event->data_type = FSNOTIFY_EVENT_INODE;
 		break;
 	case FSNOTIFY_EVENT_NONE:
 		event->inode = NULL;
@@ -412,8 +452,11 @@ __init int fsnotify_notification_init(void)
 	fsnotify_event_cachep = KMEM_CACHE(fsnotify_event, SLAB_PANIC);
 	fsnotify_event_holder_cachep = KMEM_CACHE(fsnotify_event_holder, SLAB_PANIC);
 
-	initialize_event(&q_overflow_event);
-	q_overflow_event.mask = FS_Q_OVERFLOW;
+	q_overflow_event = fsnotify_create_event(NULL, FS_Q_OVERFLOW, NULL,
+						 FSNOTIFY_EVENT_NONE, NULL, 0,
+						 GFP_KERNEL);
+	if (!q_overflow_event)
+		panic("unable to allocate fsnotify q_overflow_event\n");
 
 	return 0;
 }
diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c
new file mode 100644
index 00000000000..85eebff6d0d
--- /dev/null
+++ b/fs/notify/vfsmount_mark.c
@@ -0,0 +1,191 @@
+/*
+ *  Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/mutex.h>
+#include <linux/spinlock.h>
+#include <linux/writeback.h> /* for inode_lock */
+
+#include <asm/atomic.h>
+
+#include <linux/fsnotify_backend.h>
+#include "fsnotify.h"
+
+void fsnotify_clear_marks_by_mount(struct vfsmount *mnt)
+{
+	struct fsnotify_mark *mark, *lmark;
+	struct hlist_node *pos, *n;
+	LIST_HEAD(free_list);
+
+	spin_lock(&mnt->mnt_root->d_lock);
+	hlist_for_each_entry_safe(mark, pos, n, &mnt->mnt_fsnotify_marks, m.m_list) {
+		list_add(&mark->m.free_m_list, &free_list);
+		hlist_del_init_rcu(&mark->m.m_list);
+		fsnotify_get_mark(mark);
+	}
+	spin_unlock(&mnt->mnt_root->d_lock);
+
+	list_for_each_entry_safe(mark, lmark, &free_list, m.free_m_list) {
+		fsnotify_destroy_mark(mark);
+		fsnotify_put_mark(mark);
+	}
+}
+
+void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group)
+{
+	fsnotify_clear_marks_by_group_flags(group, FSNOTIFY_MARK_FLAG_VFSMOUNT);
+}
+
+/*
+ * Recalculate the mask of events relevant to a given vfsmount locked.
+ */
+static void fsnotify_recalc_vfsmount_mask_locked(struct vfsmount *mnt)
+{
+	struct fsnotify_mark *mark;
+	struct hlist_node *pos;
+	__u32 new_mask = 0;
+
+	assert_spin_locked(&mnt->mnt_root->d_lock);
+
+	hlist_for_each_entry(mark, pos, &mnt->mnt_fsnotify_marks, m.m_list)
+		new_mask |= mark->mask;
+	mnt->mnt_fsnotify_mask = new_mask;
+}
+
+/*
+ * Recalculate the mnt->mnt_fsnotify_mask, or the mask of all FS_* event types
+ * any notifier is interested in hearing for this mount point
+ */
+void fsnotify_recalc_vfsmount_mask(struct vfsmount *mnt)
+{
+	spin_lock(&mnt->mnt_root->d_lock);
+	fsnotify_recalc_vfsmount_mask_locked(mnt);
+	spin_unlock(&mnt->mnt_root->d_lock);
+}
+
+void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark)
+{
+	struct vfsmount *mnt = mark->m.mnt;
+
+	assert_spin_locked(&mark->lock);
+	assert_spin_locked(&mark->group->mark_lock);
+
+	spin_lock(&mnt->mnt_root->d_lock);
+
+	hlist_del_init_rcu(&mark->m.m_list);
+	mark->m.mnt = NULL;
+
+	fsnotify_recalc_vfsmount_mask_locked(mnt);
+
+	spin_unlock(&mnt->mnt_root->d_lock);
+}
+
+static struct fsnotify_mark *fsnotify_find_vfsmount_mark_locked(struct fsnotify_group *group,
+								struct vfsmount *mnt)
+{
+	struct fsnotify_mark *mark;
+	struct hlist_node *pos;
+
+	assert_spin_locked(&mnt->mnt_root->d_lock);
+
+	hlist_for_each_entry(mark, pos, &mnt->mnt_fsnotify_marks, m.m_list) {
+		if (mark->group == group) {
+			fsnotify_get_mark(mark);
+			return mark;
+		}
+	}
+	return NULL;
+}
+
+/*
+ * given a group and vfsmount, find the mark associated with that combination.
+ * if found take a reference to that mark and return it, else return NULL
+ */
+struct fsnotify_mark *fsnotify_find_vfsmount_mark(struct fsnotify_group *group,
+						  struct vfsmount *mnt)
+{
+	struct fsnotify_mark *mark;
+
+	spin_lock(&mnt->mnt_root->d_lock);
+	mark = fsnotify_find_vfsmount_mark_locked(group, mnt);
+	spin_unlock(&mnt->mnt_root->d_lock);
+
+	return mark;
+}
+
+/*
+ * Attach an initialized mark to a given group and vfsmount.
+ * These marks may be used for the fsnotify backend to determine which
+ * event types should be delivered to which groups.
+ */
+int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
+			       struct fsnotify_group *group, struct vfsmount *mnt,
+			       int allow_dups)
+{
+	struct fsnotify_mark *lmark;
+	struct hlist_node *node, *last = NULL;
+	int ret = 0;
+
+	mark->flags |= FSNOTIFY_MARK_FLAG_VFSMOUNT;
+
+	assert_spin_locked(&mark->lock);
+	assert_spin_locked(&group->mark_lock);
+
+	spin_lock(&mnt->mnt_root->d_lock);
+
+	mark->m.mnt = mnt;
+
+	/* is mark the first mark? */
+	if (hlist_empty(&mnt->mnt_fsnotify_marks)) {
+		hlist_add_head_rcu(&mark->m.m_list, &mnt->mnt_fsnotify_marks);
+		goto out;
+	}
+
+	/* should mark be in the middle of the current list? */
+	hlist_for_each_entry(lmark, node, &mnt->mnt_fsnotify_marks, m.m_list) {
+		last = node;
+
+		if ((lmark->group == group) && !allow_dups) {
+			ret = -EEXIST;
+			goto out;
+		}
+
+		if (mark->group->priority < lmark->group->priority)
+			continue;
+
+		if ((mark->group->priority == lmark->group->priority) &&
+		    (mark->group < lmark->group))
+			continue;
+
+		hlist_add_before_rcu(&mark->m.m_list, &lmark->m.m_list);
+		goto out;
+	}
+
+	BUG_ON(last == NULL);
+	/* mark should be the last entry.  last is the current last entry */
+	hlist_add_after_rcu(last, &mark->m.m_list);
+out:
+	fsnotify_recalc_vfsmount_mask_locked(mnt);
+	spin_unlock(&mnt->mnt_root->d_lock);
+
+	return ret;
+}
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 4b57fb1eac2..93622b175fc 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -2238,7 +2238,7 @@ void ntfs_clear_extent_inode(ntfs_inode *ni)
 }
 
 /**
- * ntfs_clear_big_inode - clean up the ntfs specific part of an inode
+ * ntfs_evict_big_inode - clean up the ntfs specific part of an inode
  * @vi:		vfs inode pending annihilation
  *
  * When the VFS is going to remove an inode from memory, ntfs_clear_big_inode()
@@ -2247,10 +2247,13 @@ void ntfs_clear_extent_inode(ntfs_inode *ni)
  *
  * If the MFT record is dirty, we commit it before doing anything else.
  */
-void ntfs_clear_big_inode(struct inode *vi)
+void ntfs_evict_big_inode(struct inode *vi)
 {
 	ntfs_inode *ni = NTFS_I(vi);
 
+	truncate_inode_pages(&vi->i_data, 0);
+	end_writeback(vi);
+
 #ifdef NTFS_RW
 	if (NInoDirty(ni)) {
 		bool was_bad = (is_bad_inode(vi));
@@ -2879,9 +2882,6 @@ void ntfs_truncate_vfs(struct inode *vi) {
  *
  * Called with ->i_mutex held.  For the ATTR_SIZE (i.e. ->truncate) case, also
  * called with ->i_alloc_sem held for writing.
- *
- * Basically this is a copy of generic notify_change() and inode_setattr()
- * functionality, except we intercept and abort changes in i_size.
  */
 int ntfs_setattr(struct dentry *dentry, struct iattr *attr)
 {
diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h
index 9a113544605..2dabf813456 100644
--- a/fs/ntfs/inode.h
+++ b/fs/ntfs/inode.h
@@ -279,7 +279,7 @@ extern struct inode *ntfs_index_iget(struct inode *base_vi, ntfschar *name,
 
 extern struct inode *ntfs_alloc_big_inode(struct super_block *sb);
 extern void ntfs_destroy_big_inode(struct inode *inode);
-extern void ntfs_clear_big_inode(struct inode *vi);
+extern void ntfs_evict_big_inode(struct inode *vi);
 
 extern void __ntfs_init_inode(struct super_block *sb, ntfs_inode *ni);
 
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 0de1db6cddb..a30ecacc01f 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -30,7 +30,6 @@
 #include <linux/buffer_head.h>
 #include <linux/vfs.h>
 #include <linux/moduleparam.h>
-#include <linux/smp_lock.h>
 #include <linux/bitmap.h>
 
 #include "sysctl.h"
@@ -445,7 +444,6 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
 
 	ntfs_debug("Entering with remount options string: %s", opt);
 
-	lock_kernel();
 #ifndef NTFS_RW
 	/* For read-only compiled driver, enforce read-only flag. */
 	*flags |= MS_RDONLY;
@@ -469,18 +467,15 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
 		if (NVolErrors(vol)) {
 			ntfs_error(sb, "Volume has errors and is read-only%s",
 					es);
-			unlock_kernel();
 			return -EROFS;
 		}
 		if (vol->vol_flags & VOLUME_IS_DIRTY) {
 			ntfs_error(sb, "Volume is dirty and read-only%s", es);
-			unlock_kernel();
 			return -EROFS;
 		}
 		if (vol->vol_flags & VOLUME_MODIFIED_BY_CHKDSK) {
 			ntfs_error(sb, "Volume has been modified by chkdsk "
 					"and is read-only%s", es);
-			unlock_kernel();
 			return -EROFS;
 		}
 		if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) {
@@ -488,13 +483,11 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
 					"(0x%x) and is read-only%s",
 					(unsigned)le16_to_cpu(vol->vol_flags),
 					es);
-			unlock_kernel();
 			return -EROFS;
 		}
 		if (ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) {
 			ntfs_error(sb, "Failed to set dirty bit in volume "
 					"information flags%s", es);
-			unlock_kernel();
 			return -EROFS;
 		}
 #if 0
@@ -514,21 +507,18 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
 			ntfs_error(sb, "Failed to empty journal $LogFile%s",
 					es);
 			NVolSetErrors(vol);
-			unlock_kernel();
 			return -EROFS;
 		}
 		if (!ntfs_mark_quotas_out_of_date(vol)) {
 			ntfs_error(sb, "Failed to mark quotas out of date%s",
 					es);
 			NVolSetErrors(vol);
-			unlock_kernel();
 			return -EROFS;
 		}
 		if (!ntfs_stamp_usnjrnl(vol)) {
 			ntfs_error(sb, "Failed to stamp transation log "
 					"($UsnJrnl)%s", es);
 			NVolSetErrors(vol);
-			unlock_kernel();
 			return -EROFS;
 		}
 	} else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) {
@@ -544,11 +534,9 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
 
 	// TODO: Deal with *flags.
 
-	if (!parse_options(vol, opt)) {
-		unlock_kernel();
+	if (!parse_options(vol, opt))
 		return -EINVAL;
-	}
-	unlock_kernel();
+
 	ntfs_debug("Done.");
 	return 0;
 }
@@ -2261,8 +2249,6 @@ static void ntfs_put_super(struct super_block *sb)
 
 	ntfs_debug("Entering.");
 
-	lock_kernel();
-
 #ifdef NTFS_RW
 	/*
 	 * Commit all inodes while they are still open in case some of them
@@ -2433,8 +2419,6 @@ static void ntfs_put_super(struct super_block *sb)
 
 	sb->s_fs_info = NULL;
 	kfree(vol);
-
-	unlock_kernel();
 }
 
 /**
@@ -2700,7 +2684,7 @@ static const struct super_operations ntfs_sops = {
 	.put_super	= ntfs_put_super,	/* Syscall: umount. */
 	.statfs		= ntfs_statfs,		/* Syscall: statfs */
 	.remount_fs	= ntfs_remount,		/* Syscall: mount -o remount. */
-	.clear_inode	= ntfs_clear_big_inode,	/* VFS: Called when an inode is
+	.evict_inode	= ntfs_evict_big_inode,	/* VFS: Called when an inode is
 						   removed from memory. */
 	//.umount_begin	= NULL,			/* Forced umount. */
 	.show_options	= ntfs_show_options,	/* Show mount options in
@@ -2772,8 +2756,6 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
 	init_rwsem(&vol->mftbmp_lock);
 	init_rwsem(&vol->lcnbmp_lock);
 
-	unlock_kernel();
-
 	/* By default, enable sparse support. */
 	NVolSetSparseEnabled(vol);
 
@@ -2929,8 +2911,8 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
 		goto unl_upcase_iput_tmp_ino_err_out_now;
 	}
 	if ((sb->s_root = d_alloc_root(vol->root_ino))) {
-		/* We increment i_count simulating an ntfs_iget(). */
-		atomic_inc(&vol->root_ino->i_count);
+		/* We grab a reference, simulating an ntfs_iget(). */
+		ihold(vol->root_ino);
 		ntfs_debug("Exiting, status successful.");
 		/* Release the default upcase if it has no users. */
 		mutex_lock(&ntfs_lock);
@@ -2940,7 +2922,6 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
 		}
 		mutex_unlock(&ntfs_lock);
 		sb->s_export_op = &ntfs_export_ops;
-		lock_kernel();
 		lockdep_on();
 		return 0;
 	}
@@ -3040,24 +3021,8 @@ iput_tmp_ino_err_out_now:
 	if (vol->mft_ino && vol->mft_ino != tmp_ino)
 		iput(vol->mft_ino);
 	vol->mft_ino = NULL;
-	/*
-	 * This is needed to get ntfs_clear_extent_inode() called for each
-	 * inode we have ever called ntfs_iget()/iput() on, otherwise we A)
-	 * leak resources and B) a subsequent mount fails automatically due to
-	 * ntfs_iget() never calling down into our ntfs_read_locked_inode()
-	 * method again... FIXME: Do we need to do this twice now because of
-	 * attribute inodes? I think not, so leave as is for now... (AIA)
-	 */
-	if (invalidate_inodes(sb)) {
-		ntfs_error(sb, "Busy inodes left. This is most likely a NTFS "
-				"driver bug.");
-		/* Copied from fs/super.c. I just love this message. (-; */
-		printk("NTFS: Busy inodes after umount. Self-destruct in 5 "
-				"seconds.  Have a nice day...\n");
-	}
 	/* Errors at this stage are irrelevant. */
 err_out_now:
-	lock_kernel();
 	sb->s_fs_info = NULL;
 	kfree(vol);
 	ntfs_debug("Failed, returning -EINVAL.");
@@ -3094,17 +3059,16 @@ struct kmem_cache *ntfs_index_ctx_cache;
 /* Driver wide mutex. */
 DEFINE_MUTEX(ntfs_lock);
 
-static int ntfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *ntfs_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, ntfs_fill_super,
-			   mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, ntfs_fill_super);
 }
 
 static struct file_system_type ntfs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "ntfs",
-	.get_sb		= ntfs_get_sb,
+	.mount		= ntfs_mount,
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index da702294d7e..391915093fe 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -209,7 +209,10 @@ static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head *di_bh,
 	}
 
 	inode->i_mode = new_mode;
+	inode->i_ctime = CURRENT_TIME;
 	di->i_mode = cpu_to_le16(inode->i_mode);
+	di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
+	di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
 
 	ocfs2_journal_dirty(handle, di_bh);
 
@@ -290,12 +293,30 @@ static int ocfs2_set_acl(handle_t *handle,
 
 int ocfs2_check_acl(struct inode *inode, int mask)
 {
-	struct posix_acl *acl = ocfs2_get_acl(inode, ACL_TYPE_ACCESS);
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct buffer_head *di_bh = NULL;
+	struct posix_acl *acl;
+	int ret = -EAGAIN;
 
-	if (IS_ERR(acl))
+	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+		return ret;
+
+	ret = ocfs2_read_inode_block(inode, &di_bh);
+	if (ret < 0) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	acl = ocfs2_get_acl_nolock(inode, ACL_TYPE_ACCESS, di_bh);
+
+	brelse(di_bh);
+
+	if (IS_ERR(acl)) {
+		mlog_errno(PTR_ERR(acl));
 		return PTR_ERR(acl);
+	}
 	if (acl) {
-		int ret = posix_acl_permission(inode, acl, mask);
+		ret = posix_acl_permission(inode, acl, mask);
 		posix_acl_release(acl);
 		return ret;
 	}
@@ -344,7 +365,7 @@ int ocfs2_init_acl(handle_t *handle,
 {
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct posix_acl *acl = NULL;
-	int ret = 0;
+	int ret = 0, ret2;
 	mode_t mode;
 
 	if (!S_ISLNK(inode->i_mode)) {
@@ -381,7 +402,12 @@ int ocfs2_init_acl(handle_t *handle,
 		mode = inode->i_mode;
 		ret = posix_acl_create_masq(clone, &mode);
 		if (ret >= 0) {
-			ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode);
+			ret2 = ocfs2_acl_set_mode(inode, di_bh, handle, mode);
+			if (ret2) {
+				mlog_errno(ret2);
+				ret = ret2;
+				goto cleanup;
+			}
 			if (ret > 0) {
 				ret = ocfs2_set_acl(handle, inode,
 						    di_bh, ACL_TYPE_ACCESS,
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 215e12ce1d8..592fae5007d 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -6672,7 +6672,7 @@ int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end,
 	last_page_bytes = PAGE_ALIGN(end);
 	index = start >> PAGE_CACHE_SHIFT;
 	do {
-		pages[numpages] = grab_cache_page(mapping, index);
+		pages[numpages] = find_or_create_page(mapping, index, GFP_NOFS);
 		if (!pages[numpages]) {
 			ret = -ENOMEM;
 			mlog_errno(ret);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 356e976772b..f1e962cb3b7 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -165,7 +165,7 @@ int ocfs2_get_block(struct inode *inode, sector_t iblock,
 	 * ocfs2 never allocates in this function - the only time we
 	 * need to use BH_New is when we're extending i_size on a file
 	 * system which doesn't support holes, in which case BH_New
-	 * allows block_prepare_write() to zero.
+	 * allows __block_write_begin() to zero.
 	 *
 	 * If we see this on a sparse file system, then a truncate has
 	 * raced us and removed the cluster. In this case, we clear
@@ -407,21 +407,6 @@ static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
 	return ret;
 }
 
-/*
- * This is called from ocfs2_write_zero_page() which has handled it's
- * own cluster locking and has ensured allocation exists for those
- * blocks to be written.
- */
-int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
-			       unsigned from, unsigned to)
-{
-	int ret;
-
-	ret = block_prepare_write(page, from, to, ocfs2_get_block);
-
-	return ret;
-}
-
 /* Taken from ext3. We don't necessarily need the full blown
  * functionality yet, but IMHO it's better to cut and paste the whole
  * thing so we can avoid introducing our own bugs (and easily pick up
@@ -578,7 +563,9 @@ bail:
 static void ocfs2_dio_end_io(struct kiocb *iocb,
 			     loff_t offset,
 			     ssize_t bytes,
-			     void *private)
+			     void *private,
+			     int ret,
+			     bool is_async)
 {
 	struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
 	int level;
@@ -592,6 +579,9 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
 	if (!level)
 		up_read(&inode->i_alloc_sem);
 	ocfs2_rw_unlock(inode, level);
+
+	if (is_async)
+		aio_complete(iocb, ret, 0);
 }
 
 /*
@@ -638,11 +628,10 @@ static ssize_t ocfs2_direct_IO(int rw,
 	if (i_size_read(inode) <= offset)
 		return 0;
 
-	ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
-					    inode->i_sb->s_bdev, iov, offset,
-					    nr_segs,
-					    ocfs2_direct_IO_get_blocks,
-					    ocfs2_dio_end_io);
+	ret = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
+				   iov, offset, nr_segs,
+				   ocfs2_direct_IO_get_blocks,
+				   ocfs2_dio_end_io, NULL, 0);
 
 	mlog_exit(ret);
 	return ret;
@@ -728,7 +717,7 @@ static int ocfs2_should_read_blk(struct inode *inode, struct page *page,
 }
 
 /*
- * Some of this taken from block_prepare_write(). We already have our
+ * Some of this taken from __block_write_begin(). We already have our
  * mapping by now though, and the entire write will be allocating or
  * it won't, so not much need to use BH_New.
  *
@@ -879,8 +868,8 @@ struct ocfs2_write_ctxt {
 	 * out in so that future reads from that region will get
 	 * zero's.
 	 */
-	struct page			*w_pages[OCFS2_MAX_CTXT_PAGES];
 	unsigned int			w_num_pages;
+	struct page			*w_pages[OCFS2_MAX_CTXT_PAGES];
 	struct page			*w_target_page;
 
 	/*
@@ -1638,7 +1627,8 @@ static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh,
 	return ret;
 }
 
-int ocfs2_write_begin_nolock(struct address_space *mapping,
+int ocfs2_write_begin_nolock(struct file *filp,
+			     struct address_space *mapping,
 			     loff_t pos, unsigned len, unsigned flags,
 			     struct page **pagep, void **fsdata,
 			     struct buffer_head *di_bh, struct page *mmap_page)
@@ -1688,7 +1678,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 		mlog_errno(ret);
 		goto out;
 	} else if (ret == 1) {
-		ret = ocfs2_refcount_cow(inode, di_bh,
+		ret = ocfs2_refcount_cow(inode, filp, di_bh,
 					 wc->w_cpos, wc->w_clen, UINT_MAX);
 		if (ret) {
 			mlog_errno(ret);
@@ -1850,7 +1840,7 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping,
 	 */
 	down_write(&OCFS2_I(inode)->ip_alloc_sem);
 
-	ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep,
+	ret = ocfs2_write_begin_nolock(file, mapping, pos, len, flags, pagep,
 				       fsdata, di_bh, NULL);
 	if (ret) {
 		mlog_errno(ret);
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index c48e93ffc51..76bfdfda691 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -22,9 +22,6 @@
 #ifndef OCFS2_AOPS_H
 #define OCFS2_AOPS_H
 
-int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
-			       unsigned from, unsigned to);
-
 handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
 							 struct page *page,
 							 unsigned from,
@@ -48,7 +45,8 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
 			   loff_t pos, unsigned len, unsigned copied,
 			   struct page *page, void *fsdata);
 
-int ocfs2_write_begin_nolock(struct address_space *mapping,
+int ocfs2_write_begin_nolock(struct file *filp,
+			     struct address_space *mapping,
 			     loff_t pos, unsigned len, unsigned flags,
 			     struct page **pagep, void **fsdata,
 			     struct buffer_head *di_bh, struct page *mmap_page);
diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c
index ec6d1233959..c7ee03c2222 100644
--- a/fs/ocfs2/blockcheck.c
+++ b/fs/ocfs2/blockcheck.c
@@ -439,7 +439,7 @@ int ocfs2_block_check_validate(void *data, size_t blocksize,
 
 	ocfs2_blockcheck_inc_failure(stats);
 	mlog(ML_ERROR,
-	     "CRC32 failed: stored: %u, computed %u.  Applying ECC.\n",
+	     "CRC32 failed: stored: 0x%x, computed 0x%x. Applying ECC.\n",
 	     (unsigned int)check.bc_crc32e, (unsigned int)crc);
 
 	/* Ok, try ECC fixups */
@@ -453,7 +453,7 @@ int ocfs2_block_check_validate(void *data, size_t blocksize,
 		goto out;
 	}
 
-	mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n",
+	mlog(ML_ERROR, "Fixed CRC32 failed: stored: 0x%x, computed 0x%x\n",
 	     (unsigned int)check.bc_crc32e, (unsigned int)crc);
 
 	rc = -EIO;
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 41d5f1f92d5..52c7557f3e2 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -62,10 +62,51 @@ static unsigned long o2hb_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
 static LIST_HEAD(o2hb_node_events);
 static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue);
 
+/*
+ * In global heartbeat, we maintain a series of region bitmaps.
+ * 	- o2hb_region_bitmap allows us to limit the region number to max region.
+ * 	- o2hb_live_region_bitmap tracks live regions (seen steady iterations).
+ * 	- o2hb_quorum_region_bitmap tracks live regions that have seen all nodes
+ * 		heartbeat on it.
+ * 	- o2hb_failed_region_bitmap tracks the regions that have seen io timeouts.
+ */
+static unsigned long o2hb_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
+static unsigned long o2hb_live_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
+static unsigned long o2hb_quorum_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
+static unsigned long o2hb_failed_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
+
+#define O2HB_DB_TYPE_LIVENODES		0
+#define O2HB_DB_TYPE_LIVEREGIONS	1
+#define O2HB_DB_TYPE_QUORUMREGIONS	2
+#define O2HB_DB_TYPE_FAILEDREGIONS	3
+#define O2HB_DB_TYPE_REGION_LIVENODES	4
+#define O2HB_DB_TYPE_REGION_NUMBER	5
+#define O2HB_DB_TYPE_REGION_ELAPSED_TIME	6
+struct o2hb_debug_buf {
+	int db_type;
+	int db_size;
+	int db_len;
+	void *db_data;
+};
+
+static struct o2hb_debug_buf *o2hb_db_livenodes;
+static struct o2hb_debug_buf *o2hb_db_liveregions;
+static struct o2hb_debug_buf *o2hb_db_quorumregions;
+static struct o2hb_debug_buf *o2hb_db_failedregions;
+
 #define O2HB_DEBUG_DIR			"o2hb"
 #define O2HB_DEBUG_LIVENODES		"livenodes"
+#define O2HB_DEBUG_LIVEREGIONS		"live_regions"
+#define O2HB_DEBUG_QUORUMREGIONS	"quorum_regions"
+#define O2HB_DEBUG_FAILEDREGIONS	"failed_regions"
+#define O2HB_DEBUG_REGION_NUMBER	"num"
+#define O2HB_DEBUG_REGION_ELAPSED_TIME	"elapsed_time_in_ms"
+
 static struct dentry *o2hb_debug_dir;
 static struct dentry *o2hb_debug_livenodes;
+static struct dentry *o2hb_debug_liveregions;
+static struct dentry *o2hb_debug_quorumregions;
+static struct dentry *o2hb_debug_failedregions;
 
 static LIST_HEAD(o2hb_all_regions);
 
@@ -77,7 +118,19 @@ static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type);
 
 #define O2HB_DEFAULT_BLOCK_BITS       9
 
+enum o2hb_heartbeat_modes {
+	O2HB_HEARTBEAT_LOCAL		= 0,
+	O2HB_HEARTBEAT_GLOBAL,
+	O2HB_HEARTBEAT_NUM_MODES,
+};
+
+char *o2hb_heartbeat_mode_desc[O2HB_HEARTBEAT_NUM_MODES] = {
+		"local",	/* O2HB_HEARTBEAT_LOCAL */
+		"global",	/* O2HB_HEARTBEAT_GLOBAL */
+};
+
 unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD;
+unsigned int o2hb_heartbeat_mode = O2HB_HEARTBEAT_LOCAL;
 
 /* Only sets a new threshold if there are no active regions.
  *
@@ -94,6 +147,22 @@ static void o2hb_dead_threshold_set(unsigned int threshold)
 	}
 }
 
+static int o2hb_global_hearbeat_mode_set(unsigned int hb_mode)
+{
+	int ret = -1;
+
+	if (hb_mode < O2HB_HEARTBEAT_NUM_MODES) {
+		spin_lock(&o2hb_live_lock);
+		if (list_empty(&o2hb_all_regions)) {
+			o2hb_heartbeat_mode = hb_mode;
+			ret = 0;
+		}
+		spin_unlock(&o2hb_live_lock);
+	}
+
+	return ret;
+}
+
 struct o2hb_node_event {
 	struct list_head        hn_item;
 	enum o2hb_callback_type hn_event_type;
@@ -135,6 +204,18 @@ struct o2hb_region {
 	struct block_device	*hr_bdev;
 	struct o2hb_disk_slot	*hr_slots;
 
+	/* live node map of this region */
+	unsigned long		hr_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	unsigned int		hr_region_num;
+
+	struct dentry		*hr_debug_dir;
+	struct dentry		*hr_debug_livenodes;
+	struct dentry		*hr_debug_regnum;
+	struct dentry		*hr_debug_elapsed_time;
+	struct o2hb_debug_buf	*hr_db_livenodes;
+	struct o2hb_debug_buf	*hr_db_regnum;
+	struct o2hb_debug_buf	*hr_db_elapsed_time;
+
 	/* let the person setting up hb wait for it to return until it
 	 * has reached a 'steady' state.  This will be fixed when we have
 	 * a more complete api that doesn't lead to this sort of fragility. */
@@ -163,8 +244,19 @@ struct o2hb_bio_wait_ctxt {
 	int               wc_error;
 };
 
+static int o2hb_pop_count(void *map, int count)
+{
+	int i = -1, pop = 0;
+
+	while ((i = find_next_bit(map, count, i + 1)) < count)
+		pop++;
+	return pop;
+}
+
 static void o2hb_write_timeout(struct work_struct *work)
 {
+	int failed, quorum;
+	unsigned long flags;
 	struct o2hb_region *reg =
 		container_of(work, struct o2hb_region,
 			     hr_write_timeout_work.work);
@@ -172,6 +264,28 @@ static void o2hb_write_timeout(struct work_struct *work)
 	mlog(ML_ERROR, "Heartbeat write timeout to device %s after %u "
 	     "milliseconds\n", reg->hr_dev_name,
 	     jiffies_to_msecs(jiffies - reg->hr_last_timeout_start));
+
+	if (o2hb_global_heartbeat_active()) {
+		spin_lock_irqsave(&o2hb_live_lock, flags);
+		if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
+			set_bit(reg->hr_region_num, o2hb_failed_region_bitmap);
+		failed = o2hb_pop_count(&o2hb_failed_region_bitmap,
+					O2NM_MAX_REGIONS);
+		quorum = o2hb_pop_count(&o2hb_quorum_region_bitmap,
+					O2NM_MAX_REGIONS);
+		spin_unlock_irqrestore(&o2hb_live_lock, flags);
+
+		mlog(ML_HEARTBEAT, "Number of regions %d, failed regions %d\n",
+		     quorum, failed);
+
+		/*
+		 * Fence if the number of failed regions >= half the number
+		 * of  quorum regions
+		 */
+		if ((failed << 1) < quorum)
+			return;
+	}
+
 	o2quo_disk_timeout();
 }
 
@@ -180,6 +294,11 @@ static void o2hb_arm_write_timeout(struct o2hb_region *reg)
 	mlog(ML_HEARTBEAT, "Queue write timeout for %u ms\n",
 	     O2HB_MAX_WRITE_TIMEOUT_MS);
 
+	if (o2hb_global_heartbeat_active()) {
+		spin_lock(&o2hb_live_lock);
+		clear_bit(reg->hr_region_num, o2hb_failed_region_bitmap);
+		spin_unlock(&o2hb_live_lock);
+	}
 	cancel_delayed_work(&reg->hr_write_timeout_work);
 	reg->hr_last_timeout_start = jiffies;
 	schedule_delayed_work(&reg->hr_write_timeout_work,
@@ -513,6 +632,8 @@ static void o2hb_queue_node_event(struct o2hb_node_event *event,
 {
 	assert_spin_locked(&o2hb_live_lock);
 
+	BUG_ON((!node) && (type != O2HB_NODE_DOWN_CB));
+
 	event->hn_event_type = type;
 	event->hn_node = node;
 	event->hn_node_num = node_num;
@@ -554,6 +675,35 @@ static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot)
 	o2nm_node_put(node);
 }
 
+static void o2hb_set_quorum_device(struct o2hb_region *reg,
+				   struct o2hb_disk_slot *slot)
+{
+	assert_spin_locked(&o2hb_live_lock);
+
+	if (!o2hb_global_heartbeat_active())
+		return;
+
+	if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
+		return;
+
+	/*
+	 * A region can be added to the quorum only when it sees all
+	 * live nodes heartbeat on it. In other words, the region has been
+	 * added to all nodes.
+	 */
+	if (memcmp(reg->hr_live_node_bitmap, o2hb_live_node_bitmap,
+		   sizeof(o2hb_live_node_bitmap)))
+		return;
+
+	if (slot->ds_changed_samples < O2HB_LIVE_THRESHOLD)
+		return;
+
+	printk(KERN_NOTICE "o2hb: Region %s is now a quorum device\n",
+	       config_item_name(&reg->hr_item));
+
+	set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
+}
+
 static int o2hb_check_slot(struct o2hb_region *reg,
 			   struct o2hb_disk_slot *slot)
 {
@@ -565,14 +715,22 @@ static int o2hb_check_slot(struct o2hb_region *reg,
 	u64 cputime;
 	unsigned int dead_ms = o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS;
 	unsigned int slot_dead_ms;
+	int tmp;
 
 	memcpy(hb_block, slot->ds_raw_block, reg->hr_block_bytes);
 
-	/* Is this correct? Do we assume that the node doesn't exist
-	 * if we're not configured for him? */
+	/*
+	 * If a node is no longer configured but is still in the livemap, we
+	 * may need to clear that bit from the livemap.
+	 */
 	node = o2nm_get_node_by_num(slot->ds_node_num);
-	if (!node)
-		return 0;
+	if (!node) {
+		spin_lock(&o2hb_live_lock);
+		tmp = test_bit(slot->ds_node_num, o2hb_live_node_bitmap);
+		spin_unlock(&o2hb_live_lock);
+		if (!tmp)
+			return 0;
+	}
 
 	if (!o2hb_verify_crc(reg, hb_block)) {
 		/* all paths from here will drop o2hb_live_lock for
@@ -639,8 +797,12 @@ fire_callbacks:
 		mlog(ML_HEARTBEAT, "Node %d (id 0x%llx) joined my region\n",
 		     slot->ds_node_num, (long long)slot->ds_last_generation);
 
+		set_bit(slot->ds_node_num, reg->hr_live_node_bitmap);
+
 		/* first on the list generates a callback */
 		if (list_empty(&o2hb_live_slots[slot->ds_node_num])) {
+			mlog(ML_HEARTBEAT, "o2hb: Add node %d to live nodes "
+			     "bitmap\n", slot->ds_node_num);
 			set_bit(slot->ds_node_num, o2hb_live_node_bitmap);
 
 			o2hb_queue_node_event(&event, O2HB_NODE_UP_CB, node,
@@ -684,13 +846,18 @@ fire_callbacks:
 		mlog(ML_HEARTBEAT, "Node %d left my region\n",
 		     slot->ds_node_num);
 
+		clear_bit(slot->ds_node_num, reg->hr_live_node_bitmap);
+
 		/* last off the live_slot generates a callback */
 		list_del_init(&slot->ds_live_item);
 		if (list_empty(&o2hb_live_slots[slot->ds_node_num])) {
+			mlog(ML_HEARTBEAT, "o2hb: Remove node %d from live "
+			     "nodes bitmap\n", slot->ds_node_num);
 			clear_bit(slot->ds_node_num, o2hb_live_node_bitmap);
 
-			o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, node,
-					      slot->ds_node_num);
+			/* node can be null */
+			o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB,
+					      node, slot->ds_node_num);
 
 			changed = 1;
 		}
@@ -706,11 +873,14 @@ fire_callbacks:
 		slot->ds_equal_samples = 0;
 	}
 out:
+	o2hb_set_quorum_device(reg, slot);
+
 	spin_unlock(&o2hb_live_lock);
 
 	o2hb_run_event_list(&event);
 
-	o2nm_node_put(node);
+	if (node)
+		o2nm_node_put(node);
 	return changed;
 }
 
@@ -737,6 +907,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
 {
 	int i, ret, highest_node, change = 0;
 	unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
 	struct o2hb_bio_wait_ctxt write_wc;
 
 	ret = o2nm_configured_node_map(configured_nodes,
@@ -746,6 +917,17 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
 		return ret;
 	}
 
+	/*
+	 * If a node is not configured but is in the livemap, we still need
+	 * to read the slot so as to be able to remove it from the livemap.
+	 */
+	o2hb_fill_node_map(live_node_bitmap, sizeof(live_node_bitmap));
+	i = -1;
+	while ((i = find_next_bit(live_node_bitmap,
+				  O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) {
+		set_bit(i, configured_nodes);
+	}
+
 	highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES);
 	if (highest_node >= O2NM_MAX_NODES) {
 		mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n");
@@ -917,21 +1099,59 @@ static int o2hb_thread(void *data)
 #ifdef CONFIG_DEBUG_FS
 static int o2hb_debug_open(struct inode *inode, struct file *file)
 {
+	struct o2hb_debug_buf *db = inode->i_private;
+	struct o2hb_region *reg;
 	unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)];
 	char *buf = NULL;
 	int i = -1;
 	int out = 0;
 
+	/* max_nodes should be the largest bitmap we pass here */
+	BUG_ON(sizeof(map) < db->db_size);
+
 	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
 	if (!buf)
 		goto bail;
 
-	o2hb_fill_node_map(map, sizeof(map));
+	switch (db->db_type) {
+	case O2HB_DB_TYPE_LIVENODES:
+	case O2HB_DB_TYPE_LIVEREGIONS:
+	case O2HB_DB_TYPE_QUORUMREGIONS:
+	case O2HB_DB_TYPE_FAILEDREGIONS:
+		spin_lock(&o2hb_live_lock);
+		memcpy(map, db->db_data, db->db_size);
+		spin_unlock(&o2hb_live_lock);
+		break;
+
+	case O2HB_DB_TYPE_REGION_LIVENODES:
+		spin_lock(&o2hb_live_lock);
+		reg = (struct o2hb_region *)db->db_data;
+		memcpy(map, reg->hr_live_node_bitmap, db->db_size);
+		spin_unlock(&o2hb_live_lock);
+		break;
+
+	case O2HB_DB_TYPE_REGION_NUMBER:
+		reg = (struct o2hb_region *)db->db_data;
+		out += snprintf(buf + out, PAGE_SIZE - out, "%d\n",
+				reg->hr_region_num);
+		goto done;
+
+	case O2HB_DB_TYPE_REGION_ELAPSED_TIME:
+		reg = (struct o2hb_region *)db->db_data;
+		out += snprintf(buf + out, PAGE_SIZE - out, "%u\n",
+				jiffies_to_msecs(jiffies -
+						 reg->hr_last_timeout_start));
+		goto done;
+
+	default:
+		goto done;
+	}
 
-	while ((i = find_next_bit(map, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES)
+	while ((i = find_next_bit(map, db->db_len, i + 1)) < db->db_len)
 		out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i);
 	out += snprintf(buf + out, PAGE_SIZE - out, "\n");
 
+done:
 	i_size_write(inode, out);
 
 	file->private_data = buf;
@@ -978,10 +1198,104 @@ static const struct file_operations o2hb_debug_fops = {
 
 void o2hb_exit(void)
 {
-	if (o2hb_debug_livenodes)
-		debugfs_remove(o2hb_debug_livenodes);
-	if (o2hb_debug_dir)
-		debugfs_remove(o2hb_debug_dir);
+	kfree(o2hb_db_livenodes);
+	kfree(o2hb_db_liveregions);
+	kfree(o2hb_db_quorumregions);
+	kfree(o2hb_db_failedregions);
+	debugfs_remove(o2hb_debug_failedregions);
+	debugfs_remove(o2hb_debug_quorumregions);
+	debugfs_remove(o2hb_debug_liveregions);
+	debugfs_remove(o2hb_debug_livenodes);
+	debugfs_remove(o2hb_debug_dir);
+}
+
+static struct dentry *o2hb_debug_create(const char *name, struct dentry *dir,
+					struct o2hb_debug_buf **db, int db_len,
+					int type, int size, int len, void *data)
+{
+	*db = kmalloc(db_len, GFP_KERNEL);
+	if (!*db)
+		return NULL;
+
+	(*db)->db_type = type;
+	(*db)->db_size = size;
+	(*db)->db_len = len;
+	(*db)->db_data = data;
+
+	return debugfs_create_file(name, S_IFREG|S_IRUSR, dir, *db,
+				   &o2hb_debug_fops);
+}
+
+static int o2hb_debug_init(void)
+{
+	int ret = -ENOMEM;
+
+	o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL);
+	if (!o2hb_debug_dir) {
+		mlog_errno(ret);
+		goto bail;
+	}
+
+	o2hb_debug_livenodes = o2hb_debug_create(O2HB_DEBUG_LIVENODES,
+						 o2hb_debug_dir,
+						 &o2hb_db_livenodes,
+						 sizeof(*o2hb_db_livenodes),
+						 O2HB_DB_TYPE_LIVENODES,
+						 sizeof(o2hb_live_node_bitmap),
+						 O2NM_MAX_NODES,
+						 o2hb_live_node_bitmap);
+	if (!o2hb_debug_livenodes) {
+		mlog_errno(ret);
+		goto bail;
+	}
+
+	o2hb_debug_liveregions = o2hb_debug_create(O2HB_DEBUG_LIVEREGIONS,
+						   o2hb_debug_dir,
+						   &o2hb_db_liveregions,
+						   sizeof(*o2hb_db_liveregions),
+						   O2HB_DB_TYPE_LIVEREGIONS,
+						   sizeof(o2hb_live_region_bitmap),
+						   O2NM_MAX_REGIONS,
+						   o2hb_live_region_bitmap);
+	if (!o2hb_debug_liveregions) {
+		mlog_errno(ret);
+		goto bail;
+	}
+
+	o2hb_debug_quorumregions =
+			o2hb_debug_create(O2HB_DEBUG_QUORUMREGIONS,
+					  o2hb_debug_dir,
+					  &o2hb_db_quorumregions,
+					  sizeof(*o2hb_db_quorumregions),
+					  O2HB_DB_TYPE_QUORUMREGIONS,
+					  sizeof(o2hb_quorum_region_bitmap),
+					  O2NM_MAX_REGIONS,
+					  o2hb_quorum_region_bitmap);
+	if (!o2hb_debug_quorumregions) {
+		mlog_errno(ret);
+		goto bail;
+	}
+
+	o2hb_debug_failedregions =
+			o2hb_debug_create(O2HB_DEBUG_FAILEDREGIONS,
+					  o2hb_debug_dir,
+					  &o2hb_db_failedregions,
+					  sizeof(*o2hb_db_failedregions),
+					  O2HB_DB_TYPE_FAILEDREGIONS,
+					  sizeof(o2hb_failed_region_bitmap),
+					  O2NM_MAX_REGIONS,
+					  o2hb_failed_region_bitmap);
+	if (!o2hb_debug_failedregions) {
+		mlog_errno(ret);
+		goto bail;
+	}
+
+	ret = 0;
+bail:
+	if (ret)
+		o2hb_exit();
+
+	return ret;
 }
 
 int o2hb_init(void)
@@ -997,24 +1311,12 @@ int o2hb_init(void)
 	INIT_LIST_HEAD(&o2hb_node_events);
 
 	memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap));
+	memset(o2hb_region_bitmap, 0, sizeof(o2hb_region_bitmap));
+	memset(o2hb_live_region_bitmap, 0, sizeof(o2hb_live_region_bitmap));
+	memset(o2hb_quorum_region_bitmap, 0, sizeof(o2hb_quorum_region_bitmap));
+	memset(o2hb_failed_region_bitmap, 0, sizeof(o2hb_failed_region_bitmap));
 
-	o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL);
-	if (!o2hb_debug_dir) {
-		mlog_errno(-ENOMEM);
-		return -ENOMEM;
-	}
-
-	o2hb_debug_livenodes = debugfs_create_file(O2HB_DEBUG_LIVENODES,
-						   S_IFREG|S_IRUSR,
-						   o2hb_debug_dir, NULL,
-						   &o2hb_debug_fops);
-	if (!o2hb_debug_livenodes) {
-		mlog_errno(-ENOMEM);
-		debugfs_remove(o2hb_debug_dir);
-		return -ENOMEM;
-	}
-
-	return 0;
+	return o2hb_debug_init();
 }
 
 /* if we're already in a callback then we're already serialized by the sem */
@@ -1078,6 +1380,13 @@ static void o2hb_region_release(struct config_item *item)
 	if (reg->hr_slots)
 		kfree(reg->hr_slots);
 
+	kfree(reg->hr_db_regnum);
+	kfree(reg->hr_db_livenodes);
+	debugfs_remove(reg->hr_debug_livenodes);
+	debugfs_remove(reg->hr_debug_regnum);
+	debugfs_remove(reg->hr_debug_elapsed_time);
+	debugfs_remove(reg->hr_debug_dir);
+
 	spin_lock(&o2hb_live_lock);
 	list_del(&reg->hr_all_item);
 	spin_unlock(&o2hb_live_lock);
@@ -1441,6 +1750,8 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
 	/* Ok, we were woken.  Make sure it wasn't by drop_item() */
 	spin_lock(&o2hb_live_lock);
 	hb_task = reg->hr_task;
+	if (o2hb_global_heartbeat_active())
+		set_bit(reg->hr_region_num, o2hb_live_region_bitmap);
 	spin_unlock(&o2hb_live_lock);
 
 	if (hb_task)
@@ -1448,6 +1759,10 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
 	else
 		ret = -EIO;
 
+	if (hb_task && o2hb_global_heartbeat_active())
+		printk(KERN_NOTICE "o2hb: Heartbeat started on region %s\n",
+		       config_item_name(&reg->hr_item));
+
 out:
 	if (filp)
 		fput(filp);
@@ -1586,21 +1901,94 @@ static struct o2hb_heartbeat_group *to_o2hb_heartbeat_group(struct config_group
 		: NULL;
 }
 
+static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir)
+{
+	int ret = -ENOMEM;
+
+	reg->hr_debug_dir =
+		debugfs_create_dir(config_item_name(&reg->hr_item), dir);
+	if (!reg->hr_debug_dir) {
+		mlog_errno(ret);
+		goto bail;
+	}
+
+	reg->hr_debug_livenodes =
+			o2hb_debug_create(O2HB_DEBUG_LIVENODES,
+					  reg->hr_debug_dir,
+					  &(reg->hr_db_livenodes),
+					  sizeof(*(reg->hr_db_livenodes)),
+					  O2HB_DB_TYPE_REGION_LIVENODES,
+					  sizeof(reg->hr_live_node_bitmap),
+					  O2NM_MAX_NODES, reg);
+	if (!reg->hr_debug_livenodes) {
+		mlog_errno(ret);
+		goto bail;
+	}
+
+	reg->hr_debug_regnum =
+			o2hb_debug_create(O2HB_DEBUG_REGION_NUMBER,
+					  reg->hr_debug_dir,
+					  &(reg->hr_db_regnum),
+					  sizeof(*(reg->hr_db_regnum)),
+					  O2HB_DB_TYPE_REGION_NUMBER,
+					  0, O2NM_MAX_NODES, reg);
+	if (!reg->hr_debug_regnum) {
+		mlog_errno(ret);
+		goto bail;
+	}
+
+	reg->hr_debug_elapsed_time =
+			o2hb_debug_create(O2HB_DEBUG_REGION_ELAPSED_TIME,
+					  reg->hr_debug_dir,
+					  &(reg->hr_db_elapsed_time),
+					  sizeof(*(reg->hr_db_elapsed_time)),
+					  O2HB_DB_TYPE_REGION_ELAPSED_TIME,
+					  0, 0, reg);
+	if (!reg->hr_debug_elapsed_time) {
+		mlog_errno(ret);
+		goto bail;
+	}
+
+	ret = 0;
+bail:
+	return ret;
+}
+
 static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *group,
 							  const char *name)
 {
 	struct o2hb_region *reg = NULL;
+	int ret;
 
 	reg = kzalloc(sizeof(struct o2hb_region), GFP_KERNEL);
 	if (reg == NULL)
 		return ERR_PTR(-ENOMEM);
 
-	config_item_init_type_name(&reg->hr_item, name, &o2hb_region_type);
+	if (strlen(name) > O2HB_MAX_REGION_NAME_LEN)
+		return ERR_PTR(-ENAMETOOLONG);
 
 	spin_lock(&o2hb_live_lock);
+	reg->hr_region_num = 0;
+	if (o2hb_global_heartbeat_active()) {
+		reg->hr_region_num = find_first_zero_bit(o2hb_region_bitmap,
+							 O2NM_MAX_REGIONS);
+		if (reg->hr_region_num >= O2NM_MAX_REGIONS) {
+			spin_unlock(&o2hb_live_lock);
+			return ERR_PTR(-EFBIG);
+		}
+		set_bit(reg->hr_region_num, o2hb_region_bitmap);
+	}
 	list_add_tail(&reg->hr_all_item, &o2hb_all_regions);
 	spin_unlock(&o2hb_live_lock);
 
+	config_item_init_type_name(&reg->hr_item, name, &o2hb_region_type);
+
+	ret = o2hb_debug_region_init(reg, o2hb_debug_dir);
+	if (ret) {
+		config_item_put(&reg->hr_item);
+		return ERR_PTR(ret);
+	}
+
 	return &reg->hr_item;
 }
 
@@ -1612,6 +2000,10 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
 
 	/* stop the thread when the user removes the region dir */
 	spin_lock(&o2hb_live_lock);
+	if (o2hb_global_heartbeat_active()) {
+		clear_bit(reg->hr_region_num, o2hb_region_bitmap);
+		clear_bit(reg->hr_region_num, o2hb_live_region_bitmap);
+	}
 	hb_task = reg->hr_task;
 	reg->hr_task = NULL;
 	spin_unlock(&o2hb_live_lock);
@@ -1628,6 +2020,9 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
 		wake_up(&o2hb_steady_queue);
 	}
 
+	if (o2hb_global_heartbeat_active())
+		printk(KERN_NOTICE "o2hb: Heartbeat stopped on region %s\n",
+		       config_item_name(&reg->hr_item));
 	config_item_put(item);
 }
 
@@ -1688,6 +2083,41 @@ static ssize_t o2hb_heartbeat_group_threshold_store(struct o2hb_heartbeat_group
 	return count;
 }
 
+static
+ssize_t o2hb_heartbeat_group_mode_show(struct o2hb_heartbeat_group *group,
+				       char *page)
+{
+	return sprintf(page, "%s\n",
+		       o2hb_heartbeat_mode_desc[o2hb_heartbeat_mode]);
+}
+
+static
+ssize_t o2hb_heartbeat_group_mode_store(struct o2hb_heartbeat_group *group,
+					const char *page, size_t count)
+{
+	unsigned int i;
+	int ret;
+	size_t len;
+
+	len = (page[count - 1] == '\n') ? count - 1 : count;
+	if (!len)
+		return -EINVAL;
+
+	for (i = 0; i < O2HB_HEARTBEAT_NUM_MODES; ++i) {
+		if (strnicmp(page, o2hb_heartbeat_mode_desc[i], len))
+			continue;
+
+		ret = o2hb_global_hearbeat_mode_set(i);
+		if (!ret)
+			printk(KERN_NOTICE "o2hb: Heartbeat mode set to %s\n",
+			       o2hb_heartbeat_mode_desc[i]);
+		return count;
+	}
+
+	return -EINVAL;
+
+}
+
 static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_threshold = {
 	.attr	= { .ca_owner = THIS_MODULE,
 		    .ca_name = "dead_threshold",
@@ -1696,8 +2126,17 @@ static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_threshold
 	.store	= o2hb_heartbeat_group_threshold_store,
 };
 
+static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_mode = {
+	.attr   = { .ca_owner = THIS_MODULE,
+		.ca_name = "mode",
+		.ca_mode = S_IRUGO | S_IWUSR },
+	.show   = o2hb_heartbeat_group_mode_show,
+	.store  = o2hb_heartbeat_group_mode_store,
+};
+
 static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = {
 	&o2hb_heartbeat_group_attr_threshold.attr,
+	&o2hb_heartbeat_group_attr_mode.attr,
 	NULL,
 };
 
@@ -1963,3 +2402,34 @@ void o2hb_stop_all_regions(void)
 	spin_unlock(&o2hb_live_lock);
 }
 EXPORT_SYMBOL_GPL(o2hb_stop_all_regions);
+
+int o2hb_get_all_regions(char *region_uuids, u8 max_regions)
+{
+	struct o2hb_region *reg;
+	int numregs = 0;
+	char *p;
+
+	spin_lock(&o2hb_live_lock);
+
+	p = region_uuids;
+	list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
+		mlog(0, "Region: %s\n", config_item_name(&reg->hr_item));
+		if (numregs < max_regions) {
+			memcpy(p, config_item_name(&reg->hr_item),
+			       O2HB_MAX_REGION_NAME_LEN);
+			p += O2HB_MAX_REGION_NAME_LEN;
+		}
+		numregs++;
+	}
+
+	spin_unlock(&o2hb_live_lock);
+
+	return numregs;
+}
+EXPORT_SYMBOL_GPL(o2hb_get_all_regions);
+
+int o2hb_global_heartbeat_active(void)
+{
+	return (o2hb_heartbeat_mode == O2HB_HEARTBEAT_GLOBAL);
+}
+EXPORT_SYMBOL(o2hb_global_heartbeat_active);
diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h
index 2f1649253b4..00ad8e8fea5 100644
--- a/fs/ocfs2/cluster/heartbeat.h
+++ b/fs/ocfs2/cluster/heartbeat.h
@@ -31,6 +31,8 @@
 
 #define O2HB_REGION_TIMEOUT_MS		2000
 
+#define O2HB_MAX_REGION_NAME_LEN	32
+
 /* number of changes to be seen as live */
 #define O2HB_LIVE_THRESHOLD	   2
 /* number of equal samples to be seen as dead */
@@ -81,5 +83,7 @@ int o2hb_check_node_heartbeating(u8 node_num);
 int o2hb_check_node_heartbeating_from_callback(u8 node_num);
 int o2hb_check_local_node_heartbeating(void);
 void o2hb_stop_all_regions(void);
+int o2hb_get_all_regions(char *region_uuids, u8 numregions);
+int o2hb_global_heartbeat_active(void);
 
 #endif /* O2CLUSTER_HEARTBEAT_H */
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index fd96e2a2fa5..ea2ed9f56c9 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -119,7 +119,8 @@
 #define ML_ERROR	0x0000000100000000ULL /* sent to KERN_ERR */
 #define ML_NOTICE	0x0000000200000000ULL /* setn to KERN_NOTICE */
 #define ML_KTHREAD	0x0000000400000000ULL /* kernel thread activity */
-#define	ML_RESERVATIONS	0x0000000800000000ULL /* ocfs2 alloc reservations */
+#define ML_RESERVATIONS	0x0000000800000000ULL /* ocfs2 alloc reservations */
+#define ML_CLUSTER	0x0000001000000000ULL /* cluster stack */
 
 #define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE)
 #define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT)
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index ed0c9f367fe..bb240647ca5 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -711,6 +711,8 @@ static struct config_item *o2nm_node_group_make_item(struct config_group *group,
 	config_item_init_type_name(&node->nd_item, name, &o2nm_node_type);
 	spin_lock_init(&node->nd_lock);
 
+	mlog(ML_CLUSTER, "o2nm: Registering node %s\n", name);
+
 	return &node->nd_item;
 }
 
@@ -744,6 +746,9 @@ static void o2nm_node_group_drop_item(struct config_group *group,
 	}
 	write_unlock(&cluster->cl_nodes_lock);
 
+	mlog(ML_CLUSTER, "o2nm: Unregistered node %s\n",
+	     config_item_name(&node->nd_item));
+
 	config_item_put(item);
 }
 
diff --git a/fs/ocfs2/cluster/ocfs2_nodemanager.h b/fs/ocfs2/cluster/ocfs2_nodemanager.h
index 5b9854bad57..49b594325be 100644
--- a/fs/ocfs2/cluster/ocfs2_nodemanager.h
+++ b/fs/ocfs2/cluster/ocfs2_nodemanager.h
@@ -36,4 +36,10 @@
 /* host name, group name, cluster name all 64 bytes */
 #define O2NM_MAX_NAME_LEN        64    // __NEW_UTS_LEN
 
+/*
+ * Maximum number of global heartbeat regions allowed.
+ * **CAUTION**  Changing this number will break dlm compatibility.
+ */
+#define O2NM_MAX_REGIONS	32
+
 #endif /* _OCFS2_NODEMANAGER_H */
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index aa75ca3f78d..9aa426e4212 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -977,7 +977,7 @@ static int o2net_tx_can_proceed(struct o2net_node *nn,
 int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
 			   size_t caller_veclen, u8 target_node, int *status)
 {
-	int ret;
+	int ret = 0;
 	struct o2net_msg *msg = NULL;
 	size_t veclen, caller_bytes = 0;
 	struct kvec *vec = NULL;
@@ -1696,6 +1696,9 @@ static void o2net_hb_node_down_cb(struct o2nm_node *node, int node_num,
 {
 	o2quo_hb_down(node_num);
 
+	if (!node)
+		return;
+
 	if (node_num != o2nm_this_node())
 		o2net_disconnect_node(node);
 
@@ -1709,6 +1712,8 @@ static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num,
 
 	o2quo_hb_up(node_num);
 
+	BUG_ON(!node);
+
 	/* ensure an immediate connect attempt */
 	nn->nn_last_connect_attempt = jiffies -
 		(msecs_to_jiffies(o2net_reconnect_delay()) + 1);
@@ -1759,6 +1764,7 @@ static int o2net_accept_one(struct socket *sock)
 	struct sockaddr_in sin;
 	struct socket *new_sock = NULL;
 	struct o2nm_node *node = NULL;
+	struct o2nm_node *local_node = NULL;
 	struct o2net_sock_container *sc = NULL;
 	struct o2net_node *nn;
 
@@ -1796,11 +1802,15 @@ static int o2net_accept_one(struct socket *sock)
 		goto out;
 	}
 
-	if (o2nm_this_node() > node->nd_num) {
-		mlog(ML_NOTICE, "unexpected connect attempted from a lower "
-		     "numbered node '%s' at " "%pI4:%d with num %u\n",
-		     node->nd_name, &sin.sin_addr.s_addr,
-		     ntohs(sin.sin_port), node->nd_num);
+	if (o2nm_this_node() >= node->nd_num) {
+		local_node = o2nm_get_node_by_num(o2nm_this_node());
+		mlog(ML_NOTICE, "unexpected connect attempt seen at node '%s' ("
+		     "%u, %pI4:%d) from node '%s' (%u, %pI4:%d)\n",
+		     local_node->nd_name, local_node->nd_num,
+		     &(local_node->nd_ipv4_address),
+		     ntohs(local_node->nd_ipv4_port),
+		     node->nd_name, node->nd_num, &sin.sin_addr.s_addr,
+		     ntohs(sin.sin_port));
 		ret = -EINVAL;
 		goto out;
 	}
@@ -1857,6 +1867,8 @@ out:
 		sock_release(new_sock);
 	if (node)
 		o2nm_node_put(node);
+	if (local_node)
+		o2nm_node_put(local_node);
 	if (sc)
 		sc_put(sc);
 	return ret;
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index 96fa7ebc530..15fdbdf9eb4 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -129,7 +129,7 @@ struct o2net_node {
 
 struct o2net_sock_container {
 	struct kref		sc_kref;
-	/* the next two are vaild for the life time of the sc */
+	/* the next two are valid for the life time of the sc */
 	struct socket		*sc_sock;
 	struct o2nm_node	*sc_node;
 
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index b4957c7d9fe..edaded48e7e 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -40,6 +40,14 @@
 #include "inode.h"
 #include "super.h"
 
+void ocfs2_dentry_attach_gen(struct dentry *dentry)
+{
+	unsigned long gen =
+		OCFS2_I(dentry->d_parent->d_inode)->ip_dir_lock_gen;
+	BUG_ON(dentry->d_inode);
+	dentry->d_fsdata = (void *)gen;
+}
+
 
 static int ocfs2_dentry_revalidate(struct dentry *dentry,
 				   struct nameidata *nd)
@@ -51,11 +59,20 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry,
 	mlog_entry("(0x%p, '%.*s')\n", dentry,
 		   dentry->d_name.len, dentry->d_name.name);
 
-	/* Never trust a negative dentry - force a new lookup. */
+	/* For a negative dentry -
+	 * check the generation number of the parent and compare with the
+	 * one stored in the inode.
+	 */
 	if (inode == NULL) {
-		mlog(0, "negative dentry: %.*s\n", dentry->d_name.len,
-		     dentry->d_name.name);
-		goto bail;
+		unsigned long gen = (unsigned long) dentry->d_fsdata;
+		unsigned long pgen =
+			OCFS2_I(dentry->d_parent->d_inode)->ip_dir_lock_gen;
+		mlog(0, "negative dentry: %.*s parent gen: %lu "
+			"dentry gen: %lu\n",
+			dentry->d_name.len, dentry->d_name.name, pgen, gen);
+		if (gen != pgen)
+			goto bail;
+		goto valid;
 	}
 
 	BUG_ON(!osb);
@@ -96,6 +113,7 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry,
 		goto bail;
 	}
 
+valid:
 	ret = 1;
 
 bail:
@@ -227,6 +245,12 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry,
 	if (!inode)
 		return 0;
 
+	if (!dentry->d_inode && dentry->d_fsdata) {
+		/* Converting a negative dentry to positive
+		   Clear dentry->d_fsdata */
+		dentry->d_fsdata = dl = NULL;
+	}
+
 	if (dl) {
 		mlog_bug_on_msg(dl->dl_parent_blkno != parent_blkno,
 				" \"%.*s\": old parent: %llu, new: %llu\n",
@@ -452,6 +476,7 @@ static void ocfs2_dentry_iput(struct dentry *dentry, struct inode *inode)
 
 out:
 	iput(inode);
+	ocfs2_dentry_attach_gen(dentry);
 }
 
 /*
diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h
index f5dd1789acf..b79eff70995 100644
--- a/fs/ocfs2/dcache.h
+++ b/fs/ocfs2/dcache.h
@@ -64,5 +64,6 @@ void ocfs2_dentry_move(struct dentry *dentry, struct dentry *target,
 		       struct inode *old_dir, struct inode *new_dir);
 
 extern spinlock_t dentry_attach_lock;
+void ocfs2_dentry_attach_gen(struct dentry *dentry);
 
 #endif /* OCFS2_DCACHE_H */
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index f04ebcfffc4..c49f6de0e7a 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -3931,6 +3931,15 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
 		goto out_commit;
 	}
 
+	cpos = split_hash;
+	ret = ocfs2_dx_dir_new_cluster(dir, &et, cpos, handle,
+				       data_ac, meta_ac, new_dx_leaves,
+				       num_dx_leaves);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
 	for (i = 0; i < num_dx_leaves; i++) {
 		ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir),
 					      orig_dx_leaves[i],
@@ -3939,15 +3948,14 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
 			mlog_errno(ret);
 			goto out_commit;
 		}
-	}
 
-	cpos = split_hash;
-	ret = ocfs2_dx_dir_new_cluster(dir, &et, cpos, handle,
-				       data_ac, meta_ac, new_dx_leaves,
-				       num_dx_leaves);
-	if (ret) {
-		mlog_errno(ret);
-		goto out_commit;
+		ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir),
+					      new_dx_leaves[i],
+					      OCFS2_JOURNAL_ACCESS_WRITE);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
 	}
 
 	ocfs2_dx_dir_transfer_leaf(dir, split_hash, handle, tmp_dx_leaf,
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 4b6ae2c13b4..b36d0bf77a5 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -445,7 +445,9 @@ enum {
 	DLM_LOCK_REQUEST_MSG,	 /* 515 */
 	DLM_RECO_DATA_DONE_MSG,	 /* 516 */
 	DLM_BEGIN_RECO_MSG,	 /* 517 */
-	DLM_FINALIZE_RECO_MSG	 /* 518 */
+	DLM_FINALIZE_RECO_MSG,	 /* 518 */
+	DLM_QUERY_REGION,	 /* 519 */
+	DLM_QUERY_NODEINFO,	 /* 520 */
 };
 
 struct dlm_reco_node_data
@@ -727,6 +729,31 @@ struct dlm_cancel_join
 	u8 domain[O2NM_MAX_NAME_LEN];
 };
 
+struct dlm_query_region {
+	u8 qr_node;
+	u8 qr_numregions;
+	u8 qr_namelen;
+	u8 pad1;
+	u8 qr_domain[O2NM_MAX_NAME_LEN];
+	u8 qr_regions[O2HB_MAX_REGION_NAME_LEN * O2NM_MAX_REGIONS];
+};
+
+struct dlm_node_info {
+	u8 ni_nodenum;
+	u8 pad1;
+	u16 ni_ipv4_port;
+	u32 ni_ipv4_address;
+};
+
+struct dlm_query_nodeinfo {
+	u8 qn_nodenum;
+	u8 qn_numnodes;
+	u8 qn_namelen;
+	u8 pad1;
+	u8 qn_domain[O2NM_MAX_NAME_LEN];
+	struct dlm_node_info qn_nodes[O2NM_MAX_NODES];
+};
+
 struct dlm_exit_domain
 {
 	u8 node_idx;
@@ -1030,6 +1057,7 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm,
 			 struct dlm_lock_resource *res);
 void dlm_clean_master_list(struct dlm_ctxt *dlm,
 			   u8 dead_node);
+void dlm_force_free_mles(struct dlm_ctxt *dlm);
 int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock);
 int __dlm_lockres_has_locks(struct dlm_lock_resource *res);
 int __dlm_lockres_unused(struct dlm_lock_resource *res);
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 0cd24cf5439..272ec8631a5 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -419,7 +419,7 @@ static loff_t debug_buffer_llseek(struct file *file, loff_t off, int whence)
 
 static int debug_buffer_release(struct inode *inode, struct file *file)
 {
-	struct debug_buffer *db = (struct debug_buffer *)file->private_data;
+	struct debug_buffer *db = file->private_data;
 
 	if (db)
 		kfree(db->buf);
@@ -493,7 +493,7 @@ static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
 	struct hlist_head *bucket;
 	struct hlist_node *list;
 	int i, out = 0;
-	unsigned long total = 0, longest = 0, bktcnt;
+	unsigned long total = 0, longest = 0, bucket_count = 0;
 
 	out += snprintf(db->buf + out, db->len - out,
 			"Dumping MLEs for Domain: %s\n", dlm->name);
@@ -505,13 +505,13 @@ static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
 			mle = hlist_entry(list, struct dlm_master_list_entry,
 					  master_hash_node);
 			++total;
-			++bktcnt;
+			++bucket_count;
 			if (db->len - out < 200)
 				continue;
 			out += dump_mle(mle, db->buf + out, db->len - out);
 		}
-		longest = max(longest, bktcnt);
-		bktcnt = 0;
+		longest = max(longest, bucket_count);
+		bucket_count = 0;
 	}
 	spin_unlock(&dlm->master_lock);
 
@@ -636,8 +636,14 @@ static void *lockres_seq_start(struct seq_file *m, loff_t *pos)
 	spin_lock(&dlm->track_lock);
 	if (oldres)
 		track_list = &oldres->tracking;
-	else
+	else {
 		track_list = &dlm->tracking_list;
+		if (list_empty(track_list)) {
+			dl = NULL;
+			spin_unlock(&dlm->track_lock);
+			goto bail;
+		}
+	}
 
 	list_for_each_entry(res, track_list, tracking) {
 		if (&res->tracking == &dlm->tracking_list)
@@ -660,6 +666,7 @@ static void *lockres_seq_start(struct seq_file *m, loff_t *pos)
 	} else
 		dl = NULL;
 
+bail:
 	/* passed to seq_show */
 	return dl;
 }
@@ -715,7 +722,7 @@ static int debug_lockres_open(struct inode *inode, struct file *file)
 		goto bail;
 	}
 
-	seq = (struct seq_file *) file->private_data;
+	seq = file->private_data;
 	seq->private = dl;
 
 	dlm_grab(dlm);
@@ -731,7 +738,7 @@ bail:
 
 static int debug_lockres_release(struct inode *inode, struct file *file)
 {
-	struct seq_file *seq = (struct seq_file *)file->private_data;
+	struct seq_file *seq = file->private_data;
 	struct debug_lockres *dl = (struct debug_lockres *)seq->private;
 
 	if (dl->dl_res)
@@ -775,7 +782,9 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
 
 	/* Domain: xxxxxxxxxx  Key: 0xdfbac769 */
 	out += snprintf(db->buf + out, db->len - out,
-			"Domain: %s  Key: 0x%08x\n", dlm->name, dlm->key);
+			"Domain: %s  Key: 0x%08x  Protocol: %d.%d\n",
+			dlm->name, dlm->key, dlm->dlm_locking_proto.pv_major,
+			dlm->dlm_locking_proto.pv_minor);
 
 	/* Thread Pid: xxx  Node: xxx  State: xxxxx */
 	out += snprintf(db->buf + out, db->len - out,
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 153abb5abef..58a93b95373 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -128,10 +128,14 @@ static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events);
  * will have a negotiated version with the same major number and a minor
  * number equal or smaller.  The dlm_ctxt->dlm_locking_proto field should
  * be used to determine what a running domain is actually using.
+ *
+ * New in version 1.1:
+ *	- Message DLM_QUERY_REGION added to support global heartbeat
+ *	- Message DLM_QUERY_NODEINFO added to allow online node removes
  */
 static const struct dlm_protocol_version dlm_protocol = {
 	.pv_major = 1,
-	.pv_minor = 0,
+	.pv_minor = 1,
 };
 
 #define DLM_DOMAIN_BACKOFF_MS 200
@@ -142,6 +146,8 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
 				     void **ret_data);
 static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data,
 				   void **ret_data);
+static int dlm_query_region_handler(struct o2net_msg *msg, u32 len,
+				    void *data, void **ret_data);
 static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
 				   void **ret_data);
 static int dlm_protocol_compare(struct dlm_protocol_version *existing,
@@ -693,6 +699,7 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm)
 
 		dlm_mark_domain_leaving(dlm);
 		dlm_leave_domain(dlm);
+		dlm_force_free_mles(dlm);
 		dlm_complete_dlm_shutdown(dlm);
 	}
 	dlm_put(dlm);
@@ -920,6 +927,370 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
 	return 0;
 }
 
+static int dlm_match_regions(struct dlm_ctxt *dlm,
+			     struct dlm_query_region *qr)
+{
+	char *local = NULL, *remote = qr->qr_regions;
+	char *l, *r;
+	int localnr, i, j, foundit;
+	int status = 0;
+
+	if (!o2hb_global_heartbeat_active()) {
+		if (qr->qr_numregions) {
+			mlog(ML_ERROR, "Domain %s: Joining node %d has global "
+			     "heartbeat enabled but local node %d does not\n",
+			     qr->qr_domain, qr->qr_node, dlm->node_num);
+			status = -EINVAL;
+		}
+		goto bail;
+	}
+
+	if (o2hb_global_heartbeat_active() && !qr->qr_numregions) {
+		mlog(ML_ERROR, "Domain %s: Local node %d has global "
+		     "heartbeat enabled but joining node %d does not\n",
+		     qr->qr_domain, dlm->node_num, qr->qr_node);
+		status = -EINVAL;
+		goto bail;
+	}
+
+	r = remote;
+	for (i = 0; i < qr->qr_numregions; ++i) {
+		mlog(0, "Region %.*s\n", O2HB_MAX_REGION_NAME_LEN, r);
+		r += O2HB_MAX_REGION_NAME_LEN;
+	}
+
+	local = kmalloc(sizeof(qr->qr_regions), GFP_KERNEL);
+	if (!local) {
+		status = -ENOMEM;
+		goto bail;
+	}
+
+	localnr = o2hb_get_all_regions(local, O2NM_MAX_REGIONS);
+
+	/* compare local regions with remote */
+	l = local;
+	for (i = 0; i < localnr; ++i) {
+		foundit = 0;
+		r = remote;
+		for (j = 0; j <= qr->qr_numregions; ++j) {
+			if (!memcmp(l, r, O2HB_MAX_REGION_NAME_LEN)) {
+				foundit = 1;
+				break;
+			}
+			r += O2HB_MAX_REGION_NAME_LEN;
+		}
+		if (!foundit) {
+			status = -EINVAL;
+			mlog(ML_ERROR, "Domain %s: Region '%.*s' registered "
+			     "in local node %d but not in joining node %d\n",
+			     qr->qr_domain, O2HB_MAX_REGION_NAME_LEN, l,
+			     dlm->node_num, qr->qr_node);
+			goto bail;
+		}
+		l += O2HB_MAX_REGION_NAME_LEN;
+	}
+
+	/* compare remote with local regions */
+	r = remote;
+	for (i = 0; i < qr->qr_numregions; ++i) {
+		foundit = 0;
+		l = local;
+		for (j = 0; j < localnr; ++j) {
+			if (!memcmp(r, l, O2HB_MAX_REGION_NAME_LEN)) {
+				foundit = 1;
+				break;
+			}
+			l += O2HB_MAX_REGION_NAME_LEN;
+		}
+		if (!foundit) {
+			status = -EINVAL;
+			mlog(ML_ERROR, "Domain %s: Region '%.*s' registered "
+			     "in joining node %d but not in local node %d\n",
+			     qr->qr_domain, O2HB_MAX_REGION_NAME_LEN, r,
+			     qr->qr_node, dlm->node_num);
+			goto bail;
+		}
+		r += O2HB_MAX_REGION_NAME_LEN;
+	}
+
+bail:
+	kfree(local);
+
+	return status;
+}
+
+static int dlm_send_regions(struct dlm_ctxt *dlm, unsigned long *node_map)
+{
+	struct dlm_query_region *qr = NULL;
+	int status, ret = 0, i;
+	char *p;
+
+	if (find_next_bit(node_map, O2NM_MAX_NODES, 0) >= O2NM_MAX_NODES)
+		goto bail;
+
+	qr = kzalloc(sizeof(struct dlm_query_region), GFP_KERNEL);
+	if (!qr) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto bail;
+	}
+
+	qr->qr_node = dlm->node_num;
+	qr->qr_namelen = strlen(dlm->name);
+	memcpy(qr->qr_domain, dlm->name, qr->qr_namelen);
+	/* if local hb, the numregions will be zero */
+	if (o2hb_global_heartbeat_active())
+		qr->qr_numregions = o2hb_get_all_regions(qr->qr_regions,
+							 O2NM_MAX_REGIONS);
+
+	p = qr->qr_regions;
+	for (i = 0; i < qr->qr_numregions; ++i, p += O2HB_MAX_REGION_NAME_LEN)
+		mlog(0, "Region %.*s\n", O2HB_MAX_REGION_NAME_LEN, p);
+
+	i = -1;
+	while ((i = find_next_bit(node_map, O2NM_MAX_NODES,
+				  i + 1)) < O2NM_MAX_NODES) {
+		if (i == dlm->node_num)
+			continue;
+
+		mlog(0, "Sending regions to node %d\n", i);
+
+		ret = o2net_send_message(DLM_QUERY_REGION, DLM_MOD_KEY, qr,
+					 sizeof(struct dlm_query_region),
+					 i, &status);
+		if (ret >= 0)
+			ret = status;
+		if (ret) {
+			mlog(ML_ERROR, "Region mismatch %d, node %d\n",
+			     ret, i);
+			break;
+		}
+	}
+
+bail:
+	kfree(qr);
+	return ret;
+}
+
+static int dlm_query_region_handler(struct o2net_msg *msg, u32 len,
+				    void *data, void **ret_data)
+{
+	struct dlm_query_region *qr;
+	struct dlm_ctxt *dlm = NULL;
+	int status = 0;
+	int locked = 0;
+
+	qr = (struct dlm_query_region *) msg->buf;
+
+	mlog(0, "Node %u queries hb regions on domain %s\n", qr->qr_node,
+	     qr->qr_domain);
+
+	status = -EINVAL;
+
+	spin_lock(&dlm_domain_lock);
+	dlm = __dlm_lookup_domain_full(qr->qr_domain, qr->qr_namelen);
+	if (!dlm) {
+		mlog(ML_ERROR, "Node %d queried hb regions on domain %s "
+		     "before join domain\n", qr->qr_node, qr->qr_domain);
+		goto bail;
+	}
+
+	spin_lock(&dlm->spinlock);
+	locked = 1;
+	if (dlm->joining_node != qr->qr_node) {
+		mlog(ML_ERROR, "Node %d queried hb regions on domain %s "
+		     "but joining node is %d\n", qr->qr_node, qr->qr_domain,
+		     dlm->joining_node);
+		goto bail;
+	}
+
+	/* Support for global heartbeat was added in 1.1 */
+	if (dlm->dlm_locking_proto.pv_major == 1 &&
+	    dlm->dlm_locking_proto.pv_minor == 0) {
+		mlog(ML_ERROR, "Node %d queried hb regions on domain %s "
+		     "but active dlm protocol is %d.%d\n", qr->qr_node,
+		     qr->qr_domain, dlm->dlm_locking_proto.pv_major,
+		     dlm->dlm_locking_proto.pv_minor);
+		goto bail;
+	}
+
+	status = dlm_match_regions(dlm, qr);
+
+bail:
+	if (locked)
+		spin_unlock(&dlm->spinlock);
+	spin_unlock(&dlm_domain_lock);
+
+	return status;
+}
+
+static int dlm_match_nodes(struct dlm_ctxt *dlm, struct dlm_query_nodeinfo *qn)
+{
+	struct o2nm_node *local;
+	struct dlm_node_info *remote;
+	int i, j;
+	int status = 0;
+
+	for (j = 0; j < qn->qn_numnodes; ++j)
+		mlog(0, "Node %3d, %pI4:%u\n", qn->qn_nodes[j].ni_nodenum,
+		     &(qn->qn_nodes[j].ni_ipv4_address),
+		     ntohs(qn->qn_nodes[j].ni_ipv4_port));
+
+	for (i = 0; i < O2NM_MAX_NODES && !status; ++i) {
+		local = o2nm_get_node_by_num(i);
+		remote = NULL;
+		for (j = 0; j < qn->qn_numnodes; ++j) {
+			if (qn->qn_nodes[j].ni_nodenum == i) {
+				remote = &(qn->qn_nodes[j]);
+				break;
+			}
+		}
+
+		if (!local && !remote)
+			continue;
+
+		if ((local && !remote) || (!local && remote))
+			status = -EINVAL;
+
+		if (!status &&
+		    ((remote->ni_nodenum != local->nd_num) ||
+		     (remote->ni_ipv4_port != local->nd_ipv4_port) ||
+		     (remote->ni_ipv4_address != local->nd_ipv4_address)))
+			status = -EINVAL;
+
+		if (status) {
+			if (remote && !local)
+				mlog(ML_ERROR, "Domain %s: Node %d (%pI4:%u) "
+				     "registered in joining node %d but not in "
+				     "local node %d\n", qn->qn_domain,
+				     remote->ni_nodenum,
+				     &(remote->ni_ipv4_address),
+				     ntohs(remote->ni_ipv4_port),
+				     qn->qn_nodenum, dlm->node_num);
+			if (local && !remote)
+				mlog(ML_ERROR, "Domain %s: Node %d (%pI4:%u) "
+				     "registered in local node %d but not in "
+				     "joining node %d\n", qn->qn_domain,
+				     local->nd_num, &(local->nd_ipv4_address),
+				     ntohs(local->nd_ipv4_port),
+				     dlm->node_num, qn->qn_nodenum);
+			BUG_ON((!local && !remote));
+		}
+
+		if (local)
+			o2nm_node_put(local);
+	}
+
+	return status;
+}
+
+static int dlm_send_nodeinfo(struct dlm_ctxt *dlm, unsigned long *node_map)
+{
+	struct dlm_query_nodeinfo *qn = NULL;
+	struct o2nm_node *node;
+	int ret = 0, status, count, i;
+
+	if (find_next_bit(node_map, O2NM_MAX_NODES, 0) >= O2NM_MAX_NODES)
+		goto bail;
+
+	qn = kzalloc(sizeof(struct dlm_query_nodeinfo), GFP_KERNEL);
+	if (!qn) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto bail;
+	}
+
+	for (i = 0, count = 0; i < O2NM_MAX_NODES; ++i) {
+		node = o2nm_get_node_by_num(i);
+		if (!node)
+			continue;
+		qn->qn_nodes[count].ni_nodenum = node->nd_num;
+		qn->qn_nodes[count].ni_ipv4_port = node->nd_ipv4_port;
+		qn->qn_nodes[count].ni_ipv4_address = node->nd_ipv4_address;
+		mlog(0, "Node %3d, %pI4:%u\n", node->nd_num,
+		     &(node->nd_ipv4_address), ntohs(node->nd_ipv4_port));
+		++count;
+		o2nm_node_put(node);
+	}
+
+	qn->qn_nodenum = dlm->node_num;
+	qn->qn_numnodes = count;
+	qn->qn_namelen = strlen(dlm->name);
+	memcpy(qn->qn_domain, dlm->name, qn->qn_namelen);
+
+	i = -1;
+	while ((i = find_next_bit(node_map, O2NM_MAX_NODES,
+				  i + 1)) < O2NM_MAX_NODES) {
+		if (i == dlm->node_num)
+			continue;
+
+		mlog(0, "Sending nodeinfo to node %d\n", i);
+
+		ret = o2net_send_message(DLM_QUERY_NODEINFO, DLM_MOD_KEY,
+					 qn, sizeof(struct dlm_query_nodeinfo),
+					 i, &status);
+		if (ret >= 0)
+			ret = status;
+		if (ret) {
+			mlog(ML_ERROR, "node mismatch %d, node %d\n", ret, i);
+			break;
+		}
+	}
+
+bail:
+	kfree(qn);
+	return ret;
+}
+
+static int dlm_query_nodeinfo_handler(struct o2net_msg *msg, u32 len,
+				      void *data, void **ret_data)
+{
+	struct dlm_query_nodeinfo *qn;
+	struct dlm_ctxt *dlm = NULL;
+	int locked = 0, status = -EINVAL;
+
+	qn = (struct dlm_query_nodeinfo *) msg->buf;
+
+	mlog(0, "Node %u queries nodes on domain %s\n", qn->qn_nodenum,
+	     qn->qn_domain);
+
+	spin_lock(&dlm_domain_lock);
+	dlm = __dlm_lookup_domain_full(qn->qn_domain, qn->qn_namelen);
+	if (!dlm) {
+		mlog(ML_ERROR, "Node %d queried nodes on domain %s before "
+		     "join domain\n", qn->qn_nodenum, qn->qn_domain);
+		goto bail;
+	}
+
+	spin_lock(&dlm->spinlock);
+	locked = 1;
+	if (dlm->joining_node != qn->qn_nodenum) {
+		mlog(ML_ERROR, "Node %d queried nodes on domain %s but "
+		     "joining node is %d\n", qn->qn_nodenum, qn->qn_domain,
+		     dlm->joining_node);
+		goto bail;
+	}
+
+	/* Support for node query was added in 1.1 */
+	if (dlm->dlm_locking_proto.pv_major == 1 &&
+	    dlm->dlm_locking_proto.pv_minor == 0) {
+		mlog(ML_ERROR, "Node %d queried nodes on domain %s "
+		     "but active dlm protocol is %d.%d\n", qn->qn_nodenum,
+		     qn->qn_domain, dlm->dlm_locking_proto.pv_major,
+		     dlm->dlm_locking_proto.pv_minor);
+		goto bail;
+	}
+
+	status = dlm_match_nodes(dlm, qn);
+
+bail:
+	if (locked)
+		spin_unlock(&dlm->spinlock);
+	spin_unlock(&dlm_domain_lock);
+
+	return status;
+}
+
 static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data,
 				   void **ret_data)
 {
@@ -1240,6 +1611,20 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm)
 	set_bit(dlm->node_num, dlm->domain_map);
 	spin_unlock(&dlm->spinlock);
 
+	/* Support for global heartbeat and node info was added in 1.1 */
+	if (dlm_protocol.pv_major > 1 || dlm_protocol.pv_minor > 0) {
+		status = dlm_send_nodeinfo(dlm, ctxt->yes_resp_map);
+		if (status) {
+			mlog_errno(status);
+			goto bail;
+		}
+		status = dlm_send_regions(dlm, ctxt->yes_resp_map);
+		if (status) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
 	dlm_send_join_asserts(dlm, ctxt->yes_resp_map);
 
 	/* Joined state *must* be set before the joining node
@@ -1806,7 +2191,21 @@ static int dlm_register_net_handlers(void)
 					sizeof(struct dlm_cancel_join),
 					dlm_cancel_join_handler,
 					NULL, NULL, &dlm_join_handlers);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_QUERY_REGION, DLM_MOD_KEY,
+					sizeof(struct dlm_query_region),
+					dlm_query_region_handler,
+					NULL, NULL, &dlm_join_handlers);
 
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_QUERY_NODEINFO, DLM_MOD_KEY,
+					sizeof(struct dlm_query_nodeinfo),
+					dlm_query_nodeinfo_handler,
+					NULL, NULL, &dlm_join_handlers);
 bail:
 	if (status < 0)
 		dlm_unregister_net_handlers();
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 94b97fc6a88..f564b0e5f80 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -511,8 +511,6 @@ static void dlm_lockres_release(struct kref *kref)
 
 	atomic_dec(&dlm->res_cur_count);
 
-	dlm_put(dlm);
-
 	if (!hlist_unhashed(&res->hash_node) ||
 	    !list_empty(&res->granted) ||
 	    !list_empty(&res->converting) ||
@@ -585,8 +583,6 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
 	res->migration_pending = 0;
 	res->inflight_locks = 0;
 
-	/* put in dlm_lockres_release */
-	dlm_grab(dlm);
 	res->dlm = dlm;
 
 	kref_init(&res->refs);
@@ -3050,8 +3046,6 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
 	/* check for pre-existing lock */
 	spin_lock(&dlm->spinlock);
 	res = __dlm_lookup_lockres(dlm, name, namelen, hash);
-	spin_lock(&dlm->master_lock);
-
 	if (res) {
 		spin_lock(&res->spinlock);
 		if (res->state & DLM_LOCK_RES_RECOVERING) {
@@ -3069,14 +3063,15 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
 		spin_unlock(&res->spinlock);
 	}
 
+	spin_lock(&dlm->master_lock);
 	/* ignore status.  only nonzero status would BUG. */
 	ret = dlm_add_migration_mle(dlm, res, mle, &oldmle,
 				    name, namelen,
 				    migrate->new_master,
 				    migrate->master);
 
-unlock:
 	spin_unlock(&dlm->master_lock);
+unlock:
 	spin_unlock(&dlm->spinlock);
 
 	if (oldmle) {
@@ -3438,3 +3433,43 @@ void dlm_lockres_release_ast(struct dlm_ctxt *dlm,
 	wake_up(&res->wq);
 	wake_up(&dlm->migration_wq);
 }
+
+void dlm_force_free_mles(struct dlm_ctxt *dlm)
+{
+	int i;
+	struct hlist_head *bucket;
+	struct dlm_master_list_entry *mle;
+	struct hlist_node *tmp, *list;
+
+	/*
+	 * We notified all other nodes that we are exiting the domain and
+	 * marked the dlm state to DLM_CTXT_LEAVING. If any mles are still
+	 * around we force free them and wake any processes that are waiting
+	 * on the mles
+	 */
+	spin_lock(&dlm->spinlock);
+	spin_lock(&dlm->master_lock);
+
+	BUG_ON(dlm->dlm_state != DLM_CTXT_LEAVING);
+	BUG_ON((find_next_bit(dlm->domain_map, O2NM_MAX_NODES, 0) < O2NM_MAX_NODES));
+
+	for (i = 0; i < DLM_HASH_BUCKETS; i++) {
+		bucket = dlm_master_hash(dlm, i);
+		hlist_for_each_safe(list, tmp, bucket) {
+			mle = hlist_entry(list, struct dlm_master_list_entry,
+					  master_hash_node);
+			if (mle->type != DLM_MLE_BLOCK) {
+				mlog(ML_ERROR, "bad mle: %p\n", mle);
+				dlm_print_one_mle(mle);
+			}
+			atomic_set(&mle->woken, 1);
+			wake_up(&mle->wq);
+
+			__dlm_unlink_mle(dlm, mle);
+			__dlm_mle_detach_hb_events(dlm, mle);
+			__dlm_put_mle(mle);
+		}
+	}
+	spin_unlock(&dlm->master_lock);
+	spin_unlock(&dlm->spinlock);
+}
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 9dfaac73b36..aaaffbcbe91 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1997,6 +1997,8 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
 	struct list_head *queue;
 	struct dlm_lock *lock, *next;
 
+	assert_spin_locked(&dlm->spinlock);
+	assert_spin_locked(&res->spinlock);
 	res->state |= DLM_LOCK_RES_RECOVERING;
 	if (!list_empty(&res->recovering)) {
 		mlog(0,
@@ -2326,19 +2328,15 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
 			/* zero the lvb if necessary */
 			dlm_revalidate_lvb(dlm, res, dead_node);
 			if (res->owner == dead_node) {
-				if (res->state & DLM_LOCK_RES_DROPPING_REF)
-					mlog(0, "%s:%.*s: owned by "
-					     "dead node %u, this node was "
-					     "dropping its ref when it died. "
-					     "continue, dropping the flag.\n",
-					     dlm->name, res->lockname.len,
-					     res->lockname.name, dead_node);
-
-				/* the wake_up for this will happen when the
-				 * RECOVERING flag is dropped later */
-				res->state &= ~DLM_LOCK_RES_DROPPING_REF;
+				if (res->state & DLM_LOCK_RES_DROPPING_REF) {
+					mlog(ML_NOTICE, "Ignore %.*s for "
+					     "recovery as it is being freed\n",
+					     res->lockname.len,
+					     res->lockname.name);
+				} else
+					dlm_move_lockres_to_recovery_list(dlm,
+									  res);
 
-				dlm_move_lockres_to_recovery_list(dlm, res);
 			} else if (res->owner == dlm->node_num) {
 				dlm_free_dead_locks(dlm, res, dead_node);
 				__dlm_lockres_calc_usage(dlm, res);
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index d4f73ca68fe..2211acf33d9 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -92,19 +92,27 @@ int __dlm_lockres_has_locks(struct dlm_lock_resource *res)
  * truly ready to be freed. */
 int __dlm_lockres_unused(struct dlm_lock_resource *res)
 {
-	if (!__dlm_lockres_has_locks(res) &&
-	    (list_empty(&res->dirty) && !(res->state & DLM_LOCK_RES_DIRTY))) {
-		/* try not to scan the bitmap unless the first two
-		 * conditions are already true */
-		int bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
-		if (bit >= O2NM_MAX_NODES) {
-			/* since the bit for dlm->node_num is not
-			 * set, inflight_locks better be zero */
-			BUG_ON(res->inflight_locks != 0);
-			return 1;
-		}
-	}
-	return 0;
+	int bit;
+
+	if (__dlm_lockres_has_locks(res))
+		return 0;
+
+	if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY)
+		return 0;
+
+	if (res->state & DLM_LOCK_RES_RECOVERING)
+		return 0;
+
+	bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
+	if (bit < O2NM_MAX_NODES)
+		return 0;
+
+	/*
+	 * since the bit for dlm->node_num is not set, inflight_locks better
+	 * be zero
+	 */
+	BUG_ON(res->inflight_locks != 0);
+	return 1;
 }
 
 
@@ -152,45 +160,25 @@ void dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
 	spin_unlock(&dlm->spinlock);
 }
 
-static int dlm_purge_lockres(struct dlm_ctxt *dlm,
+static void dlm_purge_lockres(struct dlm_ctxt *dlm,
 			     struct dlm_lock_resource *res)
 {
 	int master;
 	int ret = 0;
 
-	spin_lock(&res->spinlock);
-	if (!__dlm_lockres_unused(res)) {
-		mlog(0, "%s:%.*s: tried to purge but not unused\n",
-		     dlm->name, res->lockname.len, res->lockname.name);
-		__dlm_print_one_lock_resource(res);
-		spin_unlock(&res->spinlock);
-		BUG();
-	}
-
-	if (res->state & DLM_LOCK_RES_MIGRATING) {
-		mlog(0, "%s:%.*s: Delay dropref as this lockres is "
-		     "being remastered\n", dlm->name, res->lockname.len,
-		     res->lockname.name);
-		/* Re-add the lockres to the end of the purge list */
-		if (!list_empty(&res->purge)) {
-			list_del_init(&res->purge);
-			list_add_tail(&res->purge, &dlm->purge_list);
-		}
-		spin_unlock(&res->spinlock);
-		return 0;
-	}
+	assert_spin_locked(&dlm->spinlock);
+	assert_spin_locked(&res->spinlock);
 
 	master = (res->owner == dlm->node_num);
 
-	if (!master)
-		res->state |= DLM_LOCK_RES_DROPPING_REF;
-	spin_unlock(&res->spinlock);
 
 	mlog(0, "purging lockres %.*s, master = %d\n", res->lockname.len,
 	     res->lockname.name, master);
 
 	if (!master) {
+		res->state |= DLM_LOCK_RES_DROPPING_REF;
 		/* drop spinlock...  retake below */
+		spin_unlock(&res->spinlock);
 		spin_unlock(&dlm->spinlock);
 
 		spin_lock(&res->spinlock);
@@ -208,31 +196,35 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm,
 		mlog(0, "%s:%.*s: dlm_deref_lockres returned %d\n",
 		     dlm->name, res->lockname.len, res->lockname.name, ret);
 		spin_lock(&dlm->spinlock);
+		spin_lock(&res->spinlock);
 	}
 
-	spin_lock(&res->spinlock);
 	if (!list_empty(&res->purge)) {
 		mlog(0, "removing lockres %.*s:%p from purgelist, "
 		     "master = %d\n", res->lockname.len, res->lockname.name,
 		     res, master);
 		list_del_init(&res->purge);
-		spin_unlock(&res->spinlock);
 		dlm_lockres_put(res);
 		dlm->purge_count--;
-	} else
-		spin_unlock(&res->spinlock);
+	}
+
+	if (!__dlm_lockres_unused(res)) {
+		mlog(ML_ERROR, "found lockres %s:%.*s: in use after deref\n",
+		     dlm->name, res->lockname.len, res->lockname.name);
+		__dlm_print_one_lock_resource(res);
+		BUG();
+	}
 
 	__dlm_unhash_lockres(res);
 
 	/* lockres is not in the hash now.  drop the flag and wake up
 	 * any processes waiting in dlm_get_lock_resource. */
 	if (!master) {
-		spin_lock(&res->spinlock);
 		res->state &= ~DLM_LOCK_RES_DROPPING_REF;
 		spin_unlock(&res->spinlock);
 		wake_up(&res->wq);
-	}
-	return 0;
+	} else
+		spin_unlock(&res->spinlock);
 }
 
 static void dlm_run_purge_list(struct dlm_ctxt *dlm,
@@ -251,17 +243,7 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm,
 		lockres = list_entry(dlm->purge_list.next,
 				     struct dlm_lock_resource, purge);
 
-		/* Status of the lockres *might* change so double
-		 * check. If the lockres is unused, holding the dlm
-		 * spinlock will prevent people from getting and more
-		 * refs on it -- there's no need to keep the lockres
-		 * spinlock. */
 		spin_lock(&lockres->spinlock);
-		unused = __dlm_lockres_unused(lockres);
-		spin_unlock(&lockres->spinlock);
-
-		if (!unused)
-			continue;
 
 		purge_jiffies = lockres->last_used +
 			msecs_to_jiffies(DLM_PURGE_INTERVAL_MS);
@@ -273,15 +255,29 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm,
 			 * in tail order, we can stop at the first
 			 * unpurgable resource -- anyone added after
 			 * him will have a greater last_used value */
+			spin_unlock(&lockres->spinlock);
 			break;
 		}
 
+		/* Status of the lockres *might* change so double
+		 * check. If the lockres is unused, holding the dlm
+		 * spinlock will prevent people from getting and more
+		 * refs on it. */
+		unused = __dlm_lockres_unused(lockres);
+		if (!unused ||
+		    (lockres->state & DLM_LOCK_RES_MIGRATING)) {
+			mlog(0, "lockres %s:%.*s: is in use or "
+			     "being remastered, used %d, state %d\n",
+			     dlm->name, lockres->lockname.len,
+			     lockres->lockname.name, !unused, lockres->state);
+			list_move_tail(&dlm->purge_list, &lockres->purge);
+			spin_unlock(&lockres->spinlock);
+			continue;
+		}
+
 		dlm_lockres_get(lockres);
 
-		/* This may drop and reacquire the dlm spinlock if it
-		 * has to do migration. */
-		if (dlm_purge_lockres(dlm, lockres))
-			BUG();
+		dlm_purge_lockres(dlm, lockres);
 
 		dlm_lockres_put(lockres);
 
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index b83d6107a1f..b2df490a19e 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -182,8 +182,7 @@ static int dlmfs_file_release(struct inode *inode,
 {
 	int level, status;
 	struct dlmfs_inode_private *ip = DLMFS_I(inode);
-	struct dlmfs_filp_private *fp =
-		(struct dlmfs_filp_private *) file->private_data;
+	struct dlmfs_filp_private *fp = file->private_data;
 
 	if (S_ISDIR(inode->i_mode))
 		BUG();
@@ -214,10 +213,12 @@ static int dlmfs_file_setattr(struct dentry *dentry, struct iattr *attr)
 
 	attr->ia_valid &= ~ATTR_SIZE;
 	error = inode_change_ok(inode, attr);
-	if (!error)
-		error = inode_setattr(inode, attr);
+	if (error)
+		return error;
 
-	return error;
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+	return 0;
 }
 
 static unsigned int dlmfs_file_poll(struct file *file, poll_table *wait)
@@ -355,13 +356,12 @@ static void dlmfs_destroy_inode(struct inode *inode)
 	kmem_cache_free(dlmfs_inode_cache, DLMFS_I(inode));
 }
 
-static void dlmfs_clear_inode(struct inode *inode)
+static void dlmfs_evict_inode(struct inode *inode)
 {
 	int status;
 	struct dlmfs_inode_private *ip;
 
-	if (!inode)
-		return;
+	end_writeback(inode);
 
 	mlog(0, "inode %lu\n", inode->i_ino);
 
@@ -400,6 +400,7 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb)
 	if (inode) {
 		ip = DLMFS_I(inode);
 
+		inode->i_ino = get_next_ino();
 		inode->i_mode = mode;
 		inode->i_uid = current_fsuid();
 		inode->i_gid = current_fsgid();
@@ -425,6 +426,7 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
 	if (!inode)
 		return NULL;
 
+	inode->i_ino = get_next_ino();
 	inode->i_mode = mode;
 	inode->i_uid = current_fsuid();
 	inode->i_gid = current_fsgid();
@@ -612,6 +614,7 @@ static const struct file_operations dlmfs_file_operations = {
 	.poll		= dlmfs_file_poll,
 	.read		= dlmfs_file_read,
 	.write		= dlmfs_file_write,
+	.llseek		= default_llseek,
 };
 
 static const struct inode_operations dlmfs_dir_inode_operations = {
@@ -631,7 +634,7 @@ static const struct super_operations dlmfs_ops = {
 	.statfs		= simple_statfs,
 	.alloc_inode	= dlmfs_alloc_inode,
 	.destroy_inode	= dlmfs_destroy_inode,
-	.clear_inode	= dlmfs_clear_inode,
+	.evict_inode	= dlmfs_evict_inode,
 	.drop_inode	= generic_delete_inode,
 };
 
@@ -640,16 +643,16 @@ static const struct inode_operations dlmfs_file_inode_operations = {
 	.setattr	= dlmfs_file_setattr,
 };
 
-static int dlmfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *dlmfs_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_nodev(fs_type, flags, data, dlmfs_fill_super, mnt);
+	return mount_nodev(fs_type, flags, data, dlmfs_fill_super);
 }
 
 static struct file_system_type dlmfs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "ocfs2_dlmfs",
-	.get_sb		= dlmfs_get_sb,
+	.mount		= dlmfs_mount,
 	.kill_sb	= kill_litter_super,
 };
 
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 39eb16ac5f9..e8d94d722ec 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2966,7 +2966,7 @@ static const struct seq_operations ocfs2_dlm_seq_ops = {
 
 static int ocfs2_dlm_debug_release(struct inode *inode, struct file *file)
 {
-	struct seq_file *seq = (struct seq_file *) file->private_data;
+	struct seq_file *seq = file->private_data;
 	struct ocfs2_dlm_seq_priv *priv = seq->private;
 	struct ocfs2_lock_res *res = &priv->p_iter_res;
 
@@ -3000,7 +3000,7 @@ static int ocfs2_dlm_debug_open(struct inode *inode, struct file *file)
 		goto out;
 	}
 
-	seq = (struct seq_file *) file->private_data;
+	seq = file->private_data;
 	seq->private = priv;
 
 	ocfs2_add_lockres_tracking(&priv->p_iter_res,
@@ -3635,10 +3635,18 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
 {
 	struct inode *inode;
 	struct address_space *mapping;
+	struct ocfs2_inode_info *oi;
 
        	inode = ocfs2_lock_res_inode(lockres);
 	mapping = inode->i_mapping;
 
+	if (S_ISDIR(inode->i_mode)) {
+		oi = OCFS2_I(inode);
+		oi->ip_dir_lock_gen++;
+		mlog(0, "generation: %u\n", oi->ip_dir_lock_gen);
+		goto out;
+	}
+
 	if (!S_ISREG(inode->i_mode))
 		goto out;
 
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index d1ce48e1b3d..1d596d8c4a4 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -84,6 +84,7 @@ enum {
 	OI_LS_PARENT,
 	OI_LS_RENAME1,
 	OI_LS_RENAME2,
+	OI_LS_REFLINK_TARGET,
 };
 
 int ocfs2_dlm_init(struct ocfs2_super *osb);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 2b10b36d157..77b4c04a280 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -36,6 +36,7 @@
 #include <linux/writeback.h>
 #include <linux/falloc.h>
 #include <linux/quotaops.h>
+#include <linux/blkdev.h>
 
 #define MLOG_MASK_PREFIX ML_INODE
 #include <cluster/masklog.h>
@@ -63,12 +64,6 @@
 
 #include "buffer_head_io.h"
 
-static int ocfs2_sync_inode(struct inode *inode)
-{
-	filemap_fdatawrite(inode->i_mapping);
-	return sync_mapping_buffers(inode->i_mapping);
-}
-
 static int ocfs2_init_file_private(struct inode *inode, struct file *file)
 {
 	struct ocfs2_file_private *fp;
@@ -179,19 +174,22 @@ static int ocfs2_sync_file(struct file *file, int datasync)
 {
 	int err = 0;
 	journal_t *journal;
-	struct dentry *dentry = file->f_path.dentry;
 	struct inode *inode = file->f_mapping->host;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
-	mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync,
-		   dentry->d_name.len, dentry->d_name.name);
-
-	err = ocfs2_sync_inode(dentry->d_inode);
-	if (err)
-		goto bail;
+	mlog_entry("(0x%p, %d, 0x%p, '%.*s')\n", file, datasync,
+		   file->f_path.dentry, file->f_path.dentry->d_name.len,
+		   file->f_path.dentry->d_name.name);
 
-	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
+	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) {
+		/*
+		 * We still have to flush drive's caches to get data to the
+		 * platter
+		 */
+		if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
+			blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
 		goto bail;
+	}
 
 	journal = osb->journal->j_journal;
 	err = jbd2_journal_force_commit(journal);
@@ -361,7 +359,7 @@ static int ocfs2_cow_file_pos(struct inode *inode,
 	if (!(ext_flags & OCFS2_EXT_REFCOUNTED))
 		goto out;
 
-	return ocfs2_refcount_cow(inode, fe_bh, cpos, 1, cpos+1);
+	return ocfs2_refcount_cow(inode, NULL, fe_bh, cpos, 1, cpos+1);
 
 out:
 	return status;
@@ -774,7 +772,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
 	BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT));
 	BUG_ON(abs_from & (inode->i_blkbits - 1));
 
-	page = grab_cache_page(mapping, index);
+	page = find_or_create_page(mapping, index, GFP_NOFS);
 	if (!page) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
@@ -798,13 +796,12 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
 		block_end = block_start + (1 << inode->i_blkbits);
 
 		/*
-		 * block_start is block-aligned.  Bump it by one to
-		 * force ocfs2_{prepare,commit}_write() to zero the
+		 * block_start is block-aligned.  Bump it by one to force
+		 * __block_write_begin and block_commit_write to zero the
 		 * whole block.
 		 */
-		ret = ocfs2_prepare_write_nolock(inode, page,
-						 block_start + 1,
-						 block_start + 1);
+		ret = __block_write_begin(page, block_start + 1, 0,
+					  ocfs2_get_block);
 		if (ret < 0) {
 			mlog_errno(ret);
 			goto out_unlock;
@@ -904,8 +901,8 @@ static int ocfs2_zero_extend_get_range(struct inode *inode,
 		zero_clusters = last_cpos - zero_cpos;
 
 	if (needs_cow) {
-		rc = ocfs2_refcount_cow(inode, di_bh, zero_cpos, zero_clusters,
-					UINT_MAX);
+		rc = ocfs2_refcount_cow(inode, NULL, di_bh, zero_cpos,
+					zero_clusters, UINT_MAX);
 		if (rc) {
 			mlog_errno(rc);
 			goto out;
@@ -1233,18 +1230,26 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 	}
 
 	/*
-	 * This will intentionally not wind up calling simple_setsize(),
+	 * This will intentionally not wind up calling truncate_setsize(),
 	 * since all the work for a size change has been done above.
 	 * Otherwise, we could get into problems with truncate as
 	 * ip_alloc_sem is used there to protect against i_size
 	 * changes.
+	 *
+	 * XXX: this means the conditional below can probably be removed.
 	 */
-	status = inode_setattr(inode, attr);
-	if (status < 0) {
-		mlog_errno(status);
-		goto bail_commit;
+	if ((attr->ia_valid & ATTR_SIZE) &&
+	    attr->ia_size != i_size_read(inode)) {
+		status = vmtruncate(inode, attr->ia_size);
+		if (status) {
+			mlog_errno(status);
+			goto bail_commit;
+		}
 	}
 
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+
 	status = ocfs2_mark_inode_dirty(handle, inode, bh);
 	if (status < 0)
 		mlog_errno(status);
@@ -2045,6 +2050,7 @@ out:
 }
 
 static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
+					    struct file *file,
 					    loff_t pos, size_t count,
 					    int *meta_level)
 {
@@ -2062,7 +2068,7 @@ static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
 
 	*meta_level = 1;
 
-	ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX);
+	ret = ocfs2_refcount_cow(inode, file, di_bh, cpos, clusters, UINT_MAX);
 	if (ret)
 		mlog_errno(ret);
 out:
@@ -2070,7 +2076,7 @@ out:
 	return ret;
 }
 
-static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
+static int ocfs2_prepare_inode_for_write(struct file *file,
 					 loff_t *ppos,
 					 size_t count,
 					 int appending,
@@ -2078,6 +2084,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
 					 int *has_refcount)
 {
 	int ret = 0, meta_level = 0;
+	struct dentry *dentry = file->f_path.dentry;
 	struct inode *inode = dentry->d_inode;
 	loff_t saved_pos, end;
 
@@ -2133,6 +2140,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
 			meta_level = -1;
 
 			ret = ocfs2_prepare_inode_for_refcount(inode,
+							       file,
 							       saved_pos,
 							       count,
 							       &meta_level);
@@ -2215,6 +2223,8 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_path.dentry->d_inode;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	int full_coherency = !(osb->s_mount_opt &
+			       OCFS2_MOUNT_COHERENCY_BUFFERED);
 
 	mlog_entry("(0x%p, %u, '%.*s')\n", file,
 		   (unsigned int)nr_segs,
@@ -2238,16 +2248,39 @@ relock:
 		have_alloc_sem = 1;
 	}
 
-	/* concurrent O_DIRECT writes are allowed */
-	rw_level = !direct_io;
+	/*
+	 * Concurrent O_DIRECT writes are allowed with
+	 * mount_option "coherency=buffered".
+	 */
+	rw_level = (!direct_io || full_coherency);
+
 	ret = ocfs2_rw_lock(inode, rw_level);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out_sems;
 	}
 
+	/*
+	 * O_DIRECT writes with "coherency=full" need to take EX cluster
+	 * inode_lock to guarantee coherency.
+	 */
+	if (direct_io && full_coherency) {
+		/*
+		 * We need to take and drop the inode lock to force
+		 * other nodes to drop their caches.  Buffered I/O
+		 * already does this in write_begin().
+		 */
+		ret = ocfs2_inode_lock(inode, NULL, 1);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out_sems;
+		}
+
+		ocfs2_inode_unlock(inode, 1);
+	}
+
 	can_do_direct = direct_io;
-	ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos,
+	ret = ocfs2_prepare_inode_for_write(file, ppos,
 					    iocb->ki_left, appending,
 					    &can_do_direct, &has_refcount);
 	if (ret < 0) {
@@ -2295,17 +2328,6 @@ relock:
 		written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
 						    ppos, count, ocount);
 		if (written < 0) {
-			/*
-			 * direct write may have instantiated a few
-			 * blocks outside i_size. Trim these off again.
-			 * Don't need i_size_read because we hold i_mutex.
-			 *
-			 * XXX(hch): this looks buggy because ocfs2 did not
-			 * actually implement ->truncate.  Take a look at
-			 * the new truncate sequence and update this accordingly
-			 */
-			if (*ppos + count > inode->i_size)
-				simple_setsize(inode, inode->i_size);
 			ret = written;
 			goto out_dio;
 		}
@@ -2321,7 +2343,7 @@ out_dio:
 	BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
 
 	if (((file->f_flags & O_DSYNC) && !direct_io) || IS_SYNC(inode) ||
-	    ((file->f_flags & O_DIRECT) && has_refcount)) {
+	    ((file->f_flags & O_DIRECT) && !direct_io)) {
 		ret = filemap_fdatawrite_range(file->f_mapping, pos,
 					       pos + count - 1);
 		if (ret < 0)
@@ -2377,7 +2399,7 @@ static int ocfs2_splice_to_file(struct pipe_inode_info *pipe,
 {
 	int ret;
 
-	ret = ocfs2_prepare_inode_for_write(out->f_path.dentry,	&sd->pos,
+	ret = ocfs2_prepare_inode_for_write(out, &sd->pos,
 					    sd->total_len, 0, NULL, NULL);
 	if (ret < 0) {
 		mlog_errno(ret);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index abb0a95cc71..f935fd6600d 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -335,6 +335,7 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
 		    else
 			    inode->i_fop = &ocfs2_dops_no_plocks;
 		    i_size_write(inode, le64_to_cpu(fe->i_size));
+		    OCFS2_I(inode)->ip_dir_lock_gen = 1;
 		    break;
 	    case S_IFLNK:
 		    if (ocfs2_inode_is_fast_symlink(inode))
@@ -488,7 +489,11 @@ static int ocfs2_read_locked_inode(struct inode *inode,
 						     OCFS2_BH_IGNORE_CACHE);
 	} else {
 		status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh);
-		if (!status)
+		/*
+		 * If buffer is in jbd, then its checksum may not have been
+		 * computed as yet.
+		 */
+		if (!status && !buffer_jbd(bh))
 			status = ocfs2_validate_inode_block(osb->sb, bh);
 	}
 	if (status < 0) {
@@ -969,7 +974,7 @@ static void ocfs2_cleanup_delete_inode(struct inode *inode,
 	truncate_inode_pages(&inode->i_data, 0);
 }
 
-void ocfs2_delete_inode(struct inode *inode)
+static void ocfs2_delete_inode(struct inode *inode)
 {
 	int wipe, status;
 	sigset_t oldset;
@@ -1075,20 +1080,17 @@ bail_unlock_nfs_sync:
 bail_unblock:
 	ocfs2_unblock_signals(&oldset);
 bail:
-	clear_inode(inode);
 	mlog_exit_void();
 }
 
-void ocfs2_clear_inode(struct inode *inode)
+static void ocfs2_clear_inode(struct inode *inode)
 {
 	int status;
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 
 	mlog_entry_void();
 
-	if (!inode)
-		goto bail;
-
+	end_writeback(inode);
 	mlog(0, "Clearing inode: %llu, nlink = %u\n",
 	     (unsigned long long)OCFS2_I(inode)->ip_blkno, inode->i_nlink);
 
@@ -1180,16 +1182,27 @@ void ocfs2_clear_inode(struct inode *inode)
 	jbd2_journal_release_jbd_inode(OCFS2_SB(inode->i_sb)->journal->j_journal,
 				       &oi->ip_jinode);
 
-bail:
 	mlog_exit_void();
 }
 
+void ocfs2_evict_inode(struct inode *inode)
+{
+	if (!inode->i_nlink ||
+	    (OCFS2_I(inode)->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)) {
+		ocfs2_delete_inode(inode);
+	} else {
+		truncate_inode_pages(&inode->i_data, 0);
+	}
+	ocfs2_clear_inode(inode);
+}
+
 /* Called under inode_lock, with no more references on the
  * struct inode, so it's safe here to check the flags field
  * and to manipulate i_nlink without any other locks. */
-void ocfs2_drop_inode(struct inode *inode)
+int ocfs2_drop_inode(struct inode *inode)
 {
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	int res;
 
 	mlog_entry_void();
 
@@ -1197,11 +1210,12 @@ void ocfs2_drop_inode(struct inode *inode)
 	     (unsigned long long)oi->ip_blkno, inode->i_nlink, oi->ip_flags);
 
 	if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)
-		generic_delete_inode(inode);
+		res = 1;
 	else
-		generic_drop_inode(inode);
+		res = generic_drop_inode(inode);
 
 	mlog_exit_void();
+	return res;
 }
 
 /*
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 9f5f5fcadc4..1c508b149b3 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -46,30 +46,28 @@ struct ocfs2_inode_info
 	/* These fields are protected by ip_lock */
 	spinlock_t			ip_lock;
 	u32				ip_open_count;
-	u32				ip_clusters;
 	struct list_head		ip_io_markers;
+	u32				ip_clusters;
 
+	u16				ip_dyn_features;
 	struct mutex			ip_io_mutex;
-
 	u32				ip_flags; /* see below */
 	u32				ip_attr; /* inode attributes */
-	u16				ip_dyn_features;
 
 	/* protected by recovery_lock. */
 	struct inode			*ip_next_orphan;
 
-	u32				ip_dir_start_lookup;
-
 	struct ocfs2_caching_info	ip_metadata_cache;
-
 	struct ocfs2_extent_map		ip_extent_map;
-
 	struct inode			vfs_inode;
 	struct jbd2_inode		ip_jinode;
 
+	u32				ip_dir_start_lookup;
+
 	/* Only valid if the inode is the dir. */
 	u32				ip_last_used_slot;
 	u64				ip_last_used_group;
+	u32				ip_dir_lock_gen;
 
 	struct ocfs2_alloc_reservation	ip_la_data_resv;
 };
@@ -123,9 +121,8 @@ static inline struct ocfs2_caching_info *INODE_CACHE(struct inode *inode)
 	return &OCFS2_I(inode)->ip_metadata_cache;
 }
 
-void ocfs2_clear_inode(struct inode *inode);
-void ocfs2_delete_inode(struct inode *inode);
-void ocfs2_drop_inode(struct inode *inode);
+void ocfs2_evict_inode(struct inode *inode);
+int ocfs2_drop_inode(struct inode *inode);
 
 /* Flags for ocfs2_iget() */
 #define OCFS2_FI_FLAG_SYSFILE		0x1
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 7d9d9c132ce..7a486819615 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -26,6 +26,26 @@
 
 #include <linux/ext2_fs.h>
 
+#define o2info_from_user(a, b)	\
+		copy_from_user(&(a), (b), sizeof(a))
+#define o2info_to_user(a, b)	\
+		copy_to_user((typeof(a) __user *)b, &(a), sizeof(a))
+
+/*
+ * This call is void because we are already reporting an error that may
+ * be -EFAULT.  The error will be returned from the ioctl(2) call.  It's
+ * just a best-effort to tell userspace that this request caused the error.
+ */
+static inline void __o2info_set_request_error(struct ocfs2_info_request *kreq,
+					struct ocfs2_info_request __user *req)
+{
+	kreq->ir_flags |= OCFS2_INFO_FL_ERROR;
+	(void)put_user(kreq->ir_flags, (__u32 __user *)&(req->ir_flags));
+}
+
+#define o2info_set_request_error(a, b) \
+		__o2info_set_request_error((struct ocfs2_info_request *)&(a), b)
+
 static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags)
 {
 	int status;
@@ -109,6 +129,328 @@ bail:
 	return status;
 }
 
+int ocfs2_info_handle_blocksize(struct inode *inode,
+				struct ocfs2_info_request __user *req)
+{
+	int status = -EFAULT;
+	struct ocfs2_info_blocksize oib;
+
+	if (o2info_from_user(oib, req))
+		goto bail;
+
+	oib.ib_blocksize = inode->i_sb->s_blocksize;
+	oib.ib_req.ir_flags |= OCFS2_INFO_FL_FILLED;
+
+	if (o2info_to_user(oib, req))
+		goto bail;
+
+	status = 0;
+bail:
+	if (status)
+		o2info_set_request_error(oib, req);
+
+	return status;
+}
+
+int ocfs2_info_handle_clustersize(struct inode *inode,
+				  struct ocfs2_info_request __user *req)
+{
+	int status = -EFAULT;
+	struct ocfs2_info_clustersize oic;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	if (o2info_from_user(oic, req))
+		goto bail;
+
+	oic.ic_clustersize = osb->s_clustersize;
+	oic.ic_req.ir_flags |= OCFS2_INFO_FL_FILLED;
+
+	if (o2info_to_user(oic, req))
+		goto bail;
+
+	status = 0;
+bail:
+	if (status)
+		o2info_set_request_error(oic, req);
+
+	return status;
+}
+
+int ocfs2_info_handle_maxslots(struct inode *inode,
+			       struct ocfs2_info_request __user *req)
+{
+	int status = -EFAULT;
+	struct ocfs2_info_maxslots oim;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	if (o2info_from_user(oim, req))
+		goto bail;
+
+	oim.im_max_slots = osb->max_slots;
+	oim.im_req.ir_flags |= OCFS2_INFO_FL_FILLED;
+
+	if (o2info_to_user(oim, req))
+		goto bail;
+
+	status = 0;
+bail:
+	if (status)
+		o2info_set_request_error(oim, req);
+
+	return status;
+}
+
+int ocfs2_info_handle_label(struct inode *inode,
+			    struct ocfs2_info_request __user *req)
+{
+	int status = -EFAULT;
+	struct ocfs2_info_label oil;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	if (o2info_from_user(oil, req))
+		goto bail;
+
+	memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN);
+	oil.il_req.ir_flags |= OCFS2_INFO_FL_FILLED;
+
+	if (o2info_to_user(oil, req))
+		goto bail;
+
+	status = 0;
+bail:
+	if (status)
+		o2info_set_request_error(oil, req);
+
+	return status;
+}
+
+int ocfs2_info_handle_uuid(struct inode *inode,
+			   struct ocfs2_info_request __user *req)
+{
+	int status = -EFAULT;
+	struct ocfs2_info_uuid oiu;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	if (o2info_from_user(oiu, req))
+		goto bail;
+
+	memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1);
+	oiu.iu_req.ir_flags |= OCFS2_INFO_FL_FILLED;
+
+	if (o2info_to_user(oiu, req))
+		goto bail;
+
+	status = 0;
+bail:
+	if (status)
+		o2info_set_request_error(oiu, req);
+
+	return status;
+}
+
+int ocfs2_info_handle_fs_features(struct inode *inode,
+				  struct ocfs2_info_request __user *req)
+{
+	int status = -EFAULT;
+	struct ocfs2_info_fs_features oif;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	if (o2info_from_user(oif, req))
+		goto bail;
+
+	oif.if_compat_features = osb->s_feature_compat;
+	oif.if_incompat_features = osb->s_feature_incompat;
+	oif.if_ro_compat_features = osb->s_feature_ro_compat;
+	oif.if_req.ir_flags |= OCFS2_INFO_FL_FILLED;
+
+	if (o2info_to_user(oif, req))
+		goto bail;
+
+	status = 0;
+bail:
+	if (status)
+		o2info_set_request_error(oif, req);
+
+	return status;
+}
+
+int ocfs2_info_handle_journal_size(struct inode *inode,
+				   struct ocfs2_info_request __user *req)
+{
+	int status = -EFAULT;
+	struct ocfs2_info_journal_size oij;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	if (o2info_from_user(oij, req))
+		goto bail;
+
+	oij.ij_journal_size = osb->journal->j_inode->i_size;
+
+	oij.ij_req.ir_flags |= OCFS2_INFO_FL_FILLED;
+
+	if (o2info_to_user(oij, req))
+		goto bail;
+
+	status = 0;
+bail:
+	if (status)
+		o2info_set_request_error(oij, req);
+
+	return status;
+}
+
+int ocfs2_info_handle_unknown(struct inode *inode,
+			      struct ocfs2_info_request __user *req)
+{
+	int status = -EFAULT;
+	struct ocfs2_info_request oir;
+
+	if (o2info_from_user(oir, req))
+		goto bail;
+
+	oir.ir_flags &= ~OCFS2_INFO_FL_FILLED;
+
+	if (o2info_to_user(oir, req))
+		goto bail;
+
+	status = 0;
+bail:
+	if (status)
+		o2info_set_request_error(oir, req);
+
+	return status;
+}
+
+/*
+ * Validate and distinguish OCFS2_IOC_INFO requests.
+ *
+ * - validate the magic number.
+ * - distinguish different requests.
+ * - validate size of different requests.
+ */
+int ocfs2_info_handle_request(struct inode *inode,
+			      struct ocfs2_info_request __user *req)
+{
+	int status = -EFAULT;
+	struct ocfs2_info_request oir;
+
+	if (o2info_from_user(oir, req))
+		goto bail;
+
+	status = -EINVAL;
+	if (oir.ir_magic != OCFS2_INFO_MAGIC)
+		goto bail;
+
+	switch (oir.ir_code) {
+	case OCFS2_INFO_BLOCKSIZE:
+		if (oir.ir_size == sizeof(struct ocfs2_info_blocksize))
+			status = ocfs2_info_handle_blocksize(inode, req);
+		break;
+	case OCFS2_INFO_CLUSTERSIZE:
+		if (oir.ir_size == sizeof(struct ocfs2_info_clustersize))
+			status = ocfs2_info_handle_clustersize(inode, req);
+		break;
+	case OCFS2_INFO_MAXSLOTS:
+		if (oir.ir_size == sizeof(struct ocfs2_info_maxslots))
+			status = ocfs2_info_handle_maxslots(inode, req);
+		break;
+	case OCFS2_INFO_LABEL:
+		if (oir.ir_size == sizeof(struct ocfs2_info_label))
+			status = ocfs2_info_handle_label(inode, req);
+		break;
+	case OCFS2_INFO_UUID:
+		if (oir.ir_size == sizeof(struct ocfs2_info_uuid))
+			status = ocfs2_info_handle_uuid(inode, req);
+		break;
+	case OCFS2_INFO_FS_FEATURES:
+		if (oir.ir_size == sizeof(struct ocfs2_info_fs_features))
+			status = ocfs2_info_handle_fs_features(inode, req);
+		break;
+	case OCFS2_INFO_JOURNAL_SIZE:
+		if (oir.ir_size == sizeof(struct ocfs2_info_journal_size))
+			status = ocfs2_info_handle_journal_size(inode, req);
+		break;
+	default:
+		status = ocfs2_info_handle_unknown(inode, req);
+		break;
+	}
+
+bail:
+	return status;
+}
+
+int ocfs2_get_request_ptr(struct ocfs2_info *info, int idx,
+			  u64 *req_addr, int compat_flag)
+{
+	int status = -EFAULT;
+	u64 __user *bp = NULL;
+
+	if (compat_flag) {
+#ifdef CONFIG_COMPAT
+		/*
+		 * pointer bp stores the base address of a pointers array,
+		 * which collects all addresses of separate request.
+		 */
+		bp = (u64 __user *)(unsigned long)compat_ptr(info->oi_requests);
+#else
+		BUG();
+#endif
+	} else
+		bp = (u64 __user *)(unsigned long)(info->oi_requests);
+
+	if (o2info_from_user(*req_addr, bp + idx))
+		goto bail;
+
+	status = 0;
+bail:
+	return status;
+}
+
+/*
+ * OCFS2_IOC_INFO handles an array of requests passed from userspace.
+ *
+ * ocfs2_info_handle() recevies a large info aggregation, grab and
+ * validate the request count from header, then break it into small
+ * pieces, later specific handlers can handle them one by one.
+ *
+ * Idea here is to make each separate request small enough to ensure
+ * a better backward&forward compatibility, since a small piece of
+ * request will be less likely to be broken if disk layout get changed.
+ */
+int ocfs2_info_handle(struct inode *inode, struct ocfs2_info *info,
+		      int compat_flag)
+{
+	int i, status = 0;
+	u64 req_addr;
+	struct ocfs2_info_request __user *reqp;
+
+	if ((info->oi_count > OCFS2_INFO_MAX_REQUEST) ||
+	    (!info->oi_requests)) {
+		status = -EINVAL;
+		goto bail;
+	}
+
+	for (i = 0; i < info->oi_count; i++) {
+
+		status = ocfs2_get_request_ptr(info, i, &req_addr, compat_flag);
+		if (status)
+			break;
+
+		reqp = (struct ocfs2_info_request *)(unsigned long)req_addr;
+		if (!reqp) {
+			status = -EINVAL;
+			goto bail;
+		}
+
+		status = ocfs2_info_handle_request(inode, reqp);
+		if (status)
+			break;
+	}
+
+bail:
+	return status;
+}
+
 long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
 	struct inode *inode = filp->f_path.dentry->d_inode;
@@ -120,6 +462,7 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 	struct reflink_arguments args;
 	const char *old_path, *new_path;
 	bool preserve;
+	struct ocfs2_info info;
 
 	switch (cmd) {
 	case OCFS2_IOC_GETFLAGS:
@@ -174,6 +517,12 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		preserve = (args.preserve != 0);
 
 		return ocfs2_reflink_ioctl(inode, old_path, new_path, preserve);
+	case OCFS2_IOC_INFO:
+		if (copy_from_user(&info, (struct ocfs2_info __user *)arg,
+				   sizeof(struct ocfs2_info)))
+			return -EFAULT;
+
+		return ocfs2_info_handle(inode, &info, 0);
 	default:
 		return -ENOTTY;
 	}
@@ -185,6 +534,7 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 	bool preserve;
 	struct reflink_arguments args;
 	struct inode *inode = file->f_path.dentry->d_inode;
+	struct ocfs2_info info;
 
 	switch (cmd) {
 	case OCFS2_IOC32_GETFLAGS:
@@ -209,6 +559,12 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 
 		return ocfs2_reflink_ioctl(inode, compat_ptr(args.old_path),
 					   compat_ptr(args.new_path), preserve);
+	case OCFS2_IOC_INFO:
+		if (copy_from_user(&info, (struct ocfs2_info __user *)arg,
+				   sizeof(struct ocfs2_info)))
+			return -EFAULT;
+
+		return ocfs2_info_handle(inode, &info, 1);
 	default:
 		return -ENOIOCTLCMD;
 	}
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 625de9d7088..faa2303dbf0 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -301,7 +301,6 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb)
 {
 	int status = 0;
 	unsigned int flushed;
-	unsigned long old_id;
 	struct ocfs2_journal *journal = NULL;
 
 	mlog_entry_void();
@@ -326,7 +325,7 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb)
 		goto finally;
 	}
 
-	old_id = ocfs2_inc_trans_id(journal);
+	ocfs2_inc_trans_id(journal);
 
 	flushed = atomic_read(&journal->j_num_trans);
 	atomic_set(&journal->j_num_trans, 0);
@@ -342,9 +341,6 @@ finally:
 	return status;
 }
 
-/* pass it NULL and it will allocate a new handle object for you.  If
- * you pass it a handle however, it may still return error, in which
- * case it has free'd the passed handle for you. */
 handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
 {
 	journal_t *journal = osb->journal->j_journal;
@@ -760,13 +756,13 @@ void ocfs2_set_journal_params(struct ocfs2_super *osb)
 	if (osb->osb_commit_interval)
 		commit_interval = osb->osb_commit_interval;
 
-	spin_lock(&journal->j_state_lock);
+	write_lock(&journal->j_state_lock);
 	journal->j_commit_interval = commit_interval;
 	if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
 		journal->j_flags |= JBD2_BARRIER;
 	else
 		journal->j_flags &= ~JBD2_BARRIER;
-	spin_unlock(&journal->j_state_lock);
+	write_unlock(&journal->j_state_lock);
 }
 
 int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
@@ -1888,6 +1884,8 @@ void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
 
 	os = &osb->osb_orphan_scan;
 
+	mlog(0, "Begin orphan scan\n");
+
 	if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE)
 		goto out;
 
@@ -1920,6 +1918,7 @@ void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
 unlock:
 	ocfs2_orphan_scan_unlock(osb, seqno);
 out:
+	mlog(0, "Orphan scan completed\n");
 	return;
 }
 
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index b5baaa8e710..43e56b97f9c 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -67,11 +67,12 @@ struct ocfs2_journal {
 	struct buffer_head        *j_bh;      /* Journal disk inode block */
 	atomic_t                  j_num_trans; /* Number of transactions
 					        * currently in the system. */
+	spinlock_t                j_lock;
 	unsigned long             j_trans_id;
 	struct rw_semaphore       j_trans_barrier;
 	wait_queue_head_t         j_checkpointed;
 
-	spinlock_t                j_lock;
+	/* both fields protected by j_lock*/
 	struct list_head          j_la_cleanups;
 	struct work_struct        j_recovery_work;
 };
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index af2b8fe1f13..7e32db9c2c9 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -59,10 +59,11 @@ static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf)
 	return ret;
 }
 
-static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh,
+static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
 				struct page *page)
 {
 	int ret;
+	struct inode *inode = file->f_path.dentry->d_inode;
 	struct address_space *mapping = inode->i_mapping;
 	loff_t pos = page_offset(page);
 	unsigned int len = PAGE_CACHE_SIZE;
@@ -74,9 +75,11 @@ static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh,
 	/*
 	 * Another node might have truncated while we were waiting on
 	 * cluster locks.
+	 * We don't check size == 0 before the shift. This is borrowed
+	 * from do_generic_file_read.
 	 */
-	last_index = size >> PAGE_CACHE_SHIFT;
-	if (page->index > last_index) {
+	last_index = (size - 1) >> PAGE_CACHE_SHIFT;
+	if (unlikely(!size || page->index > last_index)) {
 		ret = -EINVAL;
 		goto out;
 	}
@@ -107,9 +110,9 @@ static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh,
 	 * because the "write" would invalidate their data.
 	 */
 	if (page->index == last_index)
-		len = size & ~PAGE_CACHE_MASK;
+		len = ((size - 1) & ~PAGE_CACHE_MASK) + 1;
 
-	ret = ocfs2_write_begin_nolock(mapping, pos, len, 0, &locked_page,
+	ret = ocfs2_write_begin_nolock(file, mapping, pos, len, 0, &locked_page,
 				       &fsdata, di_bh, page);
 	if (ret) {
 		if (ret != -ENOSPC)
@@ -157,7 +160,7 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	 */
 	down_write(&OCFS2_I(inode)->ip_alloc_sem);
 
-	ret = __ocfs2_page_mkwrite(inode, di_bh, page);
+	ret = __ocfs2_page_mkwrite(vma->vm_file, di_bh, page);
 
 	up_write(&OCFS2_I(inode)->ip_alloc_sem);
 
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index f171b51a74f..ff5744e1e36 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -171,7 +171,8 @@ bail_add:
 			ret = ERR_PTR(status);
 			goto bail_unlock;
 		}
-	}
+	} else
+		ocfs2_dentry_attach_gen(dentry);
 
 bail_unlock:
 	/* Don't drop the cluster lock until *after* the d_add --
@@ -472,32 +473,23 @@ leave:
 	return status;
 }
 
-static int ocfs2_mknod_locked(struct ocfs2_super *osb,
-			      struct inode *dir,
-			      struct inode *inode,
-			      dev_t dev,
-			      struct buffer_head **new_fe_bh,
-			      struct buffer_head *parent_fe_bh,
-			      handle_t *handle,
-			      struct ocfs2_alloc_context *inode_ac)
+static int __ocfs2_mknod_locked(struct inode *dir,
+				struct inode *inode,
+				dev_t dev,
+				struct buffer_head **new_fe_bh,
+				struct buffer_head *parent_fe_bh,
+				handle_t *handle,
+				struct ocfs2_alloc_context *inode_ac,
+				u64 fe_blkno, u64 suballoc_loc, u16 suballoc_bit)
 {
 	int status = 0;
+	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
 	struct ocfs2_dinode *fe = NULL;
 	struct ocfs2_extent_list *fel;
-	u64 suballoc_loc, fe_blkno = 0;
-	u16 suballoc_bit;
 	u16 feat;
 
 	*new_fe_bh = NULL;
 
-	status = ocfs2_claim_new_inode(handle, dir, parent_fe_bh,
-				       inode_ac, &suballoc_loc,
-				       &suballoc_bit, &fe_blkno);
-	if (status < 0) {
-		mlog_errno(status);
-		goto leave;
-	}
-
 	/* populate as many fields early on as possible - many of
 	 * these are used by the support functions here and in
 	 * callers. */
@@ -591,6 +583,34 @@ leave:
 	return status;
 }
 
+static int ocfs2_mknod_locked(struct ocfs2_super *osb,
+			      struct inode *dir,
+			      struct inode *inode,
+			      dev_t dev,
+			      struct buffer_head **new_fe_bh,
+			      struct buffer_head *parent_fe_bh,
+			      handle_t *handle,
+			      struct ocfs2_alloc_context *inode_ac)
+{
+	int status = 0;
+	u64 suballoc_loc, fe_blkno = 0;
+	u16 suballoc_bit;
+
+	*new_fe_bh = NULL;
+
+	status = ocfs2_claim_new_inode(handle, dir, parent_fe_bh,
+				       inode_ac, &suballoc_loc,
+				       &suballoc_bit, &fe_blkno);
+	if (status < 0) {
+		mlog_errno(status);
+		return status;
+	}
+
+	return __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh,
+				    parent_fe_bh, handle, inode_ac,
+				    fe_blkno, suballoc_loc, suballoc_bit);
+}
+
 static int ocfs2_mkdir(struct inode *dir,
 		       struct dentry *dentry,
 		       int mode)
@@ -722,7 +742,7 @@ static int ocfs2_link(struct dentry *old_dentry,
 		goto out_commit;
 	}
 
-	atomic_inc(&inode->i_count);
+	ihold(inode);
 	dentry->d_op = &ocfs2_dentry_ops;
 	d_instantiate(dentry, inode);
 
@@ -1852,61 +1872,117 @@ bail:
 	return status;
 }
 
-static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
-				    struct inode **ret_orphan_dir,
-				    u64 blkno,
-				    char *name,
-				    struct ocfs2_dir_lookup_result *lookup)
+static int ocfs2_lookup_lock_orphan_dir(struct ocfs2_super *osb,
+					struct inode **ret_orphan_dir,
+					struct buffer_head **ret_orphan_dir_bh)
 {
 	struct inode *orphan_dir_inode;
 	struct buffer_head *orphan_dir_bh = NULL;
-	int status = 0;
-
-	status = ocfs2_blkno_stringify(blkno, name);
-	if (status < 0) {
-		mlog_errno(status);
-		return status;
-	}
+	int ret = 0;
 
 	orphan_dir_inode = ocfs2_get_system_file_inode(osb,
 						       ORPHAN_DIR_SYSTEM_INODE,
 						       osb->slot_num);
 	if (!orphan_dir_inode) {
-		status = -ENOENT;
-		mlog_errno(status);
-		return status;
+		ret = -ENOENT;
+		mlog_errno(ret);
+		return ret;
 	}
 
 	mutex_lock(&orphan_dir_inode->i_mutex);
 
-	status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
-	if (status < 0) {
-		mlog_errno(status);
-		goto leave;
+	ret = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
+	if (ret < 0) {
+		mutex_unlock(&orphan_dir_inode->i_mutex);
+		iput(orphan_dir_inode);
+
+		mlog_errno(ret);
+		return ret;
 	}
 
-	status = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode,
-					      orphan_dir_bh, name,
-					      OCFS2_ORPHAN_NAMELEN, lookup);
-	if (status < 0) {
-		ocfs2_inode_unlock(orphan_dir_inode, 1);
+	*ret_orphan_dir = orphan_dir_inode;
+	*ret_orphan_dir_bh = orphan_dir_bh;
 
-		mlog_errno(status);
-		goto leave;
+	return 0;
+}
+
+static int __ocfs2_prepare_orphan_dir(struct inode *orphan_dir_inode,
+				      struct buffer_head *orphan_dir_bh,
+				      u64 blkno,
+				      char *name,
+				      struct ocfs2_dir_lookup_result *lookup)
+{
+	int ret;
+	struct ocfs2_super *osb = OCFS2_SB(orphan_dir_inode->i_sb);
+
+	ret = ocfs2_blkno_stringify(blkno, name);
+	if (ret < 0) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	ret = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode,
+					   orphan_dir_bh, name,
+					   OCFS2_ORPHAN_NAMELEN, lookup);
+	if (ret < 0) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	return 0;
+}
+
+/**
+ * ocfs2_prepare_orphan_dir() - Prepare an orphan directory for
+ * insertion of an orphan.
+ * @osb: ocfs2 file system
+ * @ret_orphan_dir: Orphan dir inode - returned locked!
+ * @blkno: Actual block number of the inode to be inserted into orphan dir.
+ * @lookup: dir lookup result, to be passed back into functions like
+ *          ocfs2_orphan_add
+ *
+ * Returns zero on success and the ret_orphan_dir, name and lookup
+ * fields will be populated.
+ *
+ * Returns non-zero on failure. 
+ */
+static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
+				    struct inode **ret_orphan_dir,
+				    u64 blkno,
+				    char *name,
+				    struct ocfs2_dir_lookup_result *lookup)
+{
+	struct inode *orphan_dir_inode = NULL;
+	struct buffer_head *orphan_dir_bh = NULL;
+	int ret = 0;
+
+	ret = ocfs2_lookup_lock_orphan_dir(osb, &orphan_dir_inode,
+					   &orphan_dir_bh);
+	if (ret < 0) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	ret = __ocfs2_prepare_orphan_dir(orphan_dir_inode, orphan_dir_bh,
+					 blkno, name, lookup);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
 	}
 
 	*ret_orphan_dir = orphan_dir_inode;
 
-leave:
-	if (status) {
+out:
+	brelse(orphan_dir_bh);
+
+	if (ret) {
+		ocfs2_inode_unlock(orphan_dir_inode, 1);
 		mutex_unlock(&orphan_dir_inode->i_mutex);
 		iput(orphan_dir_inode);
 	}
 
-	brelse(orphan_dir_bh);
-
-	mlog_exit(status);
-	return status;
+	mlog_exit(ret);
+	return ret;
 }
 
 static int ocfs2_orphan_add(struct ocfs2_super *osb,
@@ -2053,6 +2129,99 @@ leave:
 	return status;
 }
 
+/**
+ * ocfs2_prep_new_orphaned_file() - Prepare the orphan dir to recieve a newly
+ * allocated file. This is different from the typical 'add to orphan dir'
+ * operation in that the inode does not yet exist. This is a problem because
+ * the orphan dir stringifies the inode block number to come up with it's
+ * dirent. Obviously if the inode does not yet exist we have a chicken and egg
+ * problem. This function works around it by calling deeper into the orphan
+ * and suballoc code than other callers. Use this only by necessity.
+ * @dir: The directory which this inode will ultimately wind up under - not the
+ * orphan dir!
+ * @dir_bh: buffer_head the @dir inode block
+ * @orphan_name: string of length (CFS2_ORPHAN_NAMELEN + 1). Will be filled
+ * with the string to be used for orphan dirent. Pass back to the orphan dir
+ * code.
+ * @ret_orphan_dir: orphan dir inode returned to be passed back into orphan
+ * dir code.
+ * @ret_di_blkno: block number where the new inode will be allocated.
+ * @orphan_insert: Dir insert context to be passed back into orphan dir code.
+ * @ret_inode_ac: Inode alloc context to be passed back to the allocator.
+ *
+ * Returns zero on success and the ret_orphan_dir, name and lookup
+ * fields will be populated.
+ *
+ * Returns non-zero on failure. 
+ */
+static int ocfs2_prep_new_orphaned_file(struct inode *dir,
+					struct buffer_head *dir_bh,
+					char *orphan_name,
+					struct inode **ret_orphan_dir,
+					u64 *ret_di_blkno,
+					struct ocfs2_dir_lookup_result *orphan_insert,
+					struct ocfs2_alloc_context **ret_inode_ac)
+{
+	int ret;
+	u64 di_blkno;
+	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+	struct inode *orphan_dir = NULL;
+	struct buffer_head *orphan_dir_bh = NULL;
+	struct ocfs2_alloc_context *inode_ac = NULL;
+
+	ret = ocfs2_lookup_lock_orphan_dir(osb, &orphan_dir, &orphan_dir_bh);
+	if (ret < 0) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	/* reserve an inode spot */
+	ret = ocfs2_reserve_new_inode(osb, &inode_ac);
+	if (ret < 0) {
+		if (ret != -ENOSPC)
+			mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_find_new_inode_loc(dir, dir_bh, inode_ac,
+				       &di_blkno);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = __ocfs2_prepare_orphan_dir(orphan_dir, orphan_dir_bh,
+					 di_blkno, orphan_name, orphan_insert);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+out:
+	if (ret == 0) {
+		*ret_orphan_dir = orphan_dir;
+		*ret_di_blkno = di_blkno;
+		*ret_inode_ac = inode_ac;
+		/*
+		 * orphan_name and orphan_insert are already up to
+		 * date via prepare_orphan_dir
+		 */
+	} else {
+		/* Unroll reserve_new_inode* */
+		if (inode_ac)
+			ocfs2_free_alloc_context(inode_ac);
+
+		/* Unroll orphan dir locking */
+		mutex_unlock(&orphan_dir->i_mutex);
+		ocfs2_inode_unlock(orphan_dir, 1);
+		iput(orphan_dir);
+	}
+
+	brelse(orphan_dir_bh);
+
+	return 0;
+}
+
 int ocfs2_create_inode_in_orphan(struct inode *dir,
 				 int mode,
 				 struct inode **new_inode)
@@ -2068,6 +2237,8 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
 	struct buffer_head *new_di_bh = NULL;
 	struct ocfs2_alloc_context *inode_ac = NULL;
 	struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
+	u64 uninitialized_var(di_blkno), suballoc_loc;
+	u16 suballoc_bit;
 
 	status = ocfs2_inode_lock(dir, &parent_di_bh, 1);
 	if (status < 0) {
@@ -2076,20 +2247,9 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
 		return status;
 	}
 
-	/*
-	 * We give the orphan dir the root blkno to fake an orphan name,
-	 * and allocate enough space for our insertion.
-	 */
-	status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
-					  osb->root_blkno,
-					  orphan_name, &orphan_insert);
-	if (status < 0) {
-		mlog_errno(status);
-		goto leave;
-	}
-
-	/* reserve an inode spot */
-	status = ocfs2_reserve_new_inode(osb, &inode_ac);
+	status = ocfs2_prep_new_orphaned_file(dir, parent_di_bh,
+					      orphan_name, &orphan_dir,
+					      &di_blkno, &orphan_insert, &inode_ac);
 	if (status < 0) {
 		if (status != -ENOSPC)
 			mlog_errno(status);
@@ -2116,17 +2276,20 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
 		goto leave;
 	did_quota_inode = 1;
 
-	inode->i_nlink = 0;
-	/* do the real work now. */
-	status = ocfs2_mknod_locked(osb, dir, inode,
-				    0, &new_di_bh, parent_di_bh, handle,
-				    inode_ac);
+	status = ocfs2_claim_new_inode_at_loc(handle, dir, inode_ac,
+					      &suballoc_loc,
+					      &suballoc_bit, di_blkno);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
 	}
 
-	status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, orphan_name);
+	inode->i_nlink = 0;
+	/* do the real work now. */
+	status = __ocfs2_mknod_locked(dir, inode,
+				      0, &new_di_bh, parent_di_bh, handle,
+				      inode_ac, di_blkno, suballoc_loc,
+				      suballoc_bit);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index c67003b6b5a..d8408217e3b 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -150,26 +150,33 @@ typedef void (*ocfs2_lock_callback)(int status, unsigned long data);
 struct ocfs2_lock_res {
 	void                    *l_priv;
 	struct ocfs2_lock_res_ops *l_ops;
-	spinlock_t               l_lock;
+
 
 	struct list_head         l_blocked_list;
 	struct list_head         l_mask_waiters;
 
-	enum ocfs2_lock_type     l_type;
 	unsigned long		 l_flags;
 	char                     l_name[OCFS2_LOCK_ID_MAX_LEN];
-	int                      l_level;
 	unsigned int             l_ro_holders;
 	unsigned int             l_ex_holders;
-	struct ocfs2_dlm_lksb    l_lksb;
+	unsigned char            l_level;
+
+	/* Data packed - type enum ocfs2_lock_type */
+	unsigned char            l_type;
 
 	/* used from AST/BAST funcs. */
-	enum ocfs2_ast_action    l_action;
-	enum ocfs2_unlock_action l_unlock_action;
-	int                      l_requested;
-	int                      l_blocking;
+	/* Data packed - enum type ocfs2_ast_action */
+	unsigned char            l_action;
+	/* Data packed - enum type ocfs2_unlock_action */
+	unsigned char            l_unlock_action;
+	unsigned char            l_requested;
+	unsigned char            l_blocking;
 	unsigned int             l_pending_gen;
 
+	spinlock_t               l_lock;
+
+	struct ocfs2_dlm_lksb    l_lksb;
+
 	wait_queue_head_t        l_event;
 
 	struct list_head         l_debug_list;
@@ -243,7 +250,7 @@ enum ocfs2_local_alloc_state
 
 enum ocfs2_mount_options
 {
-	OCFS2_MOUNT_HB_LOCAL   = 1 << 0, /* Heartbeat started in local mode */
+	OCFS2_MOUNT_HB_LOCAL = 1 << 0, /* Local heartbeat */
 	OCFS2_MOUNT_BARRIER = 1 << 1,	/* Use block barriers */
 	OCFS2_MOUNT_NOINTR  = 1 << 2,   /* Don't catch signals */
 	OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */
@@ -256,6 +263,10 @@ enum ocfs2_mount_options
 						   control lists */
 	OCFS2_MOUNT_USRQUOTA = 1 << 10, /* We support user quotas */
 	OCFS2_MOUNT_GRPQUOTA = 1 << 11, /* We support group quotas */
+	OCFS2_MOUNT_COHERENCY_BUFFERED = 1 << 12, /* Allow concurrent O_DIRECT
+						     writes */
+	OCFS2_MOUNT_HB_NONE = 1 << 13, /* No heartbeat */
+	OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */
 };
 
 #define OCFS2_OSB_SOFT_RO			0x0001
@@ -277,7 +288,8 @@ struct ocfs2_super
 	struct super_block *sb;
 	struct inode *root_inode;
 	struct inode *sys_root_inode;
-	struct inode *system_inodes[NUM_SYSTEM_INODES];
+	struct inode *global_system_inodes[NUM_GLOBAL_SYSTEM_INODES];
+	struct inode **local_system_inodes;
 
 	struct ocfs2_slot_info *slot_info;
 
@@ -368,6 +380,8 @@ struct ocfs2_super
 	struct ocfs2_alloc_stats alloc_stats;
 	char dev_str[20];		/* "major,minor" of the device */
 
+	u8 osb_stackflags;
+
 	char osb_cluster_stack[OCFS2_STACK_LABEL_LEN + 1];
 	struct ocfs2_cluster_connection *cconn;
 	struct ocfs2_lock_res osb_super_lockres;
@@ -601,10 +615,35 @@ static inline int ocfs2_is_soft_readonly(struct ocfs2_super *osb)
 	return ret;
 }
 
-static inline int ocfs2_userspace_stack(struct ocfs2_super *osb)
+static inline int ocfs2_clusterinfo_valid(struct ocfs2_super *osb)
 {
 	return (osb->s_feature_incompat &
-		OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK);
+		(OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK |
+		 OCFS2_FEATURE_INCOMPAT_CLUSTERINFO));
+}
+
+static inline int ocfs2_userspace_stack(struct ocfs2_super *osb)
+{
+	if (ocfs2_clusterinfo_valid(osb) &&
+	    memcmp(osb->osb_cluster_stack, OCFS2_CLASSIC_CLUSTER_STACK,
+		   OCFS2_STACK_LABEL_LEN))
+		return 1;
+	return 0;
+}
+
+static inline int ocfs2_o2cb_stack(struct ocfs2_super *osb)
+{
+	if (ocfs2_clusterinfo_valid(osb) &&
+	    !memcmp(osb->osb_cluster_stack, OCFS2_CLASSIC_CLUSTER_STACK,
+		   OCFS2_STACK_LABEL_LEN))
+		return 1;
+	return 0;
+}
+
+static inline int ocfs2_cluster_o2cb_global_heartbeat(struct ocfs2_super *osb)
+{
+	return ocfs2_o2cb_stack(osb) &&
+		(osb->osb_stackflags & OCFS2_CLUSTER_O2CB_GLOBAL_HEARTBEAT);
 }
 
 static inline int ocfs2_mount_local(struct ocfs2_super *osb)
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 33f1c9a8258..c2e4f8222e2 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -101,7 +101,8 @@
 					 | OCFS2_FEATURE_INCOMPAT_META_ECC \
 					 | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \
 					 | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE \
-					 | OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG)
+					 | OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG	\
+					 | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO)
 #define OCFS2_FEATURE_RO_COMPAT_SUPP	(OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
 					 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
 					 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
@@ -170,6 +171,13 @@
 #define OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG	0x2000
 
 /*
+ * Incompat bit to indicate useable clusterinfo with stackflags for all
+ * cluster stacks (userspace adnd o2cb). If this bit is set,
+ * INCOMPAT_USERSPACE_STACK becomes superfluous and thus should not be set.
+ */
+#define OCFS2_FEATURE_INCOMPAT_CLUSTERINFO	0x4000
+
+/*
  * backup superblock flag is used to indicate that this volume
  * has backup superblocks.
  */
@@ -235,18 +243,31 @@
 #define OCFS2_HAS_REFCOUNT_FL   (0x0010)
 
 /* Inode attributes, keep in sync with EXT2 */
-#define OCFS2_SECRM_FL		(0x00000001)	/* Secure deletion */
-#define OCFS2_UNRM_FL		(0x00000002)	/* Undelete */
-#define OCFS2_COMPR_FL		(0x00000004)	/* Compress file */
-#define OCFS2_SYNC_FL		(0x00000008)	/* Synchronous updates */
-#define OCFS2_IMMUTABLE_FL	(0x00000010)	/* Immutable file */
-#define OCFS2_APPEND_FL		(0x00000020)	/* writes to file may only append */
-#define OCFS2_NODUMP_FL		(0x00000040)	/* do not dump file */
-#define OCFS2_NOATIME_FL	(0x00000080)	/* do not update atime */
-#define OCFS2_DIRSYNC_FL	(0x00010000)	/* dirsync behaviour (directories only) */
-
-#define OCFS2_FL_VISIBLE	(0x000100FF)	/* User visible flags */
-#define OCFS2_FL_MODIFIABLE	(0x000100FF)	/* User modifiable flags */
+#define OCFS2_SECRM_FL			FS_SECRM_FL	/* Secure deletion */
+#define OCFS2_UNRM_FL			FS_UNRM_FL	/* Undelete */
+#define OCFS2_COMPR_FL			FS_COMPR_FL	/* Compress file */
+#define OCFS2_SYNC_FL			FS_SYNC_FL	/* Synchronous updates */
+#define OCFS2_IMMUTABLE_FL		FS_IMMUTABLE_FL	/* Immutable file */
+#define OCFS2_APPEND_FL			FS_APPEND_FL	/* writes to file may only append */
+#define OCFS2_NODUMP_FL			FS_NODUMP_FL	/* do not dump file */
+#define OCFS2_NOATIME_FL		FS_NOATIME_FL	/* do not update atime */
+/* Reserved for compression usage... */
+#define OCFS2_DIRTY_FL			FS_DIRTY_FL
+#define OCFS2_COMPRBLK_FL		FS_COMPRBLK_FL	/* One or more compressed clusters */
+#define OCFS2_NOCOMP_FL			FS_NOCOMP_FL	/* Don't compress */
+#define OCFS2_ECOMPR_FL			FS_ECOMPR_FL	/* Compression error */
+/* End compression flags --- maybe not all used */
+#define OCFS2_BTREE_FL			FS_BTREE_FL	/* btree format dir */
+#define OCFS2_INDEX_FL			FS_INDEX_FL	/* hash-indexed directory */
+#define OCFS2_IMAGIC_FL			FS_IMAGIC_FL	/* AFS directory */
+#define OCFS2_JOURNAL_DATA_FL		FS_JOURNAL_DATA_FL /* Reserved for ext3 */
+#define OCFS2_NOTAIL_FL			FS_NOTAIL_FL	/* file tail should not be merged */
+#define OCFS2_DIRSYNC_FL		FS_DIRSYNC_FL	/* dirsync behaviour (directories only) */
+#define OCFS2_TOPDIR_FL			FS_TOPDIR_FL	/* Top of directory hierarchies*/
+#define OCFS2_RESERVED_FL		FS_RESERVED_FL	/* reserved for ext2 lib */
+
+#define OCFS2_FL_VISIBLE		FS_FL_USER_VISIBLE	/* User visible flags */
+#define OCFS2_FL_MODIFIABLE		FS_FL_USER_MODIFIABLE	/* User modifiable flags */
 
 /*
  * Extent record flags (e_node.leaf.flags)
@@ -279,10 +300,13 @@
 #define OCFS2_VOL_UUID_LEN		16
 #define OCFS2_MAX_VOL_LABEL_LEN		64
 
-/* The alternate, userspace stack fields */
+/* The cluster stack fields */
 #define OCFS2_STACK_LABEL_LEN		4
 #define OCFS2_CLUSTER_NAME_LEN		16
 
+/* Classic (historically speaking) cluster stack */
+#define OCFS2_CLASSIC_CLUSTER_STACK	"o2cb"
+
 /* Journal limits (in bytes) */
 #define OCFS2_MIN_JOURNAL_SIZE		(4 * 1024 * 1024)
 
@@ -292,6 +316,11 @@
  */
 #define OCFS2_MIN_XATTR_INLINE_SIZE     256
 
+/*
+ * Cluster info flags (ocfs2_cluster_info.ci_stackflags)
+ */
+#define OCFS2_CLUSTER_O2CB_GLOBAL_HEARTBEAT	(0x01)
+
 struct ocfs2_system_inode_info {
 	char	*si_name;
 	int	si_iflags;
@@ -309,6 +338,7 @@ enum {
 	USER_QUOTA_SYSTEM_INODE,
 	GROUP_QUOTA_SYSTEM_INODE,
 #define OCFS2_LAST_GLOBAL_SYSTEM_INODE GROUP_QUOTA_SYSTEM_INODE
+#define OCFS2_FIRST_LOCAL_SYSTEM_INODE ORPHAN_DIR_SYSTEM_INODE
 	ORPHAN_DIR_SYSTEM_INODE,
 	EXTENT_ALLOC_SYSTEM_INODE,
 	INODE_ALLOC_SYSTEM_INODE,
@@ -317,8 +347,12 @@ enum {
 	TRUNCATE_LOG_SYSTEM_INODE,
 	LOCAL_USER_QUOTA_SYSTEM_INODE,
 	LOCAL_GROUP_QUOTA_SYSTEM_INODE,
+#define OCFS2_LAST_LOCAL_SYSTEM_INODE LOCAL_GROUP_QUOTA_SYSTEM_INODE
 	NUM_SYSTEM_INODES
 };
+#define NUM_GLOBAL_SYSTEM_INODES OCFS2_LAST_GLOBAL_SYSTEM_INODE
+#define NUM_LOCAL_SYSTEM_INODES	\
+		(NUM_SYSTEM_INODES - OCFS2_FIRST_LOCAL_SYSTEM_INODE)
 
 static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
 	/* Global system inodes (single copy) */
@@ -347,6 +381,7 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
 /* Parameter passed from mount.ocfs2 to module */
 #define OCFS2_HB_NONE			"heartbeat=none"
 #define OCFS2_HB_LOCAL			"heartbeat=local"
+#define OCFS2_HB_GLOBAL			"heartbeat=global"
 
 /*
  * OCFS2 directory file types.  Only the low 3 bits are used.  The
@@ -553,9 +588,21 @@ struct ocfs2_slot_map_extended {
  */
 };
 
+/*
+ * ci_stackflags is only valid if the incompat bit
+ * OCFS2_FEATURE_INCOMPAT_CLUSTERINFO is set.
+ */
 struct ocfs2_cluster_info {
 /*00*/	__u8   ci_stack[OCFS2_STACK_LABEL_LEN];
-	__le32 ci_reserved;
+	union {
+		__le32 ci_reserved;
+		struct {
+			__u8 ci_stackflags;
+			__u8 ci_reserved1;
+			__u8 ci_reserved2;
+			__u8 ci_reserved3;
+		};
+	};
 /*08*/	__u8   ci_cluster[OCFS2_CLUSTER_NAME_LEN];
 /*18*/
 };
@@ -592,9 +639,9 @@ struct ocfs2_super_block {
 					 * group header */
 /*50*/	__u8  s_label[OCFS2_MAX_VOL_LABEL_LEN];	/* Label for mounting, etc. */
 /*90*/	__u8  s_uuid[OCFS2_VOL_UUID_LEN];	/* 128-bit uuid */
-/*A0*/  struct ocfs2_cluster_info s_cluster_info; /* Selected userspace
-						     stack.  Only valid
-						     with INCOMPAT flag. */
+/*A0*/  struct ocfs2_cluster_info s_cluster_info; /* Only valid if either
+						     userspace or clusterinfo
+						     INCOMPAT flag set. */
 /*B8*/	__le16 s_xattr_inline_size;	/* extended attribute inline size
 					   for this fs*/
 	__le16 s_reserved0;
diff --git a/fs/ocfs2/ocfs2_ioctl.h b/fs/ocfs2/ocfs2_ioctl.h
index 2d3420af1a8..b46f39bf743 100644
--- a/fs/ocfs2/ocfs2_ioctl.h
+++ b/fs/ocfs2/ocfs2_ioctl.h
@@ -23,10 +23,10 @@
 /*
  * ioctl commands
  */
-#define OCFS2_IOC_GETFLAGS	_IOR('f', 1, long)
-#define OCFS2_IOC_SETFLAGS	_IOW('f', 2, long)
-#define OCFS2_IOC32_GETFLAGS	_IOR('f', 1, int)
-#define OCFS2_IOC32_SETFLAGS	_IOW('f', 2, int)
+#define OCFS2_IOC_GETFLAGS	FS_IOC_GETFLAGS
+#define OCFS2_IOC_SETFLAGS	FS_IOC_SETFLAGS
+#define OCFS2_IOC32_GETFLAGS	FS_IOC32_GETFLAGS
+#define OCFS2_IOC32_SETFLAGS	FS_IOC32_SETFLAGS
 
 /*
  * Space reservation / allocation / free ioctls and argument structure
@@ -76,4 +76,99 @@ struct reflink_arguments {
 };
 #define OCFS2_IOC_REFLINK	_IOW('o', 4, struct reflink_arguments)
 
+/* Following definitions dedicated for ocfs2_info_request ioctls. */
+#define OCFS2_INFO_MAX_REQUEST		(50)
+#define OCFS2_TEXT_UUID_LEN		(OCFS2_VOL_UUID_LEN * 2)
+
+/* Magic number of all requests */
+#define OCFS2_INFO_MAGIC		(0x4F32494E)
+
+/*
+ * Always try to separate info request into small pieces to
+ * guarantee the backward&forward compatibility.
+ */
+struct ocfs2_info {
+	__u64 oi_requests;	/* Array of __u64 pointers to requests */
+	__u32 oi_count;		/* Number of requests in info_requests */
+	__u32 oi_pad;
+};
+
+struct ocfs2_info_request {
+/*00*/	__u32 ir_magic;	/* Magic number */
+	__u32 ir_code;	/* Info request code */
+	__u32 ir_size;	/* Size of request */
+	__u32 ir_flags;	/* Request flags */
+/*10*/			/* Request specific fields */
+};
+
+struct ocfs2_info_clustersize {
+	struct ocfs2_info_request ic_req;
+	__u32 ic_clustersize;
+	__u32 ic_pad;
+};
+
+struct ocfs2_info_blocksize {
+	struct ocfs2_info_request ib_req;
+	__u32 ib_blocksize;
+	__u32 ib_pad;
+};
+
+struct ocfs2_info_maxslots {
+	struct ocfs2_info_request im_req;
+	__u32 im_max_slots;
+	__u32 im_pad;
+};
+
+struct ocfs2_info_label {
+	struct ocfs2_info_request il_req;
+	__u8	il_label[OCFS2_MAX_VOL_LABEL_LEN];
+} __attribute__ ((packed));
+
+struct ocfs2_info_uuid {
+	struct ocfs2_info_request iu_req;
+	__u8	iu_uuid_str[OCFS2_TEXT_UUID_LEN + 1];
+} __attribute__ ((packed));
+
+struct ocfs2_info_fs_features {
+	struct ocfs2_info_request if_req;
+	__u32 if_compat_features;
+	__u32 if_incompat_features;
+	__u32 if_ro_compat_features;
+	__u32 if_pad;
+};
+
+struct ocfs2_info_journal_size {
+	struct ocfs2_info_request ij_req;
+	__u64 ij_journal_size;
+};
+
+/* Codes for ocfs2_info_request */
+enum ocfs2_info_type {
+	OCFS2_INFO_CLUSTERSIZE = 1,
+	OCFS2_INFO_BLOCKSIZE,
+	OCFS2_INFO_MAXSLOTS,
+	OCFS2_INFO_LABEL,
+	OCFS2_INFO_UUID,
+	OCFS2_INFO_FS_FEATURES,
+	OCFS2_INFO_JOURNAL_SIZE,
+	OCFS2_INFO_NUM_TYPES
+};
+
+/* Flags for struct ocfs2_info_request */
+/* Filled by the caller */
+#define OCFS2_INFO_FL_NON_COHERENT	(0x00000001)	/* Cluster coherency not
+							   required. This is a hint.
+							   It is up to ocfs2 whether
+							   the request can be fulfilled
+							   without locking. */
+/* Filled by ocfs2 */
+#define OCFS2_INFO_FL_FILLED		(0x40000000)	/* Filesystem understood
+							   this request and
+							   filled in the answer */
+
+#define OCFS2_INFO_FL_ERROR		(0x80000000)	/* Error happened during
+							   request handling. */
+
+#define OCFS2_IOC_INFO		_IOR('o', 5, struct ocfs2_info)
+
 #endif /* OCFS2_IOCTL_H */
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 3ac5aa733e9..b5f9160e93e 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -49,6 +49,7 @@
 
 struct ocfs2_cow_context {
 	struct inode *inode;
+	struct file *file;
 	u32 cow_start;
 	u32 cow_len;
 	struct ocfs2_extent_tree data_et;
@@ -2436,16 +2437,26 @@ static int ocfs2_calc_refcount_meta_credits(struct super_block *sb,
 		len = min((u64)cpos + clusters, le64_to_cpu(rec.r_cpos) +
 			  le32_to_cpu(rec.r_clusters)) - cpos;
 		/*
-		 * If the refcount rec already exist, cool. We just need
-		 * to check whether there is a split. Otherwise we just need
-		 * to increase the refcount.
-		 * If we will insert one, increases recs_add.
-		 *
 		 * We record all the records which will be inserted to the
 		 * same refcount block, so that we can tell exactly whether
 		 * we need a new refcount block or not.
+		 *
+		 * If we will insert a new one, this is easy and only happens
+		 * during adding refcounted flag to the extent, so we don't
+		 * have a chance of spliting. We just need one record.
+		 *
+		 * If the refcount rec already exists, that would be a little
+		 * complicated. we may have to:
+		 * 1) split at the beginning if the start pos isn't aligned.
+		 *    we need 1 more record in this case.
+		 * 2) split int the end if the end pos isn't aligned.
+		 *    we need 1 more record in this case.
+		 * 3) split in the middle because of file system fragmentation.
+		 *    we need 2 more records in this case(we can't detect this
+		 *    beforehand, so always think of the worst case).
 		 */
 		if (rec.r_refcount) {
+			recs_add += 2;
 			/* Check whether we need a split at the beginning. */
 			if (cpos == start_cpos &&
 			    cpos != le64_to_cpu(rec.r_cpos))
@@ -2922,13 +2933,16 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
 	u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
 	struct page *page;
 	pgoff_t page_index;
-	unsigned int from, to;
+	unsigned int from, to, readahead_pages;
 	loff_t offset, end, map_end;
 	struct address_space *mapping = context->inode->i_mapping;
 
 	mlog(0, "old_cluster %u, new %u, len %u at offset %u\n", old_cluster,
 	     new_cluster, new_len, cpos);
 
+	readahead_pages =
+		(ocfs2_cow_contig_clusters(sb) <<
+		 OCFS2_SB(sb)->s_clustersize_bits) >> PAGE_CACHE_SHIFT;
 	offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
 	end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits);
 	/*
@@ -2950,7 +2964,7 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
 		if (map_end & (PAGE_CACHE_SIZE - 1))
 			to = map_end & (PAGE_CACHE_SIZE - 1);
 
-		page = grab_cache_page(mapping, page_index);
+		page = find_or_create_page(mapping, page_index, GFP_NOFS);
 
 		/*
 		 * In case PAGE_CACHE_SIZE <= CLUSTER_SIZE, This page
@@ -2959,6 +2973,14 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
 		if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize)
 			BUG_ON(PageDirty(page));
 
+		if (PageReadahead(page) && context->file) {
+			page_cache_async_readahead(mapping,
+						   &context->file->f_ra,
+						   context->file,
+						   page, page_index,
+						   readahead_pages);
+		}
+
 		if (!PageUptodate(page)) {
 			ret = block_read_full_page(page, ocfs2_get_block);
 			if (ret) {
@@ -3169,7 +3191,8 @@ static int ocfs2_cow_sync_writeback(struct super_block *sb,
 		if (map_end > end)
 			map_end = end;
 
-		page = grab_cache_page(context->inode->i_mapping, page_index);
+		page = find_or_create_page(context->inode->i_mapping,
+					   page_index, GFP_NOFS);
 		BUG_ON(!page);
 
 		wait_on_page_writeback(page);
@@ -3398,12 +3421,35 @@ static int ocfs2_replace_cow(struct ocfs2_cow_context *context)
 	return ret;
 }
 
+static void ocfs2_readahead_for_cow(struct inode *inode,
+				    struct file *file,
+				    u32 start, u32 len)
+{
+	struct address_space *mapping;
+	pgoff_t index;
+	unsigned long num_pages;
+	int cs_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits;
+
+	if (!file)
+		return;
+
+	mapping = file->f_mapping;
+	num_pages = (len << cs_bits) >> PAGE_CACHE_SHIFT;
+	if (!num_pages)
+		num_pages = 1;
+
+	index = ((loff_t)start << cs_bits) >> PAGE_CACHE_SHIFT;
+	page_cache_sync_readahead(mapping, &file->f_ra, file,
+				  index, num_pages);
+}
+
 /*
  * Starting at cpos, try to CoW write_len clusters.  Don't CoW
  * past max_cpos.  This will stop when it runs into a hole or an
  * unrefcounted extent.
  */
 static int ocfs2_refcount_cow_hunk(struct inode *inode,
+				   struct file *file,
 				   struct buffer_head *di_bh,
 				   u32 cpos, u32 write_len, u32 max_cpos)
 {
@@ -3432,6 +3478,8 @@ static int ocfs2_refcount_cow_hunk(struct inode *inode,
 
 	BUG_ON(cow_len == 0);
 
+	ocfs2_readahead_for_cow(inode, file, cow_start, cow_len);
+
 	context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS);
 	if (!context) {
 		ret = -ENOMEM;
@@ -3453,6 +3501,7 @@ static int ocfs2_refcount_cow_hunk(struct inode *inode,
 	context->ref_root_bh = ref_root_bh;
 	context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_page;
 	context->get_clusters = ocfs2_di_get_clusters;
+	context->file = file;
 
 	ocfs2_init_dinode_extent_tree(&context->data_et,
 				      INODE_CACHE(inode), di_bh);
@@ -3481,6 +3530,7 @@ out:
  * clusters between cpos and cpos+write_len are safe to modify.
  */
 int ocfs2_refcount_cow(struct inode *inode,
+		       struct file *file,
 		       struct buffer_head *di_bh,
 		       u32 cpos, u32 write_len, u32 max_cpos)
 {
@@ -3500,7 +3550,7 @@ int ocfs2_refcount_cow(struct inode *inode,
 			num_clusters = write_len;
 
 		if (ext_flags & OCFS2_EXT_REFCOUNTED) {
-			ret = ocfs2_refcount_cow_hunk(inode, di_bh, cpos,
+			ret = ocfs2_refcount_cow_hunk(inode, file, di_bh, cpos,
 						      num_clusters, max_cpos);
 			if (ret) {
 				mlog_errno(ret);
@@ -4190,8 +4240,9 @@ static int __ocfs2_reflink(struct dentry *old_dentry,
 		goto out;
 	}
 
-	mutex_lock(&new_inode->i_mutex);
-	ret = ocfs2_inode_lock(new_inode, &new_bh, 1);
+	mutex_lock_nested(&new_inode->i_mutex, I_MUTEX_CHILD);
+	ret = ocfs2_inode_lock_nested(new_inode, &new_bh, 1,
+				      OI_LS_REFLINK_TARGET);
 	if (ret) {
 		mlog_errno(ret);
 		goto out_unlock;
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index 9983ba1570e..c8ce46f7d8e 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -21,14 +21,14 @@ struct ocfs2_refcount_tree {
 	struct rb_node rf_node;
 	u64 rf_blkno;
 	u32 rf_generation;
+	struct kref rf_getcnt;
 	struct rw_semaphore rf_sem;
 	struct ocfs2_lock_res rf_lockres;
-	struct kref rf_getcnt;
 	int rf_removed;
 
 	/* the following 4 fields are used by caching_info. */
-	struct ocfs2_caching_info rf_ci;
 	spinlock_t rf_lock;
+	struct ocfs2_caching_info rf_ci;
 	struct mutex rf_io_mutex;
 	struct super_block *rf_sb;
 };
@@ -52,7 +52,8 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
 					  u32 clusters,
 					  int *credits,
 					  int *ref_blocks);
-int ocfs2_refcount_cow(struct inode *inode, struct buffer_head *di_bh,
+int ocfs2_refcount_cow(struct inode *inode,
+		       struct file *filep, struct buffer_head *di_bh,
 		       u32 cpos, u32 write_len, u32 max_cpos);
 
 typedef int (ocfs2_post_refcount_func)(struct inode *inode,
diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c
index d8b6e4259b8..3e78db361bc 100644
--- a/fs/ocfs2/reservations.c
+++ b/fs/ocfs2/reservations.c
@@ -732,25 +732,23 @@ int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap,
 			   struct ocfs2_alloc_reservation *resv,
 			   int *cstart, int *clen)
 {
-	unsigned int wanted = *clen;
-
 	if (resv == NULL || ocfs2_resmap_disabled(resmap))
 		return -ENOSPC;
 
 	spin_lock(&resv_lock);
 
-	/*
-	 * We don't want to over-allocate for temporary
-	 * windows. Otherwise, we run the risk of fragmenting the
-	 * allocation space.
-	 */
-	wanted = ocfs2_resv_window_bits(resmap, resv);
-	if ((resv->r_flags & OCFS2_RESV_FLAG_TMP) || wanted < *clen)
-		wanted = *clen;
-
 	if (ocfs2_resv_empty(resv)) {
-		mlog(0, "empty reservation, find new window\n");
+		/*
+		 * We don't want to over-allocate for temporary
+		 * windows. Otherwise, we run the risk of fragmenting the
+		 * allocation space.
+		 */
+		unsigned int wanted = ocfs2_resv_window_bits(resmap, resv);
 
+		if ((resv->r_flags & OCFS2_RESV_FLAG_TMP) || wanted < *clen)
+			wanted = *clen;
+
+		mlog(0, "empty reservation, find new window\n");
 		/*
 		 * Try to get a window here. If it works, we must fall
 		 * through and test the bitmap . This avoids some
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index bfbd7e9e949..ab4e0172cc1 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -357,7 +357,7 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
 {
 	int status = 0;
 	u64 blkno;
-	unsigned long long blocks, bytes;
+	unsigned long long blocks, bytes = 0;
 	unsigned int i;
 	struct buffer_head *bh;
 
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index 0d3049f696c..19965b00c43 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -283,6 +283,8 @@ static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn)
 	/* for now we only have one cluster/node, make sure we see it
 	 * in the heartbeat universe */
 	if (!o2hb_check_local_node_heartbeating()) {
+		if (o2hb_global_heartbeat_active())
+			mlog(ML_ERROR, "Global heartbeat not started\n");
 		rc = -EINVAL;
 		goto out;
 	}
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 2dc57bca068..252e7c82f92 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -22,7 +22,6 @@
 #include <linux/miscdevice.h>
 #include <linux/mutex.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
 #include <linux/reboot.h>
 #include <asm/uaccess.h>
 
@@ -612,12 +611,10 @@ static int ocfs2_control_open(struct inode *inode, struct file *file)
 		return -ENOMEM;
 	p->op_this_node = -1;
 
-	lock_kernel();
 	mutex_lock(&ocfs2_control_lock);
 	file->private_data = p;
 	list_add(&p->op_list, &ocfs2_control_private_list);
 	mutex_unlock(&ocfs2_control_lock);
-	unlock_kernel();
 
 	return 0;
 }
@@ -628,6 +625,7 @@ static const struct file_operations ocfs2_control_fops = {
 	.read    = ocfs2_control_read,
 	.write   = ocfs2_control_write,
 	.owner   = THIS_MODULE,
+	.llseek  = default_llseek,
 };
 
 static struct miscdevice ocfs2_control_device = {
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index a8e6a95a353..5fed60de763 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -57,11 +57,28 @@ struct ocfs2_suballoc_result {
 	u64		sr_bg_blkno;	/* The bg we allocated from.  Set
 					   to 0 when a block group is
 					   contiguous. */
+	u64		sr_bg_stable_blkno; /*
+					     * Doesn't change, always
+					     * set to target block
+					     * group descriptor
+					     * block.
+					     */
 	u64		sr_blkno;	/* The first allocated block */
 	unsigned int	sr_bit_offset;	/* The bit in the bg */
 	unsigned int	sr_bits;	/* How many bits we claimed */
 };
 
+static u64 ocfs2_group_from_res(struct ocfs2_suballoc_result *res)
+{
+	if (res->sr_blkno == 0)
+		return 0;
+
+	if (res->sr_bg_blkno)
+		return res->sr_bg_blkno;
+
+	return ocfs2_which_suballoc_group(res->sr_blkno, res->sr_bit_offset);
+}
+
 static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
 static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
 static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
@@ -138,6 +155,10 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
 	brelse(ac->ac_bh);
 	ac->ac_bh = NULL;
 	ac->ac_resv = NULL;
+	if (ac->ac_find_loc_priv) {
+		kfree(ac->ac_find_loc_priv);
+		ac->ac_find_loc_priv = NULL;
+	}
 }
 
 void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
@@ -336,7 +357,7 @@ out:
 static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb,
 					  struct ocfs2_group_desc *bg,
 					  struct ocfs2_chain_list *cl,
-					  u64 p_blkno, u32 clusters)
+					  u64 p_blkno, unsigned int clusters)
 {
 	struct ocfs2_extent_list *el = &bg->bg_list;
 	struct ocfs2_extent_rec *rec;
@@ -348,7 +369,7 @@ static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb,
 	rec->e_blkno = cpu_to_le64(p_blkno);
 	rec->e_cpos = cpu_to_le32(le16_to_cpu(bg->bg_bits) /
 				  le16_to_cpu(cl->cl_bpc));
-	rec->e_leaf_clusters = cpu_to_le32(clusters);
+	rec->e_leaf_clusters = cpu_to_le16(clusters);
 	le16_add_cpu(&bg->bg_bits, clusters * le16_to_cpu(cl->cl_bpc));
 	le16_add_cpu(&bg->bg_free_bits_count,
 		     clusters * le16_to_cpu(cl->cl_bpc));
@@ -1359,6 +1380,14 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
 	}
 
 	le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
+	if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
+		ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
+			    " count %u but claims %u are freed. num_bits %d",
+			    (unsigned long long)le64_to_cpu(bg->bg_blkno),
+			    le16_to_cpu(bg->bg_bits),
+			    le16_to_cpu(bg->bg_free_bits_count), num_bits);
+		return -EROFS;
+	}
 	while(num_bits--)
 		ocfs2_set_bit(bit_off++, bitmap);
 
@@ -1678,6 +1707,15 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
 	if (!ret)
 		ocfs2_bg_discontig_fix_result(ac, gd, res);
 
+	/*
+	 * sr_bg_blkno might have been changed by
+	 * ocfs2_bg_discontig_fix_result
+	 */
+	res->sr_bg_stable_blkno = group_bh->b_blocknr;
+
+	if (ac->ac_find_loc_only)
+		goto out_loc_only;
+
 	ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh,
 					       res->sr_bits,
 					       le16_to_cpu(gd->bg_chain));
@@ -1691,6 +1729,7 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
 	if (ret < 0)
 		mlog_errno(ret);
 
+out_loc_only:
 	*bits_left = le16_to_cpu(gd->bg_free_bits_count);
 
 out:
@@ -1708,7 +1747,6 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
 {
 	int status;
 	u16 chain;
-	u32 tmp_used;
 	u64 next_group;
 	struct inode *alloc_inode = ac->ac_inode;
 	struct buffer_head *group_bh = NULL;
@@ -1770,6 +1808,11 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
 	if (!status)
 		ocfs2_bg_discontig_fix_result(ac, bg, res);
 
+	/*
+	 * sr_bg_blkno might have been changed by
+	 * ocfs2_bg_discontig_fix_result
+	 */
+	res->sr_bg_stable_blkno = group_bh->b_blocknr;
 
 	/*
 	 * Keep track of previous block descriptor read. When
@@ -1796,22 +1839,17 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
 		}
 	}
 
-	/* Ok, claim our bits now: set the info on dinode, chainlist
-	 * and then the group */
-	status = ocfs2_journal_access_di(handle,
-					 INODE_CACHE(alloc_inode),
-					 ac->ac_bh,
-					 OCFS2_JOURNAL_ACCESS_WRITE);
-	if (status < 0) {
+	if (ac->ac_find_loc_only)
+		goto out_loc_only;
+
+	status = ocfs2_alloc_dinode_update_counts(alloc_inode, handle,
+						  ac->ac_bh, res->sr_bits,
+						  chain);
+	if (status) {
 		mlog_errno(status);
 		goto bail;
 	}
 
-	tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
-	fe->id1.bitmap1.i_used = cpu_to_le32(res->sr_bits + tmp_used);
-	le32_add_cpu(&cl->cl_recs[chain].c_free, -res->sr_bits);
-	ocfs2_journal_dirty(handle, ac->ac_bh);
-
 	status = ocfs2_block_group_set_bits(handle,
 					    alloc_inode,
 					    bg,
@@ -1826,6 +1864,7 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
 	mlog(0, "Allocated %u bits from suballocator %llu\n", res->sr_bits,
 	     (unsigned long long)le64_to_cpu(fe->i_blkno));
 
+out_loc_only:
 	*bits_left = le16_to_cpu(bg->bg_free_bits_count);
 bail:
 	brelse(group_bh);
@@ -1845,6 +1884,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
 	int status;
 	u16 victim, i;
 	u16 bits_left = 0;
+	u64 hint = ac->ac_last_group;
 	struct ocfs2_chain_list *cl;
 	struct ocfs2_dinode *fe;
 
@@ -1872,7 +1912,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
 		goto bail;
 	}
 
-	res->sr_bg_blkno = ac->ac_last_group;
+	res->sr_bg_blkno = hint;
 	if (res->sr_bg_blkno) {
 		/* Attempt to short-circuit the usual search mechanism
 		 * by jumping straight to the most recently used
@@ -1896,8 +1936,10 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
 
 	status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
 				    res, &bits_left);
-	if (!status)
+	if (!status) {
+		hint = ocfs2_group_from_res(res);
 		goto set_hint;
+	}
 	if (status < 0 && status != -ENOSPC) {
 		mlog_errno(status);
 		goto bail;
@@ -1920,8 +1962,10 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
 		ac->ac_chain = i;
 		status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
 					    res, &bits_left);
-		if (!status)
+		if (!status) {
+			hint = ocfs2_group_from_res(res);
 			break;
+		}
 		if (status < 0 && status != -ENOSPC) {
 			mlog_errno(status);
 			goto bail;
@@ -1936,7 +1980,7 @@ set_hint:
 		if (bits_left < min_bits)
 			ac->ac_last_group = 0;
 		else
-			ac->ac_last_group = res->sr_bg_blkno;
+			ac->ac_last_group = hint;
 	}
 
 bail:
@@ -2016,6 +2060,136 @@ static inline void ocfs2_save_inode_ac_group(struct inode *dir,
 	OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot;
 }
 
+int ocfs2_find_new_inode_loc(struct inode *dir,
+			     struct buffer_head *parent_fe_bh,
+			     struct ocfs2_alloc_context *ac,
+			     u64 *fe_blkno)
+{
+	int ret;
+	handle_t *handle = NULL;
+	struct ocfs2_suballoc_result *res;
+
+	BUG_ON(!ac);
+	BUG_ON(ac->ac_bits_given != 0);
+	BUG_ON(ac->ac_bits_wanted != 1);
+	BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
+
+	res = kzalloc(sizeof(*res), GFP_NOFS);
+	if (res == NULL) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);
+
+	/*
+	 * The handle started here is for chain relink. Alternatively,
+	 * we could just disable relink for these calls.
+	 */
+	handle = ocfs2_start_trans(OCFS2_SB(dir->i_sb), OCFS2_SUBALLOC_ALLOC);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * This will instruct ocfs2_claim_suballoc_bits and
+	 * ocfs2_search_one_group to search but save actual allocation
+	 * for later.
+	 */
+	ac->ac_find_loc_only = 1;
+
+	ret = ocfs2_claim_suballoc_bits(ac, handle, 1, 1, res);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ac->ac_find_loc_priv = res;
+	*fe_blkno = res->sr_blkno;
+
+out:
+	if (handle)
+		ocfs2_commit_trans(OCFS2_SB(dir->i_sb), handle);
+
+	if (ret)
+		kfree(res);
+
+	return ret;
+}
+
+int ocfs2_claim_new_inode_at_loc(handle_t *handle,
+				 struct inode *dir,
+				 struct ocfs2_alloc_context *ac,
+				 u64 *suballoc_loc,
+				 u16 *suballoc_bit,
+				 u64 di_blkno)
+{
+	int ret;
+	u16 chain;
+	struct ocfs2_suballoc_result *res = ac->ac_find_loc_priv;
+	struct buffer_head *bg_bh = NULL;
+	struct ocfs2_group_desc *bg;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *) ac->ac_bh->b_data;
+
+	/*
+	 * Since di_blkno is being passed back in, we check for any
+	 * inconsistencies which may have happened between
+	 * calls. These are code bugs as di_blkno is not expected to
+	 * change once returned from ocfs2_find_new_inode_loc()
+	 */
+	BUG_ON(res->sr_blkno != di_blkno);
+
+	ret = ocfs2_read_group_descriptor(ac->ac_inode, di,
+					  res->sr_bg_stable_blkno, &bg_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	bg = (struct ocfs2_group_desc *) bg_bh->b_data;
+	chain = le16_to_cpu(bg->bg_chain);
+
+	ret = ocfs2_alloc_dinode_update_counts(ac->ac_inode, handle,
+					       ac->ac_bh, res->sr_bits,
+					       chain);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_block_group_set_bits(handle,
+					 ac->ac_inode,
+					 bg,
+					 bg_bh,
+					 res->sr_bit_offset,
+					 res->sr_bits);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	mlog(0, "Allocated %u bits from suballocator %llu\n", res->sr_bits,
+	     (unsigned long long)di_blkno);
+
+	atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
+
+	BUG_ON(res->sr_bits != 1);
+
+	*suballoc_loc = res->sr_bg_blkno;
+	*suballoc_bit = res->sr_bit_offset;
+	ac->ac_bits_given++;
+	ocfs2_save_inode_ac_group(dir, ac);
+
+out:
+	brelse(bg_bh);
+
+	return ret;
+}
+
 int ocfs2_claim_new_inode(handle_t *handle,
 			  struct inode *dir,
 			  struct buffer_head *parent_fe_bh,
@@ -2253,6 +2427,14 @@ static int ocfs2_block_group_clear_bits(handle_t *handle,
 				(unsigned long *) undo_bg->bg_bitmap);
 	}
 	le16_add_cpu(&bg->bg_free_bits_count, num_bits);
+	if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
+		ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
+			    " count %u but claims %u are freed. num_bits %d",
+			    (unsigned long long)le64_to_cpu(bg->bg_blkno),
+			    le16_to_cpu(bg->bg_bits),
+			    le16_to_cpu(bg->bg_free_bits_count), num_bits);
+		return -EROFS;
+	}
 
 	if (undo_fn)
 		jbd_unlock_bh_state(group_bh);
@@ -2567,7 +2749,8 @@ out:
  * suballoc_bit.
  */
 static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
-				       u16 *suballoc_slot, u16 *suballoc_bit)
+				       u16 *suballoc_slot, u64 *group_blkno,
+				       u16 *suballoc_bit)
 {
 	int status;
 	struct buffer_head *inode_bh = NULL;
@@ -2604,6 +2787,8 @@ static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
 		*suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot);
 	if (suballoc_bit)
 		*suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit);
+	if (group_blkno)
+		*group_blkno = le64_to_cpu(inode_fe->i_suballoc_loc);
 
 bail:
 	brelse(inode_bh);
@@ -2621,7 +2806,8 @@ bail:
  */
 static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
 				   struct inode *suballoc,
-				   struct buffer_head *alloc_bh, u64 blkno,
+				   struct buffer_head *alloc_bh,
+				   u64 group_blkno, u64 blkno,
 				   u16 bit, int *res)
 {
 	struct ocfs2_dinode *alloc_di;
@@ -2642,10 +2828,8 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
 		goto bail;
 	}
 
-	if (alloc_di->i_suballoc_loc)
-		bg_blkno = le64_to_cpu(alloc_di->i_suballoc_loc);
-	else
-		bg_blkno = ocfs2_which_suballoc_group(blkno, bit);
+	bg_blkno = group_blkno ? group_blkno :
+		   ocfs2_which_suballoc_group(blkno, bit);
 	status = ocfs2_read_group_descriptor(suballoc, alloc_di, bg_blkno,
 					     &group_bh);
 	if (status < 0) {
@@ -2680,6 +2864,7 @@ bail:
 int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
 {
 	int status;
+	u64 group_blkno = 0;
 	u16 suballoc_bit = 0, suballoc_slot = 0;
 	struct inode *inode_alloc_inode;
 	struct buffer_head *alloc_bh = NULL;
@@ -2687,7 +2872,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
 	mlog_entry("blkno: %llu", (unsigned long long)blkno);
 
 	status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot,
-					     &suballoc_bit);
+					     &group_blkno, &suballoc_bit);
 	if (status < 0) {
 		mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status);
 		goto bail;
@@ -2715,7 +2900,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
 	}
 
 	status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh,
-					 blkno, suballoc_bit, res);
+					 group_blkno, blkno, suballoc_bit, res);
 	if (status < 0)
 		mlog(ML_ERROR, "test suballoc bit failed %d\n", status);
 
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index a017dd3ee7d..b8afabfeede 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -56,6 +56,9 @@ struct ocfs2_alloc_context {
 	u64    ac_max_block;  /* Highest block number to allocate. 0 is
 				 is the same as ~0 - unlimited */
 
+	int    ac_find_loc_only;  /* hack for reflink operation ordering */
+	struct ocfs2_suballoc_result *ac_find_loc_priv; /* */
+
 	struct ocfs2_alloc_reservation	*ac_resv;
 };
 
@@ -197,4 +200,22 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et,
 			  struct ocfs2_alloc_context **meta_ac);
 
 int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res);
+
+
+
+/*
+ * The following two interfaces are for ocfs2_create_inode_in_orphan().
+ */
+int ocfs2_find_new_inode_loc(struct inode *dir,
+			     struct buffer_head *parent_fe_bh,
+			     struct ocfs2_alloc_context *ac,
+			     u64 *fe_blkno);
+
+int ocfs2_claim_new_inode_at_loc(handle_t *handle,
+				 struct inode *dir,
+				 struct ocfs2_alloc_context *ac,
+				 u64 *suballoc_loc,
+				 u16 *suballoc_bit,
+				 u64 di_blkno);
+
 #endif /* _CHAINALLOC_H_ */
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 0eaa929a4db..f02c0ef3157 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -145,8 +145,7 @@ static const struct super_operations ocfs2_sops = {
 	.alloc_inode	= ocfs2_alloc_inode,
 	.destroy_inode	= ocfs2_destroy_inode,
 	.drop_inode	= ocfs2_drop_inode,
-	.clear_inode	= ocfs2_clear_inode,
-	.delete_inode	= ocfs2_delete_inode,
+	.evict_inode	= ocfs2_evict_inode,
 	.sync_fs	= ocfs2_sync_fs,
 	.put_super	= ocfs2_put_super,
 	.remount_fs	= ocfs2_remount,
@@ -163,6 +162,7 @@ enum {
 	Opt_nointr,
 	Opt_hb_none,
 	Opt_hb_local,
+	Opt_hb_global,
 	Opt_data_ordered,
 	Opt_data_writeback,
 	Opt_atime_quantum,
@@ -178,6 +178,8 @@ enum {
 	Opt_noacl,
 	Opt_usrquota,
 	Opt_grpquota,
+	Opt_coherency_buffered,
+	Opt_coherency_full,
 	Opt_resv_level,
 	Opt_dir_resv_level,
 	Opt_err,
@@ -191,6 +193,7 @@ static const match_table_t tokens = {
 	{Opt_nointr, "nointr"},
 	{Opt_hb_none, OCFS2_HB_NONE},
 	{Opt_hb_local, OCFS2_HB_LOCAL},
+	{Opt_hb_global, OCFS2_HB_GLOBAL},
 	{Opt_data_ordered, "data=ordered"},
 	{Opt_data_writeback, "data=writeback"},
 	{Opt_atime_quantum, "atime_quantum=%u"},
@@ -206,6 +209,8 @@ static const match_table_t tokens = {
 	{Opt_noacl, "noacl"},
 	{Opt_usrquota, "usrquota"},
 	{Opt_grpquota, "grpquota"},
+	{Opt_coherency_buffered, "coherency=buffered"},
+	{Opt_coherency_full, "coherency=full"},
 	{Opt_resv_level, "resv_level=%u"},
 	{Opt_dir_resv_level, "dir_resv_level=%u"},
 	{Opt_err, NULL}
@@ -515,11 +520,11 @@ static void ocfs2_release_system_inodes(struct ocfs2_super *osb)
 
 	mlog_entry_void();
 
-	for (i = 0; i < NUM_SYSTEM_INODES; i++) {
-		inode = osb->system_inodes[i];
+	for (i = 0; i < NUM_GLOBAL_SYSTEM_INODES; i++) {
+		inode = osb->global_system_inodes[i];
 		if (inode) {
 			iput(inode);
-			osb->system_inodes[i] = NULL;
+			osb->global_system_inodes[i] = NULL;
 		}
 	}
 
@@ -535,6 +540,20 @@ static void ocfs2_release_system_inodes(struct ocfs2_super *osb)
 		osb->root_inode = NULL;
 	}
 
+	if (!osb->local_system_inodes)
+		goto out;
+
+	for (i = 0; i < NUM_LOCAL_SYSTEM_INODES * osb->max_slots; i++) {
+		if (osb->local_system_inodes[i]) {
+			iput(osb->local_system_inodes[i]);
+			osb->local_system_inodes[i] = NULL;
+		}
+	}
+
+	kfree(osb->local_system_inodes);
+	osb->local_system_inodes = NULL;
+
+out:
 	mlog_exit(0);
 }
 
@@ -609,8 +628,7 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
 	int ret = 0;
 	struct mount_options parsed_options;
 	struct ocfs2_super *osb = OCFS2_SB(sb);
-
-	lock_kernel();
+	u32 tmp;
 
 	if (!ocfs2_parse_options(sb, data, &parsed_options, 1) ||
 	    !ocfs2_check_set_options(sb, &parsed_options)) {
@@ -618,8 +636,9 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
 		goto out;
 	}
 
-	if ((osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) !=
-	    (parsed_options.mount_opt & OCFS2_MOUNT_HB_LOCAL)) {
+	tmp = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL |
+		OCFS2_MOUNT_HB_NONE;
+	if ((osb->s_mount_opt & tmp) != (parsed_options.mount_opt & tmp)) {
 		ret = -EINVAL;
 		mlog(ML_ERROR, "Cannot change heartbeat mode on remount\n");
 		goto out;
@@ -718,7 +737,6 @@ unlock_osb:
 							MS_POSIXACL : 0);
 	}
 out:
-	unlock_kernel();
 	return ret;
 }
 
@@ -810,23 +828,29 @@ bail:
 
 static int ocfs2_verify_heartbeat(struct ocfs2_super *osb)
 {
-	if (ocfs2_mount_local(osb)) {
-		if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) {
+	u32 hb_enabled = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL;
+
+	if (osb->s_mount_opt & hb_enabled) {
+		if (ocfs2_mount_local(osb)) {
 			mlog(ML_ERROR, "Cannot heartbeat on a locally "
 			     "mounted device.\n");
 			return -EINVAL;
 		}
-	}
-
-	if (ocfs2_userspace_stack(osb)) {
-		if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) {
+		if (ocfs2_userspace_stack(osb)) {
 			mlog(ML_ERROR, "Userspace stack expected, but "
 			     "o2cb heartbeat arguments passed to mount\n");
 			return -EINVAL;
 		}
+		if (((osb->s_mount_opt & OCFS2_MOUNT_HB_GLOBAL) &&
+		     !ocfs2_cluster_o2cb_global_heartbeat(osb)) ||
+		    ((osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) &&
+		     ocfs2_cluster_o2cb_global_heartbeat(osb))) {
+			mlog(ML_ERROR, "Mismatching o2cb heartbeat modes\n");
+			return -EINVAL;
+		}
 	}
 
-	if (!(osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL)) {
+	if (!(osb->s_mount_opt & hb_enabled)) {
 		if (!ocfs2_mount_local(osb) && !ocfs2_is_hard_readonly(osb) &&
 		    !ocfs2_userspace_stack(osb)) {
 			mlog(ML_ERROR, "Heartbeat has to be started to mount "
@@ -1212,14 +1236,12 @@ read_super_error:
 	return status;
 }
 
-static int ocfs2_get_sb(struct file_system_type *fs_type,
+static struct dentry *ocfs2_mount(struct file_system_type *fs_type,
 			int flags,
 			const char *dev_name,
-			void *data,
-			struct vfsmount *mnt)
+			void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super,
-			   mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super);
 }
 
 static void ocfs2_kill_sb(struct super_block *sb)
@@ -1243,8 +1265,7 @@ out:
 static struct file_system_type ocfs2_fs_type = {
 	.owner          = THIS_MODULE,
 	.name           = "ocfs2",
-	.get_sb         = ocfs2_get_sb, /* is this called when we mount
-					* the fs? */
+	.mount          = ocfs2_mount,
 	.kill_sb        = ocfs2_kill_sb,
 
 	.fs_flags       = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE,
@@ -1292,6 +1313,7 @@ static int ocfs2_parse_options(struct super_block *sb,
 {
 	int status;
 	char *p;
+	u32 tmp;
 
 	mlog_entry("remount: %d, options: \"%s\"\n", is_remount,
 		   options ? options : "(none)");
@@ -1323,7 +1345,10 @@ static int ocfs2_parse_options(struct super_block *sb,
 			mopt->mount_opt |= OCFS2_MOUNT_HB_LOCAL;
 			break;
 		case Opt_hb_none:
-			mopt->mount_opt &= ~OCFS2_MOUNT_HB_LOCAL;
+			mopt->mount_opt |= OCFS2_MOUNT_HB_NONE;
+			break;
+		case Opt_hb_global:
+			mopt->mount_opt |= OCFS2_MOUNT_HB_GLOBAL;
 			break;
 		case Opt_barrier:
 			if (match_int(&args[0], &option)) {
@@ -1439,6 +1464,12 @@ static int ocfs2_parse_options(struct super_block *sb,
 		case Opt_grpquota:
 			mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA;
 			break;
+		case Opt_coherency_buffered:
+			mopt->mount_opt |= OCFS2_MOUNT_COHERENCY_BUFFERED;
+			break;
+		case Opt_coherency_full:
+			mopt->mount_opt &= ~OCFS2_MOUNT_COHERENCY_BUFFERED;
+			break;
 		case Opt_acl:
 			mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL;
 			mopt->mount_opt &= ~OCFS2_MOUNT_NO_POSIX_ACL;
@@ -1478,6 +1509,15 @@ static int ocfs2_parse_options(struct super_block *sb,
 		}
 	}
 
+	/* Ensure only one heartbeat mode */
+	tmp = mopt->mount_opt & (OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL |
+				 OCFS2_MOUNT_HB_NONE);
+	if (hweight32(tmp) != 1) {
+		mlog(ML_ERROR, "Invalid heartbeat mount options\n");
+		status = 0;
+		goto bail;
+	}
+
 	status = 1;
 
 bail:
@@ -1491,10 +1531,14 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 	unsigned long opts = osb->s_mount_opt;
 	unsigned int local_alloc_megs;
 
-	if (opts & OCFS2_MOUNT_HB_LOCAL)
-		seq_printf(s, ",_netdev,heartbeat=local");
-	else
-		seq_printf(s, ",heartbeat=none");
+	if (opts & (OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL)) {
+		seq_printf(s, ",_netdev");
+		if (opts & OCFS2_MOUNT_HB_LOCAL)
+			seq_printf(s, ",%s", OCFS2_HB_LOCAL);
+		else
+			seq_printf(s, ",%s", OCFS2_HB_GLOBAL);
+	} else
+		seq_printf(s, ",%s", OCFS2_HB_NONE);
 
 	if (opts & OCFS2_MOUNT_NOINTR)
 		seq_printf(s, ",nointr");
@@ -1537,6 +1581,11 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 	if (opts & OCFS2_MOUNT_GRPQUOTA)
 		seq_printf(s, ",grpquota");
 
+	if (opts & OCFS2_MOUNT_COHERENCY_BUFFERED)
+		seq_printf(s, ",coherency=buffered");
+	else
+		seq_printf(s, ",coherency=full");
+
 	if (opts & OCFS2_MOUNT_NOUSERXATTR)
 		seq_printf(s, ",nouser_xattr");
 	else
@@ -1641,13 +1690,9 @@ static void ocfs2_put_super(struct super_block *sb)
 {
 	mlog_entry("(0x%p)\n", sb);
 
-	lock_kernel();
-
 	ocfs2_sync_blockdev(sb);
 	ocfs2_dismount_volume(sb, 0);
 
-	unlock_kernel();
-
 	mlog_exit_void();
 }
 
@@ -1991,6 +2036,36 @@ static int ocfs2_setup_osb_uuid(struct ocfs2_super *osb, const unsigned char *uu
 	return 0;
 }
 
+/* Make sure entire volume is addressable by our journal.  Requires
+   osb_clusters_at_boot to be valid and for the journal to have been
+   initialized by ocfs2_journal_init(). */
+static int ocfs2_journal_addressable(struct ocfs2_super *osb)
+{
+	int status = 0;
+	u64 max_block =
+		ocfs2_clusters_to_blocks(osb->sb,
+					 osb->osb_clusters_at_boot) - 1;
+
+	/* 32-bit block number is always OK. */
+	if (max_block <= (u32)~0ULL)
+		goto out;
+
+	/* Volume is "huge", so see if our journal is new enough to
+	   support it. */
+	if (!(OCFS2_HAS_COMPAT_FEATURE(osb->sb,
+				       OCFS2_FEATURE_COMPAT_JBD2_SB) &&
+	      jbd2_journal_check_used_features(osb->journal->j_journal, 0, 0,
+					       JBD2_FEATURE_INCOMPAT_64BIT))) {
+		mlog(ML_ERROR, "The journal cannot address the entire volume. "
+		     "Enable the 'block64' journal option with tunefs.ocfs2");
+		status = -EFBIG;
+		goto out;
+	}
+
+ out:
+	return status;
+}
+
 static int ocfs2_initialize_super(struct super_block *sb,
 				  struct buffer_head *bh,
 				  int sector_size,
@@ -2003,6 +2078,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
 	struct ocfs2_journal *journal;
 	__le32 uuid_net_key;
 	struct ocfs2_super *osb;
+	u64 total_blocks;
 
 	mlog_entry_void();
 
@@ -2061,6 +2137,15 @@ static int ocfs2_initialize_super(struct super_block *sb,
 	snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u",
 		 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
 
+	osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots);
+	if (osb->max_slots > OCFS2_MAX_SLOTS || osb->max_slots == 0) {
+		mlog(ML_ERROR, "Invalid number of node slots (%u)\n",
+		     osb->max_slots);
+		status = -EINVAL;
+		goto bail;
+	}
+	mlog(0, "max_slots for this device: %u\n", osb->max_slots);
+
 	ocfs2_orphan_scan_init(osb);
 
 	status = ocfs2_recovery_init(osb);
@@ -2099,15 +2184,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
 		goto bail;
 	}
 
-	osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots);
-	if (osb->max_slots > OCFS2_MAX_SLOTS || osb->max_slots == 0) {
-		mlog(ML_ERROR, "Invalid number of node slots (%u)\n",
-		     osb->max_slots);
-		status = -EINVAL;
-		goto bail;
-	}
-	mlog(0, "max_slots for this device: %u\n", osb->max_slots);
-
 	osb->slot_recovery_generations =
 		kcalloc(osb->max_slots, sizeof(*osb->slot_recovery_generations),
 			GFP_KERNEL);
@@ -2150,7 +2226,9 @@ static int ocfs2_initialize_super(struct super_block *sb,
 		goto bail;
 	}
 
-	if (ocfs2_userspace_stack(osb)) {
+	if (ocfs2_clusterinfo_valid(osb)) {
+		osb->osb_stackflags =
+			OCFS2_RAW_SB(di)->s_cluster_info.ci_stackflags;
 		memcpy(osb->osb_cluster_stack,
 		       OCFS2_RAW_SB(di)->s_cluster_info.ci_stack,
 		       OCFS2_STACK_LABEL_LEN);
@@ -2215,11 +2293,15 @@ static int ocfs2_initialize_super(struct super_block *sb,
 		goto bail;
 	}
 
-	if (ocfs2_clusters_to_blocks(osb->sb, le32_to_cpu(di->i_clusters) - 1)
-	    > (u32)~0UL) {
-		mlog(ML_ERROR, "Volume might try to write to blocks beyond "
-		     "what jbd can address in 32 bits.\n");
-		status = -EINVAL;
+	total_blocks = ocfs2_clusters_to_blocks(osb->sb,
+						le32_to_cpu(di->i_clusters));
+
+	status = generic_check_addressable(osb->sb->s_blocksize_bits,
+					   total_blocks);
+	if (status) {
+		mlog(ML_ERROR, "Volume too large "
+		     "to mount safely on this system");
+		status = -EFBIG;
 		goto bail;
 	}
 
@@ -2381,6 +2463,12 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
 		goto finally;
 	}
 
+	/* Now that journal has been initialized, check to make sure
+	   entire volume is addressable. */
+	status = ocfs2_journal_addressable(osb);
+	if (status)
+		goto finally;
+
 	/* If the journal was unmounted cleanly then we don't want to
 	 * recover anything. Otherwise, journal_load will do that
 	 * dirty work for us :) */
@@ -2472,7 +2560,7 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
 	kfree(osb->slot_recovery_generations);
 	/* FIXME
 	 * This belongs in journal shutdown, but because we have to
-	 * allocate osb->journal at the start of ocfs2_initalize_osb(),
+	 * allocate osb->journal at the start of ocfs2_initialize_osb(),
 	 * we free it here.
 	 */
 	kfree(osb->journal);
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index 32499d213fc..9975457c981 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -128,7 +128,7 @@ static void *ocfs2_fast_follow_link(struct dentry *dentry,
 	}
 
 	/* Fast symlinks can't be large */
-	len = strlen(target);
+	len = strnlen(target, ocfs2_fast_symlink_chars(inode->i_sb));
 	link = kzalloc(len + 1, GFP_NOFS);
 	if (!link) {
 		status = -ENOMEM;
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
index bfe7190cdbf..902efb23b6a 100644
--- a/fs/ocfs2/sysfile.c
+++ b/fs/ocfs2/sysfile.c
@@ -44,11 +44,6 @@ static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb,
 						   int type,
 						   u32 slot);
 
-static inline int is_global_system_inode(int type);
-static inline int is_in_system_inode_array(struct ocfs2_super *osb,
-					   int type,
-					   u32 slot);
-
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 static struct lock_class_key ocfs2_sysfile_cluster_lock_key[NUM_SYSTEM_INODES];
 #endif
@@ -59,11 +54,52 @@ static inline int is_global_system_inode(int type)
 		type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE;
 }
 
-static inline int is_in_system_inode_array(struct ocfs2_super *osb,
-					   int type,
-					   u32 slot)
+static struct inode **get_local_system_inode(struct ocfs2_super *osb,
+					     int type,
+					     u32 slot)
 {
-	return slot == osb->slot_num || is_global_system_inode(type);
+	int index;
+	struct inode **local_system_inodes, **free = NULL;
+
+	BUG_ON(slot == OCFS2_INVALID_SLOT);
+	BUG_ON(type < OCFS2_FIRST_LOCAL_SYSTEM_INODE ||
+	       type > OCFS2_LAST_LOCAL_SYSTEM_INODE);
+
+	spin_lock(&osb->osb_lock);
+	local_system_inodes = osb->local_system_inodes;
+	spin_unlock(&osb->osb_lock);
+
+	if (unlikely(!local_system_inodes)) {
+		local_system_inodes = kzalloc(sizeof(struct inode *) *
+					      NUM_LOCAL_SYSTEM_INODES *
+					      osb->max_slots,
+					      GFP_NOFS);
+		if (!local_system_inodes) {
+			mlog_errno(-ENOMEM);
+			/*
+			 * return NULL here so that ocfs2_get_sytem_file_inodes
+			 * will try to create an inode and use it. We will try
+			 * to initialize local_system_inodes next time.
+			 */
+			return NULL;
+		}
+
+		spin_lock(&osb->osb_lock);
+		if (osb->local_system_inodes) {
+			/* Someone has initialized it for us. */
+			free = local_system_inodes;
+			local_system_inodes = osb->local_system_inodes;
+		} else
+			osb->local_system_inodes = local_system_inodes;
+		spin_unlock(&osb->osb_lock);
+		if (unlikely(free))
+			kfree(free);
+	}
+
+	index = (slot * NUM_LOCAL_SYSTEM_INODES) +
+		(type - OCFS2_FIRST_LOCAL_SYSTEM_INODE);
+
+	return &local_system_inodes[index];
 }
 
 struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb,
@@ -74,8 +110,10 @@ struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb,
 	struct inode **arr = NULL;
 
 	/* avoid the lookup if cached in local system file array */
-	if (is_in_system_inode_array(osb, type, slot))
-		arr = &(osb->system_inodes[type]);
+	if (is_global_system_inode(type)) {
+		arr = &(osb->global_system_inodes[type]);
+	} else
+		arr = get_local_system_inode(osb, type, slot);
 
 	if (arr && ((inode = *arr) != NULL)) {
 		/* get a ref in addition to the array ref */
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index d03469f6180..67cd4391464 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -1286,13 +1286,11 @@ int ocfs2_xattr_get_nolock(struct inode *inode,
 	xis.inode_bh = xbs.inode_bh = di_bh;
 	di = (struct ocfs2_dinode *)di_bh->b_data;
 
-	down_read(&oi->ip_xattr_sem);
 	ret = ocfs2_xattr_ibody_get(inode, name_index, name, buffer,
 				    buffer_size, &xis);
 	if (ret == -ENODATA && di->i_xattr_loc)
 		ret = ocfs2_xattr_block_get(inode, name_index, name, buffer,
 					    buffer_size, &xbs);
-	up_read(&oi->ip_xattr_sem);
 
 	return ret;
 }
@@ -1316,8 +1314,10 @@ static int ocfs2_xattr_get(struct inode *inode,
 		mlog_errno(ret);
 		return ret;
 	}
+	down_read(&OCFS2_I(inode)->ip_xattr_sem);
 	ret = ocfs2_xattr_get_nolock(inode, di_bh, name_index,
 				     name, buffer, buffer_size);
+	up_read(&OCFS2_I(inode)->ip_xattr_sem);
 
 	ocfs2_inode_unlock(inode, 0);
 
@@ -7081,7 +7081,7 @@ static int ocfs2_reflink_xattr_in_block(struct ocfs2_xattr_reflink *args,
 		goto out;
 	}
 
-	if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED))
+	if (!indexed)
 		ret = ocfs2_reflink_xattr_block(args, blk_bh, new_blk_bh);
 	else
 		ret = ocfs2_reflink_xattr_tree(args, blk_bh, new_blk_bh);
diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index b42d6241903..393f3f659da 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c
@@ -25,11 +25,10 @@ static struct buffer_head *omfs_get_bucket(struct inode *dir,
 		const char *name, int namelen, int *ofs)
 {
 	int nbuckets = (dir->i_size - OMFS_DIR_START)/8;
-	int block = clus_to_blk(OMFS_SB(dir->i_sb), dir->i_ino);
 	int bucket = omfs_hash(name, namelen, nbuckets);
 
 	*ofs = OMFS_DIR_START + bucket * 8;
-	return sb_bread(dir->i_sb, block);
+	return omfs_bread(dir->i_sb, dir->i_ino);
 }
 
 static struct buffer_head *omfs_scan_list(struct inode *dir, u64 block,
@@ -42,8 +41,7 @@ static struct buffer_head *omfs_scan_list(struct inode *dir, u64 block,
 	*prev_block = ~0;
 
 	while (block != ~0) {
-		bh = sb_bread(dir->i_sb,
-			clus_to_blk(OMFS_SB(dir->i_sb), block));
+		bh = omfs_bread(dir->i_sb, block);
 		if (!bh) {
 			err = -EIO;
 			goto err;
@@ -86,11 +84,10 @@ static struct buffer_head *omfs_find_entry(struct inode *dir,
 int omfs_make_empty(struct inode *inode, struct super_block *sb)
 {
 	struct omfs_sb_info *sbi = OMFS_SB(sb);
-	int block = clus_to_blk(sbi, inode->i_ino);
 	struct buffer_head *bh;
 	struct omfs_inode *oi;
 
-	bh = sb_bread(sb, block);
+	bh = omfs_bread(sb, inode->i_ino);
 	if (!bh)
 		return -ENOMEM;
 
@@ -134,7 +131,7 @@ static int omfs_add_link(struct dentry *dentry, struct inode *inode)
 	brelse(bh);
 
 	/* now set the sibling and parent pointers on the new inode */
-	bh = sb_bread(dir->i_sb, clus_to_blk(OMFS_SB(dir->i_sb), inode->i_ino));
+	bh = omfs_bread(dir->i_sb, inode->i_ino);
 	if (!bh)
 		goto out;
 
@@ -190,8 +187,7 @@ static int omfs_delete_entry(struct dentry *dentry)
 	if (prev != ~0) {
 		/* found in middle of list, get list ptr */
 		brelse(bh);
-		bh = sb_bread(dir->i_sb,
-			clus_to_blk(OMFS_SB(dir->i_sb), prev));
+		bh = omfs_bread(dir->i_sb, prev);
 		if (!bh)
 			goto out;
 
@@ -224,8 +220,7 @@ static int omfs_dir_is_empty(struct inode *inode)
 	u64 *ptr;
 	int i;
 
-	bh = sb_bread(inode->i_sb, clus_to_blk(OMFS_SB(inode->i_sb),
-			inode->i_ino));
+	bh = omfs_bread(inode->i_sb, inode->i_ino);
 
 	if (!bh)
 		return 0;
@@ -353,8 +348,7 @@ static int omfs_fill_chain(struct file *filp, void *dirent, filldir_t filldir,
 
 	/* follow chain in this bucket */
 	while (fsblock != ~0) {
-		bh = sb_bread(dir->i_sb, clus_to_blk(OMFS_SB(dir->i_sb),
-				fsblock));
+		bh = omfs_bread(dir->i_sb, fsblock);
 		if (!bh)
 			goto out;
 
@@ -466,7 +460,7 @@ static int omfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	hchain = (filp->f_pos >> 20) - 1;
 	hindex = filp->f_pos & 0xfffff;
 
-	bh = sb_bread(dir->i_sb, clus_to_blk(OMFS_SB(dir->i_sb), dir->i_ino));
+	bh = omfs_bread(dir->i_sb, dir->i_ino);
 	if (!bh)
 		goto out;
 
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index 6e7a3291bbe..8a6d34fa668 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -50,7 +50,7 @@ int omfs_shrink_inode(struct inode *inode)
 	if (inode->i_size != 0)
 		goto out;
 
-	bh = sb_bread(inode->i_sb, clus_to_blk(sbi, next));
+	bh = omfs_bread(inode->i_sb, next);
 	if (!bh)
 		goto out;
 
@@ -90,7 +90,7 @@ int omfs_shrink_inode(struct inode *inode)
 		if (next == ~0)
 			break;
 
-		bh = sb_bread(inode->i_sb, clus_to_blk(sbi, next));
+		bh = omfs_bread(inode->i_sb, next);
 		if (!bh)
 			goto out;
 		oe = (struct omfs_extent *) (&bh->b_data[OMFS_EXTENT_CONT]);
@@ -222,7 +222,7 @@ static int omfs_get_block(struct inode *inode, sector_t block,
 	struct buffer_head *bh;
 	sector_t next, offset;
 	int ret;
-	u64 new_block;
+	u64 uninitialized_var(new_block);
 	u32 max_extents;
 	int extent_count;
 	struct omfs_extent *oe;
@@ -232,7 +232,7 @@ static int omfs_get_block(struct inode *inode, sector_t block,
 	int remain;
 
 	ret = -EIO;
-	bh = sb_bread(inode->i_sb, clus_to_blk(sbi, inode->i_ino));
+	bh = omfs_bread(inode->i_sb, inode->i_ino);
 	if (!bh)
 		goto out;
 
@@ -265,7 +265,7 @@ static int omfs_get_block(struct inode *inode, sector_t block,
 			break;
 
 		brelse(bh);
-		bh = sb_bread(inode->i_sb, clus_to_blk(sbi, next));
+		bh = omfs_bread(inode->i_sb, next);
 		if (!bh)
 			goto out;
 		oe = (struct omfs_extent *) (&bh->b_data[OMFS_EXTENT_CONT]);
@@ -312,9 +312,17 @@ static int omfs_write_begin(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
 			struct page **pagep, void **fsdata)
 {
-	*pagep = NULL;
-	return block_write_begin(file, mapping, pos, len, flags,
-				pagep, fsdata, omfs_get_block);
+	int ret;
+
+	ret = block_write_begin(mapping, pos, len, flags, pagep,
+				omfs_get_block);
+	if (unlikely(ret)) {
+		loff_t isize = mapping->host->i_size;
+		if (pos + len > isize)
+			vmtruncate(mapping->host, isize);
+	}
+
+	return ret;
 }
 
 static sector_t omfs_bmap(struct address_space *mapping, sector_t block)
@@ -333,7 +341,29 @@ const struct file_operations omfs_file_operations = {
 	.splice_read = generic_file_splice_read,
 };
 
+static int omfs_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = dentry->d_inode;
+	int error;
+
+	error = inode_change_ok(inode, attr);
+	if (error)
+		return error;
+
+	if ((attr->ia_valid & ATTR_SIZE) &&
+	    attr->ia_size != i_size_read(inode)) {
+		error = vmtruncate(inode, attr->ia_size);
+		if (error)
+			return error;
+	}
+
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+	return 0;
+}
+
 const struct inode_operations omfs_file_inops = {
+	.setattr = omfs_setattr,
 	.truncate = omfs_truncate
 };
 
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index 089839a6cc6..e043c4cb9a9 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -19,6 +19,15 @@ MODULE_AUTHOR("Bob Copeland <me@bobcopeland.com>");
 MODULE_DESCRIPTION("OMFS (ReplayTV/Karma) Filesystem for Linux");
 MODULE_LICENSE("GPL");
 
+struct buffer_head *omfs_bread(struct super_block *sb, sector_t block)
+{
+	struct omfs_sb_info *sbi = OMFS_SB(sb);
+	if (block >= sbi->s_num_blocks)
+		return NULL;
+
+	return sb_bread(sb, clus_to_blk(sbi, block));
+}
+
 struct inode *omfs_new_inode(struct inode *dir, int mode)
 {
 	struct inode *inode;
@@ -93,15 +102,13 @@ static int __omfs_write_inode(struct inode *inode, int wait)
 	struct omfs_inode *oi;
 	struct omfs_sb_info *sbi = OMFS_SB(inode->i_sb);
 	struct buffer_head *bh, *bh2;
-	unsigned int block;
 	u64 ctime;
 	int i;
 	int ret = -EIO;
 	int sync_failed = 0;
 
 	/* get current inode since we may have written sibling ptrs etc. */
-	block = clus_to_blk(sbi, inode->i_ino);
-	bh = sb_bread(inode->i_sb, block);
+	bh = omfs_bread(inode->i_sb, inode->i_ino);
 	if (!bh)
 		goto out;
 
@@ -140,8 +147,7 @@ static int __omfs_write_inode(struct inode *inode, int wait)
 
 	/* if mirroring writes, copy to next fsblock */
 	for (i = 1; i < sbi->s_mirrors; i++) {
-		bh2 = sb_bread(inode->i_sb, block + i *
-			(sbi->s_blocksize / sbi->s_sys_blocksize));
+		bh2 = omfs_bread(inode->i_sb, inode->i_ino + i);
 		if (!bh2)
 			goto out_brelse;
 
@@ -175,9 +181,13 @@ int omfs_sync_inode(struct inode *inode)
  * called when an entry is deleted, need to clear the bits in the
  * bitmaps.
  */
-static void omfs_delete_inode(struct inode *inode)
+static void omfs_evict_inode(struct inode *inode)
 {
 	truncate_inode_pages(&inode->i_data, 0);
+	end_writeback(inode);
+
+	if (inode->i_nlink)
+		return;
 
 	if (S_ISREG(inode->i_mode)) {
 		inode->i_size = 0;
@@ -185,7 +195,6 @@ static void omfs_delete_inode(struct inode *inode)
 	}
 
 	omfs_clear_range(inode->i_sb, inode->i_ino, 2);
-	clear_inode(inode);
 }
 
 struct inode *omfs_iget(struct super_block *sb, ino_t ino)
@@ -193,7 +202,6 @@ struct inode *omfs_iget(struct super_block *sb, ino_t ino)
 	struct omfs_sb_info *sbi = OMFS_SB(sb);
 	struct omfs_inode *oi;
 	struct buffer_head *bh;
-	unsigned int block;
 	u64 ctime;
 	unsigned long nsecs;
 	struct inode *inode;
@@ -204,8 +212,7 @@ struct inode *omfs_iget(struct super_block *sb, ino_t ino)
 	if (!(inode->i_state & I_NEW))
 		return inode;
 
-	block = clus_to_blk(sbi, ino);
-	bh = sb_bread(inode->i_sb, block);
+	bh = omfs_bread(inode->i_sb, ino);
 	if (!bh)
 		goto iget_failed;
 
@@ -284,7 +291,7 @@ static int omfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 
 static const struct super_operations omfs_sops = {
 	.write_inode	= omfs_write_inode,
-	.delete_inode	= omfs_delete_inode,
+	.evict_inode	= omfs_evict_inode,
 	.put_super	= omfs_put_super,
 	.statfs		= omfs_statfs,
 	.show_options	= generic_show_options,
@@ -319,6 +326,9 @@ static int omfs_get_imap(struct super_block *sb)
 		goto nomem;
 
 	block = clus_to_blk(sbi, sbi->s_bitmap_ino);
+	if (block >= sbi->s_num_blocks)
+		goto nomem;
+
 	ptr = sbi->s_imap;
 	for (count = bitmap_size; count > 0; count -= sb->s_blocksize) {
 		bh = sb_bread(sb, block++);
@@ -417,7 +427,6 @@ static int omfs_fill_super(struct super_block *sb, void *data, int silent)
 	struct omfs_root_block *omfs_rb;
 	struct omfs_sb_info *sbi;
 	struct inode *root;
-	sector_t start;
 	int ret = -EINVAL;
 
 	save_mount_options(sb, (char *) data);
@@ -486,8 +495,7 @@ static int omfs_fill_super(struct super_block *sb, void *data, int silent)
 	sbi->s_block_shift = get_bitmask_order(sbi->s_blocksize) -
 		get_bitmask_order(sbi->s_sys_blocksize);
 
-	start = clus_to_blk(sbi, be64_to_cpu(omfs_sb->s_root_block));
-	bh2 = sb_bread(sb, start);
+	bh2 = omfs_bread(sb, be64_to_cpu(omfs_sb->s_root_block));
 	if (!bh2)
 		goto out_brelse_bh;
 
@@ -504,6 +512,21 @@ static int omfs_fill_super(struct super_block *sb, void *data, int silent)
 		goto out_brelse_bh2;
 	}
 
+	if (sbi->s_bitmap_ino != ~0ULL &&
+	    sbi->s_bitmap_ino > sbi->s_num_blocks) {
+		printk(KERN_ERR "omfs: free space bitmap location is corrupt "
+			"(%llx, total blocks %llx)\n",
+			(unsigned long long) sbi->s_bitmap_ino,
+			(unsigned long long) sbi->s_num_blocks);
+		goto out_brelse_bh2;
+	}
+	if (sbi->s_clustersize < 1 ||
+	    sbi->s_clustersize > OMFS_MAX_CLUSTER_SIZE) {
+		printk(KERN_ERR "omfs: cluster size out of range (%d)",
+			sbi->s_clustersize);
+		goto out_brelse_bh2;
+	}
+
 	ret = omfs_get_imap(sb);
 	if (ret)
 		goto out_brelse_bh2;
@@ -529,20 +552,21 @@ out_brelse_bh2:
 out_brelse_bh:
 	brelse(bh);
 end:
+	if (ret)
+		kfree(sbi);
 	return ret;
 }
 
-static int omfs_get_sb(struct file_system_type *fs_type,
-			int flags, const char *dev_name,
-			void *data, struct vfsmount *m)
+static struct dentry *omfs_mount(struct file_system_type *fs_type,
+			int flags, const char *dev_name, void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, omfs_fill_super, m);
+	return mount_bdev(fs_type, flags, dev_name, data, omfs_fill_super);
 }
 
 static struct file_system_type omfs_fs_type = {
 	.owner = THIS_MODULE,
 	.name = "omfs",
-	.get_sb = omfs_get_sb,
+	.mount = omfs_mount,
 	.kill_sb = kill_block_super,
 	.fs_flags = FS_REQUIRES_DEV,
 };
diff --git a/fs/omfs/omfs.h b/fs/omfs/omfs.h
index ebe2fdbe535..7d414fef501 100644
--- a/fs/omfs/omfs.h
+++ b/fs/omfs/omfs.h
@@ -58,6 +58,7 @@ extern void omfs_make_empty_table(struct buffer_head *bh, int offset);
 extern int omfs_shrink_inode(struct inode *inode);
 
 /* inode.c */
+extern struct buffer_head *omfs_bread(struct super_block *sb, sector_t block);
 extern struct inode *omfs_iget(struct super_block *sb, ino_t inode);
 extern struct inode *omfs_new_inode(struct inode *dir, int mode);
 extern int omfs_reserve_block(struct super_block *sb, sector_t block);
diff --git a/fs/omfs/omfs_fs.h b/fs/omfs/omfs_fs.h
index 12cca245d6e..ee5e4327de9 100644
--- a/fs/omfs/omfs_fs.h
+++ b/fs/omfs/omfs_fs.h
@@ -17,6 +17,7 @@
 #define OMFS_EXTENT_CONT 0x40
 #define OMFS_XOR_COUNT 19
 #define OMFS_MAX_BLOCK_SIZE 8192
+#define OMFS_MAX_CLUSTER_SIZE 8
 
 struct omfs_super_block {
 	char s_fill1[256];
diff --git a/fs/open.c b/fs/open.c
index 5463266db9e..4197b9ed023 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -29,6 +29,7 @@
 #include <linux/falloc.h>
 #include <linux/fs_struct.h>
 #include <linux/ima.h>
+#include <linux/dnotify.h>
 
 #include "internal.h"
 
@@ -110,7 +111,7 @@ static long do_sys_truncate(const char __user *pathname, loff_t length)
 
 	error = locks_verify_truncate(inode, NULL, length);
 	if (!error)
-		error = security_path_truncate(&path, length, 0);
+		error = security_path_truncate(&path);
 	if (!error)
 		error = do_truncate(path.dentry, length, 0, NULL);
 
@@ -165,8 +166,7 @@ static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
 
 	error = locks_verify_truncate(inode, file, length);
 	if (!error)
-		error = security_path_truncate(&file->f_path, length,
-					       ATTR_MTIME|ATTR_CTIME);
+		error = security_path_truncate(&file->f_path);
 	if (!error)
 		error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, file);
 out_putf:
@@ -367,7 +367,7 @@ SYSCALL_DEFINE1(chdir, const char __user *, filename)
 	if (error)
 		goto out;
 
-	error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_ACCESS);
+	error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR);
 	if (error)
 		goto dput_and_out;
 
@@ -396,7 +396,7 @@ SYSCALL_DEFINE1(fchdir, unsigned int, fd)
 	if (!S_ISDIR(inode->i_mode))
 		goto out_putf;
 
-	error = inode_permission(inode, MAY_EXEC | MAY_ACCESS);
+	error = inode_permission(inode, MAY_EXEC | MAY_CHDIR);
 	if (!error)
 		set_fs_pwd(current->fs, &file->f_path);
 out_putf:
@@ -414,7 +414,7 @@ SYSCALL_DEFINE1(chroot, const char __user *, filename)
 	if (error)
 		goto out;
 
-	error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_ACCESS);
+	error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR);
 	if (error)
 		goto dput_and_out;
 
@@ -675,7 +675,7 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
 	f->f_path.mnt = mnt;
 	f->f_pos = 0;
 	f->f_op = fops_get(inode->i_fop);
-	file_move(f, &inode->i_sb->s_files);
+	file_sb_list_add(f, inode->i_sb);
 
 	error = security_dentry_open(f, cred);
 	if (error)
@@ -721,7 +721,7 @@ cleanup_all:
 			mnt_drop_write(mnt);
 		}
 	}
-	file_kill(f);
+	file_sb_list_del(f);
 	f->f_path.dentry = NULL;
 	f->f_path.mnt = NULL;
 cleanup_file:
@@ -786,11 +786,11 @@ struct file *nameidata_to_filp(struct nameidata *nd)
 	/* Pick up the filp from the open intent */
 	filp = nd->intent.open.file;
 	/* Has the filesystem initialised the file for us? */
-	if (filp->f_path.dentry == NULL)
+	if (filp->f_path.dentry == NULL) {
+		path_get(&nd->path);
 		filp = __dentry_open(nd->path.dentry, nd->path.mnt, filp,
 				     NULL, cred);
-	else
-		path_put(&nd->path);
+	}
 	return filp;
 }
 
@@ -888,7 +888,7 @@ long do_sys_open(int dfd, const char __user *filename, int flags, int mode)
 				put_unused_fd(fd);
 				fd = PTR_ERR(f);
 			} else {
-				fsnotify_open(f->f_path.dentry);
+				fsnotify_open(f);
 				fd_install(fd, f);
 			}
 		}
@@ -1031,7 +1031,9 @@ EXPORT_SYMBOL(generic_file_open);
 
 /*
  * This is used by subsystems that don't want seekable
- * file descriptors
+ * file descriptors. The function is not supposed to ever fail, the only
+ * reason it returns an 'int' and not 'void' is so that it can be plugged
+ * directly into file_operations structure.
  */
 int nonseekable_open(struct inode *inode, struct file *filp)
 {
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index ffcd04f0012..ddb1f41376e 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -415,16 +415,16 @@ out_no_root:
 	return ret;
 }
 
-static int openprom_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *openprom_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_single(fs_type, flags, data, openprom_fill_super, mnt);
+	return mount_single(fs_type, flags, data, openprom_fill_super)
 }
 
 static struct file_system_type openprom_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "openpromfs",
-	.get_sb		= openprom_get_sb,
+	.mount		= openprom_mount,
 	.kill_sb	= kill_anon_super,
 };
 
diff --git a/fs/partitions/acorn.c b/fs/partitions/acorn.c
index 6921e7890be..fbeb697374d 100644
--- a/fs/partitions/acorn.c
+++ b/fs/partitions/acorn.c
@@ -45,8 +45,11 @@ adfs_partition(struct parsed_partitions *state, char *name, char *data,
 	nr_sects = (le32_to_cpu(dr->disc_size_high) << 23) |
 		   (le32_to_cpu(dr->disc_size) >> 9);
 
-	if (name)
-		printk(" [%s]", name);
+	if (name) {
+		strlcat(state->pp_buf, " [", PAGE_SIZE);
+		strlcat(state->pp_buf, name, PAGE_SIZE);
+		strlcat(state->pp_buf, "]", PAGE_SIZE);
+	}
 	put_partition(state, slot, first_sector, nr_sects);
 	return dr;
 }
@@ -81,14 +84,14 @@ static int riscix_partition(struct parsed_partitions *state,
 	if (!rr)
 		return -1;
 
-	printk(" [RISCiX]");
+	strlcat(state->pp_buf, " [RISCiX]", PAGE_SIZE);
 
 
 	if (rr->magic == RISCIX_MAGIC) {
 		unsigned long size = nr_sects > 2 ? 2 : nr_sects;
 		int part;
 
-		printk(" <");
+		strlcat(state->pp_buf, " <", PAGE_SIZE);
 
 		put_partition(state, slot++, first_sect, size);
 		for (part = 0; part < 8; part++) {
@@ -97,11 +100,13 @@ static int riscix_partition(struct parsed_partitions *state,
 				put_partition(state, slot++,
 					le32_to_cpu(rr->part[part].start),
 					le32_to_cpu(rr->part[part].length));
-				printk("(%s)", rr->part[part].name);
+				strlcat(state->pp_buf, "(", PAGE_SIZE);
+				strlcat(state->pp_buf, rr->part[part].name, PAGE_SIZE);
+				strlcat(state->pp_buf, ")", PAGE_SIZE);
 			}
 		}
 
-		printk(" >\n");
+		strlcat(state->pp_buf, " >\n", PAGE_SIZE);
 	} else {
 		put_partition(state, slot++, first_sect, nr_sects);
 	}
@@ -131,7 +136,7 @@ static int linux_partition(struct parsed_partitions *state,
 	struct linux_part *linuxp;
 	unsigned long size = nr_sects > 2 ? 2 : nr_sects;
 
-	printk(" [Linux]");
+	strlcat(state->pp_buf, " [Linux]", PAGE_SIZE);
 
 	put_partition(state, slot++, first_sect, size);
 
@@ -139,7 +144,7 @@ static int linux_partition(struct parsed_partitions *state,
 	if (!linuxp)
 		return -1;
 
-	printk(" <");
+	strlcat(state->pp_buf, " <", PAGE_SIZE);
 	while (linuxp->magic == cpu_to_le32(LINUX_NATIVE_MAGIC) ||
 	       linuxp->magic == cpu_to_le32(LINUX_SWAP_MAGIC)) {
 		if (slot == state->limit)
@@ -149,7 +154,7 @@ static int linux_partition(struct parsed_partitions *state,
 				 le32_to_cpu(linuxp->nr_sects));
 		linuxp ++;
 	}
-	printk(" >");
+	strlcat(state->pp_buf, " >", PAGE_SIZE);
 
 	put_dev_sector(sect);
 	return slot;
@@ -294,7 +299,7 @@ int adfspart_check_ADFS(struct parsed_partitions *state)
 			break;
 		}
 	}
-	printk("\n");
+	strlcat(state->pp_buf, "\n", PAGE_SIZE);
 	return 1;
 }
 #endif
@@ -367,7 +372,7 @@ int adfspart_check_ICS(struct parsed_partitions *state)
 		return 0;
 	}
 
-	printk(" [ICS]");
+	strlcat(state->pp_buf, " [ICS]", PAGE_SIZE);
 
 	for (slot = 1, p = (const struct ics_part *)data; p->size; p++) {
 		u32 start = le32_to_cpu(p->start);
@@ -401,7 +406,7 @@ int adfspart_check_ICS(struct parsed_partitions *state)
 	}
 
 	put_dev_sector(sect);
-	printk("\n");
+	strlcat(state->pp_buf, "\n", PAGE_SIZE);
 	return 1;
 }
 #endif
@@ -461,7 +466,7 @@ int adfspart_check_POWERTEC(struct parsed_partitions *state)
 		return 0;
 	}
 
-	printk(" [POWERTEC]");
+	strlcat(state->pp_buf, " [POWERTEC]", PAGE_SIZE);
 
 	for (i = 0, p = (const struct ptec_part *)data; i < 12; i++, p++) {
 		u32 start = le32_to_cpu(p->start);
@@ -472,7 +477,7 @@ int adfspart_check_POWERTEC(struct parsed_partitions *state)
 	}
 
 	put_dev_sector(sect);
-	printk("\n");
+	strlcat(state->pp_buf, "\n", PAGE_SIZE);
 	return 1;
 }
 #endif
@@ -543,7 +548,7 @@ int adfspart_check_EESOX(struct parsed_partitions *state)
 
 		size = get_capacity(state->bdev->bd_disk);
 		put_partition(state, slot++, start, size - start);
-		printk("\n");
+		strlcat(state->pp_buf, "\n", PAGE_SIZE);
 	}
 
 	return i ? 1 : 0;
diff --git a/fs/partitions/amiga.c b/fs/partitions/amiga.c
index ba443d4229f..70cbf44a156 100644
--- a/fs/partitions/amiga.c
+++ b/fs/partitions/amiga.c
@@ -69,7 +69,13 @@ int amiga_partition(struct parsed_partitions *state)
 	/* blksize is blocks per 512 byte standard block */
 	blksize = be32_to_cpu( rdb->rdb_BlockBytes ) / 512;
 
-	printk(" RDSK (%d)", blksize * 512);	/* Be more informative */
+	{
+		char tmp[7 + 10 + 1 + 1];
+
+		/* Be more informative */
+		snprintf(tmp, sizeof(tmp), " RDSK (%d)", blksize * 512);
+		strlcat(state->pp_buf, tmp, PAGE_SIZE);
+	}
 	blk = be32_to_cpu(rdb->rdb_PartitionList);
 	put_dev_sector(sect);
 	for (part = 1; blk>0 && part<=16; part++, put_dev_sector(sect)) {
@@ -106,23 +112,27 @@ int amiga_partition(struct parsed_partitions *state)
 		{
 			/* Be even more informative to aid mounting */
 			char dostype[4];
+			char tmp[42];
+
 			__be32 *dt = (__be32 *)dostype;
 			*dt = pb->pb_Environment[16];
 			if (dostype[3] < ' ')
-				printk(" (%c%c%c^%c)",
+				snprintf(tmp, sizeof(tmp), " (%c%c%c^%c)",
 					dostype[0], dostype[1],
 					dostype[2], dostype[3] + '@' );
 			else
-				printk(" (%c%c%c%c)",
+				snprintf(tmp, sizeof(tmp), " (%c%c%c%c)",
 					dostype[0], dostype[1],
 					dostype[2], dostype[3]);
-			printk("(res %d spb %d)",
+			strlcat(state->pp_buf, tmp, PAGE_SIZE);
+			snprintf(tmp, sizeof(tmp), "(res %d spb %d)",
 				be32_to_cpu(pb->pb_Environment[6]),
 				be32_to_cpu(pb->pb_Environment[4]));
+			strlcat(state->pp_buf, tmp, PAGE_SIZE);
 		}
 		res = 1;
 	}
-	printk("\n");
+	strlcat(state->pp_buf, "\n", PAGE_SIZE);
 
 rdb_done:
 	return res;
diff --git a/fs/partitions/atari.c b/fs/partitions/atari.c
index 4439ff1b6ce..9875b05e80a 100644
--- a/fs/partitions/atari.c
+++ b/fs/partitions/atari.c
@@ -62,7 +62,7 @@ int atari_partition(struct parsed_partitions *state)
 	}
 
 	pi = &rs->part[0];
-	printk (" AHDI");
+	strlcat(state->pp_buf, " AHDI", PAGE_SIZE);
 	for (slot = 1; pi < &rs->part[4] && slot < state->limit; slot++, pi++) {
 		struct rootsector *xrs;
 		Sector sect2;
@@ -81,7 +81,7 @@ int atari_partition(struct parsed_partitions *state)
 #ifdef ICD_PARTS
 		part_fmt = 1;
 #endif
-		printk(" XGM<");
+		strlcat(state->pp_buf, " XGM<", PAGE_SIZE);
 		partsect = extensect = be32_to_cpu(pi->st);
 		while (1) {
 			xrs = read_part_sector(state, partsect, &sect2);
@@ -120,14 +120,14 @@ int atari_partition(struct parsed_partitions *state)
 				break;
 			}
 		}
-		printk(" >");
+		strlcat(state->pp_buf, " >", PAGE_SIZE);
 	}
 #ifdef ICD_PARTS
 	if ( part_fmt!=1 ) { /* no extended partitions -> test ICD-format */
 		pi = &rs->icdpart[0];
 		/* sanity check: no ICD format if first partition invalid */
 		if (OK_id(pi->id)) {
-			printk(" ICD<");
+			strlcat(state->pp_buf, " ICD<", PAGE_SIZE);
 			for (; pi < &rs->icdpart[8] && slot < state->limit; slot++, pi++) {
 				/* accept only GEM,BGM,RAW,LNX,SWP partitions */
 				if (!((pi->flg & 1) && OK_id(pi->id)))
@@ -137,13 +137,13 @@ int atari_partition(struct parsed_partitions *state)
 						be32_to_cpu(pi->st),
 						be32_to_cpu(pi->siz));
 			}
-			printk(" >");
+			strlcat(state->pp_buf, " >", PAGE_SIZE);
 		}
 	}
 #endif
 	put_dev_sector(sect);
 
-	printk ("\n");
+	strlcat(state->pp_buf, "\n", PAGE_SIZE);
 
 	return 1;
 }
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 5dcd4b0c553..0a8b0ad0c7e 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -164,10 +164,16 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
 	state = kzalloc(sizeof(struct parsed_partitions), GFP_KERNEL);
 	if (!state)
 		return NULL;
+	state->pp_buf = (char *)__get_free_page(GFP_KERNEL);
+	if (!state->pp_buf) {
+		kfree(state);
+		return NULL;
+	}
+	state->pp_buf[0] = '\0';
 
 	state->bdev = bdev;
 	disk_name(hd, 0, state->name);
-	printk(KERN_INFO " %s:", state->name);
+	snprintf(state->pp_buf, PAGE_SIZE, " %s:", state->name);
 	if (isdigit(state->name[strlen(state->name)-1]))
 		sprintf(state->name, "p");
 
@@ -185,17 +191,25 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
 		}
 
 	}
-	if (res > 0)
+	if (res > 0) {
+		printk(KERN_INFO "%s", state->pp_buf);
+
+		free_page((unsigned long)state->pp_buf);
 		return state;
+	}
 	if (state->access_beyond_eod)
 		err = -ENOSPC;
 	if (err)
 	/* The partition is unrecognized. So report I/O errors if there were any */
 		res = err;
 	if (!res)
-		printk(" unknown partition table\n");
+		strlcat(state->pp_buf, " unknown partition table\n", PAGE_SIZE);
 	else if (warn_no_part)
-		printk(" unable to read partition table\n");
+		strlcat(state->pp_buf, " unable to read partition table\n", PAGE_SIZE);
+
+	printk(KERN_INFO "%s", state->pp_buf);
+
+	free_page((unsigned long)state->pp_buf);
 	kfree(state);
 	return ERR_PTR(res);
 }
@@ -338,6 +352,7 @@ static void part_release(struct device *dev)
 {
 	struct hd_struct *p = dev_to_part(dev);
 	free_part_stats(p);
+	free_part_info(p);
 	kfree(p);
 }
 
@@ -387,7 +402,8 @@ static DEVICE_ATTR(whole_disk, S_IRUSR | S_IRGRP | S_IROTH,
 		   whole_disk_show, NULL);
 
 struct hd_struct *add_partition(struct gendisk *disk, int partno,
-				sector_t start, sector_t len, int flags)
+				sector_t start, sector_t len, int flags,
+				struct partition_meta_info *info)
 {
 	struct hd_struct *p;
 	dev_t devt = MKDEV(0, 0);
@@ -424,6 +440,14 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
 	p->partno = partno;
 	p->policy = get_disk_ro(disk);
 
+	if (info) {
+		struct partition_meta_info *pinfo = alloc_part_info(disk);
+		if (!pinfo)
+			goto out_free_stats;
+		memcpy(pinfo, info, sizeof(*info));
+		p->info = pinfo;
+	}
+
 	dname = dev_name(ddev);
 	if (isdigit(dname[strlen(dname) - 1]))
 		dev_set_name(pdev, "%sp%d", dname, partno);
@@ -437,7 +461,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
 
 	err = blk_alloc_devt(p, &devt);
 	if (err)
-		goto out_free_stats;
+		goto out_free_info;
 	pdev->devt = devt;
 
 	/* delay uevent until 'holders' subdir is created */
@@ -459,7 +483,6 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
 	}
 
 	/* everything is up and running, commence */
-	INIT_RCU_HEAD(&p->rcu_head);
 	rcu_assign_pointer(ptbl->part[partno], p);
 
 	/* suppress uevent if the disk supresses it */
@@ -468,6 +491,8 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
 
 	return p;
 
+out_free_info:
+	free_part_info(p);
 out_free_stats:
 	free_part_stats(p);
 out_free:
@@ -500,14 +525,14 @@ void register_disk(struct gendisk *disk)
 
 	if (device_add(ddev))
 		return;
-#ifndef CONFIG_SYSFS_DEPRECATED
-	err = sysfs_create_link(block_depr, &ddev->kobj,
-				kobject_name(&ddev->kobj));
-	if (err) {
-		device_del(ddev);
-		return;
+	if (!sysfs_deprecated) {
+		err = sysfs_create_link(block_depr, &ddev->kobj,
+					kobject_name(&ddev->kobj));
+		if (err) {
+			device_del(ddev);
+			return;
+		}
 	}
-#endif
 	disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj);
 	disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj);
 
@@ -629,6 +654,7 @@ rescan:
 	/* add partitions */
 	for (p = 1; p < state->limit; p++) {
 		sector_t size, from;
+		struct partition_meta_info *info = NULL;
 
 		size = state->parts[p].size;
 		if (!size)
@@ -662,8 +688,12 @@ rescan:
 				size = get_capacity(disk) - from;
 			}
 		}
+
+		if (state->parts[p].has_info)
+			info = &state->parts[p].info;
 		part = add_partition(disk, p, from, size,
-				     state->parts[p].flags);
+				     state->parts[p].flags,
+				     &state->parts[p].info);
 		if (IS_ERR(part)) {
 			printk(KERN_ERR " %s: p%d could not be added: %ld\n",
 			       disk->disk_name, p, -PTR_ERR(part));
@@ -724,8 +754,7 @@ void del_gendisk(struct gendisk *disk)
 	kobject_put(disk->part0.holder_dir);
 	kobject_put(disk->slave_dir);
 	disk->driverfs_dev = NULL;
-#ifndef CONFIG_SYSFS_DEPRECATED
-	sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
-#endif
+	if (!sysfs_deprecated)
+		sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
 	device_del(disk_to_dev(disk));
 }
diff --git a/fs/partitions/check.h b/fs/partitions/check.h
index 52f8bd39939..d68bf4dc3bc 100644
--- a/fs/partitions/check.h
+++ b/fs/partitions/check.h
@@ -1,5 +1,6 @@
 #include <linux/pagemap.h>
 #include <linux/blkdev.h>
+#include <linux/genhd.h>
 
 /*
  * add_gd_partition adds a partitions details to the devices partition
@@ -12,10 +13,13 @@ struct parsed_partitions {
 		sector_t from;
 		sector_t size;
 		int flags;
+		bool has_info;
+		struct partition_meta_info info;
 	} parts[DISK_MAX_PARTS];
 	int next;
 	int limit;
 	bool access_beyond_eod;
+	char *pp_buf;
 };
 
 static inline void *read_part_sector(struct parsed_partitions *state,
@@ -32,9 +36,12 @@ static inline void
 put_partition(struct parsed_partitions *p, int n, sector_t from, sector_t size)
 {
 	if (n < p->limit) {
+		char tmp[1 + BDEVNAME_SIZE + 10 + 1];
+
 		p->parts[n].from = from;
 		p->parts[n].size = size;
-		printk(" %s%d", p->name, n);
+		snprintf(tmp, sizeof(tmp), " %s%d", p->name, n);
+		strlcat(p->pp_buf, tmp, PAGE_SIZE);
 	}
 }
 
diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c
index 9efb2cfe241..ac0ccb5026a 100644
--- a/fs/partitions/efi.c
+++ b/fs/partitions/efi.c
@@ -94,6 +94,7 @@
  *
  ************************************************************/
 #include <linux/crc32.h>
+#include <linux/ctype.h>
 #include <linux/math64.h>
 #include <linux/slab.h>
 #include "check.h"
@@ -604,6 +605,7 @@ int efi_partition(struct parsed_partitions *state)
 	gpt_entry *ptes = NULL;
 	u32 i;
 	unsigned ssz = bdev_logical_block_size(state->bdev) / 512;
+	u8 unparsed_guid[37];
 
 	if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) {
 		kfree(gpt);
@@ -614,6 +616,9 @@ int efi_partition(struct parsed_partitions *state)
 	pr_debug("GUID Partition Table is valid!  Yea!\n");
 
 	for (i = 0; i < le32_to_cpu(gpt->num_partition_entries) && i < state->limit-1; i++) {
+		struct partition_meta_info *info;
+		unsigned label_count = 0;
+		unsigned label_max;
 		u64 start = le64_to_cpu(ptes[i].starting_lba);
 		u64 size = le64_to_cpu(ptes[i].ending_lba) -
 			   le64_to_cpu(ptes[i].starting_lba) + 1ULL;
@@ -627,9 +632,29 @@ int efi_partition(struct parsed_partitions *state)
 		if (!efi_guidcmp(ptes[i].partition_type_guid,
 				 PARTITION_LINUX_RAID_GUID))
 			state->parts[i + 1].flags = ADDPART_FLAG_RAID;
+
+		info = &state->parts[i + 1].info;
+		/* Instead of doing a manual swap to big endian, reuse the
+		 * common ASCII hex format as the interim.
+		 */
+		efi_guid_unparse(&ptes[i].unique_partition_guid, unparsed_guid);
+		part_pack_uuid(unparsed_guid, info->uuid);
+
+		/* Naively convert UTF16-LE to 7 bits. */
+		label_max = min(sizeof(info->volname) - 1,
+				sizeof(ptes[i].partition_name));
+		info->volname[label_max] = 0;
+		while (label_count < label_max) {
+			u8 c = ptes[i].partition_name[label_count] & 0xff;
+			if (c && !isprint(c))
+				c = '!';
+			info->volname[label_count] = c;
+			label_count++;
+		}
+		state->parts[i + 1].has_info = true;
 	}
 	kfree(ptes);
 	kfree(gpt);
-	printk("\n");
+	strlcat(state->pp_buf, "\n", PAGE_SIZE);
 	return 1;
 }
diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c
index fc8497643fd..d513a07f44b 100644
--- a/fs/partitions/ibm.c
+++ b/fs/partitions/ibm.c
@@ -75,6 +75,7 @@ int ibm_partition(struct parsed_partitions *state)
 	unsigned char *data;
 	Sector sect;
 	sector_t labelsect;
+	char tmp[64];
 
 	res = 0;
 	blocksize = bdev_logical_block_size(bdev);
@@ -144,13 +145,15 @@ int ibm_partition(struct parsed_partitions *state)
 			 */
 			blocksize = label->cms.block_size;
 			if (label->cms.disk_offset != 0) {
-				printk("CMS1/%8s(MDSK):", name);
+				snprintf(tmp, sizeof(tmp), "CMS1/%8s(MDSK):", name);
+				strlcat(state->pp_buf, tmp, PAGE_SIZE);
 				/* disk is reserved minidisk */
 				offset = label->cms.disk_offset;
 				size = (label->cms.block_count - 1)
 					* (blocksize >> 9);
 			} else {
-				printk("CMS1/%8s:", name);
+				snprintf(tmp, sizeof(tmp), "CMS1/%8s:", name);
+				strlcat(state->pp_buf, tmp, PAGE_SIZE);
 				offset = (info->label_block + 1);
 				size = label->cms.block_count
 					* (blocksize >> 9);
@@ -159,7 +162,8 @@ int ibm_partition(struct parsed_partitions *state)
 				      size-offset*(blocksize >> 9));
 		} else {
 			if (strncmp(type, "LNX1", 4) == 0) {
-				printk("LNX1/%8s:", name);
+				snprintf(tmp, sizeof(tmp), "LNX1/%8s:", name);
+				strlcat(state->pp_buf, tmp, PAGE_SIZE);
 				if (label->lnx.ldl_version == 0xf2) {
 					fmt_size = label->lnx.formatted_blocks
 						* (blocksize >> 9);
@@ -178,7 +182,7 @@ int ibm_partition(struct parsed_partitions *state)
 				offset = (info->label_block + 1);
 			} else {
 				/* unlabeled disk */
-				printk("(nonl)");
+				strlcat(state->pp_buf, "(nonl)", PAGE_SIZE);
 				size = i_size >> 9;
 				offset = (info->label_block + 1);
 			}
@@ -197,7 +201,8 @@ int ibm_partition(struct parsed_partitions *state)
 		 * if not, something is wrong, skipping partition detection
 		 */
 		if (strncmp(type, "VOL1",  4) == 0) {
-			printk("VOL1/%8s:", name);
+			snprintf(tmp, sizeof(tmp), "VOL1/%8s:", name);
+			strlcat(state->pp_buf, tmp, PAGE_SIZE);
 			/*
 			 * get block number and read then go through format1
 			 * labels
@@ -253,7 +258,7 @@ int ibm_partition(struct parsed_partitions *state)
 
 	}
 
-	printk("\n");
+	strlcat(state->pp_buf, "\n", PAGE_SIZE);
 	goto out_freeall;
 
 
diff --git a/fs/partitions/karma.c b/fs/partitions/karma.c
index 1cc928bb762..0ea19312706 100644
--- a/fs/partitions/karma.c
+++ b/fs/partitions/karma.c
@@ -50,7 +50,7 @@ int karma_partition(struct parsed_partitions *state)
 		}
 		slot++;
 	}
-	printk("\n");
+	strlcat(state->pp_buf, "\n", PAGE_SIZE);
 	put_dev_sector(sect);
 	return 1;
 }
diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c
index 648c9d8f335..789c625c7aa 100644
--- a/fs/partitions/ldm.c
+++ b/fs/partitions/ldm.c
@@ -5,7 +5,7 @@
  * Copyright (c) 2001-2007 Anton Altaparmakov
  * Copyright (C) 2001,2002 Jakob Kemi <jakob.kemi@telia.com>
  *
- * Documentation is available at http://www.linux-ntfs.org/content/view/19/37/
+ * Documentation is available at http://www.linux-ntfs.org/doku.php?id=downloads 
  *
  * This program is free software; you can redistribute it and/or modify it under
  * the terms of the GNU General Public License as published by the Free Software
@@ -643,7 +643,7 @@ static bool ldm_create_data_partitions (struct parsed_partitions *pp,
 		return false;
 	}
 
-	printk (" [LDM]");
+	strlcat(pp->pp_buf, " [LDM]", PAGE_SIZE);
 
 	/* Create the data partitions */
 	list_for_each (item, &ldb->v_part) {
@@ -658,7 +658,7 @@ static bool ldm_create_data_partitions (struct parsed_partitions *pp,
 		part_num++;
 	}
 
-	printk ("\n");
+	strlcat(pp->pp_buf, "\n", PAGE_SIZE);
 	return true;
 }
 
diff --git a/fs/partitions/ldm.h b/fs/partitions/ldm.h
index d1fb50b28d8..374242c0971 100644
--- a/fs/partitions/ldm.h
+++ b/fs/partitions/ldm.h
@@ -5,7 +5,7 @@
  * Copyright (c) 2001-2007 Anton Altaparmakov
  * Copyright (C) 2001,2002 Jakob Kemi <jakob.kemi@telia.com>
  *
- * Documentation is available at http://www.linux-ntfs.org/content/view/19/37/
+ * Documentation is available at http://www.linux-ntfs.org/doku.php?id=downloads 
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the Free
diff --git a/fs/partitions/mac.c b/fs/partitions/mac.c
index 74465ff7c26..68d6a216ee7 100644
--- a/fs/partitions/mac.c
+++ b/fs/partitions/mac.c
@@ -59,7 +59,7 @@ int mac_partition(struct parsed_partitions *state)
 		put_dev_sector(sect);
 		return 0;		/* not a MacOS disk */
 	}
-	printk(" [mac]");
+	strlcat(state->pp_buf, " [mac]", PAGE_SIZE);
 	blocks_in_map = be32_to_cpu(part->map_count);
 	for (blk = 1; blk <= blocks_in_map; ++blk) {
 		int pos = blk * secsize;
@@ -128,6 +128,6 @@ int mac_partition(struct parsed_partitions *state)
 #endif
 
 	put_dev_sector(sect);
-	printk("\n");
+	strlcat(state->pp_buf, "\n", PAGE_SIZE);
 	return 1;
 }
diff --git a/fs/partitions/msdos.c b/fs/partitions/msdos.c
index 15bfb7b1e04..5f79a6677c6 100644
--- a/fs/partitions/msdos.c
+++ b/fs/partitions/msdos.c
@@ -213,10 +213,18 @@ static void parse_solaris_x86(struct parsed_partitions *state,
 		put_dev_sector(sect);
 		return;
 	}
-	printk(" %s%d: <solaris:", state->name, origin);
+	{
+		char tmp[1 + BDEVNAME_SIZE + 10 + 11 + 1];
+
+		snprintf(tmp, sizeof(tmp), " %s%d: <solaris:", state->name, origin);
+		strlcat(state->pp_buf, tmp, PAGE_SIZE);
+	}
 	if (le32_to_cpu(v->v_version) != 1) {
-		printk("  cannot handle version %d vtoc>\n",
-			le32_to_cpu(v->v_version));
+		char tmp[64];
+
+		snprintf(tmp, sizeof(tmp), "  cannot handle version %d vtoc>\n",
+			 le32_to_cpu(v->v_version));
+		strlcat(state->pp_buf, tmp, PAGE_SIZE);
 		put_dev_sector(sect);
 		return;
 	}
@@ -224,9 +232,12 @@ static void parse_solaris_x86(struct parsed_partitions *state,
 	max_nparts = le16_to_cpu (v->v_nparts) > 8 ? SOLARIS_X86_NUMSLICE : 8;
 	for (i=0; i<max_nparts && state->next<state->limit; i++) {
 		struct solaris_x86_slice *s = &v->v_slice[i];
+		char tmp[3 + 10 + 1 + 1];
+
 		if (s->s_size == 0)
 			continue;
-		printk(" [s%d]", i);
+		snprintf(tmp, sizeof(tmp), " [s%d]", i);
+		strlcat(state->pp_buf, tmp, PAGE_SIZE);
 		/* solaris partitions are relative to current MS-DOS
 		 * one; must add the offset of the current partition */
 		put_partition(state, state->next++,
@@ -234,7 +245,7 @@ static void parse_solaris_x86(struct parsed_partitions *state,
 				 le32_to_cpu(s->s_size));
 	}
 	put_dev_sector(sect);
-	printk(" >\n");
+	strlcat(state->pp_buf, " >\n", PAGE_SIZE);
 #endif
 }
 
@@ -250,6 +261,7 @@ static void parse_bsd(struct parsed_partitions *state,
 	Sector sect;
 	struct bsd_disklabel *l;
 	struct bsd_partition *p;
+	char tmp[64];
 
 	l = read_part_sector(state, offset + 1, &sect);
 	if (!l)
@@ -258,7 +270,9 @@ static void parse_bsd(struct parsed_partitions *state,
 		put_dev_sector(sect);
 		return;
 	}
-	printk(" %s%d: <%s:", state->name, origin, flavour);
+
+	snprintf(tmp, sizeof(tmp), " %s%d: <%s:", state->name, origin, flavour);
+	strlcat(state->pp_buf, tmp, PAGE_SIZE);
 
 	if (le16_to_cpu(l->d_npartitions) < max_partitions)
 		max_partitions = le16_to_cpu(l->d_npartitions);
@@ -275,16 +289,18 @@ static void parse_bsd(struct parsed_partitions *state,
 			/* full parent partition, we have it already */
 			continue;
 		if (offset > bsd_start || offset+size < bsd_start+bsd_size) {
-			printk("bad subpartition - ignored\n");
+			strlcat(state->pp_buf, "bad subpartition - ignored\n", PAGE_SIZE);
 			continue;
 		}
 		put_partition(state, state->next++, bsd_start, bsd_size);
 	}
 	put_dev_sector(sect);
-	if (le16_to_cpu(l->d_npartitions) > max_partitions)
-		printk(" (ignored %d more)",
-		       le16_to_cpu(l->d_npartitions) - max_partitions);
-	printk(" >\n");
+	if (le16_to_cpu(l->d_npartitions) > max_partitions) {
+		snprintf(tmp, sizeof(tmp), " (ignored %d more)",
+			 le16_to_cpu(l->d_npartitions) - max_partitions);
+		strlcat(state->pp_buf, tmp, PAGE_SIZE);
+	}
+	strlcat(state->pp_buf, " >\n", PAGE_SIZE);
 }
 #endif
 
@@ -333,7 +349,12 @@ static void parse_unixware(struct parsed_partitions *state,
 		put_dev_sector(sect);
 		return;
 	}
-	printk(" %s%d: <unixware:", state->name, origin);
+	{
+		char tmp[1 + BDEVNAME_SIZE + 10 + 12 + 1];
+
+		snprintf(tmp, sizeof(tmp), " %s%d: <unixware:", state->name, origin);
+		strlcat(state->pp_buf, tmp, PAGE_SIZE);
+	}
 	p = &l->vtoc.v_slice[1];
 	/* I omit the 0th slice as it is the same as whole disk. */
 	while (p - &l->vtoc.v_slice[0] < UNIXWARE_NUMSLICE) {
@@ -347,7 +368,7 @@ static void parse_unixware(struct parsed_partitions *state,
 		p++;
 	}
 	put_dev_sector(sect);
-	printk(" >\n");
+	strlcat(state->pp_buf, " >\n", PAGE_SIZE);
 #endif
 }
 
@@ -376,8 +397,10 @@ static void parse_minix(struct parsed_partitions *state,
 	 * the normal boot sector. */
 	if (msdos_magic_present (data + 510) &&
 	    SYS_IND(p) == MINIX_PARTITION) { /* subpartition table present */
+		char tmp[1 + BDEVNAME_SIZE + 10 + 9 + 1];
 
-		printk(" %s%d: <minix:", state->name, origin);
+		snprintf(tmp, sizeof(tmp), " %s%d: <minix:", state->name, origin);
+		strlcat(state->pp_buf, tmp, PAGE_SIZE);
 		for (i = 0; i < MINIX_NR_SUBPARTITIONS; i++, p++) {
 			if (state->next == state->limit)
 				break;
@@ -386,7 +409,7 @@ static void parse_minix(struct parsed_partitions *state,
 				put_partition(state, state->next++,
 					      start_sect(p), nr_sects(p));
 		}
-		printk(" >\n");
+		strlcat(state->pp_buf, " >\n", PAGE_SIZE);
 	}
 	put_dev_sector(sect);
 #endif /* CONFIG_MINIX_SUBPARTITION */
@@ -425,7 +448,7 @@ int msdos_partition(struct parsed_partitions *state)
 
 	if (aix_magic_present(state, data)) {
 		put_dev_sector(sect);
-		printk( " [AIX]");
+		strlcat(state->pp_buf, " [AIX]", PAGE_SIZE);
 		return 0;
 	}
 
@@ -446,7 +469,7 @@ int msdos_partition(struct parsed_partitions *state)
 			fb = (struct fat_boot_sector *) data;
 			if (slot == 1 && fb->reserved && fb->fats
 				&& fat_valid_media(fb->media)) {
-				printk("\n");
+				strlcat(state->pp_buf, "\n", PAGE_SIZE);
 				put_dev_sector(sect);
 				return 1;
 			} else {
@@ -491,21 +514,21 @@ int msdos_partition(struct parsed_partitions *state)
 			n = min(size, max(sector_size, n));
 			put_partition(state, slot, start, n);
 
-			printk(" <");
+			strlcat(state->pp_buf, " <", PAGE_SIZE);
 			parse_extended(state, start, size);
-			printk(" >");
+			strlcat(state->pp_buf, " >", PAGE_SIZE);
 			continue;
 		}
 		put_partition(state, slot, start, size);
 		if (SYS_IND(p) == LINUX_RAID_PARTITION)
 			state->parts[slot].flags = ADDPART_FLAG_RAID;
 		if (SYS_IND(p) == DM6_PARTITION)
-			printk("[DM]");
+			strlcat(state->pp_buf, "[DM]", PAGE_SIZE);
 		if (SYS_IND(p) == EZD_PARTITION)
-			printk("[EZD]");
+			strlcat(state->pp_buf, "[EZD]", PAGE_SIZE);
 	}
 
-	printk("\n");
+	strlcat(state->pp_buf, "\n", PAGE_SIZE);
 
 	/* second pass - output for each on a separate line */
 	p = (struct partition *) (0x1be + data);
diff --git a/fs/partitions/osf.c b/fs/partitions/osf.c
index fc22b85d436..48cec7cbca1 100644
--- a/fs/partitions/osf.c
+++ b/fs/partitions/osf.c
@@ -72,7 +72,7 @@ int osf_partition(struct parsed_partitions *state)
 				le32_to_cpu(partition->p_size));
 		slot++;
 	}
-	printk("\n");
+	strlcat(state->pp_buf, "\n", PAGE_SIZE);
 	put_dev_sector(sect);
 	return 1;
 }
diff --git a/fs/partitions/sgi.c b/fs/partitions/sgi.c
index 43b1df9aa16..ea8a86dceaf 100644
--- a/fs/partitions/sgi.c
+++ b/fs/partitions/sgi.c
@@ -76,7 +76,7 @@ int sgi_partition(struct parsed_partitions *state)
 		}
 		slot++;
 	}
-	printk("\n");
+	strlcat(state->pp_buf, "\n", PAGE_SIZE);
 	put_dev_sector(sect);
 	return 1;
 }
diff --git a/fs/partitions/sun.c b/fs/partitions/sun.c
index a32660e25f7..b5b6fcfb3d3 100644
--- a/fs/partitions/sun.c
+++ b/fs/partitions/sun.c
@@ -116,7 +116,7 @@ int sun_partition(struct parsed_partitions *state)
 		}
 		slot++;
 	}
-	printk("\n");
+	strlcat(state->pp_buf, "\n", PAGE_SIZE);
 	put_dev_sector(sect);
 	return 1;
 }
diff --git a/fs/partitions/sysv68.c b/fs/partitions/sysv68.c
index 9030c864428..9627ccffc1c 100644
--- a/fs/partitions/sysv68.c
+++ b/fs/partitions/sysv68.c
@@ -54,6 +54,7 @@ int sysv68_partition(struct parsed_partitions *state)
 	unsigned char *data;
 	struct dkblk0 *b;
 	struct slice *slice;
+	char tmp[64];
 
 	data = read_part_sector(state, 0, &sect);
 	if (!data)
@@ -73,7 +74,8 @@ int sysv68_partition(struct parsed_partitions *state)
 		return -1;
 
 	slices -= 1; /* last slice is the whole disk */
-	printk("sysV68: %s(s%u)", state->name, slices);
+	snprintf(tmp, sizeof(tmp), "sysV68: %s(s%u)", state->name, slices);
+	strlcat(state->pp_buf, tmp, PAGE_SIZE);
 	slice = (struct slice *)data;
 	for (i = 0; i < slices; i++, slice++) {
 		if (slot == state->limit)
@@ -82,11 +84,12 @@ int sysv68_partition(struct parsed_partitions *state)
 			put_partition(state, slot,
 				be32_to_cpu(slice->blkoff),
 				be32_to_cpu(slice->nblocks));
-			printk("(s%u)", i);
+			snprintf(tmp, sizeof(tmp), "(s%u)", i);
+			strlcat(state->pp_buf, tmp, PAGE_SIZE);
 		}
 		slot++;
 	}
-	printk("\n");
+	strlcat(state->pp_buf, "\n", PAGE_SIZE);
 	put_dev_sector(sect);
 	return 1;
 }
diff --git a/fs/partitions/ultrix.c b/fs/partitions/ultrix.c
index db9eef26036..8dbaf9f77a9 100644
--- a/fs/partitions/ultrix.c
+++ b/fs/partitions/ultrix.c
@@ -39,7 +39,7 @@ int ultrix_partition(struct parsed_partitions *state)
 					      label->pt_part[i].pi_blkoff,
 					      label->pt_part[i].pi_nblocks);
 		put_dev_sector(sect);
-		printk ("\n");
+		strlcat(state->pp_buf, "\n", PAGE_SIZE);
 		return 1;
 	} else {
 		put_dev_sector(sect);
diff --git a/fs/pipe.c b/fs/pipe.c
index 279eef96c51..a8012a95572 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -382,7 +382,7 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov,
 			error = ops->confirm(pipe, buf);
 			if (error) {
 				if (!ret)
-					error = ret;
+					ret = error;
 				break;
 			}
 
@@ -954,6 +954,8 @@ static struct inode * get_pipe_inode(void)
 	if (!inode)
 		goto fail_inode;
 
+	inode->i_ino = get_next_ino();
+
 	pipe = alloc_pipe_info(inode);
 	if (!pipe)
 		goto fail_iput;
@@ -1245,16 +1247,15 @@ out:
  * any operations on the root directory. However, we need a non-trivial
  * d_name - pipe: will go nicely and kill the special-casing in procfs.
  */
-static int pipefs_get_sb(struct file_system_type *fs_type,
-			 int flags, const char *dev_name, void *data,
-			 struct vfsmount *mnt)
+static struct dentry *pipefs_mount(struct file_system_type *fs_type,
+			 int flags, const char *dev_name, void *data)
 {
-	return get_sb_pseudo(fs_type, "pipe:", NULL, PIPEFS_MAGIC, mnt);
+	return mount_pseudo(fs_type, "pipe:", NULL, PIPEFS_MAGIC);
 }
 
 static struct file_system_type pipe_fs_type = {
 	.name		= "pipefs",
-	.get_sb		= pipefs_get_sb,
+	.mount		= pipefs_mount,
 	.kill_sb	= kill_anon_super,
 };
 
diff --git a/fs/pnode.c b/fs/pnode.c
index 5cc564a8314..8066b8dd748 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -126,6 +126,9 @@ static int do_make_slave(struct vfsmount *mnt)
 	return 0;
 }
 
+/*
+ * vfsmount lock must be held for write
+ */
 void change_mnt_propagation(struct vfsmount *mnt, int type)
 {
 	if (type == MS_SHARED) {
@@ -270,12 +273,12 @@ int propagate_mnt(struct vfsmount *dest_mnt, struct dentry *dest_dentry,
 		prev_src_mnt  = child;
 	}
 out:
-	spin_lock(&vfsmount_lock);
+	br_write_lock(vfsmount_lock);
 	while (!list_empty(&tmp_list)) {
 		child = list_first_entry(&tmp_list, struct vfsmount, mnt_hash);
 		umount_tree(child, 0, &umount_list);
 	}
-	spin_unlock(&vfsmount_lock);
+	br_write_unlock(vfsmount_lock);
 	release_mounts(&umount_list);
 	return ret;
 }
@@ -296,6 +299,8 @@ static inline int do_refcount_check(struct vfsmount *mnt, int count)
  * other mounts its parent propagates to.
  * Check if any of these mounts that **do not have submounts**
  * have more references than 'refcnt'. If so return busy.
+ *
+ * vfsmount lock must be held for read or write
  */
 int propagate_mount_busy(struct vfsmount *mnt, int refcnt)
 {
@@ -353,6 +358,8 @@ static void __propagate_umount(struct vfsmount *mnt)
  * collect all mounts that receive propagation from the mount in @list,
  * and return these additional mounts in the same list.
  * @list: the list of mounts to be unmounted.
+ *
+ * vfsmount lock must be held for write
  */
 int propagate_umount(struct list_head *list)
 {
diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index 50f8f0600f0..6a0068841d9 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -33,8 +33,8 @@ config PROC_KCORE
 	depends on PROC_FS && MMU
 
 config PROC_VMCORE
-        bool "/proc/vmcore support (EXPERIMENTAL)"
-        depends on PROC_FS && CRASH_DUMP
+	bool "/proc/vmcore support"
+	depends on PROC_FS && CRASH_DUMP
 	default y
         help
         Exports the dump image of crashed kernel in ELF format.
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index 11a7b5c6815..2758e2afc51 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -2,7 +2,7 @@
 # Makefile for the Linux proc filesystem routines.
 #
 
-obj-$(CONFIG_PROC_FS) += proc.o
+obj-y   += proc.o
 
 proc-y			:= nommu.o task_nommu.o
 proc-$(CONFIG_MMU)	:= mmu.o task_mmu.o
diff --git a/fs/proc/base.c b/fs/proc/base.c
index acb7ef80ea4..f3d02ca461e 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -63,6 +63,7 @@
 #include <linux/namei.h>
 #include <linux/mnt_namespace.h>
 #include <linux/mm.h>
+#include <linux/swap.h>
 #include <linux/rcupdate.h>
 #include <linux/kallsyms.h>
 #include <linux/stacktrace.h>
@@ -148,18 +149,13 @@ static unsigned int pid_entry_count_dirs(const struct pid_entry *entries,
 	return count;
 }
 
-static int get_fs_path(struct task_struct *task, struct path *path, bool root)
+static int get_task_root(struct task_struct *task, struct path *root)
 {
-	struct fs_struct *fs;
 	int result = -ENOENT;
 
 	task_lock(task);
-	fs = task->fs;
-	if (fs) {
-		read_lock(&fs->lock);
-		*path = root ? fs->root : fs->pwd;
-		path_get(path);
-		read_unlock(&fs->lock);
+	if (task->fs) {
+		get_fs_root(task->fs, root);
 		result = 0;
 	}
 	task_unlock(task);
@@ -172,7 +168,12 @@ static int proc_cwd_link(struct inode *inode, struct path *path)
 	int result = -ENOENT;
 
 	if (task) {
-		result = get_fs_path(task, path, 0);
+		task_lock(task);
+		if (task->fs) {
+			get_fs_pwd(task->fs, path);
+			result = 0;
+		}
+		task_unlock(task);
 		put_task_struct(task);
 	}
 	return result;
@@ -184,7 +185,7 @@ static int proc_root_link(struct inode *inode, struct path *path)
 	int result = -ENOENT;
 
 	if (task) {
-		result = get_fs_path(task, path, 1);
+		result = get_task_root(task, path);
 		put_task_struct(task);
 	}
 	return result;
@@ -225,7 +226,7 @@ struct mm_struct *mm_for_maps(struct task_struct *task)
 {
 	struct mm_struct *mm;
 
-	if (mutex_lock_killable(&task->cred_guard_mutex))
+	if (mutex_lock_killable(&task->signal->cred_guard_mutex))
 		return NULL;
 
 	mm = get_task_mm(task);
@@ -234,7 +235,7 @@ struct mm_struct *mm_for_maps(struct task_struct *task)
 		mmput(mm);
 		mm = NULL;
 	}
-	mutex_unlock(&task->cred_guard_mutex);
+	mutex_unlock(&task->signal->cred_guard_mutex);
 
 	return mm;
 }
@@ -427,17 +428,14 @@ static const struct file_operations proc_lstats_operations = {
 
 #endif
 
-/* The badness from the OOM killer */
-unsigned long badness(struct task_struct *p, unsigned long uptime);
 static int proc_oom_score(struct task_struct *task, char *buffer)
 {
 	unsigned long points = 0;
-	struct timespec uptime;
 
-	do_posix_clock_monotonic_gettime(&uptime);
 	read_lock(&tasklist_lock);
 	if (pid_alive(task))
-		points = badness(task, uptime.tv_sec);
+		points = oom_badness(task, NULL, NULL,
+					totalram_pages + total_swap_pages);
 	read_unlock(&tasklist_lock);
 	return sprintf(buffer, "%lu\n", points);
 }
@@ -561,9 +559,19 @@ static int proc_setattr(struct dentry *dentry, struct iattr *attr)
 		return -EPERM;
 
 	error = inode_change_ok(inode, attr);
-	if (!error)
-		error = inode_setattr(inode, attr);
-	return error;
+	if (error)
+		return error;
+
+	if ((attr->ia_valid & ATTR_SIZE) &&
+	    attr->ia_size != i_size_read(inode)) {
+		error = vmtruncate(inode, attr->ia_size);
+		if (error)
+			return error;
+	}
+
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+	return 0;
 }
 
 static const struct inode_operations proc_def_inode_operations = {
@@ -589,7 +597,7 @@ static int mounts_open_common(struct inode *inode, struct file *file,
 				get_mnt_ns(ns);
 		}
 		rcu_read_unlock();
-		if (ns && get_fs_path(task, &root, 1) == 0)
+		if (ns && get_task_root(task, &root) == 0)
 			ret = 0;
 		put_task_struct(task);
 	}
@@ -763,6 +771,8 @@ static const struct file_operations proc_single_file_operations = {
 static int mem_open(struct inode* inode, struct file* file)
 {
 	file->private_data = (void*)((long)current->self_exec_id);
+	/* OK to pass negative loff_t, we can catch out-of-range */
+	file->f_mode |= FMODE_UNSIGNED_OFFSET;
 	return 0;
 }
 
@@ -1015,36 +1025,74 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
 	memset(buffer, 0, sizeof(buffer));
 	if (count > sizeof(buffer) - 1)
 		count = sizeof(buffer) - 1;
-	if (copy_from_user(buffer, buf, count))
-		return -EFAULT;
+	if (copy_from_user(buffer, buf, count)) {
+		err = -EFAULT;
+		goto out;
+	}
 
 	err = strict_strtol(strstrip(buffer), 0, &oom_adjust);
 	if (err)
-		return -EINVAL;
+		goto out;
 	if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) &&
-	     oom_adjust != OOM_DISABLE)
-		return -EINVAL;
+	     oom_adjust != OOM_DISABLE) {
+		err = -EINVAL;
+		goto out;
+	}
 
 	task = get_proc_task(file->f_path.dentry->d_inode);
-	if (!task)
-		return -ESRCH;
+	if (!task) {
+		err = -ESRCH;
+		goto out;
+	}
+
+	task_lock(task);
+	if (!task->mm) {
+		err = -EINVAL;
+		goto err_task_lock;
+	}
+
 	if (!lock_task_sighand(task, &flags)) {
-		put_task_struct(task);
-		return -ESRCH;
+		err = -ESRCH;
+		goto err_task_lock;
 	}
 
 	if (oom_adjust < task->signal->oom_adj && !capable(CAP_SYS_RESOURCE)) {
-		unlock_task_sighand(task, &flags);
-		put_task_struct(task);
-		return -EACCES;
+		err = -EACCES;
+		goto err_sighand;
 	}
 
-	task->signal->oom_adj = oom_adjust;
+	if (oom_adjust != task->signal->oom_adj) {
+		if (oom_adjust == OOM_DISABLE)
+			atomic_inc(&task->mm->oom_disable_count);
+		if (task->signal->oom_adj == OOM_DISABLE)
+			atomic_dec(&task->mm->oom_disable_count);
+	}
 
+	/*
+	 * Warn that /proc/pid/oom_adj is deprecated, see
+	 * Documentation/feature-removal-schedule.txt.
+	 */
+	printk_once(KERN_WARNING "%s (%d): /proc/%d/oom_adj is deprecated, "
+			"please use /proc/%d/oom_score_adj instead.\n",
+			current->comm, task_pid_nr(current),
+			task_pid_nr(task), task_pid_nr(task));
+	task->signal->oom_adj = oom_adjust;
+	/*
+	 * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum
+	 * value is always attainable.
+	 */
+	if (task->signal->oom_adj == OOM_ADJUST_MAX)
+		task->signal->oom_score_adj = OOM_SCORE_ADJ_MAX;
+	else
+		task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) /
+								-OOM_DISABLE;
+err_sighand:
 	unlock_task_sighand(task, &flags);
+err_task_lock:
+	task_unlock(task);
 	put_task_struct(task);
-
-	return count;
+out:
+	return err < 0 ? err : count;
 }
 
 static const struct file_operations proc_oom_adjust_operations = {
@@ -1053,6 +1101,106 @@ static const struct file_operations proc_oom_adjust_operations = {
 	.llseek		= generic_file_llseek,
 };
 
+static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
+					size_t count, loff_t *ppos)
+{
+	struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
+	char buffer[PROC_NUMBUF];
+	int oom_score_adj = OOM_SCORE_ADJ_MIN;
+	unsigned long flags;
+	size_t len;
+
+	if (!task)
+		return -ESRCH;
+	if (lock_task_sighand(task, &flags)) {
+		oom_score_adj = task->signal->oom_score_adj;
+		unlock_task_sighand(task, &flags);
+	}
+	put_task_struct(task);
+	len = snprintf(buffer, sizeof(buffer), "%d\n", oom_score_adj);
+	return simple_read_from_buffer(buf, count, ppos, buffer, len);
+}
+
+static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
+					size_t count, loff_t *ppos)
+{
+	struct task_struct *task;
+	char buffer[PROC_NUMBUF];
+	unsigned long flags;
+	long oom_score_adj;
+	int err;
+
+	memset(buffer, 0, sizeof(buffer));
+	if (count > sizeof(buffer) - 1)
+		count = sizeof(buffer) - 1;
+	if (copy_from_user(buffer, buf, count)) {
+		err = -EFAULT;
+		goto out;
+	}
+
+	err = strict_strtol(strstrip(buffer), 0, &oom_score_adj);
+	if (err)
+		goto out;
+	if (oom_score_adj < OOM_SCORE_ADJ_MIN ||
+			oom_score_adj > OOM_SCORE_ADJ_MAX) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	task = get_proc_task(file->f_path.dentry->d_inode);
+	if (!task) {
+		err = -ESRCH;
+		goto out;
+	}
+
+	task_lock(task);
+	if (!task->mm) {
+		err = -EINVAL;
+		goto err_task_lock;
+	}
+
+	if (!lock_task_sighand(task, &flags)) {
+		err = -ESRCH;
+		goto err_task_lock;
+	}
+
+	if (oom_score_adj < task->signal->oom_score_adj &&
+			!capable(CAP_SYS_RESOURCE)) {
+		err = -EACCES;
+		goto err_sighand;
+	}
+
+	if (oom_score_adj != task->signal->oom_score_adj) {
+		if (oom_score_adj == OOM_SCORE_ADJ_MIN)
+			atomic_inc(&task->mm->oom_disable_count);
+		if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
+			atomic_dec(&task->mm->oom_disable_count);
+	}
+	task->signal->oom_score_adj = oom_score_adj;
+	/*
+	 * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is
+	 * always attainable.
+	 */
+	if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
+		task->signal->oom_adj = OOM_DISABLE;
+	else
+		task->signal->oom_adj = (oom_score_adj * OOM_ADJUST_MAX) /
+							OOM_SCORE_ADJ_MAX;
+err_sighand:
+	unlock_task_sighand(task, &flags);
+err_task_lock:
+	task_unlock(task);
+	put_task_struct(task);
+out:
+	return err < 0 ? err : count;
+}
+
+static const struct file_operations proc_oom_score_adj_operations = {
+	.read		= oom_score_adj_read,
+	.write		= oom_score_adj_write,
+	.llseek		= default_llseek,
+};
+
 #ifdef CONFIG_AUDITSYSCALL
 #define TMPBUFLEN 21
 static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
@@ -1426,7 +1574,7 @@ static int do_proc_readlink(struct path *path, char __user *buffer, int buflen)
 	if (!tmp)
 		return -ENOMEM;
 
-	pathname = d_path(path, tmp, PAGE_SIZE);
+	pathname = d_path_with_unreachable(path, tmp, PAGE_SIZE);
 	len = PTR_ERR(pathname);
 	if (IS_ERR(pathname))
 		goto out;
@@ -1500,6 +1648,7 @@ static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_st
 
 	/* Common stuff */
 	ei = PROC_I(inode);
+	inode->i_ino = get_next_ino();
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 	inode->i_op = &proc_def_inode_operations;
 
@@ -1939,11 +2088,13 @@ static ssize_t proc_fdinfo_read(struct file *file, char __user *buf,
 static const struct file_operations proc_fdinfo_file_operations = {
 	.open           = nonseekable_open,
 	.read		= proc_fdinfo_read,
+	.llseek		= no_llseek,
 };
 
 static const struct file_operations proc_fd_operations = {
 	.read		= generic_read_dir,
 	.readdir	= proc_readfd,
+	.llseek		= default_llseek,
 };
 
 /*
@@ -2012,6 +2163,7 @@ static int proc_readfdinfo(struct file *filp, void *dirent, filldir_t filldir)
 static const struct file_operations proc_fdinfo_operations = {
 	.read		= generic_read_dir,
 	.readdir	= proc_readfdinfo,
+	.llseek		= default_llseek,
 };
 
 /*
@@ -2202,14 +2354,14 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
 		goto out_free;
 
 	/* Guard against adverse ptrace interaction */
-	length = mutex_lock_interruptible(&task->cred_guard_mutex);
+	length = mutex_lock_interruptible(&task->signal->cred_guard_mutex);
 	if (length < 0)
 		goto out_free;
 
 	length = security_setprocattr(task,
 				      (char*)file->f_path.dentry->d_name.name,
 				      (void*)page, count);
-	mutex_unlock(&task->cred_guard_mutex);
+	mutex_unlock(&task->signal->cred_guard_mutex);
 out_free:
 	free_page((unsigned long) page);
 out:
@@ -2243,6 +2395,7 @@ static int proc_attr_dir_readdir(struct file * filp,
 static const struct file_operations proc_attr_dir_operations = {
 	.read		= generic_read_dir,
 	.readdir	= proc_attr_dir_readdir,
+	.llseek		= default_llseek,
 };
 
 static struct dentry *proc_attr_dir_lookup(struct inode *dir,
@@ -2442,6 +2595,7 @@ static struct dentry *proc_base_instantiate(struct inode *dir,
 
 	/* Initialize the inode */
 	ei = PROC_I(inode);
+	inode->i_ino = get_next_ino();
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 
 	/*
@@ -2575,7 +2729,7 @@ static const struct pid_entry tgid_base_stuff[] = {
 	INF("auxv",       S_IRUSR, proc_pid_auxv),
 	ONE("status",     S_IRUGO, proc_pid_status),
 	ONE("personality", S_IRUSR, proc_pid_personality),
-	INF("limits",	  S_IRUSR, proc_pid_limits),
+	INF("limits",	  S_IRUGO, proc_pid_limits),
 #ifdef CONFIG_SCHED_DEBUG
 	REG("sched",      S_IRUGO|S_IWUSR, proc_pid_sched_operations),
 #endif
@@ -2625,6 +2779,7 @@ static const struct pid_entry tgid_base_stuff[] = {
 #endif
 	INF("oom_score",  S_IRUGO, proc_oom_score),
 	REG("oom_adj",    S_IRUGO|S_IWUSR, proc_oom_adjust_operations),
+	REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
 #ifdef CONFIG_AUDITSYSCALL
 	REG("loginuid",   S_IWUSR|S_IRUGO, proc_loginuid_operations),
 	REG("sessionid",  S_IRUGO, proc_sessionid_operations),
@@ -2650,6 +2805,7 @@ static int proc_tgid_base_readdir(struct file * filp,
 static const struct file_operations proc_tgid_base_operations = {
 	.read		= generic_read_dir,
 	.readdir	= proc_tgid_base_readdir,
+	.llseek		= default_llseek,
 };
 
 static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){
@@ -2910,7 +3066,7 @@ static const struct pid_entry tid_base_stuff[] = {
 	INF("auxv",      S_IRUSR, proc_pid_auxv),
 	ONE("status",    S_IRUGO, proc_pid_status),
 	ONE("personality", S_IRUSR, proc_pid_personality),
-	INF("limits",	 S_IRUSR, proc_pid_limits),
+	INF("limits",	 S_IRUGO, proc_pid_limits),
 #ifdef CONFIG_SCHED_DEBUG
 	REG("sched",     S_IRUGO|S_IWUSR, proc_pid_sched_operations),
 #endif
@@ -2959,6 +3115,7 @@ static const struct pid_entry tid_base_stuff[] = {
 #endif
 	INF("oom_score", S_IRUGO, proc_oom_score),
 	REG("oom_adj",   S_IRUGO|S_IWUSR, proc_oom_adjust_operations),
+	REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
 #ifdef CONFIG_AUDITSYSCALL
 	REG("loginuid",  S_IWUSR|S_IRUGO, proc_loginuid_operations),
 	REG("sessionid",  S_IRUSR, proc_sessionid_operations),
@@ -2986,6 +3143,7 @@ static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *den
 static const struct file_operations proc_tid_base_operations = {
 	.read		= generic_read_dir,
 	.readdir	= proc_tid_base_readdir,
+	.llseek		= default_llseek,
 };
 
 static const struct inode_operations proc_tid_base_inode_operations = {
@@ -3222,4 +3380,5 @@ static const struct inode_operations proc_task_inode_operations = {
 static const struct file_operations proc_task_operations = {
 	.read		= generic_read_dir,
 	.readdir	= proc_task_readdir,
+	.llseek		= default_llseek,
 };
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 2791907744e..dd29f033766 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -12,6 +12,7 @@
 #include <linux/time.h>
 #include <linux/proc_fs.h>
 #include <linux/stat.h>
+#include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/mount.h>
@@ -258,17 +259,22 @@ static int proc_notify_change(struct dentry *dentry, struct iattr *iattr)
 
 	error = inode_change_ok(inode, iattr);
 	if (error)
-		goto out;
+		return error;
 
-	error = inode_setattr(inode, iattr);
-	if (error)
-		goto out;
+	if ((iattr->ia_valid & ATTR_SIZE) &&
+	    iattr->ia_size != i_size_read(inode)) {
+		error = vmtruncate(inode, iattr->ia_size);
+		if (error)
+			return error;
+	}
+
+	setattr_copy(inode, iattr);
+	mark_inode_dirty(inode);
 	
 	de->uid = inode->i_uid;
 	de->gid = inode->i_gid;
 	de->mode = inode->i_mode;
-out:
-	return error;
+	return 0;
 }
 
 static int proc_getattr(struct vfsmount *mnt, struct dentry *dentry,
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index aea8502e58a..9c2b5f48487 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -25,11 +25,12 @@
 
 #include "internal.h"
 
-static void proc_delete_inode(struct inode *inode)
+static void proc_evict_inode(struct inode *inode)
 {
 	struct proc_dir_entry *de;
 
 	truncate_inode_pages(&inode->i_data, 0);
+	end_writeback(inode);
 
 	/* Stop tracking associated processes */
 	put_pid(PROC_I(inode)->pid);
@@ -40,7 +41,6 @@ static void proc_delete_inode(struct inode *inode)
 		pde_put(de);
 	if (PROC_I(inode)->sysctl)
 		sysctl_head_put(PROC_I(inode)->sysctl);
-	clear_inode(inode);
 }
 
 struct vfsmount *proc_mnt;
@@ -91,7 +91,7 @@ static const struct super_operations proc_sops = {
 	.alloc_inode	= proc_alloc_inode,
 	.destroy_inode	= proc_destroy_inode,
 	.drop_inode	= generic_delete_inode,
-	.delete_inode	= proc_delete_inode,
+	.evict_inode	= proc_evict_inode,
 	.statfs		= simple_statfs,
 };
 
@@ -214,8 +214,7 @@ static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigne
 {
 	struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
 	long rv = -ENOTTY;
-	long (*unlocked_ioctl)(struct file *, unsigned int, unsigned long);
-	int (*ioctl)(struct inode *, struct file *, unsigned int, unsigned long);
+	long (*ioctl)(struct file *, unsigned int, unsigned long);
 
 	spin_lock(&pde->pde_unload_lock);
 	if (!pde->proc_fops) {
@@ -223,19 +222,11 @@ static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigne
 		return rv;
 	}
 	pde->pde_users++;
-	unlocked_ioctl = pde->proc_fops->unlocked_ioctl;
-	ioctl = pde->proc_fops->ioctl;
+	ioctl = pde->proc_fops->unlocked_ioctl;
 	spin_unlock(&pde->pde_unload_lock);
 
-	if (unlocked_ioctl) {
-		rv = unlocked_ioctl(file, cmd, arg);
-		if (rv == -ENOIOCTLCMD)
-			rv = -EINVAL;
-	} else if (ioctl) {
-		WARN_ONCE(1, "Procfs ioctl handlers must use unlocked_ioctl, "
-			  "%pf will be called without the Bkl held\n", ioctl);
-		rv = ioctl(file->f_path.dentry->d_inode, file, cmd, arg);
-	}
+	if (ioctl)
+		rv = ioctl(file, cmd, arg);
 
 	pde_users_dec(pde);
 	return rv;
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 180cf5a0bd6..3b8b4566033 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -146,7 +146,7 @@ u64 stable_page_flags(struct page *page)
 	u |= kpf_copy_bit(k, KPF_HWPOISON,	PG_hwpoison);
 #endif
 
-#ifdef CONFIG_IA64_UNCACHED_ALLOCATOR
+#ifdef CONFIG_ARCH_USES_PG_UNCACHED
 	u |= kpf_copy_bit(k, KPF_UNCACHED,	PG_uncached);
 #endif
 
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 6ff9981f0a1..b652cb00906 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -23,6 +23,8 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
 	if (!inode)
 		goto out;
 
+	inode->i_ino = get_next_ino();
+
 	sysctl_head_get(head);
 	ei = PROC_I(inode);
 	ei->sysctl = head;
@@ -329,10 +331,19 @@ static int proc_sys_setattr(struct dentry *dentry, struct iattr *attr)
 		return -EPERM;
 
 	error = inode_change_ok(inode, attr);
-	if (!error)
-		error = inode_setattr(inode, attr);
+	if (error)
+		return error;
+
+	if ((attr->ia_valid & ATTR_SIZE) &&
+	    attr->ia_size != i_size_read(inode)) {
+		error = vmtruncate(inode, attr->ia_size);
+		if (error)
+			return error;
+	}
 
-	return error;
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+	return 0;
 }
 
 static int proc_sys_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
@@ -355,6 +366,7 @@ static int proc_sys_getattr(struct vfsmount *mnt, struct dentry *dentry, struct
 static const struct file_operations proc_sys_file_operations = {
 	.read		= proc_sys_read,
 	.write		= proc_sys_write,
+	.llseek		= default_llseek,
 };
 
 static const struct file_operations proc_sys_dir_file_operations = {
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 4258384ed22..ef9fa8e24ad 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -35,8 +35,8 @@ static int proc_set_super(struct super_block *sb, void *data)
 	return set_anon_super(sb, NULL);
 }
 
-static int proc_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *proc_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
 	int err;
 	struct super_block *sb;
@@ -61,14 +61,14 @@ static int proc_get_sb(struct file_system_type *fs_type,
 
 	sb = sget(fs_type, proc_test_super, proc_set_super, ns);
 	if (IS_ERR(sb))
-		return PTR_ERR(sb);
+		return ERR_CAST(sb);
 
 	if (!sb->s_root) {
 		sb->s_flags = flags;
 		err = proc_fill_super(sb);
 		if (err) {
 			deactivate_locked_super(sb);
-			return err;
+			return ERR_PTR(err);
 		}
 
 		ei = PROC_I(sb->s_root->d_inode);
@@ -79,11 +79,9 @@ static int proc_get_sb(struct file_system_type *fs_type,
 		}
 
 		sb->s_flags |= MS_ACTIVE;
-		ns->proc_mnt = mnt;
 	}
 
-	simple_set_mnt(mnt, sb);
-	return 0;
+	return dget(sb->s_root);
 }
 
 static void proc_kill_sb(struct super_block *sb)
@@ -97,7 +95,7 @@ static void proc_kill_sb(struct super_block *sb)
 
 static struct file_system_type proc_fs_type = {
 	.name		= "proc",
-	.get_sb		= proc_get_sb,
+	.mount		= proc_mount,
 	.kill_sb	= proc_kill_sb,
 };
 
@@ -115,6 +113,7 @@ void __init proc_root_init(void)
 		return;
 	}
 
+	init_pid_ns.proc_mnt = proc_mnt;
 	proc_symlink("mounts", NULL, "self/mounts");
 
 	proc_net_init();
@@ -179,6 +178,7 @@ static int proc_root_readdir(struct file * filp,
 static const struct file_operations proc_root_operations = {
 	.read		 = generic_read_dir,
 	.readdir	 = proc_root_readdir,
+	.llseek		= default_llseek,
 };
 
 /*
@@ -212,6 +212,7 @@ int pid_ns_prepare_proc(struct pid_namespace *ns)
 	if (IS_ERR(mnt))
 		return PTR_ERR(mnt);
 
+	ns->proc_mnt = mnt;
 	return 0;
 }
 
diff --git a/fs/proc/softirqs.c b/fs/proc/softirqs.c
index 1807c2419f1..37994737c98 100644
--- a/fs/proc/softirqs.c
+++ b/fs/proc/softirqs.c
@@ -10,13 +10,13 @@ static int show_softirqs(struct seq_file *p, void *v)
 {
 	int i, j;
 
-	seq_printf(p, "                ");
+	seq_printf(p, "                    ");
 	for_each_possible_cpu(i)
 		seq_printf(p, "CPU%-8d", i);
 	seq_printf(p, "\n");
 
 	for (i = 0; i < NR_SOFTIRQS; i++) {
-		seq_printf(p, "%8s:", softirq_to_name[i]);
+		seq_printf(p, "%12s:", softirq_to_name[i]);
 		for_each_possible_cpu(j)
 			seq_printf(p, " %10u", kstat_softirqs_cpu(i, j));
 		seq_printf(p, "\n");
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index bf31b03fc27..e15a19c93ba 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -31,7 +31,6 @@ static int show_stat(struct seq_file *p, void *v)
 	u64 sum_softirq = 0;
 	unsigned int per_softirq_sums[NR_SOFTIRQS] = {0};
 	struct timespec boottime;
-	unsigned int per_irq_sum;
 
 	user = nice = system = idle = iowait =
 		irq = softirq = steal = cputime64_zero;
@@ -52,9 +51,7 @@ static int show_stat(struct seq_file *p, void *v)
 		guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
 		guest_nice = cputime64_add(guest_nice,
 			kstat_cpu(i).cpustat.guest_nice);
-		for_each_irq_nr(j) {
-			sum += kstat_irqs_cpu(j, i);
-		}
+		sum += kstat_cpu_irqs_sum(i);
 		sum += arch_irq_stat_cpu(i);
 
 		for (j = 0; j < NR_SOFTIRQS; j++) {
@@ -110,13 +107,8 @@ static int show_stat(struct seq_file *p, void *v)
 	seq_printf(p, "intr %llu", (unsigned long long)sum);
 
 	/* sum again ? it could be updated? */
-	for_each_irq_nr(j) {
-		per_irq_sum = 0;
-		for_each_possible_cpu(i)
-			per_irq_sum += kstat_irqs_cpu(j, i);
-
-		seq_printf(p, " %u", per_irq_sum);
-	}
+	for_each_irq_nr(j)
+		seq_printf(p, " %u", kstat_irqs(j));
 
 	seq_printf(p,
 		"\nctxt %llu\n"
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index aea1d3f1ffb..da6b01d70f0 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -210,6 +210,7 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
 	int flags = vma->vm_flags;
 	unsigned long ino = 0;
 	unsigned long long pgoff = 0;
+	unsigned long start;
 	dev_t dev = 0;
 	int len;
 
@@ -220,8 +221,14 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
 		pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT;
 	}
 
+	/* We don't show the stack guard page in /proc/maps */
+	start = vma->vm_start;
+	if (vma->vm_flags & VM_GROWSDOWN)
+		if (!vma_stack_continue(vma->vm_prev, vma->vm_start))
+			start += PAGE_SIZE;
+
 	seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",
-			vma->vm_start,
+			start,
 			vma->vm_end,
 			flags & VM_READ ? 'r' : '-',
 			flags & VM_WRITE ? 'w' : '-',
@@ -320,6 +327,7 @@ struct mem_size_stats {
 	unsigned long private_clean;
 	unsigned long private_dirty;
 	unsigned long referenced;
+	unsigned long anonymous;
 	unsigned long swap;
 	u64 pss;
 };
@@ -350,19 +358,22 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 		if (!page)
 			continue;
 
+		if (PageAnon(page))
+			mss->anonymous += PAGE_SIZE;
+
 		mss->resident += PAGE_SIZE;
 		/* Accumulate the size in pages that have been accessed. */
 		if (pte_young(ptent) || PageReferenced(page))
 			mss->referenced += PAGE_SIZE;
 		mapcount = page_mapcount(page);
 		if (mapcount >= 2) {
-			if (pte_dirty(ptent))
+			if (pte_dirty(ptent) || PageDirty(page))
 				mss->shared_dirty += PAGE_SIZE;
 			else
 				mss->shared_clean += PAGE_SIZE;
 			mss->pss += (PAGE_SIZE << PSS_SHIFT) / mapcount;
 		} else {
-			if (pte_dirty(ptent))
+			if (pte_dirty(ptent) || PageDirty(page))
 				mss->private_dirty += PAGE_SIZE;
 			else
 				mss->private_clean += PAGE_SIZE;
@@ -403,6 +414,7 @@ static int show_smap(struct seq_file *m, void *v)
 		   "Private_Clean:  %8lu kB\n"
 		   "Private_Dirty:  %8lu kB\n"
 		   "Referenced:     %8lu kB\n"
+		   "Anonymous:      %8lu kB\n"
 		   "Swap:           %8lu kB\n"
 		   "KernelPageSize: %8lu kB\n"
 		   "MMUPageSize:    %8lu kB\n",
@@ -414,6 +426,7 @@ static int show_smap(struct seq_file *m, void *v)
 		   mss.private_clean >> 10,
 		   mss.private_dirty >> 10,
 		   mss.referenced >> 10,
+		   mss.anonymous >> 10,
 		   mss.swap >> 10,
 		   vma_kernel_pagesize(vma) >> 10,
 		   vma_mmu_pagesize(vma) >> 10);
@@ -532,6 +545,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
 
 const struct file_operations proc_clear_refs_operations = {
 	.write		= clear_refs_write,
+	.llseek		= noop_llseek,
 };
 
 struct pagemapread {
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 91c817ff02c..2367fb3f70b 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -163,7 +163,7 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
 
 static const struct file_operations proc_vmcore_operations = {
 	.read		= read_vmcore,
-	.llseek		= generic_file_llseek,
+	.llseek		= default_llseek,
 };
 
 static struct vmcore* __init get_new_element(void)
diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c
index 6e8fc62b40a..7b0329468a5 100644
--- a/fs/qnx4/dir.c
+++ b/fs/qnx4/dir.c
@@ -11,7 +11,6 @@
  * 20-06-1998 by Frank Denis : Linux 2.1.99+ & dcache support.
  */
 
-#include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include "qnx4.h"
 
@@ -29,8 +28,6 @@ static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	QNX4DEBUG((KERN_INFO "qnx4_readdir:i_size = %ld\n", (long) inode->i_size));
 	QNX4DEBUG((KERN_INFO "filp->f_pos         = %ld\n", (long) filp->f_pos));
 
-	lock_kernel();
-
 	while (filp->f_pos < inode->i_size) {
 		blknum = qnx4_block_map( inode, filp->f_pos >> QNX4_BLOCK_SIZE_BITS );
 		bh = sb_bread(inode->i_sb, blknum);
@@ -71,7 +68,6 @@ static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		brelse(bh);
 	}
 out:
-	unlock_kernel();
 	return 0;
 }
 
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 277575ddc05..fcada42f1aa 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -16,7 +16,6 @@
 #include <linux/init.h>
 #include <linux/slab.h>
 #include <linux/highuid.h>
-#include <linux/smp_lock.h>
 #include <linux/pagemap.h>
 #include <linux/buffer_head.h>
 #include <linux/writeback.h>
@@ -157,8 +156,6 @@ static int qnx4_statfs(struct dentry *dentry, struct kstatfs *buf)
 	struct super_block *sb = dentry->d_sb;
 	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
 
-	lock_kernel();
-
 	buf->f_type    = sb->s_magic;
 	buf->f_bsize   = sb->s_blocksize;
 	buf->f_blocks  = le32_to_cpu(qnx4_sb(sb)->BitMap->di_size) * 8;
@@ -168,8 +165,6 @@ static int qnx4_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_fsid.val[0] = (u32)id;
 	buf->f_fsid.val[1] = (u32)(id >> 32);
 
-	unlock_kernel();
-
 	return 0;
 }
 
@@ -283,7 +278,6 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent)
  		goto outi;
 
 	brelse(bh);
-
 	return 0;
 
       outi:
@@ -320,10 +314,19 @@ static int qnx4_write_begin(struct file *file, struct address_space *mapping,
 			struct page **pagep, void **fsdata)
 {
 	struct qnx4_inode_info *qnx4_inode = qnx4_i(mapping->host);
+	int ret;
+
 	*pagep = NULL;
-	return cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+	ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
 				qnx4_get_block,
 				&qnx4_inode->mmu_private);
+	if (unlikely(ret)) {
+		loff_t isize = mapping->host->i_size;
+		if (pos + len > isize)
+			vmtruncate(mapping->host, isize);
+	}
+
+	return ret;
 }
 static sector_t qnx4_bmap(struct address_space *mapping, sector_t block)
 {
@@ -451,17 +454,16 @@ static void destroy_inodecache(void)
 	kmem_cache_destroy(qnx4_inode_cachep);
 }
 
-static int qnx4_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *qnx4_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, qnx4_fill_super,
-			   mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, qnx4_fill_super);
 }
 
 static struct file_system_type qnx4_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "qnx4",
-	.get_sb		= qnx4_get_sb,
+	.mount		= qnx4_mount,
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
diff --git a/fs/qnx4/namei.c b/fs/qnx4/namei.c
index 58703ebba87..275327b5615 100644
--- a/fs/qnx4/namei.c
+++ b/fs/qnx4/namei.c
@@ -12,7 +12,6 @@
  * 04-07-1998 by Frank Denis : first step for rmdir/unlink.
  */
 
-#include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include "qnx4.h"
 
@@ -109,7 +108,6 @@ struct dentry * qnx4_lookup(struct inode *dir, struct dentry *dentry, struct nam
 	int len = dentry->d_name.len;
 	struct inode *foundinode = NULL;
 
-	lock_kernel();
 	if (!(bh = qnx4_find_entry(len, dir, name, &de, &ino)))
 		goto out;
 	/* The entry is linked, let's get the real info */
@@ -123,13 +121,11 @@ struct dentry * qnx4_lookup(struct inode *dir, struct dentry *dentry, struct nam
 
 	foundinode = qnx4_iget(dir->i_sb, ino);
 	if (IS_ERR(foundinode)) {
-		unlock_kernel();
 		QNX4DEBUG((KERN_ERR "qnx4: lookup->iget -> error %ld\n",
 			   PTR_ERR(foundinode)));
 		return ERR_CAST(foundinode);
 	}
 out:
-	unlock_kernel();
 	d_add(dentry, foundinode);
 
 	return NULL;
diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig
index 3e21b1e2ad3..880fd988436 100644
--- a/fs/quota/Kconfig
+++ b/fs/quota/Kconfig
@@ -4,6 +4,7 @@
 
 config QUOTA
 	bool "Quota support"
+	select QUOTACTL
 	help
 	  If you say Y here, you will be able to set per user limits for disk
 	  usage (also called disk quotas). Currently, it works for the
@@ -65,8 +66,7 @@ config QFMT_V2
 
 config QUOTACTL
 	bool
-	depends on XFS_QUOTA || QUOTA
-	default y
+	default n
 
 config QUOTACTL_COMPAT
 	bool
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 437d2ca2de9..0fed41e6efc 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -132,6 +132,22 @@ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_state_lock);
 __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_data_lock);
 EXPORT_SYMBOL(dq_data_lock);
 
+void __quota_error(struct super_block *sb, const char *func,
+		  const char *fmt, ...)
+{
+	va_list args;
+
+	if (printk_ratelimit()) {
+		va_start(args, fmt);
+		printk(KERN_ERR "Quota error (device %s): %s: ",
+		       sb->s_id, func);
+		vprintk(fmt, args);
+		printk("\n");
+		va_end(args);
+	}
+}
+EXPORT_SYMBOL(__quota_error);
+
 #if defined(CONFIG_QUOTA_DEBUG) || defined(CONFIG_PRINT_QUOTA_WARNING)
 static char *quotatypes[] = INITQFNAMES;
 #endif
@@ -705,11 +721,8 @@ void dqput(struct dquot *dquot)
 		return;
 #ifdef CONFIG_QUOTA_DEBUG
 	if (!atomic_read(&dquot->dq_count)) {
-		printk("VFS: dqput: trying to free free dquot\n");
-		printk("VFS: device %s, dquot of %s %d\n",
-			dquot->dq_sb->s_id,
-			quotatypes[dquot->dq_type],
-			dquot->dq_id);
+		quota_error(dquot->dq_sb, "trying to free free dquot of %s %d",
+			    quotatypes[dquot->dq_type], dquot->dq_id);
 		BUG();
 	}
 #endif
@@ -732,9 +745,9 @@ we_slept:
 		/* Commit dquot before releasing */
 		ret = dquot->dq_sb->dq_op->write_dquot(dquot);
 		if (ret < 0) {
-			printk(KERN_ERR "VFS: cannot write quota structure on "
-				"device %s (error %d). Quota may get out of "
-				"sync!\n", dquot->dq_sb->s_id, ret);
+			quota_error(dquot->dq_sb, "Can't write quota structure"
+				    " (error %d). Quota may get out of sync!",
+				    ret);
 			/*
 			 * We clear dirty bit anyway, so that we avoid
 			 * infinite loop here
@@ -885,7 +898,7 @@ static void add_dquot_ref(struct super_block *sb, int type)
 
 	spin_lock(&inode_lock);
 	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
-		if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
+		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW))
 			continue;
 #ifdef CONFIG_QUOTA_DEBUG
 		if (unlikely(inode_get_rsv_space(inode) > 0))
@@ -914,9 +927,9 @@ static void add_dquot_ref(struct super_block *sb, int type)
 
 #ifdef CONFIG_QUOTA_DEBUG
 	if (reserved) {
-		printk(KERN_WARNING "VFS (%s): Writes happened before quota"
-			" was turned on thus quota information is probably "
-			"inconsistent. Please run quotacheck(8).\n", sb->s_id);
+		quota_error(sb, "Writes happened before quota was turned on "
+			"thus quota information is probably inconsistent. "
+			"Please run quotacheck(8)");
 	}
 #endif
 }
@@ -947,7 +960,9 @@ static int remove_inode_dquot_ref(struct inode *inode, int type,
 		if (dqput_blocks(dquot)) {
 #ifdef CONFIG_QUOTA_DEBUG
 			if (atomic_read(&dquot->dq_count) != 1)
-				printk(KERN_WARNING "VFS: Adding dquot with dq_count %d to dispose list.\n", atomic_read(&dquot->dq_count));
+				quota_error(inode->i_sb, "Adding dquot with "
+					    "dq_count %d to dispose list",
+					    atomic_read(&dquot->dq_count));
 #endif
 			spin_lock(&dq_list_lock);
 			/* As dquot must have currently users it can't be on
@@ -986,6 +1001,7 @@ static void remove_dquot_ref(struct super_block *sb, int type,
 		struct list_head *tofree_head)
 {
 	struct inode *inode;
+	int reserved = 0;
 
 	spin_lock(&inode_lock);
 	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
@@ -995,10 +1011,20 @@ static void remove_dquot_ref(struct super_block *sb, int type,
 		 *  only quota pointers and these have separate locking
 		 *  (dqptr_sem).
 		 */
-		if (!IS_NOQUOTA(inode))
+		if (!IS_NOQUOTA(inode)) {
+			if (unlikely(inode_get_rsv_space(inode) > 0))
+				reserved = 1;
 			remove_inode_dquot_ref(inode, type, tofree_head);
+		}
 	}
 	spin_unlock(&inode_lock);
+#ifdef CONFIG_QUOTA_DEBUG
+	if (reserved) {
+		printk(KERN_WARNING "VFS (%s): Writes happened after quota"
+			" was disabled thus quota information is probably "
+			"inconsistent. Please run quotacheck(8).\n", sb->s_id);
+	}
+#endif
 }
 
 /* Gather all references from inodes and drop them */
@@ -1304,6 +1330,15 @@ static int info_bdq_free(struct dquot *dquot, qsize_t space)
 	return QUOTA_NL_NOWARN;
 }
 
+static int dquot_active(const struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+
+	if (IS_NOQUOTA(inode))
+		return 0;
+	return sb_any_quota_loaded(sb) & ~sb_any_quota_suspended(sb);
+}
+
 /*
  * Initialize quota pointers in inode
  *
@@ -1323,7 +1358,7 @@ static void __dquot_initialize(struct inode *inode, int type)
 
 	/* First test before acquiring mutex - solves deadlocks when we
          * re-enter the quota code and are already holding the mutex */
-	if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode))
+	if (!dquot_active(inode))
 		return;
 
 	/* First get references to structures we might need. */
@@ -1351,6 +1386,9 @@ static void __dquot_initialize(struct inode *inode, int type)
 		/* Avoid races with quotaoff() */
 		if (!sb_has_quota_active(sb, cnt))
 			continue;
+		/* We could race with quotaon or dqget() could have failed */
+		if (!got[cnt])
+			continue;
 		if (!inode->i_dquot[cnt]) {
 			inode->i_dquot[cnt] = got[cnt];
 			got[cnt] = NULL;
@@ -1507,7 +1545,7 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
 	 * First test before acquiring mutex - solves deadlocks when we
 	 * re-enter the quota code and are already holding the mutex
 	 */
-	if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) {
+	if (!dquot_active(inode)) {
 		inode_incr_space(inode, number, reserve);
 		goto out;
 	}
@@ -1559,7 +1597,7 @@ int dquot_alloc_inode(const struct inode *inode)
 
 	/* First test before acquiring mutex - solves deadlocks when we
          * re-enter the quota code and are already holding the mutex */
-	if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode))
+	if (!dquot_active(inode))
 		return 0;
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
 		warntype[cnt] = QUOTA_NL_NOWARN;
@@ -1596,7 +1634,7 @@ int dquot_claim_space_nodirty(struct inode *inode, qsize_t number)
 {
 	int cnt;
 
-	if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) {
+	if (!dquot_active(inode)) {
 		inode_claim_rsv_space(inode, number);
 		return 0;
 	}
@@ -1629,7 +1667,7 @@ void __dquot_free_space(struct inode *inode, qsize_t number, int flags)
 
 	/* First test before acquiring mutex - solves deadlocks when we
          * re-enter the quota code and are already holding the mutex */
-	if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) {
+	if (!dquot_active(inode)) {
 		inode_decr_space(inode, number, reserve);
 		return;
 	}
@@ -1667,7 +1705,7 @@ void dquot_free_inode(const struct inode *inode)
 
 	/* First test before acquiring mutex - solves deadlocks when we
          * re-enter the quota code and are already holding the mutex */
-	if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode))
+	if (!dquot_active(inode))
 		return;
 
 	down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
@@ -1701,6 +1739,7 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
 	qsize_t rsv_space = 0;
 	struct dquot *transfer_from[MAXQUOTAS] = {};
 	int cnt, ret = 0;
+	char is_valid[MAXQUOTAS] = {};
 	char warntype_to[MAXQUOTAS];
 	char warntype_from_inodes[MAXQUOTAS], warntype_from_space[MAXQUOTAS];
 
@@ -1722,8 +1761,15 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
 	space = cur_space + rsv_space;
 	/* Build the transfer_from list and check the limits */
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+		/*
+		 * Skip changes for same uid or gid or for turned off quota-type.
+		 */
 		if (!transfer_to[cnt])
 			continue;
+		/* Avoid races with quotaoff() */
+		if (!sb_has_quota_active(inode->i_sb, cnt))
+			continue;
+		is_valid[cnt] = 1;
 		transfer_from[cnt] = inode->i_dquot[cnt];
 		ret = check_idq(transfer_to[cnt], 1, warntype_to + cnt);
 		if (ret)
@@ -1737,12 +1783,8 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
 	 * Finally perform the needed transfer from transfer_from to transfer_to
 	 */
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
-		/*
-		 * Skip changes for same uid or gid or for turned off quota-type.
-		 */
-		if (!transfer_to[cnt])
+		if (!is_valid[cnt])
 			continue;
-
 		/* Due to IO error we might not have transfer_from[] structure */
 		if (transfer_from[cnt]) {
 			warntype_from_inodes[cnt] =
@@ -1766,18 +1808,19 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
 
 	mark_all_dquot_dirty(transfer_from);
 	mark_all_dquot_dirty(transfer_to);
-	/* Pass back references to put */
-	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
-		transfer_to[cnt] = transfer_from[cnt];
-warn:
 	flush_warnings(transfer_to, warntype_to);
 	flush_warnings(transfer_from, warntype_from_inodes);
 	flush_warnings(transfer_from, warntype_from_space);
-	return ret;
+	/* Pass back references to put */
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
+		if (is_valid[cnt])
+			transfer_to[cnt] = transfer_from[cnt];
+	return 0;
 over_quota:
 	spin_unlock(&dq_data_lock);
 	up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
-	goto warn;
+	flush_warnings(transfer_to, warntype_to);
+	return ret;
 }
 EXPORT_SYMBOL(__dquot_transfer);
 
@@ -1790,7 +1833,7 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
 	struct super_block *sb = inode->i_sb;
 	int ret;
 
-	if (!sb_any_quota_active(sb) || IS_NOQUOTA(inode))
+	if (!dquot_active(inode))
 		return 0;
 
 	if (iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid)
@@ -1957,7 +2000,7 @@ int dquot_disable(struct super_block *sb, int type, unsigned int flags)
 				truncate_inode_pages(&toputinode[cnt]->i_data,
 						     0);
 				mutex_unlock(&toputinode[cnt]->i_mutex);
-				mark_inode_dirty(toputinode[cnt]);
+				mark_inode_dirty_sync(toputinode[cnt]);
 			}
 			mutex_unlock(&dqopt->dqonoff_mutex);
 		}
@@ -2270,7 +2313,7 @@ static void do_get_dqblk(struct dquot *dquot, struct fs_disk_quota *di)
 	memset(di, 0, sizeof(*di));
 	di->d_version = FS_DQUOT_VERSION;
 	di->d_flags = dquot->dq_type == USRQUOTA ?
-			XFS_USER_QUOTA : XFS_GROUP_QUOTA;
+			FS_USER_QUOTA : FS_GROUP_QUOTA;
 	di->d_id = dquot->dq_id;
 
 	spin_lock(&dq_data_lock);
diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c
index 24f03407eeb..9e48874eabc 100644
--- a/fs/quota/quota_tree.c
+++ b/fs/quota/quota_tree.c
@@ -65,8 +65,7 @@ static ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf)
 	ret = sb->s_op->quota_write(sb, info->dqi_type, buf,
 	       info->dqi_usable_bs, blk << info->dqi_blocksize_bits);
 	if (ret != info->dqi_usable_bs) {
-		q_warn(KERN_WARNING "VFS: dquota write failed on "
-			"dev %s\n", sb->s_id);
+		quota_error(sb, "dquota write failed");
 		if (ret >= 0)
 			ret = -EIO;
 	}
@@ -160,9 +159,8 @@ static int remove_free_dqentry(struct qtree_mem_dqinfo *info, char *buf,
 	dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0);
 	/* No matter whether write succeeds block is out of list */
 	if (write_blk(info, blk, buf) < 0)
-		q_warn(KERN_ERR
-		       "VFS: Can't write block (%u) with free entries.\n",
-		       blk);
+		quota_error(info->dqi_sb, "Can't write block (%u) "
+			    "with free entries", blk);
 	return 0;
 out_buf:
 	kfree(tmpbuf);
@@ -252,9 +250,8 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info,
 	if (le16_to_cpu(dh->dqdh_entries) + 1 >= qtree_dqstr_in_blk(info)) {
 		*err = remove_free_dqentry(info, buf, blk);
 		if (*err < 0) {
-			q_warn(KERN_ERR "VFS: find_free_dqentry(): Can't "
-			       "remove block (%u) from entry free list.\n",
-			       blk);
+			quota_error(dquot->dq_sb, "Can't remove block (%u) "
+				    "from entry free list", blk);
 			goto out_buf;
 		}
 	}
@@ -268,16 +265,15 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info,
 	}
 #ifdef __QUOTA_QT_PARANOIA
 	if (i == qtree_dqstr_in_blk(info)) {
-		printk(KERN_ERR "VFS: find_free_dqentry(): Data block full "
-				"but it shouldn't.\n");
+		quota_error(dquot->dq_sb, "Data block full but it shouldn't");
 		*err = -EIO;
 		goto out_buf;
 	}
 #endif
 	*err = write_blk(info, blk, buf);
 	if (*err < 0) {
-		q_warn(KERN_ERR "VFS: find_free_dqentry(): Can't write quota "
-				"data block %u.\n", blk);
+		quota_error(dquot->dq_sb, "Can't write quota data block %u",
+			    blk);
 		goto out_buf;
 	}
 	dquot->dq_off = (blk << info->dqi_blocksize_bits) +
@@ -311,8 +307,8 @@ static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
 	} else {
 		ret = read_blk(info, *treeblk, buf);
 		if (ret < 0) {
-			q_warn(KERN_ERR "VFS: Can't read tree quota block "
-					"%u.\n", *treeblk);
+			quota_error(dquot->dq_sb, "Can't read tree quota "
+				    "block %u", *treeblk);
 			goto out_buf;
 		}
 	}
@@ -323,9 +319,9 @@ static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
 	if (depth == info->dqi_qtree_depth - 1) {
 #ifdef __QUOTA_QT_PARANOIA
 		if (newblk) {
-			printk(KERN_ERR "VFS: Inserting already present quota "
-					"entry (block %u).\n",
-			       le32_to_cpu(ref[get_index(info,
+			quota_error(dquot->dq_sb, "Inserting already present "
+				    "quota entry (block %u)",
+				    le32_to_cpu(ref[get_index(info,
 						dquot->dq_id, depth)]));
 			ret = -EIO;
 			goto out_buf;
@@ -373,8 +369,8 @@ int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
 	if (!dquot->dq_off) {
 		ret = dq_insert_tree(info, dquot);
 		if (ret < 0) {
-			q_warn(KERN_ERR "VFS: Error %zd occurred while "
-					"creating quota.\n", ret);
+			quota_error(sb, "Error %zd occurred while creating "
+				    "quota", ret);
 			kfree(ddquot);
 			return ret;
 		}
@@ -385,8 +381,7 @@ int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
 	ret = sb->s_op->quota_write(sb, type, ddquot, info->dqi_entry_size,
 				    dquot->dq_off);
 	if (ret != info->dqi_entry_size) {
-		q_warn(KERN_WARNING "VFS: dquota write failed on dev %s\n",
-		       sb->s_id);
+		quota_error(sb, "dquota write failed");
 		if (ret >= 0)
 			ret = -ENOSPC;
 	} else {
@@ -410,14 +405,15 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
 	if (!buf)
 		return -ENOMEM;
 	if (dquot->dq_off >> info->dqi_blocksize_bits != blk) {
-		q_warn(KERN_ERR "VFS: Quota structure has offset to other "
-		  "block (%u) than it should (%u).\n", blk,
-		  (uint)(dquot->dq_off >> info->dqi_blocksize_bits));
+		quota_error(dquot->dq_sb, "Quota structure has offset to "
+			"other block (%u) than it should (%u)", blk,
+			(uint)(dquot->dq_off >> info->dqi_blocksize_bits));
 		goto out_buf;
 	}
 	ret = read_blk(info, blk, buf);
 	if (ret < 0) {
-		q_warn(KERN_ERR "VFS: Can't read quota data block %u\n", blk);
+		quota_error(dquot->dq_sb, "Can't read quota data block %u",
+			    blk);
 		goto out_buf;
 	}
 	dh = (struct qt_disk_dqdbheader *)buf;
@@ -427,8 +423,8 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
 		if (ret >= 0)
 			ret = put_free_dqblk(info, buf, blk);
 		if (ret < 0) {
-			q_warn(KERN_ERR "VFS: Can't move quota data block (%u) "
-			  "to free list.\n", blk);
+			quota_error(dquot->dq_sb, "Can't move quota data block "
+				    "(%u) to free list", blk);
 			goto out_buf;
 		}
 	} else {
@@ -440,15 +436,15 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
 			/* Insert will write block itself */
 			ret = insert_free_dqentry(info, buf, blk);
 			if (ret < 0) {
-				q_warn(KERN_ERR "VFS: Can't insert quota data "
-				       "block (%u) to free entry list.\n", blk);
+				quota_error(dquot->dq_sb, "Can't insert quota "
+				    "data block (%u) to free entry list", blk);
 				goto out_buf;
 			}
 		} else {
 			ret = write_blk(info, blk, buf);
 			if (ret < 0) {
-				q_warn(KERN_ERR "VFS: Can't write quota data "
-				  "block %u\n", blk);
+				quota_error(dquot->dq_sb, "Can't write quota "
+					    "data block %u", blk);
 				goto out_buf;
 			}
 		}
@@ -472,7 +468,8 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
 		return -ENOMEM;
 	ret = read_blk(info, *blk, buf);
 	if (ret < 0) {
-		q_warn(KERN_ERR "VFS: Can't read quota data block %u\n", *blk);
+		quota_error(dquot->dq_sb, "Can't read quota data "
+			    "block %u", blk);
 		goto out_buf;
 	}
 	newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
@@ -496,8 +493,8 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
 		} else {
 			ret = write_blk(info, *blk, buf);
 			if (ret < 0)
-				q_warn(KERN_ERR "VFS: Can't write quota tree "
-				  "block %u.\n", *blk);
+				quota_error(dquot->dq_sb, "Can't write quota "
+					    "tree block %u", blk);
 		}
 	}
 out_buf:
@@ -529,7 +526,8 @@ static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info,
 		return -ENOMEM;
 	ret = read_blk(info, blk, buf);
 	if (ret < 0) {
-		q_warn(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
+		quota_error(dquot->dq_sb, "Can't read quota tree "
+			    "block %u", blk);
 		goto out_buf;
 	}
 	ddquot = buf + sizeof(struct qt_disk_dqdbheader);
@@ -539,8 +537,8 @@ static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info,
 		ddquot += info->dqi_entry_size;
 	}
 	if (i == qtree_dqstr_in_blk(info)) {
-		q_warn(KERN_ERR "VFS: Quota for id %u referenced "
-		  "but not present.\n", dquot->dq_id);
+		quota_error(dquot->dq_sb, "Quota for id %u referenced "
+			    "but not present", dquot->dq_id);
 		ret = -EIO;
 		goto out_buf;
 	} else {
@@ -564,7 +562,8 @@ static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info,
 		return -ENOMEM;
 	ret = read_blk(info, blk, buf);
 	if (ret < 0) {
-		q_warn(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
+		quota_error(dquot->dq_sb, "Can't read quota tree block %u",
+			    blk);
 		goto out_buf;
 	}
 	ret = 0;
@@ -598,7 +597,7 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
 #ifdef __QUOTA_QT_PARANOIA
 	/* Invalidated quota? */
 	if (!sb_dqopt(dquot->dq_sb)->files[type]) {
-		printk(KERN_ERR "VFS: Quota invalidated while reading!\n");
+		quota_error(sb, "Quota invalidated while reading!");
 		return -EIO;
 	}
 #endif
@@ -607,8 +606,8 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
 		offset = find_dqentry(info, dquot);
 		if (offset <= 0) {	/* Entry not present? */
 			if (offset < 0)
-				q_warn(KERN_ERR "VFS: Can't read quota "
-				  "structure for id %u.\n", dquot->dq_id);
+				quota_error(sb, "Can't read quota structure "
+					    "for id %u", dquot->dq_id);
 			dquot->dq_off = 0;
 			set_bit(DQ_FAKE_B, &dquot->dq_flags);
 			memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk));
@@ -625,8 +624,8 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
 	if (ret != info->dqi_entry_size) {
 		if (ret >= 0)
 			ret = -EIO;
-		q_warn(KERN_ERR "VFS: Error while reading quota "
-				"structure for id %u.\n", dquot->dq_id);
+		quota_error(sb, "Error while reading quota structure for id %u",
+			    dquot->dq_id);
 		set_bit(DQ_FAKE_B, &dquot->dq_flags);
 		memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk));
 		kfree(ddquot);
diff --git a/fs/quota/quota_tree.h b/fs/quota/quota_tree.h
index ccc3e71fb1d..a1ab8db81a5 100644
--- a/fs/quota/quota_tree.h
+++ b/fs/quota/quota_tree.h
@@ -22,10 +22,4 @@ struct qt_disk_dqdbheader {
 
 #define QT_TREEOFF	1		/* Offset of tree in file in blocks */
 
-#define q_warn(fmt, args...) \
-do { \
-	if (printk_ratelimit()) \
-		printk(fmt, ## args); \
-} while(0)
-
 #endif /* _LINUX_QUOTAIO_TREE_H */
diff --git a/fs/quota/quota_v1.c b/fs/quota/quota_v1.c
index 4af344c5852..34b37a67bb1 100644
--- a/fs/quota/quota_v1.c
+++ b/fs/quota/quota_v1.c
@@ -95,8 +95,7 @@ static int v1_commit_dqblk(struct dquot *dquot)
 			(char *)&dqblk, sizeof(struct v1_disk_dqblk),
 			v1_dqoff(dquot->dq_id));
 	if (ret != sizeof(struct v1_disk_dqblk)) {
-		printk(KERN_WARNING "VFS: dquota write failed on dev %s\n",
-			dquot->dq_sb->s_id);
+		quota_error(dquot->dq_sb, "dquota write failed");
 		if (ret >= 0)
 			ret = -EIO;
 		goto out;
diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c
index 135206af145..65444d29406 100644
--- a/fs/quota/quota_v2.c
+++ b/fs/quota/quota_v2.c
@@ -63,9 +63,8 @@ static int v2_read_header(struct super_block *sb, int type,
 	size = sb->s_op->quota_read(sb, type, (char *)dqhead,
 				    sizeof(struct v2_disk_dqheader), 0);
 	if (size != sizeof(struct v2_disk_dqheader)) {
-		q_warn(KERN_WARNING "quota_v2: Failed header read:"
-		       " expected=%zd got=%zd\n",
-			sizeof(struct v2_disk_dqheader), size);
+		quota_error(sb, "Failed header read: expected=%zd got=%zd",
+			    sizeof(struct v2_disk_dqheader), size);
 		return 0;
 	}
 	return 1;
@@ -106,8 +105,7 @@ static int v2_read_file_info(struct super_block *sb, int type)
 	size = sb->s_op->quota_read(sb, type, (char *)&dinfo,
 	       sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF);
 	if (size != sizeof(struct v2_disk_dqinfo)) {
-		q_warn(KERN_WARNING "quota_v2: Can't read info structure on device %s.\n",
-			sb->s_id);
+		quota_error(sb, "Can't read info structure");
 		return -1;
 	}
 	info->dqi_priv = kmalloc(sizeof(struct qtree_mem_dqinfo), GFP_NOFS);
@@ -167,8 +165,7 @@ static int v2_write_file_info(struct super_block *sb, int type)
 	size = sb->s_op->quota_write(sb, type, (char *)&dinfo,
 	       sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF);
 	if (size != sizeof(struct v2_disk_dqinfo)) {
-		q_warn(KERN_WARNING "Can't write info structure on device %s.\n",
-			sb->s_id);
+		quota_error(sb, "Can't write info structure");
 		return -1;
 	}
 	return 0;
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index d532c20fc17..9eead2c796b 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -146,9 +146,8 @@ static int ramfs_nommu_resize(struct inode *inode, loff_t newsize, loff_t size)
 			return ret;
 	}
 
-	ret = simple_setsize(inode, newsize);
-
-	return ret;
+	truncate_setsize(inode, newsize);
+	return 0;
 }
 
 /*****************************************************************************/
@@ -183,7 +182,7 @@ static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia)
 		}
 	}
 
-	generic_setattr(inode, ia);
+	setattr_copy(inode, ia);
  out:
 	ia->ia_valid = old_ia_valid;
 	return ret;
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index a5ebae70dc6..eacb166fb25 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -58,6 +58,7 @@ struct inode *ramfs_get_inode(struct super_block *sb,
 	struct inode * inode = new_inode(sb);
 
 	if (inode) {
+		inode->i_ino = get_next_ino();
 		inode_init_owner(inode, dir, mode);
 		inode->i_mapping->a_ops = &ramfs_aops;
 		inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info;
@@ -254,17 +255,16 @@ fail:
 	return err;
 }
 
-int ramfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+struct dentry *ramfs_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_nodev(fs_type, flags, data, ramfs_fill_super, mnt);
+	return mount_nodev(fs_type, flags, data, ramfs_fill_super);
 }
 
-static int rootfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *rootfs_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_nodev(fs_type, flags|MS_NOUSER, data, ramfs_fill_super,
-			    mnt);
+	return mount_nodev(fs_type, flags|MS_NOUSER, data, ramfs_fill_super);
 }
 
 static void ramfs_kill_sb(struct super_block *sb)
@@ -275,12 +275,12 @@ static void ramfs_kill_sb(struct super_block *sb)
 
 static struct file_system_type ramfs_fs_type = {
 	.name		= "ramfs",
-	.get_sb		= ramfs_get_sb,
+	.mount		= ramfs_mount,
 	.kill_sb	= ramfs_kill_sb,
 };
 static struct file_system_type rootfs_fs_type = {
 	.name		= "rootfs",
-	.get_sb		= rootfs_get_sb,
+	.mount		= rootfs_mount,
 	.kill_sb	= kill_litter_super,
 };
 
diff --git a/fs/read_write.c b/fs/read_write.c
index 9c0485236e6..431a0ed610c 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -31,6 +31,20 @@ const struct file_operations generic_ro_fops = {
 
 EXPORT_SYMBOL(generic_ro_fops);
 
+static int
+__negative_fpos_check(struct file *file, loff_t pos, size_t count)
+{
+	/*
+	 * pos or pos+count is negative here, check overflow.
+	 * too big "count" will be caught in rw_verify_area().
+	 */
+	if ((pos < 0) && (pos + count < pos))
+		return -EOVERFLOW;
+	if (file->f_mode & FMODE_UNSIGNED_OFFSET)
+		return 0;
+	return -EINVAL;
+}
+
 /**
  * generic_file_llseek_unlocked - lockless generic llseek implementation
  * @file:	file structure to seek on
@@ -62,7 +76,9 @@ generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin)
 		break;
 	}
 
-	if (offset < 0 || offset > inode->i_sb->s_maxbytes)
+	if (offset < 0 && __negative_fpos_check(file, offset, 0))
+		return -EINVAL;
+	if (offset > inode->i_sb->s_maxbytes)
 		return -EINVAL;
 
 	/* Special lock needed here? */
@@ -124,7 +140,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin)
 {
 	loff_t retval;
 
-	lock_kernel();
+	mutex_lock(&file->f_dentry->d_inode->i_mutex);
 	switch (origin) {
 		case SEEK_END:
 			offset += i_size_read(file->f_path.dentry->d_inode);
@@ -137,7 +153,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin)
 			offset += file->f_pos;
 	}
 	retval = -EINVAL;
-	if (offset >= 0) {
+	if (offset >= 0 || !__negative_fpos_check(file, offset, 0)) {
 		if (offset != file->f_pos) {
 			file->f_pos = offset;
 			file->f_version = 0;
@@ -145,7 +161,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin)
 		retval = offset;
 	}
 out:
-	unlock_kernel();
+	mutex_unlock(&file->f_dentry->d_inode->i_mutex);
 	return retval;
 }
 EXPORT_SYMBOL(default_llseek);
@@ -156,7 +172,6 @@ loff_t vfs_llseek(struct file *file, loff_t offset, int origin)
 
 	fn = no_llseek;
 	if (file->f_mode & FMODE_LSEEK) {
-		fn = default_llseek;
 		if (file->f_op && file->f_op->llseek)
 			fn = file->f_op->llseek;
 	}
@@ -222,13 +237,12 @@ bad:
 }
 #endif
 
+
 /*
  * rw_verify_area doesn't like huge counts. We limit
  * them to something that fits in "int" so that others
  * won't have to do range checks all the time.
  */
-#define MAX_RW_COUNT (INT_MAX & PAGE_CACHE_MASK)
-
 int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count)
 {
 	struct inode *inode;
@@ -239,8 +253,11 @@ int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count
 	if (unlikely((ssize_t) count < 0))
 		return retval;
 	pos = *ppos;
-	if (unlikely((pos < 0) || (loff_t) (pos + count) < 0))
-		return retval;
+	if (unlikely((pos < 0) || (loff_t) (pos + count) < 0)) {
+		retval = __negative_fpos_check(file, pos, count);
+		if (retval)
+			return retval;
+	}
 
 	if (unlikely(inode->i_flock && mandatory_lock(inode))) {
 		retval = locks_mandatory_area(
@@ -311,7 +328,7 @@ ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
 		else
 			ret = do_sync_read(file, buf, count, pos);
 		if (ret > 0) {
-			fsnotify_access(file->f_path.dentry);
+			fsnotify_access(file);
 			add_rchar(current, ret);
 		}
 		inc_syscr(current);
@@ -367,7 +384,7 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_
 		else
 			ret = do_sync_write(file, buf, count, pos);
 		if (ret > 0) {
-			fsnotify_modify(file->f_path.dentry);
+			fsnotify_modify(file);
 			add_wchar(current, ret);
 		}
 		inc_syscw(current);
@@ -565,65 +582,71 @@ ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
 			      unsigned long nr_segs, unsigned long fast_segs,
 			      struct iovec *fast_pointer,
 			      struct iovec **ret_pointer)
-  {
+{
 	unsigned long seg;
-  	ssize_t ret;
+	ssize_t ret;
 	struct iovec *iov = fast_pointer;
 
-  	/*
-  	 * SuS says "The readv() function *may* fail if the iovcnt argument
-  	 * was less than or equal to 0, or greater than {IOV_MAX}.  Linux has
-  	 * traditionally returned zero for zero segments, so...
-  	 */
+	/*
+	 * SuS says "The readv() function *may* fail if the iovcnt argument
+	 * was less than or equal to 0, or greater than {IOV_MAX}.  Linux has
+	 * traditionally returned zero for zero segments, so...
+	 */
 	if (nr_segs == 0) {
 		ret = 0;
-  		goto out;
+		goto out;
 	}
 
-  	/*
-  	 * First get the "struct iovec" from user memory and
-  	 * verify all the pointers
-  	 */
+	/*
+	 * First get the "struct iovec" from user memory and
+	 * verify all the pointers
+	 */
 	if (nr_segs > UIO_MAXIOV) {
 		ret = -EINVAL;
-  		goto out;
+		goto out;
 	}
 	if (nr_segs > fast_segs) {
-  		iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
+		iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
 		if (iov == NULL) {
 			ret = -ENOMEM;
-  			goto out;
+			goto out;
 		}
-  	}
+	}
 	if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
 		ret = -EFAULT;
-  		goto out;
+		goto out;
 	}
 
-  	/*
+	/*
 	 * According to the Single Unix Specification we should return EINVAL
 	 * if an element length is < 0 when cast to ssize_t or if the
 	 * total length would overflow the ssize_t return value of the
 	 * system call.
-  	 */
+	 *
+	 * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
+	 * overflow case.
+	 */
 	ret = 0;
-  	for (seg = 0; seg < nr_segs; seg++) {
-  		void __user *buf = iov[seg].iov_base;
-  		ssize_t len = (ssize_t)iov[seg].iov_len;
+	for (seg = 0; seg < nr_segs; seg++) {
+		void __user *buf = iov[seg].iov_base;
+		ssize_t len = (ssize_t)iov[seg].iov_len;
 
 		/* see if we we're about to use an invalid len or if
 		 * it's about to overflow ssize_t */
-		if (len < 0 || (ret + len < ret)) {
+		if (len < 0) {
 			ret = -EINVAL;
-  			goto out;
+			goto out;
 		}
 		if (unlikely(!access_ok(vrfy_dir(type), buf, len))) {
 			ret = -EFAULT;
-  			goto out;
+			goto out;
+		}
+		if (len > MAX_RW_COUNT - ret) {
+			len = MAX_RW_COUNT - ret;
+			iov[seg].iov_len = len;
 		}
-
 		ret += len;
-  	}
+	}
 out:
 	*ret_pointer = iov;
 	return ret;
@@ -675,9 +698,9 @@ out:
 		kfree(iov);
 	if ((ret + (type == READ)) > 0) {
 		if (type == READ)
-			fsnotify_access(file->f_path.dentry);
+			fsnotify_access(file);
 		else
-			fsnotify_modify(file->f_path.dentry);
+			fsnotify_modify(file);
 	}
 	return ret;
 }
diff --git a/fs/readdir.c b/fs/readdir.c
index 7723401f8d8..356f71528ad 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -4,6 +4,7 @@
  *  Copyright (C) 1995  Linus Torvalds
  */
 
+#include <linux/stddef.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/time.h>
@@ -54,7 +55,6 @@ EXPORT_SYMBOL(vfs_readdir);
  * anyway. Thus the special "fillonedir()" function for that
  * case (the low-level handlers don't need to care about this).
  */
-#define NAME_OFFSET(de) ((int) ((de)->d_name - (char __user *) (de)))
 
 #ifdef __ARCH_WANT_OLD_READDIR
 
@@ -152,7 +152,8 @@ static int filldir(void * __buf, const char * name, int namlen, loff_t offset,
 	struct linux_dirent __user * dirent;
 	struct getdents_callback * buf = (struct getdents_callback *) __buf;
 	unsigned long d_ino;
-	int reclen = ALIGN(NAME_OFFSET(dirent) + namlen + 2, sizeof(long));
+	int reclen = ALIGN(offsetof(struct linux_dirent, d_name) + namlen + 2,
+		sizeof(long));
 
 	buf->error = -EINVAL;	/* only used if we fail.. */
 	if (reclen > buf->count)
@@ -237,7 +238,8 @@ static int filldir64(void * __buf, const char * name, int namlen, loff_t offset,
 {
 	struct linux_dirent64 __user *dirent;
 	struct getdents_callback64 * buf = (struct getdents_callback64 *) __buf;
-	int reclen = ALIGN(NAME_OFFSET(dirent) + namlen + 1, sizeof(u64));
+	int reclen = ALIGN(offsetof(struct linux_dirent64, d_name) + namlen + 1,
+		sizeof(u64));
 
 	buf->error = -EINVAL;	/* only used if we fail.. */
 	if (reclen > buf->count)
diff --git a/fs/reiserfs/Kconfig b/fs/reiserfs/Kconfig
index 513f431038f..7cd46666ba2 100644
--- a/fs/reiserfs/Kconfig
+++ b/fs/reiserfs/Kconfig
@@ -10,7 +10,8 @@ config REISERFS_FS
 
 	  In general, ReiserFS is as fast as ext2, but is very efficient with
 	  large directories and small files.  Additional patches are needed
-	  for NFS and quotas, please see <http://www.namesys.com/> for links.
+	  for NFS and quotas, please see 
+	  <https://reiser4.wiki.kernel.org/index.php/Main_Page> for links.
 
 	  It is more easily extended to have features currently found in
 	  database and keyword search systems than block allocation based file
@@ -18,7 +19,8 @@ config REISERFS_FS
 	  plugins consistent with our motto ``It takes more than a license to
 	  make source code open.''
 
-	  Read <http://www.namesys.com/> to learn more about reiserfs.
+	  Read <https://reiser4.wiki.kernel.org/index.php/Main_Page> 
+	  to learn more about reiserfs.
 
 	  Sponsored by Threshold Networks, Emusic.com, and Bigstorage.com.
 
diff --git a/fs/reiserfs/README b/fs/reiserfs/README
index 14e8c9d460e..e2f7a264e3f 100644
--- a/fs/reiserfs/README
+++ b/fs/reiserfs/README
@@ -43,7 +43,7 @@ to address the fair crediting issue in the next GPL version.)
 [END LICENSING]
 
 Reiserfs is a file system based on balanced tree algorithms, which is
-described at http://devlinux.com/namesys.
+described at https://reiser4.wiki.kernel.org/index.php/Main_Page 
 
 Stop reading here.  Go there, then return.
 
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index b82cdd8a45d..91f080cc76c 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -38,20 +38,24 @@ static int reiserfs_file_release(struct inode *inode, struct file *filp)
 
 	BUG_ON(!S_ISREG(inode->i_mode));
 
+        if (atomic_add_unless(&REISERFS_I(inode)->openers, -1, 1))
+		return 0;
+
+	mutex_lock(&(REISERFS_I(inode)->tailpack));
+
+        if (!atomic_dec_and_test(&REISERFS_I(inode)->openers)) {
+		mutex_unlock(&(REISERFS_I(inode)->tailpack));
+		return 0;
+	}
+
 	/* fast out for when nothing needs to be done */
-	if ((atomic_read(&inode->i_count) > 1 ||
-	     !(REISERFS_I(inode)->i_flags & i_pack_on_close_mask) ||
+	if ((!(REISERFS_I(inode)->i_flags & i_pack_on_close_mask) ||
 	     !tail_has_to_be_packed(inode)) &&
 	    REISERFS_I(inode)->i_prealloc_count <= 0) {
+		mutex_unlock(&(REISERFS_I(inode)->tailpack));
 		return 0;
 	}
 
-	mutex_lock(&inode->i_mutex);
-
-	mutex_lock(&(REISERFS_I(inode)->i_mmap));
-	if (REISERFS_I(inode)->i_flags & i_ever_mapped)
-		REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
-
 	reiserfs_write_lock(inode->i_sb);
 	/* freeing preallocation only involves relogging blocks that
 	 * are already in the current transaction.  preallocation gets
@@ -94,9 +98,10 @@ static int reiserfs_file_release(struct inode *inode, struct file *filp)
 	if (!err)
 		err = jbegin_failure;
 
-	if (!err && atomic_read(&inode->i_count) <= 1 &&
+	if (!err &&
 	    (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) &&
 	    tail_has_to_be_packed(inode)) {
+
 		/* if regular file is released by last holder and it has been
 		   appended (we append by unformatted node only) or its direct
 		   item(s) had to be converted, then it may have to be
@@ -104,27 +109,28 @@ static int reiserfs_file_release(struct inode *inode, struct file *filp)
 		err = reiserfs_truncate_file(inode, 0);
 	}
       out:
-	mutex_unlock(&(REISERFS_I(inode)->i_mmap));
-	mutex_unlock(&inode->i_mutex);
 	reiserfs_write_unlock(inode->i_sb);
+	mutex_unlock(&(REISERFS_I(inode)->tailpack));
 	return err;
 }
 
-static int reiserfs_file_mmap(struct file *file, struct vm_area_struct *vma)
+static int reiserfs_file_open(struct inode *inode, struct file *file)
 {
-	struct inode *inode;
-
-	inode = file->f_path.dentry->d_inode;
-	mutex_lock(&(REISERFS_I(inode)->i_mmap));
-	REISERFS_I(inode)->i_flags |= i_ever_mapped;
-	mutex_unlock(&(REISERFS_I(inode)->i_mmap));
-
-	return generic_file_mmap(file, vma);
+	int err = dquot_file_open(inode, file);
+        if (!atomic_inc_not_zero(&REISERFS_I(inode)->openers)) {
+		/* somebody might be tailpacking on final close; wait for it */
+		mutex_lock(&(REISERFS_I(inode)->tailpack));
+		atomic_inc(&REISERFS_I(inode)->openers);
+		mutex_unlock(&(REISERFS_I(inode)->tailpack));
+	}
+	return err;
 }
 
 static void reiserfs_vfs_truncate_file(struct inode *inode)
 {
+	mutex_lock(&(REISERFS_I(inode)->tailpack));
 	reiserfs_truncate_file(inode, 1);
+	mutex_unlock(&(REISERFS_I(inode)->tailpack));
 }
 
 /* Sync a reiserfs file. */
@@ -146,8 +152,7 @@ static int reiserfs_sync_file(struct file *filp, int datasync)
 	barrier_done = reiserfs_commit_for_inode(inode);
 	reiserfs_write_unlock(inode->i_sb);
 	if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb))
-		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL, 
-			BLKDEV_IFL_WAIT);
+		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
 	if (barrier_done < 0)
 		return barrier_done;
 	return (err < 0) ? -EIO : 0;
@@ -288,8 +293,8 @@ const struct file_operations reiserfs_file_operations = {
 #ifdef CONFIG_COMPAT
 	.compat_ioctl = reiserfs_compat_ioctl,
 #endif
-	.mmap = reiserfs_file_mmap,
-	.open = dquot_file_open,
+	.mmap = generic_file_mmap,
+	.open = reiserfs_file_open,
 	.release = reiserfs_file_release,
 	.fsync = reiserfs_sync_file,
 	.aio_read = generic_file_aio_read,
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 0f22fdaf54a..41656d40dc5 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -22,10 +22,8 @@
 
 int reiserfs_commit_write(struct file *f, struct page *page,
 			  unsigned from, unsigned to);
-int reiserfs_prepare_write(struct file *f, struct page *page,
-			   unsigned from, unsigned to);
 
-void reiserfs_delete_inode(struct inode *inode)
+void reiserfs_evict_inode(struct inode *inode)
 {
 	/* We need blocks for transaction + (user+group) quota update (possibly delete) */
 	int jbegin_count =
@@ -35,10 +33,12 @@ void reiserfs_delete_inode(struct inode *inode)
 	int depth;
 	int err;
 
-	if (!is_bad_inode(inode))
+	if (!inode->i_nlink && !is_bad_inode(inode))
 		dquot_initialize(inode);
 
 	truncate_inode_pages(&inode->i_data, 0);
+	if (inode->i_nlink)
+		goto no_delete;
 
 	depth = reiserfs_write_lock_once(inode->i_sb);
 
@@ -77,9 +77,15 @@ void reiserfs_delete_inode(struct inode *inode)
 		;
 	}
       out:
-	clear_inode(inode);	/* note this must go after the journal_end to prevent deadlock */
+	end_writeback(inode);	/* note this must go after the journal_end to prevent deadlock */
+	dquot_drop(inode);
 	inode->i_blocks = 0;
 	reiserfs_write_unlock_once(inode->i_sb, depth);
+	return;
+
+no_delete:
+	end_writeback(inode);
+	dquot_drop(inode);
 }
 
 static void _make_cpu_key(struct cpu_key *key, int version, __u32 dirid,
@@ -157,7 +163,7 @@ inline void make_le_item_head(struct item_head *ih, const struct cpu_key *key,
 ** but tail is still sitting in a direct item, and we can't write to
 ** it.  So, look through this page, and check all the mapped buffers
 ** to make sure they have valid block numbers.  Any that don't need
-** to be unmapped, so that block_prepare_write will correctly call
+** to be unmapped, so that __block_write_begin will correctly call
 ** reiserfs_get_block to convert the tail into an unformatted node
 */
 static inline void fix_tail_page_for_writing(struct page *page)
@@ -431,13 +437,13 @@ static int reiserfs_bmap(struct inode *inode, sector_t block,
 }
 
 /* special version of get_block that is only used by grab_tail_page right
-** now.  It is sent to block_prepare_write, and when you try to get a
+** now.  It is sent to __block_write_begin, and when you try to get a
 ** block past the end of the file (or a block from a hole) it returns
-** -ENOENT instead of a valid buffer.  block_prepare_write expects to
+** -ENOENT instead of a valid buffer.  __block_write_begin expects to
 ** be able to do i/o on the buffers returned, unless an error value
 ** is also returned.
 **
-** So, this allows block_prepare_write to be used for reading a single block
+** So, this allows __block_write_begin to be used for reading a single block
 ** in a page.  Where it does not produce a valid page for holes, or past the
 ** end of the file.  This turns out to be exactly what we need for reading
 ** tails for conversion.
@@ -550,11 +556,12 @@ static int convert_tail_for_hole(struct inode *inode,
 	 **
 	 ** We must fix the tail page for writing because it might have buffers
 	 ** that are mapped, but have a block number of 0.  This indicates tail
-	 ** data that has been read directly into the page, and block_prepare_write
-	 ** won't trigger a get_block in this case.
+	 ** data that has been read directly into the page, and
+	 ** __block_write_begin won't trigger a get_block in this case.
 	 */
 	fix_tail_page_for_writing(tail_page);
-	retval = reiserfs_prepare_write(NULL, tail_page, tail_start, tail_end);
+	retval = __reiserfs_write_begin(tail_page, tail_start,
+				      tail_end - tail_start);
 	if (retval)
 		goto unlock;
 
@@ -1138,7 +1145,6 @@ static void init_inode(struct inode *inode, struct treepath *path)
 	REISERFS_I(inode)->i_prealloc_count = 0;
 	REISERFS_I(inode)->i_trans_id = 0;
 	REISERFS_I(inode)->i_jl = NULL;
-	mutex_init(&(REISERFS_I(inode)->i_mmap));
 	reiserfs_init_xattr_rwsem(inode);
 
 	if (stat_data_v1(ih)) {
@@ -1221,7 +1227,7 @@ static void init_inode(struct inode *inode, struct treepath *path)
 		inode_set_bytes(inode,
 				to_real_used_space(inode, inode->i_blocks,
 						   SD_V2_SIZE));
-		/* read persistent inode attributes from sd and initalise
+		/* read persistent inode attributes from sd and initialise
 		   generic inode flags from them */
 		REISERFS_I(inode)->i_attrs = sd_v2_attrs(sd);
 		sd_attrs_to_i_attrs(sd_v2_attrs(sd), inode);
@@ -1841,7 +1847,6 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
 	REISERFS_I(inode)->i_attrs =
 	    REISERFS_I(dir)->i_attrs & REISERFS_INHERIT_MASK;
 	sd_attrs_to_i_attrs(REISERFS_I(inode)->i_attrs, inode);
-	mutex_init(&(REISERFS_I(inode)->i_mmap));
 	reiserfs_init_xattr_rwsem(inode);
 
 	/* key to search for correct place for new stat data */
@@ -2027,7 +2032,7 @@ static int grab_tail_page(struct inode *inode,
 	/* start within the page of the last block in the file */
 	start = (offset / blocksize) * blocksize;
 
-	error = block_prepare_write(page, start, offset,
+	error = __block_write_begin(page, start, offset - start,
 				    reiserfs_get_block_create_0);
 	if (error)
 		goto unlock;
@@ -2432,7 +2437,7 @@ static int reiserfs_write_full_page(struct page *page,
 		/* from this point on, we know the buffer is mapped to a
 		 * real block and not a direct item
 		 */
-		if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
+		if (wbc->sync_mode != WB_SYNC_NONE) {
 			lock_buffer(bh);
 		} else {
 			if (!trylock_buffer(bh)) {
@@ -2587,8 +2592,7 @@ static int reiserfs_write_begin(struct file *file,
 		old_ref = th->t_refcount;
 		th->t_refcount++;
 	}
-	ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
-				reiserfs_get_block);
+	ret = __block_write_begin(page, pos, len, reiserfs_get_block);
 	if (ret && reiserfs_transaction_running(inode->i_sb)) {
 		struct reiserfs_transaction_handle *th = current->journal_info;
 		/* this gets a little ugly.  If reiserfs_get_block returned an
@@ -2623,8 +2627,7 @@ static int reiserfs_write_begin(struct file *file,
 	return ret;
 }
 
-int reiserfs_prepare_write(struct file *f, struct page *page,
-			   unsigned from, unsigned to)
+int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len)
 {
 	struct inode *inode = page->mapping->host;
 	int ret;
@@ -2645,7 +2648,7 @@ int reiserfs_prepare_write(struct file *f, struct page *page,
 		th->t_refcount++;
 	}
 
-	ret = block_prepare_write(page, from, to, reiserfs_get_block);
+	ret = __block_write_begin(page, from, len, reiserfs_get_block);
 	if (ret && reiserfs_transaction_running(inode->i_sb)) {
 		struct reiserfs_transaction_handle *th = current->journal_info;
 		/* this gets a little ugly.  If reiserfs_get_block returned an
@@ -3059,10 +3062,25 @@ static ssize_t reiserfs_direct_IO(int rw, struct kiocb *iocb,
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
+	ssize_t ret;
 
-	return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
+	ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
 				  offset, nr_segs,
 				  reiserfs_get_blocks_direct_io, NULL);
+
+	/*
+	 * In case of error extending write may have instantiated a few
+	 * blocks outside i_size. Trim these off again.
+	 */
+	if (unlikely((rw & WRITE) && ret < 0)) {
+		loff_t isize = i_size_read(inode);
+		loff_t end = offset + iov_length(iov, nr_segs);
+
+		if (end > isize)
+			vmtruncate(inode, isize);
+	}
+
+	return ret;
 }
 
 int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
@@ -3072,6 +3090,10 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
 	int depth;
 	int error;
 
+	error = inode_change_ok(inode, attr);
+	if (error)
+		return error;
+
 	/* must be turned off for recursive notify_change calls */
 	ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID);
 
@@ -3121,55 +3143,58 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
 		goto out;
 	}
 
-	error = inode_change_ok(inode, attr);
-	if (!error) {
-		if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
-		    (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
-			error = reiserfs_chown_xattrs(inode, attr);
+	if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
+	    (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
+		struct reiserfs_transaction_handle th;
+		int jbegin_count =
+		    2 *
+		    (REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb) +
+		     REISERFS_QUOTA_DEL_BLOCKS(inode->i_sb)) +
+		    2;
 
-			if (!error) {
-				struct reiserfs_transaction_handle th;
-				int jbegin_count =
-				    2 *
-				    (REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb) +
-				     REISERFS_QUOTA_DEL_BLOCKS(inode->i_sb)) +
-				    2;
-
-				/* (user+group)*(old+new) structure - we count quota info and , inode write (sb, inode) */
-				error =
-				    journal_begin(&th, inode->i_sb,
-						  jbegin_count);
-				if (error)
-					goto out;
-				error = dquot_transfer(inode, attr);
-				if (error) {
-					journal_end(&th, inode->i_sb,
-						    jbegin_count);
-					goto out;
-				}
-				/* Update corresponding info in inode so that everything is in
-				 * one transaction */
-				if (attr->ia_valid & ATTR_UID)
-					inode->i_uid = attr->ia_uid;
-				if (attr->ia_valid & ATTR_GID)
-					inode->i_gid = attr->ia_gid;
-				mark_inode_dirty(inode);
-				error =
-				    journal_end(&th, inode->i_sb, jbegin_count);
-			}
-		}
-		if (!error) {
-			/*
-			 * Relax the lock here, as it might truncate the
-			 * inode pages and wait for inode pages locks.
-			 * To release such page lock, the owner needs the
-			 * reiserfs lock
-			 */
-			reiserfs_write_unlock_once(inode->i_sb, depth);
-			error = inode_setattr(inode, attr);
-			depth = reiserfs_write_lock_once(inode->i_sb);
+		error = reiserfs_chown_xattrs(inode, attr);
+
+		if (error)
+			return error;
+
+		/* (user+group)*(old+new) structure - we count quota info and , inode write (sb, inode) */
+		error = journal_begin(&th, inode->i_sb, jbegin_count);
+		if (error)
+			goto out;
+		error = dquot_transfer(inode, attr);
+		if (error) {
+			journal_end(&th, inode->i_sb, jbegin_count);
+			goto out;
 		}
+
+		/* Update corresponding info in inode so that everything is in
+		 * one transaction */
+		if (attr->ia_valid & ATTR_UID)
+			inode->i_uid = attr->ia_uid;
+		if (attr->ia_valid & ATTR_GID)
+			inode->i_gid = attr->ia_gid;
+		mark_inode_dirty(inode);
+		error = journal_end(&th, inode->i_sb, jbegin_count);
+		if (error)
+			goto out;
+	}
+
+	/*
+	 * Relax the lock here, as it might truncate the
+	 * inode pages and wait for inode pages locks.
+	 * To release such page lock, the owner needs the
+	 * reiserfs lock
+	 */
+	reiserfs_write_unlock_once(inode->i_sb, depth);
+	if ((attr->ia_valid & ATTR_SIZE) &&
+	    attr->ia_size != i_size_read(inode))
+		error = vmtruncate(inode, attr->ia_size);
+
+	if (!error) {
+		setattr_copy(inode, attr);
+		mark_inode_dirty(inode);
 	}
+	depth = reiserfs_write_lock_once(inode->i_sb);
 
 	if (!error && reiserfs_posixacl(inode->i_sb)) {
 		if (attr->ia_valid & ATTR_MODE)
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index f53505de071..adf22b485ce 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -160,8 +160,6 @@ long reiserfs_compat_ioctl(struct file *file, unsigned int cmd,
 
 int reiserfs_commit_write(struct file *f, struct page *page,
 			  unsigned from, unsigned to);
-int reiserfs_prepare_write(struct file *f, struct page *page,
-			   unsigned from, unsigned to);
 /*
 ** reiserfs_unpack
 ** Function try to convert tail from direct item into indirect.
@@ -170,6 +168,7 @@ int reiserfs_prepare_write(struct file *f, struct page *page,
 int reiserfs_unpack(struct inode *inode, struct file *filp)
 {
 	int retval = 0;
+	int depth;
 	int index;
 	struct page *page;
 	struct address_space *mapping;
@@ -188,8 +187,8 @@ int reiserfs_unpack(struct inode *inode, struct file *filp)
 	/* we need to make sure nobody is changing the file size beneath
 	 ** us
 	 */
-	mutex_lock(&inode->i_mutex);
-	reiserfs_write_lock(inode->i_sb);
+	reiserfs_mutex_lock_safe(&inode->i_mutex, inode->i_sb);
+	depth = reiserfs_write_lock_once(inode->i_sb);
 
 	write_from = inode->i_size & (blocksize - 1);
 	/* if we are on a block boundary, we are already unpacked.  */
@@ -199,7 +198,7 @@ int reiserfs_unpack(struct inode *inode, struct file *filp)
 	}
 
 	/* we unpack by finding the page with the tail, and calling
-	 ** reiserfs_prepare_write on that page.  This will force a
+	 ** __reiserfs_write_begin on that page.  This will force a
 	 ** reiserfs_get_block to unpack the tail for us.
 	 */
 	index = inode->i_size >> PAGE_CACHE_SHIFT;
@@ -209,7 +208,7 @@ int reiserfs_unpack(struct inode *inode, struct file *filp)
 	if (!page) {
 		goto out;
 	}
-	retval = reiserfs_prepare_write(NULL, page, write_from, write_from);
+	retval = __reiserfs_write_begin(page, write_from, 0);
 	if (retval)
 		goto out_unlock;
 
@@ -224,6 +223,6 @@ int reiserfs_unpack(struct inode *inode, struct file *filp)
 
       out:
 	mutex_unlock(&inode->i_mutex);
-	reiserfs_write_unlock(inode->i_sb);
+	reiserfs_write_unlock_once(inode->i_sb, depth);
 	return retval;
 }
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 19fbc810e8e..076c8b19468 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -138,13 +138,6 @@ static int reiserfs_clean_and_file_buffer(struct buffer_head *bh)
 	return 0;
 }
 
-static void disable_barrier(struct super_block *s)
-{
-	REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_BARRIER_FLUSH);
-	printk("reiserfs: disabling flush barriers on %s\n",
-	       reiserfs_bdevname(s));
-}
-
 static struct reiserfs_bitmap_node *allocate_bitmap_node(struct super_block
 							 *sb)
 {
@@ -677,30 +670,6 @@ static void submit_ordered_buffer(struct buffer_head *bh)
 	submit_bh(WRITE, bh);
 }
 
-static int submit_barrier_buffer(struct buffer_head *bh)
-{
-	get_bh(bh);
-	bh->b_end_io = reiserfs_end_ordered_io;
-	clear_buffer_dirty(bh);
-	if (!buffer_uptodate(bh))
-		BUG();
-	return submit_bh(WRITE_BARRIER, bh);
-}
-
-static void check_barrier_completion(struct super_block *s,
-				     struct buffer_head *bh)
-{
-	if (buffer_eopnotsupp(bh)) {
-		clear_buffer_eopnotsupp(bh);
-		disable_barrier(s);
-		set_buffer_uptodate(bh);
-		set_buffer_dirty(bh);
-		reiserfs_write_unlock(s);
-		sync_dirty_buffer(bh);
-		reiserfs_write_lock(s);
-	}
-}
-
 #define CHUNK_SIZE 32
 struct buffer_chunk {
 	struct buffer_head *bh[CHUNK_SIZE];
@@ -983,7 +952,6 @@ static int flush_older_commits(struct super_block *s,
 
 static int reiserfs_async_progress_wait(struct super_block *s)
 {
-	DEFINE_WAIT(wait);
 	struct reiserfs_journal *j = SB_JOURNAL(s);
 
 	if (atomic_read(&j->j_async_throttle)) {
@@ -1010,7 +978,6 @@ static int flush_commit_list(struct super_block *s,
 	struct buffer_head *tbh = NULL;
 	unsigned int trans_id = jl->j_trans_id;
 	struct reiserfs_journal *journal = SB_JOURNAL(s);
-	int barrier = 0;
 	int retval = 0;
 	int write_len;
 
@@ -1095,24 +1062,6 @@ static int flush_commit_list(struct super_block *s,
 	}
 	atomic_dec(&journal->j_async_throttle);
 
-	/* We're skipping the commit if there's an error */
-	if (retval || reiserfs_is_journal_aborted(journal))
-		barrier = 0;
-
-	/* wait on everything written so far before writing the commit
-	 * if we are in barrier mode, send the commit down now
-	 */
-	barrier = reiserfs_barrier_flush(s);
-	if (barrier) {
-		int ret;
-		lock_buffer(jl->j_commit_bh);
-		ret = submit_barrier_buffer(jl->j_commit_bh);
-		if (ret == -EOPNOTSUPP) {
-			set_buffer_uptodate(jl->j_commit_bh);
-			disable_barrier(s);
-			barrier = 0;
-		}
-	}
 	for (i = 0; i < (jl->j_len + 1); i++) {
 		bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) +
 		    (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s);
@@ -1144,27 +1093,22 @@ static int flush_commit_list(struct super_block *s,
 
 	BUG_ON(atomic_read(&(jl->j_commit_left)) != 1);
 
-	if (!barrier) {
-		/* If there was a write error in the journal - we can't commit
-		 * this transaction - it will be invalid and, if successful,
-		 * will just end up propagating the write error out to
-		 * the file system. */
-		if (likely(!retval && !reiserfs_is_journal_aborted (journal))) {
-			if (buffer_dirty(jl->j_commit_bh))
-				BUG();
-			mark_buffer_dirty(jl->j_commit_bh) ;
-			reiserfs_write_unlock(s);
-			sync_dirty_buffer(jl->j_commit_bh) ;
-			reiserfs_write_lock(s);
-		}
-	} else {
+	/* If there was a write error in the journal - we can't commit
+	 * this transaction - it will be invalid and, if successful,
+	 * will just end up propagating the write error out to
+	 * the file system. */
+	if (likely(!retval && !reiserfs_is_journal_aborted (journal))) {
+		if (buffer_dirty(jl->j_commit_bh))
+			BUG();
+		mark_buffer_dirty(jl->j_commit_bh) ;
 		reiserfs_write_unlock(s);
-		wait_on_buffer(jl->j_commit_bh);
+		if (reiserfs_barrier_flush(s))
+			__sync_dirty_buffer(jl->j_commit_bh, WRITE_FLUSH_FUA);
+		else
+			sync_dirty_buffer(jl->j_commit_bh);
 		reiserfs_write_lock(s);
 	}
 
-	check_barrier_completion(s, jl->j_commit_bh);
-
 	/* If there was a write error in the journal - we can't commit this
 	 * transaction - it will be invalid and, if successful, will just end
 	 * up propagating the write error out to the filesystem. */
@@ -1320,26 +1264,15 @@ static int _update_journal_header_block(struct super_block *sb,
 		jh->j_first_unflushed_offset = cpu_to_le32(offset);
 		jh->j_mount_id = cpu_to_le32(journal->j_mount_id);
 
-		if (reiserfs_barrier_flush(sb)) {
-			int ret;
-			lock_buffer(journal->j_header_bh);
-			ret = submit_barrier_buffer(journal->j_header_bh);
-			if (ret == -EOPNOTSUPP) {
-				set_buffer_uptodate(journal->j_header_bh);
-				disable_barrier(sb);
-				goto sync;
-			}
-			reiserfs_write_unlock(sb);
-			wait_on_buffer(journal->j_header_bh);
-			reiserfs_write_lock(sb);
-			check_barrier_completion(sb, journal->j_header_bh);
-		} else {
-		      sync:
-			set_buffer_dirty(journal->j_header_bh);
-			reiserfs_write_unlock(sb);
+		set_buffer_dirty(journal->j_header_bh);
+		reiserfs_write_unlock(sb);
+
+		if (reiserfs_barrier_flush(sb))
+			__sync_dirty_buffer(journal->j_header_bh, WRITE_FLUSH_FUA);
+		else
 			sync_dirty_buffer(journal->j_header_bh);
-			reiserfs_write_lock(sb);
-		}
+
+		reiserfs_write_lock(sb);
 		if (!buffer_uptodate(journal->j_header_bh)) {
 			reiserfs_warning(sb, "journal-837",
 					 "IO error during journal replay");
@@ -2312,7 +2245,7 @@ static int journal_read_transaction(struct super_block *sb,
 	/* flush out the real blocks */
 	for (i = 0; i < get_desc_trans_len(desc); i++) {
 		set_buffer_dirty(real_blocks[i]);
-		ll_rw_block(SWRITE, 1, real_blocks + i);
+		write_dirty_buffer(real_blocks[i], WRITE);
 	}
 	for (i = 0; i < get_desc_trans_len(desc); i++) {
 		wait_on_buffer(real_blocks[i]);
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index ee78d4a0086..ba5f51ec345 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -1156,7 +1156,7 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir,
 	inode->i_ctime = CURRENT_TIME_SEC;
 	reiserfs_update_sd(&th, inode);
 
-	atomic_inc(&inode->i_count);
+	ihold(inode);
 	d_instantiate(dentry, inode);
 	retval = journal_end(&th, dir->i_sb, jbegin_count);
 	reiserfs_write_unlock(dir->i_sb);
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 9822fa15118..3bf7a6457f4 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -525,6 +525,8 @@ static struct inode *reiserfs_alloc_inode(struct super_block *sb)
 	    kmem_cache_alloc(reiserfs_inode_cachep, GFP_KERNEL);
 	if (!ei)
 		return NULL;
+	atomic_set(&ei->openers, 0);
+	mutex_init(&ei->tailpack);
 	return &ei->vfs_inode;
 }
 
@@ -589,11 +591,6 @@ out:
 	reiserfs_write_unlock_once(inode->i_sb, lock_depth);
 }
 
-static void reiserfs_clear_inode(struct inode *inode)
-{
-	dquot_drop(inode);
-}
-
 #ifdef CONFIG_QUOTA
 static ssize_t reiserfs_quota_write(struct super_block *, int, const char *,
 				    size_t, loff_t);
@@ -606,8 +603,7 @@ static const struct super_operations reiserfs_sops = {
 	.destroy_inode = reiserfs_destroy_inode,
 	.write_inode = reiserfs_write_inode,
 	.dirty_inode = reiserfs_dirty_inode,
-	.clear_inode = reiserfs_clear_inode,
-	.delete_inode = reiserfs_delete_inode,
+	.evict_inode = reiserfs_evict_inode,
 	.put_super = reiserfs_put_super,
 	.write_super = reiserfs_write_super,
 	.sync_fs = reiserfs_sync_fs,
@@ -2217,12 +2213,11 @@ out:
 
 #endif
 
-static int get_super_block(struct file_system_type *fs_type,
+static struct dentry *get_super_block(struct file_system_type *fs_type,
 			   int flags, const char *dev_name,
-			   void *data, struct vfsmount *mnt)
+			   void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, reiserfs_fill_super,
-			   mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, reiserfs_fill_super);
 }
 
 static int __init init_reiserfs_fs(void)
@@ -2257,7 +2252,7 @@ static void __exit exit_reiserfs_fs(void)
 struct file_system_type reiserfs_fs_type = {
 	.owner = THIS_MODULE,
 	.name = "reiserfs",
-	.get_sb = get_super_block,
+	.mount = get_super_block,
 	.kill_sb = reiserfs_kill_sb,
 	.fs_flags = FS_REQUIRES_DEV,
 };
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 8c4cf273c67..5d04a7828e7 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -418,13 +418,11 @@ static inline __u32 xattr_hash(const char *msg, int len)
 
 int reiserfs_commit_write(struct file *f, struct page *page,
 			  unsigned from, unsigned to);
-int reiserfs_prepare_write(struct file *f, struct page *page,
-			   unsigned from, unsigned to);
 
 static void update_ctime(struct inode *inode)
 {
 	struct timespec now = current_fs_time(inode->i_sb);
-	if (hlist_unhashed(&inode->i_hash) || !inode->i_nlink ||
+	if (inode_unhashed(inode) || !inode->i_nlink ||
 	    timespec_equal(&inode->i_ctime, &now))
 		return;
 
@@ -532,8 +530,7 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th,
 			rxh->h_hash = cpu_to_le32(xahash);
 		}
 
-		err = reiserfs_prepare_write(NULL, page, page_offset,
-					    page_offset + chunk + skip);
+		err = __reiserfs_write_begin(page, page_offset, chunk + skip);
 		if (!err) {
 			if (buffer)
 				memcpy(data + skip, buffer + buffer_pos, chunk);
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 42d21354689..6647f90e55c 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -282,6 +282,7 @@ error:
 static const struct file_operations romfs_dir_operations = {
 	.read		= generic_read_dir,
 	.readdir	= romfs_readdir,
+	.llseek		= default_llseek,
 };
 
 static const struct inode_operations romfs_dir_inode_operations = {
@@ -551,20 +552,19 @@ error_rsb:
 /*
  * get a superblock for mounting
  */
-static int romfs_get_sb(struct file_system_type *fs_type,
+static struct dentry *romfs_mount(struct file_system_type *fs_type,
 			int flags, const char *dev_name,
-			void *data, struct vfsmount *mnt)
+			void *data)
 {
-	int ret = -EINVAL;
+	struct dentry *ret = ERR_PTR(-EINVAL);
 
 #ifdef CONFIG_ROMFS_ON_MTD
-	ret = get_sb_mtd(fs_type, flags, dev_name, data, romfs_fill_super,
-			 mnt);
+	ret = mount_mtd(fs_type, flags, dev_name, data, romfs_fill_super);
 #endif
 #ifdef CONFIG_ROMFS_ON_BLOCK
-	if (ret == -EINVAL)
-		ret = get_sb_bdev(fs_type, flags, dev_name, data,
-				  romfs_fill_super, mnt);
+	if (ret == ERR_PTR(-EINVAL))
+		ret = mount_bdev(fs_type, flags, dev_name, data,
+				  romfs_fill_super);
 #endif
 	return ret;
 }
@@ -591,7 +591,7 @@ static void romfs_kill_sb(struct super_block *sb)
 static struct file_system_type romfs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "romfs",
-	.get_sb		= romfs_get_sb,
+	.mount		= romfs_mount,
 	.kill_sb	= romfs_kill_sb,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
diff --git a/fs/select.c b/fs/select.c
index 500a669f779..b7b10aa3086 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -67,7 +67,7 @@ static long __estimate_accuracy(struct timespec *tv)
 	return slack;
 }
 
-static long estimate_accuracy(struct timespec *tv)
+long select_estimate_accuracy(struct timespec *tv)
 {
 	unsigned long ret;
 	struct timespec now;
@@ -417,7 +417,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
 	}
 
 	if (end_time && !timed_out)
-		slack = estimate_accuracy(end_time);
+		slack = select_estimate_accuracy(end_time);
 
 	retval = 0;
 	for (;;) {
@@ -769,7 +769,7 @@ static int do_poll(unsigned int nfds,  struct poll_list *list,
 	}
 
 	if (end_time && !timed_out)
-		slack = estimate_accuracy(end_time);
+		slack = select_estimate_accuracy(end_time);
 
 	for (;;) {
 		struct poll_list *walk;
diff --git a/fs/seq_file.c b/fs/seq_file.c
index e1f437be6c3..05d6b0e78c9 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -131,7 +131,7 @@ Eoverflow:
  */
 ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
 {
-	struct seq_file *m = (struct seq_file *)file->private_data;
+	struct seq_file *m = file->private_data;
 	size_t copied = 0;
 	loff_t pos;
 	size_t n;
@@ -280,7 +280,7 @@ EXPORT_SYMBOL(seq_read);
  */
 loff_t seq_lseek(struct file *file, loff_t offset, int origin)
 {
-	struct seq_file *m = (struct seq_file *)file->private_data;
+	struct seq_file *m = file->private_data;
 	loff_t retval = -EINVAL;
 
 	mutex_lock(&m->lock);
@@ -324,7 +324,7 @@ EXPORT_SYMBOL(seq_lseek);
  */
 int seq_release(struct inode *inode, struct file *file)
 {
-	struct seq_file *m = (struct seq_file *)file->private_data;
+	struct seq_file *m = file->private_data;
 	kfree(m->buf);
 	kfree(m);
 	return 0;
@@ -462,9 +462,7 @@ int seq_path_root(struct seq_file *m, struct path *path, struct path *root,
 	if (size) {
 		char *p;
 
-		spin_lock(&dcache_lock);
 		p = __d_path(path, root, buf, size);
-		spin_unlock(&dcache_lock);
 		res = PTR_ERR(p);
 		if (!IS_ERR(p)) {
 			char *end = mangle_path(buf, p, esc);
diff --git a/fs/signalfd.c b/fs/signalfd.c
index f329849ce3c..492465b451d 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -88,6 +88,7 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo,
 		 err |= __put_user(kinfo->si_tid, &uinfo->ssi_tid);
 		 err |= __put_user(kinfo->si_overrun, &uinfo->ssi_overrun);
 		 err |= __put_user((long) kinfo->si_ptr, &uinfo->ssi_ptr);
+		 err |= __put_user(kinfo->si_int, &uinfo->ssi_int);
 		break;
 	case __SI_POLL:
 		err |= __put_user(kinfo->si_band, &uinfo->ssi_band);
@@ -98,6 +99,16 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo,
 #ifdef __ARCH_SI_TRAPNO
 		err |= __put_user(kinfo->si_trapno, &uinfo->ssi_trapno);
 #endif
+#ifdef BUS_MCEERR_AO
+		/* 
+		 * Other callers might not initialize the si_lsb field,
+		 * so check explicitly for the right codes here.
+		 */
+		if (kinfo->si_code == BUS_MCEERR_AR ||
+		    kinfo->si_code == BUS_MCEERR_AO)
+			err |= __put_user((short) kinfo->si_addr_lsb,
+					  &uinfo->ssi_addr_lsb);
+#endif
 		break;
 	case __SI_CHLD:
 		err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid);
@@ -111,6 +122,7 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo,
 		err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid);
 		err |= __put_user(kinfo->si_uid, &uinfo->ssi_uid);
 		err |= __put_user((long) kinfo->si_ptr, &uinfo->ssi_ptr);
+		err |= __put_user(kinfo->si_int, &uinfo->ssi_int);
 		break;
 	default:
 		/*
@@ -204,6 +216,7 @@ static const struct file_operations signalfd_fops = {
 	.release	= signalfd_release,
 	.poll		= signalfd_poll,
 	.read		= signalfd_read,
+	.llseek		= noop_llseek,
 };
 
 SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask,
diff --git a/fs/smbfs/Kconfig b/fs/smbfs/Kconfig
deleted file mode 100644
index e668127c8b2..00000000000
--- a/fs/smbfs/Kconfig
+++ /dev/null
@@ -1,55 +0,0 @@
-config SMB_FS
-	tristate "SMB file system support (OBSOLETE, please use CIFS)"
-	depends on INET
-	select NLS
-	help
-	  SMB (Server Message Block) is the protocol Windows for Workgroups
-	  (WfW), Windows 95/98, Windows NT and OS/2 Lan Manager use to share
-	  files and printers over local networks.  Saying Y here allows you to
-	  mount their file systems (often called "shares" in this context) and
-	  access them just like any other Unix directory.  Currently, this
-	  works only if the Windows machines use TCP/IP as the underlying
-	  transport protocol, and not NetBEUI.  For details, read
-	  <file:Documentation/filesystems/smbfs.txt> and the SMB-HOWTO,
-	  available from <http://www.tldp.org/docs.html#howto>.
-
-	  Note: if you just want your box to act as an SMB *server* and make
-	  files and printing services available to Windows clients (which need
-	  to have a TCP/IP stack), you don't need to say Y here; you can use
-	  the program SAMBA (available from <ftp://ftp.samba.org/pub/samba/>)
-	  for that.
-
-	  General information about how to connect Linux, Windows machines and
-	  Macs is on the WWW at <http://www.eats.com/linux_mac_win.html>.
-
-	  To compile the SMB support as a module, choose M here:
-	  the module will be called smbfs.  Most people say N, however.
-
-config SMB_NLS_DEFAULT
-	bool "Use a default NLS"
-	depends on SMB_FS
-	help
-	  Enabling this will make smbfs use nls translations by default. You
-	  need to specify the local charset (CONFIG_NLS_DEFAULT) in the nls
-	  settings and you need to give the default nls for the SMB server as
-	  CONFIG_SMB_NLS_REMOTE.
-
-	  The nls settings can be changed at mount time, if your smbmount
-	  supports that, using the codepage and iocharset parameters.
-
-	  smbmount from samba 2.2.0 or later supports this.
-
-config SMB_NLS_REMOTE
-	string "Default Remote NLS Option"
-	depends on SMB_NLS_DEFAULT
-	default "cp437"
-	help
-	  This setting allows you to specify a default value for which
-	  codepage the server uses. If this field is left blank no
-	  translations will be done by default. The local codepage/charset
-	  default to CONFIG_NLS_DEFAULT.
-
-	  The nls settings can be changed at mount time, if your smbmount
-	  supports that, using the codepage and iocharset parameters.
-
-	  smbmount from samba 2.2.0 or later supports this.
diff --git a/fs/smbfs/Makefile b/fs/smbfs/Makefile
deleted file mode 100644
index 4faf8c4722c..00000000000
--- a/fs/smbfs/Makefile
+++ /dev/null
@@ -1,18 +0,0 @@
-#
-# Makefile for the linux smb-filesystem routines.
-#
-
-obj-$(CONFIG_SMB_FS) += smbfs.o
-
-smbfs-objs := proc.o dir.o cache.o sock.o inode.o file.o ioctl.o getopt.o \
-		symlink.o smbiod.o request.o
-
-# If you want debugging output, you may add these flags to the EXTRA_CFLAGS
-# SMBFS_PARANOIA should normally be enabled.
-
-EXTRA_CFLAGS += -DSMBFS_PARANOIA
-#EXTRA_CFLAGS += -DSMBFS_DEBUG
-#EXTRA_CFLAGS += -DSMBFS_DEBUG_VERBOSE
-#EXTRA_CFLAGS += -DDEBUG_SMB_TIMESTAMP
-#EXTRA_CFLAGS += -Werror
-
diff --git a/fs/smbfs/cache.c b/fs/smbfs/cache.c
deleted file mode 100644
index 8c177eb7e34..00000000000
--- a/fs/smbfs/cache.c
+++ /dev/null
@@ -1,208 +0,0 @@
-/*
- *  cache.c
- *
- * Copyright (C) 1997 by Bill Hawes
- *
- * Routines to support directory cacheing using the page cache.
- * This cache code is almost directly taken from ncpfs.
- *
- * Please add a note about your changes to smbfs in the ChangeLog file.
- */
-
-#include <linux/time.h>
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/smb_fs.h>
-#include <linux/pagemap.h>
-#include <linux/net.h>
-
-#include <asm/page.h>
-
-#include "smb_debug.h"
-#include "proto.h"
-
-/*
- * Force the next attempt to use the cache to be a timeout.
- * If we can't find the page that's fine, it will cause a refresh.
- */
-void
-smb_invalid_dir_cache(struct inode * dir)
-{
-	struct smb_sb_info *server = server_from_inode(dir);
-	union  smb_dir_cache *cache = NULL;
-	struct page *page = NULL;
-
-	page = grab_cache_page(&dir->i_data, 0);
-	if (!page)
-		goto out;
-
-	if (!PageUptodate(page))
-		goto out_unlock;
-
-	cache = kmap(page);
-	cache->head.time = jiffies - SMB_MAX_AGE(server);
-
-	kunmap(page);
-	SetPageUptodate(page);
-out_unlock:
-	unlock_page(page);
-	page_cache_release(page);
-out:
-	return;
-}
-
-/*
- * Mark all dentries for 'parent' as invalid, forcing them to be re-read
- */
-void
-smb_invalidate_dircache_entries(struct dentry *parent)
-{
-	struct smb_sb_info *server = server_from_dentry(parent);
-	struct list_head *next;
-	struct dentry *dentry;
-
-	spin_lock(&dcache_lock);
-	next = parent->d_subdirs.next;
-	while (next != &parent->d_subdirs) {
-		dentry = list_entry(next, struct dentry, d_u.d_child);
-		dentry->d_fsdata = NULL;
-		smb_age_dentry(server, dentry);
-		next = next->next;
-	}
-	spin_unlock(&dcache_lock);
-}
-
-/*
- * dget, but require that fpos and parent matches what the dentry contains.
- * dentry is not known to be a valid pointer at entry.
- */
-struct dentry *
-smb_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos)
-{
-	struct dentry *dent = dentry;
-	struct list_head *next;
-
-	if (d_validate(dent, parent)) {
-		if (dent->d_name.len <= SMB_MAXNAMELEN &&
-		    (unsigned long)dent->d_fsdata == fpos) {
-			if (!dent->d_inode) {
-				dput(dent);
-				dent = NULL;
-			}
-			return dent;
-		}
-		dput(dent);
-	}
-
-	/* If a pointer is invalid, we search the dentry. */
-	spin_lock(&dcache_lock);
-	next = parent->d_subdirs.next;
-	while (next != &parent->d_subdirs) {
-		dent = list_entry(next, struct dentry, d_u.d_child);
-		if ((unsigned long)dent->d_fsdata == fpos) {
-			if (dent->d_inode)
-				dget_locked(dent);
-			else
-				dent = NULL;
-			goto out_unlock;
-		}
-		next = next->next;
-	}
-	dent = NULL;
-out_unlock:
-	spin_unlock(&dcache_lock);
-	return dent;
-}
-
-
-/*
- * Create dentry/inode for this file and add it to the dircache.
- */
-int
-smb_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
-	       struct smb_cache_control *ctrl, struct qstr *qname,
-	       struct smb_fattr *entry)
-{
-	struct dentry *newdent, *dentry = filp->f_path.dentry;
-	struct inode *newino, *inode = dentry->d_inode;
-	struct smb_cache_control ctl = *ctrl;
-	int valid = 0;
-	int hashed = 0;
-	ino_t ino = 0;
-
-	qname->hash = full_name_hash(qname->name, qname->len);
-
-	if (dentry->d_op && dentry->d_op->d_hash)
-		if (dentry->d_op->d_hash(dentry, qname) != 0)
-			goto end_advance;
-
-	newdent = d_lookup(dentry, qname);
-
-	if (!newdent) {
-		newdent = d_alloc(dentry, qname);
-		if (!newdent)
-			goto end_advance;
-	} else {
-		hashed = 1;
-		memcpy((char *) newdent->d_name.name, qname->name,
-		       newdent->d_name.len);
-	}
-
-	if (!newdent->d_inode) {
-		smb_renew_times(newdent);
-		entry->f_ino = iunique(inode->i_sb, 2);
-		newino = smb_iget(inode->i_sb, entry);
-		if (newino) {
-			smb_new_dentry(newdent);
-			d_instantiate(newdent, newino);
-			if (!hashed)
-				d_rehash(newdent);
-		}
-	} else
-		smb_set_inode_attr(newdent->d_inode, entry);
-
-        if (newdent->d_inode) {
-		ino = newdent->d_inode->i_ino;
-		newdent->d_fsdata = (void *) ctl.fpos;
-		smb_new_dentry(newdent);
-	}
-
-	if (ctl.idx >= SMB_DIRCACHE_SIZE) {
-		if (ctl.page) {
-			kunmap(ctl.page);
-			SetPageUptodate(ctl.page);
-			unlock_page(ctl.page);
-			page_cache_release(ctl.page);
-		}
-		ctl.cache = NULL;
-		ctl.idx  -= SMB_DIRCACHE_SIZE;
-		ctl.ofs  += 1;
-		ctl.page  = grab_cache_page(&inode->i_data, ctl.ofs);
-		if (ctl.page)
-			ctl.cache = kmap(ctl.page);
-	}
-	if (ctl.cache) {
-		ctl.cache->dentry[ctl.idx] = newdent;
-		valid = 1;
-	}
-	dput(newdent);
-
-end_advance:
-	if (!valid)
-		ctl.valid = 0;
-	if (!ctl.filled && (ctl.fpos == filp->f_pos)) {
-		if (!ino)
-			ino = find_inode_number(dentry, qname);
-		if (!ino)
-			ino = iunique(inode->i_sb, 2);
-		ctl.filled = filldir(dirent, qname->name, qname->len,
-				     filp->f_pos, ino, DT_UNKNOWN);
-		if (!ctl.filled)
-			filp->f_pos += 1;
-	}
-	ctl.fpos += 1;
-	ctl.idx  += 1;
-	*ctrl = ctl;
-	return (ctl.valid || !ctl.filled);
-}
diff --git a/fs/smbfs/dir.c b/fs/smbfs/dir.c
deleted file mode 100644
index 00a70cab1f3..00000000000
--- a/fs/smbfs/dir.c
+++ /dev/null
@@ -1,702 +0,0 @@
-/*
- *  dir.c
- *
- *  Copyright (C) 1995, 1996 by Paal-Kr. Engstad and Volker Lendecke
- *  Copyright (C) 1997 by Volker Lendecke
- *
- *  Please add a note about your changes to smbfs in the ChangeLog file.
- */
-
-#include <linux/time.h>
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/smp_lock.h>
-#include <linux/ctype.h>
-#include <linux/net.h>
-#include <linux/sched.h>
-
-#include <linux/smb_fs.h>
-#include <linux/smb_mount.h>
-#include <linux/smbno.h>
-
-#include "smb_debug.h"
-#include "proto.h"
-
-static int smb_readdir(struct file *, void *, filldir_t);
-static int smb_dir_open(struct inode *, struct file *);
-
-static struct dentry *smb_lookup(struct inode *, struct dentry *, struct nameidata *);
-static int smb_create(struct inode *, struct dentry *, int, struct nameidata *);
-static int smb_mkdir(struct inode *, struct dentry *, int);
-static int smb_rmdir(struct inode *, struct dentry *);
-static int smb_unlink(struct inode *, struct dentry *);
-static int smb_rename(struct inode *, struct dentry *,
-		      struct inode *, struct dentry *);
-static int smb_make_node(struct inode *,struct dentry *,int,dev_t);
-static int smb_link(struct dentry *, struct inode *, struct dentry *);
-
-const struct file_operations smb_dir_operations =
-{
-	.llseek		= generic_file_llseek,
-	.read		= generic_read_dir,
-	.readdir	= smb_readdir,
-	.unlocked_ioctl	= smb_ioctl,
-	.open		= smb_dir_open,
-};
-
-const struct inode_operations smb_dir_inode_operations =
-{
-	.create		= smb_create,
-	.lookup		= smb_lookup,
-	.unlink		= smb_unlink,
-	.mkdir		= smb_mkdir,
-	.rmdir		= smb_rmdir,
-	.rename		= smb_rename,
-	.getattr	= smb_getattr,
-	.setattr	= smb_notify_change,
-};
-
-const struct inode_operations smb_dir_inode_operations_unix =
-{
-	.create		= smb_create,
-	.lookup		= smb_lookup,
-	.unlink		= smb_unlink,
-	.mkdir		= smb_mkdir,
-	.rmdir		= smb_rmdir,
-	.rename		= smb_rename,
-	.getattr	= smb_getattr,
-	.setattr	= smb_notify_change,
-	.symlink	= smb_symlink,
-	.mknod		= smb_make_node,
-	.link		= smb_link,
-};
-
-/*
- * Read a directory, using filldir to fill the dirent memory.
- * smb_proc_readdir does the actual reading from the smb server.
- *
- * The cache code is almost directly taken from ncpfs
- */
-static int 
-smb_readdir(struct file *filp, void *dirent, filldir_t filldir)
-{
-	struct dentry *dentry = filp->f_path.dentry;
-	struct inode *dir = dentry->d_inode;
-	struct smb_sb_info *server = server_from_dentry(dentry);
-	union  smb_dir_cache *cache = NULL;
-	struct smb_cache_control ctl;
-	struct page *page = NULL;
-	int result;
-
-	ctl.page  = NULL;
-	ctl.cache = NULL;
-
-	VERBOSE("reading %s/%s, f_pos=%d\n",
-		DENTRY_PATH(dentry),  (int) filp->f_pos);
-
-	result = 0;
-
-	lock_kernel();
-
-	switch ((unsigned int) filp->f_pos) {
-	case 0:
-		if (filldir(dirent, ".", 1, 0, dir->i_ino, DT_DIR) < 0)
-			goto out;
-		filp->f_pos = 1;
-		/* fallthrough */
-	case 1:
-		if (filldir(dirent, "..", 2, 1, parent_ino(dentry), DT_DIR) < 0)
-			goto out;
-		filp->f_pos = 2;
-	}
-
-	/*
-	 * Make sure our inode is up-to-date.
-	 */
-	result = smb_revalidate_inode(dentry);
-	if (result)
-		goto out;
-
-
-	page = grab_cache_page(&dir->i_data, 0);
-	if (!page)
-		goto read_really;
-
-	ctl.cache = cache = kmap(page);
-	ctl.head  = cache->head;
-
-	if (!PageUptodate(page) || !ctl.head.eof) {
-		VERBOSE("%s/%s, page uptodate=%d, eof=%d\n",
-			 DENTRY_PATH(dentry), PageUptodate(page),ctl.head.eof);
-		goto init_cache;
-	}
-
-	if (filp->f_pos == 2) {
-		if (jiffies - ctl.head.time >= SMB_MAX_AGE(server))
-			goto init_cache;
-
-		/*
-		 * N.B. ncpfs checks mtime of dentry too here, we don't.
-		 *   1. common smb servers do not update mtime on dir changes
-		 *   2. it requires an extra smb request
-		 *      (revalidate has the same timeout as ctl.head.time)
-		 *
-		 * Instead smbfs invalidates its own cache on local changes
-		 * and remote changes are not seen until timeout.
-		 */
-	}
-
-	if (filp->f_pos > ctl.head.end)
-		goto finished;
-
-	ctl.fpos = filp->f_pos + (SMB_DIRCACHE_START - 2);
-	ctl.ofs  = ctl.fpos / SMB_DIRCACHE_SIZE;
-	ctl.idx  = ctl.fpos % SMB_DIRCACHE_SIZE;
-
-	for (;;) {
-		if (ctl.ofs != 0) {
-			ctl.page = find_lock_page(&dir->i_data, ctl.ofs);
-			if (!ctl.page)
-				goto invalid_cache;
-			ctl.cache = kmap(ctl.page);
-			if (!PageUptodate(ctl.page))
-				goto invalid_cache;
-		}
-		while (ctl.idx < SMB_DIRCACHE_SIZE) {
-			struct dentry *dent;
-			int res;
-
-			dent = smb_dget_fpos(ctl.cache->dentry[ctl.idx],
-					     dentry, filp->f_pos);
-			if (!dent)
-				goto invalid_cache;
-
-			res = filldir(dirent, dent->d_name.name,
-				      dent->d_name.len, filp->f_pos,
-				      dent->d_inode->i_ino, DT_UNKNOWN);
-			dput(dent);
-			if (res)
-				goto finished;
-			filp->f_pos += 1;
-			ctl.idx += 1;
-			if (filp->f_pos > ctl.head.end)
-				goto finished;
-		}
-		if (ctl.page) {
-			kunmap(ctl.page);
-			SetPageUptodate(ctl.page);
-			unlock_page(ctl.page);
-			page_cache_release(ctl.page);
-			ctl.page = NULL;
-		}
-		ctl.idx  = 0;
-		ctl.ofs += 1;
-	}
-invalid_cache:
-	if (ctl.page) {
-		kunmap(ctl.page);
-		unlock_page(ctl.page);
-		page_cache_release(ctl.page);
-		ctl.page = NULL;
-	}
-	ctl.cache = cache;
-init_cache:
-	smb_invalidate_dircache_entries(dentry);
-	ctl.head.time = jiffies;
-	ctl.head.eof = 0;
-	ctl.fpos = 2;
-	ctl.ofs = 0;
-	ctl.idx = SMB_DIRCACHE_START;
-	ctl.filled = 0;
-	ctl.valid  = 1;
-read_really:
-	result = server->ops->readdir(filp, dirent, filldir, &ctl);
-	if (result == -ERESTARTSYS && page)
-		ClearPageUptodate(page);
-	if (ctl.idx == -1)
-		goto invalid_cache;	/* retry */
-	ctl.head.end = ctl.fpos - 1;
-	ctl.head.eof = ctl.valid;
-finished:
-	if (page) {
-		cache->head = ctl.head;
-		kunmap(page);
-		if (result != -ERESTARTSYS)
-			SetPageUptodate(page);
-		unlock_page(page);
-		page_cache_release(page);
-	}
-	if (ctl.page) {
-		kunmap(ctl.page);
-		SetPageUptodate(ctl.page);
-		unlock_page(ctl.page);
-		page_cache_release(ctl.page);
-	}
-out:
-	unlock_kernel();
-	return result;
-}
-
-static int
-smb_dir_open(struct inode *dir, struct file *file)
-{
-	struct dentry *dentry = file->f_path.dentry;
-	struct smb_sb_info *server;
-	int error = 0;
-
-	VERBOSE("(%s/%s)\n", dentry->d_parent->d_name.name,
-		file->f_path.dentry->d_name.name);
-
-	/*
-	 * Directory timestamps in the core protocol aren't updated
-	 * when a file is added, so we give them a very short TTL.
-	 */
-	lock_kernel();
-	server = server_from_dentry(dentry);
-	if (server->opt.protocol < SMB_PROTOCOL_LANMAN2) {
-		unsigned long age = jiffies - SMB_I(dir)->oldmtime;
-		if (age > 2*HZ)
-			smb_invalid_dir_cache(dir);
-	}
-
-	/*
-	 * Note: in order to allow the smbmount process to open the
-	 * mount point, we only revalidate if the connection is valid or
-	 * if the process is trying to access something other than the root.
-	 */
-	if (server->state == CONN_VALID || !IS_ROOT(dentry))
-		error = smb_revalidate_inode(dentry);
-	unlock_kernel();
-	return error;
-}
-
-/*
- * Dentry operations routines
- */
-static int smb_lookup_validate(struct dentry *, struct nameidata *);
-static int smb_hash_dentry(struct dentry *, struct qstr *);
-static int smb_compare_dentry(struct dentry *, struct qstr *, struct qstr *);
-static int smb_delete_dentry(struct dentry *);
-
-static const struct dentry_operations smbfs_dentry_operations =
-{
-	.d_revalidate	= smb_lookup_validate,
-	.d_hash		= smb_hash_dentry,
-	.d_compare	= smb_compare_dentry,
-	.d_delete	= smb_delete_dentry,
-};
-
-static const struct dentry_operations smbfs_dentry_operations_case =
-{
-	.d_revalidate	= smb_lookup_validate,
-	.d_delete	= smb_delete_dentry,
-};
-
-
-/*
- * This is the callback when the dcache has a lookup hit.
- */
-static int
-smb_lookup_validate(struct dentry * dentry, struct nameidata *nd)
-{
-	struct smb_sb_info *server = server_from_dentry(dentry);
-	struct inode * inode = dentry->d_inode;
-	unsigned long age = jiffies - dentry->d_time;
-	int valid;
-
-	/*
-	 * The default validation is based on dentry age:
-	 * we believe in dentries for a few seconds.  (But each
-	 * successful server lookup renews the timestamp.)
-	 */
-	valid = (age <= SMB_MAX_AGE(server));
-#ifdef SMBFS_DEBUG_VERBOSE
-	if (!valid)
-		VERBOSE("%s/%s not valid, age=%lu\n", 
-			DENTRY_PATH(dentry), age);
-#endif
-
-	if (inode) {
-		lock_kernel();
-		if (is_bad_inode(inode)) {
-			PARANOIA("%s/%s has dud inode\n", DENTRY_PATH(dentry));
-			valid = 0;
-		} else if (!valid)
-			valid = (smb_revalidate_inode(dentry) == 0);
-		unlock_kernel();
-	} else {
-		/*
-		 * What should we do for negative dentries?
-		 */
-	}
-	return valid;
-}
-
-static int 
-smb_hash_dentry(struct dentry *dir, struct qstr *this)
-{
-	unsigned long hash;
-	int i;
-
-	hash = init_name_hash();
-	for (i=0; i < this->len ; i++)
-		hash = partial_name_hash(tolower(this->name[i]), hash);
-	this->hash = end_name_hash(hash);
-  
-	return 0;
-}
-
-static int
-smb_compare_dentry(struct dentry *dir, struct qstr *a, struct qstr *b)
-{
-	int i, result = 1;
-
-	if (a->len != b->len)
-		goto out;
-	for (i=0; i < a->len; i++) {
-		if (tolower(a->name[i]) != tolower(b->name[i]))
-			goto out;
-	}
-	result = 0;
-out:
-	return result;
-}
-
-/*
- * This is the callback from dput() when d_count is going to 0.
- * We use this to unhash dentries with bad inodes.
- */
-static int
-smb_delete_dentry(struct dentry * dentry)
-{
-	if (dentry->d_inode) {
-		if (is_bad_inode(dentry->d_inode)) {
-			PARANOIA("bad inode, unhashing %s/%s\n",
-				 DENTRY_PATH(dentry));
-			return 1;
-		}
-	} else {
-		/* N.B. Unhash negative dentries? */
-	}
-	return 0;
-}
-
-/*
- * Initialize a new dentry
- */
-void
-smb_new_dentry(struct dentry *dentry)
-{
-	struct smb_sb_info *server = server_from_dentry(dentry);
-
-	if (server->mnt->flags & SMB_MOUNT_CASE)
-		dentry->d_op = &smbfs_dentry_operations_case;
-	else
-		dentry->d_op = &smbfs_dentry_operations;
-	dentry->d_time = jiffies;
-}
-
-
-/*
- * Whenever a lookup succeeds, we know the parent directories
- * are all valid, so we want to update the dentry timestamps.
- * N.B. Move this to dcache?
- */
-void
-smb_renew_times(struct dentry * dentry)
-{
-	dget(dentry);
-	spin_lock(&dentry->d_lock);
-	for (;;) {
-		struct dentry *parent;
-
-		dentry->d_time = jiffies;
-		if (IS_ROOT(dentry))
-			break;
-		parent = dentry->d_parent;
-		dget(parent);
-		spin_unlock(&dentry->d_lock);
-		dput(dentry);
-		dentry = parent;
-		spin_lock(&dentry->d_lock);
-	}
-	spin_unlock(&dentry->d_lock);
-	dput(dentry);
-}
-
-static struct dentry *
-smb_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
-{
-	struct smb_fattr finfo;
-	struct inode *inode;
-	int error;
-	struct smb_sb_info *server;
-
-	error = -ENAMETOOLONG;
-	if (dentry->d_name.len > SMB_MAXNAMELEN)
-		goto out;
-
-	/* Do not allow lookup of names with backslashes in */
-	error = -EINVAL;
-	if (memchr(dentry->d_name.name, '\\', dentry->d_name.len))
-		goto out;
-
-	lock_kernel();
-	error = smb_proc_getattr(dentry, &finfo);
-#ifdef SMBFS_PARANOIA
-	if (error && error != -ENOENT)
-		PARANOIA("find %s/%s failed, error=%d\n",
-			 DENTRY_PATH(dentry), error);
-#endif
-
-	inode = NULL;
-	if (error == -ENOENT)
-		goto add_entry;
-	if (!error) {
-		error = -EACCES;
-		finfo.f_ino = iunique(dentry->d_sb, 2);
-		inode = smb_iget(dir->i_sb, &finfo);
-		if (inode) {
-	add_entry:
-			server = server_from_dentry(dentry);
-			if (server->mnt->flags & SMB_MOUNT_CASE)
-				dentry->d_op = &smbfs_dentry_operations_case;
-			else
-				dentry->d_op = &smbfs_dentry_operations;
-
-			d_add(dentry, inode);
-			smb_renew_times(dentry);
-			error = 0;
-		}
-	}
-	unlock_kernel();
-out:
-	return ERR_PTR(error);
-}
-
-/*
- * This code is common to all routines creating a new inode.
- */
-static int
-smb_instantiate(struct dentry *dentry, __u16 fileid, int have_id)
-{
-	struct smb_sb_info *server = server_from_dentry(dentry);
-	struct inode *inode;
-	int error;
-	struct smb_fattr fattr;
-
-	VERBOSE("file %s/%s, fileid=%u\n", DENTRY_PATH(dentry), fileid);
-
-	error = smb_proc_getattr(dentry, &fattr);
-	if (error)
-		goto out_close;
-
-	smb_renew_times(dentry);
-	fattr.f_ino = iunique(dentry->d_sb, 2);
-	inode = smb_iget(dentry->d_sb, &fattr);
-	if (!inode)
-		goto out_no_inode;
-
-	if (have_id) {
-		struct smb_inode_info *ei = SMB_I(inode);
-		ei->fileid = fileid;
-		ei->access = SMB_O_RDWR;
-		ei->open = server->generation;
-	}
-	d_instantiate(dentry, inode);
-out:
-	return error;
-
-out_no_inode:
-	error = -EACCES;
-out_close:
-	if (have_id) {
-		PARANOIA("%s/%s failed, error=%d, closing %u\n",
-			 DENTRY_PATH(dentry), error, fileid);
-		smb_close_fileid(dentry, fileid);
-	}
-	goto out;
-}
-
-/* N.B. How should the mode argument be used? */
-static int
-smb_create(struct inode *dir, struct dentry *dentry, int mode,
-		struct nameidata *nd)
-{
-	struct smb_sb_info *server = server_from_dentry(dentry);
-	__u16 fileid;
-	int error;
-	struct iattr attr;
-
-	VERBOSE("creating %s/%s, mode=%d\n", DENTRY_PATH(dentry), mode);
-
-	lock_kernel();
-	smb_invalid_dir_cache(dir);
-	error = smb_proc_create(dentry, 0, get_seconds(), &fileid);
-	if (!error) {
-		if (server->opt.capabilities & SMB_CAP_UNIX) {
-			/* Set attributes for new file */
-			attr.ia_valid = ATTR_MODE;
-			attr.ia_mode = mode;
-			error = smb_proc_setattr_unix(dentry, &attr, 0, 0);
-		}
-		error = smb_instantiate(dentry, fileid, 1);
-	} else {
-		PARANOIA("%s/%s failed, error=%d\n",
-			 DENTRY_PATH(dentry), error);
-	}
-	unlock_kernel();
-	return error;
-}
-
-/* N.B. How should the mode argument be used? */
-static int
-smb_mkdir(struct inode *dir, struct dentry *dentry, int mode)
-{
-	struct smb_sb_info *server = server_from_dentry(dentry);
-	int error;
-	struct iattr attr;
-
-	lock_kernel();
-	smb_invalid_dir_cache(dir);
-	error = smb_proc_mkdir(dentry);
-	if (!error) {
-		if (server->opt.capabilities & SMB_CAP_UNIX) {
-			/* Set attributes for new directory */
-			attr.ia_valid = ATTR_MODE;
-			attr.ia_mode = mode;
-			error = smb_proc_setattr_unix(dentry, &attr, 0, 0);
-		}
-		error = smb_instantiate(dentry, 0, 0);
-	}
-	unlock_kernel();
-	return error;
-}
-
-static int
-smb_rmdir(struct inode *dir, struct dentry *dentry)
-{
-	struct inode *inode = dentry->d_inode;
-	int error;
-
-	/*
-	 * Close the directory if it's open.
-	 */
-	lock_kernel();
-	smb_close(inode);
-
-	/*
-	 * Check that nobody else is using the directory..
-	 */
-	error = -EBUSY;
-	if (!d_unhashed(dentry))
-		goto out;
-
-	smb_invalid_dir_cache(dir);
-	error = smb_proc_rmdir(dentry);
-
-out:
-	unlock_kernel();
-	return error;
-}
-
-static int
-smb_unlink(struct inode *dir, struct dentry *dentry)
-{
-	int error;
-
-	/*
-	 * Close the file if it's open.
-	 */
-	lock_kernel();
-	smb_close(dentry->d_inode);
-
-	smb_invalid_dir_cache(dir);
-	error = smb_proc_unlink(dentry);
-	if (!error)
-		smb_renew_times(dentry);
-	unlock_kernel();
-	return error;
-}
-
-static int
-smb_rename(struct inode *old_dir, struct dentry *old_dentry,
-	   struct inode *new_dir, struct dentry *new_dentry)
-{
-	int error;
-
-	/*
-	 * Close any open files, and check whether to delete the
-	 * target before attempting the rename.
-	 */
-	lock_kernel();
-	if (old_dentry->d_inode)
-		smb_close(old_dentry->d_inode);
-	if (new_dentry->d_inode) {
-		smb_close(new_dentry->d_inode);
-		error = smb_proc_unlink(new_dentry);
-		if (error) {
-			VERBOSE("unlink %s/%s, error=%d\n",
-				DENTRY_PATH(new_dentry), error);
-			goto out;
-		}
-		/* FIXME */
-		d_delete(new_dentry);
-	}
-
-	smb_invalid_dir_cache(old_dir);
-	smb_invalid_dir_cache(new_dir);
-	error = smb_proc_mv(old_dentry, new_dentry);
-	if (!error) {
-		smb_renew_times(old_dentry);
-		smb_renew_times(new_dentry);
-	}
-out:
-	unlock_kernel();
-	return error;
-}
-
-/*
- * FIXME: samba servers won't let you create device nodes unless uid/gid
- * matches the connection credentials (and we don't know which those are ...)
- */
-static int
-smb_make_node(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
-{
-	int error;
-	struct iattr attr;
-
-	attr.ia_valid = ATTR_MODE | ATTR_UID | ATTR_GID;
-	attr.ia_mode = mode;
-	current_euid_egid(&attr.ia_uid, &attr.ia_gid);
-
-	if (!new_valid_dev(dev))
-		return -EINVAL;
-
-	smb_invalid_dir_cache(dir);
-	error = smb_proc_setattr_unix(dentry, &attr, MAJOR(dev), MINOR(dev));
-	if (!error) {
-		error = smb_instantiate(dentry, 0, 0);
-	}
-	return error;
-}
-
-/*
- * dentry = existing file
- * new_dentry = new file
- */
-static int
-smb_link(struct dentry *dentry, struct inode *dir, struct dentry *new_dentry)
-{
-	int error;
-
-	DEBUG1("smb_link old=%s/%s new=%s/%s\n",
-	       DENTRY_PATH(dentry), DENTRY_PATH(new_dentry));
-	smb_invalid_dir_cache(dir);
-	error = smb_proc_link(server_from_dentry(dentry), dentry, new_dentry);
-	if (!error) {
-		smb_renew_times(dentry);
-		error = smb_instantiate(new_dentry, 0, 0);
-	}
-	return error;
-}
diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c
deleted file mode 100644
index 8e187a0f94b..00000000000
--- a/fs/smbfs/file.c
+++ /dev/null
@@ -1,454 +0,0 @@
-/*
- *  file.c
- *
- *  Copyright (C) 1995, 1996, 1997 by Paal-Kr. Engstad and Volker Lendecke
- *  Copyright (C) 1997 by Volker Lendecke
- *
- *  Please add a note about your changes to smbfs in the ChangeLog file.
- */
-
-#include <linux/time.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/fcntl.h>
-#include <linux/stat.h>
-#include <linux/mm.h>
-#include <linux/pagemap.h>
-#include <linux/smp_lock.h>
-#include <linux/net.h>
-#include <linux/aio.h>
-
-#include <asm/uaccess.h>
-#include <asm/system.h>
-
-#include <linux/smbno.h>
-#include <linux/smb_fs.h>
-
-#include "smb_debug.h"
-#include "proto.h"
-
-static int
-smb_fsync(struct file *file, int datasync)
-{
-	struct dentry *dentry = file->f_path.dentry;
-	struct smb_sb_info *server = server_from_dentry(dentry);
-	int result;
-
-	VERBOSE("sync file %s/%s\n", DENTRY_PATH(dentry));
-
-	/*
-	 * The VFS will writepage() all dirty pages for us, but we
-	 * should send a SMBflush to the server, letting it know that
-	 * we want things synchronized with actual storage.
-	 *
-	 * Note: this function requires all pages to have been written already
-	 *       (should be ok with writepage_sync)
-	 */
-	result = smb_proc_flush(server, SMB_I(dentry->d_inode)->fileid);
-	return result;
-}
-
-/*
- * Read a page synchronously.
- */
-static int
-smb_readpage_sync(struct dentry *dentry, struct page *page)
-{
-	char *buffer = kmap(page);
-	loff_t offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
-	struct smb_sb_info *server = server_from_dentry(dentry);
-	unsigned int rsize = smb_get_rsize(server);
-	int count = PAGE_SIZE;
-	int result;
-
-	VERBOSE("file %s/%s, count=%d@%Ld, rsize=%d\n",
-		DENTRY_PATH(dentry), count, offset, rsize);
-
-	result = smb_open(dentry, SMB_O_RDONLY);
-	if (result < 0)
-		goto io_error;
-
-	do {
-		if (count < rsize)
-			rsize = count;
-
-		result = server->ops->read(dentry->d_inode,offset,rsize,buffer);
-		if (result < 0)
-			goto io_error;
-
-		count -= result;
-		offset += result;
-		buffer += result;
-		dentry->d_inode->i_atime =
-			current_fs_time(dentry->d_inode->i_sb);
-		if (result < rsize)
-			break;
-	} while (count);
-
-	memset(buffer, 0, count);
-	flush_dcache_page(page);
-	SetPageUptodate(page);
-	result = 0;
-
-io_error:
-	kunmap(page);
-	unlock_page(page);
-	return result;
-}
-
-/*
- * We are called with the page locked and we unlock it when done.
- */
-static int
-smb_readpage(struct file *file, struct page *page)
-{
-	int		error;
-	struct dentry  *dentry = file->f_path.dentry;
-
-	page_cache_get(page);
-	error = smb_readpage_sync(dentry, page);
-	page_cache_release(page);
-	return error;
-}
-
-/*
- * Write a page synchronously.
- * Offset is the data offset within the page.
- */
-static int
-smb_writepage_sync(struct inode *inode, struct page *page,
-		   unsigned long pageoffset, unsigned int count)
-{
-	loff_t offset;
-	char *buffer = kmap(page) + pageoffset;
-	struct smb_sb_info *server = server_from_inode(inode);
-	unsigned int wsize = smb_get_wsize(server);
-	int ret = 0;
-
-	offset = ((loff_t)page->index << PAGE_CACHE_SHIFT) + pageoffset;
-	VERBOSE("file ino=%ld, fileid=%d, count=%d@%Ld, wsize=%d\n",
-		inode->i_ino, SMB_I(inode)->fileid, count, offset, wsize);
-
-	do {
-		int write_ret;
-
-		if (count < wsize)
-			wsize = count;
-
-		write_ret = server->ops->write(inode, offset, wsize, buffer);
-		if (write_ret < 0) {
-			PARANOIA("failed write, wsize=%d, write_ret=%d\n",
-				 wsize, write_ret);
-			ret = write_ret;
-			break;
-		}
-		/* N.B. what if result < wsize?? */
-#ifdef SMBFS_PARANOIA
-		if (write_ret < wsize)
-			PARANOIA("short write, wsize=%d, write_ret=%d\n",
-				 wsize, write_ret);
-#endif
-		buffer += wsize;
-		offset += wsize;
-		count -= wsize;
-		/*
-		 * Update the inode now rather than waiting for a refresh.
-		 */
-		inode->i_mtime = inode->i_atime = current_fs_time(inode->i_sb);
-		SMB_I(inode)->flags |= SMB_F_LOCALWRITE;
-		if (offset > inode->i_size)
-			inode->i_size = offset;
-	} while (count);
-
-	kunmap(page);
-	return ret;
-}
-
-/*
- * Write a page to the server. This will be used for NFS swapping only
- * (for now), and we currently do this synchronously only.
- *
- * We are called with the page locked and we unlock it when done.
- */
-static int
-smb_writepage(struct page *page, struct writeback_control *wbc)
-{
-	struct address_space *mapping = page->mapping;
-	struct inode *inode;
-	unsigned long end_index;
-	unsigned offset = PAGE_CACHE_SIZE;
-	int err;
-
-	BUG_ON(!mapping);
-	inode = mapping->host;
-	BUG_ON(!inode);
-
-	end_index = inode->i_size >> PAGE_CACHE_SHIFT;
-
-	/* easy case */
-	if (page->index < end_index)
-		goto do_it;
-	/* things got complicated... */
-	offset = inode->i_size & (PAGE_CACHE_SIZE-1);
-	/* OK, are we completely out? */
-	if (page->index >= end_index+1 || !offset)
-		return 0; /* truncated - don't care */
-do_it:
-	page_cache_get(page);
-	err = smb_writepage_sync(inode, page, 0, offset);
-	SetPageUptodate(page);
-	unlock_page(page);
-	page_cache_release(page);
-	return err;
-}
-
-static int
-smb_updatepage(struct file *file, struct page *page, unsigned long offset,
-	       unsigned int count)
-{
-	struct dentry *dentry = file->f_path.dentry;
-
-	DEBUG1("(%s/%s %d@%lld)\n", DENTRY_PATH(dentry), count,
-		((unsigned long long)page->index << PAGE_CACHE_SHIFT) + offset);
-
-	return smb_writepage_sync(dentry->d_inode, page, offset, count);
-}
-
-static ssize_t
-smb_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
-			unsigned long nr_segs, loff_t pos)
-{
-	struct file * file = iocb->ki_filp;
-	struct dentry * dentry = file->f_path.dentry;
-	ssize_t	status;
-
-	VERBOSE("file %s/%s, count=%lu@%lu\n", DENTRY_PATH(dentry),
-		(unsigned long) iocb->ki_left, (unsigned long) pos);
-
-	status = smb_revalidate_inode(dentry);
-	if (status) {
-		PARANOIA("%s/%s validation failed, error=%Zd\n",
-			 DENTRY_PATH(dentry), status);
-		goto out;
-	}
-
-	VERBOSE("before read, size=%ld, flags=%x, atime=%ld\n",
-		(long)dentry->d_inode->i_size,
-		dentry->d_inode->i_flags, dentry->d_inode->i_atime.tv_sec);
-
-	status = generic_file_aio_read(iocb, iov, nr_segs, pos);
-out:
-	return status;
-}
-
-static int
-smb_file_mmap(struct file * file, struct vm_area_struct * vma)
-{
-	struct dentry * dentry = file->f_path.dentry;
-	int	status;
-
-	VERBOSE("file %s/%s, address %lu - %lu\n",
-		DENTRY_PATH(dentry), vma->vm_start, vma->vm_end);
-
-	status = smb_revalidate_inode(dentry);
-	if (status) {
-		PARANOIA("%s/%s validation failed, error=%d\n",
-			 DENTRY_PATH(dentry), status);
-		goto out;
-	}
-	status = generic_file_mmap(file, vma);
-out:
-	return status;
-}
-
-static ssize_t
-smb_file_splice_read(struct file *file, loff_t *ppos,
-		     struct pipe_inode_info *pipe, size_t count,
-		     unsigned int flags)
-{
-	struct dentry *dentry = file->f_path.dentry;
-	ssize_t status;
-
-	VERBOSE("file %s/%s, pos=%Ld, count=%lu\n",
-		DENTRY_PATH(dentry), *ppos, count);
-
-	status = smb_revalidate_inode(dentry);
-	if (status) {
-		PARANOIA("%s/%s validation failed, error=%Zd\n",
-			 DENTRY_PATH(dentry), status);
-		goto out;
-	}
-	status = generic_file_splice_read(file, ppos, pipe, count, flags);
-out:
-	return status;
-}
-
-/*
- * This does the "real" work of the write. The generic routine has
- * allocated the page, locked it, done all the page alignment stuff
- * calculations etc. Now we should just copy the data from user
- * space and write it back to the real medium..
- *
- * If the writer ends up delaying the write, the writer needs to
- * increment the page use counts until he is done with the page.
- */
-static int smb_write_begin(struct file *file, struct address_space *mapping,
-			loff_t pos, unsigned len, unsigned flags,
-			struct page **pagep, void **fsdata)
-{
-	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
-	*pagep = grab_cache_page_write_begin(mapping, index, flags);
-	if (!*pagep)
-		return -ENOMEM;
-	return 0;
-}
-
-static int smb_write_end(struct file *file, struct address_space *mapping,
-			loff_t pos, unsigned len, unsigned copied,
-			struct page *page, void *fsdata)
-{
-	int status;
-	unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
-
-	lock_kernel();
-	status = smb_updatepage(file, page, offset, copied);
-	unlock_kernel();
-
-	if (!status) {
-		if (!PageUptodate(page) && copied == PAGE_CACHE_SIZE)
-			SetPageUptodate(page);
-		status = copied;
-	}
-
-	unlock_page(page);
-	page_cache_release(page);
-
-	return status;
-}
-
-const struct address_space_operations smb_file_aops = {
-	.readpage = smb_readpage,
-	.writepage = smb_writepage,
-	.write_begin = smb_write_begin,
-	.write_end = smb_write_end,
-};
-
-/* 
- * Write to a file (through the page cache).
- */
-static ssize_t
-smb_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
-			       unsigned long nr_segs, loff_t pos)
-{
-	struct file * file = iocb->ki_filp;
-	struct dentry * dentry = file->f_path.dentry;
-	ssize_t	result;
-
-	VERBOSE("file %s/%s, count=%lu@%lu\n",
-		DENTRY_PATH(dentry),
-		(unsigned long) iocb->ki_left, (unsigned long) pos);
-
-	result = smb_revalidate_inode(dentry);
-	if (result) {
-		PARANOIA("%s/%s validation failed, error=%Zd\n",
-			 DENTRY_PATH(dentry), result);
-		goto out;
-	}
-
-	result = smb_open(dentry, SMB_O_WRONLY);
-	if (result)
-		goto out;
-
-	if (iocb->ki_left > 0) {
-		result = generic_file_aio_write(iocb, iov, nr_segs, pos);
-		VERBOSE("pos=%ld, size=%ld, mtime=%ld, atime=%ld\n",
-			(long) file->f_pos, (long) dentry->d_inode->i_size,
-			dentry->d_inode->i_mtime.tv_sec,
-			dentry->d_inode->i_atime.tv_sec);
-	}
-out:
-	return result;
-}
-
-static int
-smb_file_open(struct inode *inode, struct file * file)
-{
-	int result;
-	struct dentry *dentry = file->f_path.dentry;
-	int smb_mode = (file->f_mode & O_ACCMODE) - 1;
-
-	lock_kernel();
-	result = smb_open(dentry, smb_mode);
-	if (result)
-		goto out;
-	SMB_I(inode)->openers++;
-out:
-	unlock_kernel();
-	return result;
-}
-
-static int
-smb_file_release(struct inode *inode, struct file * file)
-{
-	lock_kernel();
-	if (!--SMB_I(inode)->openers) {
-		/* We must flush any dirty pages now as we won't be able to
-		   write anything after close. mmap can trigger this.
-		   "openers" should perhaps include mmap'ers ... */
-		filemap_write_and_wait(inode->i_mapping);
-		smb_close(inode);
-	}
-	unlock_kernel();
-	return 0;
-}
-
-/*
- * Check whether the required access is compatible with
- * an inode's permission. SMB doesn't recognize superuser
- * privileges, so we need our own check for this.
- */
-static int
-smb_file_permission(struct inode *inode, int mask)
-{
-	int mode = inode->i_mode;
-	int error = 0;
-
-	VERBOSE("mode=%x, mask=%x\n", mode, mask);
-
-	/* Look at user permissions */
-	mode >>= 6;
-	if (mask & ~mode & (MAY_READ | MAY_WRITE | MAY_EXEC))
-		error = -EACCES;
-	return error;
-}
-
-static loff_t smb_remote_llseek(struct file *file, loff_t offset, int origin)
-{
-	loff_t ret;
-	lock_kernel();
-	ret = generic_file_llseek_unlocked(file, offset, origin);
-	unlock_kernel();
-	return ret;
-}
-
-const struct file_operations smb_file_operations =
-{
-	.llseek 	= smb_remote_llseek,
-	.read		= do_sync_read,
-	.aio_read	= smb_file_aio_read,
-	.write		= do_sync_write,
-	.aio_write	= smb_file_aio_write,
-	.unlocked_ioctl	= smb_ioctl,
-	.mmap		= smb_file_mmap,
-	.open		= smb_file_open,
-	.release	= smb_file_release,
-	.fsync		= smb_fsync,
-	.splice_read	= smb_file_splice_read,
-};
-
-const struct inode_operations smb_file_inode_operations =
-{
-	.permission	= smb_file_permission,
-	.getattr	= smb_getattr,
-	.setattr	= smb_notify_change,
-};
diff --git a/fs/smbfs/getopt.c b/fs/smbfs/getopt.c
deleted file mode 100644
index 7ae0f5273ab..00000000000
--- a/fs/smbfs/getopt.c
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * getopt.c
- */
-
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/net.h>
-
-#include "getopt.h"
-
-/**
- *	smb_getopt - option parser
- *	@caller: name of the caller, for error messages
- *	@options: the options string
- *	@opts: an array of &struct option entries controlling parser operations
- *	@optopt: output; will contain the current option
- *	@optarg: output; will contain the value (if one exists)
- *	@flag: output; may be NULL; should point to a long for or'ing flags
- *	@value: output; may be NULL; will be overwritten with the integer value
- *		of the current argument.
- *
- *	Helper to parse options on the format used by mount ("a=b,c=d,e,f").
- *	Returns opts->val if a matching entry in the 'opts' array is found,
- *	0 when no more tokens are found, -1 if an error is encountered.
- */
-int smb_getopt(char *caller, char **options, struct option *opts,
-	       char **optopt, char **optarg, unsigned long *flag,
-	       unsigned long *value)
-{
-	char *token;
-	char *val;
-	int i;
-
-	do {
-		if ((token = strsep(options, ",")) == NULL)
-			return 0;
-	} while (*token == '\0');
-	*optopt = token;
-
-	*optarg = NULL;
-	if ((val = strchr (token, '=')) != NULL) {
-		*val++ = 0;
-		if (value)
-			*value = simple_strtoul(val, NULL, 0);
-		*optarg = val;
-	}
-
-	for (i = 0; opts[i].name != NULL; i++) {
-		if (!strcmp(opts[i].name, token)) {
-			if (!opts[i].flag && (!val || !*val)) {
-				printk("%s: the %s option requires an argument\n",
-				       caller, token);
-				return -1;
-			}
-
-			if (flag && opts[i].flag)
-				*flag |= opts[i].flag;
-
-			return opts[i].val;
-		}
-	}
-	printk("%s: Unrecognized mount option %s\n", caller, token);
-	return -1;
-}
diff --git a/fs/smbfs/getopt.h b/fs/smbfs/getopt.h
deleted file mode 100644
index 146219ac7c4..00000000000
--- a/fs/smbfs/getopt.h
+++ /dev/null
@@ -1,14 +0,0 @@
-#ifndef _LINUX_GETOPT_H
-#define _LINUX_GETOPT_H
-
-struct option {
-	const char *name;
-	unsigned long flag;
-	int val;
-};
-
-extern int smb_getopt(char *caller, char **options, struct option *opts,
-		      char **optopt, char **optarg, unsigned long *flag,
-		      unsigned long *value);
-
-#endif /* _LINUX_GETOPT_H */
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
deleted file mode 100644
index 9551cb6f7fe..00000000000
--- a/fs/smbfs/inode.c
+++ /dev/null
@@ -1,841 +0,0 @@
-/*
- *  inode.c
- *
- *  Copyright (C) 1995, 1996 by Paal-Kr. Engstad and Volker Lendecke
- *  Copyright (C) 1997 by Volker Lendecke
- *
- *  Please add a note about your changes to smbfs in the ChangeLog file.
- */
-
-#include <linux/module.h>
-#include <linux/time.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/string.h>
-#include <linux/stat.h>
-#include <linux/errno.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/file.h>
-#include <linux/dcache.h>
-#include <linux/smp_lock.h>
-#include <linux/nls.h>
-#include <linux/seq_file.h>
-#include <linux/mount.h>
-#include <linux/net.h>
-#include <linux/vfs.h>
-#include <linux/highuid.h>
-#include <linux/sched.h>
-#include <linux/smb_fs.h>
-#include <linux/smbno.h>
-#include <linux/smb_mount.h>
-
-#include <asm/system.h>
-#include <asm/uaccess.h>
-
-#include "smb_debug.h"
-#include "getopt.h"
-#include "proto.h"
-
-/* Always pick a default string */
-#ifdef CONFIG_SMB_NLS_REMOTE
-#define SMB_NLS_REMOTE CONFIG_SMB_NLS_REMOTE
-#else
-#define SMB_NLS_REMOTE ""
-#endif
-
-#define SMB_TTL_DEFAULT 1000
-
-static void smb_delete_inode(struct inode *);
-static void smb_put_super(struct super_block *);
-static int  smb_statfs(struct dentry *, struct kstatfs *);
-static int  smb_show_options(struct seq_file *, struct vfsmount *);
-
-static struct kmem_cache *smb_inode_cachep;
-
-static struct inode *smb_alloc_inode(struct super_block *sb)
-{
-	struct smb_inode_info *ei;
-	ei = (struct smb_inode_info *)kmem_cache_alloc(smb_inode_cachep, GFP_KERNEL);
-	if (!ei)
-		return NULL;
-	return &ei->vfs_inode;
-}
-
-static void smb_destroy_inode(struct inode *inode)
-{
-	kmem_cache_free(smb_inode_cachep, SMB_I(inode));
-}
-
-static void init_once(void *foo)
-{
-	struct smb_inode_info *ei = (struct smb_inode_info *) foo;
-
-	inode_init_once(&ei->vfs_inode);
-}
-
-static int init_inodecache(void)
-{
-	smb_inode_cachep = kmem_cache_create("smb_inode_cache",
-					     sizeof(struct smb_inode_info),
-					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
-					     init_once);
-	if (smb_inode_cachep == NULL)
-		return -ENOMEM;
-	return 0;
-}
-
-static void destroy_inodecache(void)
-{
-	kmem_cache_destroy(smb_inode_cachep);
-}
-
-static int smb_remount(struct super_block *sb, int *flags, char *data)
-{
-	*flags |= MS_NODIRATIME;
-	return 0;
-}
-
-static const struct super_operations smb_sops =
-{
-	.alloc_inode	= smb_alloc_inode,
-	.destroy_inode	= smb_destroy_inode,
-	.drop_inode	= generic_delete_inode,
-	.delete_inode	= smb_delete_inode,
-	.put_super	= smb_put_super,
-	.statfs		= smb_statfs,
-	.show_options	= smb_show_options,
-	.remount_fs	= smb_remount,
-};
-
-
-/* We are always generating a new inode here */
-struct inode *
-smb_iget(struct super_block *sb, struct smb_fattr *fattr)
-{
-	struct smb_sb_info *server = SMB_SB(sb);
-	struct inode *result;
-
-	DEBUG1("smb_iget: %p\n", fattr);
-
-	result = new_inode(sb);
-	if (!result)
-		return result;
-	result->i_ino = fattr->f_ino;
-	SMB_I(result)->open = 0;
-	SMB_I(result)->fileid = 0;
-	SMB_I(result)->access = 0;
-	SMB_I(result)->flags = 0;
-	SMB_I(result)->closed = 0;
-	SMB_I(result)->openers = 0;
-	smb_set_inode_attr(result, fattr);
-	if (S_ISREG(result->i_mode)) {
-		result->i_op = &smb_file_inode_operations;
-		result->i_fop = &smb_file_operations;
-		result->i_data.a_ops = &smb_file_aops;
-	} else if (S_ISDIR(result->i_mode)) {
-		if (server->opt.capabilities & SMB_CAP_UNIX)
-			result->i_op = &smb_dir_inode_operations_unix;
-		else
-			result->i_op = &smb_dir_inode_operations;
-		result->i_fop = &smb_dir_operations;
-	} else if (S_ISLNK(result->i_mode)) {
-		result->i_op = &smb_link_inode_operations;
-	} else {
-		init_special_inode(result, result->i_mode, fattr->f_rdev);
-	}
-	insert_inode_hash(result);
-	return result;
-}
-
-/*
- * Copy the inode data to a smb_fattr structure.
- */
-void
-smb_get_inode_attr(struct inode *inode, struct smb_fattr *fattr)
-{
-	memset(fattr, 0, sizeof(struct smb_fattr));
-	fattr->f_mode	= inode->i_mode;
-	fattr->f_nlink	= inode->i_nlink;
-	fattr->f_ino	= inode->i_ino;
-	fattr->f_uid	= inode->i_uid;
-	fattr->f_gid	= inode->i_gid;
-	fattr->f_size	= inode->i_size;
-	fattr->f_mtime	= inode->i_mtime;
-	fattr->f_ctime	= inode->i_ctime;
-	fattr->f_atime	= inode->i_atime;
-	fattr->f_blocks	= inode->i_blocks;
-
-	fattr->attr	= SMB_I(inode)->attr;
-	/*
-	 * Keep the attributes in sync with the inode permissions.
-	 */
-	if (fattr->f_mode & S_IWUSR)
-		fattr->attr &= ~aRONLY;
-	else
-		fattr->attr |= aRONLY;
-}
-
-/*
- * Update the inode, possibly causing it to invalidate its pages if mtime/size
- * is different from last time.
- */
-void
-smb_set_inode_attr(struct inode *inode, struct smb_fattr *fattr)
-{
-	struct smb_inode_info *ei = SMB_I(inode);
-
-	/*
-	 * A size change should have a different mtime, or same mtime
-	 * but different size.
-	 */
-	time_t last_time = inode->i_mtime.tv_sec;
-	loff_t last_sz = inode->i_size;
-
-	inode->i_mode	= fattr->f_mode;
-	inode->i_nlink	= fattr->f_nlink;
-	inode->i_uid	= fattr->f_uid;
-	inode->i_gid	= fattr->f_gid;
-	inode->i_ctime	= fattr->f_ctime;
-	inode->i_blocks = fattr->f_blocks;
-	inode->i_size	= fattr->f_size;
-	inode->i_mtime	= fattr->f_mtime;
-	inode->i_atime	= fattr->f_atime;
-	ei->attr = fattr->attr;
-
-	/*
-	 * Update the "last time refreshed" field for revalidation.
-	 */
-	ei->oldmtime = jiffies;
-
-	if (inode->i_mtime.tv_sec != last_time || inode->i_size != last_sz) {
-		VERBOSE("%ld changed, old=%ld, new=%ld, oz=%ld, nz=%ld\n",
-			inode->i_ino,
-			(long) last_time, (long) inode->i_mtime.tv_sec,
-			(long) last_sz, (long) inode->i_size);
-
-		if (!S_ISDIR(inode->i_mode))
-			invalidate_remote_inode(inode);
-	}
-}
-
-/*
- * This is called if the connection has gone bad ...
- * try to kill off all the current inodes.
- */
-void
-smb_invalidate_inodes(struct smb_sb_info *server)
-{
-	VERBOSE("\n");
-	shrink_dcache_sb(SB_of(server));
-	invalidate_inodes(SB_of(server));
-}
-
-/*
- * This is called to update the inode attributes after
- * we've made changes to a file or directory.
- */
-static int
-smb_refresh_inode(struct dentry *dentry)
-{
-	struct inode *inode = dentry->d_inode;
-	int error;
-	struct smb_fattr fattr;
-
-	error = smb_proc_getattr(dentry, &fattr);
-	if (!error) {
-		smb_renew_times(dentry);
-		/*
-		 * Check whether the type part of the mode changed,
-		 * and don't update the attributes if it did.
-		 *
-		 * And don't dick with the root inode
-		 */
-		if (inode->i_ino == 2)
-			return error;
-		if (S_ISLNK(inode->i_mode))
-			return error;	/* VFS will deal with it */
-
-		if ((inode->i_mode & S_IFMT) == (fattr.f_mode & S_IFMT)) {
-			smb_set_inode_attr(inode, &fattr);
-		} else {
-			/*
-			 * Big trouble! The inode has become a new object,
-			 * so any operations attempted on it are invalid.
-			 *
-			 * To limit damage, mark the inode as bad so that
-			 * subsequent lookup validations will fail.
-			 */
-			PARANOIA("%s/%s changed mode, %07o to %07o\n",
-				 DENTRY_PATH(dentry),
-				 inode->i_mode, fattr.f_mode);
-
-			fattr.f_mode = inode->i_mode; /* save mode */
-			make_bad_inode(inode);
-			inode->i_mode = fattr.f_mode; /* restore mode */
-			/*
-			 * No need to worry about unhashing the dentry: the
-			 * lookup validation will see that the inode is bad.
-			 * But we do want to invalidate the caches ...
-			 */
-			if (!S_ISDIR(inode->i_mode))
-				invalidate_remote_inode(inode);
-			else
-				smb_invalid_dir_cache(inode);
-			error = -EIO;
-		}
-	}
-	return error;
-}
-
-/*
- * This is called when we want to check whether the inode
- * has changed on the server.  If it has changed, we must
- * invalidate our local caches.
- */
-int
-smb_revalidate_inode(struct dentry *dentry)
-{
-	struct smb_sb_info *s = server_from_dentry(dentry);
-	struct inode *inode = dentry->d_inode;
-	int error = 0;
-
-	DEBUG1("smb_revalidate_inode\n");
-	lock_kernel();
-
-	/*
-	 * Check whether we've recently refreshed the inode.
-	 */
-	if (time_before(jiffies, SMB_I(inode)->oldmtime + SMB_MAX_AGE(s))) {
-		VERBOSE("up-to-date, ino=%ld, jiffies=%lu, oldtime=%lu\n",
-			inode->i_ino, jiffies, SMB_I(inode)->oldmtime);
-		goto out;
-	}
-
-	error = smb_refresh_inode(dentry);
-out:
-	unlock_kernel();
-	return error;
-}
-
-/*
- * This routine is called when i_nlink == 0 and i_count goes to 0.
- * All blocking cleanup operations need to go here to avoid races.
- */
-static void
-smb_delete_inode(struct inode *ino)
-{
-	DEBUG1("ino=%ld\n", ino->i_ino);
-	truncate_inode_pages(&ino->i_data, 0);
-	lock_kernel();
-	if (smb_close(ino))
-		PARANOIA("could not close inode %ld\n", ino->i_ino);
-	unlock_kernel();
-	clear_inode(ino);
-}
-
-static struct option opts[] = {
-	{ "version",	0, 'v' },
-	{ "win95",	SMB_MOUNT_WIN95, 1 },
-	{ "oldattr",	SMB_MOUNT_OLDATTR, 1 },
-	{ "dirattr",	SMB_MOUNT_DIRATTR, 1 },
-	{ "case",	SMB_MOUNT_CASE, 1 },
-	{ "uid",	0, 'u' },
-	{ "gid",	0, 'g' },
-	{ "file_mode",	0, 'f' },
-	{ "dir_mode",	0, 'd' },
-	{ "iocharset",	0, 'i' },
-	{ "codepage",	0, 'c' },
-	{ "ttl",	0, 't' },
-	{ NULL,		0, 0}
-};
-
-static int
-parse_options(struct smb_mount_data_kernel *mnt, char *options)
-{
-	int c;
-	unsigned long flags;
-	unsigned long value;
-	char *optarg;
-	char *optopt;
-
-	flags = 0;
-	while ( (c = smb_getopt("smbfs", &options, opts,
-				&optopt, &optarg, &flags, &value)) > 0) {
-
-		VERBOSE("'%s' -> '%s'\n", optopt, optarg ? optarg : "<none>");
-		switch (c) {
-		case 1:
-			/* got a "flag" option */
-			break;
-		case 'v':
-			if (value != SMB_MOUNT_VERSION) {
-			printk ("smbfs: Bad mount version %ld, expected %d\n",
-				value, SMB_MOUNT_VERSION);
-				return 0;
-			}
-			mnt->version = value;
-			break;
-		case 'u':
-			mnt->uid = value;
-			flags |= SMB_MOUNT_UID;
-			break;
-		case 'g':
-			mnt->gid = value;
-			flags |= SMB_MOUNT_GID;
-			break;
-		case 'f':
-			mnt->file_mode = (value & S_IRWXUGO) | S_IFREG;
-			flags |= SMB_MOUNT_FMODE;
-			break;
-		case 'd':
-			mnt->dir_mode = (value & S_IRWXUGO) | S_IFDIR;
-			flags |= SMB_MOUNT_DMODE;
-			break;
-		case 'i':
-			strlcpy(mnt->codepage.local_name, optarg, 
-				SMB_NLS_MAXNAMELEN);
-			break;
-		case 'c':
-			strlcpy(mnt->codepage.remote_name, optarg,
-				SMB_NLS_MAXNAMELEN);
-			break;
-		case 't':
-			mnt->ttl = value;
-			break;
-		default:
-			printk ("smbfs: Unrecognized mount option %s\n",
-				optopt);
-			return -1;
-		}
-	}
-	mnt->flags = flags;
-	return c;
-}
-
-/*
- * smb_show_options() is for displaying mount options in /proc/mounts.
- * It tries to avoid showing settings that were not changed from their
- * defaults.
- */
-static int
-smb_show_options(struct seq_file *s, struct vfsmount *m)
-{
-	struct smb_mount_data_kernel *mnt = SMB_SB(m->mnt_sb)->mnt;
-	int i;
-
-	for (i = 0; opts[i].name != NULL; i++)
-		if (mnt->flags & opts[i].flag)
-			seq_printf(s, ",%s", opts[i].name);
-
-	if (mnt->flags & SMB_MOUNT_UID)
-		seq_printf(s, ",uid=%d", mnt->uid);
-	if (mnt->flags & SMB_MOUNT_GID)
-		seq_printf(s, ",gid=%d", mnt->gid);
-	if (mnt->mounted_uid != 0)
-		seq_printf(s, ",mounted_uid=%d", mnt->mounted_uid);
-
-	/* 
-	 * Defaults for file_mode and dir_mode are unknown to us; they
-	 * depend on the current umask of the user doing the mount.
-	 */
-	if (mnt->flags & SMB_MOUNT_FMODE)
-		seq_printf(s, ",file_mode=%04o", mnt->file_mode & S_IRWXUGO);
-	if (mnt->flags & SMB_MOUNT_DMODE)
-		seq_printf(s, ",dir_mode=%04o", mnt->dir_mode & S_IRWXUGO);
-
-	if (strcmp(mnt->codepage.local_name, CONFIG_NLS_DEFAULT))
-		seq_printf(s, ",iocharset=%s", mnt->codepage.local_name);
-	if (strcmp(mnt->codepage.remote_name, SMB_NLS_REMOTE))
-		seq_printf(s, ",codepage=%s", mnt->codepage.remote_name);
-
-	if (mnt->ttl != SMB_TTL_DEFAULT)
-		seq_printf(s, ",ttl=%d", mnt->ttl);
-
-	return 0;
-}
-
-static void
-smb_unload_nls(struct smb_sb_info *server)
-{
-	unload_nls(server->remote_nls);
-	unload_nls(server->local_nls);
-}
-
-static void
-smb_put_super(struct super_block *sb)
-{
-	struct smb_sb_info *server = SMB_SB(sb);
-
-	lock_kernel();
-
-	smb_lock_server(server);
-	server->state = CONN_INVALID;
-	smbiod_unregister_server(server);
-
-	smb_close_socket(server);
-
-	if (server->conn_pid)
-		kill_pid(server->conn_pid, SIGTERM, 1);
-
-	bdi_destroy(&server->bdi);
-	kfree(server->ops);
-	smb_unload_nls(server);
-	sb->s_fs_info = NULL;
-	smb_unlock_server(server);
-	put_pid(server->conn_pid);
-	kfree(server);
-
-	unlock_kernel();
-}
-
-static int smb_fill_super(struct super_block *sb, void *raw_data, int silent)
-{
-	struct smb_sb_info *server;
-	struct smb_mount_data_kernel *mnt;
-	struct smb_mount_data *oldmnt;
-	struct inode *root_inode;
-	struct smb_fattr root;
-	int ver;
-	void *mem;
-	static int warn_count;
-
-	if (warn_count < 5) {
-		warn_count++;
-		printk(KERN_EMERG "smbfs is deprecated and will be removed"
-			" from the 2.6.27 kernel. Please migrate to cifs\n");
-	}
-
-	if (!raw_data)
-		goto out_no_data;
-
-	oldmnt = (struct smb_mount_data *) raw_data;
-	ver = oldmnt->version;
-	if (ver != SMB_MOUNT_OLDVERSION && cpu_to_be32(ver) != SMB_MOUNT_ASCII)
-		goto out_wrong_data;
-
-	sb->s_flags |= MS_NODIRATIME;
-	sb->s_blocksize = 1024;	/* Eh...  Is this correct? */
-	sb->s_blocksize_bits = 10;
-	sb->s_magic = SMB_SUPER_MAGIC;
-	sb->s_op = &smb_sops;
-	sb->s_time_gran = 100;
-
-	server = kzalloc(sizeof(struct smb_sb_info), GFP_KERNEL);
-	if (!server)
-		goto out_no_server;
-	sb->s_fs_info = server;
-	
-	if (bdi_setup_and_register(&server->bdi, "smbfs", BDI_CAP_MAP_COPY))
-		goto out_bdi;
-
-	sb->s_bdi = &server->bdi;
-
-	server->super_block = sb;
-	server->mnt = NULL;
-	server->sock_file = NULL;
-	init_waitqueue_head(&server->conn_wq);
-	init_MUTEX(&server->sem);
-	INIT_LIST_HEAD(&server->entry);
-	INIT_LIST_HEAD(&server->xmitq);
-	INIT_LIST_HEAD(&server->recvq);
-	server->conn_error = 0;
-	server->conn_pid = NULL;
-	server->state = CONN_INVALID; /* no connection yet */
-	server->generation = 0;
-
-	/* Allocate the global temp buffer and some superblock helper structs */
-	/* FIXME: move these to the smb_sb_info struct */
-	VERBOSE("alloc chunk = %lu\n", sizeof(struct smb_ops) +
-		sizeof(struct smb_mount_data_kernel));
-	mem = kmalloc(sizeof(struct smb_ops) +
-		      sizeof(struct smb_mount_data_kernel), GFP_KERNEL);
-	if (!mem)
-		goto out_no_mem;
-
-	server->ops = mem;
-	smb_install_null_ops(server->ops);
-	server->mnt = mem + sizeof(struct smb_ops);
-
-	/* Setup NLS stuff */
-	server->remote_nls = NULL;
-	server->local_nls = NULL;
-
-	mnt = server->mnt;
-
-	memset(mnt, 0, sizeof(struct smb_mount_data_kernel));
-	strlcpy(mnt->codepage.local_name, CONFIG_NLS_DEFAULT,
-		SMB_NLS_MAXNAMELEN);
-	strlcpy(mnt->codepage.remote_name, SMB_NLS_REMOTE,
-		SMB_NLS_MAXNAMELEN);
-
-	mnt->ttl = SMB_TTL_DEFAULT;
-	if (ver == SMB_MOUNT_OLDVERSION) {
-		mnt->version = oldmnt->version;
-
-		SET_UID(mnt->uid, oldmnt->uid);
-		SET_GID(mnt->gid, oldmnt->gid);
-
-		mnt->file_mode = (oldmnt->file_mode & S_IRWXUGO) | S_IFREG;
-		mnt->dir_mode = (oldmnt->dir_mode & S_IRWXUGO) | S_IFDIR;
-
-		mnt->flags = (oldmnt->file_mode >> 9) | SMB_MOUNT_UID |
-			SMB_MOUNT_GID | SMB_MOUNT_FMODE | SMB_MOUNT_DMODE;
-	} else {
-		mnt->file_mode = S_IRWXU | S_IRGRP | S_IXGRP |
-				S_IROTH | S_IXOTH | S_IFREG;
-		mnt->dir_mode = S_IRWXU | S_IRGRP | S_IXGRP |
-				S_IROTH | S_IXOTH | S_IFDIR;
-		if (parse_options(mnt, raw_data))
-			goto out_bad_option;
-	}
-	mnt->mounted_uid = current_uid();
-	smb_setcodepage(server, &mnt->codepage);
-
-	/*
-	 * Display the enabled options
-	 * Note: smb_proc_getattr uses these in 2.4 (but was changed in 2.2)
-	 */
-	if (mnt->flags & SMB_MOUNT_OLDATTR)
-		printk("SMBFS: Using core getattr (Win 95 speedup)\n");
-	else if (mnt->flags & SMB_MOUNT_DIRATTR)
-		printk("SMBFS: Using dir ff getattr\n");
-
-	if (smbiod_register_server(server) < 0) {
-		printk(KERN_ERR "smbfs: failed to start smbiod\n");
-		goto out_no_smbiod;
-	}
-
-	/*
-	 * Keep the super block locked while we get the root inode.
-	 */
-	smb_init_root_dirent(server, &root, sb);
-	root_inode = smb_iget(sb, &root);
-	if (!root_inode)
-		goto out_no_root;
-
-	sb->s_root = d_alloc_root(root_inode);
-	if (!sb->s_root)
-		goto out_no_root;
-
-	smb_new_dentry(sb->s_root);
-
-	return 0;
-
-out_no_root:
-	iput(root_inode);
-out_no_smbiod:
-	smb_unload_nls(server);
-out_bad_option:
-	kfree(mem);
-out_no_mem:
-	bdi_destroy(&server->bdi);
-out_bdi:
-	if (!server->mnt)
-		printk(KERN_ERR "smb_fill_super: allocation failure\n");
-	sb->s_fs_info = NULL;
-	kfree(server);
-	goto out_fail;
-out_wrong_data:
-	printk(KERN_ERR "smbfs: mount_data version %d is not supported\n", ver);
-	goto out_fail;
-out_no_data:
-	printk(KERN_ERR "smb_fill_super: missing data argument\n");
-out_fail:
-	return -EINVAL;
-out_no_server:
-	printk(KERN_ERR "smb_fill_super: cannot allocate struct smb_sb_info\n");
-	return -ENOMEM;
-}
-
-static int
-smb_statfs(struct dentry *dentry, struct kstatfs *buf)
-{
-	int result;
-	
-	lock_kernel();
-
-	result = smb_proc_dskattr(dentry, buf);
-
-	unlock_kernel();
-
-	buf->f_type = SMB_SUPER_MAGIC;
-	buf->f_namelen = SMB_MAXPATHLEN;
-	return result;
-}
-
-int smb_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
-{
-	int err = smb_revalidate_inode(dentry);
-	if (!err)
-		generic_fillattr(dentry->d_inode, stat);
-	return err;
-}
-
-int
-smb_notify_change(struct dentry *dentry, struct iattr *attr)
-{
-	struct inode *inode = dentry->d_inode;
-	struct smb_sb_info *server = server_from_dentry(dentry);
-	unsigned int mask = (S_IFREG | S_IFDIR | S_IRWXUGO);
-	int error, changed, refresh = 0;
-	struct smb_fattr fattr;
-
-	lock_kernel();
-
-	error = smb_revalidate_inode(dentry);
-	if (error)
-		goto out;
-
-	if ((error = inode_change_ok(inode, attr)) < 0)
-		goto out;
-
-	error = -EPERM;
-	if ((attr->ia_valid & ATTR_UID) && (attr->ia_uid != server->mnt->uid))
-		goto out;
-
-	if ((attr->ia_valid & ATTR_GID) && (attr->ia_uid != server->mnt->gid))
-		goto out;
-
-	if ((attr->ia_valid & ATTR_MODE) && (attr->ia_mode & ~mask))
-		goto out;
-
-	if ((attr->ia_valid & ATTR_SIZE) != 0) {
-		VERBOSE("changing %s/%s, old size=%ld, new size=%ld\n",
-			DENTRY_PATH(dentry),
-			(long) inode->i_size, (long) attr->ia_size);
-
-		filemap_write_and_wait(inode->i_mapping);
-
-		error = smb_open(dentry, O_WRONLY);
-		if (error)
-			goto out;
-		error = server->ops->truncate(inode, attr->ia_size);
-		if (error)
-			goto out;
-		error = simple_setsize(inode, attr->ia_size);
-		if (error)
-			goto out;
-		refresh = 1;
-	}
-
-	if (server->opt.capabilities & SMB_CAP_UNIX) {
-		/* For now we don't want to set the size with setattr_unix */
-		attr->ia_valid &= ~ATTR_SIZE;
-		/* FIXME: only call if we actually want to set something? */
-		error = smb_proc_setattr_unix(dentry, attr, 0, 0);
-		if (!error)
-			refresh = 1;
-
-		goto out;
-	}
-
-	/*
-	 * Initialize the fattr and check for changed fields.
-	 * Note: CTIME under SMB is creation time rather than
-	 * change time, so we don't attempt to change it.
-	 */
-	smb_get_inode_attr(inode, &fattr);
-
-	changed = 0;
-	if ((attr->ia_valid & ATTR_MTIME) != 0) {
-		fattr.f_mtime = attr->ia_mtime;
-		changed = 1;
-	}
-	if ((attr->ia_valid & ATTR_ATIME) != 0) {
-		fattr.f_atime = attr->ia_atime;
-		/* Earlier protocols don't have an access time */
-		if (server->opt.protocol >= SMB_PROTOCOL_LANMAN2)
-			changed = 1;
-	}
-	if (changed) {
-		error = smb_proc_settime(dentry, &fattr);
-		if (error)
-			goto out;
-		refresh = 1;
-	}
-
-	/*
-	 * Check for mode changes ... we're extremely limited in
-	 * what can be set for SMB servers: just the read-only bit.
-	 */
-	if ((attr->ia_valid & ATTR_MODE) != 0) {
-		VERBOSE("%s/%s mode change, old=%x, new=%x\n",
-			DENTRY_PATH(dentry), fattr.f_mode, attr->ia_mode);
-		changed = 0;
-		if (attr->ia_mode & S_IWUSR) {
-			if (fattr.attr & aRONLY) {
-				fattr.attr &= ~aRONLY;
-				changed = 1;
-			}
-		} else {
-			if (!(fattr.attr & aRONLY)) {
-				fattr.attr |= aRONLY;
-				changed = 1;
-			}
-		}
-		if (changed) {
-			error = smb_proc_setattr(dentry, &fattr);
-			if (error)
-				goto out;
-			refresh = 1;
-		}
-	}
-	error = 0;
-
-out:
-	if (refresh)
-		smb_refresh_inode(dentry);
-	unlock_kernel();
-	return error;
-}
-
-static int smb_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
-{
-	return get_sb_nodev(fs_type, flags, data, smb_fill_super, mnt);
-}
-
-static struct file_system_type smb_fs_type = {
-	.owner		= THIS_MODULE,
-	.name		= "smbfs",
-	.get_sb		= smb_get_sb,
-	.kill_sb	= kill_anon_super,
-	.fs_flags	= FS_BINARY_MOUNTDATA,
-};
-
-static int __init init_smb_fs(void)
-{
-	int err;
-	DEBUG1("registering ...\n");
-
-	err = init_inodecache();
-	if (err)
-		goto out_inode;
-	err = smb_init_request_cache();
-	if (err)
-		goto out_request;
-	err = register_filesystem(&smb_fs_type);
-	if (err)
-		goto out;
-	return 0;
-out:
-	smb_destroy_request_cache();
-out_request:
-	destroy_inodecache();
-out_inode:
-	return err;
-}
-
-static void __exit exit_smb_fs(void)
-{
-	DEBUG1("unregistering ...\n");
-	unregister_filesystem(&smb_fs_type);
-	smb_destroy_request_cache();
-	destroy_inodecache();
-}
-
-module_init(init_smb_fs)
-module_exit(exit_smb_fs)
-MODULE_LICENSE("GPL");
diff --git a/fs/smbfs/ioctl.c b/fs/smbfs/ioctl.c
deleted file mode 100644
index 07215312ad3..00000000000
--- a/fs/smbfs/ioctl.c
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- *  ioctl.c
- *
- *  Copyright (C) 1995, 1996 by Volker Lendecke
- *  Copyright (C) 1997 by Volker Lendecke
- *
- *  Please add a note about your changes to smbfs in the ChangeLog file.
- */
-
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/ioctl.h>
-#include <linux/time.h>
-#include <linux/mm.h>
-#include <linux/highuid.h>
-#include <linux/smp_lock.h>
-#include <linux/net.h>
-
-#include <linux/smb_fs.h>
-#include <linux/smb_mount.h>
-
-#include <asm/uaccess.h>
-
-#include "proto.h"
-
-long
-smb_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
-{
-	struct smb_sb_info *server = server_from_inode(filp->f_path.dentry->d_inode);
-	struct smb_conn_opt opt;
-	int result = -EINVAL;
-
-	lock_kernel();
-	switch (cmd) {
-		uid16_t uid16;
-		uid_t uid32;
-	case SMB_IOC_GETMOUNTUID:
-		SET_UID(uid16, server->mnt->mounted_uid);
-		result = put_user(uid16, (uid16_t __user *) arg);
-		break;
-	case SMB_IOC_GETMOUNTUID32:
-		SET_UID(uid32, server->mnt->mounted_uid);
-		result = put_user(uid32, (uid_t __user *) arg);
-		break;
-
-	case SMB_IOC_NEWCONN:
-		/* arg is smb_conn_opt, or NULL if no connection was made */
-		if (!arg) {
-			result = 0;
-			smb_lock_server(server);
-			server->state = CONN_RETRIED;
-			printk(KERN_ERR "Connection attempt failed!  [%d]\n",
-			       server->conn_error);
-			smbiod_flush(server);
-			smb_unlock_server(server);
-			break;
-		}
-
-		result = -EFAULT;
-		if (!copy_from_user(&opt, (void __user *)arg, sizeof(opt)))
-			result = smb_newconn(server, &opt);
-		break;
-	default:
-		break;
-	}
-	unlock_kernel();
-
-	return result;
-}
diff --git a/fs/smbfs/proc.c b/fs/smbfs/proc.c
deleted file mode 100644
index 71c29b6670b..00000000000
--- a/fs/smbfs/proc.c
+++ /dev/null
@@ -1,3507 +0,0 @@
-/*
- *  proc.c
- *
- *  Copyright (C) 1995, 1996 by Paal-Kr. Engstad and Volker Lendecke
- *  Copyright (C) 1997 by Volker Lendecke
- *
- *  Please add a note about your changes to smbfs in the ChangeLog file.
- */
-
-#include <linux/types.h>
-#include <linux/capability.h>
-#include <linux/errno.h>
-#include <linux/slab.h>
-#include <linux/fs.h>
-#include <linux/file.h>
-#include <linux/stat.h>
-#include <linux/fcntl.h>
-#include <linux/dcache.h>
-#include <linux/nls.h>
-#include <linux/smp_lock.h>
-#include <linux/net.h>
-#include <linux/vfs.h>
-#include <linux/smb_fs.h>
-#include <linux/smbno.h>
-#include <linux/smb_mount.h>
-
-#include <net/sock.h>
-
-#include <asm/string.h>
-#include <asm/div64.h>
-
-#include "smb_debug.h"
-#include "proto.h"
-#include "request.h"
-
-
-/* Features. Undefine if they cause problems, this should perhaps be a
-   config option. */
-#define SMBFS_POSIX_UNLINK 1
-
-/* Allow smb_retry to be interrupted. */
-#define SMB_RETRY_INTR
-
-#define SMB_VWV(packet)  ((packet) + SMB_HEADER_LEN)
-#define SMB_CMD(packet)  (*(packet+8))
-#define SMB_WCT(packet)  (*(packet+SMB_HEADER_LEN - 1))
-
-#define SMB_DIRINFO_SIZE 43
-#define SMB_STATUS_SIZE  21
-
-#define SMB_ST_BLKSIZE	(PAGE_SIZE)
-#define SMB_ST_BLKSHIFT	(PAGE_SHIFT)
-
-static struct smb_ops smb_ops_core;
-static struct smb_ops smb_ops_os2;
-static struct smb_ops smb_ops_win95;
-static struct smb_ops smb_ops_winNT;
-static struct smb_ops smb_ops_unix;
-static struct smb_ops smb_ops_null;
-
-static void
-smb_init_dirent(struct smb_sb_info *server, struct smb_fattr *fattr);
-static void
-smb_finish_dirent(struct smb_sb_info *server, struct smb_fattr *fattr);
-static int
-smb_proc_getattr_core(struct smb_sb_info *server, struct dentry *dir,
-		      struct smb_fattr *fattr);
-static int
-smb_proc_getattr_ff(struct smb_sb_info *server, struct dentry *dentry,
-		    struct smb_fattr *fattr);
-static int
-smb_proc_setattr_core(struct smb_sb_info *server, struct dentry *dentry,
-		      u16 attr);
-static int
-smb_proc_setattr_ext(struct smb_sb_info *server,
-		     struct inode *inode, struct smb_fattr *fattr);
-static int
-smb_proc_query_cifsunix(struct smb_sb_info *server);
-static void
-install_ops(struct smb_ops *dst, struct smb_ops *src);
-
-
-static void
-str_upper(char *name, int len)
-{
-	while (len--)
-	{
-		if (*name >= 'a' && *name <= 'z')
-			*name -= ('a' - 'A');
-		name++;
-	}
-}
-
-#if 0
-static void
-str_lower(char *name, int len)
-{
-	while (len--)
-	{
-		if (*name >= 'A' && *name <= 'Z')
-			*name += ('a' - 'A');
-		name++;
-	}
-}
-#endif
-
-/* reverse a string inline. This is used by the dircache walking routines */
-static void reverse_string(char *buf, int len)
-{
-	char c;
-	char *end = buf+len-1;
-
-	while(buf < end) {
-		c = *buf;
-		*(buf++) = *end;
-		*(end--) = c;
-	}
-}
-
-/* no conversion, just a wrapper for memcpy. */
-static int convert_memcpy(unsigned char *output, int olen,
-			  const unsigned char *input, int ilen,
-			  struct nls_table *nls_from,
-			  struct nls_table *nls_to)
-{
-	if (olen < ilen)
-		return -ENAMETOOLONG;
-	memcpy(output, input, ilen);
-	return ilen;
-}
-
-static inline int write_char(unsigned char ch, char *output, int olen)
-{
-	if (olen < 4)
-		return -ENAMETOOLONG;
-	sprintf(output, ":x%02x", ch);
-	return 4;
-}
-
-static inline int write_unichar(wchar_t ch, char *output, int olen)
-{
-	if (olen < 5)
-		return -ENAMETOOLONG;
-	sprintf(output, ":%04x", ch);
-	return 5;
-}
-
-/* convert from one "codepage" to another (possibly being utf8). */
-static int convert_cp(unsigned char *output, int olen,
-		      const unsigned char *input, int ilen,
-		      struct nls_table *nls_from,
-		      struct nls_table *nls_to)
-{
-	int len = 0;
-	int n;
-	wchar_t ch;
-
-	while (ilen > 0) {
-		/* convert by changing to unicode and back to the new cp */
-		n = nls_from->char2uni(input, ilen, &ch);
-		if (n == -EINVAL) {
-			ilen--;
-			n = write_char(*input++, output, olen);
-			if (n < 0)
-				goto fail;
-			output += n;
-			olen -= n;
-			len += n;
-			continue;
-		} else if (n < 0)
-			goto fail;
-		input += n;
-		ilen -= n;
-
-		n = nls_to->uni2char(ch, output, olen);
-		if (n == -EINVAL)
-			n = write_unichar(ch, output, olen);
-		if (n < 0)
-			goto fail;
-		output += n;
-		olen -= n;
-
-		len += n;
-	}
-	return len;
-fail:
-	return n;
-}
-
-/* ----------------------------------------------------------- */
-
-/*
- * nls_unicode
- *
- * This encodes/decodes little endian unicode format
- */
-
-static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
-{
-	if (boundlen < 2)
-		return -EINVAL;
-	*out++ = uni & 0xff;
-	*out++ = uni >> 8;
-	return 2;
-}
-
-static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
-{
-	if (boundlen < 2)
-		return -EINVAL;
-	*uni = (rawstring[1] << 8) | rawstring[0];
-	return 2;
-}
-
-static struct nls_table unicode_table = {
-	.charset	= "unicode",
-	.uni2char	= uni2char,
-	.char2uni	= char2uni,
-};
-
-/* ----------------------------------------------------------- */
-
-static int setcodepage(struct nls_table **p, char *name)
-{
-	struct nls_table *nls;
-
-	if (!name || !*name) {
-		nls = NULL;
-	} else if ( (nls = load_nls(name)) == NULL) {
-		printk (KERN_ERR "smbfs: failed to load nls '%s'\n", name);
-		return -EINVAL;
-	}
-
-	/* if already set, unload the previous one. */
-	if (*p && *p != &unicode_table)
-		unload_nls(*p);
-	*p = nls;
-
-	return 0;
-}
-
-/* Handles all changes to codepage settings. */
-int smb_setcodepage(struct smb_sb_info *server, struct smb_nls_codepage *cp)
-{
-	int n = 0;
-
-	smb_lock_server(server);
-
-	/* Don't load any nls_* at all, if no remote is requested */
-	if (!*cp->remote_name)
-		goto out;
-
-	/* local */
-	n = setcodepage(&server->local_nls, cp->local_name);
-	if (n != 0)
-		goto out;
-
-	/* remote */
-	if (!strcmp(cp->remote_name, "unicode")) {
-		server->remote_nls = &unicode_table;
-	} else {
-		n = setcodepage(&server->remote_nls, cp->remote_name);
-		if (n != 0)
-			setcodepage(&server->local_nls, NULL);
-	}
-
-out:
-	if (server->local_nls != NULL && server->remote_nls != NULL)
-		server->ops->convert = convert_cp;
-	else
-		server->ops->convert = convert_memcpy;
-
-	smb_unlock_server(server);
-	return n;
-}
-
-
-/*****************************************************************************/
-/*                                                                           */
-/*  Encoding/Decoding section                                                */
-/*                                                                           */
-/*****************************************************************************/
-
-static __u8 *
-smb_encode_smb_length(__u8 * p, __u32 len)
-{
-	*p = 0;
-	*(p+1) = 0;
-	*(p+2) = (len & 0xFF00) >> 8;
-	*(p+3) = (len & 0xFF);
-	if (len > 0xFFFF)
-	{
-		*(p+1) = 1;
-	}
-	return p + 4;
-}
-
-/*
- * smb_build_path: build the path to entry and name storing it in buf.
- * The path returned will have the trailing '\0'.
- */
-static int smb_build_path(struct smb_sb_info *server, unsigned char *buf,
-			  int maxlen,
-			  struct dentry *entry, struct qstr *name)
-{
-	unsigned char *path = buf;
-	int len;
-	int unicode = (server->mnt->flags & SMB_MOUNT_UNICODE) != 0;
-
-	if (maxlen < (2<<unicode))
-		return -ENAMETOOLONG;
-
-	if (maxlen > SMB_MAXPATHLEN + 1)
-		maxlen = SMB_MAXPATHLEN + 1;
-
-	if (entry == NULL)
-		goto test_name_and_out;
-
-	/*
-	 * If IS_ROOT, we have to do no walking at all.
-	 */
-	if (IS_ROOT(entry) && !name) {
-		*path++ = '\\';
-		if (unicode) *path++ = '\0';
-		*path++ = '\0';
-		if (unicode) *path++ = '\0';
-		return path-buf;
-	}
-
-	/*
-	 * Build the path string walking the tree backward from end to ROOT
-	 * and store it in reversed order [see reverse_string()]
-	 */
-	dget(entry);
-	spin_lock(&entry->d_lock);
-	while (!IS_ROOT(entry)) {
-		struct dentry *parent;
-
-		if (maxlen < (3<<unicode)) {
-			spin_unlock(&entry->d_lock);
-			dput(entry);
-			return -ENAMETOOLONG;
-		}
-
-		len = server->ops->convert(path, maxlen-2, 
-				      entry->d_name.name, entry->d_name.len,
-				      server->local_nls, server->remote_nls);
-		if (len < 0) {
-			spin_unlock(&entry->d_lock);
-			dput(entry);
-			return len;
-		}
-		reverse_string(path, len);
-		path += len;
-		if (unicode) {
-			/* Note: reverse order */
-			*path++ = '\0';
-			maxlen--;
-		}
-		*path++ = '\\';
-		maxlen -= len+1;
-
-		parent = entry->d_parent;
-		dget(parent);
-		spin_unlock(&entry->d_lock);
-		dput(entry);
-		entry = parent;
-		spin_lock(&entry->d_lock);
-	}
-	spin_unlock(&entry->d_lock);
-	dput(entry);
-	reverse_string(buf, path-buf);
-
-	/* maxlen has space for at least one char */
-test_name_and_out:
-	if (name) {
-		if (maxlen < (3<<unicode))
-			return -ENAMETOOLONG;
-		*path++ = '\\';
-		if (unicode) {
-			*path++ = '\0';
-			maxlen--;
-		}
-		len = server->ops->convert(path, maxlen-2, 
-				      name->name, name->len,
-				      server->local_nls, server->remote_nls);
-		if (len < 0)
-			return len;
-		path += len;
-		maxlen -= len+1;
-	}
-	/* maxlen has space for at least one char */
-	*path++ = '\0';
-	if (unicode) *path++ = '\0';
-	return path-buf;
-}
-
-static int smb_encode_path(struct smb_sb_info *server, char *buf, int maxlen,
-			   struct dentry *dir, struct qstr *name)
-{
-	int result;
-
-	result = smb_build_path(server, buf, maxlen, dir, name);
-	if (result < 0)
-		goto out;
-	if (server->opt.protocol <= SMB_PROTOCOL_COREPLUS)
-		str_upper(buf, result);
-out:
-	return result;
-}
-
-/* encode_path for non-trans2 request SMBs */
-static int smb_simple_encode_path(struct smb_request *req, char **p,
-				  struct dentry * entry, struct qstr * name)
-{
-	struct smb_sb_info *server = req->rq_server;
-	char *s = *p;
-	int res;
-	int maxlen = ((char *)req->rq_buffer + req->rq_bufsize) - s;
-	int unicode = (server->mnt->flags & SMB_MOUNT_UNICODE);
-
-	if (!maxlen)
-		return -ENAMETOOLONG;
-	*s++ = 4;	/* ASCII data format */
-
-	/*
-	 * SMB Unicode strings must be 16bit aligned relative the start of the
-	 * packet. If they are not they must be padded with 0.
-	 */
-	if (unicode) {
-		int align = s - (char *)req->rq_buffer;
-		if (!(align & 1)) {
-			*s++ = '\0';
-			maxlen--;
-		}
-	}
-
-	res = smb_encode_path(server, s, maxlen-1, entry, name);
-	if (res < 0)
-		return res;
-	*p = s + res;
-	return 0;
-}
-
-/* The following are taken directly from msdos-fs */
-
-/* Linear day numbers of the respective 1sts in non-leap years. */
-
-static int day_n[] =
-{0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 0, 0, 0, 0};
-		  /* JanFebMarApr May Jun Jul Aug Sep Oct Nov Dec */
-
-
-static time_t
-utc2local(struct smb_sb_info *server, time_t time)
-{
-	return time - server->opt.serverzone*60;
-}
-
-static time_t
-local2utc(struct smb_sb_info *server, time_t time)
-{
-	return time + server->opt.serverzone*60;
-}
-
-/* Convert a MS-DOS time/date pair to a UNIX date (seconds since 1 1 70). */
-
-static time_t
-date_dos2unix(struct smb_sb_info *server, __u16 date, __u16 time)
-{
-	int month, year;
-	time_t secs;
-
-	/* first subtract and mask after that... Otherwise, if
-	   date == 0, bad things happen */
-	month = ((date >> 5) - 1) & 15;
-	year = date >> 9;
-	secs = (time & 31) * 2 + 60 * ((time >> 5) & 63) + (time >> 11) * 3600 + 86400 *
-	    ((date & 31) - 1 + day_n[month] + (year / 4) + year * 365 - ((year & 3) == 0 &&
-						   month < 2 ? 1 : 0) + 3653);
-	/* days since 1.1.70 plus 80's leap day */
-	return local2utc(server, secs);
-}
-
-
-/* Convert linear UNIX date to a MS-DOS time/date pair. */
-
-static void
-date_unix2dos(struct smb_sb_info *server,
-	      int unix_date, __u16 *date, __u16 *time)
-{
-	int day, year, nl_day, month;
-
-	unix_date = utc2local(server, unix_date);
-	if (unix_date < 315532800)
-		unix_date = 315532800;
-
-	*time = (unix_date % 60) / 2 +
-		(((unix_date / 60) % 60) << 5) +
-		(((unix_date / 3600) % 24) << 11);
-
-	day = unix_date / 86400 - 3652;
-	year = day / 365;
-	if ((year + 3) / 4 + 365 * year > day)
-		year--;
-	day -= (year + 3) / 4 + 365 * year;
-	if (day == 59 && !(year & 3)) {
-		nl_day = day;
-		month = 2;
-	} else {
-		nl_day = (year & 3) || day <= 59 ? day : day - 1;
-		for (month = 1; month < 12; month++)
-			if (day_n[month] > nl_day)
-				break;
-	}
-	*date = nl_day - day_n[month - 1] + 1 + (month << 5) + (year << 9);
-}
-
-/* The following are taken from fs/ntfs/util.c */
-
-#define NTFS_TIME_OFFSET ((u64)(369*365 + 89) * 24 * 3600 * 10000000)
-
-/*
- * Convert the NT UTC (based 1601-01-01, in hundred nanosecond units)
- * into Unix UTC (based 1970-01-01, in seconds).
- */
-static struct timespec
-smb_ntutc2unixutc(u64 ntutc)
-{
-	struct timespec ts;
-	/* FIXME: what about the timezone difference? */
-	/* Subtract the NTFS time offset, then convert to 1s intervals. */
-	u64 t = ntutc - NTFS_TIME_OFFSET;
-	ts.tv_nsec = do_div(t, 10000000) * 100;
-	ts.tv_sec = t; 
-	return ts;
-}
-
-/* Convert the Unix UTC into NT time */
-static u64
-smb_unixutc2ntutc(struct timespec ts)
-{
-	/* Note: timezone conversion is probably wrong. */
-	/* return ((u64)utc2local(server, t)) * 10000000 + NTFS_TIME_OFFSET; */
-	return ((u64)ts.tv_sec) * 10000000 + ts.tv_nsec/100 + NTFS_TIME_OFFSET;
-}
-
-#define MAX_FILE_MODE	6
-static mode_t file_mode[] = {
-	S_IFREG, S_IFDIR, S_IFLNK, S_IFCHR, S_IFBLK, S_IFIFO, S_IFSOCK
-};
-
-static int smb_filetype_to_mode(u32 filetype)
-{
-	if (filetype > MAX_FILE_MODE) {
-		PARANOIA("Filetype out of range: %d\n", filetype);
-		return S_IFREG;
-	}
-	return file_mode[filetype];
-}
-
-static u32 smb_filetype_from_mode(int mode)
-{
-	if (S_ISREG(mode))
-		return UNIX_TYPE_FILE;
-	if (S_ISDIR(mode))
-		return UNIX_TYPE_DIR;
-	if (S_ISLNK(mode))
-		return UNIX_TYPE_SYMLINK;
-	if (S_ISCHR(mode))
-		return UNIX_TYPE_CHARDEV;
-	if (S_ISBLK(mode))
-		return UNIX_TYPE_BLKDEV;
-	if (S_ISFIFO(mode))
-		return UNIX_TYPE_FIFO;
-	if (S_ISSOCK(mode))
-		return UNIX_TYPE_SOCKET;
-	return UNIX_TYPE_UNKNOWN;
-}
-
-
-/*****************************************************************************/
-/*                                                                           */
-/*  Support section.                                                         */
-/*                                                                           */
-/*****************************************************************************/
-
-__u32
-smb_len(__u8 * p)
-{
-	return ((*(p+1) & 0x1) << 16L) | (*(p+2) << 8L) | *(p+3);
-}
-
-static __u16
-smb_bcc(__u8 * packet)
-{
-	int pos = SMB_HEADER_LEN + SMB_WCT(packet) * sizeof(__u16);
-	return WVAL(packet, pos);
-}
-
-/* smb_valid_packet: We check if packet fulfills the basic
-   requirements of a smb packet */
-
-static int
-smb_valid_packet(__u8 * packet)
-{
-	return (packet[4] == 0xff
-		&& packet[5] == 'S'
-		&& packet[6] == 'M'
-		&& packet[7] == 'B'
-		&& (smb_len(packet) + 4 == SMB_HEADER_LEN
-		    + SMB_WCT(packet) * 2 + smb_bcc(packet)));
-}
-
-/* smb_verify: We check if we got the answer we expected, and if we
-   got enough data. If bcc == -1, we don't care. */
-
-static int
-smb_verify(__u8 * packet, int command, int wct, int bcc)
-{
-	if (SMB_CMD(packet) != command)
-		goto bad_command;
-	if (SMB_WCT(packet) < wct)
-		goto bad_wct;
-	if (bcc != -1 && smb_bcc(packet) < bcc)
-		goto bad_bcc;
-	return 0;
-
-bad_command:
-	printk(KERN_ERR "smb_verify: command=%x, SMB_CMD=%x??\n",
-	       command, SMB_CMD(packet));
-	goto fail;
-bad_wct:
-	printk(KERN_ERR "smb_verify: command=%x, wct=%d, SMB_WCT=%d??\n",
-	       command, wct, SMB_WCT(packet));
-	goto fail;
-bad_bcc:
-	printk(KERN_ERR "smb_verify: command=%x, bcc=%d, SMB_BCC=%d??\n",
-	       command, bcc, smb_bcc(packet));
-fail:
-	return -EIO;
-}
-
-/*
- * Returns the maximum read or write size for the "payload". Making all of the
- * packet fit within the negotiated max_xmit size.
- *
- * N.B. Since this value is usually computed before locking the server,
- * the server's packet size must never be decreased!
- */
-static inline int
-smb_get_xmitsize(struct smb_sb_info *server, int overhead)
-{
-	return server->opt.max_xmit - overhead;
-}
-
-/*
- * Calculate the maximum read size
- */
-int
-smb_get_rsize(struct smb_sb_info *server)
-{
-	/* readX has 12 parameters, read has 5 */
-	int overhead = SMB_HEADER_LEN + 12 * sizeof(__u16) + 2 + 1 + 2;
-	int size = smb_get_xmitsize(server, overhead);
-
-	VERBOSE("xmit=%d, size=%d\n", server->opt.max_xmit, size);
-
-	return size;
-}
-
-/*
- * Calculate the maximum write size
- */
-int
-smb_get_wsize(struct smb_sb_info *server)
-{
-	/* writeX has 14 parameters, write has 5 */
-	int overhead = SMB_HEADER_LEN + 14 * sizeof(__u16) + 2 + 1 + 2;
-	int size = smb_get_xmitsize(server, overhead);
-
-	VERBOSE("xmit=%d, size=%d\n", server->opt.max_xmit, size);
-
-	return size;
-}
-
-/*
- * Convert SMB error codes to -E... errno values.
- */
-int
-smb_errno(struct smb_request *req)
-{
-	int errcls = req->rq_rcls;
-	int error  = req->rq_err;
-	char *class = "Unknown";
-
-	VERBOSE("errcls %d  code %d  from command 0x%x\n",
-		errcls, error, SMB_CMD(req->rq_header));
-
-	if (errcls == ERRDOS) {
-		switch (error) {
-		case ERRbadfunc:
-			return -EINVAL;
-		case ERRbadfile:
-		case ERRbadpath:
-			return -ENOENT;
-		case ERRnofids:
-			return -EMFILE;
-		case ERRnoaccess:
-			return -EACCES;
-		case ERRbadfid:
-			return -EBADF;
-		case ERRbadmcb:
-			return -EREMOTEIO;
-		case ERRnomem:
-			return -ENOMEM;
-		case ERRbadmem:
-			return -EFAULT;
-		case ERRbadenv:
-		case ERRbadformat:
-			return -EREMOTEIO;
-		case ERRbadaccess:
-			return -EACCES;
-		case ERRbaddata:
-			return -E2BIG;
-		case ERRbaddrive:
-			return -ENXIO;
-		case ERRremcd:
-			return -EREMOTEIO;
-		case ERRdiffdevice:
-			return -EXDEV;
-		case ERRnofiles:
-			return -ENOENT;
-		case ERRbadshare:
-			return -ETXTBSY;
-		case ERRlock:
-			return -EDEADLK;
-		case ERRfilexists:
-			return -EEXIST;
-		case ERROR_INVALID_PARAMETER:
-			return -EINVAL;
-		case ERROR_DISK_FULL:
-			return -ENOSPC;
-		case ERROR_INVALID_NAME:
-			return -ENOENT;
-		case ERROR_DIR_NOT_EMPTY:
-			return -ENOTEMPTY;
-		case ERROR_NOT_LOCKED:
-                       return -ENOLCK;
-		case ERROR_ALREADY_EXISTS:
-			return -EEXIST;
-		default:
-			class = "ERRDOS";
-			goto err_unknown;
-		}
-	} else if (errcls == ERRSRV) {
-		switch (error) {
-		/* N.B. This is wrong ... EIO ? */
-		case ERRerror:
-			return -ENFILE;
-		case ERRbadpw:
-			return -EINVAL;
-		case ERRbadtype:
-		case ERRtimeout:
-			return -EIO;
-		case ERRaccess:
-			return -EACCES;
-		/*
-		 * This is a fatal error, as it means the "tree ID"
-		 * for this connection is no longer valid. We map
-		 * to a special error code and get a new connection.
-		 */
-		case ERRinvnid:
-			return -EBADSLT;
-		default:
-			class = "ERRSRV";
-			goto err_unknown;
-		}
-	} else if (errcls == ERRHRD) {
-		switch (error) {
-		case ERRnowrite:
-			return -EROFS;
-		case ERRbadunit:
-			return -ENODEV;
-		case ERRnotready:
-			return -EUCLEAN;
-		case ERRbadcmd:
-		case ERRdata:
-			return -EIO;
-		case ERRbadreq:
-			return -ERANGE;
-		case ERRbadshare:
-			return -ETXTBSY;
-		case ERRlock:
-			return -EDEADLK;
-		case ERRdiskfull:
-			return -ENOSPC;
-		default:
-			class = "ERRHRD";
-			goto err_unknown;
-		}
-	} else if (errcls == ERRCMD) {
-		class = "ERRCMD";
-	} else if (errcls == SUCCESS) {
-		return 0;	/* This is the only valid 0 return */
-	}
-
-err_unknown:
-	printk(KERN_ERR "smb_errno: class %s, code %d from command 0x%x\n",
-	       class, error, SMB_CMD(req->rq_header));
-	return -EIO;
-}
-
-/* smb_request_ok: We expect the server to be locked. Then we do the
-   request and check the answer completely. When smb_request_ok
-   returns 0, you can be quite sure that everything went well. When
-   the answer is <=0, the returned number is a valid unix errno. */
-
-static int
-smb_request_ok(struct smb_request *req, int command, int wct, int bcc)
-{
-	int result;
-
-	req->rq_resp_wct = wct;
-	req->rq_resp_bcc = bcc;
-
-	result = smb_add_request(req);
-	if (result != 0) {
-		DEBUG1("smb_request failed\n");
-		goto out;
-	}
-
-	if (smb_valid_packet(req->rq_header) != 0) {
-		PARANOIA("invalid packet!\n");
-		goto out;
-	}
-
-	result = smb_verify(req->rq_header, command, wct, bcc);
-
-out:
-	return result;
-}
-
-/*
- * This implements the NEWCONN ioctl. It installs the server pid,
- * sets server->state to CONN_VALID, and wakes up the waiting process.
- */
-int
-smb_newconn(struct smb_sb_info *server, struct smb_conn_opt *opt)
-{
-	struct file *filp;
-	struct sock *sk;
-	int error;
-
-	VERBOSE("fd=%d, pid=%d\n", opt->fd, current->pid);
-
-	smb_lock_server(server);
-
-	/*
-	 * Make sure we don't already have a valid connection ...
-	 */
-	error = -EINVAL;
-	if (server->state == CONN_VALID)
-		goto out;
-
-	error = -EACCES;
-	if (current_uid() != server->mnt->mounted_uid &&
-	    !capable(CAP_SYS_ADMIN))
-		goto out;
-
-	error = -EBADF;
-	filp = fget(opt->fd);
-	if (!filp)
-		goto out;
-	if (!smb_valid_socket(filp->f_path.dentry->d_inode))
-		goto out_putf;
-
-	server->sock_file = filp;
-	server->conn_pid = get_pid(task_pid(current));
-	server->opt = *opt;
-	server->generation += 1;
-	server->state = CONN_VALID;
-	error = 0;
-
-	if (server->conn_error) {
-		/*
-		 * conn_error is the returncode we originally decided to
-		 * drop the old connection on. This message should be positive
-		 * and not make people ask questions on why smbfs is printing
-		 * error messages ...
-		 */
-		printk(KERN_INFO "SMB connection re-established (%d)\n",
-		       server->conn_error);
-		server->conn_error = 0;
-	}
-
-	/*
-	 * Store the server in sock user_data (Only used by sunrpc)
-	 */
-	sk = SOCKET_I(filp->f_path.dentry->d_inode)->sk;
-	sk->sk_user_data = server;
-
-	/* chain into the data_ready callback */
-	server->data_ready = xchg(&sk->sk_data_ready, smb_data_ready);
-
-	/* check if we have an old smbmount that uses seconds for the 
-	   serverzone */
-	if (server->opt.serverzone > 12*60 || server->opt.serverzone < -12*60)
-		server->opt.serverzone /= 60;
-
-	/* now that we have an established connection we can detect the server
-	   type and enable bug workarounds */
-	if (server->opt.protocol < SMB_PROTOCOL_LANMAN2)
-		install_ops(server->ops, &smb_ops_core);
-	else if (server->opt.protocol == SMB_PROTOCOL_LANMAN2)
-		install_ops(server->ops, &smb_ops_os2);
-	else if (server->opt.protocol == SMB_PROTOCOL_NT1 &&
-		 (server->opt.max_xmit < 0x1000) &&
-		 !(server->opt.capabilities & SMB_CAP_NT_SMBS)) {
-		/* FIXME: can we kill the WIN95 flag now? */
-		server->mnt->flags |= SMB_MOUNT_WIN95;
-		VERBOSE("detected WIN95 server\n");
-		install_ops(server->ops, &smb_ops_win95);
-	} else {
-		/*
-		 * Samba has max_xmit 65535
-		 * NT4spX has max_xmit 4536 (or something like that)
-		 * win2k has ...
-		 */
-		VERBOSE("detected NT1 (Samba, NT4/5) server\n");
-		install_ops(server->ops, &smb_ops_winNT);
-	}
-
-	/* FIXME: the win9x code wants to modify these ... (seek/trunc bug) */
-	if (server->mnt->flags & SMB_MOUNT_OLDATTR) {
-		server->ops->getattr = smb_proc_getattr_core;
-	} else if (server->mnt->flags & SMB_MOUNT_DIRATTR) {
-		server->ops->getattr = smb_proc_getattr_ff;
-	}
-
-	/* Decode server capabilities */
-	if (server->opt.capabilities & SMB_CAP_LARGE_FILES) {
-		/* Should be ok to set this now, as no one can access the
-		   mount until the connection has been established. */
-		SB_of(server)->s_maxbytes = ~0ULL >> 1;
-		VERBOSE("LFS enabled\n");
-	}
-	if (server->opt.capabilities & SMB_CAP_UNICODE) {
-		server->mnt->flags |= SMB_MOUNT_UNICODE;
-		VERBOSE("Unicode enabled\n");
-	} else {
-		server->mnt->flags &= ~SMB_MOUNT_UNICODE;
-	}
-#if 0
-	/* flags we may test for other patches ... */
-	if (server->opt.capabilities & SMB_CAP_LARGE_READX) {
-		VERBOSE("Large reads enabled\n");
-	}
-	if (server->opt.capabilities & SMB_CAP_LARGE_WRITEX) {
-		VERBOSE("Large writes enabled\n");
-	}
-#endif
-	if (server->opt.capabilities & SMB_CAP_UNIX) {
-		struct inode *inode;
-		VERBOSE("Using UNIX CIFS extensions\n");
-		install_ops(server->ops, &smb_ops_unix);
-		inode = SB_of(server)->s_root->d_inode;
-		if (inode)
-			inode->i_op = &smb_dir_inode_operations_unix;
-	}
-
-	VERBOSE("protocol=%d, max_xmit=%d, pid=%d capabilities=0x%x\n",
-		server->opt.protocol, server->opt.max_xmit,
-		pid_nr(server->conn_pid), server->opt.capabilities);
-
-	/* FIXME: this really should be done by smbmount. */
-	if (server->opt.max_xmit > SMB_MAX_PACKET_SIZE) {
-		server->opt.max_xmit = SMB_MAX_PACKET_SIZE;
-	}
-
-	smb_unlock_server(server);
-	smbiod_wake_up();
-	if (server->opt.capabilities & SMB_CAP_UNIX)
-		smb_proc_query_cifsunix(server);
-
-	server->conn_complete++;
-	wake_up_interruptible_all(&server->conn_wq);
-	return error;
-
-out:
-	smb_unlock_server(server);
-	smbiod_wake_up();
-	return error;
-
-out_putf:
-	fput(filp);
-	goto out;
-}
-
-/* smb_setup_header: We completely set up the packet. You only have to
-   insert the command-specific fields */
-
-__u8 *
-smb_setup_header(struct smb_request *req, __u8 command, __u16 wct, __u16 bcc)
-{
-	__u32 xmit_len = SMB_HEADER_LEN + wct * sizeof(__u16) + bcc + 2;
-	__u8 *p = req->rq_header;
-	struct smb_sb_info *server = req->rq_server;
-
-	p = smb_encode_smb_length(p, xmit_len - 4);
-
-	*p++ = 0xff;
-	*p++ = 'S';
-	*p++ = 'M';
-	*p++ = 'B';
-	*p++ = command;
-
-	memset(p, '\0', 19);
-	p += 19;
-	p += 8;
-
-	if (server->opt.protocol > SMB_PROTOCOL_CORE) {
-		int flags = SMB_FLAGS_CASELESS_PATHNAMES;
-		int flags2 = SMB_FLAGS2_LONG_PATH_COMPONENTS |
-			SMB_FLAGS2_EXTENDED_ATTRIBUTES;	/* EA? not really ... */
-
-		*(req->rq_header + smb_flg) = flags;
-		if (server->mnt->flags & SMB_MOUNT_UNICODE)
-			flags2 |= SMB_FLAGS2_UNICODE_STRINGS;
-		WSET(req->rq_header, smb_flg2, flags2);
-	}
-	*p++ = wct;		/* wct */
-	p += 2 * wct;
-	WSET(p, 0, bcc);
-
-	/* Include the header in the data to send */
-	req->rq_iovlen = 1;
-	req->rq_iov[0].iov_base = req->rq_header;
-	req->rq_iov[0].iov_len  = xmit_len - bcc;
-
-	return req->rq_buffer;
-}
-
-static void
-smb_setup_bcc(struct smb_request *req, __u8 *p)
-{
-	u16 bcc = p - req->rq_buffer;
-	u8 *pbcc = req->rq_header + SMB_HEADER_LEN + 2*SMB_WCT(req->rq_header);
-
-	WSET(pbcc, 0, bcc);
-
-	smb_encode_smb_length(req->rq_header, SMB_HEADER_LEN + 
-			      2*SMB_WCT(req->rq_header) - 2 + bcc);
-
-	/* Include the "bytes" in the data to send */
-	req->rq_iovlen = 2;
-	req->rq_iov[1].iov_base = req->rq_buffer;
-	req->rq_iov[1].iov_len  = bcc;
-}
-
-static int
-smb_proc_seek(struct smb_sb_info *server, __u16 fileid,
-	      __u16 mode, off_t offset)
-{
-	int result;
-	struct smb_request *req;
-
-	result = -ENOMEM;
-	if (! (req = smb_alloc_request(server, 0)))
-		goto out;
-
-	smb_setup_header(req, SMBlseek, 4, 0);
-	WSET(req->rq_header, smb_vwv0, fileid);
-	WSET(req->rq_header, smb_vwv1, mode);
-	DSET(req->rq_header, smb_vwv2, offset);
-	req->rq_flags |= SMB_REQ_NORETRY;
-
-	result = smb_request_ok(req, SMBlseek, 2, 0);
-	if (result < 0) {
-		result = 0;
-		goto out_free;
-	}
-
-	result = DVAL(req->rq_header, smb_vwv0);
-out_free:
-	smb_rput(req);
-out:
-	return result;
-}
-
-static int
-smb_proc_open(struct smb_sb_info *server, struct dentry *dentry, int wish)
-{
-	struct inode *ino = dentry->d_inode;
-	struct smb_inode_info *ei = SMB_I(ino);
-	int mode, read_write = 0x42, read_only = 0x40;
-	int res;
-	char *p;
-	struct smb_request *req;
-
-	/*
-	 * Attempt to open r/w, unless there are no write privileges.
-	 */
-	mode = read_write;
-	if (!(ino->i_mode & (S_IWUSR | S_IWGRP | S_IWOTH)))
-		mode = read_only;
-#if 0
-	/* FIXME: why is this code not in? below we fix it so that a caller
-	   wanting RO doesn't get RW. smb_revalidate_inode does some 
-	   optimization based on access mode. tail -f needs it to be correct.
-
-	   We must open rw since we don't do the open if called a second time
-	   with different 'wish'. Is that not supported by smb servers? */
-	if (!(wish & (O_WRONLY | O_RDWR)))
-		mode = read_only;
-#endif
-
-	res = -ENOMEM;
-	if (! (req = smb_alloc_request(server, PAGE_SIZE)))
-		goto out;
-
-      retry:
-	p = smb_setup_header(req, SMBopen, 2, 0);
-	WSET(req->rq_header, smb_vwv0, mode);
-	WSET(req->rq_header, smb_vwv1, aSYSTEM | aHIDDEN | aDIR);
-	res = smb_simple_encode_path(req, &p, dentry, NULL);
-	if (res < 0)
-		goto out_free;
-	smb_setup_bcc(req, p);
-
-	res = smb_request_ok(req, SMBopen, 7, 0);
-	if (res != 0) {
-		if (mode == read_write &&
-		    (res == -EACCES || res == -ETXTBSY || res == -EROFS))
-		{
-			VERBOSE("%s/%s R/W failed, error=%d, retrying R/O\n",
-				DENTRY_PATH(dentry), res);
-			mode = read_only;
-			req->rq_flags = 0;
-			goto retry;
-		}
-		goto out_free;
-	}
-	/* We should now have data in vwv[0..6]. */
-
-	ei->fileid = WVAL(req->rq_header, smb_vwv0);
-	ei->attr   = WVAL(req->rq_header, smb_vwv1);
-	/* smb_vwv2 has mtime */
-	/* smb_vwv4 has size  */
-	ei->access = (WVAL(req->rq_header, smb_vwv6) & SMB_ACCMASK);
-	ei->open = server->generation;
-
-out_free:
-	smb_rput(req);
-out:
-	return res;
-}
-
-/*
- * Make sure the file is open, and check that the access
- * is compatible with the desired access.
- */
-int
-smb_open(struct dentry *dentry, int wish)
-{
-	struct inode *inode = dentry->d_inode;
-	int result;
-	__u16 access;
-
-	result = -ENOENT;
-	if (!inode) {
-		printk(KERN_ERR "smb_open: no inode for dentry %s/%s\n",
-		       DENTRY_PATH(dentry));
-		goto out;
-	}
-
-	if (!smb_is_open(inode)) {
-		struct smb_sb_info *server = server_from_inode(inode);
-		result = 0;
-		if (!smb_is_open(inode))
-			result = smb_proc_open(server, dentry, wish);
-		if (result)
-			goto out;
-		/*
-		 * A successful open means the path is still valid ...
-		 */
-		smb_renew_times(dentry);
-	}
-
-	/*
-	 * Check whether the access is compatible with the desired mode.
-	 */
-	result = 0;
-	access = SMB_I(inode)->access;
-	if (access != wish && access != SMB_O_RDWR) {
-		PARANOIA("%s/%s access denied, access=%x, wish=%x\n",
-			 DENTRY_PATH(dentry), access, wish);
-		result = -EACCES;
-	}
-out:
-	return result;
-}
-
-static int 
-smb_proc_close(struct smb_sb_info *server, __u16 fileid, __u32 mtime)
-{
-	struct smb_request *req;
-	int result = -ENOMEM;
-
-	if (! (req = smb_alloc_request(server, 0)))
-		goto out;
-
-	smb_setup_header(req, SMBclose, 3, 0);
-	WSET(req->rq_header, smb_vwv0, fileid);
-	DSET(req->rq_header, smb_vwv1, utc2local(server, mtime));
-	req->rq_flags |= SMB_REQ_NORETRY;
-	result = smb_request_ok(req, SMBclose, 0, 0);
-
-	smb_rput(req);
-out:
-	return result;
-}
-
-/*
- * Win NT 4.0 has an apparent bug in that it fails to update the
- * modify time when writing to a file. As a workaround, we update
- * both modify and access time locally, and post the times to the
- * server when closing the file.
- */
-static int 
-smb_proc_close_inode(struct smb_sb_info *server, struct inode * ino)
-{
-	struct smb_inode_info *ei = SMB_I(ino);
-	int result = 0;
-	if (smb_is_open(ino))
-	{
-		/*
-		 * We clear the open flag in advance, in case another
- 		 * process observes the value while we block below.
-		 */
-		ei->open = 0;
-
-		/*
-		 * Kludge alert: SMB timestamps are accurate only to
-		 * two seconds ... round the times to avoid needless
-		 * cache invalidations!
-		 */
-		if (ino->i_mtime.tv_sec & 1) { 
-			ino->i_mtime.tv_sec--;
-			ino->i_mtime.tv_nsec = 0; 
-		}
-		if (ino->i_atime.tv_sec & 1) {
-			ino->i_atime.tv_sec--;
-			ino->i_atime.tv_nsec = 0;
-		}
-		/*
-		 * If the file is open with write permissions,
-		 * update the time stamps to sync mtime and atime.
-		 */
-		if ((server->opt.capabilities & SMB_CAP_UNIX) == 0 &&
-		    (server->opt.protocol >= SMB_PROTOCOL_LANMAN2) &&
-		    !(ei->access == SMB_O_RDONLY))
-		{
-			struct smb_fattr fattr;
-			smb_get_inode_attr(ino, &fattr);
-			smb_proc_setattr_ext(server, ino, &fattr);
-		}
-
-		result = smb_proc_close(server, ei->fileid, ino->i_mtime.tv_sec);
-		/*
-		 * Force a revalidation after closing ... some servers
-		 * don't post the size until the file has been closed.
-		 */
-		if (server->opt.protocol < SMB_PROTOCOL_NT1)
-			ei->oldmtime = 0;
-		ei->closed = jiffies;
-	}
-	return result;
-}
-
-int
-smb_close(struct inode *ino)
-{
-	int result = 0;
-
-	if (smb_is_open(ino)) {
-		struct smb_sb_info *server = server_from_inode(ino);
-		result = smb_proc_close_inode(server, ino);
-	}
-	return result;
-}
-
-/*
- * This is used to close a file following a failed instantiate.
- * Since we don't have an inode, we can't use any of the above.
- */
-int
-smb_close_fileid(struct dentry *dentry, __u16 fileid)
-{
-	struct smb_sb_info *server = server_from_dentry(dentry);
-	int result;
-
-	result = smb_proc_close(server, fileid, get_seconds());
-	return result;
-}
-
-/* In smb_proc_read and smb_proc_write we do not retry, because the
-   file-id would not be valid after a reconnection. */
-
-static void
-smb_proc_read_data(struct smb_request *req)
-{
-	req->rq_iov[0].iov_base = req->rq_buffer;
-	req->rq_iov[0].iov_len  = 3;
-
-	req->rq_iov[1].iov_base = req->rq_page;
-	req->rq_iov[1].iov_len  = req->rq_rsize;
-	req->rq_iovlen = 2;
-
-	req->rq_rlen = smb_len(req->rq_header) + 4 - req->rq_bytes_recvd;
-}
-
-static int
-smb_proc_read(struct inode *inode, loff_t offset, int count, char *data)
-{
-	struct smb_sb_info *server = server_from_inode(inode);
-	__u16 returned_count, data_len;
-	unsigned char *buf;
-	int result;
-	struct smb_request *req;
-	u8 rbuf[4];
-
-	result = -ENOMEM;
-	if (! (req = smb_alloc_request(server, 0)))
-		goto out;
-
-	smb_setup_header(req, SMBread, 5, 0);
-	buf = req->rq_header;
-	WSET(buf, smb_vwv0, SMB_I(inode)->fileid);
-	WSET(buf, smb_vwv1, count);
-	DSET(buf, smb_vwv2, offset);
-	WSET(buf, smb_vwv4, 0);
-
-	req->rq_page = data;
-	req->rq_rsize = count;
-	req->rq_callback = smb_proc_read_data;
-	req->rq_buffer = rbuf;
-	req->rq_flags |= SMB_REQ_NORETRY | SMB_REQ_STATIC;
-
-	result = smb_request_ok(req, SMBread, 5, -1);
-	if (result < 0)
-		goto out_free;
-	returned_count = WVAL(req->rq_header, smb_vwv0);
-
-	data_len = WVAL(rbuf, 1);
-
-	if (returned_count != data_len) {
-		printk(KERN_NOTICE "smb_proc_read: returned != data_len\n");
-		printk(KERN_NOTICE "smb_proc_read: ret_c=%d, data_len=%d\n",
-		       returned_count, data_len);
-	}
-	result = data_len;
-
-out_free:
-	smb_rput(req);
-out:
-	VERBOSE("ino=%ld, fileid=%d, count=%d, result=%d\n",
-		inode->i_ino, SMB_I(inode)->fileid, count, result);
-	return result;
-}
-
-static int
-smb_proc_write(struct inode *inode, loff_t offset, int count, const char *data)
-{
-	struct smb_sb_info *server = server_from_inode(inode);
-	int result;
-	u16 fileid = SMB_I(inode)->fileid;
-	u8 buf[4];
-	struct smb_request *req;
-
-	result = -ENOMEM;
-	if (! (req = smb_alloc_request(server, 0)))
-		goto out;
-
-	VERBOSE("ino=%ld, fileid=%d, count=%d@%Ld\n",
-		inode->i_ino, fileid, count, offset);
-
-	smb_setup_header(req, SMBwrite, 5, count + 3);
-	WSET(req->rq_header, smb_vwv0, fileid);
-	WSET(req->rq_header, smb_vwv1, count);
-	DSET(req->rq_header, smb_vwv2, offset);
-	WSET(req->rq_header, smb_vwv4, 0);
-
-	buf[0] = 1;
-	WSET(buf, 1, count);	/* yes, again ... */
-	req->rq_iov[1].iov_base = buf;
-	req->rq_iov[1].iov_len = 3;
-	req->rq_iov[2].iov_base = (char *) data;
-	req->rq_iov[2].iov_len = count;
-	req->rq_iovlen = 3;
-	req->rq_flags |= SMB_REQ_NORETRY;
-
-	result = smb_request_ok(req, SMBwrite, 1, 0);
-	if (result >= 0)
-		result = WVAL(req->rq_header, smb_vwv0);
-
-	smb_rput(req);
-out:
-	return result;
-}
-
-/*
- * In smb_proc_readX and smb_proc_writeX we do not retry, because the
- * file-id would not be valid after a reconnection.
- */
-
-#define SMB_READX_MAX_PAD      64
-static void
-smb_proc_readX_data(struct smb_request *req)
-{
-	/* header length, excluding the netbios length (-4) */
-	int hdrlen = SMB_HEADER_LEN + req->rq_resp_wct*2 - 2;
-	int data_off = WVAL(req->rq_header, smb_vwv6);
-
-	/*
-	 * Some genius made the padding to the data bytes arbitrary.
-	 * So we must first calculate the amount of padding used by the server.
-	 */
-	data_off -= hdrlen;
-	if (data_off > SMB_READX_MAX_PAD || data_off < 0) {
-		PARANOIA("offset is larger than SMB_READX_MAX_PAD or negative!\n");
-		PARANOIA("%d > %d || %d < 0\n", data_off, SMB_READX_MAX_PAD, data_off);
-		req->rq_rlen = req->rq_bufsize + 1;
-		return;
-	}
-	req->rq_iov[0].iov_base = req->rq_buffer;
-	req->rq_iov[0].iov_len  = data_off;
-
-	req->rq_iov[1].iov_base = req->rq_page;
-	req->rq_iov[1].iov_len  = req->rq_rsize;
-	req->rq_iovlen = 2;
-
-	req->rq_rlen = smb_len(req->rq_header) + 4 - req->rq_bytes_recvd;
-}
-
-static int
-smb_proc_readX(struct inode *inode, loff_t offset, int count, char *data)
-{
-	struct smb_sb_info *server = server_from_inode(inode);
-	unsigned char *buf;
-	int result;
-	struct smb_request *req;
-	static char pad[SMB_READX_MAX_PAD];
-
-	result = -ENOMEM;
-	if (! (req = smb_alloc_request(server, 0)))
-		goto out;
-
-	smb_setup_header(req, SMBreadX, 12, 0);
-	buf = req->rq_header;
-	WSET(buf, smb_vwv0, 0x00ff);
-	WSET(buf, smb_vwv1, 0);
-	WSET(buf, smb_vwv2, SMB_I(inode)->fileid);
-	DSET(buf, smb_vwv3, (u32)offset);               /* low 32 bits */
-	WSET(buf, smb_vwv5, count);
-	WSET(buf, smb_vwv6, 0);
-	DSET(buf, smb_vwv7, 0);
-	WSET(buf, smb_vwv9, 0);
-	DSET(buf, smb_vwv10, (u32)(offset >> 32));      /* high 32 bits */
-	WSET(buf, smb_vwv11, 0);
-
-	req->rq_page = data;
-	req->rq_rsize = count;
-	req->rq_callback = smb_proc_readX_data;
-	req->rq_buffer = pad;
-	req->rq_bufsize = SMB_READX_MAX_PAD;
-	req->rq_flags |= SMB_REQ_STATIC | SMB_REQ_NORETRY;
-
-	result = smb_request_ok(req, SMBreadX, 12, -1);
-	if (result < 0)
-		goto out_free;
-	result = WVAL(req->rq_header, smb_vwv5);
-
-out_free:
-	smb_rput(req);
-out:
-	VERBOSE("ino=%ld, fileid=%d, count=%d, result=%d\n",
-		inode->i_ino, SMB_I(inode)->fileid, count, result);
-	return result;
-}
-
-static int
-smb_proc_writeX(struct inode *inode, loff_t offset, int count, const char *data)
-{
-	struct smb_sb_info *server = server_from_inode(inode);
-	int result;
-	u8 *p;
-	static u8 pad[4];
-	struct smb_request *req;
-
-	result = -ENOMEM;
-	if (! (req = smb_alloc_request(server, 0)))
-		goto out;
-
-	VERBOSE("ino=%ld, fileid=%d, count=%d@%Ld\n",
-		inode->i_ino, SMB_I(inode)->fileid, count, offset);
-
-	p = smb_setup_header(req, SMBwriteX, 14, count + 1);
-	WSET(req->rq_header, smb_vwv0, 0x00ff);
-	WSET(req->rq_header, smb_vwv1, 0);
-	WSET(req->rq_header, smb_vwv2, SMB_I(inode)->fileid);
-	DSET(req->rq_header, smb_vwv3, (u32)offset);	/* low 32 bits */
-	DSET(req->rq_header, smb_vwv5, 0);
-	WSET(req->rq_header, smb_vwv7, 0);		/* write mode */
-	WSET(req->rq_header, smb_vwv8, 0);
-	WSET(req->rq_header, smb_vwv9, 0);
-	WSET(req->rq_header, smb_vwv10, count);		/* data length */
-	WSET(req->rq_header, smb_vwv11, smb_vwv12 + 2 + 1);
-	DSET(req->rq_header, smb_vwv12, (u32)(offset >> 32));
-
-	req->rq_iov[1].iov_base = pad;
-	req->rq_iov[1].iov_len = 1;
-	req->rq_iov[2].iov_base = (char *) data;
-	req->rq_iov[2].iov_len = count;
-	req->rq_iovlen = 3;
-	req->rq_flags |= SMB_REQ_NORETRY;
-
-	result = smb_request_ok(req, SMBwriteX, 6, 0);
- 	if (result >= 0)
-		result = WVAL(req->rq_header, smb_vwv2);
-
-	smb_rput(req);
-out:
-	return result;
-}
-
-int
-smb_proc_create(struct dentry *dentry, __u16 attr, time_t ctime, __u16 *fileid)
-{
-	struct smb_sb_info *server = server_from_dentry(dentry);
-	char *p;
-	int result;
-	struct smb_request *req;
-
-	result = -ENOMEM;
-	if (! (req = smb_alloc_request(server, PAGE_SIZE)))
-		goto out;
-
-	p = smb_setup_header(req, SMBcreate, 3, 0);
-	WSET(req->rq_header, smb_vwv0, attr);
-	DSET(req->rq_header, smb_vwv1, utc2local(server, ctime));
-	result = smb_simple_encode_path(req, &p, dentry, NULL);
-	if (result < 0)
-		goto out_free;
-	smb_setup_bcc(req, p);
-
-	result = smb_request_ok(req, SMBcreate, 1, 0);
-	if (result < 0)
-		goto out_free;
-
-	*fileid = WVAL(req->rq_header, smb_vwv0);
-	result = 0;
-
-out_free:
-	smb_rput(req);
-out:
-	return result;
-}
-
-int
-smb_proc_mv(struct dentry *old_dentry, struct dentry *new_dentry)
-{
-	struct smb_sb_info *server = server_from_dentry(old_dentry);
-	char *p;
-	int result;
-	struct smb_request *req;
-
-	result = -ENOMEM;
-	if (! (req = smb_alloc_request(server, PAGE_SIZE)))
-		goto out;
-
-	p = smb_setup_header(req, SMBmv, 1, 0);
-	WSET(req->rq_header, smb_vwv0, aSYSTEM | aHIDDEN | aDIR);
-	result = smb_simple_encode_path(req, &p, old_dentry, NULL);
-	if (result < 0)
-		goto out_free;
-	result = smb_simple_encode_path(req, &p, new_dentry, NULL);
-	if (result < 0)
-		goto out_free;
-	smb_setup_bcc(req, p);
-
-	if ((result = smb_request_ok(req, SMBmv, 0, 0)) < 0)
-		goto out_free;
-	result = 0;
-
-out_free:
-	smb_rput(req);
-out:
-	return result;
-}
-
-/*
- * Code common to mkdir and rmdir.
- */
-static int
-smb_proc_generic_command(struct dentry *dentry, __u8 command)
-{
-	struct smb_sb_info *server = server_from_dentry(dentry);
-	char *p;
-	int result;
-	struct smb_request *req;
-
-	result = -ENOMEM;
-	if (! (req = smb_alloc_request(server, PAGE_SIZE)))
-		goto out;
-
-	p = smb_setup_header(req, command, 0, 0);
-	result = smb_simple_encode_path(req, &p, dentry, NULL);
-	if (result < 0)
-		goto out_free;
-	smb_setup_bcc(req, p);
-
-	result = smb_request_ok(req, command, 0, 0);
-	if (result < 0)
-		goto out_free;
-	result = 0;
-
-out_free:
-	smb_rput(req);
-out:
-	return result;
-}
-
-int
-smb_proc_mkdir(struct dentry *dentry)
-{
-	return smb_proc_generic_command(dentry, SMBmkdir);
-}
-
-int
-smb_proc_rmdir(struct dentry *dentry)
-{
-	return smb_proc_generic_command(dentry, SMBrmdir);
-}
-
-#if SMBFS_POSIX_UNLINK
-/*
- * Removes readonly attribute from a file. Used by unlink to give posix
- * semantics.
- */
-static int
-smb_set_rw(struct dentry *dentry,struct smb_sb_info *server)
-{
-	int result;
-	struct smb_fattr fattr;
-
-	/* FIXME: cifsUE should allow removing a readonly file. */
-
-	/* first get current attribute */
-	smb_init_dirent(server, &fattr);
-	result = server->ops->getattr(server, dentry, &fattr);
-	smb_finish_dirent(server, &fattr);
-	if (result < 0)
-		return result;
-
-	/* if RONLY attribute is set, remove it */
-	if (fattr.attr & aRONLY) {  /* read only attribute is set */
-		fattr.attr &= ~aRONLY;
-		result = smb_proc_setattr_core(server, dentry, fattr.attr);
-	}
-	return result;
-}
-#endif
-
-int
-smb_proc_unlink(struct dentry *dentry)
-{
-	struct smb_sb_info *server = server_from_dentry(dentry);
-	int flag = 0;
-	char *p;
-	int result;
-	struct smb_request *req;
-
-	result = -ENOMEM;
-	if (! (req = smb_alloc_request(server, PAGE_SIZE)))
-		goto out;
-
-      retry:
-	p = smb_setup_header(req, SMBunlink, 1, 0);
-	WSET(req->rq_header, smb_vwv0, aSYSTEM | aHIDDEN);
-	result = smb_simple_encode_path(req, &p, dentry, NULL);
-	if (result < 0)
-		goto out_free;
-	smb_setup_bcc(req, p);
-
-	if ((result = smb_request_ok(req, SMBunlink, 0, 0)) < 0) {
-#if SMBFS_POSIX_UNLINK
-		if (result == -EACCES && !flag) {
-			/* Posix semantics is for the read-only state
-			   of a file to be ignored in unlink(). In the
-			   SMB world a unlink() is refused on a
-			   read-only file. To make things easier for
-			   unix users we try to override the files
-			   permission if the unlink fails with the
-			   right error.
-			   This introduces a race condition that could
-			   lead to a file being written by someone who
-			   shouldn't have access, but as far as I can
-			   tell that is unavoidable */
-
-			/* remove RONLY attribute and try again */
-			result = smb_set_rw(dentry,server);
-			if (result == 0) {
-				flag = 1;
-				req->rq_flags = 0;
-				goto retry;
-			}
-		}
-#endif
-		goto out_free;
-	}
-	result = 0;
-
-out_free:
-	smb_rput(req);
-out:
-	return result;
-}
-
-int
-smb_proc_flush(struct smb_sb_info *server, __u16 fileid)
-{
-	int result;
-	struct smb_request *req;
-
-	result = -ENOMEM;
-	if (! (req = smb_alloc_request(server, 0)))
-		goto out;
-
-	smb_setup_header(req, SMBflush, 1, 0);
-	WSET(req->rq_header, smb_vwv0, fileid);
-	req->rq_flags |= SMB_REQ_NORETRY;
-	result = smb_request_ok(req, SMBflush, 0, 0);
-
-	smb_rput(req);
-out:
-	return result;
-}
-
-static int
-smb_proc_trunc32(struct inode *inode, loff_t length)
-{
-	/*
-	 * Writing 0bytes is old-SMB magic for truncating files.
-	 * MAX_NON_LFS should prevent this from being called with a too
-	 * large offset.
-	 */
-	return smb_proc_write(inode, length, 0, NULL);
-}
-
-static int
-smb_proc_trunc64(struct inode *inode, loff_t length)
-{
-	struct smb_sb_info *server = server_from_inode(inode);
-	int result;
-	char *param;
-	char *data;
-	struct smb_request *req;
-
-	result = -ENOMEM;
-	if (! (req = smb_alloc_request(server, 14)))
-		goto out;
-
-	param = req->rq_buffer;
-	data = req->rq_buffer + 6;
-
-	/* FIXME: must we also set allocation size? winNT seems to do that */
-	WSET(param, 0, SMB_I(inode)->fileid);
-	WSET(param, 2, SMB_SET_FILE_END_OF_FILE_INFO);
-	WSET(param, 4, 0);
-	LSET(data, 0, length);
-
-	req->rq_trans2_command = TRANSACT2_SETFILEINFO;
-	req->rq_ldata = 8;
-	req->rq_data  = data;
-	req->rq_lparm = 6;
-	req->rq_parm  = param;
-	req->rq_flags |= SMB_REQ_NORETRY;
-	result = smb_add_request(req);
-	if (result < 0)
-		goto out_free;
-
-	result = 0;
-	if (req->rq_rcls != 0)
-		result = smb_errno(req);
-
-out_free:
-	smb_rput(req);
-out:
-	return result;
-}
-
-static int
-smb_proc_trunc95(struct inode *inode, loff_t length)
-{
-	struct smb_sb_info *server = server_from_inode(inode);
-	int result = smb_proc_trunc32(inode, length);
- 
-	/*
-	 * win9x doesn't appear to update the size immediately.
-	 * It will return the old file size after the truncate,
-	 * confusing smbfs. So we force an update.
-	 *
-	 * FIXME: is this still necessary?
-	 */
-	smb_proc_flush(server, SMB_I(inode)->fileid);
-	return result;
-}
-
-static void
-smb_init_dirent(struct smb_sb_info *server, struct smb_fattr *fattr)
-{
-	memset(fattr, 0, sizeof(*fattr));
-
-	fattr->f_nlink = 1;
-	fattr->f_uid = server->mnt->uid;
-	fattr->f_gid = server->mnt->gid;
-	fattr->f_unix = 0;
-}
-
-static void
-smb_finish_dirent(struct smb_sb_info *server, struct smb_fattr *fattr)
-{
-	if (fattr->f_unix)
-		return;
-
-	fattr->f_mode = server->mnt->file_mode;
-	if (fattr->attr & aDIR) {
-		fattr->f_mode = server->mnt->dir_mode;
-		fattr->f_size = SMB_ST_BLKSIZE;
-	}
-	/* Check the read-only flag */
-	if (fattr->attr & aRONLY)
-		fattr->f_mode &= ~(S_IWUSR | S_IWGRP | S_IWOTH);
-
-	/* How many 512 byte blocks do we need for this file? */
-	fattr->f_blocks = 0;
-	if (fattr->f_size != 0)
-		fattr->f_blocks = 1 + ((fattr->f_size-1) >> 9);
-	return;
-}
-
-void
-smb_init_root_dirent(struct smb_sb_info *server, struct smb_fattr *fattr,
-		     struct super_block *sb)
-{
-	smb_init_dirent(server, fattr);
-	fattr->attr = aDIR;
-	fattr->f_ino = 2; /* traditional root inode number */
-	fattr->f_mtime = current_fs_time(sb);
-	smb_finish_dirent(server, fattr);
-}
-
-/*
- * Decode a dirent for old protocols
- *
- * qname is filled with the decoded, and possibly translated, name.
- * fattr receives decoded attributes
- *
- * Bugs Noted:
- * (1) Pathworks servers may pad the name with extra spaces.
- */
-static char *
-smb_decode_short_dirent(struct smb_sb_info *server, char *p,
-			struct qstr *qname, struct smb_fattr *fattr,
-			unsigned char *name_buf)
-{
-	int len;
-
-	/*
-	 * SMB doesn't have a concept of inode numbers ...
-	 */
-	smb_init_dirent(server, fattr);
-	fattr->f_ino = 0;	/* FIXME: do we need this? */
-
-	p += SMB_STATUS_SIZE;	/* reserved (search_status) */
-	fattr->attr = *p;
-	fattr->f_mtime.tv_sec = date_dos2unix(server, WVAL(p, 3), WVAL(p, 1));
-	fattr->f_mtime.tv_nsec = 0;
-	fattr->f_size = DVAL(p, 5);
-	fattr->f_ctime = fattr->f_mtime;
-	fattr->f_atime = fattr->f_mtime;
-	qname->name = p + 9;
-	len = strnlen(qname->name, 12);
-
-	/*
-	 * Trim trailing blanks for Pathworks servers
-	 */
-	while (len > 2 && qname->name[len-1] == ' ')
-		len--;
-
-	smb_finish_dirent(server, fattr);
-
-#if 0
-	/* FIXME: These only work for ascii chars, and recent smbmount doesn't
-	   allow the flag to be set anyway. It kills const. Remove? */
-	switch (server->opt.case_handling) {
-	case SMB_CASE_UPPER:
-		str_upper(entry->name, len);
-		break;
-	case SMB_CASE_LOWER:
-		str_lower(entry->name, len);
-		break;
-	default:
-		break;
-	}
-#endif
-
-	qname->len = 0;
-	len = server->ops->convert(name_buf, SMB_MAXNAMELEN,
-				   qname->name, len,
-				   server->remote_nls, server->local_nls);
-	if (len > 0) {
-		qname->len = len;
-		qname->name = name_buf;
-		DEBUG1("len=%d, name=%.*s\n",qname->len,qname->len,qname->name);
-	}
-
-	return p + 22;
-}
-
-/*
- * This routine is used to read in directory entries from the network.
- * Note that it is for short directory name seeks, i.e.: protocol <
- * SMB_PROTOCOL_LANMAN2
- */
-static int
-smb_proc_readdir_short(struct file *filp, void *dirent, filldir_t filldir,
-		       struct smb_cache_control *ctl)
-{
-	struct dentry *dir = filp->f_path.dentry;
-	struct smb_sb_info *server = server_from_dentry(dir);
-	struct qstr qname;
-	struct smb_fattr fattr;
-	char *p;
-	int result;
-	int i, first, entries_seen, entries;
-	int entries_asked = (server->opt.max_xmit - 100) / SMB_DIRINFO_SIZE;
-	__u16 bcc;
-	__u16 count;
-	char status[SMB_STATUS_SIZE];
-	static struct qstr mask = {
-		.name	= "*.*",
-		.len	= 3,
-	};
-	unsigned char *last_status;
-	struct smb_request *req;
-	unsigned char *name_buf;
-
-	VERBOSE("%s/%s\n", DENTRY_PATH(dir));
-
-	lock_kernel();
-
-	result = -ENOMEM;
-	if (! (name_buf = kmalloc(SMB_MAXNAMELEN, GFP_KERNEL)))
-		goto out;
-
-	first = 1;
-	entries = 0;
-	entries_seen = 2; /* implicit . and .. */
-
-	result = -ENOMEM;
-	if (! (req = smb_alloc_request(server, server->opt.max_xmit)))
-		goto out_name;
-
-	while (1) {
-		p = smb_setup_header(req, SMBsearch, 2, 0);
-		WSET(req->rq_header, smb_vwv0, entries_asked);
-		WSET(req->rq_header, smb_vwv1, aDIR);
-		if (first == 1) {
-			result = smb_simple_encode_path(req, &p, dir, &mask);
-			if (result < 0)
-				goto out_free;
-			if (p + 3 > (char *)req->rq_buffer + req->rq_bufsize) {
-				result = -ENAMETOOLONG;
-				goto out_free;
-			}
-			*p++ = 5;
-			WSET(p, 0, 0);
-			p += 2;
-			first = 0;
-		} else {
-			if (p + 5 + SMB_STATUS_SIZE >
-			    (char *)req->rq_buffer + req->rq_bufsize) {
-				result = -ENAMETOOLONG;
-				goto out_free;
-			}
-				
-			*p++ = 4;
-			*p++ = 0;
-			*p++ = 5;
-			WSET(p, 0, SMB_STATUS_SIZE);
-			p += 2;
-			memcpy(p, status, SMB_STATUS_SIZE);
-			p += SMB_STATUS_SIZE;
-		}
-
-		smb_setup_bcc(req, p);
-
-		result = smb_request_ok(req, SMBsearch, 1, -1);
-		if (result < 0) {
-			if ((req->rq_rcls == ERRDOS) && 
-			    (req->rq_err  == ERRnofiles))
-				break;
-			goto out_free;
-		}
-		count = WVAL(req->rq_header, smb_vwv0);
-		if (count <= 0)
-			break;
-
-		result = -EIO;
-		bcc = smb_bcc(req->rq_header);
-		if (bcc != count * SMB_DIRINFO_SIZE + 3)
-			goto out_free;
-		p = req->rq_buffer + 3;
-
-
-		/* Make sure the response fits in the buffer. Fixed sized 
-		   entries means we don't have to check in the decode loop. */
-
-		last_status = req->rq_buffer + 3 + (count-1) * SMB_DIRINFO_SIZE;
-
-		if (last_status + SMB_DIRINFO_SIZE >=
-		    req->rq_buffer + req->rq_bufsize) {
-			printk(KERN_ERR "smb_proc_readdir_short: "
-			       "last dir entry outside buffer! "
-			       "%d@%p  %d@%p\n", SMB_DIRINFO_SIZE, last_status,
-			       req->rq_bufsize, req->rq_buffer);
-			goto out_free;
-		}
-
-		/* Read the last entry into the status field. */
-		memcpy(status, last_status, SMB_STATUS_SIZE);
-
-
-		/* Now we are ready to parse smb directory entries. */
-
-		for (i = 0; i < count; i++) {
-			p = smb_decode_short_dirent(server, p, 
-						    &qname, &fattr, name_buf);
-			if (qname.len == 0)
-				continue;
-
-			if (entries_seen == 2 && qname.name[0] == '.') {
-				if (qname.len == 1)
-					continue;
-				if (qname.name[1] == '.' && qname.len == 2)
-					continue;
-			}
-			if (!smb_fill_cache(filp, dirent, filldir, ctl, 
-					    &qname, &fattr))
-				;	/* stop reading? */
-			entries_seen++;
-		}
-	}
-	result = entries;
-
-out_free:
-	smb_rput(req);
-out_name:
-	kfree(name_buf);
-out:
-	unlock_kernel();
-	return result;
-}
-
-static void smb_decode_unix_basic(struct smb_fattr *fattr, struct smb_sb_info *server, char *p)
-{
-	u64 size, disk_bytes;
-
-	/* FIXME: verify nls support. all is sent as utf8? */
-
-	fattr->f_unix = 1;
-	fattr->f_mode = 0;
-
-	/* FIXME: use the uniqueID from the remote instead? */
-	/* 0 L file size in bytes */
-	/* 8 L file size on disk in bytes (block count) */
-	/* 40 L uid */
-	/* 48 L gid */
-	/* 56 W file type */
-	/* 60 L devmajor */
-	/* 68 L devminor */
-	/* 76 L unique ID (inode) */
-	/* 84 L permissions */
-	/* 92 L link count */
-
-	size = LVAL(p, 0);
-	disk_bytes = LVAL(p, 8);
-
-	/*
-	 * Some samba versions round up on-disk byte usage
-	 * to 1MB boundaries, making it useless. When seeing
-	 * that, use the size instead.
-	 */
-	if (!(disk_bytes & 0xfffff))
-		disk_bytes = size+511;
-
-	fattr->f_size = size;
-	fattr->f_blocks = disk_bytes >> 9;
-	fattr->f_ctime = smb_ntutc2unixutc(LVAL(p, 16));
-	fattr->f_atime = smb_ntutc2unixutc(LVAL(p, 24));
-	fattr->f_mtime = smb_ntutc2unixutc(LVAL(p, 32));
-
-	if (server->mnt->flags & SMB_MOUNT_UID)
-		fattr->f_uid = server->mnt->uid;
-	else
-		fattr->f_uid = LVAL(p, 40);
-
-	if (server->mnt->flags & SMB_MOUNT_GID)
-		fattr->f_gid = server->mnt->gid;
-	else
-		fattr->f_gid = LVAL(p, 48);
-
-	fattr->f_mode |= smb_filetype_to_mode(WVAL(p, 56));
-
-	if (S_ISBLK(fattr->f_mode) || S_ISCHR(fattr->f_mode)) {
-		__u64 major = LVAL(p, 60);
-		__u64 minor = LVAL(p, 68);
-
-		fattr->f_rdev = MKDEV(major & 0xffffffff, minor & 0xffffffff);
-		if (MAJOR(fattr->f_rdev) != (major & 0xffffffff) ||
-	    	MINOR(fattr->f_rdev) != (minor & 0xffffffff))
-			fattr->f_rdev = 0;
-	}
-
-	fattr->f_mode |= LVAL(p, 84);
-
-	if ( (server->mnt->flags & SMB_MOUNT_DMODE) &&
-	     (S_ISDIR(fattr->f_mode)) )
-		fattr->f_mode = (server->mnt->dir_mode & S_IRWXUGO) | S_IFDIR;
-	else if ( (server->mnt->flags & SMB_MOUNT_FMODE) &&
-	          !(S_ISDIR(fattr->f_mode)) )
-		fattr->f_mode = (server->mnt->file_mode & S_IRWXUGO) |
-				(fattr->f_mode & S_IFMT);
-
-}
-
-/*
- * Interpret a long filename structure using the specified info level:
- *   level 1 for anything below NT1 protocol
- *   level 260 for NT1 protocol
- *
- * qname is filled with the decoded, and possibly translated, name
- * fattr receives decoded attributes.
- *
- * Bugs Noted:
- * (1) Win NT 4.0 appends a null byte to names and counts it in the length!
- */
-static char *
-smb_decode_long_dirent(struct smb_sb_info *server, char *p, int level,
-		       struct qstr *qname, struct smb_fattr *fattr,
-		       unsigned char *name_buf)
-{
-	char *result;
-	unsigned int len = 0;
-	int n;
-	__u16 date, time;
-	int unicode = (server->mnt->flags & SMB_MOUNT_UNICODE);
-
-	/*
-	 * SMB doesn't have a concept of inode numbers ...
-	 */
-	smb_init_dirent(server, fattr);
-	fattr->f_ino = 0;	/* FIXME: do we need this? */
-
-	switch (level) {
-	case 1:
-		len = *((unsigned char *) p + 22);
-		qname->name = p + 23;
-		result = p + 24 + len;
-
-		date = WVAL(p, 0);
-		time = WVAL(p, 2);
-		fattr->f_ctime.tv_sec = date_dos2unix(server, date, time);
-		fattr->f_ctime.tv_nsec = 0;
-
-		date = WVAL(p, 4);
-		time = WVAL(p, 6);
-		fattr->f_atime.tv_sec = date_dos2unix(server, date, time);
-		fattr->f_atime.tv_nsec = 0;
-
-		date = WVAL(p, 8);
-		time = WVAL(p, 10);
-		fattr->f_mtime.tv_sec = date_dos2unix(server, date, time);
-		fattr->f_mtime.tv_nsec = 0;
-		fattr->f_size = DVAL(p, 12);
-		/* ULONG allocation size */
-		fattr->attr = WVAL(p, 20);
-
-		VERBOSE("info 1 at %p, len=%d, name=%.*s\n",
-			p, len, len, qname->name);
-		break;
-	case 260:
-		result = p + WVAL(p, 0);
-		len = DVAL(p, 60);
-		if (len > 255) len = 255;
-		/* NT4 null terminates, unless we are using unicode ... */
-		qname->name = p + 94;
-		if (!unicode && len && qname->name[len-1] == '\0')
-			len--;
-
-		fattr->f_ctime = smb_ntutc2unixutc(LVAL(p, 8));
-		fattr->f_atime = smb_ntutc2unixutc(LVAL(p, 16));
-		fattr->f_mtime = smb_ntutc2unixutc(LVAL(p, 24));
-		/* change time (32) */
-		fattr->f_size = LVAL(p, 40);
-		/* alloc size (48) */
-		fattr->attr = DVAL(p, 56);
-
-		VERBOSE("info 260 at %p, len=%d, name=%.*s\n",
-			p, len, len, qname->name);
-		break;
-	case SMB_FIND_FILE_UNIX:
-		result = p + WVAL(p, 0);
-		qname->name = p + 108;
-
-		len = strlen(qname->name);
-		/* FIXME: should we check the length?? */
-
-		p += 8;
-		smb_decode_unix_basic(fattr, server, p);
-		VERBOSE("info SMB_FIND_FILE_UNIX at %p, len=%d, name=%.*s\n",
-			p, len, len, qname->name);
-		break;
-	default:
-		PARANOIA("Unknown info level %d\n", level);
-		result = p + WVAL(p, 0);
-		goto out;
-	}
-
-	smb_finish_dirent(server, fattr);
-
-#if 0
-	/* FIXME: These only work for ascii chars, and recent smbmount doesn't
-	   allow the flag to be set anyway. Remove? */
-	switch (server->opt.case_handling) {
-	case SMB_CASE_UPPER:
-		str_upper(qname->name, len);
-		break;
-	case SMB_CASE_LOWER:
-		str_lower(qname->name, len);
-		break;
-	default:
-		break;
-	}
-#endif
-
-	qname->len = 0;
-	n = server->ops->convert(name_buf, SMB_MAXNAMELEN,
-				 qname->name, len,
-				 server->remote_nls, server->local_nls);
-	if (n > 0) {
-		qname->len = n;
-		qname->name = name_buf;
-	}
-
-out:
-	return result;
-}
-
-/* findfirst/findnext flags */
-#define SMB_CLOSE_AFTER_FIRST (1<<0)
-#define SMB_CLOSE_IF_END (1<<1)
-#define SMB_REQUIRE_RESUME_KEY (1<<2)
-#define SMB_CONTINUE_BIT (1<<3)
-
-/*
- * Note: samba-2.0.7 (at least) has a very similar routine, cli_list, in
- * source/libsmb/clilist.c. When looking for smb bugs in the readdir code,
- * go there for advise.
- *
- * Bugs Noted:
- * (1) When using Info Level 1 Win NT 4.0 truncates directory listings 
- * for certain patterns of names and/or lengths. The breakage pattern
- * is completely reproducible and can be toggled by the creation of a
- * single file. (E.g. echo hi >foo breaks, rm -f foo works.)
- */
-static int
-smb_proc_readdir_long(struct file *filp, void *dirent, filldir_t filldir,
-		      struct smb_cache_control *ctl)
-{
-	struct dentry *dir = filp->f_path.dentry;
-	struct smb_sb_info *server = server_from_dentry(dir);
-	struct qstr qname;
-	struct smb_fattr fattr;
-
-	unsigned char *p, *lastname;
-	char *mask, *param;
-	__u16 command;
-	int first, entries_seen;
-
-	/* Both NT and OS/2 accept info level 1 (but see note below). */
-	int info_level = 260;
-	const int max_matches = 512;
-
-	unsigned int ff_searchcount = 0;
-	unsigned int ff_eos = 0;
-	unsigned int ff_lastname = 0;
-	unsigned int ff_dir_handle = 0;
-	unsigned int loop_count = 0;
-	unsigned int mask_len, i;
-	int result;
-	struct smb_request *req;
-	unsigned char *name_buf;
-	static struct qstr star = {
-		.name	= "*",
-		.len	= 1,
-	};
-
-	lock_kernel();
-
-	/*
-	 * We always prefer unix style. Use info level 1 for older
-	 * servers that don't do 260.
-	 */
-	if (server->opt.capabilities & SMB_CAP_UNIX)
-		info_level = SMB_FIND_FILE_UNIX;
-	else if (server->opt.protocol < SMB_PROTOCOL_NT1)
-		info_level = 1;
-
-	result = -ENOMEM;
-	if (! (name_buf = kmalloc(SMB_MAXNAMELEN+2, GFP_KERNEL)))
-		goto out;
-	if (! (req = smb_alloc_request(server, server->opt.max_xmit)))
-		goto out_name;
-	param = req->rq_buffer;
-
-	/*
-	 * Encode the initial path
-	 */
-	mask = param + 12;
-
-	result = smb_encode_path(server, mask, SMB_MAXPATHLEN+1, dir, &star);
-	if (result <= 0)
-		goto out_free;
-	mask_len = result - 1;	/* mask_len is strlen, not #bytes */
-	result = 0;
-	first = 1;
-	VERBOSE("starting mask_len=%d, mask=%s\n", mask_len, mask);
-
-	entries_seen = 2;
-	ff_eos = 0;
-
-	while (ff_eos == 0) {
-		loop_count += 1;
-		if (loop_count > 10) {
-			printk(KERN_WARNING "smb_proc_readdir_long: "
-			       "Looping in FIND_NEXT??\n");
-			result = -EIO;
-			break;
-		}
-
-		if (first != 0) {
-			command = TRANSACT2_FINDFIRST;
-			WSET(param, 0, aSYSTEM | aHIDDEN | aDIR);
-			WSET(param, 2, max_matches);	/* max count */
-			WSET(param, 4, SMB_CLOSE_IF_END);
-			WSET(param, 6, info_level);
-			DSET(param, 8, 0);
-		} else {
-			command = TRANSACT2_FINDNEXT;
-
-			VERBOSE("handle=0x%X, lastname=%d, mask=%.*s\n",
-				ff_dir_handle, ff_lastname, mask_len, mask);
-
-			WSET(param, 0, ff_dir_handle);	/* search handle */
-			WSET(param, 2, max_matches);	/* max count */
-			WSET(param, 4, info_level);
-			DSET(param, 6, 0);
-			WSET(param, 10, SMB_CONTINUE_BIT|SMB_CLOSE_IF_END);
-		}
-
-		req->rq_trans2_command = command;
-		req->rq_ldata = 0;
-		req->rq_data  = NULL;
-		req->rq_lparm = 12 + mask_len + 1;
-		req->rq_parm  = param;
-		req->rq_flags = 0;
-		result = smb_add_request(req);
-		if (result < 0) {
-			PARANOIA("error=%d, breaking\n", result);
-			break;
-		}
-
-		if (req->rq_rcls == ERRSRV && req->rq_err == ERRerror) {
-			/* a damn Win95 bug - sometimes it clags if you 
-			   ask it too fast */
-			schedule_timeout_interruptible(msecs_to_jiffies(200));
-			continue;
-                }
-
-		if (req->rq_rcls != 0) {
-			result = smb_errno(req);
-			PARANOIA("name=%s, result=%d, rcls=%d, err=%d\n",
-				 mask, result, req->rq_rcls, req->rq_err);
-			break;
-		}
-
-		/* parse out some important return info */
-		if (first != 0) {
-			ff_dir_handle = WVAL(req->rq_parm, 0);
-			ff_searchcount = WVAL(req->rq_parm, 2);
-			ff_eos = WVAL(req->rq_parm, 4);
-			ff_lastname = WVAL(req->rq_parm, 8);
-		} else {
-			ff_searchcount = WVAL(req->rq_parm, 0);
-			ff_eos = WVAL(req->rq_parm, 2);
-			ff_lastname = WVAL(req->rq_parm, 6);
-		}
-
-		if (ff_searchcount == 0)
-			break;
-
-		/* Now we are ready to parse smb directory entries. */
-
-		/* point to the data bytes */
-		p = req->rq_data;
-		for (i = 0; i < ff_searchcount; i++) {
-			/* make sure we stay within the buffer */
-			if (p >= req->rq_data + req->rq_ldata) {
-				printk(KERN_ERR "smb_proc_readdir_long: "
-				       "dirent pointer outside buffer! "
-				       "%p  %d@%p\n",
-				       p, req->rq_ldata, req->rq_data);
-				result = -EIO; /* always a comm. error? */
-				goto out_free;
-			}
-
-			p = smb_decode_long_dirent(server, p, info_level,
-						   &qname, &fattr, name_buf);
-
-			/* ignore . and .. from the server */
-			if (entries_seen == 2 && qname.name[0] == '.') {
-				if (qname.len == 1)
-					continue;
-				if (qname.name[1] == '.' && qname.len == 2)
-					continue;
-			}
-
-			if (!smb_fill_cache(filp, dirent, filldir, ctl, 
-					    &qname, &fattr))
-				;	/* stop reading? */
-			entries_seen++;
-		}
-
-		VERBOSE("received %d entries, eos=%d\n", ff_searchcount,ff_eos);
-
-		/*
-		 * We might need the lastname for continuations.
-		 *
-		 * Note that some servers (win95?) point to the filename and
-		 * others (NT4, Samba using NT1) to the dir entry. We assume
-		 * here that those who do not point to a filename do not need
-		 * this info to continue the listing.
-		 *
-		 * OS/2 needs this and talks infolevel 1.
-		 * NetApps want lastname with infolevel 260.
-		 * win2k want lastname with infolevel 260, and points to
-		 *       the record not to the name.
-		 * Samba+CifsUnixExt doesn't need lastname.
-		 *
-		 * Both are happy if we return the data they point to. So we do.
-		 * (FIXME: above is not true with win2k)
-		 */
-		mask_len = 0;
-		if (info_level != SMB_FIND_FILE_UNIX &&
-		    ff_lastname > 0 && ff_lastname < req->rq_ldata) {
-			lastname = req->rq_data + ff_lastname;
-
-			switch (info_level) {
-			case 260:
-				mask_len = req->rq_ldata - ff_lastname;
-				break;
-			case 1:
-				/* lastname points to a length byte */
-				mask_len = *lastname++;
-				if (ff_lastname + 1 + mask_len > req->rq_ldata)
-					mask_len = req->rq_ldata - ff_lastname - 1;
-				break;
-			}
-
-			/*
-			 * Update the mask string for the next message.
-			 */
-			if (mask_len > 255)
-				mask_len = 255;
-			if (mask_len)
-				strncpy(mask, lastname, mask_len);
-		}
-		mask_len = strnlen(mask, mask_len);
-		VERBOSE("new mask, len=%d@%d of %d, mask=%.*s\n",
-			mask_len, ff_lastname, req->rq_ldata, mask_len, mask);
-
-		first = 0;
-		loop_count = 0;
-	}
-
-out_free:
-	smb_rput(req);
-out_name:
-	kfree(name_buf);
-out:
-	unlock_kernel();
-	return result;
-}
-
-/*
- * This version uses the trans2 TRANSACT2_FINDFIRST message 
- * to get the attribute data.
- *
- * Bugs Noted:
- */
-static int
-smb_proc_getattr_ff(struct smb_sb_info *server, struct dentry *dentry,
-			struct smb_fattr *fattr)
-{
-	char *param, *mask;
-	__u16 date, time;
-	int mask_len, result;
-	struct smb_request *req;
-
-	result = -ENOMEM;
-	if (! (req = smb_alloc_request(server, PAGE_SIZE)))
-		goto out;
-	param = req->rq_buffer;
-	mask = param + 12;
-
-	mask_len = smb_encode_path(server, mask, SMB_MAXPATHLEN+1, dentry,NULL);
-	if (mask_len < 0) {
-		result = mask_len;
-		goto out_free;
-	}
-	VERBOSE("name=%s, len=%d\n", mask, mask_len);
-	WSET(param, 0, aSYSTEM | aHIDDEN | aDIR);
-	WSET(param, 2, 1);	/* max count */
-	WSET(param, 4, 1);	/* close after this call */
-	WSET(param, 6, 1);	/* info_level */
-	DSET(param, 8, 0);
-
-	req->rq_trans2_command = TRANSACT2_FINDFIRST;
-	req->rq_ldata = 0;
-	req->rq_data  = NULL;
-	req->rq_lparm = 12 + mask_len;
-	req->rq_parm  = param;
-	req->rq_flags = 0;
-	result = smb_add_request(req);
-	if (result < 0)
-		goto out_free;
-	if (req->rq_rcls != 0) {
-		result = smb_errno(req);
-#ifdef SMBFS_PARANOIA
-		if (result != -ENOENT)
-			PARANOIA("error for %s, rcls=%d, err=%d\n",
-				 mask, req->rq_rcls, req->rq_err);
-#endif
-		goto out_free;
-	}
-	/* Make sure we got enough data ... */
-	result = -EINVAL;
-	if (req->rq_ldata < 22 || WVAL(req->rq_parm, 2) != 1) {
-		PARANOIA("bad result for %s, len=%d, count=%d\n",
-			 mask, req->rq_ldata, WVAL(req->rq_parm, 2));
-		goto out_free;
-	}
-
-	/*
-	 * Decode the response into the fattr ...
-	 */
-	date = WVAL(req->rq_data, 0);
-	time = WVAL(req->rq_data, 2);
-	fattr->f_ctime.tv_sec = date_dos2unix(server, date, time);
-	fattr->f_ctime.tv_nsec = 0;
-
-	date = WVAL(req->rq_data, 4);
-	time = WVAL(req->rq_data, 6);
-	fattr->f_atime.tv_sec = date_dos2unix(server, date, time);
-	fattr->f_atime.tv_nsec = 0;
-
-	date = WVAL(req->rq_data, 8);
-	time = WVAL(req->rq_data, 10);
-	fattr->f_mtime.tv_sec = date_dos2unix(server, date, time);
-	fattr->f_mtime.tv_nsec = 0;
-	VERBOSE("name=%s, date=%x, time=%x, mtime=%ld\n",
-		mask, date, time, fattr->f_mtime.tv_sec);
-	fattr->f_size = DVAL(req->rq_data, 12);
-	/* ULONG allocation size */
-	fattr->attr = WVAL(req->rq_data, 20);
-	result = 0;
-
-out_free:
-	smb_rput(req);
-out:
-	return result;
-}
-
-static int
-smb_proc_getattr_core(struct smb_sb_info *server, struct dentry *dir,
-		      struct smb_fattr *fattr)
-{
-	int result;
-	char *p;
-	struct smb_request *req;
-
-	result = -ENOMEM;
-	if (! (req = smb_alloc_request(server, PAGE_SIZE)))
-		goto out;
-
-	p = smb_setup_header(req, SMBgetatr, 0, 0);
-	result = smb_simple_encode_path(req, &p, dir, NULL);
-	if (result < 0)
- 		goto out_free;
-	smb_setup_bcc(req, p);
-
-	if ((result = smb_request_ok(req, SMBgetatr, 10, 0)) < 0)
-		goto out_free;
-	fattr->attr    = WVAL(req->rq_header, smb_vwv0);
-	fattr->f_mtime.tv_sec = local2utc(server, DVAL(req->rq_header, smb_vwv1));
-	fattr->f_mtime.tv_nsec = 0;
-	fattr->f_size  = DVAL(req->rq_header, smb_vwv3);
-	fattr->f_ctime = fattr->f_mtime; 
-	fattr->f_atime = fattr->f_mtime; 
-#ifdef SMBFS_DEBUG_TIMESTAMP
-	printk("getattr_core: %s/%s, mtime=%ld\n",
-	       DENTRY_PATH(dir), fattr->f_mtime);
-#endif
-	result = 0;
-
-out_free:
-	smb_rput(req);
-out:
-	return result;
-}
-
-/*
- * Bugs Noted:
- * (1) Win 95 swaps the date and time fields in the standard info level.
- */
-static int
-smb_proc_getattr_trans2(struct smb_sb_info *server, struct dentry *dir,
-			struct smb_request *req, int infolevel)
-{
-	char *p, *param;
-	int result;
-
-	param = req->rq_buffer;
-	WSET(param, 0, infolevel);
-	DSET(param, 2, 0);
-	result = smb_encode_path(server, param+6, SMB_MAXPATHLEN+1, dir, NULL);
-	if (result < 0)
-		goto out;
-	p = param + 6 + result;
-
-	req->rq_trans2_command = TRANSACT2_QPATHINFO;
-	req->rq_ldata = 0;
-	req->rq_data  = NULL;
-	req->rq_lparm = p - param;
-	req->rq_parm  = param;
-	req->rq_flags = 0;
-	result = smb_add_request(req);
-	if (result < 0)
-		goto out;
-	if (req->rq_rcls != 0) {
-		VERBOSE("for %s: result=%d, rcls=%d, err=%d\n",
-			&param[6], result, req->rq_rcls, req->rq_err);
-		result = smb_errno(req);
-		goto out;
-	}
-	result = -ENOENT;
-	if (req->rq_ldata < 22) {
-		PARANOIA("not enough data for %s, len=%d\n",
-			 &param[6], req->rq_ldata);
-		goto out;
-	}
-
-	result = 0;
-out:
-	return result;
-}
-
-static int
-smb_proc_getattr_trans2_std(struct smb_sb_info *server, struct dentry *dir,
-			    struct smb_fattr *attr)
-{
-	u16 date, time;
-	int off_date = 0, off_time = 2;
-	int result;
-	struct smb_request *req;
-
-	result = -ENOMEM;
-	if (! (req = smb_alloc_request(server, PAGE_SIZE)))
-		goto out;
-
-	result = smb_proc_getattr_trans2(server, dir, req, SMB_INFO_STANDARD);
-	if (result < 0)
-		goto out_free;
-
-	/*
-	 * Kludge alert: Win 95 swaps the date and time field,
-	 * contrary to the CIFS docs and Win NT practice.
-	 */
-	if (server->mnt->flags & SMB_MOUNT_WIN95) {
-		off_date = 2;
-		off_time = 0;
-	}
-	date = WVAL(req->rq_data, off_date);
-	time = WVAL(req->rq_data, off_time);
-	attr->f_ctime.tv_sec = date_dos2unix(server, date, time);
-	attr->f_ctime.tv_nsec = 0;
-
-	date = WVAL(req->rq_data, 4 + off_date);
-	time = WVAL(req->rq_data, 4 + off_time);
-	attr->f_atime.tv_sec = date_dos2unix(server, date, time);
-	attr->f_atime.tv_nsec = 0;
-
-	date = WVAL(req->rq_data, 8 + off_date);
-	time = WVAL(req->rq_data, 8 + off_time);
-	attr->f_mtime.tv_sec = date_dos2unix(server, date, time);
-	attr->f_mtime.tv_nsec = 0;
-#ifdef SMBFS_DEBUG_TIMESTAMP
-	printk(KERN_DEBUG "getattr_trans2: %s/%s, date=%x, time=%x, mtime=%ld\n",
-	       DENTRY_PATH(dir), date, time, attr->f_mtime);
-#endif
-	attr->f_size = DVAL(req->rq_data, 12);
-	attr->attr = WVAL(req->rq_data, 20);
-
-out_free:
-	smb_rput(req);
-out:
-	return result;
-}
-
-static int
-smb_proc_getattr_trans2_all(struct smb_sb_info *server, struct dentry *dir,
-			    struct smb_fattr *attr)
-{
-	struct smb_request *req;
-	int result;
-
-	result = -ENOMEM;
-	if (! (req = smb_alloc_request(server, PAGE_SIZE)))
-		goto out;
-
-	result = smb_proc_getattr_trans2(server, dir, req,
-					 SMB_QUERY_FILE_ALL_INFO);
-	if (result < 0)
-		goto out_free;
-
-	attr->f_ctime = smb_ntutc2unixutc(LVAL(req->rq_data, 0));
-	attr->f_atime = smb_ntutc2unixutc(LVAL(req->rq_data, 8));
-	attr->f_mtime = smb_ntutc2unixutc(LVAL(req->rq_data, 16));
-	/* change (24) */
-	attr->attr = WVAL(req->rq_data, 32);
-	/* pad? (34) */
-	/* allocated size (40) */
-	attr->f_size = LVAL(req->rq_data, 48);
-
-out_free:
-	smb_rput(req);
-out:
-	return result;
-}
-
-static int
-smb_proc_getattr_unix(struct smb_sb_info *server, struct dentry *dir,
-		      struct smb_fattr *attr)
-{
-	struct smb_request *req;
-	int result;
-
-	result = -ENOMEM;
-	if (! (req = smb_alloc_request(server, PAGE_SIZE)))
-		goto out;
-
-	result = smb_proc_getattr_trans2(server, dir, req,
-					 SMB_QUERY_FILE_UNIX_BASIC);
-	if (result < 0)
-		goto out_free;
-
-	smb_decode_unix_basic(attr, server, req->rq_data);
-
-out_free:
-	smb_rput(req);
-out:
-	return result;
-}
-
-static int
-smb_proc_getattr_95(struct smb_sb_info *server, struct dentry *dir,
-		    struct smb_fattr *attr)
-{
-	struct inode *inode = dir->d_inode;
-	int result;
-
-	/* FIXME: why not use the "all" version? */
-	result = smb_proc_getattr_trans2_std(server, dir, attr);
-	if (result < 0)
-		goto out;
-
-	/*
-	 * None of the getattr versions here can make win9x return the right
-	 * filesize if there are changes made to an open file.
-	 * A seek-to-end does return the right size, but we only need to do
-	 * that on files we have written.
-	 */
-	if (inode && SMB_I(inode)->flags & SMB_F_LOCALWRITE &&
-	    smb_is_open(inode))
-	{
-		__u16 fileid = SMB_I(inode)->fileid;
-		attr->f_size = smb_proc_seek(server, fileid, 2, 0);
-	}
-
-out:
-	return result;
-}
-
-static int
-smb_proc_ops_wait(struct smb_sb_info *server)
-{
-	int result;
-
-	result = wait_event_interruptible_timeout(server->conn_wq,
-				server->conn_complete, 30*HZ);
-
-	if (!result || signal_pending(current))
-		return -EIO;
-
-	return 0;
-}
-
-static int
-smb_proc_getattr_null(struct smb_sb_info *server, struct dentry *dir,
-			  struct smb_fattr *fattr)
-{
-	int result;
-
-	if (smb_proc_ops_wait(server) < 0)
-		return -EIO;
-
-	smb_init_dirent(server, fattr);
-	result = server->ops->getattr(server, dir, fattr);
-	smb_finish_dirent(server, fattr);
-
-	return result;
-}
-
-static int
-smb_proc_readdir_null(struct file *filp, void *dirent, filldir_t filldir,
-		      struct smb_cache_control *ctl)
-{
-	struct smb_sb_info *server = server_from_dentry(filp->f_path.dentry);
-
-	if (smb_proc_ops_wait(server) < 0)
-		return -EIO;
-
-	return server->ops->readdir(filp, dirent, filldir, ctl);
-}
-
-int
-smb_proc_getattr(struct dentry *dir, struct smb_fattr *fattr)
-{
-	struct smb_sb_info *server = server_from_dentry(dir);
-	int result;
-
-	smb_init_dirent(server, fattr);
-	result = server->ops->getattr(server, dir, fattr);
-	smb_finish_dirent(server, fattr);
-
-	return result;
-}
-
-
-/*
- * Because of bugs in the core protocol, we use this only to set
- * attributes. See smb_proc_settime() below for timestamp handling.
- *
- * Bugs Noted:
- * (1) If mtime is non-zero, both Win 3.1 and Win 95 fail
- * with an undocumented error (ERRDOS code 50). Setting
- * mtime to 0 allows the attributes to be set.
- * (2) The extra parameters following the name string aren't
- * in the CIFS docs, but seem to be necessary for operation.
- */
-static int
-smb_proc_setattr_core(struct smb_sb_info *server, struct dentry *dentry,
-		      __u16 attr)
-{
-	char *p;
-	int result;
-	struct smb_request *req;
-
-	result = -ENOMEM;
-	if (! (req = smb_alloc_request(server, PAGE_SIZE)))
-		goto out;
-
-	p = smb_setup_header(req, SMBsetatr, 8, 0);
-	WSET(req->rq_header, smb_vwv0, attr);
-	DSET(req->rq_header, smb_vwv1, 0); /* mtime */
-	WSET(req->rq_header, smb_vwv3, 0); /* reserved values */
-	WSET(req->rq_header, smb_vwv4, 0);
-	WSET(req->rq_header, smb_vwv5, 0);
-	WSET(req->rq_header, smb_vwv6, 0);
-	WSET(req->rq_header, smb_vwv7, 0);
-	result = smb_simple_encode_path(req, &p, dentry, NULL);
-	if (result < 0)
-		goto out_free;
-	if (p + 2 > (char *)req->rq_buffer + req->rq_bufsize) {
-		result = -ENAMETOOLONG;
-		goto out_free;
-	}
-	*p++ = 4;
-	*p++ = 0;
-	smb_setup_bcc(req, p);
-
-	result = smb_request_ok(req, SMBsetatr, 0, 0);
-	if (result < 0)
-		goto out_free;
-	result = 0;
-
-out_free:
-	smb_rput(req);
-out:
-	return result;
-}
-
-/*
- * Because of bugs in the trans2 setattr messages, we must set
- * attributes and timestamps separately. The core SMBsetatr
- * message seems to be the only reliable way to set attributes.
- */
-int
-smb_proc_setattr(struct dentry *dir, struct smb_fattr *fattr)
-{
-	struct smb_sb_info *server = server_from_dentry(dir);
-	int result;
-
-	VERBOSE("setting %s/%s, open=%d\n", 
-		DENTRY_PATH(dir), smb_is_open(dir->d_inode));
-	result = smb_proc_setattr_core(server, dir, fattr->attr);
-	return result;
-}
-
-/*
- * Sets the timestamps for an file open with write permissions.
- */
-static int
-smb_proc_setattr_ext(struct smb_sb_info *server,
-		      struct inode *inode, struct smb_fattr *fattr)
-{
-	__u16 date, time;
-	int result;
-	struct smb_request *req;
-
-	result = -ENOMEM;
-	if (! (req = smb_alloc_request(server, 0)))
-		goto out;
-
-	smb_setup_header(req, SMBsetattrE, 7, 0);
-	WSET(req->rq_header, smb_vwv0, SMB_I(inode)->fileid);
-	/* We don't change the creation time */
-	WSET(req->rq_header, smb_vwv1, 0);
-	WSET(req->rq_header, smb_vwv2, 0);
-	date_unix2dos(server, fattr->f_atime.tv_sec, &date, &time);
-	WSET(req->rq_header, smb_vwv3, date);
-	WSET(req->rq_header, smb_vwv4, time);
-	date_unix2dos(server, fattr->f_mtime.tv_sec, &date, &time);
-	WSET(req->rq_header, smb_vwv5, date);
-	WSET(req->rq_header, smb_vwv6, time);
-#ifdef SMBFS_DEBUG_TIMESTAMP
-	printk(KERN_DEBUG "smb_proc_setattr_ext: date=%d, time=%d, mtime=%ld\n",
-	       date, time, fattr->f_mtime);
-#endif
-
-	req->rq_flags |= SMB_REQ_NORETRY;
-	result = smb_request_ok(req, SMBsetattrE, 0, 0);
-	if (result < 0)
-		goto out_free;
-	result = 0;
-out_free:
-	smb_rput(req);
-out:
-	return result;
-}
-
-/*
- * Bugs Noted:
- * (1) The TRANSACT2_SETPATHINFO message under Win NT 4.0 doesn't
- * set the file's attribute flags.
- */
-static int
-smb_proc_setattr_trans2(struct smb_sb_info *server,
-			struct dentry *dir, struct smb_fattr *fattr)
-{
-	__u16 date, time;
-	char *p, *param;
-	int result;
-	char data[26];
-	struct smb_request *req;
-
-	result = -ENOMEM;
-	if (! (req = smb_alloc_request(server, PAGE_SIZE)))
-		goto out;
-	param = req->rq_buffer;
-
-	WSET(param, 0, 1);	/* Info level SMB_INFO_STANDARD */
-	DSET(param, 2, 0);
-	result = smb_encode_path(server, param+6, SMB_MAXPATHLEN+1, dir, NULL);
-	if (result < 0)
-		goto out_free;
-	p = param + 6 + result;
-
-	WSET(data, 0, 0); /* creation time */
-	WSET(data, 2, 0);
-	date_unix2dos(server, fattr->f_atime.tv_sec, &date, &time);
-	WSET(data, 4, date);
-	WSET(data, 6, time);
-	date_unix2dos(server, fattr->f_mtime.tv_sec, &date, &time);
-	WSET(data, 8, date);
-	WSET(data, 10, time);
-#ifdef SMBFS_DEBUG_TIMESTAMP
-	printk(KERN_DEBUG "setattr_trans2: %s/%s, date=%x, time=%x, mtime=%ld\n", 
-	       DENTRY_PATH(dir), date, time, fattr->f_mtime);
-#endif
-	DSET(data, 12, 0); /* size */
-	DSET(data, 16, 0); /* blksize */
-	WSET(data, 20, 0); /* attr */
-	DSET(data, 22, 0); /* ULONG EA size */
-
-	req->rq_trans2_command = TRANSACT2_SETPATHINFO;
-	req->rq_ldata = 26;
-	req->rq_data  = data;
-	req->rq_lparm = p - param;
-	req->rq_parm  = param;
-	req->rq_flags = 0;
-	result = smb_add_request(req);
-	if (result < 0)
-		goto out_free;
-	result = 0;
-	if (req->rq_rcls != 0)
-		result = smb_errno(req);
-
-out_free:
-	smb_rput(req);
-out:
-	return result;
-}
-
-/*
- * ATTR_MODE      0x001
- * ATTR_UID       0x002
- * ATTR_GID       0x004
- * ATTR_SIZE      0x008
- * ATTR_ATIME     0x010
- * ATTR_MTIME     0x020
- * ATTR_CTIME     0x040
- * ATTR_ATIME_SET 0x080
- * ATTR_MTIME_SET 0x100
- * ATTR_FORCE     0x200	
- * ATTR_ATTR_FLAG 0x400
- *
- * major/minor should only be set by mknod.
- */
-int
-smb_proc_setattr_unix(struct dentry *d, struct iattr *attr,
-		      unsigned int major, unsigned int minor)
-{
-	struct smb_sb_info *server = server_from_dentry(d);
-	u64 nttime;
-	char *p, *param;
-	int result;
-	char data[100];
-	struct smb_request *req;
-
-	result = -ENOMEM;
-	if (! (req = smb_alloc_request(server, PAGE_SIZE)))
-		goto out;
-	param = req->rq_buffer;
-
-	DEBUG1("valid flags = 0x%04x\n", attr->ia_valid);
-
-	WSET(param, 0, SMB_SET_FILE_UNIX_BASIC);
-	DSET(param, 2, 0);
-	result = smb_encode_path(server, param+6, SMB_MAXPATHLEN+1, d, NULL);
-	if (result < 0)
-		goto out_free;
-	p = param + 6 + result;
-
-	/* 0 L file size in bytes */
-	/* 8 L file size on disk in bytes (block count) */
-	/* 40 L uid */
-	/* 48 L gid */
-	/* 56 W file type enum */
-	/* 60 L devmajor */
-	/* 68 L devminor */
-	/* 76 L unique ID (inode) */
-	/* 84 L permissions */
-	/* 92 L link count */
-	LSET(data, 0, SMB_SIZE_NO_CHANGE);
-	LSET(data, 8, SMB_SIZE_NO_CHANGE);
-	LSET(data, 16, SMB_TIME_NO_CHANGE);
-	LSET(data, 24, SMB_TIME_NO_CHANGE);
-	LSET(data, 32, SMB_TIME_NO_CHANGE);
-	LSET(data, 40, SMB_UID_NO_CHANGE);
-	LSET(data, 48, SMB_GID_NO_CHANGE);
-	DSET(data, 56, smb_filetype_from_mode(attr->ia_mode));
-	LSET(data, 60, major);
-	LSET(data, 68, minor);
-	LSET(data, 76, 0);
-	LSET(data, 84, SMB_MODE_NO_CHANGE);
-	LSET(data, 92, 0);
-
-	if (attr->ia_valid & ATTR_SIZE) {
-		LSET(data, 0, attr->ia_size);
-		LSET(data, 8, 0); /* can't set anyway */
-	}
-
-	/*
-	 * FIXME: check the conversion function it the correct one
-	 *
-	 * we can't set ctime but we might as well pass this to the server
-	 * and let it ignore it.
-	 */
-	if (attr->ia_valid & ATTR_CTIME) {
-		nttime = smb_unixutc2ntutc(attr->ia_ctime);
-		LSET(data, 16, nttime);
-	}
-	if (attr->ia_valid & ATTR_ATIME) {
-		nttime = smb_unixutc2ntutc(attr->ia_atime);
-		LSET(data, 24, nttime);
-	}
-	if (attr->ia_valid & ATTR_MTIME) {
-		nttime = smb_unixutc2ntutc(attr->ia_mtime);
-		LSET(data, 32, nttime);
-	}
-	
-	if (attr->ia_valid & ATTR_UID) {
-		LSET(data, 40, attr->ia_uid);
-	}
-	if (attr->ia_valid & ATTR_GID) {
-		LSET(data, 48, attr->ia_gid); 
-	}
-	
-	if (attr->ia_valid & ATTR_MODE) {
-		LSET(data, 84, attr->ia_mode);
-	}
-
-	req->rq_trans2_command = TRANSACT2_SETPATHINFO;
-	req->rq_ldata = 100;
-	req->rq_data  = data;
-	req->rq_lparm = p - param;
-	req->rq_parm  = param;
-	req->rq_flags = 0;
-	result = smb_add_request(req);
-
-out_free:
-	smb_rput(req);
-out:
-	return result;
-}
-
-
-/*
- * Set the modify and access timestamps for a file.
- *
- * Incredibly enough, in all of SMB there is no message to allow
- * setting both attributes and timestamps at once. 
- *
- * Bugs Noted:
- * (1) Win 95 doesn't support the TRANSACT2_SETFILEINFO message 
- * with info level 1 (INFO_STANDARD).
- * (2) Win 95 seems not to support setting directory timestamps.
- * (3) Under the core protocol apparently the only way to set the
- * timestamp is to open and close the file.
- */
-int
-smb_proc_settime(struct dentry *dentry, struct smb_fattr *fattr)
-{
-	struct smb_sb_info *server = server_from_dentry(dentry);
-	struct inode *inode = dentry->d_inode;
-	int result;
-
-	VERBOSE("setting %s/%s, open=%d\n",
-		DENTRY_PATH(dentry), smb_is_open(inode));
-
-	/* setting the time on a Win95 server fails (tridge) */
-	if (server->opt.protocol >= SMB_PROTOCOL_LANMAN2 && 
-	    !(server->mnt->flags & SMB_MOUNT_WIN95)) {
-		if (smb_is_open(inode) && SMB_I(inode)->access != SMB_O_RDONLY)
-			result = smb_proc_setattr_ext(server, inode, fattr);
-		else
-			result = smb_proc_setattr_trans2(server, dentry, fattr);
-	} else {
-		/*
-		 * Fail silently on directories ... timestamp can't be set?
-		 */
-		result = 0;
-		if (S_ISREG(inode->i_mode)) {
-			/*
-			 * Set the mtime by opening and closing the file.
-			 * Note that the file is opened read-only, but this
-			 * still allows us to set the date (tridge)
-			 */
-			result = -EACCES;
-			if (!smb_is_open(inode))
-				smb_proc_open(server, dentry, SMB_O_RDONLY);
-			if (smb_is_open(inode)) {
-				inode->i_mtime = fattr->f_mtime;
-				result = smb_proc_close_inode(server, inode);
-			}
-		}
-	}
-
-	return result;
-}
-
-int
-smb_proc_dskattr(struct dentry *dentry, struct kstatfs *attr)
-{
-	struct smb_sb_info *server = SMB_SB(dentry->d_sb);
-	int result;
-	char *p;
-	long unit;
-	struct smb_request *req;
-
-	result = -ENOMEM;
-	if (! (req = smb_alloc_request(server, 0)))
-		goto out;
-
-	smb_setup_header(req, SMBdskattr, 0, 0);
-	if ((result = smb_request_ok(req, SMBdskattr, 5, 0)) < 0)
-		goto out_free;
-	p = SMB_VWV(req->rq_header);
-	unit = (WVAL(p, 2) * WVAL(p, 4)) >> SMB_ST_BLKSHIFT;
-	attr->f_blocks = WVAL(p, 0) * unit;
-	attr->f_bsize  = SMB_ST_BLKSIZE;
-	attr->f_bavail = attr->f_bfree = WVAL(p, 6) * unit;
-	result = 0;
-
-out_free:
-	smb_rput(req);
-out:
-	return result;
-}
-
-int
-smb_proc_read_link(struct smb_sb_info *server, struct dentry *d,
-		   char *buffer, int len)
-{
-	char *p, *param;
-	int result;
-	struct smb_request *req;
-
-	DEBUG1("readlink of %s/%s\n", DENTRY_PATH(d));
-
-	result = -ENOMEM;
-	if (! (req = smb_alloc_request(server, PAGE_SIZE)))
-		goto out;
-	param = req->rq_buffer;
-
-	WSET(param, 0, SMB_QUERY_FILE_UNIX_LINK);
-	DSET(param, 2, 0);
-	result = smb_encode_path(server, param+6, SMB_MAXPATHLEN+1, d, NULL);
-	if (result < 0)
-		goto out_free;
-	p = param + 6 + result;
-
-	req->rq_trans2_command = TRANSACT2_QPATHINFO;
-	req->rq_ldata = 0;
-	req->rq_data  = NULL;
-	req->rq_lparm = p - param;
-	req->rq_parm  = param;
-	req->rq_flags = 0;
-	result = smb_add_request(req);
-	if (result < 0)
-		goto out_free;
-	DEBUG1("for %s: result=%d, rcls=%d, err=%d\n",
-		&param[6], result, req->rq_rcls, req->rq_err);
-
-	/* copy data up to the \0 or buffer length */
-	result = len;
-	if (req->rq_ldata < len)
-		result = req->rq_ldata;
-	strncpy(buffer, req->rq_data, result);
-
-out_free:
-	smb_rput(req);
-out:
-	return result;
-}
-
-
-/*
- * Create a symlink object called dentry which points to oldpath.
- * Samba does not permit dangling links but returns a suitable error message.
- */
-int
-smb_proc_symlink(struct smb_sb_info *server, struct dentry *d,
-		 const char *oldpath)
-{
-	char *p, *param;
-	int result;
-	struct smb_request *req;
-
-	result = -ENOMEM;
-	if (! (req = smb_alloc_request(server, PAGE_SIZE)))
-		goto out;
-	param = req->rq_buffer;
-
-	WSET(param, 0, SMB_SET_FILE_UNIX_LINK);
-	DSET(param, 2, 0);
-	result = smb_encode_path(server, param + 6, SMB_MAXPATHLEN+1, d, NULL);
-	if (result < 0)
-		goto out_free;
-	p = param + 6 + result;
-
-	req->rq_trans2_command = TRANSACT2_SETPATHINFO;
-	req->rq_ldata = strlen(oldpath) + 1;
-	req->rq_data  = (char *) oldpath;
-	req->rq_lparm = p - param;
-	req->rq_parm  = param;
-	req->rq_flags = 0;
-	result = smb_add_request(req);
-	if (result < 0)
-		goto out_free;
-
-	DEBUG1("for %s: result=%d, rcls=%d, err=%d\n",
-		&param[6], result, req->rq_rcls, req->rq_err);
-	result = 0;
-
-out_free:
-	smb_rput(req);
-out:
-	return result;
-}
-
-/*
- * Create a hard link object called new_dentry which points to dentry.
- */
-int
-smb_proc_link(struct smb_sb_info *server, struct dentry *dentry,
-	      struct dentry *new_dentry)
-{
-	char *p, *param;
-	int result;
-	struct smb_request *req;
-
-	result = -ENOMEM;
-	if (! (req = smb_alloc_request(server, PAGE_SIZE)))
-		goto out;
-	param = req->rq_buffer;
-
-	WSET(param, 0, SMB_SET_FILE_UNIX_HLINK);
-	DSET(param, 2, 0);
-	result = smb_encode_path(server, param + 6, SMB_MAXPATHLEN+1,
-				 new_dentry, NULL);
-	if (result < 0)
-		goto out_free;
-	p = param + 6 + result;
-
-	/* Grr, pointless separation of parameters and data ... */
-	req->rq_data = p;
-	req->rq_ldata = smb_encode_path(server, p, SMB_MAXPATHLEN+1,
-					dentry, NULL);
-
-	req->rq_trans2_command = TRANSACT2_SETPATHINFO;
-	req->rq_lparm = p - param;
-	req->rq_parm  = param;
-	req->rq_flags = 0;
-	result = smb_add_request(req);
-	if (result < 0)
-		goto out_free;
-
-	DEBUG1("for %s: result=%d, rcls=%d, err=%d\n",
-	       &param[6], result, req->rq_rcls, req->rq_err);
-	result = 0;
-
-out_free:
-	smb_rput(req);
-out:
-	return result;
-}
-
-static int
-smb_proc_query_cifsunix(struct smb_sb_info *server)
-{
-	int result;
-	int major, minor;
-	u64 caps;
-	char param[2];
-	struct smb_request *req;
-
-	result = -ENOMEM;
-	if (! (req = smb_alloc_request(server, 100)))
-		goto out;
-
-	WSET(param, 0, SMB_QUERY_CIFS_UNIX_INFO);
-
-	req->rq_trans2_command = TRANSACT2_QFSINFO;
-	req->rq_ldata = 0;
-	req->rq_data  = NULL;
-	req->rq_lparm = 2;
-	req->rq_parm  = param;
-	req->rq_flags = 0;
-	result = smb_add_request(req);
-	if (result < 0)
-		goto out_free;
-
-	if (req->rq_ldata < 12) {
-		PARANOIA("Not enough data\n");
-		goto out_free;
-	}
-	major = WVAL(req->rq_data, 0);
-	minor = WVAL(req->rq_data, 2);
-
-	DEBUG1("Server implements CIFS Extensions for UNIX systems v%d.%d\n",
-	       major, minor);
-	/* FIXME: verify that we are ok with this major/minor? */
-
-	caps = LVAL(req->rq_data, 4);
-	DEBUG1("Server capabilities 0x%016llx\n", caps);
-
-out_free:
-	smb_rput(req);
-out:
-	return result;
-}
-
-
-static void
-install_ops(struct smb_ops *dst, struct smb_ops *src)
-{
-	memcpy(dst, src, sizeof(void *) * SMB_OPS_NUM_STATIC);
-}
-
-/* < LANMAN2 */
-static struct smb_ops smb_ops_core =
-{
-	.read		= smb_proc_read,
-	.write		= smb_proc_write,
-	.readdir	= smb_proc_readdir_short,
-	.getattr	= smb_proc_getattr_core,
-	.truncate	= smb_proc_trunc32,
-};
-
-/* LANMAN2, OS/2, others? */
-static struct smb_ops smb_ops_os2 =
-{
-	.read		= smb_proc_read,
-	.write		= smb_proc_write,
-	.readdir	= smb_proc_readdir_long,
-	.getattr	= smb_proc_getattr_trans2_std,
-	.truncate	= smb_proc_trunc32,
-};
-
-/* Win95, and possibly some NetApp versions too */
-static struct smb_ops smb_ops_win95 =
-{
-	.read		= smb_proc_read,    /* does not support 12word readX */
-	.write		= smb_proc_write,
-	.readdir	= smb_proc_readdir_long,
-	.getattr	= smb_proc_getattr_95,
-	.truncate	= smb_proc_trunc95,
-};
-
-/* Samba, NT4 and NT5 */
-static struct smb_ops smb_ops_winNT =
-{
-	.read		= smb_proc_readX,
-	.write		= smb_proc_writeX,
-	.readdir	= smb_proc_readdir_long,
-	.getattr	= smb_proc_getattr_trans2_all,
-	.truncate	= smb_proc_trunc64,
-};
-
-/* Samba w/ unix extensions. Others? */
-static struct smb_ops smb_ops_unix =
-{
-	.read		= smb_proc_readX,
-	.write		= smb_proc_writeX,
-	.readdir	= smb_proc_readdir_long,
-	.getattr	= smb_proc_getattr_unix,
-	/* FIXME: core/ext/time setattr needs to be cleaned up! */
-	/* .setattr	= smb_proc_setattr_unix, */
-	.truncate	= smb_proc_trunc64,
-};
-
-/* Place holder until real ops are in place */
-static struct smb_ops smb_ops_null =
-{
-	.readdir	= smb_proc_readdir_null,
-	.getattr	= smb_proc_getattr_null,
-};
-
-void smb_install_null_ops(struct smb_ops *ops)
-{
-	install_ops(ops, &smb_ops_null);
-}
diff --git a/fs/smbfs/proto.h b/fs/smbfs/proto.h
deleted file mode 100644
index 05939a6f43e..00000000000
--- a/fs/smbfs/proto.h
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- *  Autogenerated with cproto on:  Sat Sep 13 17:18:51 CEST 2003
- */
-
-struct smb_request;
-struct sock;
-struct statfs;
-
-/* proc.c */
-extern int smb_setcodepage(struct smb_sb_info *server, struct smb_nls_codepage *cp);
-extern __u32 smb_len(__u8 *p);
-extern int smb_get_rsize(struct smb_sb_info *server);
-extern int smb_get_wsize(struct smb_sb_info *server);
-extern int smb_errno(struct smb_request *req);
-extern int smb_newconn(struct smb_sb_info *server, struct smb_conn_opt *opt);
-extern __u8 *smb_setup_header(struct smb_request *req, __u8 command, __u16 wct, __u16 bcc);
-extern int smb_open(struct dentry *dentry, int wish);
-extern int smb_close(struct inode *ino);
-extern int smb_close_fileid(struct dentry *dentry, __u16 fileid);
-extern int smb_proc_create(struct dentry *dentry, __u16 attr, time_t ctime, __u16 *fileid);
-extern int smb_proc_mv(struct dentry *old_dentry, struct dentry *new_dentry);
-extern int smb_proc_mkdir(struct dentry *dentry);
-extern int smb_proc_rmdir(struct dentry *dentry);
-extern int smb_proc_unlink(struct dentry *dentry);
-extern int smb_proc_flush(struct smb_sb_info *server, __u16 fileid);
-extern void smb_init_root_dirent(struct smb_sb_info *server, struct smb_fattr *fattr,
-				 struct super_block *sb);
-extern int smb_proc_getattr(struct dentry *dir, struct smb_fattr *fattr);
-extern int smb_proc_setattr(struct dentry *dir, struct smb_fattr *fattr);
-extern int smb_proc_setattr_unix(struct dentry *d, struct iattr *attr, unsigned int major, unsigned int minor);
-extern int smb_proc_settime(struct dentry *dentry, struct smb_fattr *fattr);
-extern int smb_proc_dskattr(struct dentry *dentry, struct kstatfs *attr);
-extern int smb_proc_read_link(struct smb_sb_info *server, struct dentry *d, char *buffer, int len);
-extern int smb_proc_symlink(struct smb_sb_info *server, struct dentry *d, const char *oldpath);
-extern int smb_proc_link(struct smb_sb_info *server, struct dentry *dentry, struct dentry *new_dentry);
-extern void smb_install_null_ops(struct smb_ops *ops);
-/* dir.c */
-extern const struct file_operations smb_dir_operations;
-extern const struct inode_operations smb_dir_inode_operations;
-extern const struct inode_operations smb_dir_inode_operations_unix;
-extern void smb_new_dentry(struct dentry *dentry);
-extern void smb_renew_times(struct dentry *dentry);
-/* cache.c */
-extern void smb_invalid_dir_cache(struct inode *dir);
-extern void smb_invalidate_dircache_entries(struct dentry *parent);
-extern struct dentry *smb_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos);
-extern int smb_fill_cache(struct file *filp, void *dirent, filldir_t filldir, struct smb_cache_control *ctrl, struct qstr *qname, struct smb_fattr *entry);
-/* sock.c */
-extern void smb_data_ready(struct sock *sk, int len);
-extern int smb_valid_socket(struct inode *inode);
-extern void smb_close_socket(struct smb_sb_info *server);
-extern int smb_recv_available(struct smb_sb_info *server);
-extern int smb_receive_header(struct smb_sb_info *server);
-extern int smb_receive_drop(struct smb_sb_info *server);
-extern int smb_receive(struct smb_sb_info *server, struct smb_request *req);
-extern int smb_send_request(struct smb_request *req);
-/* inode.c */
-extern struct inode *smb_iget(struct super_block *sb, struct smb_fattr *fattr);
-extern void smb_get_inode_attr(struct inode *inode, struct smb_fattr *fattr);
-extern void smb_set_inode_attr(struct inode *inode, struct smb_fattr *fattr);
-extern void smb_invalidate_inodes(struct smb_sb_info *server);
-extern int smb_revalidate_inode(struct dentry *dentry);
-extern int smb_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat);
-extern int smb_notify_change(struct dentry *dentry, struct iattr *attr);
-/* file.c */
-extern const struct address_space_operations smb_file_aops;
-extern const struct file_operations smb_file_operations;
-extern const struct inode_operations smb_file_inode_operations;
-/* ioctl.c */
-extern long smb_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
-/* smbiod.c */
-extern void smbiod_wake_up(void);
-extern int smbiod_register_server(struct smb_sb_info *server);
-extern void smbiod_unregister_server(struct smb_sb_info *server);
-extern void smbiod_flush(struct smb_sb_info *server);
-extern int smbiod_retry(struct smb_sb_info *server);
-/* request.c */
-extern int smb_init_request_cache(void);
-extern void smb_destroy_request_cache(void);
-extern struct smb_request *smb_alloc_request(struct smb_sb_info *server, int bufsize);
-extern void smb_rput(struct smb_request *req);
-extern int smb_add_request(struct smb_request *req);
-extern int smb_request_send_server(struct smb_sb_info *server);
-extern int smb_request_recv(struct smb_sb_info *server);
-/* symlink.c */
-extern int smb_symlink(struct inode *inode, struct dentry *dentry, const char *oldname);
-extern const struct inode_operations smb_link_inode_operations;
diff --git a/fs/smbfs/request.c b/fs/smbfs/request.c
deleted file mode 100644
index 45f45933e86..00000000000
--- a/fs/smbfs/request.c
+++ /dev/null
@@ -1,818 +0,0 @@
-/*
- *  request.c
- *
- *  Copyright (C) 2001 by Urban Widmark
- *
- *  Please add a note about your changes to smbfs in the ChangeLog file.
- */
-
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/fs.h>
-#include <linux/slab.h>
-#include <linux/net.h>
-#include <linux/sched.h>
-
-#include <linux/smb_fs.h>
-#include <linux/smbno.h>
-#include <linux/smb_mount.h>
-
-#include "smb_debug.h"
-#include "request.h"
-#include "proto.h"
-
-/* #define SMB_SLAB_DEBUG	(SLAB_RED_ZONE | SLAB_POISON) */
-#define SMB_SLAB_DEBUG	0
-
-/* cache for request structures */
-static struct kmem_cache *req_cachep;
-
-static int smb_request_send_req(struct smb_request *req);
-
-/*
-  /proc/slabinfo:
-  name, active, num, objsize, active_slabs, num_slaps, #pages
-*/
-
-
-int smb_init_request_cache(void)
-{
-	req_cachep = kmem_cache_create("smb_request",
-				       sizeof(struct smb_request), 0,
-				       SMB_SLAB_DEBUG | SLAB_HWCACHE_ALIGN,
-				       NULL);
-	if (req_cachep == NULL)
-		return -ENOMEM;
-
-	return 0;
-}
-
-void smb_destroy_request_cache(void)
-{
-	kmem_cache_destroy(req_cachep);
-}
-
-/*
- * Allocate and initialise a request structure
- */
-static struct smb_request *smb_do_alloc_request(struct smb_sb_info *server,
-						int bufsize)
-{
-	struct smb_request *req;
-	unsigned char *buf = NULL;
-
-	req = kmem_cache_zalloc(req_cachep, GFP_KERNEL);
-	VERBOSE("allocating request: %p\n", req);
-	if (!req)
-		goto out;
-
-	if (bufsize > 0) {
-		buf = kmalloc(bufsize, GFP_NOFS);
-		if (!buf) {
-			kmem_cache_free(req_cachep, req);
-			return NULL;
-		}
-	}
-
-	req->rq_buffer = buf;
-	req->rq_bufsize = bufsize;
-	req->rq_server = server;
-	init_waitqueue_head(&req->rq_wait);
-	INIT_LIST_HEAD(&req->rq_queue);
-	atomic_set(&req->rq_count, 1);
-
-out:
-	return req;
-}
-
-struct smb_request *smb_alloc_request(struct smb_sb_info *server, int bufsize)
-{
-	struct smb_request *req = NULL;
-
-	for (;;) {
-		atomic_inc(&server->nr_requests);
-		if (atomic_read(&server->nr_requests) <= MAX_REQUEST_HARD) {
-			req = smb_do_alloc_request(server, bufsize);
-			if (req != NULL)
-				break;
-		}
-
-#if 0
-		/*
-		 * Try to free up at least one request in order to stay
-		 * below the hard limit
-		 */
-                if (nfs_try_to_free_pages(server))
-			continue;
-
-		if (fatal_signal_pending(current))
-			return ERR_PTR(-ERESTARTSYS);
-		current->policy = SCHED_YIELD;
-		schedule();
-#else
-		/* FIXME: we want something like nfs does above, but that
-		   requires changes to all callers and can wait. */
-		break;
-#endif
-	}
-	return req;
-}
-
-static void smb_free_request(struct smb_request *req)
-{
-	atomic_dec(&req->rq_server->nr_requests);
-	if (req->rq_buffer && !(req->rq_flags & SMB_REQ_STATIC))
-		kfree(req->rq_buffer);
-	kfree(req->rq_trans2buffer);
-	kmem_cache_free(req_cachep, req);
-}
-
-/*
- * What prevents a rget to race with a rput? The count must never drop to zero
- * while it is in use. Only rput if it is ok that it is free'd.
- */
-static void smb_rget(struct smb_request *req)
-{
-	atomic_inc(&req->rq_count);
-}
-void smb_rput(struct smb_request *req)
-{
-	if (atomic_dec_and_test(&req->rq_count)) {
-		list_del_init(&req->rq_queue);
-		smb_free_request(req);
-	}
-}
-
-/* setup to receive the data part of the SMB */
-static int smb_setup_bcc(struct smb_request *req)
-{
-	int result = 0;
-	req->rq_rlen = smb_len(req->rq_header) + 4 - req->rq_bytes_recvd;
-
-	if (req->rq_rlen > req->rq_bufsize) {
-		PARANOIA("Packet too large %d > %d\n",
-			 req->rq_rlen, req->rq_bufsize);
-		return -ENOBUFS;
-	}
-
-	req->rq_iov[0].iov_base = req->rq_buffer;
-	req->rq_iov[0].iov_len  = req->rq_rlen;
-	req->rq_iovlen = 1;
-
-	return result;
-}
-
-/*
- * Prepare a "normal" request structure.
- */
-static int smb_setup_request(struct smb_request *req)
-{
-	int len = smb_len(req->rq_header) + 4;
-	req->rq_slen = len;
-
-	/* if we expect a data part in the reply we set the iov's to read it */
-	if (req->rq_resp_bcc)
-		req->rq_setup_read = smb_setup_bcc;
-
-	/* This tries to support re-using the same request */
-	req->rq_bytes_sent = 0;
-	req->rq_rcls = 0;
-	req->rq_err = 0;
-	req->rq_errno = 0;
-	req->rq_fragment = 0;
-	kfree(req->rq_trans2buffer);
-	req->rq_trans2buffer = NULL;
-
-	return 0;
-}
-
-/*
- * Prepare a transaction2 request structure
- */
-static int smb_setup_trans2request(struct smb_request *req)
-{
-	struct smb_sb_info *server = req->rq_server;
-	int mparam, mdata;
-	static unsigned char padding[4];
-
-	/* I know the following is very ugly, but I want to build the
-	   smb packet as efficiently as possible. */
-
-	const int smb_parameters = 15;
-	const int header = SMB_HEADER_LEN + 2 * smb_parameters + 2;
-	const int oparam = ALIGN(header + 3, sizeof(u32));
-	const int odata  = ALIGN(oparam + req->rq_lparm, sizeof(u32));
-	const int bcc = (req->rq_data ? odata + req->rq_ldata :
-					oparam + req->rq_lparm) - header;
-
-	if ((bcc + oparam) > server->opt.max_xmit)
-		return -ENOMEM;
-	smb_setup_header(req, SMBtrans2, smb_parameters, bcc);
-
-	/*
-	 * max parameters + max data + max setup == bufsize to make NT4 happy
-	 * and not abort the transfer or split into multiple responses. It also
-	 * makes smbfs happy as handling packets larger than the buffer size
-	 * is extra work.
-	 *
-	 * OS/2 is probably going to hate me for this ...
-	 */
-	mparam = SMB_TRANS2_MAX_PARAM;
-	mdata = req->rq_bufsize - mparam;
-
-	mdata = server->opt.max_xmit - mparam - 100;
-	if (mdata < 1024) {
-		mdata = 1024;
-		mparam = 20;
-	}
-
-#if 0
-	/* NT/win2k has ~4k max_xmit, so with this we request more than it wants
-	   to return as one SMB. Useful for testing the fragmented trans2
-	   handling. */
-	mdata = 8192;
-#endif
-
-	WSET(req->rq_header, smb_tpscnt, req->rq_lparm);
-	WSET(req->rq_header, smb_tdscnt, req->rq_ldata);
-	WSET(req->rq_header, smb_mprcnt, mparam);
-	WSET(req->rq_header, smb_mdrcnt, mdata);
-	WSET(req->rq_header, smb_msrcnt, 0);    /* max setup always 0 ? */
-	WSET(req->rq_header, smb_flags, 0);
-	DSET(req->rq_header, smb_timeout, 0);
-	WSET(req->rq_header, smb_pscnt, req->rq_lparm);
-	WSET(req->rq_header, smb_psoff, oparam - 4);
-	WSET(req->rq_header, smb_dscnt, req->rq_ldata);
-	WSET(req->rq_header, smb_dsoff, req->rq_data ? odata - 4 : 0);
-	*(req->rq_header + smb_suwcnt) = 0x01;          /* setup count */
-	*(req->rq_header + smb_suwcnt + 1) = 0x00;      /* reserved */
-	WSET(req->rq_header, smb_setup0, req->rq_trans2_command);
-
-	req->rq_iovlen = 2;
-	req->rq_iov[0].iov_base = (void *) req->rq_header;
-	req->rq_iov[0].iov_len = oparam;
-	req->rq_iov[1].iov_base = (req->rq_parm==NULL) ? padding : req->rq_parm;
-	req->rq_iov[1].iov_len = req->rq_lparm;
-	req->rq_slen = oparam + req->rq_lparm;
-
-	if (req->rq_data) {
-		req->rq_iovlen += 2;
-		req->rq_iov[2].iov_base = padding;
-		req->rq_iov[2].iov_len = odata - oparam - req->rq_lparm;
-		req->rq_iov[3].iov_base = req->rq_data;
-		req->rq_iov[3].iov_len = req->rq_ldata;
-		req->rq_slen = odata + req->rq_ldata;
-	}
-
-	/* always a data part for trans2 replies */
-	req->rq_setup_read = smb_setup_bcc;
-
-	return 0;
-}
-
-/*
- * Add a request and tell smbiod to process it
- */
-int smb_add_request(struct smb_request *req)
-{
-	long timeleft;
-	struct smb_sb_info *server = req->rq_server;
-	int result = 0;
-
-	smb_setup_request(req);
-	if (req->rq_trans2_command) {
-		if (req->rq_buffer == NULL) {
-			PARANOIA("trans2 attempted without response buffer!\n");
-			return -EIO;
-		}
-		result = smb_setup_trans2request(req);
-	}
-	if (result < 0)
-		return result;
-
-#ifdef SMB_DEBUG_PACKET_SIZE
-	add_xmit_stats(req);
-#endif
-
-	/* add 'req' to the queue of requests */
-	if (smb_lock_server_interruptible(server))
-		return -EINTR;
-
-	/*
-	 * Try to send the request as the process. If that fails we queue the
-	 * request and let smbiod send it later.
-	 */
-
-	/* FIXME: each server has a number on the maximum number of parallel
-	   requests. 10, 50 or so. We should not allow more requests to be
-	   active. */
-	if (server->mid > 0xf000)
-		server->mid = 0;
-	req->rq_mid = server->mid++;
-	WSET(req->rq_header, smb_mid, req->rq_mid);
-
-	result = 0;
-	if (server->state == CONN_VALID) {
-		if (list_empty(&server->xmitq))
-			result = smb_request_send_req(req);
-		if (result < 0) {
-			/* Connection lost? */
-			server->conn_error = result;
-			server->state = CONN_INVALID;
-		}
-	}
-	if (result != 1)
-		list_add_tail(&req->rq_queue, &server->xmitq);
-	smb_rget(req);
-
-	if (server->state != CONN_VALID)
-		smbiod_retry(server);
-
-	smb_unlock_server(server);
-
-	smbiod_wake_up();
-
-	timeleft = wait_event_interruptible_timeout(req->rq_wait,
-				    req->rq_flags & SMB_REQ_RECEIVED, 30*HZ);
-	if (!timeleft || signal_pending(current)) {
-		/*
-		 * On timeout or on interrupt we want to try and remove the
-		 * request from the recvq/xmitq.
-		 * First check if the request is still part of a queue. (May
-		 * have been removed by some error condition)
-		 */
-		smb_lock_server(server);
-		if (!list_empty(&req->rq_queue)) {
-			list_del_init(&req->rq_queue);
-			smb_rput(req);
-		}
-		smb_unlock_server(server);
-	}
-
-	if (!timeleft) {
-		PARANOIA("request [%p, mid=%d] timed out!\n",
-			 req, req->rq_mid);
-		VERBOSE("smb_com:  %02x\n", *(req->rq_header + smb_com));
-		VERBOSE("smb_rcls: %02x\n", *(req->rq_header + smb_rcls));
-		VERBOSE("smb_flg:  %02x\n", *(req->rq_header + smb_flg));
-		VERBOSE("smb_tid:  %04x\n", WVAL(req->rq_header, smb_tid));
-		VERBOSE("smb_pid:  %04x\n", WVAL(req->rq_header, smb_pid));
-		VERBOSE("smb_uid:  %04x\n", WVAL(req->rq_header, smb_uid));
-		VERBOSE("smb_mid:  %04x\n", WVAL(req->rq_header, smb_mid));
-		VERBOSE("smb_wct:  %02x\n", *(req->rq_header + smb_wct));
-
-		req->rq_rcls = ERRSRV;
-		req->rq_err  = ERRtimeout;
-
-		/* Just in case it was "stuck" */
-		smbiod_wake_up();
-	}
-	VERBOSE("woke up, rcls=%d\n", req->rq_rcls);
-
-	if (req->rq_rcls != 0)
-		req->rq_errno = smb_errno(req);
-	if (signal_pending(current))
-		req->rq_errno = -ERESTARTSYS;
-	return req->rq_errno;
-}
-
-/*
- * Send a request and place it on the recvq if successfully sent.
- * Must be called with the server lock held.
- */
-static int smb_request_send_req(struct smb_request *req)
-{
-	struct smb_sb_info *server = req->rq_server;
-	int result;
-
-	if (req->rq_bytes_sent == 0) {
-		WSET(req->rq_header, smb_tid, server->opt.tid);
-		WSET(req->rq_header, smb_pid, 1);
-		WSET(req->rq_header, smb_uid, server->opt.server_uid);
-	}
-
-	result = smb_send_request(req);
-	if (result < 0 && result != -EAGAIN)
-		goto out;
-
-	result = 0;
-	if (!(req->rq_flags & SMB_REQ_TRANSMITTED))
-		goto out;
-
-	list_move_tail(&req->rq_queue, &server->recvq);
-	result = 1;
-out:
-	return result;
-}
-
-/*
- * Sends one request for this server. (smbiod)
- * Must be called with the server lock held.
- * Returns: <0 on error
- *           0 if no request could be completely sent
- *           1 if all data for one request was sent
- */
-int smb_request_send_server(struct smb_sb_info *server)
-{
-	struct list_head *head;
-	struct smb_request *req;
-	int result;
-
-	if (server->state != CONN_VALID)
-		return 0;
-
-	/* dequeue first request, if any */
-	req = NULL;
-	head = server->xmitq.next;
-	if (head != &server->xmitq) {
-		req = list_entry(head, struct smb_request, rq_queue);
-	}
-	if (!req)
-		return 0;
-
-	result = smb_request_send_req(req);
-	if (result < 0) {
-		server->conn_error = result;
-		list_move(&req->rq_queue, &server->xmitq);
-		result = -EIO;
-		goto out;
-	}
-
-out:
-	return result;
-}
-
-/*
- * Try to find a request matching this "mid". Typically the first entry will
- * be the matching one.
- */
-static struct smb_request *find_request(struct smb_sb_info *server, int mid)
-{
-	struct list_head *tmp;
-	struct smb_request *req = NULL;
-
-	list_for_each(tmp, &server->recvq) {
-		req = list_entry(tmp, struct smb_request, rq_queue);
-		if (req->rq_mid == mid) {
-			break;
-		}
-		req = NULL;
-	}
-
-	if (!req) {
-		VERBOSE("received reply with mid %d but no request!\n",
-			WVAL(server->header, smb_mid));
-		server->rstate = SMB_RECV_DROP;
-	}
-
-	return req;
-}
-
-/*
- * Called when we have read the smb header and believe this is a response.
- */
-static int smb_init_request(struct smb_sb_info *server, struct smb_request *req)
-{
-	int hdrlen, wct;
-
-	memcpy(req->rq_header, server->header, SMB_HEADER_LEN);
-
-	wct = *(req->rq_header + smb_wct);
-	if (wct > 20) {	
-		PARANOIA("wct too large, %d > 20\n", wct);
-		server->rstate = SMB_RECV_DROP;
-		return 0;
-	}
-
-	req->rq_resp_wct = wct;
-	hdrlen = SMB_HEADER_LEN + wct*2 + 2;
-	VERBOSE("header length: %d   smb_wct: %2d\n", hdrlen, wct);
-
-	req->rq_bytes_recvd = SMB_HEADER_LEN;
-	req->rq_rlen = hdrlen;
-	req->rq_iov[0].iov_base = req->rq_header;
-	req->rq_iov[0].iov_len  = hdrlen;
-	req->rq_iovlen = 1;
-	server->rstate = SMB_RECV_PARAM;
-
-#ifdef SMB_DEBUG_PACKET_SIZE
-	add_recv_stats(smb_len(server->header));
-#endif
-	return 0;
-}
-
-/*
- * Reads the SMB parameters
- */
-static int smb_recv_param(struct smb_sb_info *server, struct smb_request *req)
-{
-	int result;
-
-	result = smb_receive(server, req);
-	if (result < 0)
-		return result;
-	if (req->rq_bytes_recvd < req->rq_rlen)
-		return 0;
-
-	VERBOSE("result: %d   smb_bcc:  %04x\n", result,
-		WVAL(req->rq_header, SMB_HEADER_LEN +
-		     (*(req->rq_header + smb_wct) * 2)));
-
-	result = 0;
-	req->rq_iov[0].iov_base = NULL;
-	req->rq_rlen = 0;
-	if (req->rq_callback)
-		req->rq_callback(req);
-	else if (req->rq_setup_read)
-		result = req->rq_setup_read(req);
-	if (result < 0) {
-		server->rstate = SMB_RECV_DROP;
-		return result;
-	}
-
-	server->rstate = req->rq_rlen > 0 ? SMB_RECV_DATA : SMB_RECV_END;
-
-	req->rq_bytes_recvd = 0;	// recvd out of the iov
-
-	VERBOSE("rlen: %d\n", req->rq_rlen);
-	if (req->rq_rlen < 0) {
-		PARANOIA("Parameters read beyond end of packet!\n");
-		server->rstate = SMB_RECV_END;
-		return -EIO;
-	}
-	return 0;
-}
-
-/*
- * Reads the SMB data
- */
-static int smb_recv_data(struct smb_sb_info *server, struct smb_request *req)
-{
-	int result;
-
-	result = smb_receive(server, req);
-	if (result < 0)
-		goto out;
-	if (req->rq_bytes_recvd < req->rq_rlen)
-		goto out;
-	server->rstate = SMB_RECV_END;
-out:
-	VERBOSE("result: %d\n", result);
-	return result;
-}
-
-/*
- * Receive a transaction2 response
- * Return: 0 if the response has been fully read
- *         1 if there are further "fragments" to read
- *        <0 if there is an error
- */
-static int smb_recv_trans2(struct smb_sb_info *server, struct smb_request *req)
-{
-	unsigned char *inbuf;
-	unsigned int parm_disp, parm_offset, parm_count, parm_tot;
-	unsigned int data_disp, data_offset, data_count, data_tot;
-	int hdrlen = SMB_HEADER_LEN + req->rq_resp_wct*2 - 2;
-
-	VERBOSE("handling trans2\n");
-
-	inbuf = req->rq_header;
-	data_tot    = WVAL(inbuf, smb_tdrcnt);
-	parm_tot    = WVAL(inbuf, smb_tprcnt);
-	parm_disp   = WVAL(inbuf, smb_prdisp);
-	parm_offset = WVAL(inbuf, smb_proff);
-	parm_count  = WVAL(inbuf, smb_prcnt);
-	data_disp   = WVAL(inbuf, smb_drdisp);
-	data_offset = WVAL(inbuf, smb_droff);
-	data_count  = WVAL(inbuf, smb_drcnt);
-
-	/* Modify offset for the split header/buffer we use */
-	if (data_count || data_offset) {
-		if (unlikely(data_offset < hdrlen))
-			goto out_bad_data;
-		else
-			data_offset -= hdrlen;
-	}
-	if (parm_count || parm_offset) {
-		if (unlikely(parm_offset < hdrlen))
-			goto out_bad_parm;
-		else
-			parm_offset -= hdrlen;
-	}
-
-	if (parm_count == parm_tot && data_count == data_tot) {
-		/*
-		 * This packet has all the trans2 data.
-		 *
-		 * We setup the request so that this will be the common
-		 * case. It may be a server error to not return a
-		 * response that fits.
-		 */
-		VERBOSE("single trans2 response  "
-			"dcnt=%u, pcnt=%u, doff=%u, poff=%u\n",
-			data_count, parm_count,
-			data_offset, parm_offset);
-		req->rq_ldata = data_count;
-		req->rq_lparm = parm_count;
-		req->rq_data = req->rq_buffer + data_offset;
-		req->rq_parm = req->rq_buffer + parm_offset;
-		if (unlikely(parm_offset + parm_count > req->rq_rlen))
-			goto out_bad_parm;
-		if (unlikely(data_offset + data_count > req->rq_rlen))
-			goto out_bad_data;
-		return 0;
-	}
-
-	VERBOSE("multi trans2 response  "
-		"frag=%d, dcnt=%u, pcnt=%u, doff=%u, poff=%u\n",
-		req->rq_fragment,
-		data_count, parm_count,
-		data_offset, parm_offset);
-
-	if (!req->rq_fragment) {
-		int buf_len;
-
-		/* We got the first trans2 fragment */
-		req->rq_fragment = 1;
-		req->rq_total_data = data_tot;
-		req->rq_total_parm = parm_tot;
-		req->rq_ldata = 0;
-		req->rq_lparm = 0;
-
-		buf_len = data_tot + parm_tot;
-		if (buf_len > SMB_MAX_PACKET_SIZE)
-			goto out_too_long;
-
-		req->rq_trans2bufsize = buf_len;
-		req->rq_trans2buffer = kzalloc(buf_len, GFP_NOFS);
-		if (!req->rq_trans2buffer)
-			goto out_no_mem;
-
-		req->rq_parm = req->rq_trans2buffer;
-		req->rq_data = req->rq_trans2buffer + parm_tot;
-	} else if (unlikely(req->rq_total_data < data_tot ||
-			    req->rq_total_parm < parm_tot))
-		goto out_data_grew;
-
-	if (unlikely(parm_disp + parm_count > req->rq_total_parm ||
-		     parm_offset + parm_count > req->rq_rlen))
-		goto out_bad_parm;
-	if (unlikely(data_disp + data_count > req->rq_total_data ||
-		     data_offset + data_count > req->rq_rlen))
-		goto out_bad_data;
-
-	inbuf = req->rq_buffer;
-	memcpy(req->rq_parm + parm_disp, inbuf + parm_offset, parm_count);
-	memcpy(req->rq_data + data_disp, inbuf + data_offset, data_count);
-
-	req->rq_ldata += data_count;
-	req->rq_lparm += parm_count;
-
-	/*
-	 * Check whether we've received all of the data. Note that
-	 * we use the packet totals -- total lengths might shrink!
-	 */
-	if (req->rq_ldata >= data_tot && req->rq_lparm >= parm_tot) {
-		req->rq_ldata = data_tot;
-		req->rq_lparm = parm_tot;
-		return 0;
-	}
-	return 1;
-
-out_too_long:
-	printk(KERN_ERR "smb_trans2: data/param too long, data=%u, parm=%u\n",
-		data_tot, parm_tot);
-	goto out_EIO;
-out_no_mem:
-	printk(KERN_ERR "smb_trans2: couldn't allocate data area of %d bytes\n",
-	       req->rq_trans2bufsize);
-	req->rq_errno = -ENOMEM;
-	goto out;
-out_data_grew:
-	printk(KERN_ERR "smb_trans2: data/params grew!\n");
-	goto out_EIO;
-out_bad_parm:
-	printk(KERN_ERR "smb_trans2: invalid parms, disp=%u, cnt=%u, tot=%u, ofs=%u\n",
-	       parm_disp, parm_count, parm_tot, parm_offset);
-	goto out_EIO;
-out_bad_data:
-	printk(KERN_ERR "smb_trans2: invalid data, disp=%u, cnt=%u, tot=%u, ofs=%u\n",
-	       data_disp, data_count, data_tot, data_offset);
-out_EIO:
-	req->rq_errno = -EIO;
-out:
-	return req->rq_errno;
-}
-
-/*
- * State machine for receiving responses. We handle the fact that we can't
- * read the full response in one try by having states telling us how much we
- * have read.
- *
- * Must be called with the server lock held (only called from smbiod).
- *
- * Return: <0 on error
- */
-int smb_request_recv(struct smb_sb_info *server)
-{
-	struct smb_request *req = NULL;
-	int result = 0;
-
-	if (smb_recv_available(server) <= 0)
-		return 0;
-
-	VERBOSE("state: %d\n", server->rstate);
-	switch (server->rstate) {
-	case SMB_RECV_DROP:
-		result = smb_receive_drop(server);
-		if (result < 0)
-			break;
-		if (server->rstate == SMB_RECV_DROP)
-			break;
-		server->rstate = SMB_RECV_START;
-		/* fallthrough */
-	case SMB_RECV_START:
-		server->smb_read = 0;
-		server->rstate = SMB_RECV_HEADER;
-		/* fallthrough */
-	case SMB_RECV_HEADER:
-		result = smb_receive_header(server);
-		if (result < 0)
-			break;
-		if (server->rstate == SMB_RECV_HEADER)
-			break;
-		if (! (*(server->header + smb_flg) & SMB_FLAGS_REPLY) ) {
-			server->rstate = SMB_RECV_REQUEST;
-			break;
-		}
-		if (server->rstate != SMB_RECV_HCOMPLETE)
-			break;
-		/* fallthrough */
-	case SMB_RECV_HCOMPLETE:
-		req = find_request(server, WVAL(server->header, smb_mid));
-		if (!req)
-			break;
-		smb_init_request(server, req);
-		req->rq_rcls = *(req->rq_header + smb_rcls);
-		req->rq_err  = WVAL(req->rq_header, smb_err);
-		if (server->rstate != SMB_RECV_PARAM)
-			break;
-		/* fallthrough */
-	case SMB_RECV_PARAM:
-		if (!req)
-			req = find_request(server,WVAL(server->header,smb_mid));
-		if (!req)
-			break;
-		result = smb_recv_param(server, req);
-		if (result < 0)
-			break;
-		if (server->rstate != SMB_RECV_DATA)
-			break;
-		/* fallthrough */
-	case SMB_RECV_DATA:
-		if (!req)
-			req = find_request(server,WVAL(server->header,smb_mid));
-		if (!req)
-			break;
-		result = smb_recv_data(server, req);
-		if (result < 0)
-			break;
-		break;
-
-		/* We should never be called with any of these states */
-	case SMB_RECV_END:
-	case SMB_RECV_REQUEST:
-		BUG();
-	}
-
-	if (result < 0) {
-		/* We saw an error */
-		return result;
-	}
-
-	if (server->rstate != SMB_RECV_END)
-		return 0;
-
-	result = 0;
-	if (req->rq_trans2_command && req->rq_rcls == SUCCESS)
-		result = smb_recv_trans2(server, req);
-
-	/*
-	 * Response completely read. Drop any extra bytes sent by the server.
-	 * (Yes, servers sometimes add extra bytes to responses)
-	 */
-	VERBOSE("smb_len: %d   smb_read: %d\n",
-		server->smb_len, server->smb_read);
-	if (server->smb_read < server->smb_len)
-		smb_receive_drop(server);
-
-	server->rstate = SMB_RECV_START;
-
-	if (!result) {
-		list_del_init(&req->rq_queue);
-		req->rq_flags |= SMB_REQ_RECEIVED;
-		smb_rput(req);
-		wake_up_interruptible(&req->rq_wait);
-	}
-	return 0;
-}
diff --git a/fs/smbfs/request.h b/fs/smbfs/request.h
deleted file mode 100644
index efb21451e7c..00000000000
--- a/fs/smbfs/request.h
+++ /dev/null
@@ -1,70 +0,0 @@
-#include <linux/list.h>
-#include <linux/types.h>
-#include <linux/uio.h>
-#include <linux/wait.h>
-
-struct smb_request {
-	struct list_head rq_queue;	/* recvq or xmitq for the server */
-
-	atomic_t rq_count;
-
-	wait_queue_head_t rq_wait;
-	int rq_flags;
-	int rq_mid;	/* multiplex ID, set by request.c */
-
-	struct smb_sb_info *rq_server;
-
-	/* header + word count + parameter words + byte count */
-	unsigned char rq_header[SMB_HEADER_LEN + 20*2 + 2];
-
-	int rq_bufsize;
-	unsigned char *rq_buffer;
-
-	/* FIXME: this is not good enough for merging IO requests. */
-	unsigned char *rq_page;
-	int rq_rsize;
-
-	int rq_resp_wct;
-	int rq_resp_bcc;
-
-	int rq_rlen;
-	int rq_bytes_recvd;
-
-	int rq_slen;
-	int rq_bytes_sent;
-
-	int rq_iovlen;
-	struct kvec rq_iov[4];
-
-	int (*rq_setup_read) (struct smb_request *);
-	void (*rq_callback) (struct smb_request *);
-
-	/* ------ trans2 stuff ------ */
-
-	u16 rq_trans2_command;	/* 0 if not a trans2 request */
-	unsigned int rq_ldata;
-	unsigned char *rq_data;
-	unsigned int rq_lparm;
-	unsigned char *rq_parm;
-
-	int rq_fragment;
-	u32 rq_total_data;
-	u32 rq_total_parm;
-	int rq_trans2bufsize;
-	unsigned char *rq_trans2buffer;
-
-	/* ------ response ------ */
-
-	unsigned short rq_rcls;
-	unsigned short rq_err;
-	int rq_errno;
-};
-
-#define SMB_REQ_STATIC		0x0001	/* rq_buffer is static */
-#define SMB_REQ_NORETRY		0x0002	/* request is invalid after retry */
-
-#define SMB_REQ_TRANSMITTED	0x4000	/* all data has been sent */
-#define SMB_REQ_RECEIVED	0x8000	/* reply received, smbiod is done */
-
-#define xSMB_REQ_NOREPLY	0x0004	/* we don't want the reply (if any) */
-#define xSMB_REQ_NORECEIVER	0x0008	/* caller doesn't wait for response */
diff --git a/fs/smbfs/smb_debug.h b/fs/smbfs/smb_debug.h
deleted file mode 100644
index fc4b1a5dd75..00000000000
--- a/fs/smbfs/smb_debug.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Defines some debug macros for smbfs.
- */
-
-/* This makes a dentry parent/child name pair. Useful for debugging printk's */
-#define DENTRY_PATH(dentry) \
-	(dentry)->d_parent->d_name.name,(dentry)->d_name.name
-
-/*
- * safety checks that should never happen ???
- * these are normally enabled.
- */
-#ifdef SMBFS_PARANOIA
-# define PARANOIA(f, a...) printk(KERN_NOTICE "%s: " f, __func__ , ## a)
-#else
-# define PARANOIA(f, a...) do { ; } while(0)
-#endif
-
-/* lots of debug messages */
-#ifdef SMBFS_DEBUG_VERBOSE
-# define VERBOSE(f, a...) printk(KERN_DEBUG "%s: " f, __func__ , ## a)
-#else
-# define VERBOSE(f, a...) do { ; } while(0)
-#endif
-
-/*
- * "normal" debug messages, but not with a normal DEBUG define ... way
- * too common name.
- */
-#ifdef SMBFS_DEBUG
-#define DEBUG1(f, a...) printk(KERN_DEBUG "%s: " f, __func__ , ## a)
-#else
-#define DEBUG1(f, a...) do { ; } while(0)
-#endif
diff --git a/fs/smbfs/smbiod.c b/fs/smbfs/smbiod.c
deleted file mode 100644
index 0e39a924f10..00000000000
--- a/fs/smbfs/smbiod.c
+++ /dev/null
@@ -1,344 +0,0 @@
-/*
- *  smbiod.c
- *
- *  Copyright (C) 2000, Charles Loep / Corel Corp.
- *  Copyright (C) 2001, Urban Widmark
- */
-
-
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/string.h>
-#include <linux/stat.h>
-#include <linux/errno.h>
-#include <linux/init.h>
-#include <linux/file.h>
-#include <linux/dcache.h>
-#include <linux/module.h>
-#include <linux/net.h>
-#include <linux/kthread.h>
-#include <net/ip.h>
-
-#include <linux/smb_fs.h>
-#include <linux/smbno.h>
-#include <linux/smb_mount.h>
-
-#include <asm/system.h>
-#include <asm/uaccess.h>
-
-#include "smb_debug.h"
-#include "request.h"
-#include "proto.h"
-
-enum smbiod_state {
-	SMBIOD_DEAD,
-	SMBIOD_STARTING,
-	SMBIOD_RUNNING,
-};
-
-static enum smbiod_state smbiod_state = SMBIOD_DEAD;
-static struct task_struct *smbiod_thread;
-static DECLARE_WAIT_QUEUE_HEAD(smbiod_wait);
-static LIST_HEAD(smb_servers);
-static DEFINE_SPINLOCK(servers_lock);
-
-#define SMBIOD_DATA_READY	(1<<0)
-static unsigned long smbiod_flags;
-
-static int smbiod(void *);
-static int smbiod_start(void);
-
-/*
- * called when there's work for us to do
- */
-void smbiod_wake_up(void)
-{
-	if (smbiod_state == SMBIOD_DEAD)
-		return;
-	set_bit(SMBIOD_DATA_READY, &smbiod_flags);
-	wake_up_interruptible(&smbiod_wait);
-}
-
-/*
- * start smbiod if none is running
- */
-static int smbiod_start(void)
-{
-	struct task_struct *tsk;
-	int err = 0;
-
-	if (smbiod_state != SMBIOD_DEAD)
-		return 0;
-	smbiod_state = SMBIOD_STARTING;
-	__module_get(THIS_MODULE);
-	spin_unlock(&servers_lock);
-	tsk = kthread_run(smbiod, NULL, "smbiod");
-	if (IS_ERR(tsk)) {
-		err = PTR_ERR(tsk);
-		module_put(THIS_MODULE);
-	}
-
-	spin_lock(&servers_lock);
-	if (err < 0) {
-		smbiod_state = SMBIOD_DEAD;
-		smbiod_thread = NULL;
-	} else {
-		smbiod_state = SMBIOD_RUNNING;
-		smbiod_thread = tsk;
-	}
-	return err;
-}
-
-/*
- * register a server & start smbiod if necessary
- */
-int smbiod_register_server(struct smb_sb_info *server)
-{
-	int ret;
-	spin_lock(&servers_lock);
-	list_add(&server->entry, &smb_servers);
-	VERBOSE("%p\n", server);
-	ret = smbiod_start();
-	spin_unlock(&servers_lock);
-	return ret;
-}
-
-/*
- * Unregister a server
- * Must be called with the server lock held.
- */
-void smbiod_unregister_server(struct smb_sb_info *server)
-{
-	spin_lock(&servers_lock);
-	list_del_init(&server->entry);
-	VERBOSE("%p\n", server);
-	spin_unlock(&servers_lock);
-
-	smbiod_wake_up();
-	smbiod_flush(server);
-}
-
-void smbiod_flush(struct smb_sb_info *server)
-{
-	struct list_head *tmp, *n;
-	struct smb_request *req;
-
-	list_for_each_safe(tmp, n, &server->xmitq) {
-		req = list_entry(tmp, struct smb_request, rq_queue);
-		req->rq_errno = -EIO;
-		list_del_init(&req->rq_queue);
-		smb_rput(req);
-		wake_up_interruptible(&req->rq_wait);
-	}
-	list_for_each_safe(tmp, n, &server->recvq) {
-		req = list_entry(tmp, struct smb_request, rq_queue);
-		req->rq_errno = -EIO;
-		list_del_init(&req->rq_queue);
-		smb_rput(req);
-		wake_up_interruptible(&req->rq_wait);
-	}
-}
-
-/*
- * Wake up smbmount and make it reconnect to the server.
- * This must be called with the server locked.
- *
- * FIXME: add smbconnect version to this
- */
-int smbiod_retry(struct smb_sb_info *server)
-{
-	struct list_head *head;
-	struct smb_request *req;
-	struct pid *pid = get_pid(server->conn_pid);
-	int result = 0;
-
-	VERBOSE("state: %d\n", server->state);
-	if (server->state == CONN_VALID || server->state == CONN_RETRYING)
-		goto out;
-
-	smb_invalidate_inodes(server);
-
-	/*
-	 * Some requests are meaningless after a retry, so we abort them.
-	 * One example are all requests using 'fileid' since the files are
-	 * closed on retry.
-	 */
-	head = server->xmitq.next;
-	while (head != &server->xmitq) {
-		req = list_entry(head, struct smb_request, rq_queue);
-		head = head->next;
-
-		req->rq_bytes_sent = 0;
-		if (req->rq_flags & SMB_REQ_NORETRY) {
-			VERBOSE("aborting request %p on xmitq\n", req);
-			req->rq_errno = -EIO;
-			list_del_init(&req->rq_queue);
-			smb_rput(req);
-			wake_up_interruptible(&req->rq_wait);
-		}
-	}
-
-	/*
-	 * FIXME: test the code for retrying request we already sent
-	 */
-	head = server->recvq.next;
-	while (head != &server->recvq) {
-		req = list_entry(head, struct smb_request, rq_queue);
-		head = head->next;
-#if 0
-		if (req->rq_flags & SMB_REQ_RETRY) {
-			/* must move the request to the xmitq */
-			VERBOSE("retrying request %p on recvq\n", req);
-			list_move(&req->rq_queue, &server->xmitq);
-			continue;
-		}
-#endif
-
-		VERBOSE("aborting request %p on recvq\n", req);
-		/* req->rq_rcls = ???; */ /* FIXME: set smb error code too? */
-		req->rq_errno = -EIO;
-		list_del_init(&req->rq_queue);
-		smb_rput(req);
-		wake_up_interruptible(&req->rq_wait);
-	}
-
-	smb_close_socket(server);
-
-	if (!pid) {
-		/* FIXME: this is fatal, umount? */
-		printk(KERN_ERR "smb_retry: no connection process\n");
-		server->state = CONN_RETRIED;
-		goto out;
-	}
-
-	/*
-	 * Change state so that only one retry per server will be started.
-	 */
-	server->state = CONN_RETRYING;
-
-	/*
-	 * Note: use the "priv" flag, as a user process may need to reconnect.
-	 */
-	result = kill_pid(pid, SIGUSR1, 1);
-	if (result) {
-		/* FIXME: this is most likely fatal, umount? */
-		printk(KERN_ERR "smb_retry: signal failed [%d]\n", result);
-		goto out;
-	}
-	VERBOSE("signalled pid %d\n", pid_nr(pid));
-
-	/* FIXME: The retried requests should perhaps get a "time boost". */
-
-out:
-	put_pid(pid);
-	return result;
-}
-
-/*
- * Currently handles lockingX packets.
- */
-static void smbiod_handle_request(struct smb_sb_info *server)
-{
-	PARANOIA("smbiod got a request ... and we don't implement oplocks!\n");
-	server->rstate = SMB_RECV_DROP;
-}
-
-/*
- * Do some IO for one server.
- */
-static void smbiod_doio(struct smb_sb_info *server)
-{
-	int result;
-	int maxwork = 7;
-
-	if (server->state != CONN_VALID)
-		goto out;
-
-	do {
-		result = smb_request_recv(server);
-		if (result < 0) {
-			server->state = CONN_INVALID;
-			smbiod_retry(server);
-			goto out;	/* reconnecting is slow */
-		} else if (server->rstate == SMB_RECV_REQUEST)
-			smbiod_handle_request(server);
-	} while (result > 0 && maxwork-- > 0);
-
-	/*
-	 * If there is more to read then we want to be sure to wake up again.
-	 */
-	if (server->state != CONN_VALID)
-		goto out;
-	if (smb_recv_available(server) > 0)
-		set_bit(SMBIOD_DATA_READY, &smbiod_flags);
-
-	do {
-		result = smb_request_send_server(server);
-		if (result < 0) {
-			server->state = CONN_INVALID;
-			smbiod_retry(server);
-			goto out;	/* reconnecting is slow */
-		}
-	} while (result > 0);
-
-	/*
-	 * If the last request was not sent out we want to wake up again.
-	 */
-	if (!list_empty(&server->xmitq))
-		set_bit(SMBIOD_DATA_READY, &smbiod_flags);
-
-out:
-	return;
-}
-
-/*
- * smbiod kernel thread
- */
-static int smbiod(void *unused)
-{
-	VERBOSE("SMB Kernel thread starting (%d) ...\n", current->pid);
-
-	for (;;) {
-		struct smb_sb_info *server;
-		struct list_head *pos, *n;
-
-		/* FIXME: Use poll? */
-		wait_event_interruptible(smbiod_wait,
-			 test_bit(SMBIOD_DATA_READY, &smbiod_flags));
-		if (signal_pending(current)) {
-			spin_lock(&servers_lock);
-			smbiod_state = SMBIOD_DEAD;
-			spin_unlock(&servers_lock);
-			break;
-		}
-
-		clear_bit(SMBIOD_DATA_READY, &smbiod_flags);
-
-		spin_lock(&servers_lock);
-		if (list_empty(&smb_servers)) {
-			smbiod_state = SMBIOD_DEAD;
-			spin_unlock(&servers_lock);
-			break;
-		}
-
-		list_for_each_safe(pos, n, &smb_servers) {
-			server = list_entry(pos, struct smb_sb_info, entry);
-			VERBOSE("checking server %p\n", server);
-
-			if (server->state == CONN_VALID) {
-				spin_unlock(&servers_lock);
-
-				smb_lock_server(server);
-				smbiod_doio(server);
-				smb_unlock_server(server);
-
-				spin_lock(&servers_lock);
-			}
-		}
-		spin_unlock(&servers_lock);
-	}
-
-	VERBOSE("SMB Kernel thread exiting (%d) ...\n", current->pid);
-	module_put_and_exit(0);
-}
diff --git a/fs/smbfs/sock.c b/fs/smbfs/sock.c
deleted file mode 100644
index e37fe4deebd..00000000000
--- a/fs/smbfs/sock.c
+++ /dev/null
@@ -1,386 +0,0 @@
-/*
- *  sock.c
- *
- *  Copyright (C) 1995, 1996 by Paal-Kr. Engstad and Volker Lendecke
- *  Copyright (C) 1997 by Volker Lendecke
- *
- *  Please add a note about your changes to smbfs in the ChangeLog file.
- */
-
-#include <linux/fs.h>
-#include <linux/time.h>
-#include <linux/errno.h>
-#include <linux/socket.h>
-#include <linux/fcntl.h>
-#include <linux/file.h>
-#include <linux/in.h>
-#include <linux/net.h>
-#include <linux/mm.h>
-#include <linux/netdevice.h>
-#include <linux/workqueue.h>
-#include <net/scm.h>
-#include <net/tcp_states.h>
-#include <net/ip.h>
-
-#include <linux/smb_fs.h>
-#include <linux/smb.h>
-#include <linux/smbno.h>
-
-#include <asm/uaccess.h>
-#include <asm/ioctls.h>
-
-#include "smb_debug.h"
-#include "proto.h"
-#include "request.h"
-
-
-static int
-_recvfrom(struct socket *socket, unsigned char *ubuf, int size, unsigned flags)
-{
-	struct kvec iov = {ubuf, size};
-	struct msghdr msg = {.msg_flags = flags};
-	msg.msg_flags |= MSG_DONTWAIT | MSG_NOSIGNAL;
-	return kernel_recvmsg(socket, &msg, &iov, 1, size, msg.msg_flags);
-}
-
-/*
- * Return the server this socket belongs to
- */
-static struct smb_sb_info *
-server_from_socket(struct socket *socket)
-{
-	return socket->sk->sk_user_data;
-}
-
-/*
- * Called when there is data on the socket.
- */
-void
-smb_data_ready(struct sock *sk, int len)
-{
-	struct smb_sb_info *server = server_from_socket(sk->sk_socket);
-	void (*data_ready)(struct sock *, int) = server->data_ready;
-
-	data_ready(sk, len);
-	VERBOSE("(%p, %d)\n", sk, len);
-	smbiod_wake_up();
-}
-
-int
-smb_valid_socket(struct inode * inode)
-{
-	return (inode && S_ISSOCK(inode->i_mode) && 
-		SOCKET_I(inode)->type == SOCK_STREAM);
-}
-
-static struct socket *
-server_sock(struct smb_sb_info *server)
-{
-	struct file *file;
-
-	if (server && (file = server->sock_file))
-	{
-#ifdef SMBFS_PARANOIA
-		if (!smb_valid_socket(file->f_path.dentry->d_inode))
-			PARANOIA("bad socket!\n");
-#endif
-		return SOCKET_I(file->f_path.dentry->d_inode);
-	}
-	return NULL;
-}
-
-void
-smb_close_socket(struct smb_sb_info *server)
-{
-	struct file * file = server->sock_file;
-
-	if (file) {
-		struct socket *sock = server_sock(server);
-
-		VERBOSE("closing socket %p\n", sock);
-		sock->sk->sk_data_ready = server->data_ready;
-		server->sock_file = NULL;
-		fput(file);
-	}
-}
-
-static int
-smb_get_length(struct socket *socket, unsigned char *header)
-{
-	int result;
-
-	result = _recvfrom(socket, header, 4, MSG_PEEK);
-	if (result == -EAGAIN)
-		return -ENODATA;
-	if (result < 0) {
-		PARANOIA("recv error = %d\n", -result);
-		return result;
-	}
-	if (result < 4)
-		return -ENODATA;
-
-	switch (header[0]) {
-	case 0x00:
-	case 0x82:
-		break;
-
-	case 0x85:
-		DEBUG1("Got SESSION KEEP ALIVE\n");
-		_recvfrom(socket, header, 4, 0);	/* read away */
-		return -ENODATA;
-
-	default:
-		PARANOIA("Invalid NBT packet, code=%x\n", header[0]);
-		return -EIO;
-	}
-
-	/* The length in the RFC NB header is the raw data length */
-	return smb_len(header);
-}
-
-int
-smb_recv_available(struct smb_sb_info *server)
-{
-	mm_segment_t oldfs;
-	int avail, err;
-	struct socket *sock = server_sock(server);
-
-	oldfs = get_fs();
-	set_fs(get_ds());
-	err = sock->ops->ioctl(sock, SIOCINQ, (unsigned long) &avail);
-	set_fs(oldfs);
-	return (err >= 0) ? avail : err;
-}
-
-/*
- * Adjust the kvec to move on 'n' bytes (from nfs/sunrpc)
- */
-static int
-smb_move_iov(struct kvec **data, size_t *num, struct kvec *vec, unsigned amount)
-{
-	struct kvec *iv = *data;
-	int i;
-	int len;
-
-	/*
-	 *	Eat any sent kvecs
-	 */
-	while (iv->iov_len <= amount) {
-		amount -= iv->iov_len;
-		iv++;
-		(*num)--;
-	}
-
-	/*
-	 *	And chew down the partial one
-	 */
-	vec[0].iov_len = iv->iov_len-amount;
-	vec[0].iov_base =((unsigned char *)iv->iov_base)+amount;
-	iv++;
-
-	len = vec[0].iov_len;
-
-	/*
-	 *	And copy any others
-	 */
-	for (i = 1; i < *num; i++) {
-		vec[i] = *iv++;
-		len += vec[i].iov_len;
-	}
-
-	*data = vec;
-	return len;
-}
-
-/*
- * smb_receive_header
- * Only called by the smbiod thread.
- */
-int
-smb_receive_header(struct smb_sb_info *server)
-{
-	struct socket *sock;
-	int result = 0;
-	unsigned char peek_buf[4];
-
-	result = -EIO; 
-	sock = server_sock(server);
-	if (!sock)
-		goto out;
-	if (sock->sk->sk_state != TCP_ESTABLISHED)
-		goto out;
-
-	if (!server->smb_read) {
-		result = smb_get_length(sock, peek_buf);
-		if (result < 0) {
-			if (result == -ENODATA)
-				result = 0;
-			goto out;
-		}
-		server->smb_len = result + 4;
-
-		if (server->smb_len < SMB_HEADER_LEN) {
-			PARANOIA("short packet: %d\n", result);
-			server->rstate = SMB_RECV_DROP;
-			result = -EIO;
-			goto out;
-		}
-		if (server->smb_len > SMB_MAX_PACKET_SIZE) {
-			PARANOIA("long packet: %d\n", result);
-			server->rstate = SMB_RECV_DROP;
-			result = -EIO;
-			goto out;
-		}
-	}
-
-	result = _recvfrom(sock, server->header + server->smb_read,
-			   SMB_HEADER_LEN - server->smb_read, 0);
-	VERBOSE("_recvfrom: %d\n", result);
-	if (result < 0) {
-		VERBOSE("receive error: %d\n", result);
-		goto out;
-	}
-	server->smb_read += result;
-
-	if (server->smb_read == SMB_HEADER_LEN)
-		server->rstate = SMB_RECV_HCOMPLETE;
-out:
-	return result;
-}
-
-static char drop_buffer[PAGE_SIZE];
-
-/*
- * smb_receive_drop - read and throw away the data
- * Only called by the smbiod thread.
- *
- * FIXME: we are in the kernel, could we just tell the socket that we want
- * to drop stuff from the buffer?
- */
-int
-smb_receive_drop(struct smb_sb_info *server)
-{
-	struct socket *sock;
-	unsigned int flags;
-	struct kvec iov;
-	struct msghdr msg;
-	int rlen = smb_len(server->header) - server->smb_read + 4;
-	int result = -EIO;
-
-	if (rlen > PAGE_SIZE)
-		rlen = PAGE_SIZE;
-
-	sock = server_sock(server);
-	if (!sock)
-		goto out;
-	if (sock->sk->sk_state != TCP_ESTABLISHED)
-		goto out;
-
-	flags = MSG_DONTWAIT | MSG_NOSIGNAL;
-	iov.iov_base = drop_buffer;
-	iov.iov_len = PAGE_SIZE;
-	msg.msg_flags = flags;
-	msg.msg_name = NULL;
-	msg.msg_namelen = 0;
-	msg.msg_control = NULL;
-
-	result = kernel_recvmsg(sock, &msg, &iov, 1, rlen, flags);
-
-	VERBOSE("read: %d\n", result);
-	if (result < 0) {
-		VERBOSE("receive error: %d\n", result);
-		goto out;
-	}
-	server->smb_read += result;
-
-	if (server->smb_read >= server->smb_len)
-		server->rstate = SMB_RECV_END;
-
-out:
-	return result;
-}
-
-/*
- * smb_receive
- * Only called by the smbiod thread.
- */
-int
-smb_receive(struct smb_sb_info *server, struct smb_request *req)
-{
-	struct socket *sock;
-	unsigned int flags;
-	struct kvec iov[4];
-	struct kvec *p = req->rq_iov;
-	size_t num = req->rq_iovlen;
-	struct msghdr msg;
-	int rlen;
-	int result = -EIO;
-
-	sock = server_sock(server);
-	if (!sock)
-		goto out;
-	if (sock->sk->sk_state != TCP_ESTABLISHED)
-		goto out;
-
-	flags = MSG_DONTWAIT | MSG_NOSIGNAL;
-	msg.msg_flags = flags;
-	msg.msg_name = NULL;
-	msg.msg_namelen = 0;
-	msg.msg_control = NULL;
-
-	/* Dont repeat bytes and count available bufferspace */
-	rlen = min_t(int, smb_move_iov(&p, &num, iov, req->rq_bytes_recvd),
-			(req->rq_rlen - req->rq_bytes_recvd));
-
-	result = kernel_recvmsg(sock, &msg, p, num, rlen, flags);
-
-	VERBOSE("read: %d\n", result);
-	if (result < 0) {
-		VERBOSE("receive error: %d\n", result);
-		goto out;
-	}
-	req->rq_bytes_recvd += result;
-	server->smb_read += result;
-
-out:
-	return result;
-}
-
-/*
- * Try to send a SMB request. This may return after sending only parts of the
- * request. SMB_REQ_TRANSMITTED will be set if a request was fully sent.
- *
- * Parts of this was taken from xprt_sendmsg from net/sunrpc/xprt.c
- */
-int
-smb_send_request(struct smb_request *req)
-{
-	struct smb_sb_info *server = req->rq_server;
-	struct socket *sock;
-	struct msghdr msg = {.msg_flags = MSG_NOSIGNAL | MSG_DONTWAIT};
-        int slen = req->rq_slen - req->rq_bytes_sent;
-	int result = -EIO;
-	struct kvec iov[4];
-	struct kvec *p = req->rq_iov;
-	size_t num = req->rq_iovlen;
-
-	sock = server_sock(server);
-	if (!sock)
-		goto out;
-	if (sock->sk->sk_state != TCP_ESTABLISHED)
-		goto out;
-
-	/* Dont repeat bytes */
-	if (req->rq_bytes_sent)
-		smb_move_iov(&p, &num, iov, req->rq_bytes_sent);
-
-	result = kernel_sendmsg(sock, &msg, p, num, slen);
-
-	if (result >= 0) {
-		req->rq_bytes_sent += result;
-		if (req->rq_bytes_sent >= req->rq_slen)
-			req->rq_flags |= SMB_REQ_TRANSMITTED;
-	}
-out:
-	return result;
-}
diff --git a/fs/smbfs/symlink.c b/fs/smbfs/symlink.c
deleted file mode 100644
index 00b2909bd46..00000000000
--- a/fs/smbfs/symlink.c
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- *  symlink.c
- *
- *  Copyright (C) 2002 by John Newbigin
- *
- *  Please add a note about your changes to smbfs in the ChangeLog file.
- */
-
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/fcntl.h>
-#include <linux/stat.h>
-#include <linux/mm.h>
-#include <linux/slab.h>
-#include <linux/pagemap.h>
-#include <linux/net.h>
-#include <linux/namei.h>
-
-#include <asm/uaccess.h>
-#include <asm/system.h>
-
-#include <linux/smbno.h>
-#include <linux/smb_fs.h>
-
-#include "smb_debug.h"
-#include "proto.h"
-
-int smb_symlink(struct inode *inode, struct dentry *dentry, const char *oldname)
-{
-	DEBUG1("create symlink %s -> %s/%s\n", oldname, DENTRY_PATH(dentry));
-
-	return smb_proc_symlink(server_from_dentry(dentry), dentry, oldname);
-}
-
-static void *smb_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
-	char *link = __getname();
-	DEBUG1("followlink of %s/%s\n", DENTRY_PATH(dentry));
-
-	if (!link) {
-		link = ERR_PTR(-ENOMEM);
-	} else {
-		int len = smb_proc_read_link(server_from_dentry(dentry),
-						dentry, link, PATH_MAX - 1);
-		if (len < 0) {
-			__putname(link);
-			link = ERR_PTR(len);
-		} else {
-			link[len] = 0;
-		}
-	}
-	nd_set_link(nd, link);
-	return NULL;
-}
-
-static void smb_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
-{
-	char *s = nd_get_link(nd);
-	if (!IS_ERR(s))
-		__putname(s);
-}
-
-const struct inode_operations smb_link_inode_operations =
-{
-	.readlink	= generic_readlink,
-	.follow_link	= smb_follow_link,
-	.put_link	= smb_put_link,
-};
diff --git a/fs/splice.c b/fs/splice.c
index efdbfece993..8f1dfaecc8f 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -399,17 +399,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 		 * If the page isn't uptodate, we may need to start io on it
 		 */
 		if (!PageUptodate(page)) {
-			/*
-			 * If in nonblock mode then dont block on waiting
-			 * for an in-flight io page
-			 */
-			if (flags & SPLICE_F_NONBLOCK) {
-				if (!trylock_page(page)) {
-					error = -EAGAIN;
-					break;
-				}
-			} else
-				lock_page(page);
+			lock_page(page);
 
 			/*
 			 * Page was truncated, or invalidated by the
@@ -597,7 +587,6 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
 	struct page *pages[PIPE_DEF_BUFFERS];
 	struct partial_page partial[PIPE_DEF_BUFFERS];
 	struct iovec *vec, __vec[PIPE_DEF_BUFFERS];
-	pgoff_t index;
 	ssize_t res;
 	size_t this_len;
 	int error;
@@ -621,7 +610,6 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
 			goto shrink_ret;
 	}
 
-	index = *ppos >> PAGE_CACHE_SHIFT;
 	offset = *ppos & ~PAGE_CACHE_MASK;
 	nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 
diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
index cc6ce8a84c2..e5f63da64d0 100644
--- a/fs/squashfs/Kconfig
+++ b/fs/squashfs/Kconfig
@@ -5,13 +5,13 @@ config SQUASHFS
 	help
 	  Saying Y here includes support for SquashFS 4.0 (a Compressed
 	  Read-Only File System).  Squashfs is a highly compressed read-only
-	  filesystem for Linux.  It uses zlib compression to compress both
+	  filesystem for Linux.  It uses zlib/lzo compression to compress both
 	  files, inodes and directories.  Inodes in the system are very small
 	  and all blocks are packed to minimise data overhead. Block sizes
 	  greater than 4K are supported up to a maximum of 1 Mbytes (default
 	  block size 128K).  SquashFS 4.0 supports 64 bit filesystems and files
 	  (larger than 4GB), full uid/gid information, hard links and
-	  timestamps.  
+	  timestamps.
 
 	  Squashfs is intended for general read-only filesystem use, for
 	  archival use (i.e. in cases where a .tar.gz file may be used), and in
@@ -26,7 +26,7 @@ config SQUASHFS
 
 	  If unsure, say N.
 
-config SQUASHFS_XATTRS
+config SQUASHFS_XATTR
 	bool "Squashfs XATTR support"
 	depends on SQUASHFS
 	default n
@@ -37,9 +37,24 @@ config SQUASHFS_XATTRS
 
 	  If unsure, say N.
 
-config SQUASHFS_EMBEDDED
+config SQUASHFS_LZO
+	bool "Include support for LZO compressed file systems"
+	depends on SQUASHFS
+	default n
+	select LZO_DECOMPRESS
+	help
+	  Saying Y here includes support for reading Squashfs file systems
+	  compressed with LZO compresssion.  LZO compression is mainly
+	  aimed at embedded systems with slower CPUs where the overheads
+	  of zlib are too high.
 
-	bool "Additional option for memory-constrained systems" 
+	  LZO is not the standard compression used in Squashfs and so most
+	  file systems will be readable without selecting this option.
+
+	  If unsure, say N.
+
+config SQUASHFS_EMBEDDED
+	bool "Additional option for memory-constrained systems"
 	depends on SQUASHFS
 	default n
 	help
diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile
index 2cee3e9fa45..7672bac8d32 100644
--- a/fs/squashfs/Makefile
+++ b/fs/squashfs/Makefile
@@ -5,5 +5,5 @@
 obj-$(CONFIG_SQUASHFS) += squashfs.o
 squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o
 squashfs-y += namei.o super.o symlink.o zlib_wrapper.o decompressor.o
-squashfs-$(CONFIG_SQUASHFS_XATTRS) += xattr.o xattr_id.o
-
+squashfs-$(CONFIG_SQUASHFS_XATTR) += xattr.o xattr_id.o
+squashfs-$(CONFIG_SQUASHFS_LZO) += lzo_wrapper.o
diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c
index 157478da6ac..24af9ce9722 100644
--- a/fs/squashfs/decompressor.c
+++ b/fs/squashfs/decompressor.c
@@ -40,9 +40,11 @@ static const struct squashfs_decompressor squashfs_lzma_unsupported_comp_ops = {
 	NULL, NULL, NULL, LZMA_COMPRESSION, "lzma", 0
 };
 
+#ifndef CONFIG_SQUASHFS_LZO
 static const struct squashfs_decompressor squashfs_lzo_unsupported_comp_ops = {
 	NULL, NULL, NULL, LZO_COMPRESSION, "lzo", 0
 };
+#endif
 
 static const struct squashfs_decompressor squashfs_unknown_comp_ops = {
 	NULL, NULL, NULL, 0, "unknown", 0
@@ -51,7 +53,11 @@ static const struct squashfs_decompressor squashfs_unknown_comp_ops = {
 static const struct squashfs_decompressor *decompressor[] = {
 	&squashfs_zlib_comp_ops,
 	&squashfs_lzma_unsupported_comp_ops,
+#ifdef CONFIG_SQUASHFS_LZO
+	&squashfs_lzo_comp_ops,
+#else
 	&squashfs_lzo_unsupported_comp_ops,
+#endif
 	&squashfs_unknown_comp_ops
 };
 
diff --git a/fs/squashfs/dir.c b/fs/squashfs/dir.c
index 12b933ac658..0dc340aa2be 100644
--- a/fs/squashfs/dir.c
+++ b/fs/squashfs/dir.c
@@ -230,5 +230,6 @@ failed_read:
 
 const struct file_operations squashfs_dir_ops = {
 	.read = generic_read_dir,
-	.readdir = squashfs_readdir
+	.readdir = squashfs_readdir,
+	.llseek = default_llseek,
 };
diff --git a/fs/squashfs/lzo_wrapper.c b/fs/squashfs/lzo_wrapper.c
new file mode 100644
index 00000000000..5d87789bf1c
--- /dev/null
+++ b/fs/squashfs/lzo_wrapper.c
@@ -0,0 +1,136 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2010 LG Electronics
+ * Chan Jeong <chan.jeong@lge.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * lzo_wrapper.c
+ */
+
+#include <linux/mutex.h>
+#include <linux/buffer_head.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/lzo.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+#include "decompressor.h"
+
+struct squashfs_lzo {
+	void	*input;
+	void	*output;
+};
+
+static void *lzo_init(struct squashfs_sb_info *msblk)
+{
+	int block_size = max_t(int, msblk->block_size, SQUASHFS_METADATA_SIZE);
+
+	struct squashfs_lzo *stream = kzalloc(sizeof(*stream), GFP_KERNEL);
+	if (stream == NULL)
+		goto failed;
+	stream->input = vmalloc(block_size);
+	if (stream->input == NULL)
+		goto failed;
+	stream->output = vmalloc(block_size);
+	if (stream->output == NULL)
+		goto failed2;
+
+	return stream;
+
+failed2:
+	vfree(stream->input);
+failed:
+	ERROR("Failed to allocate lzo workspace\n");
+	kfree(stream);
+	return NULL;
+}
+
+
+static void lzo_free(void *strm)
+{
+	struct squashfs_lzo *stream = strm;
+
+	if (stream) {
+		vfree(stream->input);
+		vfree(stream->output);
+	}
+	kfree(stream);
+}
+
+
+static int lzo_uncompress(struct squashfs_sb_info *msblk, void **buffer,
+	struct buffer_head **bh, int b, int offset, int length, int srclength,
+	int pages)
+{
+	struct squashfs_lzo *stream = msblk->stream;
+	void *buff = stream->input;
+	int avail, i, bytes = length, res;
+	size_t out_len = srclength;
+
+	mutex_lock(&msblk->read_data_mutex);
+
+	for (i = 0; i < b; i++) {
+		wait_on_buffer(bh[i]);
+		if (!buffer_uptodate(bh[i]))
+			goto block_release;
+
+		avail = min(bytes, msblk->devblksize - offset);
+		memcpy(buff, bh[i]->b_data + offset, avail);
+		buff += avail;
+		bytes -= avail;
+		offset = 0;
+		put_bh(bh[i]);
+	}
+
+	res = lzo1x_decompress_safe(stream->input, (size_t)length,
+					stream->output, &out_len);
+	if (res != LZO_E_OK)
+		goto failed;
+
+	res = bytes = (int)out_len;
+	for (i = 0, buff = stream->output; bytes && i < pages; i++) {
+		avail = min_t(int, bytes, PAGE_CACHE_SIZE);
+		memcpy(buffer[i], buff, avail);
+		buff += avail;
+		bytes -= avail;
+	}
+
+	mutex_unlock(&msblk->read_data_mutex);
+	return res;
+
+block_release:
+	for (; i < b; i++)
+		put_bh(bh[i]);
+
+failed:
+	mutex_unlock(&msblk->read_data_mutex);
+
+	ERROR("lzo decompression failed, data probably corrupt\n");
+	return -EIO;
+}
+
+const struct squashfs_decompressor squashfs_lzo_comp_ops = {
+	.init = lzo_init,
+	.free = lzo_free,
+	.decompress = lzo_uncompress,
+	.id = LZO_COMPRESSION,
+	.name = "lzo",
+	.supported = 1
+};
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index 733a17c4294..5d45569d5f7 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -104,3 +104,6 @@ extern const struct xattr_handler *squashfs_xattr_handlers[];
 
 /* zlib_wrapper.c */
 extern const struct squashfs_decompressor squashfs_zlib_comp_ops;
+
+/* lzo_wrapper.c */
+extern const struct squashfs_decompressor squashfs_lzo_comp_ops;
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index 8eabb808b78..c5137fc9ab1 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -274,7 +274,7 @@ struct squashfs_base_inode {
 	__le16			uid;
 	__le16			guid;
 	__le32			mtime;
-	__le32	 		inode_number;
+	__le32			inode_number;
 };
 
 struct squashfs_ipc_inode {
@@ -283,7 +283,7 @@ struct squashfs_ipc_inode {
 	__le16			uid;
 	__le16			guid;
 	__le32			mtime;
-	__le32	 		inode_number;
+	__le32			inode_number;
 	__le32			nlink;
 };
 
@@ -293,7 +293,7 @@ struct squashfs_lipc_inode {
 	__le16			uid;
 	__le16			guid;
 	__le32			mtime;
-	__le32	 		inode_number;
+	__le32			inode_number;
 	__le32			nlink;
 	__le32			xattr;
 };
@@ -304,7 +304,7 @@ struct squashfs_dev_inode {
 	__le16			uid;
 	__le16			guid;
 	__le32			mtime;
-	__le32	 		inode_number;
+	__le32			inode_number;
 	__le32			nlink;
 	__le32			rdev;
 };
@@ -315,7 +315,7 @@ struct squashfs_ldev_inode {
 	__le16			uid;
 	__le16			guid;
 	__le32			mtime;
-	__le32	 		inode_number;
+	__le32			inode_number;
 	__le32			nlink;
 	__le32			rdev;
 	__le32			xattr;
@@ -327,7 +327,7 @@ struct squashfs_symlink_inode {
 	__le16			uid;
 	__le16			guid;
 	__le32			mtime;
-	__le32	 		inode_number;
+	__le32			inode_number;
 	__le32			nlink;
 	__le32			symlink_size;
 	char			symlink[0];
@@ -339,7 +339,7 @@ struct squashfs_reg_inode {
 	__le16			uid;
 	__le16			guid;
 	__le32			mtime;
-	__le32	 		inode_number;
+	__le32			inode_number;
 	__le32			start_block;
 	__le32			fragment;
 	__le32			offset;
@@ -353,7 +353,7 @@ struct squashfs_lreg_inode {
 	__le16			uid;
 	__le16			guid;
 	__le32			mtime;
-	__le32	 		inode_number;
+	__le32			inode_number;
 	__le64			start_block;
 	__le64			file_size;
 	__le64			sparse;
@@ -370,7 +370,7 @@ struct squashfs_dir_inode {
 	__le16			uid;
 	__le16			guid;
 	__le32			mtime;
-	__le32	 		inode_number;
+	__le32			inode_number;
 	__le32			start_block;
 	__le32			nlink;
 	__le16			file_size;
@@ -384,7 +384,7 @@ struct squashfs_ldir_inode {
 	__le16			uid;
 	__le16			guid;
 	__le32			mtime;
-	__le32	 		inode_number;
+	__le32			inode_number;
 	__le32			nlink;
 	__le32			file_size;
 	__le32			start_block;
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 88b4f860665..24de30ba34c 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -30,7 +30,6 @@
 #include <linux/fs.h>
 #include <linux/vfs.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
 #include <linux/mutex.h>
 #include <linux/pagemap.h>
 #include <linux/init.h>
@@ -354,8 +353,6 @@ static int squashfs_remount(struct super_block *sb, int *flags, char *data)
 
 static void squashfs_put_super(struct super_block *sb)
 {
-	lock_kernel();
-
 	if (sb->s_fs_info) {
 		struct squashfs_sb_info *sbi = sb->s_fs_info;
 		squashfs_cache_delete(sbi->block_cache);
@@ -370,17 +367,13 @@ static void squashfs_put_super(struct super_block *sb)
 		kfree(sb->s_fs_info);
 		sb->s_fs_info = NULL;
 	}
-
-	unlock_kernel();
 }
 
 
-static int squashfs_get_sb(struct file_system_type *fs_type, int flags,
-				const char *dev_name, void *data,
-				struct vfsmount *mnt)
+static struct dentry *squashfs_mount(struct file_system_type *fs_type, int flags,
+				const char *dev_name, void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, squashfs_fill_super,
-				mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, squashfs_fill_super);
 }
 
 
@@ -456,7 +449,7 @@ static void squashfs_destroy_inode(struct inode *inode)
 static struct file_system_type squashfs_fs_type = {
 	.owner = THIS_MODULE,
 	.name = "squashfs",
-	.get_sb = squashfs_get_sb,
+	.mount = squashfs_mount,
 	.kill_sb = kill_block_super,
 	.fs_flags = FS_REQUIRES_DEV
 };
diff --git a/fs/squashfs/xattr.c b/fs/squashfs/xattr.c
index c7655e8b31c..3876c36699a 100644
--- a/fs/squashfs/xattr.c
+++ b/fs/squashfs/xattr.c
@@ -18,7 +18,7 @@
  * along with this program; if not, write to the Free Software
  * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  *
- * xattr_id.c
+ * xattr.c
  */
 
 #include <linux/init.h>
@@ -158,17 +158,18 @@ static int squashfs_xattr_get(struct inode *inode, int name_index,
 					strncmp(target, name, name_size) == 0) {
 			/* found xattr */
 			if (type & SQUASHFS_XATTR_VALUE_OOL) {
-				__le64 xattr;
+				__le64 xattr_val;
+				u64 xattr;
 				/* val is a reference to the real location */
 				err = squashfs_read_metadata(sb, &val, &start,
 						&offset, sizeof(val));
 				if (err < 0)
 					goto failed;
-				err = squashfs_read_metadata(sb, &xattr, &start,
-					 &offset, sizeof(xattr));
+				err = squashfs_read_metadata(sb, &xattr_val,
+					&start, &offset, sizeof(xattr_val));
 				if (err < 0)
 					goto failed;
-				xattr = le64_to_cpu(xattr);
+				xattr = le64_to_cpu(xattr_val);
 				start = SQUASHFS_XATTR_BLK(xattr) +
 							msblk->xattr_table;
 				offset = SQUASHFS_XATTR_OFFSET(xattr);
@@ -295,7 +296,7 @@ static const struct xattr_handler squashfs_xattr_security_handler = {
 	.get	= squashfs_security_get
 };
 
-static inline const struct xattr_handler *squashfs_xattr_handler(int type)
+static const struct xattr_handler *squashfs_xattr_handler(int type)
 {
 	if (type & ~(SQUASHFS_XATTR_PREFIX_MASK | SQUASHFS_XATTR_VALUE_OOL))
 		/* ignore unrecognised type */
diff --git a/fs/squashfs/xattr.h b/fs/squashfs/xattr.h
index 9da071ae181..b634efce4bd 100644
--- a/fs/squashfs/xattr.h
+++ b/fs/squashfs/xattr.h
@@ -21,11 +21,11 @@
  * xattr.h
  */
 
-#ifdef CONFIG_SQUASHFS_XATTRS
+#ifdef CONFIG_SQUASHFS_XATTR
 extern __le64 *squashfs_read_xattr_id_table(struct super_block *, u64,
 		u64 *, int *);
 extern int squashfs_xattr_lookup(struct super_block *, unsigned int, int *,
-		int *, unsigned long long *);
+		unsigned int *, unsigned long long *);
 #else
 static inline __le64 *squashfs_read_xattr_id_table(struct super_block *sb,
 		u64 start, u64 *xattr_table_start, int *xattr_ids)
@@ -35,7 +35,7 @@ static inline __le64 *squashfs_read_xattr_id_table(struct super_block *sb,
 }
 
 static inline int squashfs_xattr_lookup(struct super_block *sb,
-		unsigned int index, int *count, int *size,
+		unsigned int index, int *count, unsigned int *size,
 		unsigned long long *xattr)
 {
 	return 0;
diff --git a/fs/squashfs/xattr_id.c b/fs/squashfs/xattr_id.c
index cfb41106098..d33be5dd6c3 100644
--- a/fs/squashfs/xattr_id.c
+++ b/fs/squashfs/xattr_id.c
@@ -34,6 +34,7 @@
 #include "squashfs_fs_sb.h"
 #include "squashfs_fs_i.h"
 #include "squashfs.h"
+#include "xattr.h"
 
 /*
  * Map xattr id using the xattr id look up table
diff --git a/fs/stat.c b/fs/stat.c
index c4ecd52c573..12e90e21390 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -68,7 +68,8 @@ int vfs_fstat(unsigned int fd, struct kstat *stat)
 }
 EXPORT_SYMBOL(vfs_fstat);
 
-int vfs_fstatat(int dfd, char __user *filename, struct kstat *stat, int flag)
+int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
+		int flag)
 {
 	struct path path;
 	int error = -EINVAL;
@@ -91,13 +92,13 @@ out:
 }
 EXPORT_SYMBOL(vfs_fstatat);
 
-int vfs_stat(char __user *name, struct kstat *stat)
+int vfs_stat(const char __user *name, struct kstat *stat)
 {
 	return vfs_fstatat(AT_FDCWD, name, stat, 0);
 }
 EXPORT_SYMBOL(vfs_stat);
 
-int vfs_lstat(char __user *name, struct kstat *stat)
+int vfs_lstat(const char __user *name, struct kstat *stat)
 {
 	return vfs_fstatat(AT_FDCWD, name, stat, AT_SYMLINK_NOFOLLOW);
 }
@@ -147,7 +148,8 @@ static int cp_old_stat(struct kstat *stat, struct __old_kernel_stat __user * sta
 	return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0;
 }
 
-SYSCALL_DEFINE2(stat, char __user *, filename, struct __old_kernel_stat __user *, statbuf)
+SYSCALL_DEFINE2(stat, const char __user *, filename,
+		struct __old_kernel_stat __user *, statbuf)
 {
 	struct kstat stat;
 	int error;
@@ -159,7 +161,8 @@ SYSCALL_DEFINE2(stat, char __user *, filename, struct __old_kernel_stat __user *
 	return cp_old_stat(&stat, statbuf);
 }
 
-SYSCALL_DEFINE2(lstat, char __user *, filename, struct __old_kernel_stat __user *, statbuf)
+SYSCALL_DEFINE2(lstat, const char __user *, filename,
+		struct __old_kernel_stat __user *, statbuf)
 {
 	struct kstat stat;
 	int error;
@@ -234,7 +237,8 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf)
 	return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0;
 }
 
-SYSCALL_DEFINE2(newstat, char __user *, filename, struct stat __user *, statbuf)
+SYSCALL_DEFINE2(newstat, const char __user *, filename,
+		struct stat __user *, statbuf)
 {
 	struct kstat stat;
 	int error = vfs_stat(filename, &stat);
@@ -244,7 +248,8 @@ SYSCALL_DEFINE2(newstat, char __user *, filename, struct stat __user *, statbuf)
 	return cp_new_stat(&stat, statbuf);
 }
 
-SYSCALL_DEFINE2(newlstat, char __user *, filename, struct stat __user *, statbuf)
+SYSCALL_DEFINE2(newlstat, const char __user *, filename,
+		struct stat __user *, statbuf)
 {
 	struct kstat stat;
 	int error;
@@ -257,7 +262,7 @@ SYSCALL_DEFINE2(newlstat, char __user *, filename, struct stat __user *, statbuf
 }
 
 #if !defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_SYS_NEWFSTATAT)
-SYSCALL_DEFINE4(newfstatat, int, dfd, char __user *, filename,
+SYSCALL_DEFINE4(newfstatat, int, dfd, const char __user *, filename,
 		struct stat __user *, statbuf, int, flag)
 {
 	struct kstat stat;
@@ -355,7 +360,8 @@ static long cp_new_stat64(struct kstat *stat, struct stat64 __user *statbuf)
 	return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0;
 }
 
-SYSCALL_DEFINE2(stat64, char __user *, filename, struct stat64 __user *, statbuf)
+SYSCALL_DEFINE2(stat64, const char __user *, filename,
+		struct stat64 __user *, statbuf)
 {
 	struct kstat stat;
 	int error = vfs_stat(filename, &stat);
@@ -366,7 +372,8 @@ SYSCALL_DEFINE2(stat64, char __user *, filename, struct stat64 __user *, statbuf
 	return error;
 }
 
-SYSCALL_DEFINE2(lstat64, char __user *, filename, struct stat64 __user *, statbuf)
+SYSCALL_DEFINE2(lstat64, const char __user *, filename,
+		struct stat64 __user *, statbuf)
 {
 	struct kstat stat;
 	int error = vfs_lstat(filename, &stat);
@@ -388,7 +395,7 @@ SYSCALL_DEFINE2(fstat64, unsigned long, fd, struct stat64 __user *, statbuf)
 	return error;
 }
 
-SYSCALL_DEFINE4(fstatat64, int, dfd, char __user *, filename,
+SYSCALL_DEFINE4(fstatat64, int, dfd, const char __user *, filename,
 		struct stat64 __user *, statbuf, int, flag)
 {
 	struct kstat stat;
diff --git a/fs/statfs.c b/fs/statfs.c
index 4ef021f3b61..30ea8c8a996 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -2,38 +2,83 @@
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/file.h>
+#include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/statfs.h>
 #include <linux/security.h>
 #include <linux/uaccess.h>
 
-int vfs_statfs(struct dentry *dentry, struct kstatfs *buf)
+static int flags_by_mnt(int mnt_flags)
 {
-	int retval = -ENODEV;
-
-	if (dentry) {
-		retval = -ENOSYS;
-		if (dentry->d_sb->s_op->statfs) {
-			memset(buf, 0, sizeof(*buf));
-			retval = security_sb_statfs(dentry);
-			if (retval)
-				return retval;
-			retval = dentry->d_sb->s_op->statfs(dentry, buf);
-			if (retval == 0 && buf->f_frsize == 0)
-				buf->f_frsize = buf->f_bsize;
-		}
-	}
+	int flags = 0;
+
+	if (mnt_flags & MNT_READONLY)
+		flags |= ST_RDONLY;
+	if (mnt_flags & MNT_NOSUID)
+		flags |= ST_NOSUID;
+	if (mnt_flags & MNT_NODEV)
+		flags |= ST_NODEV;
+	if (mnt_flags & MNT_NOEXEC)
+		flags |= ST_NOEXEC;
+	if (mnt_flags & MNT_NOATIME)
+		flags |= ST_NOATIME;
+	if (mnt_flags & MNT_NODIRATIME)
+		flags |= ST_NODIRATIME;
+	if (mnt_flags & MNT_RELATIME)
+		flags |= ST_RELATIME;
+	return flags;
+}
+
+static int flags_by_sb(int s_flags)
+{
+	int flags = 0;
+	if (s_flags & MS_SYNCHRONOUS)
+		flags |= ST_SYNCHRONOUS;
+	if (s_flags & MS_MANDLOCK)
+		flags |= ST_MANDLOCK;
+	return flags;
+}
+
+static int calculate_f_flags(struct vfsmount *mnt)
+{
+	return ST_VALID | flags_by_mnt(mnt->mnt_flags) |
+		flags_by_sb(mnt->mnt_sb->s_flags);
+}
+
+int statfs_by_dentry(struct dentry *dentry, struct kstatfs *buf)
+{
+	int retval;
+
+	if (!dentry->d_sb->s_op->statfs)
+		return -ENOSYS;
+
+	memset(buf, 0, sizeof(*buf));
+	retval = security_sb_statfs(dentry);
+	if (retval)
+		return retval;
+	retval = dentry->d_sb->s_op->statfs(dentry, buf);
+	if (retval == 0 && buf->f_frsize == 0)
+		buf->f_frsize = buf->f_bsize;
 	return retval;
 }
 
+int vfs_statfs(struct path *path, struct kstatfs *buf)
+{
+	int error;
+
+	error = statfs_by_dentry(path->dentry, buf);
+	if (!error)
+		buf->f_flags = calculate_f_flags(path->mnt);
+	return error;
+}
 EXPORT_SYMBOL(vfs_statfs);
 
-static int vfs_statfs_native(struct dentry *dentry, struct statfs *buf)
+static int do_statfs_native(struct path *path, struct statfs *buf)
 {
 	struct kstatfs st;
 	int retval;
 
-	retval = vfs_statfs(dentry, &st);
+	retval = vfs_statfs(path, &st);
 	if (retval)
 		return retval;
 
@@ -67,17 +112,18 @@ static int vfs_statfs_native(struct dentry *dentry, struct statfs *buf)
 		buf->f_fsid = st.f_fsid;
 		buf->f_namelen = st.f_namelen;
 		buf->f_frsize = st.f_frsize;
+		buf->f_flags = st.f_flags;
 		memset(buf->f_spare, 0, sizeof(buf->f_spare));
 	}
 	return 0;
 }
 
-static int vfs_statfs64(struct dentry *dentry, struct statfs64 *buf)
+static int do_statfs64(struct path *path, struct statfs64 *buf)
 {
 	struct kstatfs st;
 	int retval;
 
-	retval = vfs_statfs(dentry, &st);
+	retval = vfs_statfs(path, &st);
 	if (retval)
 		return retval;
 
@@ -94,6 +140,7 @@ static int vfs_statfs64(struct dentry *dentry, struct statfs64 *buf)
 		buf->f_fsid = st.f_fsid;
 		buf->f_namelen = st.f_namelen;
 		buf->f_frsize = st.f_frsize;
+		buf->f_flags = st.f_flags;
 		memset(buf->f_spare, 0, sizeof(buf->f_spare));
 	}
 	return 0;
@@ -107,7 +154,7 @@ SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct statfs __user *, b
 	error = user_path(pathname, &path);
 	if (!error) {
 		struct statfs tmp;
-		error = vfs_statfs_native(path.dentry, &tmp);
+		error = do_statfs_native(&path, &tmp);
 		if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
 			error = -EFAULT;
 		path_put(&path);
@@ -125,7 +172,7 @@ SYSCALL_DEFINE3(statfs64, const char __user *, pathname, size_t, sz, struct stat
 	error = user_path(pathname, &path);
 	if (!error) {
 		struct statfs64 tmp;
-		error = vfs_statfs64(path.dentry, &tmp);
+		error = do_statfs64(&path, &tmp);
 		if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
 			error = -EFAULT;
 		path_put(&path);
@@ -143,7 +190,7 @@ SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct statfs __user *, buf)
 	file = fget(fd);
 	if (!file)
 		goto out;
-	error = vfs_statfs_native(file->f_path.dentry, &tmp);
+	error = do_statfs_native(&file->f_path, &tmp);
 	if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
 		error = -EFAULT;
 	fput(file);
@@ -164,7 +211,7 @@ SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user
 	file = fget(fd);
 	if (!file)
 		goto out;
-	error = vfs_statfs64(file->f_path.dentry, &tmp);
+	error = do_statfs64(&file->f_path, &tmp);
 	if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
 		error = -EFAULT;
 	fput(file);
@@ -183,7 +230,7 @@ SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat __user *, ubuf)
 	if (!s)
 		return -EINVAL;
 
-	err = vfs_statfs(s->s_root, &sbuf);
+	err = statfs_by_dentry(s->s_root, &sbuf);
 	drop_super(s);
 	if (err)
 		return err;
diff --git a/fs/super.c b/fs/super.c
index 938119ab8dc..ca696155cd9 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -54,7 +54,22 @@ static struct super_block *alloc_super(struct file_system_type *type)
 			s = NULL;
 			goto out;
 		}
+#ifdef CONFIG_SMP
+		s->s_files = alloc_percpu(struct list_head);
+		if (!s->s_files) {
+			security_sb_free(s);
+			kfree(s);
+			s = NULL;
+			goto out;
+		} else {
+			int i;
+
+			for_each_possible_cpu(i)
+				INIT_LIST_HEAD(per_cpu_ptr(s->s_files, i));
+		}
+#else
 		INIT_LIST_HEAD(&s->s_files);
+#endif
 		INIT_LIST_HEAD(&s->s_instances);
 		INIT_HLIST_HEAD(&s->s_anon);
 		INIT_LIST_HEAD(&s->s_inodes);
@@ -108,6 +123,9 @@ out:
  */
 static inline void destroy_super(struct super_block *s)
 {
+#ifdef CONFIG_SMP
+	free_percpu(s->s_files);
+#endif
 	security_sb_free(s);
 	kfree(s->s_subtype);
 	kfree(s->s_options);
@@ -255,14 +273,14 @@ void generic_shutdown_super(struct super_block *sb)
 		get_fs_excl();
 		sb->s_flags &= ~MS_ACTIVE;
 
-		/* bad name - it should be evict_inodes() */
-		invalidate_inodes(sb);
+		fsnotify_unmount_inodes(&sb->s_inodes);
+
+		evict_inodes(sb);
 
 		if (sop->put_super)
 			sop->put_super(sb);
 
-		/* Forget any remaining inodes */
-		if (invalidate_inodes(sb)) {
+		if (!list_empty(&sb->s_inodes)) {
 			printk("VFS: Busy inodes after unmount of %s. "
 			   "Self-destruct in 5 seconds.  Have a nice day...\n",
 			   sb->s_id);
@@ -305,8 +323,13 @@ retry:
 			if (s) {
 				up_write(&s->s_umount);
 				destroy_super(s);
+				s = NULL;
 			}
 			down_write(&old->s_umount);
+			if (unlikely(!(old->s_flags & MS_BORN))) {
+				deactivate_locked_super(old);
+				goto retry;
+			}
 			return old;
 		}
 	}
@@ -358,10 +381,10 @@ EXPORT_SYMBOL(drop_super);
  */
 void sync_supers(void)
 {
-	struct super_block *sb, *n;
+	struct super_block *sb, *p = NULL;
 
 	spin_lock(&sb_lock);
-	list_for_each_entry_safe(sb, n, &super_blocks, s_list) {
+	list_for_each_entry(sb, &super_blocks, s_list) {
 		if (list_empty(&sb->s_instances))
 			continue;
 		if (sb->s_op->write_super && sb->s_dirt) {
@@ -374,11 +397,13 @@ void sync_supers(void)
 			up_read(&sb->s_umount);
 
 			spin_lock(&sb_lock);
-			/* lock was dropped, must reset next */
-			list_safe_reset_next(sb, n, s_list);
-			__put_super(sb);
+			if (p)
+				__put_super(p);
+			p = sb;
 		}
 	}
+	if (p)
+		__put_super(p);
 	spin_unlock(&sb_lock);
 }
 
@@ -392,10 +417,10 @@ void sync_supers(void)
  */
 void iterate_supers(void (*f)(struct super_block *, void *), void *arg)
 {
-	struct super_block *sb, *n;
+	struct super_block *sb, *p = NULL;
 
 	spin_lock(&sb_lock);
-	list_for_each_entry_safe(sb, n, &super_blocks, s_list) {
+	list_for_each_entry(sb, &super_blocks, s_list) {
 		if (list_empty(&sb->s_instances))
 			continue;
 		sb->s_count++;
@@ -407,10 +432,12 @@ void iterate_supers(void (*f)(struct super_block *, void *), void *arg)
 		up_read(&sb->s_umount);
 
 		spin_lock(&sb_lock);
-		/* lock was dropped, must reset next */
-		list_safe_reset_next(sb, n, s_list);
-		__put_super(sb);
+		if (p)
+			__put_super(p);
+		p = sb;
 	}
+	if (p)
+		__put_super(p);
 	spin_unlock(&sb_lock);
 }
 
@@ -572,10 +599,10 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 
 static void do_emergency_remount(struct work_struct *work)
 {
-	struct super_block *sb, *n;
+	struct super_block *sb, *p = NULL;
 
 	spin_lock(&sb_lock);
-	list_for_each_entry_safe(sb, n, &super_blocks, s_list) {
+	list_for_each_entry(sb, &super_blocks, s_list) {
 		if (list_empty(&sb->s_instances))
 			continue;
 		sb->s_count++;
@@ -589,10 +616,12 @@ static void do_emergency_remount(struct work_struct *work)
 		}
 		up_write(&sb->s_umount);
 		spin_lock(&sb_lock);
-		/* lock was dropped, must reset next */
-		list_safe_reset_next(sb, n, s_list);
-		__put_super(sb);
+		if (p)
+			__put_super(p);
+		p = sb;
 	}
+	if (p)
+		__put_super(p);
 	spin_unlock(&sb_lock);
 	kfree(work);
 	printk("Emergency Remount complete\n");
@@ -686,15 +715,14 @@ static int ns_set_super(struct super_block *sb, void *data)
 	return set_anon_super(sb, NULL);
 }
 
-int get_sb_ns(struct file_system_type *fs_type, int flags, void *data,
-	int (*fill_super)(struct super_block *, void *, int),
-	struct vfsmount *mnt)
+struct dentry *mount_ns(struct file_system_type *fs_type, int flags,
+	void *data, int (*fill_super)(struct super_block *, void *, int))
 {
 	struct super_block *sb;
 
 	sb = sget(fs_type, ns_test_super, ns_set_super, data);
 	if (IS_ERR(sb))
-		return PTR_ERR(sb);
+		return ERR_CAST(sb);
 
 	if (!sb->s_root) {
 		int err;
@@ -702,17 +730,16 @@ int get_sb_ns(struct file_system_type *fs_type, int flags, void *data,
 		err = fill_super(sb, data, flags & MS_SILENT ? 1 : 0);
 		if (err) {
 			deactivate_locked_super(sb);
-			return err;
+			return ERR_PTR(err);
 		}
 
 		sb->s_flags |= MS_ACTIVE;
 	}
 
-	simple_set_mnt(mnt, sb);
-	return 0;
+	return dget(sb->s_root);
 }
 
-EXPORT_SYMBOL(get_sb_ns);
+EXPORT_SYMBOL(mount_ns);
 
 #ifdef CONFIG_BLOCK
 static int set_bdev_super(struct super_block *s, void *data)
@@ -733,10 +760,9 @@ static int test_bdev_super(struct super_block *s, void *data)
 	return (void *)s->s_bdev == data;
 }
 
-int get_sb_bdev(struct file_system_type *fs_type,
+struct dentry *mount_bdev(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data,
-	int (*fill_super)(struct super_block *, void *, int),
-	struct vfsmount *mnt)
+	int (*fill_super)(struct super_block *, void *, int))
 {
 	struct block_device *bdev;
 	struct super_block *s;
@@ -748,7 +774,7 @@ int get_sb_bdev(struct file_system_type *fs_type,
 
 	bdev = open_bdev_exclusive(dev_name, mode, fs_type);
 	if (IS_ERR(bdev))
-		return PTR_ERR(bdev);
+		return ERR_CAST(bdev);
 
 	/*
 	 * once the super is inserted into the list by sget, s_umount
@@ -773,7 +799,16 @@ int get_sb_bdev(struct file_system_type *fs_type,
 			goto error_bdev;
 		}
 
+		/*
+		 * s_umount nests inside bd_mutex during
+		 * __invalidate_device().  close_bdev_exclusive()
+		 * acquires bd_mutex and can't be called under
+		 * s_umount.  Drop s_umount temporarily.  This is safe
+		 * as we're holding an active reference.
+		 */
+		up_write(&s->s_umount);
 		close_bdev_exclusive(bdev, mode);
+		down_write(&s->s_umount);
 	} else {
 		char b[BDEVNAME_SIZE];
 
@@ -791,15 +826,30 @@ int get_sb_bdev(struct file_system_type *fs_type,
 		bdev->bd_super = s;
 	}
 
-	simple_set_mnt(mnt, s);
-	return 0;
+	return dget(s->s_root);
 
 error_s:
 	error = PTR_ERR(s);
 error_bdev:
 	close_bdev_exclusive(bdev, mode);
 error:
-	return error;
+	return ERR_PTR(error);
+}
+EXPORT_SYMBOL(mount_bdev);
+
+int get_sb_bdev(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data,
+	int (*fill_super)(struct super_block *, void *, int),
+	struct vfsmount *mnt)
+{
+	struct dentry *root;
+
+	root = mount_bdev(fs_type, flags, dev_name, data, fill_super);
+	if (IS_ERR(root))
+		return PTR_ERR(root);
+	mnt->mnt_root = root;
+	mnt->mnt_sb = root->d_sb;
+	return 0;
 }
 
 EXPORT_SYMBOL(get_sb_bdev);
@@ -818,29 +868,42 @@ void kill_block_super(struct super_block *sb)
 EXPORT_SYMBOL(kill_block_super);
 #endif
 
-int get_sb_nodev(struct file_system_type *fs_type,
+struct dentry *mount_nodev(struct file_system_type *fs_type,
 	int flags, void *data,
-	int (*fill_super)(struct super_block *, void *, int),
-	struct vfsmount *mnt)
+	int (*fill_super)(struct super_block *, void *, int))
 {
 	int error;
 	struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL);
 
 	if (IS_ERR(s))
-		return PTR_ERR(s);
+		return ERR_CAST(s);
 
 	s->s_flags = flags;
 
 	error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
 	if (error) {
 		deactivate_locked_super(s);
-		return error;
+		return ERR_PTR(error);
 	}
 	s->s_flags |= MS_ACTIVE;
-	simple_set_mnt(mnt, s);
-	return 0;
+	return dget(s->s_root);
 }
+EXPORT_SYMBOL(mount_nodev);
+
+int get_sb_nodev(struct file_system_type *fs_type,
+	int flags, void *data,
+	int (*fill_super)(struct super_block *, void *, int),
+	struct vfsmount *mnt)
+{
+	struct dentry *root;
 
+	root = mount_nodev(fs_type, flags, data, fill_super);
+	if (IS_ERR(root))
+		return PTR_ERR(root);
+	mnt->mnt_root = root;
+	mnt->mnt_sb = root->d_sb;
+	return 0;
+}
 EXPORT_SYMBOL(get_sb_nodev);
 
 static int compare_single(struct super_block *s, void *p)
@@ -848,29 +911,42 @@ static int compare_single(struct super_block *s, void *p)
 	return 1;
 }
 
-int get_sb_single(struct file_system_type *fs_type,
+struct dentry *mount_single(struct file_system_type *fs_type,
 	int flags, void *data,
-	int (*fill_super)(struct super_block *, void *, int),
-	struct vfsmount *mnt)
+	int (*fill_super)(struct super_block *, void *, int))
 {
 	struct super_block *s;
 	int error;
 
 	s = sget(fs_type, compare_single, set_anon_super, NULL);
 	if (IS_ERR(s))
-		return PTR_ERR(s);
+		return ERR_CAST(s);
 	if (!s->s_root) {
 		s->s_flags = flags;
 		error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
 		if (error) {
 			deactivate_locked_super(s);
-			return error;
+			return ERR_PTR(error);
 		}
 		s->s_flags |= MS_ACTIVE;
 	} else {
 		do_remount_sb(s, flags, data, 0);
 	}
-	simple_set_mnt(mnt, s);
+	return dget(s->s_root);
+}
+EXPORT_SYMBOL(mount_single);
+
+int get_sb_single(struct file_system_type *fs_type,
+	int flags, void *data,
+	int (*fill_super)(struct super_block *, void *, int),
+	struct vfsmount *mnt)
+{
+	struct dentry *root;
+	root = mount_single(fs_type, flags, data, fill_super);
+	if (IS_ERR(root))
+		return PTR_ERR(root);
+	mnt->mnt_root = root;
+	mnt->mnt_sb = root->d_sb;
 	return 0;
 }
 
@@ -880,6 +956,7 @@ struct vfsmount *
 vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
 {
 	struct vfsmount *mnt;
+	struct dentry *root;
 	char *secdata = NULL;
 	int error;
 
@@ -904,11 +981,22 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
 			goto out_free_secdata;
 	}
 
-	error = type->get_sb(type, flags, name, data, mnt);
-	if (error < 0)
-		goto out_free_secdata;
+	if (type->mount) {
+		root = type->mount(type, flags, name, data);
+		if (IS_ERR(root)) {
+			error = PTR_ERR(root);
+			goto out_free_secdata;
+		}
+		mnt->mnt_root = root;
+		mnt->mnt_sb = root->d_sb;
+	} else {
+		error = type->get_sb(type, flags, name, data, mnt);
+		if (error < 0)
+			goto out_free_secdata;
+	}
 	BUG_ON(!mnt->mnt_sb);
 	WARN_ON(!mnt->mnt_sb->s_bdi);
+	mnt->mnt_sb->s_flags |= MS_BORN;
 
 	error = security_sb_kern_mount(mnt->mnt_sb, flags, secdata);
 	if (error)
diff --git a/fs/sync.c b/fs/sync.c
index 15aa6f03b2d..ba76b9623e7 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -128,31 +128,6 @@ void emergency_sync(void)
 	}
 }
 
-/*
- * Generic function to fsync a file.
- */
-int file_fsync(struct file *filp, int datasync)
-{
-	struct inode *inode = filp->f_mapping->host;
-	struct super_block * sb;
-	int ret, err;
-
-	/* sync the inode to buffers */
-	ret = write_inode_now(inode, 0);
-
-	/* sync the superblock to buffers */
-	sb = inode->i_sb;
-	if (sb->s_dirt && sb->s_op->write_super)
-		sb->s_op->write_super(sb);
-
-	/* .. finally sync the buffers to disk */
-	err = sync_blockdev(sb->s_bdev);
-	if (!ret)
-		ret = err;
-	return ret;
-}
-EXPORT_SYMBOL(file_fsync);
-
 /**
  * vfs_fsync_range - helper to sync a range of data & metadata to disk
  * @file:		file to sync
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index 4e321f7353f..a4759833d62 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -179,30 +179,14 @@ static void bin_vma_open(struct vm_area_struct *vma)
 	struct bin_buffer *bb = file->private_data;
 	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
 
-	if (!bb->vm_ops || !bb->vm_ops->open)
-		return;
-
-	if (!sysfs_get_active(attr_sd))
-		return;
-
-	bb->vm_ops->open(vma);
-
-	sysfs_put_active(attr_sd);
-}
-
-static void bin_vma_close(struct vm_area_struct *vma)
-{
-	struct file *file = vma->vm_file;
-	struct bin_buffer *bb = file->private_data;
-	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
-
-	if (!bb->vm_ops || !bb->vm_ops->close)
+	if (!bb->vm_ops)
 		return;
 
 	if (!sysfs_get_active(attr_sd))
 		return;
 
-	bb->vm_ops->close(vma);
+	if (bb->vm_ops->open)
+		bb->vm_ops->open(vma);
 
 	sysfs_put_active(attr_sd);
 }
@@ -214,13 +198,15 @@ static int bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
 	int ret;
 
-	if (!bb->vm_ops || !bb->vm_ops->fault)
+	if (!bb->vm_ops)
 		return VM_FAULT_SIGBUS;
 
 	if (!sysfs_get_active(attr_sd))
 		return VM_FAULT_SIGBUS;
 
-	ret = bb->vm_ops->fault(vma, vmf);
+	ret = VM_FAULT_SIGBUS;
+	if (bb->vm_ops->fault)
+		ret = bb->vm_ops->fault(vma, vmf);
 
 	sysfs_put_active(attr_sd);
 	return ret;
@@ -236,13 +222,12 @@ static int bin_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	if (!bb->vm_ops)
 		return VM_FAULT_SIGBUS;
 
-	if (!bb->vm_ops->page_mkwrite)
-		return 0;
-
 	if (!sysfs_get_active(attr_sd))
 		return VM_FAULT_SIGBUS;
 
-	ret = bb->vm_ops->page_mkwrite(vma, vmf);
+	ret = 0;
+	if (bb->vm_ops->page_mkwrite)
+		ret = bb->vm_ops->page_mkwrite(vma, vmf);
 
 	sysfs_put_active(attr_sd);
 	return ret;
@@ -256,13 +241,15 @@ static int bin_access(struct vm_area_struct *vma, unsigned long addr,
 	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
 	int ret;
 
-	if (!bb->vm_ops || !bb->vm_ops->access)
+	if (!bb->vm_ops)
 		return -EINVAL;
 
 	if (!sysfs_get_active(attr_sd))
 		return -EINVAL;
 
-	ret = bb->vm_ops->access(vma, addr, buf, len, write);
+	ret = -EINVAL;
+	if (bb->vm_ops->access)
+		ret = bb->vm_ops->access(vma, addr, buf, len, write);
 
 	sysfs_put_active(attr_sd);
 	return ret;
@@ -276,13 +263,15 @@ static int bin_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
 	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
 	int ret;
 
-	if (!bb->vm_ops || !bb->vm_ops->set_policy)
+	if (!bb->vm_ops)
 		return 0;
 
 	if (!sysfs_get_active(attr_sd))
 		return -EINVAL;
 
-	ret = bb->vm_ops->set_policy(vma, new);
+	ret = 0;
+	if (bb->vm_ops->set_policy)
+		ret = bb->vm_ops->set_policy(vma, new);
 
 	sysfs_put_active(attr_sd);
 	return ret;
@@ -296,13 +285,15 @@ static struct mempolicy *bin_get_policy(struct vm_area_struct *vma,
 	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
 	struct mempolicy *pol;
 
-	if (!bb->vm_ops || !bb->vm_ops->get_policy)
+	if (!bb->vm_ops)
 		return vma->vm_policy;
 
 	if (!sysfs_get_active(attr_sd))
 		return vma->vm_policy;
 
-	pol = bb->vm_ops->get_policy(vma, addr);
+	pol = vma->vm_policy;
+	if (bb->vm_ops->get_policy)
+		pol = bb->vm_ops->get_policy(vma, addr);
 
 	sysfs_put_active(attr_sd);
 	return pol;
@@ -316,13 +307,15 @@ static int bin_migrate(struct vm_area_struct *vma, const nodemask_t *from,
 	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
 	int ret;
 
-	if (!bb->vm_ops || !bb->vm_ops->migrate)
+	if (!bb->vm_ops)
 		return 0;
 
 	if (!sysfs_get_active(attr_sd))
 		return 0;
 
-	ret = bb->vm_ops->migrate(vma, from, to, flags);
+	ret = 0;
+	if (bb->vm_ops->migrate)
+		ret = bb->vm_ops->migrate(vma, from, to, flags);
 
 	sysfs_put_active(attr_sd);
 	return ret;
@@ -331,7 +324,6 @@ static int bin_migrate(struct vm_area_struct *vma, const nodemask_t *from,
 
 static const struct vm_operations_struct bin_vm_ops = {
 	.open		= bin_vma_open,
-	.close		= bin_vma_close,
 	.fault		= bin_fault,
 	.page_mkwrite	= bin_page_mkwrite,
 	.access		= bin_access,
@@ -377,6 +369,14 @@ static int mmap(struct file *file, struct vm_area_struct *vma)
 	if (bb->mmapped && bb->vm_ops != vma->vm_ops)
 		goto out_put;
 
+	/*
+	 * It is not possible to successfully wrap close.
+	 * So error if someone is trying to use close.
+	 */
+	rc = -EINVAL;
+	if (vma->vm_ops && vma->vm_ops->close)
+		goto out_put;
+
 	rc = 0;
 	bb->mmapped = 1;
 	bb->vm_ops = vma->vm_ops;
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 1beaa739d0a..da3fefe91a8 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -340,7 +340,7 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
 	char *p;
 
 	p = d_path(&file->f_path, last_sysfs_file, sizeof(last_sysfs_file));
-	if (p)
+	if (!IS_ERR(p))
 		memmove(last_sysfs_file, p, strlen(p) + 1);
 
 	/* need attr_sd for attr and ops, its parent for kobj */
@@ -593,7 +593,8 @@ EXPORT_SYMBOL_GPL(sysfs_add_file_to_group);
  * @mode: file permissions.
  *
  */
-int sysfs_chmod_file(struct kobject *kobj, struct attribute *attr, mode_t mode)
+int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr,
+		     mode_t mode)
 {
 	struct sysfs_dirent *sd;
 	struct iattr newattrs;
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index 23c1e598792..442f34ff1af 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -148,6 +148,65 @@ void sysfs_remove_group(struct kobject * kobj,
 	sysfs_put(sd);
 }
 
+/**
+ * sysfs_merge_group - merge files into a pre-existing attribute group.
+ * @kobj:	The kobject containing the group.
+ * @grp:	The files to create and the attribute group they belong to.
+ *
+ * This function returns an error if the group doesn't exist or any of the
+ * files already exist in that group, in which case none of the new files
+ * are created.
+ */
+int sysfs_merge_group(struct kobject *kobj,
+		       const struct attribute_group *grp)
+{
+	struct sysfs_dirent *dir_sd;
+	int error = 0;
+	struct attribute *const *attr;
+	int i;
+
+	if (grp)
+		dir_sd = sysfs_get_dirent(kobj->sd, NULL, grp->name);
+	else
+		dir_sd = sysfs_get(kobj->sd);
+	if (!dir_sd)
+		return -ENOENT;
+
+	for ((i = 0, attr = grp->attrs); *attr && !error; (++i, ++attr))
+		error = sysfs_add_file(dir_sd, *attr, SYSFS_KOBJ_ATTR);
+	if (error) {
+		while (--i >= 0)
+			sysfs_hash_and_remove(dir_sd, NULL, (*--attr)->name);
+	}
+	sysfs_put(dir_sd);
+
+	return error;
+}
+EXPORT_SYMBOL_GPL(sysfs_merge_group);
+
+/**
+ * sysfs_unmerge_group - remove files from a pre-existing attribute group.
+ * @kobj:	The kobject containing the group.
+ * @grp:	The files to remove and the attribute group they belong to.
+ */
+void sysfs_unmerge_group(struct kobject *kobj,
+		       const struct attribute_group *grp)
+{
+	struct sysfs_dirent *dir_sd;
+	struct attribute *const *attr;
+
+	if (grp)
+		dir_sd = sysfs_get_dirent(kobj->sd, NULL, grp->name);
+	else
+		dir_sd = sysfs_get(kobj->sd);
+	if (dir_sd) {
+		for (attr = grp->attrs; *attr; ++attr)
+			sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name);
+		sysfs_put(dir_sd);
+	}
+}
+EXPORT_SYMBOL_GPL(sysfs_unmerge_group);
+
 
 EXPORT_SYMBOL_GPL(sysfs_create_group);
 EXPORT_SYMBOL_GPL(sysfs_update_group);
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 0835a3b70e0..cffb1fd8ba3 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -122,7 +122,7 @@ int sysfs_setattr(struct dentry *dentry, struct iattr *iattr)
 		goto out;
 
 	/* this ignores size changes */
-	generic_setattr(inode, iattr);
+	setattr_copy(inode, iattr);
 
 out:
 	mutex_unlock(&sysfs_mutex);
@@ -312,15 +312,15 @@ struct inode * sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd)
  * The sysfs_dirent serves as both an inode and a directory entry for sysfs.
  * To prevent the sysfs inode numbers from being freed prematurely we take a
  * reference to sysfs_dirent from the sysfs inode.  A
- * super_operations.delete_inode() implementation is needed to drop that
+ * super_operations.evict_inode() implementation is needed to drop that
  * reference upon inode destruction.
  */
-void sysfs_delete_inode(struct inode *inode)
+void sysfs_evict_inode(struct inode *inode)
 {
 	struct sysfs_dirent *sd  = inode->i_private;
 
 	truncate_inode_pages(&inode->i_data, 0);
-	clear_inode(inode);
+	end_writeback(inode);
 	sysfs_put(sd);
 }
 
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 281c0c9bc39..266895783b4 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -23,13 +23,13 @@
 #include "sysfs.h"
 
 
-static struct vfsmount *sysfs_mount;
+static struct vfsmount *sysfs_mnt;
 struct kmem_cache *sysfs_dir_cachep;
 
 static const struct super_operations sysfs_ops = {
 	.statfs		= simple_statfs,
 	.drop_inode	= generic_delete_inode,
-	.delete_inode	= sysfs_delete_inode,
+	.evict_inode	= sysfs_evict_inode,
 };
 
 struct sysfs_dirent sysfs_root = {
@@ -95,18 +95,17 @@ static int sysfs_set_super(struct super_block *sb, void *data)
 	return error;
 }
 
-static int sysfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *sysfs_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
 	struct sysfs_super_info *info;
 	enum kobj_ns_type type;
 	struct super_block *sb;
 	int error;
 
-	error = -ENOMEM;
 	info = kzalloc(sizeof(*info), GFP_KERNEL);
 	if (!info)
-		goto out;
+		return ERR_PTR(-ENOMEM);
 
 	for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++)
 		info->ns[type] = kobj_ns_current(type);
@@ -114,24 +113,19 @@ static int sysfs_get_sb(struct file_system_type *fs_type,
 	sb = sget(fs_type, sysfs_test_super, sysfs_set_super, info);
 	if (IS_ERR(sb) || sb->s_fs_info != info)
 		kfree(info);
-	if (IS_ERR(sb)) {
-		error = PTR_ERR(sb);
-		goto out;
-	}
+	if (IS_ERR(sb))
+		return ERR_CAST(sb);
 	if (!sb->s_root) {
 		sb->s_flags = flags;
 		error = sysfs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0);
 		if (error) {
 			deactivate_locked_super(sb);
-			goto out;
+			return ERR_PTR(error);
 		}
 		sb->s_flags |= MS_ACTIVE;
 	}
 
-	simple_set_mnt(mnt, sb);
-	error = 0;
-out:
-	return error;
+	return dget(sb->s_root);
 }
 
 static void sysfs_kill_sb(struct super_block *sb)
@@ -147,7 +141,7 @@ static void sysfs_kill_sb(struct super_block *sb)
 
 static struct file_system_type sysfs_fs_type = {
 	.name		= "sysfs",
-	.get_sb		= sysfs_get_sb,
+	.mount		= sysfs_mount,
 	.kill_sb	= sysfs_kill_sb,
 };
 
@@ -189,11 +183,11 @@ int __init sysfs_init(void)
 
 	err = register_filesystem(&sysfs_fs_type);
 	if (!err) {
-		sysfs_mount = kern_mount(&sysfs_fs_type);
-		if (IS_ERR(sysfs_mount)) {
+		sysfs_mnt = kern_mount(&sysfs_fs_type);
+		if (IS_ERR(sysfs_mnt)) {
 			printk(KERN_ERR "sysfs: could not mount!\n");
-			err = PTR_ERR(sysfs_mount);
-			sysfs_mount = NULL;
+			err = PTR_ERR(sysfs_mnt);
+			sysfs_mnt = NULL;
 			unregister_filesystem(&sysfs_fs_type);
 			goto out_err;
 		}
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 6a13105b559..d9be60a2e95 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -198,7 +198,7 @@ static inline void __sysfs_put(struct sysfs_dirent *sd)
  * inode.c
  */
 struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd);
-void sysfs_delete_inode(struct inode *inode);
+void sysfs_evict_inode(struct inode *inode);
 int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr);
 int sysfs_permission(struct inode *inode, int mask);
 int sysfs_setattr(struct dentry *dentry, struct iattr *iattr);
diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
index 79941e4964a..a77c4215762 100644
--- a/fs/sysv/dir.c
+++ b/fs/sysv/dir.c
@@ -218,8 +218,7 @@ got_it:
 	pos = page_offset(page) +
 			(char*)de - (char*)page_address(page);
 	lock_page(page);
-	err = __sysv_write_begin(NULL, page->mapping, pos, SYSV_DIRSIZE,
-				AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
+	err = sysv_prepare_chunk(page, pos, SYSV_DIRSIZE);
 	if (err)
 		goto out_unlock;
 	memcpy (de->name, name, namelen);
@@ -239,15 +238,13 @@ out_unlock:
 
 int sysv_delete_entry(struct sysv_dir_entry *de, struct page *page)
 {
-	struct address_space *mapping = page->mapping;
-	struct inode *inode = (struct inode*)mapping->host;
+	struct inode *inode = page->mapping->host;
 	char *kaddr = (char*)page_address(page);
 	loff_t pos = page_offset(page) + (char *)de - kaddr;
 	int err;
 
 	lock_page(page);
-	err = __sysv_write_begin(NULL, mapping, pos, SYSV_DIRSIZE,
-				AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
+	err = sysv_prepare_chunk(page, pos, SYSV_DIRSIZE);
 	BUG_ON(err);
 	de->inode = 0;
 	err = dir_commit_chunk(page, pos, SYSV_DIRSIZE);
@@ -259,16 +256,14 @@ int sysv_delete_entry(struct sysv_dir_entry *de, struct page *page)
 
 int sysv_make_empty(struct inode *inode, struct inode *dir)
 {
-	struct address_space *mapping = inode->i_mapping;
-	struct page *page = grab_cache_page(mapping, 0);
+	struct page *page = grab_cache_page(inode->i_mapping, 0);
 	struct sysv_dir_entry * de;
 	char *base;
 	int err;
 
 	if (!page)
 		return -ENOMEM;
-	err = __sysv_write_begin(NULL, mapping, 0, 2 * SYSV_DIRSIZE,
-				AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
+	err = sysv_prepare_chunk(page, 0, 2 * SYSV_DIRSIZE);
 	if (err) {
 		unlock_page(page);
 		goto fail;
@@ -341,15 +336,13 @@ not_empty:
 void sysv_set_link(struct sysv_dir_entry *de, struct page *page,
 	struct inode *inode)
 {
-	struct address_space *mapping = page->mapping;
-	struct inode *dir = mapping->host;
+	struct inode *dir = page->mapping->host;
 	loff_t pos = page_offset(page) +
 			(char *)de-(char*)page_address(page);
 	int err;
 
 	lock_page(page);
-	err = __sysv_write_begin(NULL, mapping, pos, SYSV_DIRSIZE,
-				AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
+	err = sysv_prepare_chunk(page, pos, SYSV_DIRSIZE);
 	BUG_ON(err);
 	de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino);
 	err = dir_commit_chunk(page, pos, SYSV_DIRSIZE);
diff --git a/fs/sysv/file.c b/fs/sysv/file.c
index 750cc22349b..0a65939508e 100644
--- a/fs/sysv/file.c
+++ b/fs/sysv/file.c
@@ -30,7 +30,29 @@ const struct file_operations sysv_file_operations = {
 	.splice_read	= generic_file_splice_read,
 };
 
+static int sysv_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = dentry->d_inode;
+	int error;
+
+	error = inode_change_ok(inode, attr);
+	if (error)
+		return error;
+
+	if ((attr->ia_valid & ATTR_SIZE) &&
+	    attr->ia_size != i_size_read(inode)) {
+		error = vmtruncate(inode, attr->ia_size);
+		if (error)
+			return error;
+	}
+
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+	return 0;
+}
+
 const struct inode_operations sysv_file_inode_operations = {
 	.truncate	= sysv_truncate,
+	.setattr	= sysv_setattr,
 	.getattr	= sysv_getattr,
 };
diff --git a/fs/sysv/ialloc.c b/fs/sysv/ialloc.c
index fcc498ec9b3..0c96c98bd1d 100644
--- a/fs/sysv/ialloc.c
+++ b/fs/sysv/ialloc.c
@@ -113,7 +113,6 @@ void sysv_free_inode(struct inode * inode)
 		return;
 	}
 	raw_inode = sysv_raw_inode(sb, ino, &bh);
-	clear_inode(inode);
 	if (!raw_inode) {
 		printk("sysv_free_inode: unable to read inode block on device "
 		       "%s\n", inode->i_sb->s_id);
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index d4a5380b566..de44d067b9e 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -71,8 +71,8 @@ static int sysv_remount(struct super_block *sb, int *flags, char *data)
 	lock_super(sb);
 	if (sbi->s_forced_ro)
 		*flags |= MS_RDONLY;
-	if (!(*flags & MS_RDONLY))
-		sb->s_dirt = 1;
+	if (*flags & MS_RDONLY)
+		sysv_write_super(sb);
 	unlock_super(sb);
 	return 0;
 }
@@ -308,12 +308,17 @@ int sysv_sync_inode(struct inode *inode)
 	return __sysv_write_inode(inode, 1);
 }
 
-static void sysv_delete_inode(struct inode *inode)
+static void sysv_evict_inode(struct inode *inode)
 {
 	truncate_inode_pages(&inode->i_data, 0);
-	inode->i_size = 0;
-	sysv_truncate(inode);
-	sysv_free_inode(inode);
+	if (!inode->i_nlink) {
+		inode->i_size = 0;
+		sysv_truncate(inode);
+	}
+	invalidate_inode_buffers(inode);
+	end_writeback(inode);
+	if (!inode->i_nlink)
+		sysv_free_inode(inode);
 }
 
 static struct kmem_cache *sysv_inode_cachep;
@@ -344,7 +349,7 @@ const struct super_operations sysv_sops = {
 	.alloc_inode	= sysv_alloc_inode,
 	.destroy_inode	= sysv_destroy_inode,
 	.write_inode	= sysv_write_inode,
-	.delete_inode	= sysv_delete_inode,
+	.evict_inode	= sysv_evict_inode,
 	.put_super	= sysv_put_super,
 	.write_super	= sysv_write_super,
 	.sync_fs	= sysv_sync_fs,
diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c
index f042eec464c..9ca66276315 100644
--- a/fs/sysv/itree.c
+++ b/fs/sysv/itree.c
@@ -459,20 +459,25 @@ static int sysv_readpage(struct file *file, struct page *page)
 	return block_read_full_page(page,get_block);
 }
 
-int __sysv_write_begin(struct file *file, struct address_space *mapping,
-			loff_t pos, unsigned len, unsigned flags,
-			struct page **pagep, void **fsdata)
+int sysv_prepare_chunk(struct page *page, loff_t pos, unsigned len)
 {
-	return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
-				get_block);
+	return __block_write_begin(page, pos, len, get_block);
 }
 
 static int sysv_write_begin(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
 			struct page **pagep, void **fsdata)
 {
-	*pagep = NULL;
-	return __sysv_write_begin(file, mapping, pos, len, flags, pagep, fsdata);
+	int ret;
+
+	ret = block_write_begin(mapping, pos, len, flags, pagep, get_block);
+	if (unlikely(ret)) {
+		loff_t isize = mapping->host->i_size;
+		if (pos + len > isize)
+			vmtruncate(mapping->host, isize);
+	}
+
+	return ret;
 }
 
 static sector_t sysv_bmap(struct address_space *mapping, sector_t block)
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index 33e047b59b8..11e7f7d11cd 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -126,7 +126,7 @@ static int sysv_link(struct dentry * old_dentry, struct inode * dir,
 
 	inode->i_ctime = CURRENT_TIME_SEC;
 	inode_inc_link_count(inode);
-	atomic_inc(&inode->i_count);
+	ihold(inode);
 
 	return add_nondir(dentry, inode);
 }
diff --git a/fs/sysv/super.c b/fs/sysv/super.c
index 5a903da5455..3d9c62be0c1 100644
--- a/fs/sysv/super.c
+++ b/fs/sysv/super.c
@@ -347,7 +347,6 @@ static int complete_read_super(struct super_block *sb, int silent, int size)
 		sb->s_flags |= MS_RDONLY;
 	if (sbi->s_truncate)
 		sb->s_root->d_op = &sysv_dentry_operations;
-	sb->s_dirt = 1;
 	return 1;
 }
 
@@ -435,12 +434,46 @@ Ebadsize:
 	goto failed;
 }
 
-static int v7_fill_super(struct super_block *sb, void *data, int silent)
+static int v7_sanity_check(struct super_block *sb, struct buffer_head *bh)
 {
-	struct sysv_sb_info *sbi;
-	struct buffer_head *bh, *bh2 = NULL;
 	struct v7_super_block *v7sb;
 	struct sysv_inode *v7i;
+	struct buffer_head *bh2;
+	struct sysv_sb_info *sbi;
+
+	sbi = sb->s_fs_info;
+
+	/* plausibility check on superblock */
+	v7sb = (struct v7_super_block *) bh->b_data;
+	if (fs16_to_cpu(sbi, v7sb->s_nfree) > V7_NICFREE ||
+	    fs16_to_cpu(sbi, v7sb->s_ninode) > V7_NICINOD ||
+	    fs32_to_cpu(sbi, v7sb->s_fsize) > V7_MAXSIZE)
+		return 0;
+
+	/* plausibility check on root inode: it is a directory,
+	   with a nonzero size that is a multiple of 16 */
+	bh2 = sb_bread(sb, 2);
+	if (bh2 == NULL)
+		return 0;
+
+	v7i = (struct sysv_inode *)(bh2->b_data + 64);
+	if ((fs16_to_cpu(sbi, v7i->i_mode) & ~0777) != S_IFDIR ||
+	    (fs32_to_cpu(sbi, v7i->i_size) == 0) ||
+	    (fs32_to_cpu(sbi, v7i->i_size) & 017) ||
+	    (fs32_to_cpu(sbi, v7i->i_size) > V7_NFILES *
+	     sizeof(struct sysv_dir_entry))) {
+		brelse(bh2);
+		return 0;
+	}
+
+	brelse(bh2);
+	return 1;
+}
+
+static int v7_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct sysv_sb_info *sbi;
+	struct buffer_head *bh;
 
 	if (440 != sizeof (struct v7_super_block))
 		panic("V7 FS: bad super-block size");
@@ -454,7 +487,6 @@ static int v7_fill_super(struct super_block *sb, void *data, int silent)
 	sbi->s_sb = sb;
 	sbi->s_block_base = 0;
 	sbi->s_type = FSTYPE_V7;
-	sbi->s_bytesex = BYTESEX_PDP;
 	sb->s_fs_info = sbi;
 	
 	sb_set_blocksize(sb, 512);
@@ -466,32 +498,27 @@ static int v7_fill_super(struct super_block *sb, void *data, int silent)
 		goto failed;
 	}
 
-	/* plausibility check on superblock */
-	v7sb = (struct v7_super_block *) bh->b_data;
-	if (fs16_to_cpu(sbi, v7sb->s_nfree) > V7_NICFREE ||
-	    fs16_to_cpu(sbi, v7sb->s_ninode) > V7_NICINOD ||
-	    fs32_to_cpu(sbi, v7sb->s_time) == 0)
-		goto failed;
+	/* Try PDP-11 UNIX */
+	sbi->s_bytesex = BYTESEX_PDP;
+	if (v7_sanity_check(sb, bh))
+		goto detected;
 
-	/* plausibility check on root inode: it is a directory,
-	   with a nonzero size that is a multiple of 16 */
-	if ((bh2 = sb_bread(sb, 2)) == NULL)
-		goto failed;
-	v7i = (struct sysv_inode *)(bh2->b_data + 64);
-	if ((fs16_to_cpu(sbi, v7i->i_mode) & ~0777) != S_IFDIR ||
-	    (fs32_to_cpu(sbi, v7i->i_size) == 0) ||
-	    (fs32_to_cpu(sbi, v7i->i_size) & 017) != 0)
-		goto failed;
-	brelse(bh2);
-	bh2 = NULL;
+	/* Try PC/IX, v7/x86 */
+	sbi->s_bytesex = BYTESEX_LE;
+	if (v7_sanity_check(sb, bh))
+		goto detected;
 
+	goto failed;
+
+detected:
 	sbi->s_bh1 = bh;
 	sbi->s_bh2 = bh;
 	if (complete_read_super(sb, silent, 1))
 		return 0;
 
 failed:
-	brelse(bh2);
+	printk(KERN_ERR "VFS: could not find a valid V7 on %s.\n",
+		sb->s_id);
 	brelse(bh);
 	kfree(sbi);
 	return -EINVAL;
@@ -499,23 +526,22 @@ failed:
 
 /* Every kernel module contains stuff like this. */
 
-static int sysv_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *sysv_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, sysv_fill_super,
-			   mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, sysv_fill_super);
 }
 
-static int v7_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *v7_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, v7_fill_super, mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, v7_fill_super);
 }
 
 static struct file_system_type sysv_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "sysv",
-	.get_sb		= sysv_get_sb,
+	.mount		= sysv_mount,
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
@@ -523,7 +549,7 @@ static struct file_system_type sysv_fs_type = {
 static struct file_system_type v7_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "v7",
-	.get_sb		= v7_get_sb,
+	.mount		= v7_mount,
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
@@ -560,4 +586,5 @@ static void __exit exit_sysv_fs(void)
 
 module_init(init_sysv_fs)
 module_exit(exit_sysv_fs)
+MODULE_ALIAS("v7");
 MODULE_LICENSE("GPL");
diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h
index 94cb9b4d76c..bb55cdb394b 100644
--- a/fs/sysv/sysv.h
+++ b/fs/sysv/sysv.h
@@ -136,9 +136,7 @@ extern unsigned long sysv_count_free_blocks(struct super_block *);
 
 /* itree.c */
 extern void sysv_truncate(struct inode *);
-extern int __sysv_write_begin(struct file *file, struct address_space *mapping,
-			loff_t pos, unsigned len, unsigned flags,
-			struct page **pagep, void **fsdata);
+extern int sysv_prepare_chunk(struct page *page, loff_t pos, unsigned len);
 
 /* inode.c */
 extern struct inode *sysv_iget(struct super_block *, unsigned int);
diff --git a/fs/timerfd.c b/fs/timerfd.c
index b86ab8eff79..8c4fc1425b3 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -144,6 +144,7 @@ static const struct file_operations timerfd_fops = {
 	.release	= timerfd_release,
 	.poll		= timerfd_poll,
 	.read		= timerfd_read,
+	.llseek		= noop_llseek,
 };
 
 static struct file *timerfd_fget(int fd)
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index 37fa7ed062d..02429d81ca3 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -63,7 +63,9 @@ static int do_commit(struct ubifs_info *c)
 	struct ubifs_lp_stats lst;
 
 	dbg_cmt("start");
-	if (c->ro_media) {
+	ubifs_assert(!c->ro_media && !c->ro_mount);
+
+	if (c->ro_error) {
 		err = -EROFS;
 		goto out_up;
 	}
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index c2a68baa782..0bee4dbffc3 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -2239,6 +2239,162 @@ out_free:
 	return err;
 }
 
+/**
+ * dbg_check_data_nodes_order - check that list of data nodes is sorted.
+ * @c: UBIFS file-system description object
+ * @head: the list of nodes ('struct ubifs_scan_node' objects)
+ *
+ * This function returns zero if the list of data nodes is sorted correctly,
+ * and %-EINVAL if not.
+ */
+int dbg_check_data_nodes_order(struct ubifs_info *c, struct list_head *head)
+{
+	struct list_head *cur;
+	struct ubifs_scan_node *sa, *sb;
+
+	if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
+		return 0;
+
+	for (cur = head->next; cur->next != head; cur = cur->next) {
+		ino_t inuma, inumb;
+		uint32_t blka, blkb;
+
+		cond_resched();
+		sa = container_of(cur, struct ubifs_scan_node, list);
+		sb = container_of(cur->next, struct ubifs_scan_node, list);
+
+		if (sa->type != UBIFS_DATA_NODE) {
+			ubifs_err("bad node type %d", sa->type);
+			dbg_dump_node(c, sa->node);
+			return -EINVAL;
+		}
+		if (sb->type != UBIFS_DATA_NODE) {
+			ubifs_err("bad node type %d", sb->type);
+			dbg_dump_node(c, sb->node);
+			return -EINVAL;
+		}
+
+		inuma = key_inum(c, &sa->key);
+		inumb = key_inum(c, &sb->key);
+
+		if (inuma < inumb)
+			continue;
+		if (inuma > inumb) {
+			ubifs_err("larger inum %lu goes before inum %lu",
+				  (unsigned long)inuma, (unsigned long)inumb);
+			goto error_dump;
+		}
+
+		blka = key_block(c, &sa->key);
+		blkb = key_block(c, &sb->key);
+
+		if (blka > blkb) {
+			ubifs_err("larger block %u goes before %u", blka, blkb);
+			goto error_dump;
+		}
+		if (blka == blkb) {
+			ubifs_err("two data nodes for the same block");
+			goto error_dump;
+		}
+	}
+
+	return 0;
+
+error_dump:
+	dbg_dump_node(c, sa->node);
+	dbg_dump_node(c, sb->node);
+	return -EINVAL;
+}
+
+/**
+ * dbg_check_nondata_nodes_order - check that list of data nodes is sorted.
+ * @c: UBIFS file-system description object
+ * @head: the list of nodes ('struct ubifs_scan_node' objects)
+ *
+ * This function returns zero if the list of non-data nodes is sorted correctly,
+ * and %-EINVAL if not.
+ */
+int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head)
+{
+	struct list_head *cur;
+	struct ubifs_scan_node *sa, *sb;
+
+	if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
+		return 0;
+
+	for (cur = head->next; cur->next != head; cur = cur->next) {
+		ino_t inuma, inumb;
+		uint32_t hasha, hashb;
+
+		cond_resched();
+		sa = container_of(cur, struct ubifs_scan_node, list);
+		sb = container_of(cur->next, struct ubifs_scan_node, list);
+
+		if (sa->type != UBIFS_INO_NODE && sa->type != UBIFS_DENT_NODE &&
+		    sa->type != UBIFS_XENT_NODE) {
+			ubifs_err("bad node type %d", sa->type);
+			dbg_dump_node(c, sa->node);
+			return -EINVAL;
+		}
+		if (sa->type != UBIFS_INO_NODE && sa->type != UBIFS_DENT_NODE &&
+		    sa->type != UBIFS_XENT_NODE) {
+			ubifs_err("bad node type %d", sb->type);
+			dbg_dump_node(c, sb->node);
+			return -EINVAL;
+		}
+
+		if (sa->type != UBIFS_INO_NODE && sb->type == UBIFS_INO_NODE) {
+			ubifs_err("non-inode node goes before inode node");
+			goto error_dump;
+		}
+
+		if (sa->type == UBIFS_INO_NODE && sb->type != UBIFS_INO_NODE)
+			continue;
+
+		if (sa->type == UBIFS_INO_NODE && sb->type == UBIFS_INO_NODE) {
+			/* Inode nodes are sorted in descending size order */
+			if (sa->len < sb->len) {
+				ubifs_err("smaller inode node goes first");
+				goto error_dump;
+			}
+			continue;
+		}
+
+		/*
+		 * This is either a dentry or xentry, which should be sorted in
+		 * ascending (parent ino, hash) order.
+		 */
+		inuma = key_inum(c, &sa->key);
+		inumb = key_inum(c, &sb->key);
+
+		if (inuma < inumb)
+			continue;
+		if (inuma > inumb) {
+			ubifs_err("larger inum %lu goes before inum %lu",
+				  (unsigned long)inuma, (unsigned long)inumb);
+			goto error_dump;
+		}
+
+		hasha = key_block(c, &sa->key);
+		hashb = key_block(c, &sb->key);
+
+		if (hasha > hashb) {
+			ubifs_err("larger hash %u goes before %u", hasha, hashb);
+			goto error_dump;
+		}
+	}
+
+	return 0;
+
+error_dump:
+	ubifs_msg("dumping first node");
+	dbg_dump_node(c, sa->node);
+	ubifs_msg("dumping second node");
+	dbg_dump_node(c, sb->node);
+	return -EINVAL;
+	return 0;
+}
+
 static int invocation_cnt;
 
 int dbg_force_in_the_gaps(void)
@@ -2625,6 +2781,7 @@ static const struct file_operations dfs_fops = {
 	.open = open_debugfs_file,
 	.write = write_debugfs_file,
 	.owner = THIS_MODULE,
+	.llseek = default_llseek,
 };
 
 /**
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index 29d960101ea..69ebe472915 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -324,6 +324,8 @@ int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode,
 			int row, int col);
 int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode,
 			 loff_t size);
+int dbg_check_data_nodes_order(struct ubifs_info *c, struct list_head *head);
+int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head);
 
 /* Force the use of in-the-gaps method for testing */
 
@@ -465,6 +467,8 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c);
 #define dbg_check_lprops(c)                        0
 #define dbg_check_lpt_nodes(c, cnode, row, col)    0
 #define dbg_check_inode_size(c, inode, size)       0
+#define dbg_check_data_nodes_order(c, head)        0
+#define dbg_check_nondata_nodes_order(c, head)     0
 #define dbg_force_in_the_gaps_enabled              0
 #define dbg_force_in_the_gaps()                    0
 #define dbg_failure_mode                           0
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 87ebcce7221..14f64b689d7 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -550,7 +550,7 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
 
 	lock_2_inodes(dir, inode);
 	inc_nlink(inode);
-	atomic_inc(&inode->i_count);
+	ihold(inode);
 	inode->i_ctime = ubifs_current_time(inode);
 	dir->i_size += sz_change;
 	dir_ui->ui_size = dir->i_size;
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 12f445cee9f..d77db7e3648 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -433,8 +433,9 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
 	struct page *page;
 
 	ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size);
+	ubifs_assert(!c->ro_media && !c->ro_mount);
 
-	if (unlikely(c->ro_media))
+	if (unlikely(c->ro_error))
 		return -EROFS;
 
 	/* Try out the fast-path part first */
@@ -967,14 +968,15 @@ static int do_writepage(struct page *page, int len)
  * the page locked, and it locks @ui_mutex. However, write-back does take inode
  * @i_mutex, which means other VFS operations may be run on this inode at the
  * same time. And the problematic one is truncation to smaller size, from where
- * we have to call 'simple_setsize()', which first changes @inode->i_size, then
+ * we have to call 'truncate_setsize()', which first changes @inode->i_size, then
  * drops the truncated pages. And while dropping the pages, it takes the page
- * lock. This means that 'do_truncation()' cannot call 'simple_setsize()' with
+ * lock. This means that 'do_truncation()' cannot call 'truncate_setsize()' with
  * @ui_mutex locked, because it would deadlock with 'ubifs_writepage()'. This
  * means that @inode->i_size is changed while @ui_mutex is unlocked.
  *
- * XXX: with the new truncate the above is not true anymore, the simple_setsize
- * calls can be replaced with the individual components.
+ * XXX(truncate): with the new truncate sequence this is not true anymore,
+ * and the calls to truncate_setsize can be move around freely.  They should
+ * be moved to the very end of the truncate sequence.
  *
  * But in 'ubifs_writepage()' we have to guarantee that we do not write beyond
  * inode size. How do we do this if @inode->i_size may became smaller while we
@@ -1128,9 +1130,7 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
 		budgeted = 0;
 	}
 
-	err = simple_setsize(inode, new_size);
-	if (err)
-		goto out_budg;
+	truncate_setsize(inode, new_size);
 
 	if (offset) {
 		pgoff_t index = new_size >> PAGE_CACHE_SHIFT;
@@ -1217,16 +1217,14 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode,
 
 	if (attr->ia_valid & ATTR_SIZE) {
 		dbg_gen("size %lld -> %lld", inode->i_size, new_size);
-		err = simple_setsize(inode, new_size);
-		if (err)
-			goto out;
+		truncate_setsize(inode, new_size);
 	}
 
 	mutex_lock(&ui->ui_mutex);
 	if (attr->ia_valid & ATTR_SIZE) {
 		/* Truncation changes inode [mc]time */
 		inode->i_mtime = inode->i_ctime = ubifs_current_time(inode);
-		/* 'simple_setsize()' changed @i_size, update @ui_size */
+		/* 'truncate_setsize()' changed @i_size, update @ui_size */
 		ui->ui_size = inode->i_size;
 	}
 
@@ -1248,10 +1246,6 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode,
 	if (IS_SYNC(inode))
 		err = inode->i_sb->s_op->write_inode(inode, NULL);
 	return err;
-
-out:
-	ubifs_release_budget(c, &req);
-	return err;
 }
 
 int ubifs_setattr(struct dentry *dentry, struct iattr *attr)
@@ -1446,9 +1440,9 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vm
 
 	dbg_gen("ino %lu, pg %lu, i_size %lld",	inode->i_ino, page->index,
 		i_size_read(inode));
-	ubifs_assert(!(inode->i_sb->s_flags & MS_RDONLY));
+	ubifs_assert(!c->ro_media && !c->ro_mount);
 
-	if (unlikely(c->ro_media))
+	if (unlikely(c->ro_error))
 		return VM_FAULT_SIGBUS; /* -EROFS */
 
 	/*
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index 918d1582ca0..151f1088282 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -125,10 +125,16 @@ int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
 	struct ubifs_scan_node *sa, *sb;
 
 	cond_resched();
+	if (a == b)
+		return 0;
+
 	sa = list_entry(a, struct ubifs_scan_node, list);
 	sb = list_entry(b, struct ubifs_scan_node, list);
+
 	ubifs_assert(key_type(c, &sa->key) == UBIFS_DATA_KEY);
 	ubifs_assert(key_type(c, &sb->key) == UBIFS_DATA_KEY);
+	ubifs_assert(sa->type == UBIFS_DATA_NODE);
+	ubifs_assert(sb->type == UBIFS_DATA_NODE);
 
 	inuma = key_inum(c, &sa->key);
 	inumb = key_inum(c, &sb->key);
@@ -157,28 +163,40 @@ int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
  */
 int nondata_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
 {
-	int typea, typeb;
 	ino_t inuma, inumb;
 	struct ubifs_info *c = priv;
 	struct ubifs_scan_node *sa, *sb;
 
 	cond_resched();
+	if (a == b)
+		return 0;
+
 	sa = list_entry(a, struct ubifs_scan_node, list);
 	sb = list_entry(b, struct ubifs_scan_node, list);
-	typea = key_type(c, &sa->key);
-	typeb = key_type(c, &sb->key);
-	ubifs_assert(typea != UBIFS_DATA_KEY && typeb != UBIFS_DATA_KEY);
+
+	ubifs_assert(key_type(c, &sa->key) != UBIFS_DATA_KEY &&
+		     key_type(c, &sb->key) != UBIFS_DATA_KEY);
+	ubifs_assert(sa->type != UBIFS_DATA_NODE &&
+		     sb->type != UBIFS_DATA_NODE);
 
 	/* Inodes go before directory entries */
-	if (typea == UBIFS_INO_KEY) {
-		if (typeb == UBIFS_INO_KEY)
+	if (sa->type == UBIFS_INO_NODE) {
+		if (sb->type == UBIFS_INO_NODE)
 			return sb->len - sa->len;
 		return -1;
 	}
-	if (typeb == UBIFS_INO_KEY)
+	if (sb->type == UBIFS_INO_NODE)
 		return 1;
 
-	ubifs_assert(typea == UBIFS_DENT_KEY && typeb == UBIFS_DENT_KEY);
+	ubifs_assert(key_type(c, &sa->key) == UBIFS_DENT_KEY ||
+		     key_type(c, &sa->key) == UBIFS_XENT_KEY);
+	ubifs_assert(key_type(c, &sb->key) == UBIFS_DENT_KEY ||
+		     key_type(c, &sb->key) == UBIFS_XENT_KEY);
+	ubifs_assert(sa->type == UBIFS_DENT_NODE ||
+		     sa->type == UBIFS_XENT_NODE);
+	ubifs_assert(sb->type == UBIFS_DENT_NODE ||
+		     sb->type == UBIFS_XENT_NODE);
+
 	inuma = key_inum(c, &sa->key);
 	inumb = key_inum(c, &sb->key);
 
@@ -224,17 +242,33 @@ int nondata_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
 static int sort_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
 		      struct list_head *nondata, int *min)
 {
+	int err;
 	struct ubifs_scan_node *snod, *tmp;
 
 	*min = INT_MAX;
 
 	/* Separate data nodes and non-data nodes */
 	list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
-		int err;
+		ubifs_assert(snod->type == UBIFS_INO_NODE  ||
+			     snod->type == UBIFS_DATA_NODE ||
+			     snod->type == UBIFS_DENT_NODE ||
+			     snod->type == UBIFS_XENT_NODE ||
+			     snod->type == UBIFS_TRUN_NODE);
+
+		if (snod->type != UBIFS_INO_NODE  &&
+		    snod->type != UBIFS_DATA_NODE &&
+		    snod->type != UBIFS_DENT_NODE &&
+		    snod->type != UBIFS_XENT_NODE) {
+			/* Probably truncation node, zap it */
+			list_del(&snod->list);
+			kfree(snod);
+			continue;
+		}
 
-		ubifs_assert(snod->type != UBIFS_IDX_NODE);
-		ubifs_assert(snod->type != UBIFS_REF_NODE);
-		ubifs_assert(snod->type != UBIFS_CS_NODE);
+		ubifs_assert(key_type(c, &snod->key) == UBIFS_DATA_KEY ||
+			     key_type(c, &snod->key) == UBIFS_INO_KEY  ||
+			     key_type(c, &snod->key) == UBIFS_DENT_KEY ||
+			     key_type(c, &snod->key) == UBIFS_XENT_KEY);
 
 		err = ubifs_tnc_has_node(c, &snod->key, 0, sleb->lnum,
 					 snod->offs, 0);
@@ -258,6 +292,13 @@ static int sort_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
 	/* Sort data and non-data nodes */
 	list_sort(c, &sleb->nodes, &data_nodes_cmp);
 	list_sort(c, nondata, &nondata_nodes_cmp);
+
+	err = dbg_check_data_nodes_order(c, &sleb->nodes);
+	if (err)
+		return err;
+	err = dbg_check_nondata_nodes_order(c, nondata);
+	if (err)
+		return err;
 	return 0;
 }
 
@@ -575,13 +616,14 @@ int ubifs_garbage_collect(struct ubifs_info *c, int anyway)
 	struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
 
 	ubifs_assert_cmt_locked(c);
+	ubifs_assert(!c->ro_media && !c->ro_mount);
 
 	if (ubifs_gc_should_commit(c))
 		return -EAGAIN;
 
 	mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
 
-	if (c->ro_media) {
+	if (c->ro_error) {
 		ret = -EROFS;
 		goto out_unlock;
 	}
@@ -677,14 +719,12 @@ int ubifs_garbage_collect(struct ubifs_info *c, int anyway)
 
 		ret = ubifs_garbage_collect_leb(c, &lp);
 		if (ret < 0) {
-			if (ret == -EAGAIN || ret == -ENOSPC) {
+			if (ret == -EAGAIN) {
 				/*
-				 * These codes are not errors, so we have to
-				 * return the LEB to lprops. But if the
-				 * 'ubifs_return_leb()' function fails, its
-				 * failure code is propagated to the caller
-				 * instead of the original '-EAGAIN' or
-				 * '-ENOSPC'.
+				 * This is not error, so we have to return the
+				 * LEB to lprops. But if 'ubifs_return_leb()'
+				 * fails, its failure code is propagated to the
+				 * caller instead of the original '-EAGAIN'.
 				 */
 				err = ubifs_return_leb(c, lp.lnum);
 				if (err)
@@ -774,8 +814,8 @@ out_unlock:
 out:
 	ubifs_assert(ret < 0);
 	ubifs_assert(ret != -ENOSPC && ret != -EAGAIN);
-	ubifs_ro_mode(c, ret);
 	ubifs_wbuf_sync_nolock(wbuf);
+	ubifs_ro_mode(c, ret);
 	mutex_unlock(&wbuf->io_mutex);
 	ubifs_return_leb(c, lp.lnum);
 	return ret;
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index bcf5a16f30b..d82173182ee 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -61,8 +61,8 @@
  */
 void ubifs_ro_mode(struct ubifs_info *c, int err)
 {
-	if (!c->ro_media) {
-		c->ro_media = 1;
+	if (!c->ro_error) {
+		c->ro_error = 1;
 		c->no_chk_data_crc = 0;
 		c->vfs_sb->s_flags |= MS_RDONLY;
 		ubifs_warn("switched to read-only mode, error %d", err);
@@ -356,11 +356,11 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf)
 
 	dbg_io("LEB %d:%d, %d bytes, jhead %s",
 	       wbuf->lnum, wbuf->offs, wbuf->used, dbg_jhead(wbuf->jhead));
-	ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY));
 	ubifs_assert(!(wbuf->avail & 7));
 	ubifs_assert(wbuf->offs + c->min_io_size <= c->leb_size);
+	ubifs_assert(!c->ro_media && !c->ro_mount);
 
-	if (c->ro_media)
+	if (c->ro_error)
 		return -EROFS;
 
 	ubifs_pad(c, wbuf->buf + wbuf->used, wbuf->avail);
@@ -440,11 +440,12 @@ int ubifs_bg_wbufs_sync(struct ubifs_info *c)
 {
 	int err, i;
 
+	ubifs_assert(!c->ro_media && !c->ro_mount);
 	if (!c->need_wbuf_sync)
 		return 0;
 	c->need_wbuf_sync = 0;
 
-	if (c->ro_media) {
+	if (c->ro_error) {
 		err = -EROFS;
 		goto out_timers;
 	}
@@ -519,6 +520,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
 	ubifs_assert(!(wbuf->offs & 7) && wbuf->offs <= c->leb_size);
 	ubifs_assert(wbuf->avail > 0 && wbuf->avail <= c->min_io_size);
 	ubifs_assert(mutex_is_locked(&wbuf->io_mutex));
+	ubifs_assert(!c->ro_media && !c->ro_mount);
 
 	if (c->leb_size - wbuf->offs - wbuf->used < aligned_len) {
 		err = -ENOSPC;
@@ -527,7 +529,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
 
 	cancel_wbuf_timer_nolock(wbuf);
 
-	if (c->ro_media)
+	if (c->ro_error)
 		return -EROFS;
 
 	if (aligned_len <= wbuf->avail) {
@@ -663,8 +665,9 @@ int ubifs_write_node(struct ubifs_info *c, void *buf, int len, int lnum,
 	       buf_len);
 	ubifs_assert(lnum >= 0 && lnum < c->leb_cnt && offs >= 0);
 	ubifs_assert(offs % c->min_io_size == 0 && offs < c->leb_size);
+	ubifs_assert(!c->ro_media && !c->ro_mount);
 
-	if (c->ro_media)
+	if (c->ro_error)
 		return -EROFS;
 
 	ubifs_prepare_node(c, buf, len, 1);
@@ -815,7 +818,8 @@ int ubifs_read_node(const struct ubifs_info *c, void *buf, int type, int len,
 	return 0;
 
 out:
-	ubifs_err("bad node at LEB %d:%d", lnum, offs);
+	ubifs_err("bad node at LEB %d:%d, LEB mapping status %d", lnum, offs,
+		  ubi_is_mapped(c->ubi, lnum));
 	dbg_dump_node(c, buf);
 	dbg_dump_stack();
 	return -EINVAL;
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index d321baeca68..914f1bd89e5 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -122,11 +122,12 @@ static int reserve_space(struct ubifs_info *c, int jhead, int len)
 	 * better to try to allocate space at the ends of eraseblocks. This is
 	 * what the squeeze parameter does.
 	 */
+	ubifs_assert(!c->ro_media && !c->ro_mount);
 	squeeze = (jhead == BASEHD);
 again:
 	mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
 
-	if (c->ro_media) {
+	if (c->ro_error) {
 		err = -EROFS;
 		goto out_unlock;
 	}
diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h
index 0f530c684f0..92a8491a8f8 100644
--- a/fs/ubifs/key.h
+++ b/fs/ubifs/key.h
@@ -306,6 +306,20 @@ static inline void trun_key_init(const struct ubifs_info *c,
 }
 
 /**
+ * invalid_key_init - initialize invalid node key.
+ * @c: UBIFS file-system description object
+ * @key: key to initialize
+ *
+ * This is a helper function which marks a @key object as invalid.
+ */
+static inline void invalid_key_init(const struct ubifs_info *c,
+				    union ubifs_key *key)
+{
+	key->u32[0] = 0xDEADBEAF;
+	key->u32[1] = UBIFS_INVALID_KEY;
+}
+
+/**
  * key_type - get key type.
  * @c: UBIFS file-system description object
  * @key: key to get type of
diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c
index c345e125f42..4d0cb124146 100644
--- a/fs/ubifs/log.c
+++ b/fs/ubifs/log.c
@@ -159,7 +159,7 @@ void ubifs_add_bud(struct ubifs_info *c, struct ubifs_bud *bud)
 		jhead = &c->jheads[bud->jhead];
 		list_add_tail(&bud->list, &jhead->buds_list);
 	} else
-		ubifs_assert(c->replaying && (c->vfs_sb->s_flags & MS_RDONLY));
+		ubifs_assert(c->replaying && c->ro_mount);
 
 	/*
 	 * Note, although this is a new bud, we anyway account this space now,
@@ -223,8 +223,8 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs)
 	}
 
 	mutex_lock(&c->log_mutex);
-
-	if (c->ro_media) {
+	ubifs_assert(!c->ro_media && !c->ro_mount);
+	if (c->ro_error) {
 		err = -EROFS;
 		goto out_unlock;
 	}
diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
index ad7f67b827e..72775d35b99 100644
--- a/fs/ubifs/lpt.c
+++ b/fs/ubifs/lpt.c
@@ -1363,6 +1363,7 @@ static int read_lsave(struct ubifs_info *c)
 		goto out;
 	for (i = 0; i < c->lsave_cnt; i++) {
 		int lnum = c->lsave[i];
+		struct ubifs_lprops *lprops;
 
 		/*
 		 * Due to automatic resizing, the values in the lsave table
@@ -1370,7 +1371,11 @@ static int read_lsave(struct ubifs_info *c)
 		 */
 		if (lnum >= c->leb_cnt)
 			continue;
-		ubifs_lpt_lookup(c, lnum);
+		lprops = ubifs_lpt_lookup(c, lnum);
+		if (IS_ERR(lprops)) {
+			err = PTR_ERR(lprops);
+			goto out;
+		}
 	}
 out:
 	vfree(buf);
@@ -1457,13 +1462,13 @@ struct ubifs_lprops *ubifs_lpt_lookup(struct ubifs_info *c, int lnum)
 		shft -= UBIFS_LPT_FANOUT_SHIFT;
 		nnode = ubifs_get_nnode(c, nnode, iip);
 		if (IS_ERR(nnode))
-			return ERR_PTR(PTR_ERR(nnode));
+			return ERR_CAST(nnode);
 	}
 	iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
 	shft -= UBIFS_LPT_FANOUT_SHIFT;
 	pnode = ubifs_get_pnode(c, nnode, iip);
 	if (IS_ERR(pnode))
-		return ERR_PTR(PTR_ERR(pnode));
+		return ERR_CAST(pnode);
 	iip = (i & (UBIFS_LPT_FANOUT - 1));
 	dbg_lp("LEB %d, free %d, dirty %d, flags %d", lnum,
 	       pnode->lprops[iip].free, pnode->lprops[iip].dirty,
@@ -1586,7 +1591,7 @@ struct ubifs_lprops *ubifs_lpt_lookup_dirty(struct ubifs_info *c, int lnum)
 	nnode = c->nroot;
 	nnode = dirty_cow_nnode(c, nnode);
 	if (IS_ERR(nnode))
-		return ERR_PTR(PTR_ERR(nnode));
+		return ERR_CAST(nnode);
 	i = lnum - c->main_first;
 	shft = c->lpt_hght * UBIFS_LPT_FANOUT_SHIFT;
 	for (h = 1; h < c->lpt_hght; h++) {
@@ -1594,19 +1599,19 @@ struct ubifs_lprops *ubifs_lpt_lookup_dirty(struct ubifs_info *c, int lnum)
 		shft -= UBIFS_LPT_FANOUT_SHIFT;
 		nnode = ubifs_get_nnode(c, nnode, iip);
 		if (IS_ERR(nnode))
-			return ERR_PTR(PTR_ERR(nnode));
+			return ERR_CAST(nnode);
 		nnode = dirty_cow_nnode(c, nnode);
 		if (IS_ERR(nnode))
-			return ERR_PTR(PTR_ERR(nnode));
+			return ERR_CAST(nnode);
 	}
 	iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
 	shft -= UBIFS_LPT_FANOUT_SHIFT;
 	pnode = ubifs_get_pnode(c, nnode, iip);
 	if (IS_ERR(pnode))
-		return ERR_PTR(PTR_ERR(pnode));
+		return ERR_CAST(pnode);
 	pnode = dirty_cow_pnode(c, pnode);
 	if (IS_ERR(pnode))
-		return ERR_PTR(PTR_ERR(pnode));
+		return ERR_CAST(pnode);
 	iip = (i & (UBIFS_LPT_FANOUT - 1));
 	dbg_lp("LEB %d, free %d, dirty %d, flags %d", lnum,
 	       pnode->lprops[iip].free, pnode->lprops[iip].dirty,
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 13cb7a4237b..5c90dec5db0 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -646,7 +646,7 @@ static struct ubifs_pnode *pnode_lookup(struct ubifs_info *c, int i)
 		shft -= UBIFS_LPT_FANOUT_SHIFT;
 		nnode = ubifs_get_nnode(c, nnode, iip);
 		if (IS_ERR(nnode))
-			return ERR_PTR(PTR_ERR(nnode));
+			return ERR_CAST(nnode);
 	}
 	iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
 	return ubifs_get_pnode(c, nnode, iip);
@@ -705,6 +705,9 @@ static int make_tree_dirty(struct ubifs_info *c)
 	struct ubifs_pnode *pnode;
 
 	pnode = pnode_lookup(c, 0);
+	if (IS_ERR(pnode))
+		return PTR_ERR(pnode);
+
 	while (pnode) {
 		do_make_pnode_dirty(c, pnode);
 		pnode = next_pnode_to_dirty(c, pnode);
diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c
index 28beaeedadc..21f47afdacf 100644
--- a/fs/ubifs/master.c
+++ b/fs/ubifs/master.c
@@ -361,7 +361,8 @@ int ubifs_write_master(struct ubifs_info *c)
 {
 	int err, lnum, offs, len;
 
-	if (c->ro_media)
+	ubifs_assert(!c->ro_media && !c->ro_mount);
+	if (c->ro_error)
 		return -EROFS;
 
 	lnum = UBIFS_MST_LNUM;
diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h
index 4fa81d867e4..c3de04dc952 100644
--- a/fs/ubifs/misc.h
+++ b/fs/ubifs/misc.h
@@ -132,7 +132,8 @@ static inline int ubifs_leb_unmap(const struct ubifs_info *c, int lnum)
 {
 	int err;
 
-	if (c->ro_media)
+	ubifs_assert(!c->ro_media && !c->ro_mount);
+	if (c->ro_error)
 		return -EROFS;
 	err = ubi_leb_unmap(c->ubi, lnum);
 	if (err) {
@@ -159,7 +160,8 @@ static inline int ubifs_leb_write(const struct ubifs_info *c, int lnum,
 {
 	int err;
 
-	if (c->ro_media)
+	ubifs_assert(!c->ro_media && !c->ro_mount);
+	if (c->ro_error)
 		return -EROFS;
 	err = ubi_leb_write(c->ubi, lnum, buf, offs, len, dtype);
 	if (err) {
@@ -186,7 +188,8 @@ static inline int ubifs_leb_change(const struct ubifs_info *c, int lnum,
 {
 	int err;
 
-	if (c->ro_media)
+	ubifs_assert(!c->ro_media && !c->ro_mount);
+	if (c->ro_error)
 		return -EROFS;
 	err = ubi_leb_change(c->ubi, lnum, buf, len, dtype);
 	if (err) {
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 109c6ea03bb..77e9b874b6c 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -24,7 +24,7 @@
  * This file implements functions needed to recover from unclean un-mounts.
  * When UBIFS is mounted, it checks a flag on the master node to determine if
  * an un-mount was completed successfully. If not, the process of mounting
- * incorparates additional checking and fixing of on-flash data structures.
+ * incorporates additional checking and fixing of on-flash data structures.
  * UBIFS always cleans away all remnants of an unclean un-mount, so that
  * errors do not accumulate. However UBIFS defers recovery if it is mounted
  * read-only, and the flash is not modified in that case.
@@ -292,7 +292,7 @@ int ubifs_recover_master_node(struct ubifs_info *c)
 
 	memcpy(c->mst_node, mst, UBIFS_MST_NODE_SZ);
 
-	if ((c->vfs_sb->s_flags & MS_RDONLY)) {
+	if (c->ro_mount) {
 		/* Read-only mode. Keep a copy for switching to rw mode */
 		c->rcvrd_mst_node = kmalloc(sz, GFP_KERNEL);
 		if (!c->rcvrd_mst_node) {
@@ -469,7 +469,7 @@ static int fix_unclean_leb(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
 		endpt = snod->offs + snod->len;
 	}
 
-	if ((c->vfs_sb->s_flags & MS_RDONLY) && !c->remounting_rw) {
+	if (c->ro_mount && !c->remounting_rw) {
 		/* Add to recovery list */
 		struct ubifs_unclean_leb *ucleb;
 
@@ -772,7 +772,8 @@ out_free:
  * @sbuf: LEB-sized buffer to use
  *
  * This function does a scan of a LEB, but caters for errors that might have
- * been caused by the unclean unmount from which we are attempting to recover.
+ * been caused by unclean reboots from which we are attempting to recover
+ * (assume that only the last log LEB can be corrupted by an unclean reboot).
  *
  * This function returns %0 on success and a negative error code on failure.
  */
@@ -883,7 +884,7 @@ int ubifs_recover_inl_heads(const struct ubifs_info *c, void *sbuf)
 {
 	int err;
 
-	ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY) || c->remounting_rw);
+	ubifs_assert(!c->ro_mount || c->remounting_rw);
 
 	dbg_rcvry("checking index head at %d:%d", c->ihead_lnum, c->ihead_offs);
 	err = recover_head(c, c->ihead_lnum, c->ihead_offs, sbuf);
@@ -1063,8 +1064,21 @@ int ubifs_rcvry_gc_commit(struct ubifs_info *c)
 	}
 	err = ubifs_find_dirty_leb(c, &lp, wbuf->offs, 2);
 	if (err) {
-		if (err == -ENOSPC)
-			dbg_err("could not find a dirty LEB");
+		/*
+		 * There are no dirty or empty LEBs subject to here being
+		 * enough for the index. Try to use
+		 * 'ubifs_find_free_leb_for_idx()', which will return any empty
+		 * LEBs (ignoring index requirements). If the index then
+		 * doesn't have enough LEBs the recovery commit will fail -
+		 * which is the  same result anyway i.e. recovery fails. So
+		 * there is no problem ignoring index  requirements and just
+		 * grabbing a free LEB since we have already established there
+		 * is not a dirty LEB we could have used instead.
+		 */
+		if (err == -ENOSPC) {
+			dbg_rcvry("could not find a dirty LEB");
+			goto find_free;
+		}
 		return err;
 	}
 	ubifs_assert(!(lp.flags & LPROPS_INDEX));
@@ -1139,8 +1153,8 @@ int ubifs_rcvry_gc_commit(struct ubifs_info *c)
 find_free:
 	/*
 	 * There is no GC head LEB or the free space in the GC head LEB is too
-	 * small. Allocate gc_lnum by calling 'ubifs_find_free_leb_for_idx()' so
-	 * GC is not run.
+	 * small, or there are not dirty LEBs. Allocate gc_lnum by calling
+	 * 'ubifs_find_free_leb_for_idx()' so GC is not run.
 	 */
 	lnum = ubifs_find_free_leb_for_idx(c);
 	if (lnum < 0) {
@@ -1448,7 +1462,7 @@ int ubifs_recover_size(struct ubifs_info *c)
 			}
 		}
 		if (e->exists && e->i_size < e->d_size) {
-			if (!e->inode && (c->vfs_sb->s_flags & MS_RDONLY)) {
+			if (!e->inode && c->ro_mount) {
 				/* Fix the inode size and pin it in memory */
 				struct inode *inode;
 
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index 5c2d6d759a3..eed0fcff8d7 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -627,8 +627,7 @@ static int replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead,
 	ubifs_assert(sleb->endpt - offs >= used);
 	ubifs_assert(sleb->endpt % c->min_io_size == 0);
 
-	if (sleb->endpt + c->min_io_size <= c->leb_size &&
-	    !(c->vfs_sb->s_flags & MS_RDONLY))
+	if (sleb->endpt + c->min_io_size <= c->leb_size && !c->ro_mount)
 		err = ubifs_wbuf_seek_nolock(&c->jheads[jhead].wbuf, lnum,
 					     sleb->endpt, UBI_SHORTTERM);
 
@@ -840,6 +839,11 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf)
 	if (IS_ERR(sleb)) {
 		if (PTR_ERR(sleb) != -EUCLEAN || !c->need_recovery)
 			return PTR_ERR(sleb);
+		/*
+		 * Note, the below function will recover this log LEB only if
+		 * it is the last, because unclean reboots can possibly corrupt
+		 * only the tail of the log.
+		 */
 		sleb = ubifs_recover_log_leb(c, lnum, offs, sbuf);
 		if (IS_ERR(sleb))
 			return PTR_ERR(sleb);
@@ -851,7 +855,6 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf)
 	}
 
 	node = sleb->buf;
-
 	snod = list_entry(sleb->nodes.next, struct ubifs_scan_node, list);
 	if (c->cs_sqnum == 0) {
 		/*
@@ -898,7 +901,6 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf)
 	}
 
 	list_for_each_entry(snod, &sleb->nodes, list) {
-
 		cond_resched();
 
 		if (snod->sqnum >= SQNUM_WATERMARK) {
@@ -1011,7 +1013,6 @@ out:
 int ubifs_replay_journal(struct ubifs_info *c)
 {
 	int err, i, lnum, offs, free;
-	void *sbuf = NULL;
 
 	BUILD_BUG_ON(UBIFS_TRUN_KEY > 5);
 
@@ -1026,14 +1027,8 @@ int ubifs_replay_journal(struct ubifs_info *c)
 		return -EINVAL;
 	}
 
-	sbuf = vmalloc(c->leb_size);
-	if (!sbuf)
-		return -ENOMEM;
-
 	dbg_mnt("start replaying the journal");
-
 	c->replaying = 1;
-
 	lnum = c->ltail_lnum = c->lhead_lnum;
 	offs = c->lhead_offs;
 
@@ -1046,7 +1041,7 @@ int ubifs_replay_journal(struct ubifs_info *c)
 			lnum = UBIFS_LOG_LNUM;
 			offs = 0;
 		}
-		err = replay_log_leb(c, lnum, offs, sbuf);
+		err = replay_log_leb(c, lnum, offs, c->sbuf);
 		if (err == 1)
 			/* We hit the end of the log */
 			break;
@@ -1079,7 +1074,6 @@ int ubifs_replay_journal(struct ubifs_info *c)
 out:
 	destroy_replay_tree(c);
 	destroy_bud_list(c);
-	vfree(sbuf);
 	c->replaying = 0;
 	return err;
 }
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index 96cb62c8a9d..bf31b4729e5 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -542,11 +542,8 @@ int ubifs_read_superblock(struct ubifs_info *c)
 	 * due to the unavailability of time-travelling equipment.
 	 */
 	if (c->fmt_version > UBIFS_FORMAT_VERSION) {
-		struct super_block *sb = c->vfs_sb;
-		int mounting_ro = sb->s_flags & MS_RDONLY;
-
-		ubifs_assert(!c->ro_media || mounting_ro);
-		if (!mounting_ro ||
+		ubifs_assert(!c->ro_media || c->ro_mount);
+		if (!c->ro_mount ||
 		    c->ro_compat_version > UBIFS_RO_COMPAT_VERSION) {
 			ubifs_err("on-flash format version is w%d/r%d, but "
 				  "software only supports up to version "
@@ -624,7 +621,7 @@ int ubifs_read_superblock(struct ubifs_info *c)
 	c->old_leb_cnt = c->leb_cnt;
 	if (c->leb_cnt < c->vi.size && c->leb_cnt < c->max_leb_cnt) {
 		c->leb_cnt = min_t(int, c->max_leb_cnt, c->vi.size);
-		if (c->vfs_sb->s_flags & MS_RDONLY)
+		if (c->ro_mount)
 			dbg_mnt("Auto resizing (ro) from %d LEBs to %d LEBs",
 				c->old_leb_cnt,	c->leb_cnt);
 		else {
diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c
index 96c52538419..3e1ee57dbea 100644
--- a/fs/ubifs/scan.c
+++ b/fs/ubifs/scan.c
@@ -197,7 +197,7 @@ int ubifs_add_snod(const struct ubifs_info *c, struct ubifs_scan_leb *sleb,
 	struct ubifs_ino_node *ino = buf;
 	struct ubifs_scan_node *snod;
 
-	snod = kzalloc(sizeof(struct ubifs_scan_node), GFP_NOFS);
+	snod = kmalloc(sizeof(struct ubifs_scan_node), GFP_NOFS);
 	if (!snod)
 		return -ENOMEM;
 
@@ -212,13 +212,15 @@ int ubifs_add_snod(const struct ubifs_info *c, struct ubifs_scan_leb *sleb,
 	case UBIFS_DENT_NODE:
 	case UBIFS_XENT_NODE:
 	case UBIFS_DATA_NODE:
-	case UBIFS_TRUN_NODE:
 		/*
 		 * The key is in the same place in all keyed
 		 * nodes.
 		 */
 		key_read(c, &ino->key, &snod->key);
 		break;
+	default:
+		invalid_key_init(c, &snod->key);
+		break;
 	}
 	list_add_tail(&snod->list, &sleb->nodes);
 	sleb->nodes_cnt += 1;
diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c
index 0b201114a5a..46961c00323 100644
--- a/fs/ubifs/shrinker.c
+++ b/fs/ubifs/shrinker.c
@@ -250,7 +250,7 @@ static int kick_a_thread(void)
 			dirty_zn_cnt = atomic_long_read(&c->dirty_zn_cnt);
 
 			if (!dirty_zn_cnt || c->cmt_state == COMMIT_BROKEN ||
-			    c->ro_media) {
+			    c->ro_mount || c->ro_error) {
 				mutex_unlock(&c->umount_mutex);
 				continue;
 			}
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 4d2f2157dd3..91fac54c70e 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -327,7 +327,7 @@ static int ubifs_write_inode(struct inode *inode, struct writeback_control *wbc)
 	return err;
 }
 
-static void ubifs_delete_inode(struct inode *inode)
+static void ubifs_evict_inode(struct inode *inode)
 {
 	int err;
 	struct ubifs_info *c = inode->i_sb->s_fs_info;
@@ -343,9 +343,12 @@ static void ubifs_delete_inode(struct inode *inode)
 
 	dbg_gen("inode %lu, mode %#x", inode->i_ino, (int)inode->i_mode);
 	ubifs_assert(!atomic_read(&inode->i_count));
-	ubifs_assert(inode->i_nlink == 0);
 
 	truncate_inode_pages(&inode->i_data, 0);
+
+	if (inode->i_nlink)
+		goto done;
+
 	if (is_bad_inode(inode))
 		goto out;
 
@@ -367,7 +370,8 @@ out:
 		c->nospace = c->nospace_rp = 0;
 		smp_wmb();
 	}
-	clear_inode(inode);
+done:
+	end_writeback(inode);
 }
 
 static void ubifs_dirty_inode(struct inode *inode)
@@ -1133,11 +1137,11 @@ static int check_free_space(struct ubifs_info *c)
  */
 static int mount_ubifs(struct ubifs_info *c)
 {
-	struct super_block *sb = c->vfs_sb;
-	int err, mounted_read_only = (sb->s_flags & MS_RDONLY);
+	int err;
 	long long x;
 	size_t sz;
 
+	c->ro_mount = !!(c->vfs_sb->s_flags & MS_RDONLY);
 	err = init_constants_early(c);
 	if (err)
 		return err;
@@ -1150,7 +1154,7 @@ static int mount_ubifs(struct ubifs_info *c)
 	if (err)
 		goto out_free;
 
-	if (c->empty && (mounted_read_only || c->ro_media)) {
+	if (c->empty && (c->ro_mount || c->ro_media)) {
 		/*
 		 * This UBI volume is empty, and read-only, or the file system
 		 * is mounted read-only - we cannot format it.
@@ -1161,7 +1165,7 @@ static int mount_ubifs(struct ubifs_info *c)
 		goto out_free;
 	}
 
-	if (c->ro_media && !mounted_read_only) {
+	if (c->ro_media && !c->ro_mount) {
 		ubifs_err("cannot mount read-write - read-only media");
 		err = -EROFS;
 		goto out_free;
@@ -1181,7 +1185,7 @@ static int mount_ubifs(struct ubifs_info *c)
 	if (!c->sbuf)
 		goto out_free;
 
-	if (!mounted_read_only) {
+	if (!c->ro_mount) {
 		c->ileb_buf = vmalloc(c->leb_size);
 		if (!c->ileb_buf)
 			goto out_free;
@@ -1224,7 +1228,7 @@ static int mount_ubifs(struct ubifs_info *c)
 	}
 
 	sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num, c->vi.vol_id);
-	if (!mounted_read_only) {
+	if (!c->ro_mount) {
 		err = alloc_wbufs(c);
 		if (err)
 			goto out_cbuf;
@@ -1250,12 +1254,12 @@ static int mount_ubifs(struct ubifs_info *c)
 	if ((c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY)) != 0) {
 		ubifs_msg("recovery needed");
 		c->need_recovery = 1;
-		if (!mounted_read_only) {
+		if (!c->ro_mount) {
 			err = ubifs_recover_inl_heads(c, c->sbuf);
 			if (err)
 				goto out_master;
 		}
-	} else if (!mounted_read_only) {
+	} else if (!c->ro_mount) {
 		/*
 		 * Set the "dirty" flag so that if we reboot uncleanly we
 		 * will notice this immediately on the next mount.
@@ -1266,7 +1270,7 @@ static int mount_ubifs(struct ubifs_info *c)
 			goto out_master;
 	}
 
-	err = ubifs_lpt_init(c, 1, !mounted_read_only);
+	err = ubifs_lpt_init(c, 1, !c->ro_mount);
 	if (err)
 		goto out_lpt;
 
@@ -1281,11 +1285,11 @@ static int mount_ubifs(struct ubifs_info *c)
 	/* Calculate 'min_idx_lebs' after journal replay */
 	c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
 
-	err = ubifs_mount_orphans(c, c->need_recovery, mounted_read_only);
+	err = ubifs_mount_orphans(c, c->need_recovery, c->ro_mount);
 	if (err)
 		goto out_orphans;
 
-	if (!mounted_read_only) {
+	if (!c->ro_mount) {
 		int lnum;
 
 		err = check_free_space(c);
@@ -1307,6 +1311,8 @@ static int mount_ubifs(struct ubifs_info *c)
 			if (err)
 				goto out_orphans;
 			err = ubifs_rcvry_gc_commit(c);
+			if (err)
+				goto out_orphans;
 		} else {
 			err = take_gc_lnum(c);
 			if (err)
@@ -1318,7 +1324,7 @@ static int mount_ubifs(struct ubifs_info *c)
 			 */
 			err = ubifs_leb_unmap(c, c->gc_lnum);
 			if (err)
-				return err;
+				goto out_orphans;
 		}
 
 		err = dbg_check_lprops(c);
@@ -1345,7 +1351,7 @@ static int mount_ubifs(struct ubifs_info *c)
 	spin_unlock(&ubifs_infos_lock);
 
 	if (c->need_recovery) {
-		if (mounted_read_only)
+		if (c->ro_mount)
 			ubifs_msg("recovery deferred");
 		else {
 			c->need_recovery = 0;
@@ -1372,7 +1378,7 @@ static int mount_ubifs(struct ubifs_info *c)
 
 	ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"",
 		  c->vi.ubi_num, c->vi.vol_id, c->vi.name);
-	if (mounted_read_only)
+	if (c->ro_mount)
 		ubifs_msg("mounted read-only");
 	x = (long long)c->main_lebs * c->leb_size;
 	ubifs_msg("file system size:   %lld bytes (%lld KiB, %lld MiB, %d "
@@ -1634,7 +1640,7 @@ static int ubifs_remount_rw(struct ubifs_info *c)
 	}
 
 	dbg_gen("re-mounted read-write");
-	c->vfs_sb->s_flags &= ~MS_RDONLY;
+	c->ro_mount = 0;
 	c->remounting_rw = 0;
 	c->always_chk_crc = 0;
 	err = dbg_check_space_info(c);
@@ -1670,7 +1676,7 @@ static void ubifs_remount_ro(struct ubifs_info *c)
 	int i, err;
 
 	ubifs_assert(!c->need_recovery);
-	ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY));
+	ubifs_assert(!c->ro_mount);
 
 	mutex_lock(&c->umount_mutex);
 	if (c->bgt) {
@@ -1680,10 +1686,8 @@ static void ubifs_remount_ro(struct ubifs_info *c)
 
 	dbg_save_space_info(c);
 
-	for (i = 0; i < c->jhead_cnt; i++) {
+	for (i = 0; i < c->jhead_cnt; i++)
 		ubifs_wbuf_sync(&c->jheads[i].wbuf);
-		hrtimer_cancel(&c->jheads[i].wbuf.timer);
-	}
 
 	c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY);
 	c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS);
@@ -1698,6 +1702,7 @@ static void ubifs_remount_ro(struct ubifs_info *c)
 	vfree(c->ileb_buf);
 	c->ileb_buf = NULL;
 	ubifs_lpt_free(c, 1);
+	c->ro_mount = 1;
 	err = dbg_check_space_info(c);
 	if (err)
 		ubifs_ro_mode(c, err);
@@ -1729,7 +1734,7 @@ static void ubifs_put_super(struct super_block *sb)
 	 * the mutex is locked.
 	 */
 	mutex_lock(&c->umount_mutex);
-	if (!(c->vfs_sb->s_flags & MS_RDONLY)) {
+	if (!c->ro_mount) {
 		/*
 		 * First of all kill the background thread to make sure it does
 		 * not interfere with un-mounting and freeing resources.
@@ -1739,23 +1744,22 @@ static void ubifs_put_super(struct super_block *sb)
 			c->bgt = NULL;
 		}
 
-		/* Synchronize write-buffers */
-		if (c->jheads)
-			for (i = 0; i < c->jhead_cnt; i++)
-				ubifs_wbuf_sync(&c->jheads[i].wbuf);
-
 		/*
-		 * On fatal errors c->ro_media is set to 1, in which case we do
+		 * On fatal errors c->ro_error is set to 1, in which case we do
 		 * not write the master node.
 		 */
-		if (!c->ro_media) {
+		if (!c->ro_error) {
+			int err;
+
+			/* Synchronize write-buffers */
+			for (i = 0; i < c->jhead_cnt; i++)
+				ubifs_wbuf_sync(&c->jheads[i].wbuf);
+
 			/*
 			 * We are being cleanly unmounted which means the
 			 * orphans were killed - indicate this in the master
 			 * node. Also save the reserved GC LEB number.
 			 */
-			int err;
-
 			c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY);
 			c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS);
 			c->mst_node->gc_lnum = cpu_to_le32(c->gc_lnum);
@@ -1768,6 +1772,10 @@ static void ubifs_put_super(struct super_block *sb)
 				 */
 				ubifs_err("failed to write master node, "
 					  "error %d", err);
+		} else {
+			for (i = 0; i < c->jhead_cnt; i++)
+				/* Make sure write-buffer timers are canceled */
+				hrtimer_cancel(&c->jheads[i].wbuf.timer);
 		}
 	}
 
@@ -1791,17 +1799,21 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
 		return err;
 	}
 
-	if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) {
+	if (c->ro_mount && !(*flags & MS_RDONLY)) {
+		if (c->ro_error) {
+			ubifs_msg("cannot re-mount R/W due to prior errors");
+			return -EROFS;
+		}
 		if (c->ro_media) {
-			ubifs_msg("cannot re-mount due to prior errors");
+			ubifs_msg("cannot re-mount R/W - UBI volume is R/O");
 			return -EROFS;
 		}
 		err = ubifs_remount_rw(c);
 		if (err)
 			return err;
-	} else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) {
-		if (c->ro_media) {
-			ubifs_msg("cannot re-mount due to prior errors");
+	} else if (!c->ro_mount && (*flags & MS_RDONLY)) {
+		if (c->ro_error) {
+			ubifs_msg("cannot re-mount R/O due to prior errors");
 			return -EROFS;
 		}
 		ubifs_remount_ro(c);
@@ -1824,7 +1836,7 @@ const struct super_operations ubifs_super_operations = {
 	.destroy_inode = ubifs_destroy_inode,
 	.put_super     = ubifs_put_super,
 	.write_inode   = ubifs_write_inode,
-	.delete_inode  = ubifs_delete_inode,
+	.evict_inode   = ubifs_evict_inode,
 	.statfs        = ubifs_statfs,
 	.dirty_inode   = ubifs_dirty_inode,
 	.remount_fs    = ubifs_remount_fs,
@@ -2026,8 +2038,8 @@ static int sb_test(struct super_block *sb, void *data)
 	return c->vi.cdev == *dev;
 }
 
-static int ubifs_get_sb(struct file_system_type *fs_type, int flags,
-			const char *name, void *data, struct vfsmount *mnt)
+static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags,
+			const char *name, void *data)
 {
 	struct ubi_volume_desc *ubi;
 	struct ubi_volume_info vi;
@@ -2043,9 +2055,9 @@ static int ubifs_get_sb(struct file_system_type *fs_type, int flags,
 	 */
 	ubi = open_ubi(name, UBI_READONLY);
 	if (IS_ERR(ubi)) {
-		ubifs_err("cannot open \"%s\", error %d",
-			  name, (int)PTR_ERR(ubi));
-		return PTR_ERR(ubi);
+		dbg_err("cannot open \"%s\", error %d",
+			name, (int)PTR_ERR(ubi));
+		return ERR_CAST(ubi);
 	}
 	ubi_get_volume_info(ubi, &vi);
 
@@ -2058,9 +2070,11 @@ static int ubifs_get_sb(struct file_system_type *fs_type, int flags,
 	}
 
 	if (sb->s_root) {
+		struct ubifs_info *c1 = sb->s_fs_info;
+
 		/* A new mount point for already mounted UBIFS */
 		dbg_gen("this ubi volume is already mounted");
-		if ((flags ^ sb->s_flags) & MS_RDONLY) {
+		if (!!(flags & MS_RDONLY) != c1->ro_mount) {
 			err = -EBUSY;
 			goto out_deact;
 		}
@@ -2081,20 +2095,19 @@ static int ubifs_get_sb(struct file_system_type *fs_type, int flags,
 	/* 'fill_super()' opens ubi again so we must close it here */
 	ubi_close_volume(ubi);
 
-	simple_set_mnt(mnt, sb);
-	return 0;
+	return dget(sb->s_root);
 
 out_deact:
 	deactivate_locked_super(sb);
 out_close:
 	ubi_close_volume(ubi);
-	return err;
+	return ERR_PTR(err);
 }
 
 static struct file_system_type ubifs_fs_type = {
 	.name    = "ubifs",
 	.owner   = THIS_MODULE,
-	.get_sb  = ubifs_get_sb,
+	.mount   = ubifs_mount,
 	.kill_sb = kill_anon_super,
 };
 
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index 2194915220e..ad9cf013362 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -1177,6 +1177,7 @@ int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key,
 	unsigned long time = get_seconds();
 
 	dbg_tnc("search key %s", DBGKEY(key));
+	ubifs_assert(key_type(c, key) < UBIFS_INVALID_KEY);
 
 	znode = c->zroot.znode;
 	if (unlikely(!znode)) {
@@ -2966,7 +2967,7 @@ static struct ubifs_znode *right_znode(struct ubifs_info *c,
  *
  * This function searches an indexing node by its first key @key and its
  * address @lnum:@offs. It looks up the indexing tree by pulling all indexing
- * nodes it traverses to TNC. This function is called fro indexing nodes which
+ * nodes it traverses to TNC. This function is called for indexing nodes which
  * were found on the media by scanning, for example when garbage-collecting or
  * when doing in-the-gaps commit. This means that the indexing node which is
  * looked for does not have to have exactly the same leftmost key @key, because
@@ -2988,6 +2989,8 @@ static struct ubifs_znode *lookup_znode(struct ubifs_info *c,
 	struct ubifs_znode *znode, *zn;
 	int n, nn;
 
+	ubifs_assert(key_type(c, key) < UBIFS_INVALID_KEY);
+
 	/*
 	 * The arguments have probably been read off flash, so don't assume
 	 * they are valid.
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 04310878f44..381d6b207a5 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -119,8 +119,12 @@
  * in TNC. However, when replaying, it is handy to introduce fake "truncation"
  * keys for truncation nodes because the code becomes simpler. So we define
  * %UBIFS_TRUN_KEY type.
+ *
+ * But otherwise, out of the journal reply scope, the truncation keys are
+ * invalid.
  */
-#define UBIFS_TRUN_KEY UBIFS_KEY_TYPES_CNT
+#define UBIFS_TRUN_KEY    UBIFS_KEY_TYPES_CNT
+#define UBIFS_INVALID_KEY UBIFS_KEY_TYPES_CNT
 
 /*
  * How much a directory entry/extended attribute entry adds to the parent/host
@@ -379,7 +383,7 @@ struct ubifs_gced_idx_leb {
  * The @ui_size is a "shadow" variable for @inode->i_size and UBIFS uses
  * @ui_size instead of @inode->i_size. The reason for this is that UBIFS cannot
  * make sure @inode->i_size is always changed under @ui_mutex, because it
- * cannot call 'simple_setsize()' with @ui_mutex locked, because it would deadlock
+ * cannot call 'truncate_setsize()' with @ui_mutex locked, because it would deadlock
  * with 'ubifs_writepage()' (see file.c). All the other inode fields are
  * changed under @ui_mutex, so they do not need "shadow" fields. Note, one
  * could consider to rework locking and base it on "shadow" fields.
@@ -1028,6 +1032,8 @@ struct ubifs_debug_info;
  * @max_leb_cnt: maximum count of logical eraseblocks
  * @old_leb_cnt: count of logical eraseblocks before re-size
  * @ro_media: the underlying UBI volume is read-only
+ * @ro_mount: the file-system was mounted as read-only
+ * @ro_error: UBIFS switched to R/O mode because an error happened
  *
  * @dirty_pg_cnt: number of dirty pages (not used)
  * @dirty_zn_cnt: number of dirty znodes
@@ -1168,11 +1174,14 @@ struct ubifs_debug_info;
  * @replay_sqnum: sequence number of node currently being replayed
  * @need_recovery: file-system needs recovery
  * @replaying: set to %1 during journal replay
- * @unclean_leb_list: LEBs to recover when mounting ro to rw
- * @rcvrd_mst_node: recovered master node to write when mounting ro to rw
+ * @unclean_leb_list: LEBs to recover when re-mounting R/O mounted FS to R/W
+ *                    mode
+ * @rcvrd_mst_node: recovered master node to write when re-mounting R/O mounted
+ *                  FS to R/W mode
  * @size_tree: inode size information for recovery
- * @remounting_rw: set while remounting from ro to rw (sb flags have MS_RDONLY)
- * @always_chk_crc: always check CRCs (while mounting and remounting rw)
+ * @remounting_rw: set while re-mounting from R/O mode to R/W mode
+ * @always_chk_crc: always check CRCs (while mounting and remounting to R/W
+ *                  mode)
  * @mount_opts: UBIFS-specific mount options
  *
  * @dbg: debugging-related information
@@ -1268,7 +1277,9 @@ struct ubifs_info {
 	int leb_cnt;
 	int max_leb_cnt;
 	int old_leb_cnt;
-	int ro_media;
+	unsigned int ro_media:1;
+	unsigned int ro_mount:1;
+	unsigned int ro_error:1;
 
 	atomic_long_t dirty_pg_cnt;
 	atomic_long_t dirty_zn_cnt;
diff --git a/fs/udf/Kconfig b/fs/udf/Kconfig
index 0e0e99bd6bc..f8def3c8ea4 100644
--- a/fs/udf/Kconfig
+++ b/fs/udf/Kconfig
@@ -1,5 +1,6 @@
 config UDF_FS
 	tristate "UDF file system support"
+	depends on BKL # needs serious work to remove
 	select CRC_ITU_T
 	help
 	  This is the new file system used on some CD-ROMs and DVDs. Say Y if
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 94e06d6bddb..66b9e7e7e4c 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -36,7 +36,6 @@
 #include <linux/pagemap.h>
 #include <linux/buffer_head.h>
 #include <linux/aio.h>
-#include <linux/smp_lock.h>
 
 #include "udf_i.h"
 #include "udf_sb.h"
@@ -228,6 +227,28 @@ const struct file_operations udf_file_operations = {
 	.llseek			= generic_file_llseek,
 };
 
+static int udf_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = dentry->d_inode;
+	int error;
+
+	error = inode_change_ok(inode, attr);
+	if (error)
+		return error;
+
+	if ((attr->ia_valid & ATTR_SIZE) &&
+	    attr->ia_size != i_size_read(inode)) {
+		error = vmtruncate(inode, attr->ia_size);
+		if (error)
+			return error;
+	}
+
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+	return 0;
+}
+
 const struct inode_operations udf_file_inode_operations = {
+	.setattr		= udf_setattr,
 	.truncate		= udf_truncate,
 };
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index 18cd7111185..75d9304d0dc 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -31,8 +31,6 @@ void udf_free_inode(struct inode *inode)
 	struct super_block *sb = inode->i_sb;
 	struct udf_sb_info *sbi = UDF_SB(sb);
 
-	clear_inode(inode);
-
 	mutex_lock(&sbi->s_alloc_mutex);
 	if (sbi->s_lvid_bh) {
 		struct logicalVolIntegrityDescImpUse *lvidiu =
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 124852bcf6f..fc48f37aa2d 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -68,37 +68,23 @@ static void udf_update_extents(struct inode *,
 static int udf_get_block(struct inode *, sector_t, struct buffer_head *, int);
 
 
-void udf_delete_inode(struct inode *inode)
-{
-	truncate_inode_pages(&inode->i_data, 0);
-
-	if (is_bad_inode(inode))
-		goto no_delete;
-
-	inode->i_size = 0;
-	udf_truncate(inode);
-	lock_kernel();
-
-	udf_update_inode(inode, IS_SYNC(inode));
-	udf_free_inode(inode);
-
-	unlock_kernel();
-	return;
-
-no_delete:
-	clear_inode(inode);
-}
-
-/*
- * If we are going to release inode from memory, we truncate last inode extent
- * to proper length. We could use drop_inode() but it's called under inode_lock
- * and thus we cannot mark inode dirty there.  We use clear_inode() but we have
- * to make sure to write inode as it's not written automatically.
- */
-void udf_clear_inode(struct inode *inode)
+void udf_evict_inode(struct inode *inode)
 {
 	struct udf_inode_info *iinfo = UDF_I(inode);
+	int want_delete = 0;
+
+	truncate_inode_pages(&inode->i_data, 0);
 
+	if (!inode->i_nlink && !is_bad_inode(inode)) {
+		want_delete = 1;
+		inode->i_size = 0;
+		udf_truncate(inode);
+		lock_kernel();
+		udf_update_inode(inode, IS_SYNC(inode));
+		unlock_kernel();
+	}
+	invalidate_inode_buffers(inode);
+	end_writeback(inode);
 	if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB &&
 	    inode->i_size != iinfo->i_lenExtents) {
 		printk(KERN_WARNING "UDF-fs (%s): Inode %lu (mode %o) has "
@@ -108,9 +94,13 @@ void udf_clear_inode(struct inode *inode)
 			(unsigned long long)inode->i_size,
 			(unsigned long long)iinfo->i_lenExtents);
 	}
-
 	kfree(iinfo->i_ext.i_data);
 	iinfo->i_ext.i_data = NULL;
+	if (want_delete) {
+		lock_kernel();
+		udf_free_inode(inode);
+		unlock_kernel();
+	}
 }
 
 static int udf_writepage(struct page *page, struct writeback_control *wbc)
@@ -127,9 +117,16 @@ static int udf_write_begin(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
 			struct page **pagep, void **fsdata)
 {
-	*pagep = NULL;
-	return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
-				udf_get_block);
+	int ret;
+
+	ret = block_write_begin(mapping, pos, len, flags, pagep, udf_get_block);
+	if (unlikely(ret)) {
+		loff_t isize = mapping->host->i_size;
+		if (pos + len > isize)
+			vmtruncate(mapping->host, isize);
+	}
+
+	return ret;
 }
 
 static sector_t udf_bmap(struct address_space *mapping, sector_t block)
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index bf5fc674193..6d8dc02baeb 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -1101,7 +1101,7 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir,
 	inc_nlink(inode);
 	inode->i_ctime = current_fs_time(inode->i_sb);
 	mark_inode_dirty(inode);
-	atomic_inc(&inode->i_count);
+	ihold(inode);
 	d_instantiate(dentry, inode);
 	unlock_kernel();
 
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 612d1e2e285..4a5c7c61836 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -107,17 +107,16 @@ struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct udf_sb_info *sbi)
 }
 
 /* UDF filesystem type */
-static int udf_get_sb(struct file_system_type *fs_type,
-		      int flags, const char *dev_name, void *data,
-		      struct vfsmount *mnt)
+static struct dentry *udf_mount(struct file_system_type *fs_type,
+		      int flags, const char *dev_name, void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, udf_fill_super, mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, udf_fill_super);
 }
 
 static struct file_system_type udf_fstype = {
 	.owner		= THIS_MODULE,
 	.name		= "udf",
-	.get_sb		= udf_get_sb,
+	.mount		= udf_mount,
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
@@ -175,8 +174,7 @@ static const struct super_operations udf_sb_ops = {
 	.alloc_inode	= udf_alloc_inode,
 	.destroy_inode	= udf_destroy_inode,
 	.write_inode	= udf_write_inode,
-	.delete_inode	= udf_delete_inode,
-	.clear_inode	= udf_clear_inode,
+	.evict_inode	= udf_evict_inode,
 	.put_super	= udf_put_super,
 	.sync_fs	= udf_sync_fs,
 	.statfs		= udf_statfs,
@@ -1579,9 +1577,7 @@ static int udf_load_sequence(struct super_block *sb, struct buffer_head *bh,
 {
 	struct anchorVolDescPtr *anchor;
 	long main_s, main_e, reserve_s, reserve_e;
-	struct udf_sb_info *sbi;
 
-	sbi = UDF_SB(sb);
 	anchor = (struct anchorVolDescPtr *)bh->b_data;
 
 	/* Locate the main sequence */
@@ -1883,6 +1879,8 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
 	struct kernel_lb_addr rootdir, fileset;
 	struct udf_sb_info *sbi;
 
+	lock_kernel();
+
 	uopt.flags = (1 << UDF_FLAG_USE_AD_IN_ICB) | (1 << UDF_FLAG_STRICT);
 	uopt.uid = -1;
 	uopt.gid = -1;
@@ -1891,8 +1889,10 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
 	uopt.dmode = UDF_INVALID_MODE;
 
 	sbi = kzalloc(sizeof(struct udf_sb_info), GFP_KERNEL);
-	if (!sbi)
+	if (!sbi) {
+		unlock_kernel();
 		return -ENOMEM;
+	}
 
 	sb->s_fs_info = sbi;
 
@@ -2038,6 +2038,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
 		goto error_out;
 	}
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
+	unlock_kernel();
 	return 0;
 
 error_out:
@@ -2058,6 +2059,7 @@ error_out:
 	kfree(sbi);
 	sb->s_fs_info = NULL;
 
+	unlock_kernel();
 	return -EINVAL;
 }
 
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 2bac0354891..6995ab1f430 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -139,8 +139,7 @@ extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *);
 extern struct buffer_head *udf_bread(struct inode *, int, int, int *);
 extern void udf_truncate(struct inode *);
 extern void udf_read_inode(struct inode *);
-extern void udf_delete_inode(struct inode *);
-extern void udf_clear_inode(struct inode *);
+extern void udf_evict_inode(struct inode *);
 extern int udf_write_inode(struct inode *, struct writeback_control *wbc);
 extern long udf_block_map(struct inode *, sector_t);
 extern int udf_extend_file(struct inode *, struct extent_position *,
diff --git a/fs/ufs/Kconfig b/fs/ufs/Kconfig
index e4f10a40768..30c8f223253 100644
--- a/fs/ufs/Kconfig
+++ b/fs/ufs/Kconfig
@@ -1,6 +1,7 @@
 config UFS_FS
 	tristate "UFS file system support (read only)"
 	depends on BLOCK
+	depends on BKL # probably fixable
 	help
 	  BSD and derivate versions of Unix (such as SunOS, FreeBSD, NetBSD,
 	  OpenBSD and NeXTstep) use a file system called UFS. Some System V
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index 048484fb10d..46f7a807bbc 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -114,10 +114,8 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count)
 	
 	ubh_mark_buffer_dirty (USPI_UBH(uspi));
 	ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
-	if (sb->s_flags & MS_SYNCHRONOUS) {
-		ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
-		ubh_wait_on_buffer (UCPI_UBH(ucpi));
-	}
+	if (sb->s_flags & MS_SYNCHRONOUS)
+		ubh_sync_block(UCPI_UBH(ucpi));
 	sb->s_dirt = 1;
 	
 	unlock_super (sb);
@@ -207,10 +205,8 @@ do_more:
 
 	ubh_mark_buffer_dirty (USPI_UBH(uspi));
 	ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
-	if (sb->s_flags & MS_SYNCHRONOUS) {
-		ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
-		ubh_wait_on_buffer (UCPI_UBH(ucpi));
-	}
+	if (sb->s_flags & MS_SYNCHRONOUS)
+		ubh_sync_block(UCPI_UBH(ucpi));
 
 	if (overflow) {
 		fragment += count;
@@ -558,10 +554,8 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment,
 	
 	ubh_mark_buffer_dirty (USPI_UBH(uspi));
 	ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
-	if (sb->s_flags & MS_SYNCHRONOUS) {
-		ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
-		ubh_wait_on_buffer (UCPI_UBH(ucpi));
-	}
+	if (sb->s_flags & MS_SYNCHRONOUS)
+		ubh_sync_block(UCPI_UBH(ucpi));
 	sb->s_dirt = 1;
 
 	UFSD("EXIT, fragment %llu\n", (unsigned long long)fragment);
@@ -680,10 +674,8 @@ cg_found:
 succed:
 	ubh_mark_buffer_dirty (USPI_UBH(uspi));
 	ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
-	if (sb->s_flags & MS_SYNCHRONOUS) {
-		ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
-		ubh_wait_on_buffer (UCPI_UBH(ucpi));
-	}
+	if (sb->s_flags & MS_SYNCHRONOUS)
+		ubh_sync_block(UCPI_UBH(ucpi));
 	sb->s_dirt = 1;
 
 	result += cgno * uspi->s_fpg;
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index ec784756dc6..dbc90994715 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -95,8 +95,7 @@ void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de,
 	int err;
 
 	lock_page(page);
-	err = __ufs_write_begin(NULL, page->mapping, pos, len,
-				AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
+	err = ufs_prepare_chunk(page, pos, len);
 	BUG_ON(err);
 
 	de->d_ino = cpu_to_fs32(dir->i_sb, inode->i_ino);
@@ -381,8 +380,7 @@ int ufs_add_link(struct dentry *dentry, struct inode *inode)
 got_it:
 	pos = page_offset(page) +
 			(char*)de - (char*)page_address(page);
-	err = __ufs_write_begin(NULL, page->mapping, pos, rec_len,
-				AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
+	err = ufs_prepare_chunk(page, pos, rec_len);
 	if (err)
 		goto out_unlock;
 	if (de->d_ino) {
@@ -518,7 +516,6 @@ int ufs_delete_entry(struct inode *inode, struct ufs_dir_entry *dir,
 		     struct page * page)
 {
 	struct super_block *sb = inode->i_sb;
-	struct address_space *mapping = page->mapping;
 	char *kaddr = page_address(page);
 	unsigned from = ((char*)dir - kaddr) & ~(UFS_SB(sb)->s_uspi->s_dirblksize - 1);
 	unsigned to = ((char*)dir - kaddr) + fs16_to_cpu(sb, dir->d_reclen);
@@ -549,8 +546,7 @@ int ufs_delete_entry(struct inode *inode, struct ufs_dir_entry *dir,
 
 	pos = page_offset(page) + from;
 	lock_page(page);
-	err = __ufs_write_begin(NULL, mapping, pos, to - from,
-				AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
+	err = ufs_prepare_chunk(page, pos, to - from);
 	BUG_ON(err);
 	if (pde)
 		pde->d_reclen = cpu_to_fs16(sb, to - from);
@@ -577,8 +573,7 @@ int ufs_make_empty(struct inode * inode, struct inode *dir)
 	if (!page)
 		return -ENOMEM;
 
-	err = __ufs_write_begin(NULL, mapping, 0, chunk_size,
-				AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
+	err = ufs_prepare_chunk(page, 0, chunk_size);
 	if (err) {
 		unlock_page(page);
 		goto fail;
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index 594480e537d..2eabf04af3d 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -94,8 +94,6 @@ void ufs_free_inode (struct inode * inode)
 
 	is_directory = S_ISDIR(inode->i_mode);
 
-	clear_inode (inode);
-
 	if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit))
 		ufs_error(sb, "ufs_free_inode", "bit already cleared for inode %u", ino);
 	else {
@@ -115,10 +113,8 @@ void ufs_free_inode (struct inode * inode)
 
 	ubh_mark_buffer_dirty (USPI_UBH(uspi));
 	ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
-	if (sb->s_flags & MS_SYNCHRONOUS) {
-		ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
-		ubh_wait_on_buffer (UCPI_UBH(ucpi));
-	}
+	if (sb->s_flags & MS_SYNCHRONOUS)
+		ubh_sync_block(UCPI_UBH(ucpi));
 	
 	sb->s_dirt = 1;
 	unlock_super (sb);
@@ -158,10 +154,8 @@ static void ufs2_init_inodes_chunk(struct super_block *sb,
 
 	fs32_add(sb, &ucg->cg_u.cg_u2.cg_initediblk, uspi->s_inopb);
 	ubh_mark_buffer_dirty(UCPI_UBH(ucpi));
-	if (sb->s_flags & MS_SYNCHRONOUS) {
-		ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
-		ubh_wait_on_buffer(UCPI_UBH(ucpi));
-	}
+	if (sb->s_flags & MS_SYNCHRONOUS)
+		ubh_sync_block(UCPI_UBH(ucpi));
 
 	UFSD("EXIT\n");
 }
@@ -292,10 +286,8 @@ cg_found:
 	}
 	ubh_mark_buffer_dirty (USPI_UBH(uspi));
 	ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
-	if (sb->s_flags & MS_SYNCHRONOUS) {
-		ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
-		ubh_wait_on_buffer (UCPI_UBH(ucpi));
-	}
+	if (sb->s_flags & MS_SYNCHRONOUS)
+		ubh_sync_block(UCPI_UBH(ucpi));
 	sb->s_dirt = 1;
 
 	inode->i_ino = cg * uspi->s_ipg + bit;
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 73fe773aa03..2b251f2093a 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -558,20 +558,26 @@ static int ufs_readpage(struct file *file, struct page *page)
 	return block_read_full_page(page,ufs_getfrag_block);
 }
 
-int __ufs_write_begin(struct file *file, struct address_space *mapping,
-			loff_t pos, unsigned len, unsigned flags,
-			struct page **pagep, void **fsdata)
+int ufs_prepare_chunk(struct page *page, loff_t pos, unsigned len)
 {
-	return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
-				ufs_getfrag_block);
+	return __block_write_begin(page, pos, len, ufs_getfrag_block);
 }
 
 static int ufs_write_begin(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
 			struct page **pagep, void **fsdata)
 {
-	*pagep = NULL;
-	return __ufs_write_begin(file, mapping, pos, len, flags, pagep, fsdata);
+	int ret;
+
+	ret = block_write_begin(mapping, pos, len, flags, pagep,
+				ufs_getfrag_block);
+	if (unlikely(ret)) {
+		loff_t isize = mapping->host->i_size;
+		if (pos + len > isize)
+			vmtruncate(mapping->host, isize);
+	}
+
+	return ret;
 }
 
 static sector_t ufs_bmap(struct address_space *mapping, sector_t block)
@@ -905,24 +911,33 @@ int ufs_sync_inode (struct inode *inode)
 	return ufs_update_inode (inode, 1);
 }
 
-void ufs_delete_inode (struct inode * inode)
+void ufs_evict_inode(struct inode * inode)
 {
-	loff_t old_i_size;
+	int want_delete = 0;
+
+	if (!inode->i_nlink && !is_bad_inode(inode))
+		want_delete = 1;
 
 	truncate_inode_pages(&inode->i_data, 0);
-	if (is_bad_inode(inode))
-		goto no_delete;
-	/*UFS_I(inode)->i_dtime = CURRENT_TIME;*/
-	lock_kernel();
-	mark_inode_dirty(inode);
-	ufs_update_inode(inode, IS_SYNC(inode));
-	old_i_size = inode->i_size;
-	inode->i_size = 0;
-	if (inode->i_blocks && ufs_truncate(inode, old_i_size))
-		ufs_warning(inode->i_sb, __func__, "ufs_truncate failed\n");
-	ufs_free_inode (inode);
-	unlock_kernel();
-	return;
-no_delete:
-	clear_inode(inode);	/* We must guarantee clearing of inode... */
+	if (want_delete) {
+		loff_t old_i_size;
+		/*UFS_I(inode)->i_dtime = CURRENT_TIME;*/
+		lock_kernel();
+		mark_inode_dirty(inode);
+		ufs_update_inode(inode, IS_SYNC(inode));
+		old_i_size = inode->i_size;
+		inode->i_size = 0;
+		if (inode->i_blocks && ufs_truncate(inode, old_i_size))
+			ufs_warning(inode->i_sb, __func__, "ufs_truncate failed\n");
+		unlock_kernel();
+	}
+
+	invalidate_inode_buffers(inode);
+	end_writeback(inode);
+
+	if (want_delete) {
+		lock_kernel();
+		ufs_free_inode (inode);
+		unlock_kernel();
+	}
 }
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index b056f02b1fb..12f39b9e443 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -180,7 +180,7 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir,
 
 	inode->i_ctime = CURRENT_TIME_SEC;
 	inode_inc_link_count(inode);
-	atomic_inc(&inode->i_count);
+	ihold(inode);
 
 	error = ufs_add_nondir(dentry, inode);
 	unlock_kernel();
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 3ec5a9eb6ef..2c47daed56d 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -696,6 +696,8 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
 	unsigned maxsymlen;
 	int ret = -EINVAL;
 
+	lock_kernel();
+
 	uspi = NULL;
 	ubh = NULL;
 	flags = 0;
@@ -1163,6 +1165,7 @@ magic_found:
 			goto failed;
 
 	UFSD("EXIT\n");
+	unlock_kernel();
 	return 0;
 
 dalloc_failed:
@@ -1174,10 +1177,12 @@ failed:
 	kfree(sbi);
 	sb->s_fs_info = NULL;
 	UFSD("EXIT (FAILED)\n");
+	unlock_kernel();
 	return ret;
 
 failed_nomem:
 	UFSD("EXIT (NOMEM)\n");
+	unlock_kernel();
 	return -ENOMEM;
 }
 
@@ -1440,7 +1445,7 @@ static const struct super_operations ufs_super_ops = {
 	.alloc_inode	= ufs_alloc_inode,
 	.destroy_inode	= ufs_destroy_inode,
 	.write_inode	= ufs_write_inode,
-	.delete_inode	= ufs_delete_inode,
+	.evict_inode	= ufs_evict_inode,
 	.put_super	= ufs_put_super,
 	.write_super	= ufs_write_super,
 	.sync_fs	= ufs_sync_fs,
@@ -1449,16 +1454,16 @@ static const struct super_operations ufs_super_ops = {
 	.show_options   = ufs_show_options,
 };
 
-static int ufs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *ufs_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, ufs_fill_super, mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, ufs_fill_super);
 }
 
 static struct file_system_type ufs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "ufs",
-	.get_sb		= ufs_get_sb,
+	.mount		= ufs_mount,
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c
index 589e01a465b..a58f9155fc9 100644
--- a/fs/ufs/truncate.c
+++ b/fs/ufs/truncate.c
@@ -243,10 +243,8 @@ static int ufs_trunc_indirect(struct inode *inode, u64 offset, void *p)
 		ubh_bforget(ind_ubh);
 		ind_ubh = NULL;
 	}
-	if (IS_SYNC(inode) && ind_ubh && ubh_buffer_dirty(ind_ubh)) {
-		ubh_ll_rw_block(SWRITE, ind_ubh);
-		ubh_wait_on_buffer (ind_ubh);
-	}
+	if (IS_SYNC(inode) && ind_ubh && ubh_buffer_dirty(ind_ubh))
+		ubh_sync_block(ind_ubh);
 	ubh_brelse (ind_ubh);
 	
 	UFSD("EXIT: ino %lu\n", inode->i_ino);
@@ -307,10 +305,8 @@ static int ufs_trunc_dindirect(struct inode *inode, u64 offset, void *p)
 		ubh_bforget(dind_bh);
 		dind_bh = NULL;
 	}
-	if (IS_SYNC(inode) && dind_bh && ubh_buffer_dirty(dind_bh)) {
-		ubh_ll_rw_block(SWRITE, dind_bh);
-		ubh_wait_on_buffer (dind_bh);
-	}
+	if (IS_SYNC(inode) && dind_bh && ubh_buffer_dirty(dind_bh))
+		ubh_sync_block(dind_bh);
 	ubh_brelse (dind_bh);
 	
 	UFSD("EXIT: ino %lu\n", inode->i_ino);
@@ -367,10 +363,8 @@ static int ufs_trunc_tindirect(struct inode *inode)
 		ubh_bforget(tind_bh);
 		tind_bh = NULL;
 	}
-	if (IS_SYNC(inode) && tind_bh && ubh_buffer_dirty(tind_bh)) {
-		ubh_ll_rw_block(SWRITE, tind_bh);
-		ubh_wait_on_buffer (tind_bh);
-	}
+	if (IS_SYNC(inode) && tind_bh && ubh_buffer_dirty(tind_bh))
+		ubh_sync_block(tind_bh);
 	ubh_brelse (tind_bh);
 	
 	UFSD("EXIT: ino %lu\n", inode->i_ino);
@@ -500,11 +494,6 @@ out:
 	return err;
 }
 
-/*
- * TODO:
- *	- truncate case should use proper ordering instead of using
- *	  simple_setsize
- */
 int ufs_setattr(struct dentry *dentry, struct iattr *attr)
 {
 	struct inode *inode = dentry->d_inode;
@@ -518,14 +507,17 @@ int ufs_setattr(struct dentry *dentry, struct iattr *attr)
 	if (ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) {
 		loff_t old_i_size = inode->i_size;
 
-		error = simple_setsize(inode, attr->ia_size);
-		if (error)
-			return error;
+		/* XXX(truncate): truncate_setsize should be called last */
+		truncate_setsize(inode, attr->ia_size);
+
 		error = ufs_truncate(inode, old_i_size);
 		if (error)
 			return error;
 	}
-	return inode_setattr(inode, attr);
+
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+	return 0;
 }
 
 const struct inode_operations ufs_file_inode_operations = {
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index 179ae6b3180..c08782e1b48 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -108,7 +108,7 @@ extern struct inode * ufs_new_inode (struct inode *, int);
 extern struct inode *ufs_iget(struct super_block *, unsigned long);
 extern int ufs_write_inode (struct inode *, struct writeback_control *);
 extern int ufs_sync_inode (struct inode *);
-extern void ufs_delete_inode (struct inode *);
+extern void ufs_evict_inode (struct inode *);
 extern struct buffer_head * ufs_bread (struct inode *, unsigned, int, int *);
 extern int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create);
 
diff --git a/fs/ufs/util.c b/fs/ufs/util.c
index 85a7fc9e4a4..d2c36d53fe6 100644
--- a/fs/ufs/util.c
+++ b/fs/ufs/util.c
@@ -113,21 +113,17 @@ void ubh_mark_buffer_uptodate (struct ufs_buffer_head * ubh, int flag)
 	}
 }
 
-void ubh_ll_rw_block(int rw, struct ufs_buffer_head *ubh)
+void ubh_sync_block(struct ufs_buffer_head *ubh)
 {
-	if (!ubh)
-		return;
+	if (ubh) {
+		unsigned i;
 
-	ll_rw_block(rw, ubh->count, ubh->bh);
-}
+		for (i = 0; i < ubh->count; i++)
+			write_dirty_buffer(ubh->bh[i], WRITE);
 
-void ubh_wait_on_buffer (struct ufs_buffer_head * ubh)
-{
-	unsigned i;
-	if (!ubh)
-		return;
-	for ( i = 0; i < ubh->count; i++ )
-		wait_on_buffer (ubh->bh[i]);
+		for (i = 0; i < ubh->count; i++)
+			wait_on_buffer(ubh->bh[i]);
+	}
 }
 
 void ubh_bforget (struct ufs_buffer_head * ubh)
diff --git a/fs/ufs/util.h b/fs/ufs/util.h
index 23ceed8c8fb..9f8775ce381 100644
--- a/fs/ufs/util.h
+++ b/fs/ufs/util.h
@@ -257,9 +257,7 @@ ufs_set_inode_gid(struct super_block *sb, struct ufs_inode *inode, u32 value)
 
 extern dev_t ufs_get_inode_dev(struct super_block *, struct ufs_inode_info *);
 extern void ufs_set_inode_dev(struct super_block *, struct ufs_inode_info *, dev_t);
-extern int __ufs_write_begin(struct file *file, struct address_space *mapping,
-		loff_t pos, unsigned len, unsigned flags,
-		struct page **pagep, void **fsdata);
+extern int ufs_prepare_chunk(struct page *page, loff_t pos, unsigned len);
 
 /*
  * These functions manipulate ufs buffers
@@ -271,8 +269,7 @@ extern void ubh_brelse (struct ufs_buffer_head *);
 extern void ubh_brelse_uspi (struct ufs_sb_private_info *);
 extern void ubh_mark_buffer_dirty (struct ufs_buffer_head *);
 extern void ubh_mark_buffer_uptodate (struct ufs_buffer_head *, int);
-extern void ubh_ll_rw_block(int, struct ufs_buffer_head *);
-extern void ubh_wait_on_buffer (struct ufs_buffer_head *);
+extern void ubh_sync_block(struct ufs_buffer_head *);
 extern void ubh_bforget (struct ufs_buffer_head *);
 extern int  ubh_buffer_dirty (struct ufs_buffer_head *);
 #define ubh_ubhcpymem(mem,ubh,size) _ubh_ubhcpymem_(uspi,mem,ubh,size)
diff --git a/fs/utimes.c b/fs/utimes.c
index e4c75db5d37..179b5869065 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -126,7 +126,8 @@ out:
  * must be owner or have write permission.
  * Else, update from *times, must be owner or super user.
  */
-long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags)
+long do_utimes(int dfd, const char __user *filename, struct timespec *times,
+	       int flags)
 {
 	int error = -EINVAL;
 
@@ -170,7 +171,7 @@ out:
 	return error;
 }
 
-SYSCALL_DEFINE4(utimensat, int, dfd, char __user *, filename,
+SYSCALL_DEFINE4(utimensat, int, dfd, const char __user *, filename,
 		struct timespec __user *, utimes, int, flags)
 {
 	struct timespec tstimes[2];
@@ -188,7 +189,7 @@ SYSCALL_DEFINE4(utimensat, int, dfd, char __user *, filename,
 	return do_utimes(dfd, filename, utimes ? tstimes : NULL, flags);
 }
 
-SYSCALL_DEFINE3(futimesat, int, dfd, char __user *, filename,
+SYSCALL_DEFINE3(futimesat, int, dfd, const char __user *, filename,
 		struct timeval __user *, utimes)
 {
 	struct timeval times[2];
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 480f28127f0..6100ec0fa1d 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -22,6 +22,7 @@ config XFS_FS
 config XFS_QUOTA
 	bool "XFS Quota support"
 	depends on XFS_FS
+	select QUOTACTL
 	help
 	  If you say Y here, you will be able to set limits for disk usage on
 	  a per user and/or a per group basis under XFS.  XFS considers quota
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index c8fb13f83b3..0dce969d6ca 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -87,11 +87,9 @@ xfs-y				+= xfs_alloc.o \
 				   xfs_trans_buf.o \
 				   xfs_trans_extfree.o \
 				   xfs_trans_inode.o \
-				   xfs_trans_item.o \
 				   xfs_utils.o \
 				   xfs_vnodeops.o \
-				   xfs_rw.o \
-				   xfs_dmops.o
+				   xfs_rw.o
 
 xfs-$(CONFIG_XFS_TRACE)		+= xfs_btree_trace.o
 
diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c
index 9f769b5b38f..b2771862fd3 100644
--- a/fs/xfs/linux-2.6/xfs_acl.c
+++ b/fs/xfs/linux-2.6/xfs_acl.c
@@ -225,7 +225,7 @@ xfs_check_acl(struct inode *inode, int mask)
 	struct posix_acl *acl;
 	int error = -EAGAIN;
 
-	xfs_itrace_entry(ip);
+	trace_xfs_check_acl(ip);
 
 	/*
 	 * If there is no attribute fork no ACL exists on this inode and
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 34640d6dbdc..c9af48fffcd 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -21,19 +21,12 @@
 #include "xfs_inum.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
 #include "xfs_trans.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_alloc.h"
-#include "xfs_btree.h"
 #include "xfs_error.h"
 #include "xfs_rw.h"
 #include "xfs_iomap.h"
@@ -92,18 +85,15 @@ void
 xfs_count_page_state(
 	struct page		*page,
 	int			*delalloc,
-	int			*unmapped,
 	int			*unwritten)
 {
 	struct buffer_head	*bh, *head;
 
-	*delalloc = *unmapped = *unwritten = 0;
+	*delalloc = *unwritten = 0;
 
 	bh = head = page_buffers(page);
 	do {
-		if (buffer_uptodate(bh) && !buffer_mapped(bh))
-			(*unmapped) = 1;
-		else if (buffer_unwritten(bh))
+		if (buffer_unwritten(bh))
 			(*unwritten) = 1;
 		else if (buffer_delay(bh))
 			(*delalloc) = 1;
@@ -212,23 +202,17 @@ xfs_setfilesize(
 }
 
 /*
- * Schedule IO completion handling on a xfsdatad if this was
- * the final hold on this ioend. If we are asked to wait,
- * flush the workqueue.
+ * Schedule IO completion handling on the final put of an ioend.
  */
 STATIC void
 xfs_finish_ioend(
-	xfs_ioend_t	*ioend,
-	int		wait)
+	struct xfs_ioend	*ioend)
 {
 	if (atomic_dec_and_test(&ioend->io_remaining)) {
-		struct workqueue_struct *wq;
-
-		wq = (ioend->io_type == IO_UNWRITTEN) ?
-			xfsconvertd_workqueue : xfsdatad_workqueue;
-		queue_work(wq, &ioend->io_work);
-		if (wait)
-			flush_workqueue(wq);
+		if (ioend->io_type == IO_UNWRITTEN)
+			queue_work(xfsconvertd_workqueue, &ioend->io_work);
+		else
+			queue_work(xfsdatad_workqueue, &ioend->io_work);
 	}
 }
 
@@ -272,11 +256,25 @@ xfs_end_io(
 	 */
 	if (error == EAGAIN) {
 		atomic_inc(&ioend->io_remaining);
-		xfs_finish_ioend(ioend, 0);
+		xfs_finish_ioend(ioend);
 		/* ensure we don't spin on blocked ioends */
 		delay(1);
-	} else
+	} else {
+		if (ioend->io_iocb)
+			aio_complete(ioend->io_iocb, ioend->io_result, 0);
 		xfs_destroy_ioend(ioend);
+	}
+}
+
+/*
+ * Call IO completion handling in caller context on the final put of an ioend.
+ */
+STATIC void
+xfs_finish_ioend_sync(
+	struct xfs_ioend	*ioend)
+{
+	if (atomic_dec_and_test(&ioend->io_remaining))
+		xfs_end_io(&ioend->io_work);
 }
 
 /*
@@ -309,6 +307,8 @@ xfs_alloc_ioend(
 	atomic_inc(&XFS_I(ioend->io_inode)->i_iocount);
 	ioend->io_offset = 0;
 	ioend->io_size = 0;
+	ioend->io_iocb = NULL;
+	ioend->io_result = 0;
 
 	INIT_WORK(&ioend->io_work, xfs_end_io);
 	return ioend;
@@ -358,7 +358,7 @@ xfs_end_bio(
 	bio->bi_end_io = NULL;
 	bio_put(bio);
 
-	xfs_finish_ioend(ioend, 0);
+	xfs_finish_ioend(ioend);
 }
 
 STATIC void
@@ -500,7 +500,7 @@ xfs_submit_ioend(
 		}
 		if (bio)
 			xfs_submit_ioend_bio(wbc, ioend, bio);
-		xfs_finish_ioend(ioend, 0);
+		xfs_finish_ioend(ioend);
 	} while ((ioend = next) != NULL);
 }
 
@@ -614,31 +614,30 @@ xfs_map_at_offset(
 STATIC unsigned int
 xfs_probe_page(
 	struct page		*page,
-	unsigned int		pg_offset,
-	int			mapped)
+	unsigned int		pg_offset)
 {
+	struct buffer_head	*bh, *head;
 	int			ret = 0;
 
 	if (PageWriteback(page))
 		return 0;
+	if (!PageDirty(page))
+		return 0;
+	if (!page->mapping)
+		return 0;
+	if (!page_has_buffers(page))
+		return 0;
 
-	if (page->mapping && PageDirty(page)) {
-		if (page_has_buffers(page)) {
-			struct buffer_head	*bh, *head;
-
-			bh = head = page_buffers(page);
-			do {
-				if (!buffer_uptodate(bh))
-					break;
-				if (mapped != buffer_mapped(bh))
-					break;
-				ret += bh->b_size;
-				if (ret >= pg_offset)
-					break;
-			} while ((bh = bh->b_this_page) != head);
-		} else
-			ret = mapped ? 0 : PAGE_CACHE_SIZE;
-	}
+	bh = head = page_buffers(page);
+	do {
+		if (!buffer_uptodate(bh))
+			break;
+		if (!buffer_mapped(bh))
+			break;
+		ret += bh->b_size;
+		if (ret >= pg_offset)
+			break;
+	} while ((bh = bh->b_this_page) != head);
 
 	return ret;
 }
@@ -648,8 +647,7 @@ xfs_probe_cluster(
 	struct inode		*inode,
 	struct page		*startpage,
 	struct buffer_head	*bh,
-	struct buffer_head	*head,
-	int			mapped)
+	struct buffer_head	*head)
 {
 	struct pagevec		pvec;
 	pgoff_t			tindex, tlast, tloff;
@@ -658,7 +656,7 @@ xfs_probe_cluster(
 
 	/* First sum forwards in this page */
 	do {
-		if (!buffer_uptodate(bh) || (mapped != buffer_mapped(bh)))
+		if (!buffer_uptodate(bh) || !buffer_mapped(bh))
 			return total;
 		total += bh->b_size;
 	} while ((bh = bh->b_this_page) != head);
@@ -692,7 +690,7 @@ xfs_probe_cluster(
 				pg_offset = PAGE_CACHE_SIZE;
 
 			if (page->index == tindex && trylock_page(page)) {
-				pg_len = xfs_probe_page(page, pg_offset, mapped);
+				pg_len = xfs_probe_page(page, pg_offset);
 				unlock_page(page);
 			}
 
@@ -761,7 +759,6 @@ xfs_convert_page(
 	struct xfs_bmbt_irec	*imap,
 	xfs_ioend_t		**ioendp,
 	struct writeback_control *wbc,
-	int			startio,
 	int			all_bh)
 {
 	struct buffer_head	*bh, *head;
@@ -832,19 +829,14 @@ xfs_convert_page(
 			ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
 
 			xfs_map_at_offset(inode, bh, imap, offset);
-			if (startio) {
-				xfs_add_to_ioend(inode, bh, offset,
-						type, ioendp, done);
-			} else {
-				set_buffer_dirty(bh);
-				unlock_buffer(bh);
-				mark_buffer_dirty(bh);
-			}
+			xfs_add_to_ioend(inode, bh, offset, type,
+					 ioendp, done);
+
 			page_dirty--;
 			count++;
 		} else {
 			type = IO_NEW;
-			if (buffer_mapped(bh) && all_bh && startio) {
+			if (buffer_mapped(bh) && all_bh) {
 				lock_buffer(bh);
 				xfs_add_to_ioend(inode, bh, offset,
 						type, ioendp, done);
@@ -859,14 +851,12 @@ xfs_convert_page(
 	if (uptodate && bh == head)
 		SetPageUptodate(page);
 
-	if (startio) {
-		if (count) {
-			wbc->nr_to_write--;
-			if (wbc->nr_to_write <= 0)
-				done = 1;
-		}
-		xfs_start_page_writeback(page, !page_dirty, count);
+	if (count) {
+		if (--wbc->nr_to_write <= 0 &&
+		    wbc->sync_mode == WB_SYNC_NONE)
+			done = 1;
 	}
+	xfs_start_page_writeback(page, !page_dirty, count);
 
 	return done;
  fail_unlock_page:
@@ -886,7 +876,6 @@ xfs_cluster_write(
 	struct xfs_bmbt_irec	*imap,
 	xfs_ioend_t		**ioendp,
 	struct writeback_control *wbc,
-	int			startio,
 	int			all_bh,
 	pgoff_t			tlast)
 {
@@ -902,7 +891,7 @@ xfs_cluster_write(
 
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			done = xfs_convert_page(inode, pvec.pages[i], tindex++,
-					imap, ioendp, wbc, startio, all_bh);
+					imap, ioendp, wbc, all_bh);
 			if (done)
 				break;
 		}
@@ -981,7 +970,7 @@ xfs_aops_discard_page(
 		 */
 		error = xfs_bmapi(NULL, ip, offset_fsb, 1,
 				XFS_BMAPI_ENTIRE,  NULL, 0, &imap,
-				&nimaps, NULL, NULL);
+				&nimaps, NULL);
 
 		if (error) {
 			/* something screwed, just bail */
@@ -1009,7 +998,7 @@ xfs_aops_discard_page(
 		 */
 		xfs_bmap_init(&flist, &firstblock);
 		error = xfs_bunmapi(NULL, ip, offset_fsb, 1, 0, 1, &firstblock,
-					&flist, NULL, &done);
+					&flist, &done);
 
 		ASSERT(!flist.xbf_count && !flist.xbf_first);
 		if (error) {
@@ -1032,50 +1021,66 @@ out_invalidate:
 }
 
 /*
- * Calling this without startio set means we are being asked to make a dirty
- * page ready for freeing it's buffers.  When called with startio set then
- * we are coming from writepage.
+ * Write out a dirty page.
+ *
+ * For delalloc space on the page we need to allocate space and flush it.
+ * For unwritten space on the page we need to start the conversion to
+ * regular allocated space.
+ * For any other dirty buffer heads on the page we should flush them.
  *
- * When called with startio set it is important that we write the WHOLE
- * page if possible.
- * The bh->b_state's cannot know if any of the blocks or which block for
- * that matter are dirty due to mmap writes, and therefore bh uptodate is
- * only valid if the page itself isn't completely uptodate.  Some layers
- * may clear the page dirty flag prior to calling write page, under the
- * assumption the entire page will be written out; by not writing out the
- * whole page the page can be reused before all valid dirty data is
- * written out.  Note: in the case of a page that has been dirty'd by
- * mapwrite and but partially setup by block_prepare_write the
- * bh->b_states's will not agree and only ones setup by BPW/BCW will have
- * valid state, thus the whole page must be written out thing.
+ * If we detect that a transaction would be required to flush the page, we
+ * have to check the process flags first, if we are already in a transaction
+ * or disk I/O during allocations is off, we need to fail the writepage and
+ * redirty the page.
  */
-
 STATIC int
-xfs_page_state_convert(
-	struct inode	*inode,
-	struct page	*page,
-	struct writeback_control *wbc,
-	int		startio,
-	int		unmapped) /* also implies page uptodate */
+xfs_vm_writepage(
+	struct page		*page,
+	struct writeback_control *wbc)
 {
+	struct inode		*inode = page->mapping->host;
+	int			delalloc, unwritten;
 	struct buffer_head	*bh, *head;
 	struct xfs_bmbt_irec	imap;
 	xfs_ioend_t		*ioend = NULL, *iohead = NULL;
 	loff_t			offset;
-	unsigned long           p_offset = 0;
 	unsigned int		type;
 	__uint64_t              end_offset;
 	pgoff_t                 end_index, last_index;
 	ssize_t			size, len;
 	int			flags, err, imap_valid = 0, uptodate = 1;
-	int			page_dirty, count = 0;
-	int			trylock = 0;
-	int			all_bh = unmapped;
+	int			count = 0;
+	int			all_bh = 0;
 
-	if (startio) {
-		if (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking)
-			trylock |= BMAPI_TRYLOCK;
-	}
+	trace_xfs_writepage(inode, page, 0);
+
+	ASSERT(page_has_buffers(page));
+
+	/*
+	 * Refuse to write the page out if we are called from reclaim context.
+	 *
+	 * This avoids stack overflows when called from deeply used stacks in
+	 * random callers for direct reclaim or memcg reclaim.  We explicitly
+	 * allow reclaim from kswapd as the stack usage there is relatively low.
+	 *
+	 * This should really be done by the core VM, but until that happens
+	 * filesystems like XFS, btrfs and ext4 have to take care of this
+	 * by themselves.
+	 */
+	if ((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == PF_MEMALLOC)
+		goto redirty;
+
+	/*
+	 * We need a transaction if there are delalloc or unwritten buffers
+	 * on the page.
+	 *
+	 * If we need a transaction and the process flags say we are already
+	 * in a transaction, or no IO is allowed then mark the page dirty
+	 * again and leave the page as is.
+	 */
+	xfs_count_page_state(page, &delalloc, &unwritten);
+	if ((current->flags & PF_FSTRANS) && (delalloc || unwritten))
+		goto redirty;
 
 	/* Is this page beyond the end of the file? */
 	offset = i_size_read(inode);
@@ -1084,50 +1089,33 @@ xfs_page_state_convert(
 	if (page->index >= end_index) {
 		if ((page->index >= end_index + 1) ||
 		    !(i_size_read(inode) & (PAGE_CACHE_SIZE - 1))) {
-			if (startio)
-				unlock_page(page);
+			unlock_page(page);
 			return 0;
 		}
 	}
 
-	/*
-	 * page_dirty is initially a count of buffers on the page before
-	 * EOF and is decremented as we move each into a cleanable state.
-	 *
-	 * Derivation:
-	 *
-	 * End offset is the highest offset that this page should represent.
-	 * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1))
-	 * will evaluate non-zero and be less than PAGE_CACHE_SIZE and
-	 * hence give us the correct page_dirty count. On any other page,
-	 * it will be zero and in that case we need page_dirty to be the
-	 * count of buffers on the page.
- 	 */
 	end_offset = min_t(unsigned long long,
-			(xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT, offset);
+			(xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,
+			offset);
 	len = 1 << inode->i_blkbits;
-	p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1),
-					PAGE_CACHE_SIZE);
-	p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE;
-	page_dirty = p_offset / len;
 
 	bh = head = page_buffers(page);
 	offset = page_offset(page);
 	flags = BMAPI_READ;
 	type = IO_NEW;
 
-	/* TODO: cleanup count and page_dirty */
-
 	do {
 		if (offset >= end_offset)
 			break;
 		if (!buffer_uptodate(bh))
 			uptodate = 0;
-		if (!(PageUptodate(page) || buffer_uptodate(bh)) && !startio) {
-			/*
-			 * the iomap is actually still valid, but the ioend
-			 * isn't.  shouldn't happen too often.
-			 */
+
+		/*
+		 * A hole may still be marked uptodate because discard_buffer
+		 * leaves the flag set.
+		 */
+		if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
+			ASSERT(!buffer_dirty(bh));
 			imap_valid = 0;
 			continue;
 		}
@@ -1135,19 +1123,7 @@ xfs_page_state_convert(
 		if (imap_valid)
 			imap_valid = xfs_imap_valid(inode, &imap, offset);
 
-		/*
-		 * First case, map an unwritten extent and prepare for
-		 * extent state conversion transaction on completion.
-		 *
-		 * Second case, allocate space for a delalloc buffer.
-		 * We can return EAGAIN here in the release page case.
-		 *
-		 * Third case, an unmapped buffer was found, and we are
-		 * in a path where we need to write the whole page out.
-		 */
-		if (buffer_unwritten(bh) || buffer_delay(bh) ||
-		    ((buffer_uptodate(bh) || PageUptodate(page)) &&
-		     !buffer_mapped(bh) && (unmapped || startio))) {
+		if (buffer_unwritten(bh) || buffer_delay(bh)) {
 			int new_ioend = 0;
 
 			/*
@@ -1161,15 +1137,15 @@ xfs_page_state_convert(
 				flags = BMAPI_WRITE | BMAPI_IGNSTATE;
 			} else if (buffer_delay(bh)) {
 				type = IO_DELAY;
-				flags = BMAPI_ALLOCATE | trylock;
-			} else {
-				type = IO_NEW;
-				flags = BMAPI_WRITE | BMAPI_MMAP;
+				flags = BMAPI_ALLOCATE;
+
+				if (wbc->sync_mode == WB_SYNC_NONE)
+					flags |= BMAPI_TRYLOCK;
 			}
 
 			if (!imap_valid) {
 				/*
-				 * if we didn't have a valid mapping then we
+				 * If we didn't have a valid mapping then we
 				 * need to ensure that we put the new mapping
 				 * in a new ioend structure. This needs to be
 				 * done to ensure that the ioends correctly
@@ -1177,14 +1153,7 @@ xfs_page_state_convert(
 				 * for unwritten extent conversion.
 				 */
 				new_ioend = 1;
-				if (type == IO_NEW) {
-					size = xfs_probe_cluster(inode,
-							page, bh, head, 0);
-				} else {
-					size = len;
-				}
-
-				err = xfs_map_blocks(inode, offset, size,
+				err = xfs_map_blocks(inode, offset, len,
 						&imap, flags);
 				if (err)
 					goto error;
@@ -1193,19 +1162,11 @@ xfs_page_state_convert(
 			}
 			if (imap_valid) {
 				xfs_map_at_offset(inode, bh, &imap, offset);
-				if (startio) {
-					xfs_add_to_ioend(inode, bh, offset,
-							type, &ioend,
-							new_ioend);
-				} else {
-					set_buffer_dirty(bh);
-					unlock_buffer(bh);
-					mark_buffer_dirty(bh);
-				}
-				page_dirty--;
+				xfs_add_to_ioend(inode, bh, offset, type,
+						 &ioend, new_ioend);
 				count++;
 			}
-		} else if (buffer_uptodate(bh) && startio) {
+		} else if (buffer_uptodate(bh)) {
 			/*
 			 * we got here because the buffer is already mapped.
 			 * That means it must already have extents allocated
@@ -1213,8 +1174,7 @@ xfs_page_state_convert(
 			 */
 			if (!imap_valid || flags != BMAPI_READ) {
 				flags = BMAPI_READ;
-				size = xfs_probe_cluster(inode, page, bh,
-								head, 1);
+				size = xfs_probe_cluster(inode, page, bh, head);
 				err = xfs_map_blocks(inode, offset, size,
 						&imap, flags);
 				if (err)
@@ -1233,18 +1193,16 @@ xfs_page_state_convert(
 			 */
 			type = IO_NEW;
 			if (trylock_buffer(bh)) {
-				ASSERT(buffer_mapped(bh));
 				if (imap_valid)
 					all_bh = 1;
 				xfs_add_to_ioend(inode, bh, offset, type,
 						&ioend, !imap_valid);
-				page_dirty--;
 				count++;
 			} else {
 				imap_valid = 0;
 			}
-		} else if ((buffer_uptodate(bh) || PageUptodate(page)) &&
-			   (unmapped || startio)) {
+		} else if (PageUptodate(page)) {
+			ASSERT(buffer_mapped(bh));
 			imap_valid = 0;
 		}
 
@@ -1256,8 +1214,7 @@ xfs_page_state_convert(
 	if (uptodate && bh == head)
 		SetPageUptodate(page);
 
-	if (startio)
-		xfs_start_page_writeback(page, 1, count);
+	xfs_start_page_writeback(page, 1, count);
 
 	if (ioend && imap_valid) {
 		xfs_off_t		end_index;
@@ -1275,131 +1232,30 @@ xfs_page_state_convert(
 			end_index = last_index;
 
 		xfs_cluster_write(inode, page->index + 1, &imap, &ioend,
-					wbc, startio, all_bh, end_index);
+					wbc, all_bh, end_index);
 	}
 
 	if (iohead)
 		xfs_submit_ioend(wbc, iohead);
 
-	return page_dirty;
+	return 0;
 
 error:
 	if (iohead)
 		xfs_cancel_ioend(iohead);
 
-	/*
-	 * If it's delalloc and we have nowhere to put it,
-	 * throw it away, unless the lower layers told
-	 * us to try again.
-	 */
-	if (err != -EAGAIN) {
-		if (!unmapped)
-			xfs_aops_discard_page(page);
-		ClearPageUptodate(page);
-	}
-	return err;
-}
+	if (err == -EAGAIN)
+		goto redirty;
 
-/*
- * writepage: Called from one of two places:
- *
- * 1. we are flushing a delalloc buffer head.
- *
- * 2. we are writing out a dirty page. Typically the page dirty
- *    state is cleared before we get here. In this case is it
- *    conceivable we have no buffer heads.
- *
- * For delalloc space on the page we need to allocate space and
- * flush it. For unmapped buffer heads on the page we should
- * allocate space if the page is uptodate. For any other dirty
- * buffer heads on the page we should flush them.
- *
- * If we detect that a transaction would be required to flush
- * the page, we have to check the process flags first, if we
- * are already in a transaction or disk I/O during allocations
- * is off, we need to fail the writepage and redirty the page.
- */
-
-STATIC int
-xfs_vm_writepage(
-	struct page		*page,
-	struct writeback_control *wbc)
-{
-	int			error;
-	int			need_trans;
-	int			delalloc, unmapped, unwritten;
-	struct inode		*inode = page->mapping->host;
-
-	trace_xfs_writepage(inode, page, 0);
-
-	/*
-	 * Refuse to write the page out if we are called from reclaim context.
-	 *
-	 * This is primarily to avoid stack overflows when called from deep
-	 * used stacks in random callers for direct reclaim, but disabling
-	 * reclaim for kswap is a nice side-effect as kswapd causes rather
-	 * suboptimal I/O patters, too.
-	 *
-	 * This should really be done by the core VM, but until that happens
-	 * filesystems like XFS, btrfs and ext4 have to take care of this
-	 * by themselves.
-	 */
-	if (current->flags & PF_MEMALLOC)
-		goto out_fail;
-
-	/*
-	 * We need a transaction if:
-	 *  1. There are delalloc buffers on the page
-	 *  2. The page is uptodate and we have unmapped buffers
-	 *  3. The page is uptodate and we have no buffers
-	 *  4. There are unwritten buffers on the page
-	 */
-
-	if (!page_has_buffers(page)) {
-		unmapped = 1;
-		need_trans = 1;
-	} else {
-		xfs_count_page_state(page, &delalloc, &unmapped, &unwritten);
-		if (!PageUptodate(page))
-			unmapped = 0;
-		need_trans = delalloc + unmapped + unwritten;
-	}
-
-	/*
-	 * If we need a transaction and the process flags say
-	 * we are already in a transaction, or no IO is allowed
-	 * then mark the page dirty again and leave the page
-	 * as is.
-	 */
-	if (current_test_flags(PF_FSTRANS) && need_trans)
-		goto out_fail;
-
-	/*
-	 * Delay hooking up buffer heads until we have
-	 * made our go/no-go decision.
-	 */
-	if (!page_has_buffers(page))
-		create_empty_buffers(page, 1 << inode->i_blkbits, 0);
-
-	/*
-	 * Convert delayed allocate, unwritten or unmapped space
-	 * to real space and flush out to disk.
-	 */
-	error = xfs_page_state_convert(inode, page, wbc, 1, unmapped);
-	if (error == -EAGAIN)
-		goto out_fail;
-	if (unlikely(error < 0))
-		goto out_unlock;
-
-	return 0;
+	xfs_aops_discard_page(page);
+	ClearPageUptodate(page);
+	unlock_page(page);
+	return err;
 
-out_fail:
+redirty:
 	redirty_page_for_writepage(wbc, page);
 	unlock_page(page);
 	return 0;
-out_unlock:
-	unlock_page(page);
-	return error;
 }
 
 STATIC int
@@ -1413,65 +1269,27 @@ xfs_vm_writepages(
 
 /*
  * Called to move a page into cleanable state - and from there
- * to be released. Possibly the page is already clean. We always
+ * to be released. The page should already be clean. We always
  * have buffer heads in this call.
  *
- * Returns 0 if the page is ok to release, 1 otherwise.
- *
- * Possible scenarios are:
- *
- * 1. We are being called to release a page which has been written
- *    to via regular I/O. buffer heads will be dirty and possibly
- *    delalloc. If no delalloc buffer heads in this case then we
- *    can just return zero.
- *
- * 2. We are called to release a page which has been written via
- *    mmap, all we need to do is ensure there is no delalloc
- *    state in the buffer heads, if not we can let the caller
- *    free them and we should come back later via writepage.
+ * Returns 1 if the page is ok to release, 0 otherwise.
  */
 STATIC int
 xfs_vm_releasepage(
 	struct page		*page,
 	gfp_t			gfp_mask)
 {
-	struct inode		*inode = page->mapping->host;
-	int			dirty, delalloc, unmapped, unwritten;
-	struct writeback_control wbc = {
-		.sync_mode = WB_SYNC_ALL,
-		.nr_to_write = 1,
-	};
+	int			delalloc, unwritten;
 
-	trace_xfs_releasepage(inode, page, 0);
+	trace_xfs_releasepage(page->mapping->host, page, 0);
 
-	if (!page_has_buffers(page))
-		return 0;
-
-	xfs_count_page_state(page, &delalloc, &unmapped, &unwritten);
-	if (!delalloc && !unwritten)
-		goto free_buffers;
+	xfs_count_page_state(page, &delalloc, &unwritten);
 
-	if (!(gfp_mask & __GFP_FS))
+	if (WARN_ON(delalloc))
 		return 0;
-
-	/* If we are already inside a transaction or the thread cannot
-	 * do I/O, we cannot release this page.
-	 */
-	if (current_test_flags(PF_FSTRANS))
+	if (WARN_ON(unwritten))
 		return 0;
 
-	/*
-	 * Convert delalloc space to real space, do not flush the
-	 * data out to disk, that will be done by the caller.
-	 * Never need to allocate space here - we will always
-	 * come back to writepage in that case.
-	 */
-	dirty = xfs_page_state_convert(inode, page, &wbc, 0, 0);
-	if (dirty == 0 && !unwritten)
-		goto free_buffers;
-	return 0;
-
-free_buffers:
 	return try_to_free_buffers(page);
 }
 
@@ -1481,9 +1299,9 @@ __xfs_get_blocks(
 	sector_t		iblock,
 	struct buffer_head	*bh_result,
 	int			create,
-	int			direct,
-	bmapi_flags_t		flags)
+	int			direct)
 {
+	int			flags = create ? BMAPI_WRITE : BMAPI_READ;
 	struct xfs_bmbt_irec	imap;
 	xfs_off_t		offset;
 	ssize_t			size;
@@ -1498,8 +1316,11 @@ __xfs_get_blocks(
 	if (!create && direct && offset >= i_size_read(inode))
 		return 0;
 
-	error = xfs_iomap(XFS_I(inode), offset, size,
-			     create ? flags : BMAPI_READ, &imap, &nimap, &new);
+	if (direct && create)
+		flags |= BMAPI_DIRECT;
+
+	error = xfs_iomap(XFS_I(inode), offset, size, flags, &imap, &nimap,
+			  &new);
 	if (error)
 		return -error;
 	if (nimap == 0)
@@ -1579,8 +1400,7 @@ xfs_get_blocks(
 	struct buffer_head	*bh_result,
 	int			create)
 {
-	return __xfs_get_blocks(inode, iblock,
-				bh_result, create, 0, BMAPI_WRITE);
+	return __xfs_get_blocks(inode, iblock, bh_result, create, 0);
 }
 
 STATIC int
@@ -1590,61 +1410,59 @@ xfs_get_blocks_direct(
 	struct buffer_head	*bh_result,
 	int			create)
 {
-	return __xfs_get_blocks(inode, iblock,
-				bh_result, create, 1, BMAPI_WRITE|BMAPI_DIRECT);
+	return __xfs_get_blocks(inode, iblock, bh_result, create, 1);
 }
 
+/*
+ * Complete a direct I/O write request.
+ *
+ * If the private argument is non-NULL __xfs_get_blocks signals us that we
+ * need to issue a transaction to convert the range from unwritten to written
+ * extents.  In case this is regular synchronous I/O we just call xfs_end_io
+ * to do this and we are done.  But in case this was a successfull AIO
+ * request this handler is called from interrupt context, from which we
+ * can't start transactions.  In that case offload the I/O completion to
+ * the workqueues we also use for buffered I/O completion.
+ */
 STATIC void
-xfs_end_io_direct(
-	struct kiocb	*iocb,
-	loff_t		offset,
-	ssize_t		size,
-	void		*private)
+xfs_end_io_direct_write(
+	struct kiocb		*iocb,
+	loff_t			offset,
+	ssize_t			size,
+	void			*private,
+	int			ret,
+	bool			is_async)
 {
-	xfs_ioend_t	*ioend = iocb->private;
+	struct xfs_ioend	*ioend = iocb->private;
 
 	/*
-	 * Non-NULL private data means we need to issue a transaction to
-	 * convert a range from unwritten to written extents.  This needs
-	 * to happen from process context but aio+dio I/O completion
-	 * happens from irq context so we need to defer it to a workqueue.
-	 * This is not necessary for synchronous direct I/O, but we do
-	 * it anyway to keep the code uniform and simpler.
-	 *
-	 * Well, if only it were that simple. Because synchronous direct I/O
-	 * requires extent conversion to occur *before* we return to userspace,
-	 * we have to wait for extent conversion to complete. Look at the
-	 * iocb that has been passed to us to determine if this is AIO or
-	 * not. If it is synchronous, tell xfs_finish_ioend() to kick the
-	 * workqueue and wait for it to complete.
-	 *
-	 * The core direct I/O code might be changed to always call the
-	 * completion handler in the future, in which case all this can
-	 * go away.
+	 * blockdev_direct_IO can return an error even after the I/O
+	 * completion handler was called.  Thus we need to protect
+	 * against double-freeing.
 	 */
+	iocb->private = NULL;
+
 	ioend->io_offset = offset;
 	ioend->io_size = size;
-	if (ioend->io_type == IO_READ) {
-		xfs_finish_ioend(ioend, 0);
-	} else if (private && size > 0) {
-		xfs_finish_ioend(ioend, is_sync_kiocb(iocb));
-	} else {
+	if (private && size > 0)
+		ioend->io_type = IO_UNWRITTEN;
+
+	if (is_async) {
 		/*
-		 * A direct I/O write ioend starts it's life in unwritten
-		 * state in case they map an unwritten extent.  This write
-		 * didn't map an unwritten extent so switch it's completion
-		 * handler.
+		 * If we are converting an unwritten extent we need to delay
+		 * the AIO completion until after the unwrittent extent
+		 * conversion has completed, otherwise do it ASAP.
 		 */
-		ioend->io_type = IO_NEW;
-		xfs_finish_ioend(ioend, 0);
+		if (ioend->io_type == IO_UNWRITTEN) {
+			ioend->io_iocb = iocb;
+			ioend->io_result = ret;
+		} else {
+			aio_complete(iocb, ret, 0);
+		}
+		xfs_finish_ioend(ioend);
+	} else {
+		xfs_finish_ioend_sync(ioend);
 	}
-
-	/*
-	 * blockdev_direct_IO can return an error even after the I/O
-	 * completion handler was called.  Thus we need to protect
-	 * against double-freeing.
-	 */
-	iocb->private = NULL;
 }
 
 STATIC ssize_t
@@ -1655,26 +1473,45 @@ xfs_vm_direct_IO(
 	loff_t			offset,
 	unsigned long		nr_segs)
 {
-	struct file	*file = iocb->ki_filp;
-	struct inode	*inode = file->f_mapping->host;
-	struct block_device *bdev;
-	ssize_t		ret;
-
-	bdev = xfs_find_bdev_for_inode(inode);
+	struct inode		*inode = iocb->ki_filp->f_mapping->host;
+	struct block_device	*bdev = xfs_find_bdev_for_inode(inode);
+	ssize_t			ret;
 
-	iocb->private = xfs_alloc_ioend(inode, rw == WRITE ?
-					IO_UNWRITTEN : IO_READ);
+	if (rw & WRITE) {
+		iocb->private = xfs_alloc_ioend(inode, IO_NEW);
 
-	ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov,
+		ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
 					    offset, nr_segs,
 					    xfs_get_blocks_direct,
-					    xfs_end_io_direct);
+					    xfs_end_io_direct_write, NULL, 0);
+		if (ret != -EIOCBQUEUED && iocb->private)
+			xfs_destroy_ioend(iocb->private);
+	} else {
+		ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
+					    offset, nr_segs,
+					    xfs_get_blocks_direct,
+					    NULL, NULL, 0);
+	}
 
-	if (unlikely(ret != -EIOCBQUEUED && iocb->private))
-		xfs_destroy_ioend(iocb->private);
 	return ret;
 }
 
+STATIC void
+xfs_vm_write_failed(
+	struct address_space	*mapping,
+	loff_t			to)
+{
+	struct inode		*inode = mapping->host;
+
+	if (to > inode->i_size) {
+		struct iattr	ia = {
+			.ia_valid	= ATTR_SIZE | ATTR_FORCE,
+			.ia_size	= inode->i_size,
+		};
+		xfs_setattr(XFS_I(inode), &ia, XFS_ATTR_NOLOCK);
+	}
+}
+
 STATIC int
 xfs_vm_write_begin(
 	struct file		*file,
@@ -1685,9 +1522,31 @@ xfs_vm_write_begin(
 	struct page		**pagep,
 	void			**fsdata)
 {
-	*pagep = NULL;
-	return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
-								xfs_get_blocks);
+	int			ret;
+
+	ret = block_write_begin(mapping, pos, len, flags | AOP_FLAG_NOFS,
+				pagep, xfs_get_blocks);
+	if (unlikely(ret))
+		xfs_vm_write_failed(mapping, pos + len);
+	return ret;
+}
+
+STATIC int
+xfs_vm_write_end(
+	struct file		*file,
+	struct address_space	*mapping,
+	loff_t			pos,
+	unsigned		len,
+	unsigned		copied,
+	struct page		*page,
+	void			*fsdata)
+{
+	int			ret;
+
+	ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
+	if (unlikely(ret < len))
+		xfs_vm_write_failed(mapping, pos + len);
+	return ret;
 }
 
 STATIC sector_t
@@ -1698,7 +1557,7 @@ xfs_vm_bmap(
 	struct inode		*inode = (struct inode *)mapping->host;
 	struct xfs_inode	*ip = XFS_I(inode);
 
-	xfs_itrace_entry(XFS_I(inode));
+	trace_xfs_vm_bmap(XFS_I(inode));
 	xfs_ilock(ip, XFS_IOLOCK_SHARED);
 	xfs_flush_pages(ip, (xfs_off_t)0, -1, 0, FI_REMAPF);
 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
@@ -1732,7 +1591,7 @@ const struct address_space_operations xfs_address_space_operations = {
 	.releasepage		= xfs_vm_releasepage,
 	.invalidatepage		= xfs_vm_invalidatepage,
 	.write_begin		= xfs_vm_write_begin,
-	.write_end		= generic_write_end,
+	.write_end		= xfs_vm_write_end,
 	.bmap			= xfs_vm_bmap,
 	.direct_IO		= xfs_vm_direct_IO,
 	.migratepage		= buffer_migrate_page,
diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h
index 4cfc6ea87df..c5057fb6237 100644
--- a/fs/xfs/linux-2.6/xfs_aops.h
+++ b/fs/xfs/linux-2.6/xfs_aops.h
@@ -37,6 +37,8 @@ typedef struct xfs_ioend {
 	size_t			io_size;	/* size of the extent */
 	xfs_off_t		io_offset;	/* offset in the file */
 	struct work_struct	io_work;	/* xfsdatad work queue */
+	struct kiocb		*io_iocb;
+	int			io_result;
 } xfs_ioend_t;
 
 extern const struct address_space_operations xfs_address_space_operations;
@@ -45,6 +47,6 @@ extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int);
 extern void xfs_ioend_init(void);
 extern void xfs_ioend_wait(struct xfs_inode *);
 
-extern void xfs_count_page_state(struct page *, int *, int *, int *);
+extern void xfs_count_page_state(struct page *, int *, int *);
 
 #endif /* __XFS_AOPS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 2ee3f7a6016..63fd2c07cb5 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -39,7 +39,6 @@
 #include "xfs_inum.h"
 #include "xfs_log.h"
 #include "xfs_ag.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_trace.h"
 
@@ -189,8 +188,8 @@ _xfs_buf_initialize(
 	atomic_set(&bp->b_hold, 1);
 	init_completion(&bp->b_iowait);
 	INIT_LIST_HEAD(&bp->b_list);
-	INIT_LIST_HEAD(&bp->b_hash_list);
-	init_MUTEX_LOCKED(&bp->b_sema); /* held, no waiters */
+	RB_CLEAR_NODE(&bp->b_rbnode);
+	sema_init(&bp->b_sema, 0); /* held, no waiters */
 	XB_SET_OWNER(bp);
 	bp->b_target = target;
 	bp->b_file_offset = range_base;
@@ -263,8 +262,6 @@ xfs_buf_free(
 {
 	trace_xfs_buf_free(bp, _RET_IP_);
 
-	ASSERT(list_empty(&bp->b_hash_list));
-
 	if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) {
 		uint		i;
 
@@ -423,8 +420,10 @@ _xfs_buf_find(
 {
 	xfs_off_t		range_base;
 	size_t			range_length;
-	xfs_bufhash_t		*hash;
-	xfs_buf_t		*bp, *n;
+	struct xfs_perag	*pag;
+	struct rb_node		**rbp;
+	struct rb_node		*parent;
+	xfs_buf_t		*bp;
 
 	range_base = (ioff << BBSHIFT);
 	range_length = (isize << BBSHIFT);
@@ -433,20 +432,38 @@ _xfs_buf_find(
 	ASSERT(!(range_length < (1 << btp->bt_sshift)));
 	ASSERT(!(range_base & (xfs_off_t)btp->bt_smask));
 
-	hash = &btp->bt_hash[hash_long((unsigned long)ioff, btp->bt_hashshift)];
-
-	spin_lock(&hash->bh_lock);
-
-	list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) {
-		ASSERT(btp == bp->b_target);
-		if (bp->b_file_offset == range_base &&
-		    bp->b_buffer_length == range_length) {
+	/* get tree root */
+	pag = xfs_perag_get(btp->bt_mount,
+				xfs_daddr_to_agno(btp->bt_mount, ioff));
+
+	/* walk tree */
+	spin_lock(&pag->pag_buf_lock);
+	rbp = &pag->pag_buf_tree.rb_node;
+	parent = NULL;
+	bp = NULL;
+	while (*rbp) {
+		parent = *rbp;
+		bp = rb_entry(parent, struct xfs_buf, b_rbnode);
+
+		if (range_base < bp->b_file_offset)
+			rbp = &(*rbp)->rb_left;
+		else if (range_base > bp->b_file_offset)
+			rbp = &(*rbp)->rb_right;
+		else {
 			/*
-			 * If we look at something, bring it to the
-			 * front of the list for next time.
+			 * found a block offset match. If the range doesn't
+			 * match, the only way this is allowed is if the buffer
+			 * in the cache is stale and the transaction that made
+			 * it stale has not yet committed. i.e. we are
+			 * reallocating a busy extent. Skip this buffer and
+			 * continue searching to the right for an exact match.
 			 */
+			if (bp->b_buffer_length != range_length) {
+				ASSERT(bp->b_flags & XBF_STALE);
+				rbp = &(*rbp)->rb_right;
+				continue;
+			}
 			atomic_inc(&bp->b_hold);
-			list_move(&bp->b_hash_list, &hash->bh_list);
 			goto found;
 		}
 	}
@@ -455,17 +472,21 @@ _xfs_buf_find(
 	if (new_bp) {
 		_xfs_buf_initialize(new_bp, btp, range_base,
 				range_length, flags);
-		new_bp->b_hash = hash;
-		list_add(&new_bp->b_hash_list, &hash->bh_list);
+		rb_link_node(&new_bp->b_rbnode, parent, rbp);
+		rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree);
+		/* the buffer keeps the perag reference until it is freed */
+		new_bp->b_pag = pag;
+		spin_unlock(&pag->pag_buf_lock);
 	} else {
 		XFS_STATS_INC(xb_miss_locked);
+		spin_unlock(&pag->pag_buf_lock);
+		xfs_perag_put(pag);
 	}
-
-	spin_unlock(&hash->bh_lock);
 	return new_bp;
 
 found:
-	spin_unlock(&hash->bh_lock);
+	spin_unlock(&pag->pag_buf_lock);
+	xfs_perag_put(pag);
 
 	/* Attempt to get the semaphore without sleeping,
 	 * if this does not work then we need to drop the
@@ -579,9 +600,9 @@ _xfs_buf_read(
 			XBF_READ_AHEAD | _XBF_RUN_QUEUES);
 
 	status = xfs_buf_iorequest(bp);
-	if (!status && !(flags & XBF_ASYNC))
-		status = xfs_buf_iowait(bp);
-	return status;
+	if (status || XFS_BUF_ISERROR(bp) || (flags & XBF_ASYNC))
+		return status;
+	return xfs_buf_iowait(bp);
 }
 
 xfs_buf_t *
@@ -631,8 +652,7 @@ void
 xfs_buf_readahead(
 	xfs_buftarg_t		*target,
 	xfs_off_t		ioff,
-	size_t			isize,
-	xfs_buf_flags_t		flags)
+	size_t			isize)
 {
 	struct backing_dev_info *bdi;
 
@@ -640,8 +660,42 @@ xfs_buf_readahead(
 	if (bdi_read_congested(bdi))
 		return;
 
-	flags |= (XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD);
-	xfs_buf_read(target, ioff, isize, flags);
+	xfs_buf_read(target, ioff, isize,
+		     XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD|XBF_DONT_BLOCK);
+}
+
+/*
+ * Read an uncached buffer from disk. Allocates and returns a locked
+ * buffer containing the disk contents or nothing.
+ */
+struct xfs_buf *
+xfs_buf_read_uncached(
+	struct xfs_mount	*mp,
+	struct xfs_buftarg	*target,
+	xfs_daddr_t		daddr,
+	size_t			length,
+	int			flags)
+{
+	xfs_buf_t		*bp;
+	int			error;
+
+	bp = xfs_buf_get_uncached(target, length, flags);
+	if (!bp)
+		return NULL;
+
+	/* set up the buffer for a read IO */
+	xfs_buf_lock(bp);
+	XFS_BUF_SET_ADDR(bp, daddr);
+	XFS_BUF_READ(bp);
+	XFS_BUF_BUSY(bp);
+
+	xfsbdstrat(mp, bp);
+	error = xfs_buf_iowait(bp);
+	if (error || bp->b_error) {
+		xfs_buf_relse(bp);
+		return NULL;
+	}
+	return bp;
 }
 
 xfs_buf_t *
@@ -713,9 +767,10 @@ xfs_buf_associate_memory(
 }
 
 xfs_buf_t *
-xfs_buf_get_noaddr(
+xfs_buf_get_uncached(
+	struct xfs_buftarg	*target,
 	size_t			len,
-	xfs_buftarg_t		*target)
+	int			flags)
 {
 	unsigned long		page_count = PAGE_ALIGN(len) >> PAGE_SHIFT;
 	int			error, i;
@@ -731,7 +786,7 @@ xfs_buf_get_noaddr(
 		goto fail_free_buf;
 
 	for (i = 0; i < page_count; i++) {
-		bp->b_pages[i] = alloc_page(GFP_KERNEL);
+		bp->b_pages[i] = alloc_page(xb_to_gfp(flags));
 		if (!bp->b_pages[i])
 			goto fail_free_mem;
 	}
@@ -746,7 +801,7 @@ xfs_buf_get_noaddr(
 
 	xfs_buf_unlock(bp);
 
-	trace_xfs_buf_get_noaddr(bp, _RET_IP_);
+	trace_xfs_buf_get_uncached(bp, _RET_IP_);
 	return bp;
 
  fail_free_mem:
@@ -780,29 +835,30 @@ void
 xfs_buf_rele(
 	xfs_buf_t		*bp)
 {
-	xfs_bufhash_t		*hash = bp->b_hash;
+	struct xfs_perag	*pag = bp->b_pag;
 
 	trace_xfs_buf_rele(bp, _RET_IP_);
 
-	if (unlikely(!hash)) {
+	if (!pag) {
 		ASSERT(!bp->b_relse);
+		ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
 		if (atomic_dec_and_test(&bp->b_hold))
 			xfs_buf_free(bp);
 		return;
 	}
 
+	ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));
 	ASSERT(atomic_read(&bp->b_hold) > 0);
-	if (atomic_dec_and_lock(&bp->b_hold, &hash->bh_lock)) {
+	if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
 		if (bp->b_relse) {
 			atomic_inc(&bp->b_hold);
-			spin_unlock(&hash->bh_lock);
-			(*(bp->b_relse)) (bp);
-		} else if (bp->b_flags & XBF_FS_MANAGED) {
-			spin_unlock(&hash->bh_lock);
+			spin_unlock(&pag->pag_buf_lock);
+			bp->b_relse(bp);
 		} else {
 			ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)));
-			list_del_init(&bp->b_hash_list);
-			spin_unlock(&hash->bh_lock);
+			rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
+			spin_unlock(&pag->pag_buf_lock);
+			xfs_perag_put(pag);
 			xfs_buf_free(bp);
 		}
 	}
@@ -865,7 +921,7 @@ xfs_buf_lock(
 	trace_xfs_buf_lock(bp, _RET_IP_);
 
 	if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
-		xfs_log_force(bp->b_mount, 0);
+		xfs_log_force(bp->b_target->bt_mount, 0);
 	if (atomic_read(&bp->b_io_remaining))
 		blk_run_address_space(bp->b_target->bt_mapping);
 	down(&bp->b_sema);
@@ -897,36 +953,6 @@ xfs_buf_unlock(
 	trace_xfs_buf_unlock(bp, _RET_IP_);
 }
 
-
-/*
- *	Pinning Buffer Storage in Memory
- *	Ensure that no attempt to force a buffer to disk will succeed.
- */
-void
-xfs_buf_pin(
-	xfs_buf_t		*bp)
-{
-	trace_xfs_buf_pin(bp, _RET_IP_);
-	atomic_inc(&bp->b_pin_count);
-}
-
-void
-xfs_buf_unpin(
-	xfs_buf_t		*bp)
-{
-	trace_xfs_buf_unpin(bp, _RET_IP_);
-
-	if (atomic_dec_and_test(&bp->b_pin_count))
-		wake_up_all(&bp->b_waiters);
-}
-
-int
-xfs_buf_ispin(
-	xfs_buf_t		*bp)
-{
-	return atomic_read(&bp->b_pin_count);
-}
-
 STATIC void
 xfs_buf_wait_unpin(
 	xfs_buf_t		*bp)
@@ -960,19 +986,7 @@ xfs_buf_iodone_work(
 	xfs_buf_t		*bp =
 		container_of(work, xfs_buf_t, b_iodone_work);
 
-	/*
-	 * We can get an EOPNOTSUPP to ordered writes.  Here we clear the
-	 * ordered flag and reissue them.  Because we can't tell the higher
-	 * layers directly that they should not issue ordered I/O anymore, they
-	 * need to check if the _XFS_BARRIER_FAILED flag was set during I/O completion.
-	 */
-	if ((bp->b_error == EOPNOTSUPP) &&
-	    (bp->b_flags & (XBF_ORDERED|XBF_ASYNC)) == (XBF_ORDERED|XBF_ASYNC)) {
-		trace_xfs_buf_ordered_retry(bp, _RET_IP_);
-		bp->b_flags &= ~XBF_ORDERED;
-		bp->b_flags |= _XFS_BARRIER_FAILED;
-		xfs_buf_iorequest(bp);
-	} else if (bp->b_iodone)
+	if (bp->b_iodone)
 		(*(bp->b_iodone))(bp);
 	else if (bp->b_flags & XBF_ASYNC)
 		xfs_buf_relse(bp);
@@ -1018,13 +1032,11 @@ xfs_bwrite(
 {
 	int			error;
 
-	bp->b_strat = xfs_bdstrat_cb;
-	bp->b_mount = mp;
 	bp->b_flags |= XBF_WRITE;
 	bp->b_flags &= ~(XBF_ASYNC | XBF_READ);
 
 	xfs_buf_delwri_dequeue(bp);
-	xfs_buf_iostrategy(bp);
+	xfs_bdstrat_cb(bp);
 
 	error = xfs_buf_iowait(bp);
 	if (error)
@@ -1040,9 +1052,6 @@ xfs_bdwrite(
 {
 	trace_xfs_buf_bdwrite(bp, _RET_IP_);
 
-	bp->b_strat = xfs_bdstrat_cb;
-	bp->b_mount = mp;
-
 	bp->b_flags &= ~XBF_READ;
 	bp->b_flags |= (XBF_DELWRI | XBF_ASYNC);
 
@@ -1051,7 +1060,7 @@ xfs_bdwrite(
 
 /*
  * Called when we want to stop a buffer from getting written or read.
- * We attach the EIO error, muck with its flags, and call biodone
+ * We attach the EIO error, muck with its flags, and call xfs_buf_ioend
  * so that the proper iodone callbacks get called.
  */
 STATIC int
@@ -1068,22 +1077,21 @@ xfs_bioerror(
 	XFS_BUF_ERROR(bp, EIO);
 
 	/*
-	 * We're calling biodone, so delete XBF_DONE flag.
+	 * We're calling xfs_buf_ioend, so delete XBF_DONE flag.
 	 */
 	XFS_BUF_UNREAD(bp);
 	XFS_BUF_UNDELAYWRITE(bp);
 	XFS_BUF_UNDONE(bp);
 	XFS_BUF_STALE(bp);
 
-	XFS_BUF_CLR_BDSTRAT_FUNC(bp);
-	xfs_biodone(bp);
+	xfs_buf_ioend(bp, 0);
 
 	return EIO;
 }
 
 /*
  * Same as xfs_bioerror, except that we are releasing the buffer
- * here ourselves, and avoiding the biodone call.
+ * here ourselves, and avoiding the xfs_buf_ioend call.
  * This is meant for userdata errors; metadata bufs come with
  * iodone functions attached, so that we can track down errors.
  */
@@ -1105,7 +1113,6 @@ xfs_bioerror_relse(
 	XFS_BUF_DONE(bp);
 	XFS_BUF_STALE(bp);
 	XFS_BUF_CLR_IODONE_FUNC(bp);
-	XFS_BUF_CLR_BDSTRAT_FUNC(bp);
 	if (!(fl & XBF_ASYNC)) {
 		/*
 		 * Mark b_error and B_ERROR _both_.
@@ -1133,7 +1140,7 @@ int
 xfs_bdstrat_cb(
 	struct xfs_buf	*bp)
 {
-	if (XFS_FORCED_SHUTDOWN(bp->b_mount)) {
+	if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
 		trace_xfs_bdstrat_shut(bp, _RET_IP_);
 		/*
 		 * Metadata write that didn't get logged but
@@ -1235,7 +1242,7 @@ _xfs_buf_ioapply(
 
 	if (bp->b_flags & XBF_ORDERED) {
 		ASSERT(!(bp->b_flags & XBF_READ));
-		rw = WRITE_BARRIER;
+		rw = WRITE_FLUSH_FUA;
 	} else if (bp->b_flags & XBF_LOG_BUFFER) {
 		ASSERT(!(bp->b_flags & XBF_READ_AHEAD));
 		bp->b_flags &= ~_XBF_RUN_QUEUES;
@@ -1311,8 +1318,19 @@ submit_io:
 		if (size)
 			goto next_chunk;
 	} else {
-		bio_put(bio);
+		/*
+		 * if we get here, no pages were added to the bio. However,
+		 * we can't just error out here - if the pages are locked then
+		 * we have to unlock them otherwise we can hang on a later
+		 * access to the page.
+		 */
 		xfs_buf_ioerror(bp, EIO);
+		if (bp->b_flags & _XBF_PAGE_LOCKED) {
+			int i;
+			for (i = 0; i < bp->b_page_count; i++)
+				unlock_page(bp->b_pages[i]);
+		}
+		bio_put(bio);
 	}
 }
 
@@ -1428,63 +1446,24 @@ xfs_buf_iomove(
  */
 void
 xfs_wait_buftarg(
-	xfs_buftarg_t	*btp)
-{
-	xfs_buf_t	*bp, *n;
-	xfs_bufhash_t	*hash;
-	uint		i;
-
-	for (i = 0; i < (1 << btp->bt_hashshift); i++) {
-		hash = &btp->bt_hash[i];
-again:
-		spin_lock(&hash->bh_lock);
-		list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) {
-			ASSERT(btp == bp->b_target);
-			if (!(bp->b_flags & XBF_FS_MANAGED)) {
-				spin_unlock(&hash->bh_lock);
-				/*
-				 * Catch superblock reference count leaks
-				 * immediately
-				 */
-				BUG_ON(bp->b_bn == 0);
-				delay(100);
-				goto again;
-			}
-		}
-		spin_unlock(&hash->bh_lock);
-	}
-}
-
-/*
- *	Allocate buffer hash table for a given target.
- *	For devices containing metadata (i.e. not the log/realtime devices)
- *	we need to allocate a much larger hash table.
- */
-STATIC void
-xfs_alloc_bufhash(
-	xfs_buftarg_t		*btp,
-	int			external)
+	struct xfs_buftarg	*btp)
 {
-	unsigned int		i;
+	struct xfs_perag	*pag;
+	uint			i;
 
-	btp->bt_hashshift = external ? 3 : 8;	/* 8 or 256 buckets */
-	btp->bt_hashmask = (1 << btp->bt_hashshift) - 1;
-	btp->bt_hash = kmem_zalloc_large((1 << btp->bt_hashshift) *
-					 sizeof(xfs_bufhash_t));
-	for (i = 0; i < (1 << btp->bt_hashshift); i++) {
-		spin_lock_init(&btp->bt_hash[i].bh_lock);
-		INIT_LIST_HEAD(&btp->bt_hash[i].bh_list);
+	for (i = 0; i < btp->bt_mount->m_sb.sb_agcount; i++) {
+		pag = xfs_perag_get(btp->bt_mount, i);
+		spin_lock(&pag->pag_buf_lock);
+		while (rb_first(&pag->pag_buf_tree)) {
+			spin_unlock(&pag->pag_buf_lock);
+			delay(100);
+			spin_lock(&pag->pag_buf_lock);
+		}
+		spin_unlock(&pag->pag_buf_lock);
+		xfs_perag_put(pag);
 	}
 }
 
-STATIC void
-xfs_free_bufhash(
-	xfs_buftarg_t		*btp)
-{
-	kmem_free_large(btp->bt_hash);
-	btp->bt_hash = NULL;
-}
-
 /*
  *	buftarg list for delwrite queue processing
  */
@@ -1517,7 +1496,6 @@ xfs_free_buftarg(
 	xfs_flush_buftarg(btp, 1);
 	if (mp->m_flags & XFS_MOUNT_BARRIER)
 		xfs_blkdev_issue_flush(btp);
-	xfs_free_bufhash(btp);
 	iput(btp->bt_mapping->host);
 
 	/* Unregister the buftarg first so that we don't get a
@@ -1602,6 +1580,7 @@ xfs_mapping_buftarg(
 			XFS_BUFTARG_NAME(btp));
 		return ENOMEM;
 	}
+	inode->i_ino = get_next_ino();
 	inode->i_mode = S_IFBLK;
 	inode->i_bdev = bdev;
 	inode->i_rdev = bdev->bd_dev;
@@ -1639,6 +1618,7 @@ out_error:
 
 xfs_buftarg_t *
 xfs_alloc_buftarg(
+	struct xfs_mount	*mp,
 	struct block_device	*bdev,
 	int			external,
 	const char		*fsname)
@@ -1647,6 +1627,7 @@ xfs_alloc_buftarg(
 
 	btp = kmem_zalloc(sizeof(*btp), KM_SLEEP);
 
+	btp->bt_mount = mp;
 	btp->bt_dev =  bdev->bd_dev;
 	btp->bt_bdev = bdev;
 	if (xfs_setsize_buftarg_early(btp, bdev))
@@ -1655,7 +1636,6 @@ xfs_alloc_buftarg(
 		goto error;
 	if (xfs_alloc_delwrite_queue(btp, fsname))
 		goto error;
-	xfs_alloc_bufhash(btp, external);
 	return btp;
 
 error:
@@ -1804,7 +1784,7 @@ xfs_buf_delwri_split(
 		trace_xfs_buf_delwri_split(bp, _RET_IP_);
 		ASSERT(bp->b_flags & XBF_DELWRI);
 
-		if (!xfs_buf_ispin(bp) && !xfs_buf_cond_lock(bp)) {
+		if (!XFS_BUF_ISPINNED(bp) && !xfs_buf_cond_lock(bp)) {
 			if (!force &&
 			    time_before(jiffies, bp->b_queuetime + age)) {
 				xfs_buf_unlock(bp);
@@ -1889,7 +1869,7 @@ xfsbufd(
 			struct xfs_buf *bp;
 			bp = list_first_entry(&tmp, struct xfs_buf, b_list);
 			list_del_init(&bp->b_list);
-			xfs_buf_iostrategy(bp);
+			xfs_bdstrat_cb(bp);
 			count++;
 		}
 		if (count)
@@ -1936,7 +1916,7 @@ xfs_flush_buftarg(
 			bp->b_flags &= ~XBF_ASYNC;
 			list_add(&bp->b_list, &wait_list);
 		}
-		xfs_buf_iostrategy(bp);
+		xfs_bdstrat_cb(bp);
 	}
 
 	if (wait) {
@@ -1946,7 +1926,7 @@ xfs_flush_buftarg(
 			bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
 
 			list_del_init(&bp->b_list);
-			xfs_iowait(bp);
+			xfs_buf_iowait(bp);
 			xfs_buf_relse(bp);
 		}
 	}
@@ -1962,7 +1942,8 @@ xfs_buf_init(void)
 	if (!xfs_buf_zone)
 		goto out;
 
-	xfslogd_workqueue = create_workqueue("xfslogd");
+	xfslogd_workqueue = alloc_workqueue("xfslogd",
+					WQ_MEM_RECLAIM | WQ_HIGHPRI, 1);
 	if (!xfslogd_workqueue)
 		goto out_free_buf_zone;
 
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 5fbecefa5df..383a3f37cf9 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -44,57 +44,48 @@ typedef enum {
 	XBRW_ZERO = 3,			/* Zero target memory */
 } xfs_buf_rw_t;
 
-typedef enum {
-	XBF_READ = (1 << 0),	/* buffer intended for reading from device */
-	XBF_WRITE = (1 << 1),	/* buffer intended for writing to device   */
-	XBF_MAPPED = (1 << 2),  /* buffer mapped (b_addr valid)            */
-	XBF_ASYNC = (1 << 4),   /* initiator will not wait for completion  */
-	XBF_DONE = (1 << 5),    /* all pages in the buffer uptodate	   */
-	XBF_DELWRI = (1 << 6),  /* buffer has dirty pages                  */
-	XBF_STALE = (1 << 7),	/* buffer has been staled, do not find it  */
-	XBF_FS_MANAGED = (1 << 8),  /* filesystem controls freeing memory  */
- 	XBF_ORDERED = (1 << 11),    /* use ordered writes		   */
-	XBF_READ_AHEAD = (1 << 12), /* asynchronous read-ahead		   */
-	XBF_LOG_BUFFER = (1 << 13), /* this is a buffer used for the log   */
-
-	/* flags used only as arguments to access routines */
-	XBF_LOCK = (1 << 14),       /* lock requested			   */
-	XBF_TRYLOCK = (1 << 15),    /* lock requested, but do not wait	   */
-	XBF_DONT_BLOCK = (1 << 16), /* do not block in current thread	   */
-
-	/* flags used only internally */
-	_XBF_PAGE_CACHE = (1 << 17),/* backed by pagecache		   */
-	_XBF_PAGES = (1 << 18),	    /* backed by refcounted pages	   */
-	_XBF_RUN_QUEUES = (1 << 19),/* run block device task queue	   */
-	_XBF_DELWRI_Q = (1 << 21),   /* buffer on delwri queue		   */
+#define XBF_READ	(1 << 0) /* buffer intended for reading from device */
+#define XBF_WRITE	(1 << 1) /* buffer intended for writing to device */
+#define XBF_MAPPED	(1 << 2) /* buffer mapped (b_addr valid) */
+#define XBF_ASYNC	(1 << 4) /* initiator will not wait for completion */
+#define XBF_DONE	(1 << 5) /* all pages in the buffer uptodate */
+#define XBF_DELWRI	(1 << 6) /* buffer has dirty pages */
+#define XBF_STALE	(1 << 7) /* buffer has been staled, do not find it */
+#define XBF_ORDERED	(1 << 11)/* use ordered writes */
+#define XBF_READ_AHEAD	(1 << 12)/* asynchronous read-ahead */
+#define XBF_LOG_BUFFER	(1 << 13)/* this is a buffer used for the log */
+
+/* flags used only as arguments to access routines */
+#define XBF_LOCK	(1 << 14)/* lock requested */
+#define XBF_TRYLOCK	(1 << 15)/* lock requested, but do not wait */
+#define XBF_DONT_BLOCK	(1 << 16)/* do not block in current thread */
+
+/* flags used only internally */
+#define _XBF_PAGE_CACHE	(1 << 17)/* backed by pagecache */
+#define _XBF_PAGES	(1 << 18)/* backed by refcounted pages */
+#define	_XBF_RUN_QUEUES	(1 << 19)/* run block device task queue	*/
+#define _XBF_DELWRI_Q	(1 << 21)/* buffer on delwri queue */
 
-	/*
-	 * Special flag for supporting metadata blocks smaller than a FSB.
-	 *
-	 * In this case we can have multiple xfs_buf_t on a single page and
-	 * need to lock out concurrent xfs_buf_t readers as they only
-	 * serialise access to the buffer.
-	 *
-	 * If the FSB size >= PAGE_CACHE_SIZE case, we have no serialisation
-	 * between reads of the page. Hence we can have one thread read the
-	 * page and modify it, but then race with another thread that thinks
-	 * the page is not up-to-date and hence reads it again.
-	 *
-	 * The result is that the first modifcation to the page is lost.
-	 * This sort of AGF/AGI reading race can happen when unlinking inodes
-	 * that require truncation and results in the AGI unlinked list
-	 * modifications being lost.
-	 */
-	_XBF_PAGE_LOCKED = (1 << 22),
+/*
+ * Special flag for supporting metadata blocks smaller than a FSB.
+ *
+ * In this case we can have multiple xfs_buf_t on a single page and
+ * need to lock out concurrent xfs_buf_t readers as they only
+ * serialise access to the buffer.
+ *
+ * If the FSB size >= PAGE_CACHE_SIZE case, we have no serialisation
+ * between reads of the page. Hence we can have one thread read the
+ * page and modify it, but then race with another thread that thinks
+ * the page is not up-to-date and hence reads it again.
+ *
+ * The result is that the first modifcation to the page is lost.
+ * This sort of AGF/AGI reading race can happen when unlinking inodes
+ * that require truncation and results in the AGI unlinked list
+ * modifications being lost.
+ */
+#define _XBF_PAGE_LOCKED	(1 << 22)
 
-	/*
-	 * If we try a barrier write, but it fails we have to communicate
-	 * this to the upper layers.  Unfortunately b_error gets overwritten
-	 * when the buffer is re-issued so we have to add another flag to
-	 * keep this information.
-	 */
-	_XFS_BARRIER_FAILED = (1 << 23),
-} xfs_buf_flags_t;
+typedef unsigned int xfs_buf_flags_t;
 
 #define XFS_BUF_FLAGS \
 	{ XBF_READ,		"READ" }, \
@@ -104,7 +95,6 @@ typedef enum {
 	{ XBF_DONE,		"DONE" }, \
 	{ XBF_DELWRI,		"DELWRI" }, \
 	{ XBF_STALE,		"STALE" }, \
-	{ XBF_FS_MANAGED,	"FS_MANAGED" }, \
 	{ XBF_ORDERED,		"ORDERED" }, \
 	{ XBF_READ_AHEAD,	"READ_AHEAD" }, \
 	{ XBF_LOCK,		"LOCK" },  	/* should never be set */\
@@ -114,8 +104,7 @@ typedef enum {
 	{ _XBF_PAGES,		"PAGES" }, \
 	{ _XBF_RUN_QUEUES,	"RUN_QUEUES" }, \
 	{ _XBF_DELWRI_Q,	"DELWRI_Q" }, \
-	{ _XBF_PAGE_LOCKED,	"PAGE_LOCKED" }, \
-	{ _XFS_BARRIER_FAILED,	"BARRIER_FAILED" }
+	{ _XBF_PAGE_LOCKED,	"PAGE_LOCKED" }
 
 
 typedef enum {
@@ -132,15 +121,11 @@ typedef struct xfs_buftarg {
 	dev_t			bt_dev;
 	struct block_device	*bt_bdev;
 	struct address_space	*bt_mapping;
+	struct xfs_mount	*bt_mount;
 	unsigned int		bt_bsize;
 	unsigned int		bt_sshift;
 	size_t			bt_smask;
 
-	/* per device buffer hash table */
-	uint			bt_hashmask;
-	uint			bt_hashshift;
-	xfs_bufhash_t		*bt_hash;
-
 	/* per device delwri queue */
 	struct task_struct	*bt_task;
 	struct list_head	bt_list;
@@ -168,35 +153,41 @@ typedef int (*xfs_buf_bdstrat_t)(struct xfs_buf *);
 #define XB_PAGES	2
 
 typedef struct xfs_buf {
+	/*
+	 * first cacheline holds all the fields needed for an uncontended cache
+	 * hit to be fully processed. The semaphore straddles the cacheline
+	 * boundary, but the counter and lock sits on the first cacheline,
+	 * which is the only bit that is touched if we hit the semaphore
+	 * fast-path on locking.
+	 */
+	struct rb_node		b_rbnode;	/* rbtree node */
+	xfs_off_t		b_file_offset;	/* offset in file */
+	size_t			b_buffer_length;/* size of buffer in bytes */
+	atomic_t		b_hold;		/* reference count */
+	xfs_buf_flags_t		b_flags;	/* status flags */
 	struct semaphore	b_sema;		/* semaphore for lockables */
-	unsigned long		b_queuetime;	/* time buffer was queued */
-	atomic_t		b_pin_count;	/* pin count */
+
 	wait_queue_head_t	b_waiters;	/* unpin waiters */
 	struct list_head	b_list;
-	xfs_buf_flags_t		b_flags;	/* status flags */
-	struct list_head	b_hash_list;	/* hash table list */
-	xfs_bufhash_t		*b_hash;	/* hash table list start */
+	struct xfs_perag	*b_pag;		/* contains rbtree root */
 	xfs_buftarg_t		*b_target;	/* buffer target (device) */
-	atomic_t		b_hold;		/* reference count */
 	xfs_daddr_t		b_bn;		/* block number for I/O */
-	xfs_off_t		b_file_offset;	/* offset in file */
-	size_t			b_buffer_length;/* size of buffer in bytes */
 	size_t			b_count_desired;/* desired transfer size */
 	void			*b_addr;	/* virtual address of buffer */
 	struct work_struct	b_iodone_work;
-	atomic_t		b_io_remaining;	/* #outstanding I/O requests */
 	xfs_buf_iodone_t	b_iodone;	/* I/O completion function */
 	xfs_buf_relse_t		b_relse;	/* releasing function */
-	xfs_buf_bdstrat_t	b_strat;	/* pre-write function */
 	struct completion	b_iowait;	/* queue for I/O waiters */
 	void			*b_fspriv;
 	void			*b_fspriv2;
-	struct xfs_mount	*b_mount;
-	unsigned short		b_error;	/* error code on I/O */
-	unsigned int		b_page_count;	/* size of page array */
-	unsigned int		b_offset;	/* page offset in first page */
 	struct page		**b_pages;	/* array of page pointers */
 	struct page		*b_page_array[XB_PAGES]; /* inline pages */
+	unsigned long		b_queuetime;	/* time buffer was queued */
+	atomic_t		b_pin_count;	/* pin count */
+	atomic_t		b_io_remaining;	/* #outstanding I/O requests */
+	unsigned int		b_page_count;	/* size of page array */
+	unsigned int		b_offset;	/* page offset in first page */
+	unsigned short		b_error;	/* error code on I/O */
 #ifdef XFS_BUF_LOCK_TRACKING
 	int			b_last_holder;
 #endif
@@ -215,11 +206,13 @@ extern xfs_buf_t *xfs_buf_read(xfs_buftarg_t *, xfs_off_t, size_t,
 				xfs_buf_flags_t);
 
 extern xfs_buf_t *xfs_buf_get_empty(size_t, xfs_buftarg_t *);
-extern xfs_buf_t *xfs_buf_get_noaddr(size_t, xfs_buftarg_t *);
+extern xfs_buf_t *xfs_buf_get_uncached(struct xfs_buftarg *, size_t, int);
 extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t);
 extern void xfs_buf_hold(xfs_buf_t *);
-extern void xfs_buf_readahead(xfs_buftarg_t *, xfs_off_t, size_t,
-				xfs_buf_flags_t);
+extern void xfs_buf_readahead(xfs_buftarg_t *, xfs_off_t, size_t);
+struct xfs_buf *xfs_buf_read_uncached(struct xfs_mount *mp,
+				struct xfs_buftarg *target,
+				xfs_daddr_t daddr, size_t length, int flags);
 
 /* Releasing Buffers */
 extern void xfs_buf_free(xfs_buf_t *);
@@ -244,11 +237,8 @@ extern int xfs_buf_iorequest(xfs_buf_t *);
 extern int xfs_buf_iowait(xfs_buf_t *);
 extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *,
 				xfs_buf_rw_t);
-
-static inline int xfs_buf_iostrategy(xfs_buf_t *bp)
-{
-	return bp->b_strat ? bp->b_strat(bp) : xfs_buf_iorequest(bp);
-}
+#define xfs_buf_zero(bp, off, len) \
+	    xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO)
 
 static inline int xfs_buf_geterror(xfs_buf_t *bp)
 {
@@ -258,11 +248,6 @@ static inline int xfs_buf_geterror(xfs_buf_t *bp)
 /* Buffer Utility Routines */
 extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t);
 
-/* Pinning Buffer Storage in Memory */
-extern void xfs_buf_pin(xfs_buf_t *);
-extern void xfs_buf_unpin(xfs_buf_t *);
-extern int xfs_buf_ispin(xfs_buf_t *);
-
 /* Delayed Write Buffer Routines */
 extern void xfs_buf_delwri_dequeue(xfs_buf_t *);
 extern void xfs_buf_delwri_promote(xfs_buf_t *);
@@ -288,8 +273,6 @@ extern void xfs_buf_terminate(void);
 					XFS_BUF_DONE(bp);	\
 				} while (0)
 
-#define XFS_BUF_UNMANAGE(bp)	((bp)->b_flags &= ~XBF_FS_MANAGED)
-
 #define XFS_BUF_DELAYWRITE(bp)		((bp)->b_flags |= XBF_DELWRI)
 #define XFS_BUF_UNDELAYWRITE(bp)	xfs_buf_delwri_dequeue(bp)
 #define XFS_BUF_ISDELAYWRITE(bp)	((bp)->b_flags & XBF_DELWRI)
@@ -326,8 +309,6 @@ extern void xfs_buf_terminate(void);
 #define XFS_BUF_IODONE_FUNC(bp)			((bp)->b_iodone)
 #define XFS_BUF_SET_IODONE_FUNC(bp, func)	((bp)->b_iodone = (func))
 #define XFS_BUF_CLR_IODONE_FUNC(bp)		((bp)->b_iodone = NULL)
-#define XFS_BUF_SET_BDSTRAT_FUNC(bp, func)	((bp)->b_strat = (func))
-#define XFS_BUF_CLR_BDSTRAT_FUNC(bp)		((bp)->b_strat = NULL)
 
 #define XFS_BUF_FSPRIVATE(bp, type)		((type)(bp)->b_fspriv)
 #define XFS_BUF_SET_FSPRIVATE(bp, val)		((bp)->b_fspriv = (void*)(val))
@@ -351,7 +332,7 @@ extern void xfs_buf_terminate(void);
 #define XFS_BUF_SET_VTYPE(bp, type)		do { } while (0)
 #define XFS_BUF_SET_REF(bp, ref)		do { } while (0)
 
-#define XFS_BUF_ISPINNED(bp)	xfs_buf_ispin(bp)
+#define XFS_BUF_ISPINNED(bp)	atomic_read(&((bp)->b_pin_count))
 
 #define XFS_BUF_VALUSEMA(bp)	xfs_buf_lock_value(bp)
 #define XFS_BUF_CPSEMA(bp)	(xfs_buf_cond_lock(bp) == 0)
@@ -370,27 +351,11 @@ static inline void xfs_buf_relse(xfs_buf_t *bp)
 	xfs_buf_rele(bp);
 }
 
-#define xfs_bpin(bp)		xfs_buf_pin(bp)
-#define xfs_bunpin(bp)		xfs_buf_unpin(bp)
-#define xfs_biodone(bp)		xfs_buf_ioend(bp, 0)
-
-#define xfs_biomove(bp, off, len, data, rw) \
-	    xfs_buf_iomove((bp), (off), (len), (data), \
-		((rw) == XBF_WRITE) ? XBRW_WRITE : XBRW_READ)
-
-#define xfs_biozero(bp, off, len) \
-	    xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO)
-
-#define xfs_iowait(bp)	xfs_buf_iowait(bp)
-
-#define xfs_baread(target, rablkno, ralen)  \
-	xfs_buf_readahead((target), (rablkno), (ralen), XBF_DONT_BLOCK)
-
-
 /*
  *	Handling of buftargs.
  */
-extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int, const char *);
+extern xfs_buftarg_t *xfs_alloc_buftarg(struct xfs_mount *,
+			struct block_device *, int, const char *);
 extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *);
 extern void xfs_wait_buftarg(xfs_buftarg_t *);
 extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int);
diff --git a/fs/xfs/linux-2.6/xfs_cred.h b/fs/xfs/linux-2.6/xfs_cred.h
deleted file mode 100644
index 55bddf3b609..00000000000
--- a/fs/xfs/linux-2.6/xfs_cred.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_CRED_H__
-#define __XFS_CRED_H__
-
-#include <linux/capability.h>
-
-/*
- * Credentials
- */
-typedef const struct cred cred_t;
-
-#endif  /* __XFS_CRED_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_dmapi_priv.h b/fs/xfs/linux-2.6/xfs_dmapi_priv.h
deleted file mode 100644
index a8b0b1685ee..00000000000
--- a/fs/xfs/linux-2.6/xfs_dmapi_priv.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * Copyright (c) 2000-2006 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_DMAPI_PRIV_H__
-#define __XFS_DMAPI_PRIV_H__
-
-/*
- *	Based on IO_ISDIRECT, decide which i_ flag is set.
- */
-#define DM_SEM_FLAG_RD(ioflags) (((ioflags) & IO_ISDIRECT) ? \
-			      DM_FLAGS_IMUX : 0)
-#define DM_SEM_FLAG_WR	(DM_FLAGS_IALLOCSEM_WR | DM_FLAGS_IMUX)
-
-#endif /*__XFS_DMAPI_PRIV_H__*/
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index e7839ee49e4..3764d74790e 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -23,13 +23,13 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_export.h"
 #include "xfs_vnodeops.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
+#include "xfs_trace.h"
 
 /*
  * Note that we only accept fileids which are long enough rather than allow
@@ -132,8 +132,7 @@ xfs_nfs_get_inode(
 	 * fine and not an indication of a corrupted filesystem as clients can
 	 * send invalid file handles and we have to handle it gracefully..
 	 */
-	error = xfs_iget(mp, NULL, ino, XFS_IGET_UNTRUSTED,
-			 XFS_ILOCK_SHARED, &ip);
+	error = xfs_iget(mp, NULL, ino, XFS_IGET_UNTRUSTED, 0, &ip);
 	if (error) {
 		/*
 		 * EINVAL means the inode cluster doesn't exist anymore.
@@ -148,11 +147,10 @@ xfs_nfs_get_inode(
 	}
 
 	if (ip->i_d.di_gen != generation) {
-		xfs_iput_new(ip, XFS_ILOCK_SHARED);
+		IRELE(ip);
 		return ERR_PTR(-ENOENT);
 	}
 
-	xfs_iunlock(ip, XFS_ILOCK_SHARED);
 	return VFS_I(ip);
 }
 
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 257a56b127c..ba8ad422a16 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -22,23 +22,15 @@
 #include "xfs_inum.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
 #include "xfs_trans.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
 #include "xfs_alloc.h"
-#include "xfs_btree.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dir2_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
 #include "xfs_bmap.h"
 #include "xfs_error.h"
-#include "xfs_rw.h"
 #include "xfs_vnodeops.h"
 #include "xfs_da_btree.h"
 #include "xfs_ioctl.h"
@@ -108,7 +100,7 @@ xfs_file_fsync(
 	int			error = 0;
 	int			log_flushed = 0;
 
-	xfs_itrace_entry(ip);
+	trace_xfs_file_fsync(ip);
 
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 		return -XFS_ERROR(EIO);
@@ -166,8 +158,7 @@ xfs_file_fsync(
 		 * transaction.	 So we play it safe and fire off the
 		 * transaction anyway.
 		 */
-		xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-		xfs_trans_ihold(tp, ip);
+		xfs_trans_ijoin(tp, ip);
 		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 		xfs_trans_set_sync(tp);
 		error = _xfs_trans_commit(tp, 0, &log_flushed);
@@ -275,20 +266,6 @@ xfs_file_aio_read(
 		mutex_lock(&inode->i_mutex);
 	xfs_ilock(ip, XFS_IOLOCK_SHARED);
 
-	if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) {
-		int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags);
-		int iolock = XFS_IOLOCK_SHARED;
-
-		ret = -XFS_SEND_DATA(mp, DM_EVENT_READ, ip, iocb->ki_pos, size,
-					dmflags, &iolock);
-		if (ret) {
-			xfs_iunlock(ip, XFS_IOLOCK_SHARED);
-			if (unlikely(ioflags & IO_ISDIRECT))
-				mutex_unlock(&inode->i_mutex);
-			return ret;
-		}
-	}
-
 	if (unlikely(ioflags & IO_ISDIRECT)) {
 		if (inode->i_mapping->nrpages) {
 			ret = -xfs_flushinval_pages(ip,
@@ -321,7 +298,6 @@ xfs_file_splice_read(
 	unsigned int		flags)
 {
 	struct xfs_inode	*ip = XFS_I(infilp->f_mapping->host);
-	struct xfs_mount	*mp = ip->i_mount;
 	int			ioflags = 0;
 	ssize_t			ret;
 
@@ -335,18 +311,6 @@ xfs_file_splice_read(
 
 	xfs_ilock(ip, XFS_IOLOCK_SHARED);
 
-	if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) {
-		int iolock = XFS_IOLOCK_SHARED;
-		int error;
-
-		error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *ppos, count,
-					FILP_DELAY_FLAG(infilp), &iolock);
-		if (error) {
-			xfs_iunlock(ip, XFS_IOLOCK_SHARED);
-			return -error;
-		}
-	}
-
 	trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
 
 	ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
@@ -367,7 +331,6 @@ xfs_file_splice_write(
 {
 	struct inode		*inode = outfilp->f_mapping->host;
 	struct xfs_inode	*ip = XFS_I(inode);
-	struct xfs_mount	*mp = ip->i_mount;
 	xfs_fsize_t		isize, new_size;
 	int			ioflags = 0;
 	ssize_t			ret;
@@ -382,18 +345,6 @@ xfs_file_splice_write(
 
 	xfs_ilock(ip, XFS_IOLOCK_EXCL);
 
-	if (DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) && !(ioflags & IO_INVIS)) {
-		int iolock = XFS_IOLOCK_EXCL;
-		int error;
-
-		error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, *ppos, count,
-					FILP_DELAY_FLAG(outfilp), &iolock);
-		if (error) {
-			xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-			return -error;
-		}
-	}
-
 	new_size = *ppos + count;
 
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -463,7 +414,7 @@ xfs_zero_last_block(
 	last_fsb = XFS_B_TO_FSBT(mp, isize);
 	nimaps = 1;
 	error = xfs_bmapi(NULL, ip, last_fsb, 1, 0, NULL, 0, &imap,
-			  &nimaps, NULL, NULL);
+			  &nimaps, NULL);
 	if (error) {
 		return error;
 	}
@@ -558,7 +509,7 @@ xfs_zero_eof(
 		nimaps = 1;
 		zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
 		error = xfs_bmapi(NULL, ip, start_zero_fsb, zero_count_fsb,
-				  0, NULL, 0, &imap, &nimaps, NULL, NULL);
+				  0, NULL, 0, &imap, &nimaps, NULL);
 		if (error) {
 			ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
 			return error;
@@ -627,7 +578,6 @@ xfs_file_aio_write(
 	int			ioflags = 0;
 	xfs_fsize_t		isize, new_size;
 	int			iolock;
-	int			eventsent = 0;
 	size_t			ocount = 0, count;
 	int			need_i_mutex;
 
@@ -673,33 +623,6 @@ start:
 		goto out_unlock_mutex;
 	}
 
-	if ((DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) &&
-	    !(ioflags & IO_INVIS) && !eventsent)) {
-		int		dmflags = FILP_DELAY_FLAG(file);
-
-		if (need_i_mutex)
-			dmflags |= DM_FLAGS_IMUX;
-
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
-		error = XFS_SEND_DATA(ip->i_mount, DM_EVENT_WRITE, ip,
-				      pos, count, dmflags, &iolock);
-		if (error) {
-			goto out_unlock_internal;
-		}
-		xfs_ilock(ip, XFS_ILOCK_EXCL);
-		eventsent = 1;
-
-		/*
-		 * The iolock was dropped and reacquired in XFS_SEND_DATA
-		 * so we have to recheck the size when appending.
-		 * We will only "goto start;" once, since having sent the
-		 * event prevents another call to XFS_SEND_DATA, which is
-		 * what allows the size to change in the first place.
-		 */
-		if ((file->f_flags & O_APPEND) && pos != ip->i_size)
-			goto start;
-	}
-
 	if (ioflags & IO_ISDIRECT) {
 		xfs_buftarg_t	*target =
 			XFS_IS_REALTIME_INODE(ip) ?
@@ -830,22 +753,6 @@ write_retry:
 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	}
 
-	if (ret == -ENOSPC &&
-	    DM_EVENT_ENABLED(ip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) {
-		xfs_iunlock(ip, iolock);
-		if (need_i_mutex)
-			mutex_unlock(&inode->i_mutex);
-		error = XFS_SEND_NAMESP(ip->i_mount, DM_EVENT_NOSPACE, ip,
-				DM_RIGHT_NULL, ip, DM_RIGHT_NULL, NULL, NULL,
-				0, 0, 0); /* Delay flag intentionally  unused */
-		if (need_i_mutex)
-			mutex_lock(&inode->i_mutex);
-		xfs_ilock(ip, iolock);
-		if (error)
-			goto out_unlock_internal;
-		goto start;
-	}
-
 	error = -ret;
 	if (ret <= 0)
 		goto out_unlock_internal;
@@ -1014,9 +921,6 @@ const struct file_operations xfs_file_operations = {
 	.open		= xfs_file_open,
 	.release	= xfs_file_release,
 	.fsync		= xfs_file_fsync,
-#ifdef HAVE_FOP_OPEN_EXEC
-	.open_exec	= xfs_file_open_exec,
-#endif
 };
 
 const struct file_operations xfs_dir_file_operations = {
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c
index b6918d76bc7..ed88ed16811 100644
--- a/fs/xfs/linux-2.6/xfs_fs_subr.c
+++ b/fs/xfs/linux-2.6/xfs_fs_subr.c
@@ -21,10 +21,6 @@
 #include "xfs_inode.h"
 #include "xfs_trace.h"
 
-int  fs_noerr(void) { return 0; }
-int  fs_nosys(void) { return ENOSYS; }
-void fs_noval(void) { return; }
-
 /*
  * note: all filemap functions return negative error codes. These
  * need to be inverted before returning to the xfs core functions.
@@ -36,10 +32,9 @@ xfs_tosspages(
 	xfs_off_t	last,
 	int		fiopt)
 {
-	struct address_space *mapping = VFS_I(ip)->i_mapping;
-
-	if (mapping->nrpages)
-		truncate_inode_pages(mapping, first);
+	/* can't toss partial tail pages, so mask them out */
+	last &= ~(PAGE_SIZE - 1);
+	truncate_inode_pages_range(VFS_I(ip)->i_mapping, first, last - 1);
 }
 
 int
@@ -54,12 +49,11 @@ xfs_flushinval_pages(
 
 	trace_xfs_pagecache_inval(ip, first, last);
 
-	if (mapping->nrpages) {
-		xfs_iflags_clear(ip, XFS_ITRUNCATED);
-		ret = filemap_write_and_wait(mapping);
-		if (!ret)
-			truncate_inode_pages(mapping, first);
-	}
+	xfs_iflags_clear(ip, XFS_ITRUNCATED);
+	ret = filemap_write_and_wait_range(mapping, first,
+				last == -1 ? LLONG_MAX : last);
+	if (!ret)
+		truncate_inode_pages_range(mapping, first, last);
 	return -ret;
 }
 
@@ -75,10 +69,9 @@ xfs_flush_pages(
 	int		ret = 0;
 	int		ret2;
 
-	if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
-		xfs_iflags_clear(ip, XFS_ITRUNCATED);
-		ret = -filemap_fdatawrite(mapping);
-	}
+	xfs_iflags_clear(ip, XFS_ITRUNCATED);
+	ret = -filemap_fdatawrite_range(mapping, first,
+				last == -1 ? LLONG_MAX : last);
 	if (flags & XBF_ASYNC)
 		return ret;
 	ret2 = xfs_wait_on_pages(ip, first, last);
@@ -95,7 +88,9 @@ xfs_wait_on_pages(
 {
 	struct address_space *mapping = VFS_I(ip)->i_mapping;
 
-	if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK))
-		return -filemap_fdatawait(mapping);
+	if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) {
+		return -filemap_fdatawait_range(mapping, first,
+					last == -1 ? ip->i_size - 1 : last);
+	}
 	return 0;
 }
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.h b/fs/xfs/linux-2.6/xfs_fs_subr.h
deleted file mode 100644
index 82bb19b2599..00000000000
--- a/fs/xfs/linux-2.6/xfs_fs_subr.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/*
- * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef	__XFS_FS_SUBR_H__
-#define __XFS_FS_SUBR_H__
-
-extern int  fs_noerr(void);
-extern int  fs_nosys(void);
-extern void fs_noval(void);
-
-#endif	/* __XFS_FS_SUBR_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_globals.c b/fs/xfs/linux-2.6/xfs_globals.c
index 2ae8b1ccb02..76e81cff70b 100644
--- a/fs/xfs/linux-2.6/xfs_globals.c
+++ b/fs/xfs/linux-2.6/xfs_globals.c
@@ -16,7 +16,6 @@
  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 #include "xfs.h"
-#include "xfs_cred.h"
 #include "xfs_sysctl.h"
 
 /*
diff --git a/fs/xfs/linux-2.6/xfs_globals.h b/fs/xfs/linux-2.6/xfs_globals.h
deleted file mode 100644
index 69f71caf061..00000000000
--- a/fs/xfs/linux-2.6/xfs_globals.h
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_GLOBALS_H__
-#define __XFS_GLOBALS_H__
-
-extern uint64_t	xfs_panic_mask;		/* set to cause more panics */
-
-#endif	/* __XFS_GLOBALS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index e59a8106283..2ea238f6d38 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -23,24 +23,15 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
 #include "xfs_alloc.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dir2_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_ioctl.h"
-#include "xfs_btree.h"
-#include "xfs_ialloc.h"
 #include "xfs_rtalloc.h"
 #include "xfs_itable.h"
 #include "xfs_error.h"
-#include "xfs_rw.h"
 #include "xfs_attr.h"
 #include "xfs_bmap.h"
 #include "xfs_buf_item.h"
@@ -794,10 +785,12 @@ xfs_ioc_fsgetxattr(
 {
 	struct fsxattr		fa;
 
+	memset(&fa, 0, sizeof(struct fsxattr));
+
 	xfs_ilock(ip, XFS_ILOCK_SHARED);
 	fa.fsx_xflags = xfs_ip2xflags(ip);
 	fa.fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog;
-	fa.fsx_projid = ip->i_d.di_projid;
+	fa.fsx_projid = xfs_get_projid(ip);
 
 	if (attr) {
 		if (ip->i_afp) {
@@ -908,7 +901,7 @@ xfs_ioctl_setattr(
 	struct xfs_dquot	*olddquot = NULL;
 	int			code;
 
-	xfs_itrace_entry(ip);
+	trace_xfs_ioctl_setattr(ip);
 
 	if (mp->m_flags & XFS_MOUNT_RDONLY)
 		return XFS_ERROR(EROFS);
@@ -916,6 +909,13 @@ xfs_ioctl_setattr(
 		return XFS_ERROR(EIO);
 
 	/*
+	 * Disallow 32bit project ids when projid32bit feature is not enabled.
+	 */
+	if ((mask & FSX_PROJID) && (fa->fsx_projid > (__uint16_t)-1) &&
+			!xfs_sb_version_hasprojid32bit(&ip->i_mount->m_sb))
+		return XFS_ERROR(EINVAL);
+
+	/*
 	 * If disk quotas is on, we make sure that the dquots do exist on disk,
 	 * before we start any other transactions. Trying to do this later
 	 * is messy. We don't care to take a readlock to look at the ids
@@ -961,7 +961,7 @@ xfs_ioctl_setattr(
 	if (mask & FSX_PROJID) {
 		if (XFS_IS_QUOTA_RUNNING(mp) &&
 		    XFS_IS_PQUOTA_ON(mp) &&
-		    ip->i_d.di_projid != fa->fsx_projid) {
+		    xfs_get_projid(ip) != fa->fsx_projid) {
 			ASSERT(tp);
 			code = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp,
 						capable(CAP_FOWNER) ?
@@ -1043,8 +1043,7 @@ xfs_ioctl_setattr(
 		}
 	}
 
-	xfs_trans_ijoin(tp, ip, lock_flags);
-	xfs_trans_ihold(tp, ip);
+	xfs_trans_ijoin(tp, ip);
 
 	/*
 	 * Change file ownership.  Must be the owner or privileged.
@@ -1064,12 +1063,12 @@ xfs_ioctl_setattr(
 		 * Change the ownerships and register quota modifications
 		 * in the transaction.
 		 */
-		if (ip->i_d.di_projid != fa->fsx_projid) {
+		if (xfs_get_projid(ip) != fa->fsx_projid) {
 			if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) {
 				olddquot = xfs_qm_vop_chown(tp, ip,
 							&ip->i_gdquot, gdqp);
 			}
-			ip->i_d.di_projid = fa->fsx_projid;
+			xfs_set_projid(ip, fa->fsx_projid);
 
 			/*
 			 * We may have to rev the inode as well as
@@ -1089,8 +1088,8 @@ xfs_ioctl_setattr(
 		xfs_diflags_to_linux(ip);
 	}
 
+	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-	xfs_ichgtime(ip, XFS_ICHGTIME_CHG);
 
 	XFS_STATS_INC(xs_ig_attrchg);
 
@@ -1116,16 +1115,7 @@ xfs_ioctl_setattr(
 	xfs_qm_dqrele(udqp);
 	xfs_qm_dqrele(gdqp);
 
-	if (code)
-		return code;
-
-	if (DM_EVENT_ENABLED(ip, DM_EVENT_ATTRIBUTE)) {
-		XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, ip, DM_RIGHT_NULL,
-				NULL, DM_RIGHT_NULL, NULL, NULL, 0, 0,
-				(mask & FSX_NONBLOCK) ? DM_FLAGS_NDELAY : 0);
-	}
-
-	return 0;
+	return code;
 
  error_return:
 	xfs_qm_dqrele(udqp);
@@ -1301,7 +1291,7 @@ xfs_file_ioctl(
 	if (filp->f_mode & FMODE_NOCMTIME)
 		ioflags |= IO_INVIS;
 
-	xfs_itrace_entry(ip);
+	trace_xfs_file_ioctl(ip);
 
 	switch (cmd) {
 	case XFS_IOC_ALLOCSP:
@@ -1311,7 +1301,8 @@ xfs_file_ioctl(
 	case XFS_IOC_ALLOCSP64:
 	case XFS_IOC_FREESP64:
 	case XFS_IOC_RESVSP64:
-	case XFS_IOC_UNRESVSP64: {
+	case XFS_IOC_UNRESVSP64:
+	case XFS_IOC_ZERO_RANGE: {
 		xfs_flock64_t		bf;
 
 		if (copy_from_user(&bf, arg, sizeof(bf)))
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index 52ed49e6465..b3486dfa552 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -28,12 +28,8 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dir2_sf.h"
 #include "xfs_vnode.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
@@ -168,7 +164,8 @@ xfs_ioctl32_bstat_copyin(
 	    get_user(bstat->bs_extsize,	&bstat32->bs_extsize)	||
 	    get_user(bstat->bs_extents,	&bstat32->bs_extents)	||
 	    get_user(bstat->bs_gen,	&bstat32->bs_gen)	||
-	    get_user(bstat->bs_projid,	&bstat32->bs_projid)	||
+	    get_user(bstat->bs_projid_lo, &bstat32->bs_projid_lo) ||
+	    get_user(bstat->bs_projid_hi, &bstat32->bs_projid_hi) ||
 	    get_user(bstat->bs_dmevmask, &bstat32->bs_dmevmask)	||
 	    get_user(bstat->bs_dmstate,	&bstat32->bs_dmstate)	||
 	    get_user(bstat->bs_aextents, &bstat32->bs_aextents))
@@ -222,6 +219,7 @@ xfs_bulkstat_one_fmt_compat(
 	    put_user(buffer->bs_extents,  &p32->bs_extents)	||
 	    put_user(buffer->bs_gen,	  &p32->bs_gen)		||
 	    put_user(buffer->bs_projid,	  &p32->bs_projid)	||
+	    put_user(buffer->bs_projid_hi,	&p32->bs_projid_hi)	||
 	    put_user(buffer->bs_dmevmask, &p32->bs_dmevmask)	||
 	    put_user(buffer->bs_dmstate,  &p32->bs_dmstate)	||
 	    put_user(buffer->bs_aextents, &p32->bs_aextents))
@@ -544,7 +542,7 @@ xfs_file_compat_ioctl(
 	if (filp->f_mode & FMODE_NOCMTIME)
 		ioflags |= IO_INVIS;
 
-	xfs_itrace_entry(ip);
+	trace_xfs_file_compat_ioctl(ip);
 
 	switch (cmd) {
 	/* No size or alignment issues on any arch */
@@ -578,6 +576,7 @@ xfs_file_compat_ioctl(
 	case XFS_IOC_FSGEOMETRY_V1:
 	case XFS_IOC_FSGROWFSDATA:
 	case XFS_IOC_FSGROWFSRT:
+	case XFS_IOC_ZERO_RANGE:
 		return xfs_file_ioctl(filp, cmd, p);
 #else
 	case XFS_IOC_ALLOCSP_32:
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.h b/fs/xfs/linux-2.6/xfs_ioctl32.h
index 1024c4f8ba0..08b605792a9 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.h
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.h
@@ -65,8 +65,10 @@ typedef struct compat_xfs_bstat {
 	__s32		bs_extsize;	/* extent size			*/
 	__s32		bs_extents;	/* number of extents		*/
 	__u32		bs_gen;		/* generation count		*/
-	__u16		bs_projid;	/* project id			*/
-	unsigned char	bs_pad[14];	/* pad space, unused		*/
+	__u16		bs_projid_lo;	/* lower part of project id	*/
+#define	bs_projid	bs_projid_lo	/* (previously just bs_projid)	*/
+	__u16		bs_projid_hi;	/* high part of project id	*/
+	unsigned char	bs_pad[12];	/* pad space, unused		*/
 	__u32		bs_dmevmask;	/* DMIG event mask		*/
 	__u16		bs_dmstate;	/* DMIG state info		*/
 	__u16		bs_aextents;	/* attribute number of extents	*/
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 44f0b2de153..96107efc0c6 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -24,21 +24,13 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
 #include "xfs_alloc.h"
-#include "xfs_dmapi.h"
 #include "xfs_quota.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_bmap.h"
-#include "xfs_btree.h"
-#include "xfs_ialloc.h"
 #include "xfs_rtalloc.h"
 #include "xfs_error.h"
 #include "xfs_itable.h"
@@ -88,7 +80,7 @@ xfs_mark_inode_dirty_sync(
 {
 	struct inode	*inode = VFS_I(ip);
 
-	if (!(inode->i_state & (I_WILL_FREE|I_FREEING|I_CLEAR)))
+	if (!(inode->i_state & (I_WILL_FREE|I_FREEING)))
 		mark_inode_dirty_sync(inode);
 }
 
@@ -98,46 +90,11 @@ xfs_mark_inode_dirty(
 {
 	struct inode	*inode = VFS_I(ip);
 
-	if (!(inode->i_state & (I_WILL_FREE|I_FREEING|I_CLEAR)))
+	if (!(inode->i_state & (I_WILL_FREE|I_FREEING)))
 		mark_inode_dirty(inode);
 }
 
 /*
- * Change the requested timestamp in the given inode.
- * We don't lock across timestamp updates, and we don't log them but
- * we do record the fact that there is dirty information in core.
- */
-void
-xfs_ichgtime(
-	xfs_inode_t	*ip,
-	int		flags)
-{
-	struct inode	*inode = VFS_I(ip);
-	timespec_t	tv;
-	int		sync_it = 0;
-
-	tv = current_fs_time(inode->i_sb);
-
-	if ((flags & XFS_ICHGTIME_MOD) &&
-	    !timespec_equal(&inode->i_mtime, &tv)) {
-		inode->i_mtime = tv;
-		sync_it = 1;
-	}
-	if ((flags & XFS_ICHGTIME_CHG) &&
-	    !timespec_equal(&inode->i_ctime, &tv)) {
-		inode->i_ctime = tv;
-		sync_it = 1;
-	}
-
-	/*
-	 * Update complete - now make sure everyone knows that the inode
-	 * is dirty.
-	 */
-	if (sync_it)
-		xfs_mark_inode_dirty_sync(ip);
-}
-
-/*
  * Hook in SELinux.  This is not quite correct yet, what we really need
  * here (as we do for default ACLs) is a mechanism by which creation of
  * these attrs can be journalled at inode creation time (along with the
@@ -232,7 +189,7 @@ xfs_vn_mknod(
 	}
 
 	xfs_dentry_to_name(&name, dentry);
-	error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip, NULL);
+	error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip);
 	if (unlikely(error))
 		goto out_free_acl;
 
@@ -360,7 +317,7 @@ xfs_vn_link(
 	if (unlikely(error))
 		return -error;
 
-	atomic_inc(&inode->i_count);
+	ihold(inode);
 	d_instantiate(dentry, inode);
 	return 0;
 }
@@ -405,7 +362,7 @@ xfs_vn_symlink(
 		(irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO);
 	xfs_dentry_to_name(&name, dentry);
 
-	error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip, NULL);
+	error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip);
 	if (unlikely(error))
 		goto out;
 
@@ -496,7 +453,7 @@ xfs_vn_getattr(
 	struct xfs_inode	*ip = XFS_I(inode);
 	struct xfs_mount	*mp = ip->i_mount;
 
-	xfs_itrace_entry(ip);
+	trace_xfs_getattr(ip);
 
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return XFS_ERROR(EIO);
@@ -548,21 +505,6 @@ xfs_vn_setattr(
 	return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0);
 }
 
-/*
- * block_truncate_page can return an error, but we can't propagate it
- * at all here. Leave a complaint + stack trace in the syslog because
- * this could be bad. If it is bad, we need to propagate the error further.
- */
-STATIC void
-xfs_vn_truncate(
-	struct inode	*inode)
-{
-	int	error;
-	error = block_truncate_page(inode->i_mapping, inode->i_size,
-							xfs_get_blocks);
-	WARN_ON(error);
-}
-
 STATIC long
 xfs_vn_fallocate(
 	struct inode	*inode,
@@ -687,7 +629,7 @@ xfs_vn_fiemap(
 					fieinfo->fi_extents_max + 1;
 	bm.bmv_count = min_t(__s32, bm.bmv_count,
 			     (PAGE_SIZE * 16 / sizeof(struct getbmapx)));
-	bm.bmv_iflags = BMV_IF_PREALLOC;
+	bm.bmv_iflags = BMV_IF_PREALLOC | BMV_IF_NO_HOLES;
 	if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR)
 		bm.bmv_iflags |= BMV_IF_ATTRFORK;
 	if (!(fieinfo->fi_flags & FIEMAP_FLAG_SYNC))
@@ -702,7 +644,6 @@ xfs_vn_fiemap(
 
 static const struct inode_operations xfs_inode_operations = {
 	.check_acl		= xfs_check_acl,
-	.truncate		= xfs_vn_truncate,
 	.getattr		= xfs_vn_getattr,
 	.setattr		= xfs_vn_setattr,
 	.setxattr		= generic_setxattr,
@@ -819,7 +760,9 @@ xfs_setup_inode(
 
 	inode->i_ino = ip->i_ino;
 	inode->i_state = I_NEW;
-	inode_add_to_lists(ip->i_mount->m_super, inode);
+
+	inode_sb_list_add(inode);
+	insert_inode_hash(inode);
 
 	inode->i_mode	= ip->i_d.di_mode;
 	inode->i_nlink	= ip->i_d.di_nlink;
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index facfb323a70..214ddd71ff7 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -71,6 +71,7 @@
 #include <linux/random.h>
 #include <linux/ctype.h>
 #include <linux/writeback.h>
+#include <linux/capability.h>
 
 #include <asm/page.h>
 #include <asm/div64.h>
@@ -79,15 +80,12 @@
 #include <asm/byteorder.h>
 #include <asm/unaligned.h>
 
-#include <xfs_cred.h>
 #include <xfs_vnode.h>
 #include <xfs_stats.h>
 #include <xfs_sysctl.h>
 #include <xfs_iops.h>
 #include <xfs_aops.h>
 #include <xfs_super.h>
-#include <xfs_globals.h>
-#include <xfs_fs_subr.h>
 #include <xfs_buf.h>
 
 /*
@@ -145,7 +143,7 @@
 #define SYNCHRONIZE()	barrier()
 #define __return_address __builtin_return_address(0)
 
-#define dfltprid	0
+#define XFS_PROJID_DEFAULT	0
 #define MAXPATHLEN	1024
 
 #define MIN(a,b)	(min(a,b))
@@ -157,8 +155,6 @@
  */
 #define xfs_sort(a,n,s,fn)	sort(a,n,s,fn,NULL)
 #define xfs_stack_trace()	dump_stack()
-#define xfs_itruncate_data(ip, off)	\
-	(-vmtruncate(VFS_I(ip), (off)))
 
 
 /* Move the kernel do_div definition off to one side */
diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c
index 067cafbfc63..29b9d642e93 100644
--- a/fs/xfs/linux-2.6/xfs_quotaops.c
+++ b/fs/xfs/linux-2.6/xfs_quotaops.c
@@ -16,7 +16,6 @@
  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 #include "xfs.h"
-#include "xfs_dmapi.h"
 #include "xfs_sb.h"
 #include "xfs_inum.h"
 #include "xfs_log.h"
@@ -69,15 +68,15 @@ xfs_fs_set_xstate(
 	if (op != Q_XQUOTARM && !XFS_IS_QUOTA_RUNNING(mp))
 		return -ENOSYS;
 
-	if (uflags & XFS_QUOTA_UDQ_ACCT)
+	if (uflags & FS_QUOTA_UDQ_ACCT)
 		flags |= XFS_UQUOTA_ACCT;
-	if (uflags & XFS_QUOTA_PDQ_ACCT)
+	if (uflags & FS_QUOTA_PDQ_ACCT)
 		flags |= XFS_PQUOTA_ACCT;
-	if (uflags & XFS_QUOTA_GDQ_ACCT)
+	if (uflags & FS_QUOTA_GDQ_ACCT)
 		flags |= XFS_GQUOTA_ACCT;
-	if (uflags & XFS_QUOTA_UDQ_ENFD)
+	if (uflags & FS_QUOTA_UDQ_ENFD)
 		flags |= XFS_UQUOTA_ENFD;
-	if (uflags & (XFS_QUOTA_PDQ_ENFD|XFS_QUOTA_GDQ_ENFD))
+	if (uflags & (FS_QUOTA_PDQ_ENFD|FS_QUOTA_GDQ_ENFD))
 		flags |= XFS_OQUOTA_ENFD;
 
 	switch (op) {
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 80938c736c2..9f3a78fe6ae 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -25,14 +25,11 @@
 #include "xfs_ag.h"
 #include "xfs_dir2.h"
 #include "xfs_alloc.h"
-#include "xfs_dmapi.h"
 #include "xfs_quota.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_btree.h"
@@ -43,12 +40,10 @@
 #include "xfs_error.h"
 #include "xfs_itable.h"
 #include "xfs_fsops.h"
-#include "xfs_rw.h"
 #include "xfs_attr.h"
 #include "xfs_buf_item.h"
 #include "xfs_utils.h"
 #include "xfs_vnodeops.h"
-#include "xfs_version.h"
 #include "xfs_log_priv.h"
 #include "xfs_trans_priv.h"
 #include "xfs_filestream.h"
@@ -94,7 +89,6 @@ mempool_t *xfs_ioend_pool;
 #define MNTOPT_BARRIER	"barrier"	/* use writer barriers for log write and
 					 * unwritten extent conversion */
 #define MNTOPT_NOBARRIER "nobarrier"	/* .. disable */
-#define MNTOPT_OSYNCISOSYNC "osyncisosync" /* o_sync is REALLY o_sync */
 #define MNTOPT_64BITINODE   "inode64"	/* inodes can be allocated anywhere */
 #define MNTOPT_IKEEP	"ikeep"		/* do not free empty inode clusters */
 #define MNTOPT_NOIKEEP	"noikeep"	/* free empty inode clusters */
@@ -116,9 +110,6 @@ mempool_t *xfs_ioend_pool;
 #define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */
 #define MNTOPT_PQUOTANOENF "pqnoenforce"/* project quota limit enforcement */
 #define MNTOPT_QUOTANOENF  "qnoenforce"	/* same as uqnoenforce */
-#define MNTOPT_DMAPI	"dmapi"		/* DMI enabled (DMAPI / XDSM) */
-#define MNTOPT_XDSM	"xdsm"		/* DMI enabled (DMAPI / XDSM) */
-#define MNTOPT_DMI	"dmi"		/* DMI enabled (DMAPI / XDSM) */
 #define MNTOPT_DELAYLOG   "delaylog"	/* Delayed loging enabled */
 #define MNTOPT_NODELAYLOG "nodelaylog"	/* Delayed loging disabled */
 
@@ -172,15 +163,13 @@ suffix_strtoul(char *s, char **endp, unsigned int base)
 STATIC int
 xfs_parseargs(
 	struct xfs_mount	*mp,
-	char			*options,
-	char			**mtpt)
+	char			*options)
 {
 	struct super_block	*sb = mp->m_super;
 	char			*this_char, *value, *eov;
 	int			dsunit = 0;
 	int			dswidth = 0;
 	int			iosize = 0;
-	int			dmapi_implies_ikeep = 1;
 	__uint8_t		iosizelog = 0;
 
 	/*
@@ -243,15 +232,10 @@ xfs_parseargs(
 			if (!mp->m_logname)
 				return ENOMEM;
 		} else if (!strcmp(this_char, MNTOPT_MTPT)) {
-			if (!value || !*value) {
-				cmn_err(CE_WARN,
-					"XFS: %s option requires an argument",
-					this_char);
-				return EINVAL;
-			}
-			*mtpt = kstrndup(value, MAXNAMELEN, GFP_KERNEL);
-			if (!*mtpt)
-				return ENOMEM;
+			cmn_err(CE_WARN,
+				"XFS: %s option not allowed on this system",
+				this_char);
+			return EINVAL;
 		} else if (!strcmp(this_char, MNTOPT_RTDEV)) {
 			if (!value || !*value) {
 				cmn_err(CE_WARN,
@@ -288,8 +272,6 @@ xfs_parseargs(
 			mp->m_flags &= ~XFS_MOUNT_GRPID;
 		} else if (!strcmp(this_char, MNTOPT_WSYNC)) {
 			mp->m_flags |= XFS_MOUNT_WSYNC;
-		} else if (!strcmp(this_char, MNTOPT_OSYNCISOSYNC)) {
-			mp->m_flags |= XFS_MOUNT_OSYNCISOSYNC;
 		} else if (!strcmp(this_char, MNTOPT_NORECOVERY)) {
 			mp->m_flags |= XFS_MOUNT_NORECOVERY;
 		} else if (!strcmp(this_char, MNTOPT_NOALIGN)) {
@@ -329,7 +311,6 @@ xfs_parseargs(
 		} else if (!strcmp(this_char, MNTOPT_IKEEP)) {
 			mp->m_flags |= XFS_MOUNT_IKEEP;
 		} else if (!strcmp(this_char, MNTOPT_NOIKEEP)) {
-			dmapi_implies_ikeep = 0;
 			mp->m_flags &= ~XFS_MOUNT_IKEEP;
 		} else if (!strcmp(this_char, MNTOPT_LARGEIO)) {
 			mp->m_flags &= ~XFS_MOUNT_COMPAT_IOSIZE;
@@ -370,12 +351,6 @@ xfs_parseargs(
 		} else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) {
 			mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
 			mp->m_qflags &= ~XFS_OQUOTA_ENFD;
-		} else if (!strcmp(this_char, MNTOPT_DMAPI)) {
-			mp->m_flags |= XFS_MOUNT_DMAPI;
-		} else if (!strcmp(this_char, MNTOPT_XDSM)) {
-			mp->m_flags |= XFS_MOUNT_DMAPI;
-		} else if (!strcmp(this_char, MNTOPT_DMI)) {
-			mp->m_flags |= XFS_MOUNT_DMAPI;
 		} else if (!strcmp(this_char, MNTOPT_DELAYLOG)) {
 			mp->m_flags |= XFS_MOUNT_DELAYLOG;
 			cmn_err(CE_WARN,
@@ -387,9 +362,11 @@ xfs_parseargs(
 			cmn_err(CE_WARN,
 	"XFS: ihashsize no longer used, option is deprecated.");
 		} else if (!strcmp(this_char, "osyncisdsync")) {
-			/* no-op, this is now the default */
 			cmn_err(CE_WARN,
-	"XFS: osyncisdsync is now the default, option is deprecated.");
+	"XFS: osyncisdsync has no effect, option is deprecated.");
+		} else if (!strcmp(this_char, "osyncisosync")) {
+			cmn_err(CE_WARN,
+	"XFS: osyncisosync has no effect, option is deprecated.");
 		} else if (!strcmp(this_char, "irixsgid")) {
 			cmn_err(CE_WARN,
 	"XFS: irixsgid is now a sysctl(2) variable, option is deprecated.");
@@ -430,12 +407,6 @@ xfs_parseargs(
 		return EINVAL;
 	}
 
-	if ((mp->m_flags & XFS_MOUNT_DMAPI) && (!*mtpt || *mtpt[0] == '\0')) {
-		printk("XFS: %s option needs the mount point option as well\n",
-			MNTOPT_DMAPI);
-		return EINVAL;
-	}
-
 	if ((dsunit && !dswidth) || (!dsunit && dswidth)) {
 		cmn_err(CE_WARN,
 			"XFS: sunit and swidth must be specified together");
@@ -449,18 +420,6 @@ xfs_parseargs(
 		return EINVAL;
 	}
 
-	/*
-	 * Applications using DMI filesystems often expect the
-	 * inode generation number to be monotonically increasing.
-	 * If we delete inode chunks we break this assumption, so
-	 * keep unused inode chunks on disk for DMI filesystems
-	 * until we come up with a better solution.
-	 * Note that if "ikeep" or "noikeep" mount options are
-	 * supplied, then they are honored.
-	 */
-	if ((mp->m_flags & XFS_MOUNT_DMAPI) && dmapi_implies_ikeep)
-		mp->m_flags |= XFS_MOUNT_IKEEP;
-
 done:
 	if (!(mp->m_flags & XFS_MOUNT_NOALIGN)) {
 		/*
@@ -539,10 +498,8 @@ xfs_showargs(
 		{ XFS_MOUNT_SWALLOC,		"," MNTOPT_SWALLOC },
 		{ XFS_MOUNT_NOUUID,		"," MNTOPT_NOUUID },
 		{ XFS_MOUNT_NORECOVERY,		"," MNTOPT_NORECOVERY },
-		{ XFS_MOUNT_OSYNCISOSYNC,	"," MNTOPT_OSYNCISOSYNC },
 		{ XFS_MOUNT_ATTR2,		"," MNTOPT_ATTR2 },
 		{ XFS_MOUNT_FILESTREAMS,	"," MNTOPT_FILESTREAM },
-		{ XFS_MOUNT_DMAPI,		"," MNTOPT_DMAPI },
 		{ XFS_MOUNT_GRPID,		"," MNTOPT_GRPID },
 		{ XFS_MOUNT_DELAYLOG,		"," MNTOPT_DELAYLOG },
 		{ 0, NULL }
@@ -619,7 +576,7 @@ xfs_max_file_offset(
 
 	/* Figure out maximum filesize, on Linux this can depend on
 	 * the filesystem blocksize (on 32 bit platforms).
-	 * __block_prepare_write does this in an [unsigned] long...
+	 * __block_write_begin does this in an [unsigned] long...
 	 *      page->index << (PAGE_CACHE_SHIFT - bbits)
 	 * So, for page sized blocks (4K on 32 bit platforms),
 	 * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is
@@ -687,7 +644,7 @@ xfs_barrier_test(
 	XFS_BUF_ORDERED(sbp);
 
 	xfsbdstrat(mp, sbp);
-	error = xfs_iowait(sbp);
+	error = xfs_buf_iowait(sbp);
 
 	/*
 	 * Clear all the flags we set and possible error state in the
@@ -735,8 +692,7 @@ void
 xfs_blkdev_issue_flush(
 	xfs_buftarg_t		*buftarg)
 {
-	blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL,
-			BLKDEV_IFL_WAIT);
+	blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL);
 }
 
 STATIC void
@@ -800,18 +756,20 @@ xfs_open_devices(
 	 * Setup xfs_mount buffer target pointers
 	 */
 	error = ENOMEM;
-	mp->m_ddev_targp = xfs_alloc_buftarg(ddev, 0, mp->m_fsname);
+	mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev, 0, mp->m_fsname);
 	if (!mp->m_ddev_targp)
 		goto out_close_rtdev;
 
 	if (rtdev) {
-		mp->m_rtdev_targp = xfs_alloc_buftarg(rtdev, 1, mp->m_fsname);
+		mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev, 1,
+							mp->m_fsname);
 		if (!mp->m_rtdev_targp)
 			goto out_free_ddev_targ;
 	}
 
 	if (logdev && logdev != ddev) {
-		mp->m_logdev_targp = xfs_alloc_buftarg(logdev, 1, mp->m_fsname);
+		mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev, 1,
+							mp->m_fsname);
 		if (!mp->m_logdev_targp)
 			goto out_free_rtdev_targ;
 	} else {
@@ -947,7 +905,7 @@ xfs_fs_destroy_inode(
 {
 	struct xfs_inode	*ip = XFS_I(inode);
 
-	xfs_itrace_entry(ip);
+	trace_xfs_destroy_inode(ip);
 
 	XFS_STATS_INC(vn_reclaim);
 
@@ -1014,12 +972,7 @@ xfs_fs_inode_init_once(
 
 /*
  * Dirty the XFS inode when mark_inode_dirty_sync() is called so that
- * we catch unlogged VFS level updates to the inode. Care must be taken
- * here - the transaction code calls mark_inode_dirty_sync() to mark the
- * VFS inode dirty in a transaction and clears the i_update_core field;
- * it must clear the field after calling mark_inode_dirty_sync() to
- * correctly indicate that the dirty state has been propagated into the
- * inode log item.
+ * we catch unlogged VFS level updates to the inode.
  *
  * We need the barrier() to maintain correct ordering between unlogged
  * updates and the transaction commit code that clears the i_update_core
@@ -1063,10 +1016,8 @@ xfs_log_inode(
 	 * an inode in another recent transaction.  So we play it safe and
 	 * fire off the transaction anyway.
 	 */
-	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-	xfs_trans_ihold(tp, ip);
+	xfs_trans_ijoin(tp, ip);
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-	xfs_trans_set_sync(tp);
 	error = xfs_trans_commit(tp, 0);
 	xfs_ilock_demote(ip, XFS_ILOCK_EXCL);
 
@@ -1082,27 +1033,18 @@ xfs_fs_write_inode(
 	struct xfs_mount	*mp = ip->i_mount;
 	int			error = EAGAIN;
 
-	xfs_itrace_entry(ip);
+	trace_xfs_write_inode(ip);
 
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return XFS_ERROR(EIO);
 
 	if (wbc->sync_mode == WB_SYNC_ALL) {
 		/*
-		 * Make sure the inode has hit stable storage.  By using the
-		 * log and the fsync transactions we reduce the IOs we have
-		 * to do here from two (log and inode) to just the log.
-		 *
-		 * Note: We still need to do a delwri write of the inode after
-		 * this to flush it to the backing buffer so that bulkstat
-		 * works properly if this is the first time the inode has been
-		 * written.  Because we hold the ilock atomically over the
-		 * transaction commit and the inode flush we are guaranteed
-		 * that the inode is not pinned when it returns. If the flush
-		 * lock is already held, then the inode has already been
-		 * flushed once and we don't need to flush it again.  Hence
-		 * the code will only flush the inode if it isn't already
-		 * being flushed.
+		 * Make sure the inode has made it it into the log.  Instead
+		 * of forcing it all the way to stable storage using a
+		 * synchronous transaction we let the log force inside the
+		 * ->sync_fs call do that for thus, which reduces the number
+		 * of synchronous log foces dramatically.
 		 */
 		xfs_ioend_wait(ip);
 		xfs_ilock(ip, XFS_ILOCK_SHARED);
@@ -1116,27 +1058,29 @@ xfs_fs_write_inode(
 		 * We make this non-blocking if the inode is contended, return
 		 * EAGAIN to indicate to the caller that they did not succeed.
 		 * This prevents the flush path from blocking on inodes inside
-		 * another operation right now, they get caught later by xfs_sync.
+		 * another operation right now, they get caught later by
+		 * xfs_sync.
 		 */
 		if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
 			goto out;
-	}
 
-	if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip))
-		goto out_unlock;
+		if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip))
+			goto out_unlock;
 
-	/*
-	 * Now we have the flush lock and the inode is not pinned, we can check
-	 * if the inode is really clean as we know that there are no pending
-	 * transaction completions, it is not waiting on the delayed write
-	 * queue and there is no IO in progress.
-	 */
-	if (xfs_inode_clean(ip)) {
-		xfs_ifunlock(ip);
-		error = 0;
-		goto out_unlock;
+		/*
+		 * Now we have the flush lock and the inode is not pinned, we
+		 * can check if the inode is really clean as we know that
+		 * there are no pending transaction completions, it is not
+		 * waiting on the delayed write queue and there is no IO in
+		 * progress.
+		 */
+		if (xfs_inode_clean(ip)) {
+			xfs_ifunlock(ip);
+			error = 0;
+			goto out_unlock;
+		}
+		error = xfs_iflush(ip, 0);
 	}
-	error = xfs_iflush(ip, 0);
 
  out_unlock:
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
@@ -1151,12 +1095,15 @@ xfs_fs_write_inode(
 }
 
 STATIC void
-xfs_fs_clear_inode(
+xfs_fs_evict_inode(
 	struct inode		*inode)
 {
 	xfs_inode_t		*ip = XFS_I(inode);
 
-	xfs_itrace_entry(ip);
+	trace_xfs_evict_inode(ip);
+
+	truncate_inode_pages(&inode->i_data, 0);
+	end_writeback(inode);
 	XFS_STATS_INC(vn_rele);
 	XFS_STATS_INC(vn_remove);
 	XFS_STATS_DEC(vn_active);
@@ -1193,22 +1140,13 @@ xfs_fs_put_super(
 {
 	struct xfs_mount	*mp = XFS_M(sb);
 
+	/*
+	 * Unregister the memory shrinker before we tear down the mount
+	 * structure so we don't have memory reclaim racing with us here.
+	 */
+	xfs_inode_shrinker_unregister(mp);
 	xfs_syncd_stop(mp);
 
-	if (!(sb->s_flags & MS_RDONLY)) {
-		/*
-		 * XXX(hch): this should be SYNC_WAIT.
-		 *
-		 * Or more likely not needed at all because the VFS is already
-		 * calling ->sync_fs after shutting down all filestem
-		 * operations and just before calling ->put_super.
-		 */
-		xfs_sync_data(mp, 0);
-		xfs_sync_attr(mp, 0);
-	}
-
-	XFS_SEND_PREUNMOUNT(mp);
-
 	/*
 	 * Blow away any referenced inode in the filestreams cache.
 	 * This can and will cause log traffic as inodes go inactive
@@ -1218,14 +1156,10 @@ xfs_fs_put_super(
 
 	XFS_bflush(mp->m_ddev_targp);
 
-	XFS_SEND_UNMOUNT(mp);
-
 	xfs_unmountfs(mp);
 	xfs_freesb(mp);
-	xfs_inode_shrinker_unregister(mp);
 	xfs_icsb_destroy_counters(mp);
 	xfs_close_devices(mp);
-	xfs_dmops_put(mp);
 	xfs_free_fsname(mp);
 	kfree(mp);
 }
@@ -1287,6 +1221,7 @@ xfs_fs_statfs(
 	struct xfs_inode	*ip = XFS_I(dentry->d_inode);
 	__uint64_t		fakeinos, id;
 	xfs_extlen_t		lsize;
+	__int64_t		ffree;
 
 	statp->f_type = XFS_SB_MAGIC;
 	statp->f_namelen = MAXNAMELEN - 1;
@@ -1310,7 +1245,11 @@ xfs_fs_statfs(
 		statp->f_files = min_t(typeof(statp->f_files),
 					statp->f_files,
 					mp->m_maxicount);
-	statp->f_ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree);
+
+	/* make sure statp->f_ffree does not underflow */
+	ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree);
+	statp->f_ffree = max_t(__int64_t, ffree, 0);
+
 	spin_unlock(&mp->m_sb_lock);
 
 	if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) ||
@@ -1463,7 +1402,7 @@ xfs_fs_freeze(
 
 	xfs_save_resvblks(mp);
 	xfs_quiesce_attr(mp);
-	return -xfs_fs_log_dummy(mp);
+	return -xfs_fs_log_dummy(mp, SYNC_WAIT);
 }
 
 STATIC int
@@ -1543,7 +1482,6 @@ xfs_fs_fill_super(
 	struct inode		*root;
 	struct xfs_mount	*mp = NULL;
 	int			flags = 0, error = ENOMEM;
-	char			*mtpt = NULL;
 
 	mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL);
 	if (!mp)
@@ -1559,7 +1497,7 @@ xfs_fs_fill_super(
 	mp->m_super = sb;
 	sb->s_fs_info = mp;
 
-	error = xfs_parseargs(mp, (char *)data, &mtpt);
+	error = xfs_parseargs(mp, (char *)data);
 	if (error)
 		goto out_free_fsname;
 
@@ -1571,19 +1509,16 @@ xfs_fs_fill_super(
 #endif
 	sb->s_op = &xfs_super_operations;
 
-	error = xfs_dmops_get(mp);
-	if (error)
-		goto out_free_fsname;
-
 	if (silent)
 		flags |= XFS_MFSI_QUIET;
 
 	error = xfs_open_devices(mp);
 	if (error)
-		goto out_put_dmops;
+		goto out_free_fsname;
 
-	if (xfs_icsb_init_counters(mp))
-		mp->m_flags |= XFS_MOUNT_NO_PERCPU_SB;
+	error = xfs_icsb_init_counters(mp);
+	if (error)
+		goto out_close_devices;
 
 	error = xfs_readsb(mp, flags);
 	if (error)
@@ -1608,8 +1543,6 @@ xfs_fs_fill_super(
 	if (error)
 		goto out_filestream_unmount;
 
-	XFS_SEND_MOUNT(mp, DM_RIGHT_NULL, mtpt, mp->m_fsname);
-
 	sb->s_magic = XFS_SB_MAGIC;
 	sb->s_blocksize = mp->m_sb.sb_blocksize;
 	sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1;
@@ -1638,7 +1571,6 @@ xfs_fs_fill_super(
 
 	xfs_inode_shrinker_register(mp);
 
-	kfree(mtpt);
 	return 0;
 
  out_filestream_unmount:
@@ -1647,12 +1579,10 @@ xfs_fs_fill_super(
 	xfs_freesb(mp);
  out_destroy_counters:
 	xfs_icsb_destroy_counters(mp);
+ out_close_devices:
 	xfs_close_devices(mp);
- out_put_dmops:
-	xfs_dmops_put(mp);
  out_free_fsname:
 	xfs_free_fsname(mp);
-	kfree(mtpt);
 	kfree(mp);
  out:
 	return -error;
@@ -1679,16 +1609,14 @@ xfs_fs_fill_super(
 	goto out_free_sb;
 }
 
-STATIC int
-xfs_fs_get_sb(
+STATIC struct dentry *
+xfs_fs_mount(
 	struct file_system_type	*fs_type,
 	int			flags,
 	const char		*dev_name,
-	void			*data,
-	struct vfsmount		*mnt)
+	void			*data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super,
-			   mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super);
 }
 
 static const struct super_operations xfs_super_operations = {
@@ -1696,7 +1624,7 @@ static const struct super_operations xfs_super_operations = {
 	.destroy_inode		= xfs_fs_destroy_inode,
 	.dirty_inode		= xfs_fs_dirty_inode,
 	.write_inode		= xfs_fs_write_inode,
-	.clear_inode		= xfs_fs_clear_inode,
+	.evict_inode		= xfs_fs_evict_inode,
 	.put_super		= xfs_fs_put_super,
 	.sync_fs		= xfs_fs_sync_fs,
 	.freeze_fs		= xfs_fs_freeze,
@@ -1709,7 +1637,7 @@ static const struct super_operations xfs_super_operations = {
 static struct file_system_type xfs_fs_type = {
 	.owner			= THIS_MODULE,
 	.name			= "xfs",
-	.get_sb			= xfs_fs_get_sb,
+	.mount			= xfs_fs_mount,
 	.kill_sb		= kill_block_super,
 	.fs_flags		= FS_REQUIRES_DEV,
 };
@@ -1759,6 +1687,12 @@ xfs_init_zones(void)
 	if (!xfs_trans_zone)
 		goto out_destroy_ifork_zone;
 
+	xfs_log_item_desc_zone =
+		kmem_zone_init(sizeof(struct xfs_log_item_desc),
+			       "xfs_log_item_desc");
+	if (!xfs_log_item_desc_zone)
+		goto out_destroy_trans_zone;
+
 	/*
 	 * The size of the zone allocated buf log item is the maximum
 	 * size possible under XFS.  This wastes a little bit of memory,
@@ -1768,7 +1702,7 @@ xfs_init_zones(void)
 				(((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) /
 				  NBWORD) * sizeof(int))), "xfs_buf_item");
 	if (!xfs_buf_item_zone)
-		goto out_destroy_trans_zone;
+		goto out_destroy_log_item_desc_zone;
 
 	xfs_efd_zone = kmem_zone_init((sizeof(xfs_efd_log_item_t) +
 			((XFS_EFD_MAX_FAST_EXTENTS - 1) *
@@ -1805,6 +1739,8 @@ xfs_init_zones(void)
 	kmem_zone_destroy(xfs_efd_zone);
  out_destroy_buf_item_zone:
 	kmem_zone_destroy(xfs_buf_item_zone);
+ out_destroy_log_item_desc_zone:
+	kmem_zone_destroy(xfs_log_item_desc_zone);
  out_destroy_trans_zone:
 	kmem_zone_destroy(xfs_trans_zone);
  out_destroy_ifork_zone:
@@ -1835,6 +1771,7 @@ xfs_destroy_zones(void)
 	kmem_zone_destroy(xfs_efi_zone);
 	kmem_zone_destroy(xfs_efd_zone);
 	kmem_zone_destroy(xfs_buf_item_zone);
+	kmem_zone_destroy(xfs_log_item_desc_zone);
 	kmem_zone_destroy(xfs_trans_zone);
 	kmem_zone_destroy(xfs_ifork_zone);
 	kmem_zone_destroy(xfs_dabuf_zone);
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
index 519618e9279..50a3266c999 100644
--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -56,23 +56,17 @@ extern void xfs_qm_exit(void);
 # define XFS_BIGFS_STRING
 #endif
 
-#ifdef CONFIG_XFS_DMAPI
-# define XFS_DMAPI_STRING	"dmapi support, "
-#else
-# define XFS_DMAPI_STRING
-#endif
-
 #ifdef DEBUG
 # define XFS_DBG_STRING		"debug"
 #else
 # define XFS_DBG_STRING		"no debug"
 #endif
 
+#define XFS_VERSION_STRING	"SGI XFS"
 #define XFS_BUILD_OPTIONS	XFS_ACL_STRING \
 				XFS_SECURITY_STRING \
 				XFS_REALTIME_STRING \
 				XFS_BIGFS_STRING \
-				XFS_DMAPI_STRING \
 				XFS_DBG_STRING /* DBG must be last */
 
 struct xfs_inode;
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index a51a07c3a70..37d33254981 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -24,67 +24,54 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_inode.h"
 #include "xfs_dinode.h"
 #include "xfs_error.h"
-#include "xfs_mru_cache.h"
 #include "xfs_filestream.h"
 #include "xfs_vnodeops.h"
-#include "xfs_utils.h"
-#include "xfs_buf_item.h"
 #include "xfs_inode_item.h"
-#include "xfs_rw.h"
 #include "xfs_quota.h"
 #include "xfs_trace.h"
+#include "xfs_fsops.h"
 
 #include <linux/kthread.h>
 #include <linux/freezer.h>
 
+/*
+ * The inode lookup is done in batches to keep the amount of lock traffic and
+ * radix tree lookups to a minimum. The batch size is a trade off between
+ * lookup reduction and stack usage. This is in the reclaim path, so we can't
+ * be too greedy.
+ */
+#define XFS_LOOKUP_BATCH	32
 
-STATIC xfs_inode_t *
-xfs_inode_ag_lookup(
-	struct xfs_mount	*mp,
-	struct xfs_perag	*pag,
-	uint32_t		*first_index,
-	int			tag)
+STATIC int
+xfs_inode_ag_walk_grab(
+	struct xfs_inode	*ip)
 {
-	int			nr_found;
-	struct xfs_inode	*ip;
+	struct inode		*inode = VFS_I(ip);
 
-	/*
-	 * use a gang lookup to find the next inode in the tree
-	 * as the tree is sparse and a gang lookup walks to find
-	 * the number of objects requested.
-	 */
-	if (tag == XFS_ICI_NO_TAG) {
-		nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
-				(void **)&ip, *first_index, 1);
-	} else {
-		nr_found = radix_tree_gang_lookup_tag(&pag->pag_ici_root,
-				(void **)&ip, *first_index, 1, tag);
+	/* nothing to sync during shutdown */
+	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+		return EFSCORRUPTED;
+
+	/* avoid new or reclaimable inodes. Leave for reclaim code to flush */
+	if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
+		return ENOENT;
+
+	/* If we can't grab the inode, it must on it's way to reclaim. */
+	if (!igrab(inode))
+		return ENOENT;
+
+	if (is_bad_inode(inode)) {
+		IRELE(ip);
+		return ENOENT;
 	}
-	if (!nr_found)
-		return NULL;
 
-	/*
-	 * Update the index for the next lookup. Catch overflows
-	 * into the next AG range which can occur if we have inodes
-	 * in the last block of the AG and we are currently
-	 * pointing to the last inode.
-	 */
-	*first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
-	if (*first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
-		return NULL;
-	return ip;
+	/* inode is valid */
+	return 0;
 }
 
 STATIC int
@@ -93,49 +80,75 @@ xfs_inode_ag_walk(
 	struct xfs_perag	*pag,
 	int			(*execute)(struct xfs_inode *ip,
 					   struct xfs_perag *pag, int flags),
-	int			flags,
-	int			tag,
-	int			exclusive,
-	int			*nr_to_scan)
+	int			flags)
 {
 	uint32_t		first_index;
 	int			last_error = 0;
 	int			skipped;
+	int			done;
+	int			nr_found;
 
 restart:
+	done = 0;
 	skipped = 0;
 	first_index = 0;
+	nr_found = 0;
 	do {
+		struct xfs_inode *batch[XFS_LOOKUP_BATCH];
 		int		error = 0;
-		xfs_inode_t	*ip;
+		int		i;
 
-		if (exclusive)
-			write_lock(&pag->pag_ici_lock);
-		else
-			read_lock(&pag->pag_ici_lock);
-		ip = xfs_inode_ag_lookup(mp, pag, &first_index, tag);
-		if (!ip) {
-			if (exclusive)
-				write_unlock(&pag->pag_ici_lock);
-			else
-				read_unlock(&pag->pag_ici_lock);
+		read_lock(&pag->pag_ici_lock);
+		nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
+					(void **)batch, first_index,
+					XFS_LOOKUP_BATCH);
+		if (!nr_found) {
+			read_unlock(&pag->pag_ici_lock);
 			break;
 		}
 
-		/* execute releases pag->pag_ici_lock */
-		error = execute(ip, pag, flags);
-		if (error == EAGAIN) {
-			skipped++;
-			continue;
+		/*
+		 * Grab the inodes before we drop the lock. if we found
+		 * nothing, nr == 0 and the loop will be skipped.
+		 */
+		for (i = 0; i < nr_found; i++) {
+			struct xfs_inode *ip = batch[i];
+
+			if (done || xfs_inode_ag_walk_grab(ip))
+				batch[i] = NULL;
+
+			/*
+			 * Update the index for the next lookup. Catch overflows
+			 * into the next AG range which can occur if we have inodes
+			 * in the last block of the AG and we are currently
+			 * pointing to the last inode.
+			 */
+			first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
+			if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
+				done = 1;
+		}
+
+		/* unlock now we've grabbed the inodes. */
+		read_unlock(&pag->pag_ici_lock);
+
+		for (i = 0; i < nr_found; i++) {
+			if (!batch[i])
+				continue;
+			error = execute(batch[i], pag, flags);
+			IRELE(batch[i]);
+			if (error == EAGAIN) {
+				skipped++;
+				continue;
+			}
+			if (error && last_error != EFSCORRUPTED)
+				last_error = error;
 		}
-		if (error)
-			last_error = error;
 
 		/* bail out if the filesystem is corrupted.  */
 		if (error == EFSCORRUPTED)
 			break;
 
-	} while ((*nr_to_scan)--);
+	} while (nr_found && !done);
 
 	if (skipped) {
 		delay(1);
@@ -144,110 +157,32 @@ restart:
 	return last_error;
 }
 
-/*
- * Select the next per-ag structure to iterate during the walk. The reclaim
- * walk is optimised only to walk AGs with reclaimable inodes in them.
- */
-static struct xfs_perag *
-xfs_inode_ag_iter_next_pag(
-	struct xfs_mount	*mp,
-	xfs_agnumber_t		*first,
-	int			tag)
-{
-	struct xfs_perag	*pag = NULL;
-
-	if (tag == XFS_ICI_RECLAIM_TAG) {
-		int found;
-		int ref;
-
-		spin_lock(&mp->m_perag_lock);
-		found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
-				(void **)&pag, *first, 1, tag);
-		if (found <= 0) {
-			spin_unlock(&mp->m_perag_lock);
-			return NULL;
-		}
-		*first = pag->pag_agno + 1;
-		/* open coded pag reference increment */
-		ref = atomic_inc_return(&pag->pag_ref);
-		spin_unlock(&mp->m_perag_lock);
-		trace_xfs_perag_get_reclaim(mp, pag->pag_agno, ref, _RET_IP_);
-	} else {
-		pag = xfs_perag_get(mp, *first);
-		(*first)++;
-	}
-	return pag;
-}
-
 int
 xfs_inode_ag_iterator(
 	struct xfs_mount	*mp,
 	int			(*execute)(struct xfs_inode *ip,
 					   struct xfs_perag *pag, int flags),
-	int			flags,
-	int			tag,
-	int			exclusive,
-	int			*nr_to_scan)
+	int			flags)
 {
 	struct xfs_perag	*pag;
 	int			error = 0;
 	int			last_error = 0;
 	xfs_agnumber_t		ag;
-	int			nr;
 
-	nr = nr_to_scan ? *nr_to_scan : INT_MAX;
 	ag = 0;
-	while ((pag = xfs_inode_ag_iter_next_pag(mp, &ag, tag))) {
-		error = xfs_inode_ag_walk(mp, pag, execute, flags, tag,
-						exclusive, &nr);
+	while ((pag = xfs_perag_get(mp, ag))) {
+		ag = pag->pag_agno + 1;
+		error = xfs_inode_ag_walk(mp, pag, execute, flags);
 		xfs_perag_put(pag);
 		if (error) {
 			last_error = error;
 			if (error == EFSCORRUPTED)
 				break;
 		}
-		if (nr <= 0)
-			break;
 	}
-	if (nr_to_scan)
-		*nr_to_scan = nr;
 	return XFS_ERROR(last_error);
 }
 
-/* must be called with pag_ici_lock held and releases it */
-int
-xfs_sync_inode_valid(
-	struct xfs_inode	*ip,
-	struct xfs_perag	*pag)
-{
-	struct inode		*inode = VFS_I(ip);
-	int			error = EFSCORRUPTED;
-
-	/* nothing to sync during shutdown */
-	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
-		goto out_unlock;
-
-	/* avoid new or reclaimable inodes. Leave for reclaim code to flush */
-	error = ENOENT;
-	if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
-		goto out_unlock;
-
-	/* If we can't grab the inode, it must on it's way to reclaim. */
-	if (!igrab(inode))
-		goto out_unlock;
-
-	if (is_bad_inode(inode)) {
-		IRELE(ip);
-		goto out_unlock;
-	}
-
-	/* inode is valid */
-	error = 0;
-out_unlock:
-	read_unlock(&pag->pag_ici_lock);
-	return error;
-}
-
 STATIC int
 xfs_sync_inode_data(
 	struct xfs_inode	*ip,
@@ -258,10 +193,6 @@ xfs_sync_inode_data(
 	struct address_space *mapping = inode->i_mapping;
 	int			error = 0;
 
-	error = xfs_sync_inode_valid(ip, pag);
-	if (error)
-		return error;
-
 	if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
 		goto out_wait;
 
@@ -278,7 +209,6 @@ xfs_sync_inode_data(
  out_wait:
 	if (flags & SYNC_WAIT)
 		xfs_ioend_wait(ip);
-	IRELE(ip);
 	return error;
 }
 
@@ -290,10 +220,6 @@ xfs_sync_inode_attr(
 {
 	int			error = 0;
 
-	error = xfs_sync_inode_valid(ip, pag);
-	if (error)
-		return error;
-
 	xfs_ilock(ip, XFS_ILOCK_SHARED);
 	if (xfs_inode_clean(ip))
 		goto out_unlock;
@@ -312,14 +238,13 @@ xfs_sync_inode_attr(
 
  out_unlock:
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
-	IRELE(ip);
 	return error;
 }
 
 /*
  * Write out pagecache data for the whole filesystem.
  */
-int
+STATIC int
 xfs_sync_data(
 	struct xfs_mount	*mp,
 	int			flags)
@@ -328,8 +253,7 @@ xfs_sync_data(
 
 	ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
 
-	error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags,
-				      XFS_ICI_NO_TAG, 0, NULL);
+	error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags);
 	if (error)
 		return XFS_ERROR(error);
 
@@ -340,48 +264,14 @@ xfs_sync_data(
 /*
  * Write out inode metadata (attributes) for the whole filesystem.
  */
-int
+STATIC int
 xfs_sync_attr(
 	struct xfs_mount	*mp,
 	int			flags)
 {
 	ASSERT((flags & ~SYNC_WAIT) == 0);
 
-	return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags,
-				     XFS_ICI_NO_TAG, 0, NULL);
-}
-
-STATIC int
-xfs_commit_dummy_trans(
-	struct xfs_mount	*mp,
-	uint			flags)
-{
-	struct xfs_inode	*ip = mp->m_rootip;
-	struct xfs_trans	*tp;
-	int			error;
-
-	/*
-	 * Put a dummy transaction in the log to tell recovery
-	 * that all others are OK.
-	 */
-	tp = xfs_trans_alloc(mp, XFS_TRANS_DUMMY1);
-	error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
-	if (error) {
-		xfs_trans_cancel(tp, 0);
-		return error;
-	}
-
-	xfs_ilock(ip, XFS_ILOCK_EXCL);
-
-	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-	xfs_trans_ihold(tp, ip);
-	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-	error = xfs_trans_commit(tp, 0);
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-
-	/* the log force ensures this transaction is pushed to disk */
-	xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0);
-	return error;
+	return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags);
 }
 
 STATIC int
@@ -444,7 +334,7 @@ xfs_quiesce_data(
 
 	/* mark the log as covered if needed */
 	if (xfs_log_need_covered(mp))
-		error2 = xfs_commit_dummy_trans(mp, SYNC_WAIT);
+		error2 = xfs_fs_log_dummy(mp, SYNC_WAIT);
 
 	/* flush data-only devices */
 	if (mp->m_rtdev_targp)
@@ -575,7 +465,7 @@ xfs_flush_inodes(
 /*
  * Every sync period we need to unpin all items, reclaim inodes and sync
  * disk quotas.  We might need to cover the log to indicate that the
- * filesystem is idle.
+ * filesystem is idle and not frozen.
  */
 STATIC void
 xfs_sync_worker(
@@ -589,8 +479,9 @@ xfs_sync_worker(
 		xfs_reclaim_inodes(mp, 0);
 		/* dgc: errors ignored here */
 		error = xfs_qm_sync(mp, SYNC_TRYLOCK);
-		if (xfs_log_need_covered(mp))
-			error = xfs_commit_dummy_trans(mp, 0);
+		if (mp->m_super->s_frozen == SB_UNFROZEN &&
+		    xfs_log_need_covered(mp))
+			error = xfs_fs_log_dummy(mp, 0);
 	}
 	mp->m_sync_seq++;
 	wake_up(&mp->m_wait_single_sync_task);
@@ -710,14 +601,11 @@ xfs_inode_set_reclaim_tag(
 	xfs_perag_put(pag);
 }
 
-void
-__xfs_inode_clear_reclaim_tag(
-	xfs_mount_t	*mp,
+STATIC void
+__xfs_inode_clear_reclaim(
 	xfs_perag_t	*pag,
 	xfs_inode_t	*ip)
 {
-	radix_tree_tag_clear(&pag->pag_ici_root,
-			XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
 	pag->pag_ici_reclaimable--;
 	if (!pag->pag_ici_reclaimable) {
 		/* clear the reclaim tag from the perag radix tree */
@@ -731,6 +619,54 @@ __xfs_inode_clear_reclaim_tag(
 	}
 }
 
+void
+__xfs_inode_clear_reclaim_tag(
+	xfs_mount_t	*mp,
+	xfs_perag_t	*pag,
+	xfs_inode_t	*ip)
+{
+	radix_tree_tag_clear(&pag->pag_ici_root,
+			XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
+	__xfs_inode_clear_reclaim(pag, ip);
+}
+
+/*
+ * Grab the inode for reclaim exclusively.
+ * Return 0 if we grabbed it, non-zero otherwise.
+ */
+STATIC int
+xfs_reclaim_inode_grab(
+	struct xfs_inode	*ip,
+	int			flags)
+{
+
+	/*
+	 * do some unlocked checks first to avoid unnecceary lock traffic.
+	 * The first is a flush lock check, the second is a already in reclaim
+	 * check. Only do these checks if we are not going to block on locks.
+	 */
+	if ((flags & SYNC_TRYLOCK) &&
+	    (!ip->i_flush.done || __xfs_iflags_test(ip, XFS_IRECLAIM))) {
+		return 1;
+	}
+
+	/*
+	 * The radix tree lock here protects a thread in xfs_iget from racing
+	 * with us starting reclaim on the inode.  Once we have the
+	 * XFS_IRECLAIM flag set it will not touch us.
+	 */
+	spin_lock(&ip->i_flags_lock);
+	ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE));
+	if (__xfs_iflags_test(ip, XFS_IRECLAIM)) {
+		/* ignore as it is already under reclaim */
+		spin_unlock(&ip->i_flags_lock);
+		return 1;
+	}
+	__xfs_iflags_set(ip, XFS_IRECLAIM);
+	spin_unlock(&ip->i_flags_lock);
+	return 0;
+}
+
 /*
  * Inodes in different states need to be treated differently, and the return
  * value of xfs_iflush is not sufficient to get this right. The following table
@@ -789,23 +725,6 @@ xfs_reclaim_inode(
 {
 	int	error = 0;
 
-	/*
-	 * The radix tree lock here protects a thread in xfs_iget from racing
-	 * with us starting reclaim on the inode.  Once we have the
-	 * XFS_IRECLAIM flag set it will not touch us.
-	 */
-	spin_lock(&ip->i_flags_lock);
-	ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE));
-	if (__xfs_iflags_test(ip, XFS_IRECLAIM)) {
-		/* ignore as it is already under reclaim */
-		spin_unlock(&ip->i_flags_lock);
-		write_unlock(&pag->pag_ici_lock);
-		return 0;
-	}
-	__xfs_iflags_set(ip, XFS_IRECLAIM);
-	spin_unlock(&ip->i_flags_lock);
-	write_unlock(&pag->pag_ici_lock);
-
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
 	if (!xfs_iflock_nowait(ip)) {
 		if (!(sync_mode & SYNC_WAIT))
@@ -867,18 +786,161 @@ out:
 reclaim:
 	xfs_ifunlock(ip);
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	xfs_ireclaim(ip);
+
+	XFS_STATS_INC(xs_ig_reclaims);
+	/*
+	 * Remove the inode from the per-AG radix tree.
+	 *
+	 * Because radix_tree_delete won't complain even if the item was never
+	 * added to the tree assert that it's been there before to catch
+	 * problems with the inode life time early on.
+	 */
+	write_lock(&pag->pag_ici_lock);
+	if (!radix_tree_delete(&pag->pag_ici_root,
+				XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
+		ASSERT(0);
+	__xfs_inode_clear_reclaim(pag, ip);
+	write_unlock(&pag->pag_ici_lock);
+
+	/*
+	 * Here we do an (almost) spurious inode lock in order to coordinate
+	 * with inode cache radix tree lookups.  This is because the lookup
+	 * can reference the inodes in the cache without taking references.
+	 *
+	 * We make that OK here by ensuring that we wait until the inode is
+	 * unlocked after the lookup before we go ahead and free it.  We get
+	 * both the ilock and the iolock because the code may need to drop the
+	 * ilock one but will still hold the iolock.
+	 */
+	xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+	xfs_qm_dqdetach(ip);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+
+	xfs_inode_free(ip);
 	return error;
 
 }
 
+/*
+ * Walk the AGs and reclaim the inodes in them. Even if the filesystem is
+ * corrupted, we still want to try to reclaim all the inodes. If we don't,
+ * then a shut down during filesystem unmount reclaim walk leak all the
+ * unreclaimed inodes.
+ */
+int
+xfs_reclaim_inodes_ag(
+	struct xfs_mount	*mp,
+	int			flags,
+	int			*nr_to_scan)
+{
+	struct xfs_perag	*pag;
+	int			error = 0;
+	int			last_error = 0;
+	xfs_agnumber_t		ag;
+	int			trylock = flags & SYNC_TRYLOCK;
+	int			skipped;
+
+restart:
+	ag = 0;
+	skipped = 0;
+	while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
+		unsigned long	first_index = 0;
+		int		done = 0;
+		int		nr_found = 0;
+
+		ag = pag->pag_agno + 1;
+
+		if (trylock) {
+			if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) {
+				skipped++;
+				continue;
+			}
+			first_index = pag->pag_ici_reclaim_cursor;
+		} else
+			mutex_lock(&pag->pag_ici_reclaim_lock);
+
+		do {
+			struct xfs_inode *batch[XFS_LOOKUP_BATCH];
+			int	i;
+
+			write_lock(&pag->pag_ici_lock);
+			nr_found = radix_tree_gang_lookup_tag(
+					&pag->pag_ici_root,
+					(void **)batch, first_index,
+					XFS_LOOKUP_BATCH,
+					XFS_ICI_RECLAIM_TAG);
+			if (!nr_found) {
+				write_unlock(&pag->pag_ici_lock);
+				break;
+			}
+
+			/*
+			 * Grab the inodes before we drop the lock. if we found
+			 * nothing, nr == 0 and the loop will be skipped.
+			 */
+			for (i = 0; i < nr_found; i++) {
+				struct xfs_inode *ip = batch[i];
+
+				if (done || xfs_reclaim_inode_grab(ip, flags))
+					batch[i] = NULL;
+
+				/*
+				 * Update the index for the next lookup. Catch
+				 * overflows into the next AG range which can
+				 * occur if we have inodes in the last block of
+				 * the AG and we are currently pointing to the
+				 * last inode.
+				 */
+				first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
+				if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
+					done = 1;
+			}
+
+			/* unlock now we've grabbed the inodes. */
+			write_unlock(&pag->pag_ici_lock);
+
+			for (i = 0; i < nr_found; i++) {
+				if (!batch[i])
+					continue;
+				error = xfs_reclaim_inode(batch[i], pag, flags);
+				if (error && last_error != EFSCORRUPTED)
+					last_error = error;
+			}
+
+			*nr_to_scan -= XFS_LOOKUP_BATCH;
+
+		} while (nr_found && !done && *nr_to_scan > 0);
+
+		if (trylock && !done)
+			pag->pag_ici_reclaim_cursor = first_index;
+		else
+			pag->pag_ici_reclaim_cursor = 0;
+		mutex_unlock(&pag->pag_ici_reclaim_lock);
+		xfs_perag_put(pag);
+	}
+
+	/*
+	 * if we skipped any AG, and we still have scan count remaining, do
+	 * another pass this time using blocking reclaim semantics (i.e
+	 * waiting on the reclaim locks and ignoring the reclaim cursors). This
+	 * ensure that when we get more reclaimers than AGs we block rather
+	 * than spin trying to execute reclaim.
+	 */
+	if (trylock && skipped && *nr_to_scan > 0) {
+		trylock = 0;
+		goto restart;
+	}
+	return XFS_ERROR(last_error);
+}
+
 int
 xfs_reclaim_inodes(
 	xfs_mount_t	*mp,
 	int		mode)
 {
-	return xfs_inode_ag_iterator(mp, xfs_reclaim_inode, mode,
-					XFS_ICI_RECLAIM_TAG, 1, NULL);
+	int		nr_to_scan = INT_MAX;
+
+	return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan);
 }
 
 /*
@@ -900,17 +962,16 @@ xfs_reclaim_inode_shrink(
 		if (!(gfp_mask & __GFP_FS))
 			return -1;
 
-		xfs_inode_ag_iterator(mp, xfs_reclaim_inode, 0,
-					XFS_ICI_RECLAIM_TAG, 1, &nr_to_scan);
-		/* if we don't exhaust the scan, don't bother coming back */
+		xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK, &nr_to_scan);
+		/* terminate if we don't exhaust the scan */
 		if (nr_to_scan > 0)
 			return -1;
        }
 
 	reclaimable = 0;
 	ag = 0;
-	while ((pag = xfs_inode_ag_iter_next_pag(mp, &ag,
-					XFS_ICI_RECLAIM_TAG))) {
+	while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
+		ag = pag->pag_agno + 1;
 		reclaimable += pag->pag_ici_reclaimable;
 		xfs_perag_put(pag);
 	}
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index e28139aaa4a..32ba6628290 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -35,9 +35,6 @@ typedef struct xfs_sync_work {
 int xfs_syncd_init(struct xfs_mount *mp);
 void xfs_syncd_stop(struct xfs_mount *mp);
 
-int xfs_sync_attr(struct xfs_mount *mp, int flags);
-int xfs_sync_data(struct xfs_mount *mp, int flags);
-
 int xfs_quiesce_data(struct xfs_mount *mp);
 void xfs_quiesce_attr(struct xfs_mount *mp);
 
@@ -50,10 +47,10 @@ void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip);
 void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
 				struct xfs_inode *ip);
 
-int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag);
+int xfs_sync_inode_grab(struct xfs_inode *ip);
 int xfs_inode_ag_iterator(struct xfs_mount *mp,
 	int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
-	int flags, int tag, int write_lock, int *nr_to_scan);
+	int flags);
 
 void xfs_inode_shrinker_register(struct xfs_mount *mp);
 void xfs_inode_shrinker_unregister(struct xfs_mount *mp);
diff --git a/fs/xfs/linux-2.6/xfs_trace.c b/fs/xfs/linux-2.6/xfs_trace.c
index d12be8470cb..88d25d4aa56 100644
--- a/fs/xfs/linux-2.6/xfs_trace.c
+++ b/fs/xfs/linux-2.6/xfs_trace.c
@@ -24,17 +24,13 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_btree.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_ialloc.h"
 #include "xfs_itable.h"
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index 30282069090..acef2e98c59 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -124,7 +124,7 @@ DEFINE_EVENT(xfs_perag_class, name,	\
 		 unsigned long caller_ip),					\
 	TP_ARGS(mp, agno, refcount, caller_ip))
 DEFINE_PERAG_REF_EVENT(xfs_perag_get);
-DEFINE_PERAG_REF_EVENT(xfs_perag_get_reclaim);
+DEFINE_PERAG_REF_EVENT(xfs_perag_get_tag);
 DEFINE_PERAG_REF_EVENT(xfs_perag_put);
 DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim);
 DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim);
@@ -317,8 +317,6 @@ DEFINE_BUF_EVENT(xfs_buf_init);
 DEFINE_BUF_EVENT(xfs_buf_free);
 DEFINE_BUF_EVENT(xfs_buf_hold);
 DEFINE_BUF_EVENT(xfs_buf_rele);
-DEFINE_BUF_EVENT(xfs_buf_pin);
-DEFINE_BUF_EVENT(xfs_buf_unpin);
 DEFINE_BUF_EVENT(xfs_buf_iodone);
 DEFINE_BUF_EVENT(xfs_buf_iorequest);
 DEFINE_BUF_EVENT(xfs_buf_bawrite);
@@ -327,13 +325,12 @@ DEFINE_BUF_EVENT(xfs_buf_lock);
 DEFINE_BUF_EVENT(xfs_buf_lock_done);
 DEFINE_BUF_EVENT(xfs_buf_cond_lock);
 DEFINE_BUF_EVENT(xfs_buf_unlock);
-DEFINE_BUF_EVENT(xfs_buf_ordered_retry);
 DEFINE_BUF_EVENT(xfs_buf_iowait);
 DEFINE_BUF_EVENT(xfs_buf_iowait_done);
 DEFINE_BUF_EVENT(xfs_buf_delwri_queue);
 DEFINE_BUF_EVENT(xfs_buf_delwri_dequeue);
 DEFINE_BUF_EVENT(xfs_buf_delwri_split);
-DEFINE_BUF_EVENT(xfs_buf_get_noaddr);
+DEFINE_BUF_EVENT(xfs_buf_get_uncached);
 DEFINE_BUF_EVENT(xfs_bdstrat_shut);
 DEFINE_BUF_EVENT(xfs_buf_item_relse);
 DEFINE_BUF_EVENT(xfs_buf_item_iodone);
@@ -541,7 +538,7 @@ DEFINE_LOCK_EVENT(xfs_ilock_nowait);
 DEFINE_LOCK_EVENT(xfs_ilock_demote);
 DEFINE_LOCK_EVENT(xfs_iunlock);
 
-DECLARE_EVENT_CLASS(xfs_iget_class,
+DECLARE_EVENT_CLASS(xfs_inode_class,
 	TP_PROTO(struct xfs_inode *ip),
 	TP_ARGS(ip),
 	TP_STRUCT__entry(
@@ -557,16 +554,38 @@ DECLARE_EVENT_CLASS(xfs_iget_class,
 		  __entry->ino)
 )
 
-#define DEFINE_IGET_EVENT(name) \
-DEFINE_EVENT(xfs_iget_class, name, \
+#define DEFINE_INODE_EVENT(name) \
+DEFINE_EVENT(xfs_inode_class, name, \
 	TP_PROTO(struct xfs_inode *ip), \
 	TP_ARGS(ip))
-DEFINE_IGET_EVENT(xfs_iget_skip);
-DEFINE_IGET_EVENT(xfs_iget_reclaim);
-DEFINE_IGET_EVENT(xfs_iget_found);
-DEFINE_IGET_EVENT(xfs_iget_alloc);
-
-DECLARE_EVENT_CLASS(xfs_inode_class,
+DEFINE_INODE_EVENT(xfs_iget_skip);
+DEFINE_INODE_EVENT(xfs_iget_reclaim);
+DEFINE_INODE_EVENT(xfs_iget_reclaim_fail);
+DEFINE_INODE_EVENT(xfs_iget_hit);
+DEFINE_INODE_EVENT(xfs_iget_miss);
+
+DEFINE_INODE_EVENT(xfs_getattr);
+DEFINE_INODE_EVENT(xfs_setattr);
+DEFINE_INODE_EVENT(xfs_readlink);
+DEFINE_INODE_EVENT(xfs_alloc_file_space);
+DEFINE_INODE_EVENT(xfs_free_file_space);
+DEFINE_INODE_EVENT(xfs_readdir);
+#ifdef CONFIG_XFS_POSIX_ACL
+DEFINE_INODE_EVENT(xfs_check_acl);
+#endif
+DEFINE_INODE_EVENT(xfs_vm_bmap);
+DEFINE_INODE_EVENT(xfs_file_ioctl);
+DEFINE_INODE_EVENT(xfs_file_compat_ioctl);
+DEFINE_INODE_EVENT(xfs_ioctl_setattr);
+DEFINE_INODE_EVENT(xfs_file_fsync);
+DEFINE_INODE_EVENT(xfs_destroy_inode);
+DEFINE_INODE_EVENT(xfs_write_inode);
+DEFINE_INODE_EVENT(xfs_evict_inode);
+
+DEFINE_INODE_EVENT(xfs_dquot_dqalloc);
+DEFINE_INODE_EVENT(xfs_dquot_dqdetach);
+
+DECLARE_EVENT_CLASS(xfs_iref_class,
 	TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip),
 	TP_ARGS(ip, caller_ip),
 	TP_STRUCT__entry(
@@ -591,20 +610,71 @@ DECLARE_EVENT_CLASS(xfs_inode_class,
 		  (char *)__entry->caller_ip)
 )
 
-#define DEFINE_INODE_EVENT(name) \
-DEFINE_EVENT(xfs_inode_class, name, \
+#define DEFINE_IREF_EVENT(name) \
+DEFINE_EVENT(xfs_iref_class, name, \
 	TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), \
 	TP_ARGS(ip, caller_ip))
-DEFINE_INODE_EVENT(xfs_ihold);
-DEFINE_INODE_EVENT(xfs_irele);
-DEFINE_INODE_EVENT(xfs_inode_pin);
-DEFINE_INODE_EVENT(xfs_inode_unpin);
-DEFINE_INODE_EVENT(xfs_inode_unpin_nowait);
+DEFINE_IREF_EVENT(xfs_ihold);
+DEFINE_IREF_EVENT(xfs_irele);
+DEFINE_IREF_EVENT(xfs_inode_pin);
+DEFINE_IREF_EVENT(xfs_inode_unpin);
+DEFINE_IREF_EVENT(xfs_inode_unpin_nowait);
+
+DECLARE_EVENT_CLASS(xfs_namespace_class,
+	TP_PROTO(struct xfs_inode *dp, struct xfs_name *name),
+	TP_ARGS(dp, name),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, dp_ino)
+		__dynamic_array(char, name, name->len)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(dp)->i_sb->s_dev;
+		__entry->dp_ino = dp->i_ino;
+		memcpy(__get_str(name), name->name, name->len);
+	),
+	TP_printk("dev %d:%d dp ino 0x%llx name %s",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->dp_ino,
+		  __get_str(name))
+)
 
-/* the old xfs_itrace_entry tracer - to be replaced by s.th. in the VFS */
-DEFINE_INODE_EVENT(xfs_inode);
-#define xfs_itrace_entry(ip)    \
-	trace_xfs_inode(ip, _THIS_IP_)
+#define DEFINE_NAMESPACE_EVENT(name) \
+DEFINE_EVENT(xfs_namespace_class, name, \
+	TP_PROTO(struct xfs_inode *dp, struct xfs_name *name), \
+	TP_ARGS(dp, name))
+DEFINE_NAMESPACE_EVENT(xfs_remove);
+DEFINE_NAMESPACE_EVENT(xfs_link);
+DEFINE_NAMESPACE_EVENT(xfs_lookup);
+DEFINE_NAMESPACE_EVENT(xfs_create);
+DEFINE_NAMESPACE_EVENT(xfs_symlink);
+
+TRACE_EVENT(xfs_rename,
+	TP_PROTO(struct xfs_inode *src_dp, struct xfs_inode *target_dp,
+		 struct xfs_name *src_name, struct xfs_name *target_name),
+	TP_ARGS(src_dp, target_dp, src_name, target_name),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, src_dp_ino)
+		__field(xfs_ino_t, target_dp_ino)
+		__dynamic_array(char, src_name, src_name->len)
+		__dynamic_array(char, target_name, target_name->len)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(src_dp)->i_sb->s_dev;
+		__entry->src_dp_ino = src_dp->i_ino;
+		__entry->target_dp_ino = target_dp->i_ino;
+		memcpy(__get_str(src_name), src_name->name, src_name->len);
+		memcpy(__get_str(target_name), target_name->name, target_name->len);
+	),
+	TP_printk("dev %d:%d src dp ino 0x%llx target dp ino 0x%llx"
+		  " src name %s target name %s",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->src_dp_ino,
+		  __entry->target_dp_ino,
+		  __get_str(src_name),
+		  __get_str(target_name))
+)
 
 DECLARE_EVENT_CLASS(xfs_dquot_class,
 	TP_PROTO(struct xfs_dquot *dqp),
@@ -684,9 +754,6 @@ DEFINE_DQUOT_EVENT(xfs_dqrele);
 DEFINE_DQUOT_EVENT(xfs_dqflush);
 DEFINE_DQUOT_EVENT(xfs_dqflush_force);
 DEFINE_DQUOT_EVENT(xfs_dqflush_done);
-/* not really iget events, but we re-use the format */
-DEFINE_IGET_EVENT(xfs_dquot_dqalloc);
-DEFINE_IGET_EVENT(xfs_dquot_dqdetach);
 
 DECLARE_EVENT_CLASS(xfs_loggrant_class,
 	TP_PROTO(struct log *log, struct xlog_ticket *tic),
@@ -834,33 +901,29 @@ DECLARE_EVENT_CLASS(xfs_page_class,
 		__field(loff_t, size)
 		__field(unsigned long, offset)
 		__field(int, delalloc)
-		__field(int, unmapped)
 		__field(int, unwritten)
 	),
 	TP_fast_assign(
-		int delalloc = -1, unmapped = -1, unwritten = -1;
+		int delalloc = -1, unwritten = -1;
 
 		if (page_has_buffers(page))
-			xfs_count_page_state(page, &delalloc,
-					     &unmapped, &unwritten);
+			xfs_count_page_state(page, &delalloc, &unwritten);
 		__entry->dev = inode->i_sb->s_dev;
 		__entry->ino = XFS_I(inode)->i_ino;
 		__entry->pgoff = page_offset(page);
 		__entry->size = i_size_read(inode);
 		__entry->offset = off;
 		__entry->delalloc = delalloc;
-		__entry->unmapped = unmapped;
 		__entry->unwritten = unwritten;
 	),
 	TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx "
-		  "delalloc %d unmapped %d unwritten %d",
+		  "delalloc %d unwritten %d",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->ino,
 		  __entry->pgoff,
 		  __entry->size,
 		  __entry->offset,
 		  __entry->delalloc,
-		  __entry->unmapped,
 		  __entry->unwritten)
 )
 
diff --git a/fs/xfs/linux-2.6/xfs_version.h b/fs/xfs/linux-2.6/xfs_version.h
deleted file mode 100644
index f8d279d7563..00000000000
--- a/fs/xfs/linux-2.6/xfs_version.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * Copyright (c) 2001-2002,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_VERSION_H__
-#define __XFS_VERSION_H__
-
-/*
- * Dummy file that can contain a timestamp to put into the
- * XFS init string, to help users keep track of what they're
- * running
- */
-
-#define XFS_VERSION_STRING "SGI XFS"
-
-#endif /* __XFS_VERSION_H__ */
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index 585e7633dfc..faf8e1a83a1 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -23,25 +23,15 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
 #include "xfs_alloc.h"
-#include "xfs_dmapi.h"
 #include "xfs_quota.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
 #include "xfs_inode.h"
-#include "xfs_btree.h"
-#include "xfs_ialloc.h"
 #include "xfs_bmap.h"
 #include "xfs_rtalloc.h"
 #include "xfs_error.h"
 #include "xfs_itable.h"
-#include "xfs_rw.h"
 #include "xfs_attr.h"
 #include "xfs_buf_item.h"
 #include "xfs_trans_space.h"
@@ -64,8 +54,6 @@
    flush lock - ditto.
 */
 
-STATIC void		xfs_qm_dqflush_done(xfs_buf_t *, xfs_dq_logitem_t *);
-
 #ifdef DEBUG
 xfs_buftarg_t *xfs_dqerror_target;
 int xfs_do_dqerror;
@@ -390,21 +378,14 @@ xfs_qm_dqalloc(
 		return (ESRCH);
 	}
 
-	/*
-	 * xfs_trans_commit normally decrements the vnode ref count
-	 * when it unlocks the inode. Since we want to keep the quota
-	 * inode around, we bump the vnode ref count now.
-	 */
-	IHOLD(quotip);
-
-	xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin_ref(tp, quotip, XFS_ILOCK_EXCL);
 	nmaps = 1;
 	if ((error = xfs_bmapi(tp, quotip,
 			      offset_fsb, XFS_DQUOT_CLUSTER_SIZE_FSB,
 			      XFS_BMAPI_METADATA | XFS_BMAPI_WRITE,
 			      &firstblock,
 			      XFS_QM_DQALLOC_SPACE_RES(mp),
-			      &map, &nmaps, &flist, NULL))) {
+			      &map, &nmaps, &flist))) {
 		goto error0;
 	}
 	ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB);
@@ -482,87 +463,68 @@ xfs_qm_dqtobp(
 	uint			flags)
 {
 	xfs_bmbt_irec_t map;
-	int		nmaps, error;
+	int		nmaps = 1, error;
 	xfs_buf_t	*bp;
-	xfs_inode_t	*quotip;
-	xfs_mount_t	*mp;
+	xfs_inode_t	*quotip = XFS_DQ_TO_QIP(dqp);
+	xfs_mount_t	*mp = dqp->q_mount;
 	xfs_disk_dquot_t *ddq;
-	xfs_dqid_t	id;
-	boolean_t	newdquot;
+	xfs_dqid_t	id = be32_to_cpu(dqp->q_core.d_id);
 	xfs_trans_t	*tp = (tpp ? *tpp : NULL);
 
-	mp = dqp->q_mount;
-	id = be32_to_cpu(dqp->q_core.d_id);
-	nmaps = 1;
-	newdquot = B_FALSE;
+	dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk;
 
-	/*
-	 * If we don't know where the dquot lives, find out.
-	 */
-	if (dqp->q_blkno == (xfs_daddr_t) 0) {
-		/* We use the id as an index */
-		dqp->q_fileoffset = (xfs_fileoff_t)id /
-					mp->m_quotainfo->qi_dqperchunk;
-		nmaps = 1;
-		quotip = XFS_DQ_TO_QIP(dqp);
-		xfs_ilock(quotip, XFS_ILOCK_SHARED);
+	xfs_ilock(quotip, XFS_ILOCK_SHARED);
+	if (XFS_IS_THIS_QUOTA_OFF(dqp)) {
 		/*
-		 * Return if this type of quotas is turned off while we didn't
-		 * have an inode lock
+		 * Return if this type of quotas is turned off while we
+		 * didn't have the quota inode lock.
 		 */
-		if (XFS_IS_THIS_QUOTA_OFF(dqp)) {
-			xfs_iunlock(quotip, XFS_ILOCK_SHARED);
-			return (ESRCH);
-		}
+		xfs_iunlock(quotip, XFS_ILOCK_SHARED);
+		return ESRCH;
+	}
+
+	/*
+	 * Find the block map; no allocations yet
+	 */
+	error = xfs_bmapi(NULL, quotip, dqp->q_fileoffset,
+			  XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA,
+			  NULL, 0, &map, &nmaps, NULL);
+
+	xfs_iunlock(quotip, XFS_ILOCK_SHARED);
+	if (error)
+		return error;
+
+	ASSERT(nmaps == 1);
+	ASSERT(map.br_blockcount == 1);
+
+	/*
+	 * Offset of dquot in the (fixed sized) dquot chunk.
+	 */
+	dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) *
+		sizeof(xfs_dqblk_t);
+
+	ASSERT(map.br_startblock != DELAYSTARTBLOCK);
+	if (map.br_startblock == HOLESTARTBLOCK) {
 		/*
-		 * Find the block map; no allocations yet
+		 * We don't allocate unless we're asked to
 		 */
-		error = xfs_bmapi(NULL, quotip, dqp->q_fileoffset,
-				  XFS_DQUOT_CLUSTER_SIZE_FSB,
-				  XFS_BMAPI_METADATA,
-				  NULL, 0, &map, &nmaps, NULL, NULL);
+		if (!(flags & XFS_QMOPT_DQALLOC))
+			return ENOENT;
 
-		xfs_iunlock(quotip, XFS_ILOCK_SHARED);
+		ASSERT(tp);
+		error = xfs_qm_dqalloc(tpp, mp, dqp, quotip,
+					dqp->q_fileoffset, &bp);
 		if (error)
-			return (error);
-		ASSERT(nmaps == 1);
-		ASSERT(map.br_blockcount == 1);
+			return error;
+		tp = *tpp;
+	} else {
+		trace_xfs_dqtobp_read(dqp);
 
 		/*
-		 * offset of dquot in the (fixed sized) dquot chunk.
+		 * store the blkno etc so that we don't have to do the
+		 * mapping all the time
 		 */
-		dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) *
-			sizeof(xfs_dqblk_t);
-		if (map.br_startblock == HOLESTARTBLOCK) {
-			/*
-			 * We don't allocate unless we're asked to
-			 */
-			if (!(flags & XFS_QMOPT_DQALLOC))
-				return (ENOENT);
-
-			ASSERT(tp);
-			if ((error = xfs_qm_dqalloc(tpp, mp, dqp, quotip,
-						dqp->q_fileoffset, &bp)))
-				return (error);
-			tp = *tpp;
-			newdquot = B_TRUE;
-		} else {
-			/*
-			 * store the blkno etc so that we don't have to do the
-			 * mapping all the time
-			 */
-			dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
-		}
-	}
-	ASSERT(dqp->q_blkno != DELAYSTARTBLOCK);
-	ASSERT(dqp->q_blkno != HOLESTARTBLOCK);
-
-	/*
-	 * Read in the buffer, unless we've just done the allocation
-	 * (in which case we already have the buf).
-	 */
-	if (!newdquot) {
-		trace_xfs_dqtobp_read(dqp);
+		dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
 
 		error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
 					   dqp->q_blkno,
@@ -571,13 +533,14 @@ xfs_qm_dqtobp(
 		if (error || !bp)
 			return XFS_ERROR(error);
 	}
+
 	ASSERT(XFS_BUF_ISBUSY(bp));
 	ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
 
 	/*
 	 * calculate the location of the dquot inside the buffer.
 	 */
-	ddq = (xfs_disk_dquot_t *)((char *)XFS_BUF_PTR(bp) + dqp->q_bufoffset);
+	ddq = (struct xfs_disk_dquot *)(XFS_BUF_PTR(bp) + dqp->q_bufoffset);
 
 	/*
 	 * A simple sanity check in case we got a corrupted dquot...
@@ -1141,6 +1104,46 @@ xfs_qm_dqrele(
 	xfs_qm_dqput(dqp);
 }
 
+/*
+ * This is the dquot flushing I/O completion routine.  It is called
+ * from interrupt level when the buffer containing the dquot is
+ * flushed to disk.  It is responsible for removing the dquot logitem
+ * from the AIL if it has not been re-logged, and unlocking the dquot's
+ * flush lock. This behavior is very similar to that of inodes..
+ */
+STATIC void
+xfs_qm_dqflush_done(
+	struct xfs_buf		*bp,
+	struct xfs_log_item	*lip)
+{
+	xfs_dq_logitem_t	*qip = (struct xfs_dq_logitem *)lip;
+	xfs_dquot_t		*dqp = qip->qli_dquot;
+	struct xfs_ail		*ailp = lip->li_ailp;
+
+	/*
+	 * We only want to pull the item from the AIL if its
+	 * location in the log has not changed since we started the flush.
+	 * Thus, we only bother if the dquot's lsn has
+	 * not changed. First we check the lsn outside the lock
+	 * since it's cheaper, and then we recheck while
+	 * holding the lock before removing the dquot from the AIL.
+	 */
+	if ((lip->li_flags & XFS_LI_IN_AIL) &&
+	    lip->li_lsn == qip->qli_flush_lsn) {
+
+		/* xfs_trans_ail_delete() drops the AIL lock. */
+		spin_lock(&ailp->xa_lock);
+		if (lip->li_lsn == qip->qli_flush_lsn)
+			xfs_trans_ail_delete(ailp, lip);
+		else
+			spin_unlock(&ailp->xa_lock);
+	}
+
+	/*
+	 * Release the dq's flush lock since we're done with it.
+	 */
+	xfs_dqfunlock(dqp);
+}
 
 /*
  * Write a modified dquot to disk.
@@ -1155,18 +1158,18 @@ xfs_qm_dqflush(
 	xfs_dquot_t		*dqp,
 	uint			flags)
 {
-	xfs_mount_t		*mp;
-	xfs_buf_t		*bp;
-	xfs_disk_dquot_t	*ddqp;
+	struct xfs_mount	*mp = dqp->q_mount;
+	struct xfs_buf		*bp;
+	struct xfs_disk_dquot	*ddqp;
 	int			error;
 
 	ASSERT(XFS_DQ_IS_LOCKED(dqp));
 	ASSERT(!completion_done(&dqp->q_flush));
+
 	trace_xfs_dqflush(dqp);
 
 	/*
-	 * If not dirty, or it's pinned and we are not supposed to
-	 * block, nada.
+	 * If not dirty, or it's pinned and we are not supposed to block, nada.
 	 */
 	if (!XFS_DQ_IS_DIRTY(dqp) ||
 	    (!(flags & SYNC_WAIT) && atomic_read(&dqp->q_pincount) > 0)) {
@@ -1180,40 +1183,46 @@ xfs_qm_dqflush(
 	 * down forcibly. If that's the case we must not write this dquot
 	 * to disk, because the log record didn't make it to disk!
 	 */
-	if (XFS_FORCED_SHUTDOWN(dqp->q_mount)) {
-		dqp->dq_flags &= ~(XFS_DQ_DIRTY);
+	if (XFS_FORCED_SHUTDOWN(mp)) {
+		dqp->dq_flags &= ~XFS_DQ_DIRTY;
 		xfs_dqfunlock(dqp);
 		return XFS_ERROR(EIO);
 	}
 
 	/*
 	 * Get the buffer containing the on-disk dquot
-	 * We don't need a transaction envelope because we know that the
-	 * the ondisk-dquot has already been allocated for.
 	 */
-	if ((error = xfs_qm_dqtobp(NULL, dqp, &ddqp, &bp, XFS_QMOPT_DOWARN))) {
+	error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno,
+				   mp->m_quotainfo->qi_dqchunklen, 0, &bp);
+	if (error) {
 		ASSERT(error != ENOENT);
-		/*
-		 * Quotas could have gotten turned off (ESRCH)
-		 */
 		xfs_dqfunlock(dqp);
-		return (error);
+		return error;
 	}
 
-	if (xfs_qm_dqcheck(&dqp->q_core, be32_to_cpu(ddqp->d_id),
-			   0, XFS_QMOPT_DOWARN, "dqflush (incore copy)")) {
-		xfs_force_shutdown(dqp->q_mount, SHUTDOWN_CORRUPT_INCORE);
+	/*
+	 * Calculate the location of the dquot inside the buffer.
+	 */
+	ddqp = (struct xfs_disk_dquot *)(XFS_BUF_PTR(bp) + dqp->q_bufoffset);
+
+	/*
+	 * A simple sanity check in case we got a corrupted dquot..
+	 */
+	if (xfs_qm_dqcheck(&dqp->q_core, be32_to_cpu(ddqp->d_id), 0,
+			   XFS_QMOPT_DOWARN, "dqflush (incore copy)")) {
+		xfs_buf_relse(bp);
+		xfs_dqfunlock(dqp);
+		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
 		return XFS_ERROR(EIO);
 	}
 
 	/* This is the only portion of data that needs to persist */
-	memcpy(ddqp, &(dqp->q_core), sizeof(xfs_disk_dquot_t));
+	memcpy(ddqp, &dqp->q_core, sizeof(xfs_disk_dquot_t));
 
 	/*
 	 * Clear the dirty field and remember the flush lsn for later use.
 	 */
-	dqp->dq_flags &= ~(XFS_DQ_DIRTY);
-	mp = dqp->q_mount;
+	dqp->dq_flags &= ~XFS_DQ_DIRTY;
 
 	xfs_trans_ail_copy_lsn(mp->m_ail, &dqp->q_logitem.qli_flush_lsn,
 					&dqp->q_logitem.qli_item.li_lsn);
@@ -1222,8 +1231,9 @@ xfs_qm_dqflush(
 	 * Attach an iodone routine so that we can remove this dquot from the
 	 * AIL and release the flush lock once the dquot is synced to disk.
 	 */
-	xfs_buf_attach_iodone(bp, (void(*)(xfs_buf_t *, xfs_log_item_t *))
-			      xfs_qm_dqflush_done, &(dqp->q_logitem.qli_item));
+	xfs_buf_attach_iodone(bp, xfs_qm_dqflush_done,
+				  &dqp->q_logitem.qli_item);
+
 	/*
 	 * If the buffer is pinned then push on the log so we won't
 	 * get stuck waiting in the write for too long.
@@ -1247,50 +1257,6 @@ xfs_qm_dqflush(
 
 }
 
-/*
- * This is the dquot flushing I/O completion routine.  It is called
- * from interrupt level when the buffer containing the dquot is
- * flushed to disk.  It is responsible for removing the dquot logitem
- * from the AIL if it has not been re-logged, and unlocking the dquot's
- * flush lock. This behavior is very similar to that of inodes..
- */
-/*ARGSUSED*/
-STATIC void
-xfs_qm_dqflush_done(
-	xfs_buf_t		*bp,
-	xfs_dq_logitem_t	*qip)
-{
-	xfs_dquot_t		*dqp;
-	struct xfs_ail		*ailp;
-
-	dqp = qip->qli_dquot;
-	ailp = qip->qli_item.li_ailp;
-
-	/*
-	 * We only want to pull the item from the AIL if its
-	 * location in the log has not changed since we started the flush.
-	 * Thus, we only bother if the dquot's lsn has
-	 * not changed. First we check the lsn outside the lock
-	 * since it's cheaper, and then we recheck while
-	 * holding the lock before removing the dquot from the AIL.
-	 */
-	if ((qip->qli_item.li_flags & XFS_LI_IN_AIL) &&
-	    qip->qli_item.li_lsn == qip->qli_flush_lsn) {
-
-		/* xfs_trans_ail_delete() drops the AIL lock. */
-		spin_lock(&ailp->xa_lock);
-		if (qip->qli_item.li_lsn == qip->qli_flush_lsn)
-			xfs_trans_ail_delete(ailp, (xfs_log_item_t*)qip);
-		else
-			spin_unlock(&ailp->xa_lock);
-	}
-
-	/*
-	 * Release the dq's flush lock since we're done with it.
-	 */
-	xfs_dqfunlock(dqp);
-}
-
 int
 xfs_qm_dqlock_nowait(
 	xfs_dquot_t *dqp)
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
index 8d89a24ae32..2a1f3dc10a0 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -23,42 +23,36 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
 #include "xfs_alloc.h"
-#include "xfs_dmapi.h"
 #include "xfs_quota.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_bmap.h"
-#include "xfs_btree.h"
-#include "xfs_ialloc.h"
 #include "xfs_rtalloc.h"
 #include "xfs_error.h"
 #include "xfs_itable.h"
-#include "xfs_rw.h"
 #include "xfs_attr.h"
 #include "xfs_buf_item.h"
 #include "xfs_trans_priv.h"
 #include "xfs_qm.h"
 
+static inline struct xfs_dq_logitem *DQUOT_ITEM(struct xfs_log_item *lip)
+{
+	return container_of(lip, struct xfs_dq_logitem, qli_item);
+}
+
 /*
  * returns the number of iovecs needed to log the given dquot item.
  */
-/* ARGSUSED */
 STATIC uint
 xfs_qm_dquot_logitem_size(
-	xfs_dq_logitem_t	*logitem)
+	struct xfs_log_item	*lip)
 {
 	/*
 	 * we need only two iovecs, one for the format, one for the real thing
 	 */
-	return (2);
+	return 2;
 }
 
 /*
@@ -66,22 +60,21 @@ xfs_qm_dquot_logitem_size(
  */
 STATIC void
 xfs_qm_dquot_logitem_format(
-	xfs_dq_logitem_t	*logitem,
-	xfs_log_iovec_t		*logvec)
+	struct xfs_log_item	*lip,
+	struct xfs_log_iovec	*logvec)
 {
-	ASSERT(logitem);
-	ASSERT(logitem->qli_dquot);
+	struct xfs_dq_logitem	*qlip = DQUOT_ITEM(lip);
 
-	logvec->i_addr = (xfs_caddr_t)&logitem->qli_format;
+	logvec->i_addr = &qlip->qli_format;
 	logvec->i_len  = sizeof(xfs_dq_logformat_t);
 	logvec->i_type = XLOG_REG_TYPE_QFORMAT;
 	logvec++;
-	logvec->i_addr = (xfs_caddr_t)&logitem->qli_dquot->q_core;
+	logvec->i_addr = &qlip->qli_dquot->q_core;
 	logvec->i_len  = sizeof(xfs_disk_dquot_t);
 	logvec->i_type = XLOG_REG_TYPE_DQUOT;
 
-	ASSERT(2 == logitem->qli_item.li_desc->lid_size);
-	logitem->qli_format.qlf_size = 2;
+	ASSERT(2 == lip->li_desc->lid_size);
+	qlip->qli_format.qlf_size = 2;
 
 }
 
@@ -90,9 +83,9 @@ xfs_qm_dquot_logitem_format(
  */
 STATIC void
 xfs_qm_dquot_logitem_pin(
-	xfs_dq_logitem_t *logitem)
+	struct xfs_log_item	*lip)
 {
-	xfs_dquot_t *dqp = logitem->qli_dquot;
+	struct xfs_dquot	*dqp = DQUOT_ITEM(lip)->qli_dquot;
 
 	ASSERT(XFS_DQ_IS_LOCKED(dqp));
 	atomic_inc(&dqp->q_pincount);
@@ -104,27 +97,18 @@ xfs_qm_dquot_logitem_pin(
  * dquot must have been previously pinned with a call to
  * xfs_qm_dquot_logitem_pin().
  */
-/* ARGSUSED */
 STATIC void
 xfs_qm_dquot_logitem_unpin(
-	xfs_dq_logitem_t *logitem)
+	struct xfs_log_item	*lip,
+	int			remove)
 {
-	xfs_dquot_t *dqp = logitem->qli_dquot;
+	struct xfs_dquot	*dqp = DQUOT_ITEM(lip)->qli_dquot;
 
 	ASSERT(atomic_read(&dqp->q_pincount) > 0);
 	if (atomic_dec_and_test(&dqp->q_pincount))
 		wake_up(&dqp->q_pinwait);
 }
 
-/* ARGSUSED */
-STATIC void
-xfs_qm_dquot_logitem_unpin_remove(
-	xfs_dq_logitem_t *logitem,
-	xfs_trans_t	 *tp)
-{
-	xfs_qm_dquot_logitem_unpin(logitem);
-}
-
 /*
  * Given the logitem, this writes the corresponding dquot entry to disk
  * asynchronously. This is called with the dquot entry securely locked;
@@ -133,12 +117,10 @@ xfs_qm_dquot_logitem_unpin_remove(
  */
 STATIC void
 xfs_qm_dquot_logitem_push(
-	xfs_dq_logitem_t	*logitem)
+	struct xfs_log_item	*lip)
 {
-	xfs_dquot_t	*dqp;
-	int		error;
-
-	dqp = logitem->qli_dquot;
+	struct xfs_dquot	*dqp = DQUOT_ITEM(lip)->qli_dquot;
+	int			error;
 
 	ASSERT(XFS_DQ_IS_LOCKED(dqp));
 	ASSERT(!completion_done(&dqp->q_flush));
@@ -160,27 +142,25 @@ xfs_qm_dquot_logitem_push(
 	xfs_dqunlock(dqp);
 }
 
-/*ARGSUSED*/
 STATIC xfs_lsn_t
 xfs_qm_dquot_logitem_committed(
-	xfs_dq_logitem_t	*l,
+	struct xfs_log_item	*lip,
 	xfs_lsn_t		lsn)
 {
 	/*
 	 * We always re-log the entire dquot when it becomes dirty,
 	 * so, the latest copy _is_ the only one that matters.
 	 */
-	return (lsn);
+	return lsn;
 }
 
-
 /*
  * This is called to wait for the given dquot to be unpinned.
  * Most of these pin/unpin routines are plagiarized from inode code.
  */
 void
 xfs_qm_dqunpin_wait(
-	xfs_dquot_t	*dqp)
+	struct xfs_dquot	*dqp)
 {
 	ASSERT(XFS_DQ_IS_LOCKED(dqp));
 	if (atomic_read(&dqp->q_pincount) == 0)
@@ -206,13 +186,12 @@ xfs_qm_dqunpin_wait(
  */
 STATIC void
 xfs_qm_dquot_logitem_pushbuf(
-	xfs_dq_logitem_t    *qip)
+	struct xfs_log_item	*lip)
 {
-	xfs_dquot_t	*dqp;
-	xfs_mount_t	*mp;
-	xfs_buf_t	*bp;
+	struct xfs_dq_logitem	*qlip = DQUOT_ITEM(lip);
+	struct xfs_dquot	*dqp = qlip->qli_dquot;
+	struct xfs_buf		*bp;
 
-	dqp = qip->qli_dquot;
 	ASSERT(XFS_DQ_IS_LOCKED(dqp));
 
 	/*
@@ -220,22 +199,20 @@ xfs_qm_dquot_logitem_pushbuf(
 	 * inode flush completed and the inode was taken off the AIL.
 	 * So, just get out.
 	 */
-	if (completion_done(&dqp->q_flush)  ||
-	    ((qip->qli_item.li_flags & XFS_LI_IN_AIL) == 0)) {
+	if (completion_done(&dqp->q_flush) ||
+	    !(lip->li_flags & XFS_LI_IN_AIL)) {
 		xfs_dqunlock(dqp);
 		return;
 	}
-	mp = dqp->q_mount;
-	bp = xfs_incore(mp->m_ddev_targp, qip->qli_format.qlf_blkno,
-			mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK);
+
+	bp = xfs_incore(dqp->q_mount->m_ddev_targp, qlip->qli_format.qlf_blkno,
+			dqp->q_mount->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK);
 	xfs_dqunlock(dqp);
 	if (!bp)
 		return;
 	if (XFS_BUF_ISDELAYWRITE(bp))
 		xfs_buf_delwri_promote(bp);
 	xfs_buf_relse(bp);
-	return;
-
 }
 
 /*
@@ -250,15 +227,14 @@ xfs_qm_dquot_logitem_pushbuf(
  */
 STATIC uint
 xfs_qm_dquot_logitem_trylock(
-	xfs_dq_logitem_t	*qip)
+	struct xfs_log_item	*lip)
 {
-	xfs_dquot_t		*dqp;
+	struct xfs_dquot	*dqp = DQUOT_ITEM(lip)->qli_dquot;
 
-	dqp = qip->qli_dquot;
 	if (atomic_read(&dqp->q_pincount) > 0)
 		return XFS_ITEM_PINNED;
 
-	if (! xfs_qm_dqlock_nowait(dqp))
+	if (!xfs_qm_dqlock_nowait(dqp))
 		return XFS_ITEM_LOCKED;
 
 	if (!xfs_dqflock_nowait(dqp)) {
@@ -269,11 +245,10 @@ xfs_qm_dquot_logitem_trylock(
 		return XFS_ITEM_PUSHBUF;
 	}
 
-	ASSERT(qip->qli_item.li_flags & XFS_LI_IN_AIL);
+	ASSERT(lip->li_flags & XFS_LI_IN_AIL);
 	return XFS_ITEM_SUCCESS;
 }
 
-
 /*
  * Unlock the dquot associated with the log item.
  * Clear the fields of the dquot and dquot log item that
@@ -282,12 +257,10 @@ xfs_qm_dquot_logitem_trylock(
  */
 STATIC void
 xfs_qm_dquot_logitem_unlock(
-	xfs_dq_logitem_t    *ql)
+	struct xfs_log_item	*lip)
 {
-	xfs_dquot_t	*dqp;
+	struct xfs_dquot	*dqp = DQUOT_ITEM(lip)->qli_dquot;
 
-	ASSERT(ql != NULL);
-	dqp = ql->qli_dquot;
 	ASSERT(XFS_DQ_IS_LOCKED(dqp));
 
 	/*
@@ -304,43 +277,32 @@ xfs_qm_dquot_logitem_unlock(
 	xfs_dqunlock(dqp);
 }
 
-
 /*
  * this needs to stamp an lsn into the dquot, I think.
  * rpc's that look at user dquot's would then have to
  * push on the dependency recorded in the dquot
  */
-/* ARGSUSED */
 STATIC void
 xfs_qm_dquot_logitem_committing(
-	xfs_dq_logitem_t	*l,
+	struct xfs_log_item	*lip,
 	xfs_lsn_t		lsn)
 {
-	return;
 }
 
-
 /*
  * This is the ops vector for dquots
  */
 static struct xfs_item_ops xfs_dquot_item_ops = {
-	.iop_size	= (uint(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_size,
-	.iop_format	= (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
-					xfs_qm_dquot_logitem_format,
-	.iop_pin	= (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_pin,
-	.iop_unpin	= (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_unpin,
-	.iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*))
-					xfs_qm_dquot_logitem_unpin_remove,
-	.iop_trylock	= (uint(*)(xfs_log_item_t*))
-					xfs_qm_dquot_logitem_trylock,
-	.iop_unlock	= (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_unlock,
-	.iop_committed	= (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
-					xfs_qm_dquot_logitem_committed,
-	.iop_push	= (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_push,
-	.iop_pushbuf	= (void(*)(xfs_log_item_t*))
-					xfs_qm_dquot_logitem_pushbuf,
-	.iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
-					xfs_qm_dquot_logitem_committing
+	.iop_size	= xfs_qm_dquot_logitem_size,
+	.iop_format	= xfs_qm_dquot_logitem_format,
+	.iop_pin	= xfs_qm_dquot_logitem_pin,
+	.iop_unpin	= xfs_qm_dquot_logitem_unpin,
+	.iop_trylock	= xfs_qm_dquot_logitem_trylock,
+	.iop_unlock	= xfs_qm_dquot_logitem_unlock,
+	.iop_committed	= xfs_qm_dquot_logitem_committed,
+	.iop_push	= xfs_qm_dquot_logitem_push,
+	.iop_pushbuf	= xfs_qm_dquot_logitem_pushbuf,
+	.iop_committing = xfs_qm_dquot_logitem_committing
 };
 
 /*
@@ -350,10 +312,9 @@ static struct xfs_item_ops xfs_dquot_item_ops = {
  */
 void
 xfs_qm_dquot_logitem_init(
-	struct xfs_dquot *dqp)
+	struct xfs_dquot	*dqp)
 {
-	xfs_dq_logitem_t  *lp;
-	lp = &dqp->q_logitem;
+	struct xfs_dq_logitem	*lp = &dqp->q_logitem;
 
 	xfs_log_item_init(dqp->q_mount, &lp->qli_item, XFS_LI_DQUOT,
 					&xfs_dquot_item_ops);
@@ -374,16 +335,22 @@ xfs_qm_dquot_logitem_init(
 
 /*------------------  QUOTAOFF LOG ITEMS  -------------------*/
 
+static inline struct xfs_qoff_logitem *QOFF_ITEM(struct xfs_log_item *lip)
+{
+	return container_of(lip, struct xfs_qoff_logitem, qql_item);
+}
+
+
 /*
  * This returns the number of iovecs needed to log the given quotaoff item.
  * We only need 1 iovec for an quotaoff item.  It just logs the
  * quotaoff_log_format structure.
  */
-/*ARGSUSED*/
 STATIC uint
-xfs_qm_qoff_logitem_size(xfs_qoff_logitem_t *qf)
+xfs_qm_qoff_logitem_size(
+	struct xfs_log_item	*lip)
 {
-	return (1);
+	return 1;
 }
 
 /*
@@ -394,53 +361,46 @@ xfs_qm_qoff_logitem_size(xfs_qoff_logitem_t *qf)
  * slots in the quotaoff item have been filled.
  */
 STATIC void
-xfs_qm_qoff_logitem_format(xfs_qoff_logitem_t	*qf,
-			   xfs_log_iovec_t	*log_vector)
+xfs_qm_qoff_logitem_format(
+	struct xfs_log_item	*lip,
+	struct xfs_log_iovec	*log_vector)
 {
-	ASSERT(qf->qql_format.qf_type == XFS_LI_QUOTAOFF);
+	struct xfs_qoff_logitem	*qflip = QOFF_ITEM(lip);
+
+	ASSERT(qflip->qql_format.qf_type == XFS_LI_QUOTAOFF);
 
-	log_vector->i_addr = (xfs_caddr_t)&(qf->qql_format);
+	log_vector->i_addr = &qflip->qql_format;
 	log_vector->i_len = sizeof(xfs_qoff_logitem_t);
 	log_vector->i_type = XLOG_REG_TYPE_QUOTAOFF;
-	qf->qql_format.qf_size = 1;
+	qflip->qql_format.qf_size = 1;
 }
 
-
 /*
  * Pinning has no meaning for an quotaoff item, so just return.
  */
-/*ARGSUSED*/
 STATIC void
-xfs_qm_qoff_logitem_pin(xfs_qoff_logitem_t *qf)
+xfs_qm_qoff_logitem_pin(
+	struct xfs_log_item	*lip)
 {
-	return;
 }
 
-
 /*
  * Since pinning has no meaning for an quotaoff item, unpinning does
  * not either.
  */
-/*ARGSUSED*/
 STATIC void
-xfs_qm_qoff_logitem_unpin(xfs_qoff_logitem_t *qf)
+xfs_qm_qoff_logitem_unpin(
+	struct xfs_log_item	*lip,
+	int			remove)
 {
-	return;
-}
-
-/*ARGSUSED*/
-STATIC void
-xfs_qm_qoff_logitem_unpin_remove(xfs_qoff_logitem_t *qf, xfs_trans_t *tp)
-{
-	return;
 }
 
 /*
  * Quotaoff items have no locking, so just return success.
  */
-/*ARGSUSED*/
 STATIC uint
-xfs_qm_qoff_logitem_trylock(xfs_qoff_logitem_t *qf)
+xfs_qm_qoff_logitem_trylock(
+	struct xfs_log_item	*lip)
 {
 	return XFS_ITEM_LOCKED;
 }
@@ -449,53 +409,51 @@ xfs_qm_qoff_logitem_trylock(xfs_qoff_logitem_t *qf)
  * Quotaoff items have no locking or pushing, so return failure
  * so that the caller doesn't bother with us.
  */
-/*ARGSUSED*/
 STATIC void
-xfs_qm_qoff_logitem_unlock(xfs_qoff_logitem_t *qf)
+xfs_qm_qoff_logitem_unlock(
+	struct xfs_log_item	*lip)
 {
-	return;
 }
 
 /*
  * The quotaoff-start-item is logged only once and cannot be moved in the log,
  * so simply return the lsn at which it's been logged.
  */
-/*ARGSUSED*/
 STATIC xfs_lsn_t
-xfs_qm_qoff_logitem_committed(xfs_qoff_logitem_t *qf, xfs_lsn_t lsn)
+xfs_qm_qoff_logitem_committed(
+	struct xfs_log_item	*lip,
+	xfs_lsn_t		lsn)
 {
-	return (lsn);
+	return lsn;
 }
 
 /*
  * There isn't much you can do to push on an quotaoff item.  It is simply
  * stuck waiting for the log to be flushed to disk.
  */
-/*ARGSUSED*/
 STATIC void
-xfs_qm_qoff_logitem_push(xfs_qoff_logitem_t *qf)
+xfs_qm_qoff_logitem_push(
+	struct xfs_log_item	*lip)
 {
-	return;
 }
 
 
-/*ARGSUSED*/
 STATIC xfs_lsn_t
 xfs_qm_qoffend_logitem_committed(
-	xfs_qoff_logitem_t *qfe,
-	xfs_lsn_t lsn)
+	struct xfs_log_item	*lip,
+	xfs_lsn_t		lsn)
 {
-	xfs_qoff_logitem_t	*qfs;
-	struct xfs_ail		*ailp;
+	struct xfs_qoff_logitem	*qfe = QOFF_ITEM(lip);
+	struct xfs_qoff_logitem	*qfs = qfe->qql_start_lip;
+	struct xfs_ail		*ailp = qfs->qql_item.li_ailp;
 
-	qfs = qfe->qql_start_lip;
-	ailp = qfs->qql_item.li_ailp;
-	spin_lock(&ailp->xa_lock);
 	/*
 	 * Delete the qoff-start logitem from the AIL.
 	 * xfs_trans_ail_delete() drops the AIL lock.
 	 */
+	spin_lock(&ailp->xa_lock);
 	xfs_trans_ail_delete(ailp, (xfs_log_item_t *)qfs);
+
 	kmem_free(qfs);
 	kmem_free(qfe);
 	return (xfs_lsn_t)-1;
@@ -515,71 +473,52 @@ xfs_qm_qoffend_logitem_committed(
  * (truly makes the quotaoff irrevocable).  If we do something else,
  * then maybe we don't need two.
  */
-/* ARGSUSED */
-STATIC void
-xfs_qm_qoff_logitem_committing(xfs_qoff_logitem_t *qip, xfs_lsn_t commit_lsn)
-{
-	return;
-}
-
-/* ARGSUSED */
 STATIC void
-xfs_qm_qoffend_logitem_committing(xfs_qoff_logitem_t *qip, xfs_lsn_t commit_lsn)
+xfs_qm_qoff_logitem_committing(
+	struct xfs_log_item	*lip,
+	xfs_lsn_t		commit_lsn)
 {
-	return;
 }
 
 static struct xfs_item_ops xfs_qm_qoffend_logitem_ops = {
-	.iop_size	= (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_size,
-	.iop_format	= (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
-					xfs_qm_qoff_logitem_format,
-	.iop_pin	= (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_pin,
-	.iop_unpin	= (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unpin,
-	.iop_unpin_remove = (void(*)(xfs_log_item_t*,xfs_trans_t*))
-					xfs_qm_qoff_logitem_unpin_remove,
-	.iop_trylock	= (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_trylock,
-	.iop_unlock	= (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unlock,
-	.iop_committed	= (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
-					xfs_qm_qoffend_logitem_committed,
-	.iop_push	= (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_push,
-	.iop_pushbuf	= NULL,
-	.iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
-					xfs_qm_qoffend_logitem_committing
+	.iop_size	= xfs_qm_qoff_logitem_size,
+	.iop_format	= xfs_qm_qoff_logitem_format,
+	.iop_pin	= xfs_qm_qoff_logitem_pin,
+	.iop_unpin	= xfs_qm_qoff_logitem_unpin,
+	.iop_trylock	= xfs_qm_qoff_logitem_trylock,
+	.iop_unlock	= xfs_qm_qoff_logitem_unlock,
+	.iop_committed	= xfs_qm_qoffend_logitem_committed,
+	.iop_push	= xfs_qm_qoff_logitem_push,
+	.iop_committing = xfs_qm_qoff_logitem_committing
 };
 
 /*
  * This is the ops vector shared by all quotaoff-start log items.
  */
 static struct xfs_item_ops xfs_qm_qoff_logitem_ops = {
-	.iop_size	= (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_size,
-	.iop_format	= (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
-					xfs_qm_qoff_logitem_format,
-	.iop_pin	= (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_pin,
-	.iop_unpin	= (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unpin,
-	.iop_unpin_remove = (void(*)(xfs_log_item_t*,xfs_trans_t*))
-					xfs_qm_qoff_logitem_unpin_remove,
-	.iop_trylock	= (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_trylock,
-	.iop_unlock	= (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unlock,
-	.iop_committed	= (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
-					xfs_qm_qoff_logitem_committed,
-	.iop_push	= (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_push,
-	.iop_pushbuf	= NULL,
-	.iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
-					xfs_qm_qoff_logitem_committing
+	.iop_size	= xfs_qm_qoff_logitem_size,
+	.iop_format	= xfs_qm_qoff_logitem_format,
+	.iop_pin	= xfs_qm_qoff_logitem_pin,
+	.iop_unpin	= xfs_qm_qoff_logitem_unpin,
+	.iop_trylock	= xfs_qm_qoff_logitem_trylock,
+	.iop_unlock	= xfs_qm_qoff_logitem_unlock,
+	.iop_committed	= xfs_qm_qoff_logitem_committed,
+	.iop_push	= xfs_qm_qoff_logitem_push,
+	.iop_committing = xfs_qm_qoff_logitem_committing
 };
 
 /*
  * Allocate and initialize an quotaoff item of the correct quota type(s).
  */
-xfs_qoff_logitem_t *
+struct xfs_qoff_logitem *
 xfs_qm_qoff_logitem_init(
-	struct xfs_mount *mp,
-	xfs_qoff_logitem_t *start,
-	uint flags)
+	struct xfs_mount	*mp,
+	struct xfs_qoff_logitem	*start,
+	uint			flags)
 {
-	xfs_qoff_logitem_t	*qf;
+	struct xfs_qoff_logitem	*qf;
 
-	qf = (xfs_qoff_logitem_t*) kmem_zalloc(sizeof(xfs_qoff_logitem_t), KM_SLEEP);
+	qf = kmem_zalloc(sizeof(struct xfs_qoff_logitem), KM_SLEEP);
 
 	xfs_log_item_init(mp, &qf->qql_item, XFS_LI_QUOTAOFF, start ?
 			&xfs_qm_qoffend_logitem_ops : &xfs_qm_qoff_logitem_ops);
@@ -587,5 +526,5 @@ xfs_qm_qoff_logitem_init(
 	qf->qql_format.qf_type = XFS_LI_QUOTAOFF;
 	qf->qql_format.qf_flags = flags;
 	qf->qql_start_lip = start;
-	return (qf);
+	return qf;
 }
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 67c018392d6..f8e854b4fde 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -23,25 +23,18 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
 #include "xfs_alloc.h"
-#include "xfs_dmapi.h"
 #include "xfs_quota.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
-#include "xfs_btree.h"
 #include "xfs_ialloc.h"
 #include "xfs_itable.h"
 #include "xfs_rtalloc.h"
 #include "xfs_error.h"
 #include "xfs_bmap.h"
-#include "xfs_rw.h"
 #include "xfs_attr.h"
 #include "xfs_buf_item.h"
 #include "xfs_trans_space.h"
@@ -62,8 +55,6 @@ uint		ndquot;
 kmem_zone_t	*qm_dqzone;
 kmem_zone_t	*qm_dqtrxzone;
 
-static cred_t	xfs_zerocr;
-
 STATIC void	xfs_qm_list_init(xfs_dqlist_t *, char *, int);
 STATIC void	xfs_qm_list_destroy(xfs_dqlist_t *);
 
@@ -844,7 +835,7 @@ xfs_qm_dqattach_locked(
 			xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP,
 						flags & XFS_QMOPT_DQALLOC,
 						ip->i_udquot, &ip->i_gdquot) :
-			xfs_qm_dqattach_one(ip, ip->i_d.di_projid, XFS_DQ_PROJ,
+			xfs_qm_dqattach_one(ip, xfs_get_projid(ip), XFS_DQ_PROJ,
 						flags & XFS_QMOPT_DQALLOC,
 						ip->i_udquot, &ip->i_gdquot);
 		/*
@@ -1206,87 +1197,6 @@ xfs_qm_list_destroy(
 	mutex_destroy(&(list->qh_lock));
 }
 
-
-/*
- * Stripped down version of dqattach. This doesn't attach, or even look at the
- * dquots attached to the inode. The rationale is that there won't be any
- * attached at the time this is called from quotacheck.
- */
-STATIC int
-xfs_qm_dqget_noattach(
-	xfs_inode_t	*ip,
-	xfs_dquot_t	**O_udqpp,
-	xfs_dquot_t	**O_gdqpp)
-{
-	int		error;
-	xfs_mount_t	*mp;
-	xfs_dquot_t	*udqp, *gdqp;
-
-	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-	mp = ip->i_mount;
-	udqp = NULL;
-	gdqp = NULL;
-
-	if (XFS_IS_UQUOTA_ON(mp)) {
-		ASSERT(ip->i_udquot == NULL);
-		/*
-		 * We want the dquot allocated if it doesn't exist.
-		 */
-		if ((error = xfs_qm_dqget(mp, ip, ip->i_d.di_uid, XFS_DQ_USER,
-					 XFS_QMOPT_DQALLOC | XFS_QMOPT_DOWARN,
-					 &udqp))) {
-			/*
-			 * Shouldn't be able to turn off quotas here.
-			 */
-			ASSERT(error != ESRCH);
-			ASSERT(error != ENOENT);
-			return error;
-		}
-		ASSERT(udqp);
-	}
-
-	if (XFS_IS_OQUOTA_ON(mp)) {
-		ASSERT(ip->i_gdquot == NULL);
-		if (udqp)
-			xfs_dqunlock(udqp);
-		error = XFS_IS_GQUOTA_ON(mp) ?
-				xfs_qm_dqget(mp, ip,
-					     ip->i_d.di_gid, XFS_DQ_GROUP,
-					     XFS_QMOPT_DQALLOC|XFS_QMOPT_DOWARN,
-					     &gdqp) :
-				xfs_qm_dqget(mp, ip,
-					     ip->i_d.di_projid, XFS_DQ_PROJ,
-					     XFS_QMOPT_DQALLOC|XFS_QMOPT_DOWARN,
-					     &gdqp);
-		if (error) {
-			if (udqp)
-				xfs_qm_dqrele(udqp);
-			ASSERT(error != ESRCH);
-			ASSERT(error != ENOENT);
-			return error;
-		}
-		ASSERT(gdqp);
-
-		/* Reacquire the locks in the right order */
-		if (udqp) {
-			if (! xfs_qm_dqlock_nowait(udqp)) {
-				xfs_dqunlock(gdqp);
-				xfs_dqlock(udqp);
-				xfs_dqlock(gdqp);
-			}
-		}
-	}
-
-	*O_udqpp = udqp;
-	*O_gdqpp = gdqp;
-
-#ifdef QUOTADEBUG
-	if (udqp) ASSERT(XFS_DQ_IS_LOCKED(udqp));
-	if (gdqp) ASSERT(XFS_DQ_IS_LOCKED(gdqp));
-#endif
-	return 0;
-}
-
 /*
  * Create an inode and return with a reference already taken, but unlocked
  * This is how we create quota inodes
@@ -1312,8 +1222,8 @@ xfs_qm_qino_alloc(
 		return error;
 	}
 
-	if ((error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0,
-				   &xfs_zerocr, 0, 1, ip, &committed))) {
+	error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip, &committed);
+	if (error) {
 		xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
 				 XFS_TRANS_ABORT);
 		return error;
@@ -1497,7 +1407,7 @@ xfs_qm_dqiterate(
 				  maxlblkcnt - lblkno,
 				  XFS_BMAPI_METADATA,
 				  NULL,
-				  0, map, &nmaps, NULL, NULL);
+				  0, map, &nmaps, NULL);
 		xfs_iunlock(qip, XFS_ILOCK_SHARED);
 		if (error)
 			break;
@@ -1523,7 +1433,7 @@ xfs_qm_dqiterate(
 				rablkcnt =  map[i+1].br_blockcount;
 				rablkno = map[i+1].br_startblock;
 				while (rablkcnt--) {
-					xfs_baread(mp->m_ddev_targp,
+					xfs_buf_readahead(mp->m_ddev_targp,
 					       XFS_FSB_TO_DADDR(mp, rablkno),
 					       mp->m_quotainfo->qi_dqchunklen);
 					rablkno++;
@@ -1553,18 +1463,34 @@ xfs_qm_dqiterate(
 
 /*
  * Called by dqusage_adjust in doing a quotacheck.
- * Given the inode, and a dquot (either USR or GRP, doesn't matter),
- * this updates its incore copy as well as the buffer copy. This is
- * so that once the quotacheck is done, we can just log all the buffers,
- * as opposed to logging numerous updates to individual dquots.
+ *
+ * Given the inode, and a dquot id this updates both the incore dqout as well
+ * as the buffer copy. This is so that once the quotacheck is done, we can
+ * just log all the buffers, as opposed to logging numerous updates to
+ * individual dquots.
  */
-STATIC void
+STATIC int
 xfs_qm_quotacheck_dqadjust(
-	xfs_dquot_t		*dqp,
+	struct xfs_inode	*ip,
+	xfs_dqid_t		id,
+	uint			type,
 	xfs_qcnt_t		nblks,
 	xfs_qcnt_t		rtblks)
 {
-	ASSERT(XFS_DQ_IS_LOCKED(dqp));
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_dquot	*dqp;
+	int			error;
+
+	error = xfs_qm_dqget(mp, ip, id, type,
+			     XFS_QMOPT_DQALLOC | XFS_QMOPT_DOWARN, &dqp);
+	if (error) {
+		/*
+		 * Shouldn't be able to turn off quotas here.
+		 */
+		ASSERT(error != ESRCH);
+		ASSERT(error != ENOENT);
+		return error;
+	}
 
 	trace_xfs_dqadjust(dqp);
 
@@ -1589,11 +1515,13 @@ xfs_qm_quotacheck_dqadjust(
 	 * There are no timers for the default values set in the root dquot.
 	 */
 	if (dqp->q_core.d_id) {
-		xfs_qm_adjust_dqlimits(dqp->q_mount, &dqp->q_core);
-		xfs_qm_adjust_dqtimers(dqp->q_mount, &dqp->q_core);
+		xfs_qm_adjust_dqlimits(mp, &dqp->q_core);
+		xfs_qm_adjust_dqtimers(mp, &dqp->q_core);
 	}
 
 	dqp->dq_flags |= XFS_DQ_DIRTY;
+	xfs_qm_dqput(dqp);
+	return 0;
 }
 
 STATIC int
@@ -1636,8 +1564,7 @@ xfs_qm_dqusage_adjust(
 	int		*res)		/* result code value */
 {
 	xfs_inode_t	*ip;
-	xfs_dquot_t	*udqp, *gdqp;
-	xfs_qcnt_t	nblks, rtblks;
+	xfs_qcnt_t	nblks, rtblks = 0;
 	int		error;
 
 	ASSERT(XFS_IS_QUOTA_RUNNING(mp));
@@ -1657,49 +1584,24 @@ xfs_qm_dqusage_adjust(
 	 * the case in all other instances. It's OK that we do this because
 	 * quotacheck is done only at mount time.
 	 */
-	if ((error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_EXCL, &ip))) {
+	error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_EXCL, &ip);
+	if (error) {
 		*res = BULKSTAT_RV_NOTHING;
 		return error;
 	}
 
-	/*
-	 * Obtain the locked dquots. In case of an error (eg. allocation
-	 * fails for ENOSPC), we return the negative of the error number
-	 * to bulkstat, so that it can get propagated to quotacheck() and
-	 * making us disable quotas for the file system.
-	 */
-	if ((error = xfs_qm_dqget_noattach(ip, &udqp, &gdqp))) {
-		xfs_iput(ip, XFS_ILOCK_EXCL);
-		*res = BULKSTAT_RV_GIVEUP;
-		return error;
-	}
+	ASSERT(ip->i_delayed_blks == 0);
 
-	rtblks = 0;
-	if (! XFS_IS_REALTIME_INODE(ip)) {
-		nblks = (xfs_qcnt_t)ip->i_d.di_nblocks;
-	} else {
+	if (XFS_IS_REALTIME_INODE(ip)) {
 		/*
 		 * Walk thru the extent list and count the realtime blocks.
 		 */
-		if ((error = xfs_qm_get_rtblks(ip, &rtblks))) {
-			xfs_iput(ip, XFS_ILOCK_EXCL);
-			if (udqp)
-				xfs_qm_dqput(udqp);
-			if (gdqp)
-				xfs_qm_dqput(gdqp);
-			*res = BULKSTAT_RV_GIVEUP;
-			return error;
-		}
-		nblks = (xfs_qcnt_t)ip->i_d.di_nblocks - rtblks;
+		error = xfs_qm_get_rtblks(ip, &rtblks);
+		if (error)
+			goto error0;
 	}
-	ASSERT(ip->i_delayed_blks == 0);
 
-	/*
-	 * We can't release the inode while holding its dquot locks.
-	 * The inode can go into inactive and might try to acquire the dquotlocks.
-	 * So, just unlock here and do a vn_rele at the end.
-	 */
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	nblks = (xfs_qcnt_t)ip->i_d.di_nblocks - rtblks;
 
 	/*
 	 * Add the (disk blocks and inode) resources occupied by this
@@ -1714,26 +1616,36 @@ xfs_qm_dqusage_adjust(
 	 * and quotaoffs don't race. (Quotachecks happen at mount time only).
 	 */
 	if (XFS_IS_UQUOTA_ON(mp)) {
-		ASSERT(udqp);
-		xfs_qm_quotacheck_dqadjust(udqp, nblks, rtblks);
-		xfs_qm_dqput(udqp);
+		error = xfs_qm_quotacheck_dqadjust(ip, ip->i_d.di_uid,
+						   XFS_DQ_USER, nblks, rtblks);
+		if (error)
+			goto error0;
 	}
-	if (XFS_IS_OQUOTA_ON(mp)) {
-		ASSERT(gdqp);
-		xfs_qm_quotacheck_dqadjust(gdqp, nblks, rtblks);
-		xfs_qm_dqput(gdqp);
+
+	if (XFS_IS_GQUOTA_ON(mp)) {
+		error = xfs_qm_quotacheck_dqadjust(ip, ip->i_d.di_gid,
+						   XFS_DQ_GROUP, nblks, rtblks);
+		if (error)
+			goto error0;
 	}
-	/*
-	 * Now release the inode. This will send it to 'inactive', and
-	 * possibly even free blocks.
-	 */
-	IRELE(ip);
 
-	/*
-	 * Goto next inode.
-	 */
+	if (XFS_IS_PQUOTA_ON(mp)) {
+		error = xfs_qm_quotacheck_dqadjust(ip, xfs_get_projid(ip),
+						   XFS_DQ_PROJ, nblks, rtblks);
+		if (error)
+			goto error0;
+	}
+
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	IRELE(ip);
 	*res = BULKSTAT_RV_DIDONE;
 	return 0;
+
+error0:
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	IRELE(ip);
+	*res = BULKSTAT_RV_GIVEUP;
+	return error;
 }
 
 /*
@@ -2229,7 +2141,7 @@ xfs_qm_write_sb_changes(
 
 
 /*
- * Given an inode, a uid and gid (from cred_t) make sure that we have
+ * Given an inode, a uid, gid and prid make sure that we have
  * allocated relevant dquot(s) on disk, and that we won't exceed inode
  * quotas by creating this file.
  * This also attaches dquot(s) to the given inode after locking it,
@@ -2337,7 +2249,7 @@ xfs_qm_vop_dqalloc(
 			xfs_dqunlock(gq);
 		}
 	} else if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) {
-		if (ip->i_d.di_projid != prid) {
+		if (xfs_get_projid(ip) != prid) {
 			xfs_iunlock(ip, lockflags);
 			if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid,
 						 XFS_DQ_PROJ,
@@ -2459,7 +2371,7 @@ xfs_qm_vop_chown_reserve(
 	}
 	if (XFS_IS_OQUOTA_ON(ip->i_mount) && gdqp) {
 		if (XFS_IS_PQUOTA_ON(ip->i_mount) &&
-		     ip->i_d.di_projid != be32_to_cpu(gdqp->q_core.d_id))
+		     xfs_get_projid(ip) != be32_to_cpu(gdqp->q_core.d_id))
 			prjflags = XFS_QMOPT_ENOSPC;
 
 		if (prjflags ||
@@ -2563,7 +2475,7 @@ xfs_qm_vop_create_dqattach(
 		ip->i_gdquot = gdqp;
 		ASSERT(XFS_IS_OQUOTA_ON(mp));
 		ASSERT((XFS_IS_GQUOTA_ON(mp) ?
-			ip->i_d.di_gid : ip->i_d.di_projid) ==
+			ip->i_d.di_gid : xfs_get_projid(ip)) ==
 				be32_to_cpu(gdqp->q_core.d_id));
 		xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1);
 	}
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
index 97b410c1279..45b5cb1788a 100644
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -23,25 +23,15 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
 #include "xfs_alloc.h"
-#include "xfs_dmapi.h"
 #include "xfs_quota.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
 #include "xfs_inode.h"
-#include "xfs_ialloc.h"
 #include "xfs_itable.h"
-#include "xfs_btree.h"
 #include "xfs_bmap.h"
 #include "xfs_rtalloc.h"
 #include "xfs_error.h"
-#include "xfs_rw.h"
 #include "xfs_attr.h"
 #include "xfs_buf_item.h"
 #include "xfs_qm.h"
@@ -91,7 +81,7 @@ xfs_qm_statvfs(
 	xfs_mount_t		*mp = ip->i_mount;
 	xfs_dquot_t		*dqp;
 
-	if (!xfs_qm_dqget(mp, NULL, ip->i_d.di_projid, XFS_DQ_PROJ, 0, &dqp)) {
+	if (!xfs_qm_dqget(mp, NULL, xfs_get_projid(ip), XFS_DQ_PROJ, 0, &dqp)) {
 		xfs_fill_statvfs_from_dquot(statp, &dqp->q_core);
 		xfs_qm_dqput(dqp);
 	}
diff --git a/fs/xfs/quota/xfs_qm_stats.c b/fs/xfs/quota/xfs_qm_stats.c
index 3d1fc79532e..8671a0b3264 100644
--- a/fs/xfs/quota/xfs_qm_stats.c
+++ b/fs/xfs/quota/xfs_qm_stats.c
@@ -23,25 +23,15 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
 #include "xfs_alloc.h"
-#include "xfs_dmapi.h"
 #include "xfs_quota.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
 #include "xfs_inode.h"
-#include "xfs_ialloc.h"
 #include "xfs_itable.h"
 #include "xfs_bmap.h"
-#include "xfs_btree.h"
 #include "xfs_rtalloc.h"
 #include "xfs_error.h"
-#include "xfs_rw.h"
 #include "xfs_attr.h"
 #include "xfs_buf_item.h"
 #include "xfs_qm.h"
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index b4487764e92..bdebc183223 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -26,25 +26,15 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
 #include "xfs_alloc.h"
-#include "xfs_dmapi.h"
 #include "xfs_quota.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
 #include "xfs_inode.h"
-#include "xfs_ialloc.h"
 #include "xfs_itable.h"
 #include "xfs_bmap.h"
-#include "xfs_btree.h"
 #include "xfs_rtalloc.h"
 #include "xfs_error.h"
-#include "xfs_rw.h"
 #include "xfs_attr.h"
 #include "xfs_buf_item.h"
 #include "xfs_utils.h"
@@ -248,40 +238,74 @@ out_unlock:
 	return error;
 }
 
+STATIC int
+xfs_qm_scall_trunc_qfile(
+	struct xfs_mount	*mp,
+	xfs_ino_t		ino)
+{
+	struct xfs_inode	*ip;
+	struct xfs_trans	*tp;
+	int			error;
+
+	if (ino == NULLFSINO)
+		return 0;
+
+	error = xfs_iget(mp, NULL, ino, 0, 0, &ip);
+	if (error)
+		return error;
+
+	xfs_ilock(ip, XFS_IOLOCK_EXCL);
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_TRUNCATE_FILE);
+	error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
+				  XFS_TRANS_PERM_LOG_RES,
+				  XFS_ITRUNCATE_LOG_COUNT);
+	if (error) {
+		xfs_trans_cancel(tp, 0);
+		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+		goto out_put;
+	}
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip);
+
+	error = xfs_itruncate_finish(&tp, ip, 0, XFS_DATA_FORK, 1);
+	if (error) {
+		xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
+				     XFS_TRANS_ABORT);
+		goto out_unlock;
+	}
+
+	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+
+out_unlock:
+	xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+out_put:
+	IRELE(ip);
+	return error;
+}
+
 int
 xfs_qm_scall_trunc_qfiles(
 	xfs_mount_t	*mp,
 	uint		flags)
 {
 	int		error = 0, error2 = 0;
-	xfs_inode_t	*qip;
 
 	if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) {
 		qdprintk("qtrunc flags=%x m_qflags=%x\n", flags, mp->m_qflags);
 		return XFS_ERROR(EINVAL);
 	}
 
-	if ((flags & XFS_DQ_USER) && mp->m_sb.sb_uquotino != NULLFSINO) {
-		error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, 0, 0, &qip);
-		if (!error) {
-			error = xfs_truncate_file(mp, qip);
-			IRELE(qip);
-		}
-	}
-
-	if ((flags & (XFS_DQ_GROUP|XFS_DQ_PROJ)) &&
-	    mp->m_sb.sb_gquotino != NULLFSINO) {
-		error2 = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, 0, 0, &qip);
-		if (!error2) {
-			error2 = xfs_truncate_file(mp, qip);
-			IRELE(qip);
-		}
-	}
+	if (flags & XFS_DQ_USER)
+		error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_uquotino);
+	if (flags & (XFS_DQ_GROUP|XFS_DQ_PROJ))
+		error2 = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_gquotino);
 
 	return error ? error : error2;
 }
 
-
 /*
  * Switch on (a given) quota enforcement for a filesystem.  This takes
  * effect immediately.
@@ -786,9 +810,9 @@ xfs_qm_export_dquot(
 	}
 
 #ifdef DEBUG
-	if (((XFS_IS_UQUOTA_ENFORCED(mp) && dst->d_flags == XFS_USER_QUOTA) ||
+	if (((XFS_IS_UQUOTA_ENFORCED(mp) && dst->d_flags == FS_USER_QUOTA) ||
 	     (XFS_IS_OQUOTA_ENFORCED(mp) &&
-			(dst->d_flags & (XFS_PROJ_QUOTA | XFS_GROUP_QUOTA)))) &&
+			(dst->d_flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)))) &&
 	    dst->d_id != 0) {
 		if (((int) dst->d_bcount >= (int) dst->d_blk_softlimit) &&
 		    (dst->d_blk_softlimit > 0)) {
@@ -809,17 +833,17 @@ xfs_qm_export_qtype_flags(
 	/*
 	 * Can't be more than one, or none.
 	 */
-	ASSERT((flags & (XFS_PROJ_QUOTA | XFS_USER_QUOTA)) !=
-		(XFS_PROJ_QUOTA | XFS_USER_QUOTA));
-	ASSERT((flags & (XFS_PROJ_QUOTA | XFS_GROUP_QUOTA)) !=
-		(XFS_PROJ_QUOTA | XFS_GROUP_QUOTA));
-	ASSERT((flags & (XFS_USER_QUOTA | XFS_GROUP_QUOTA)) !=
-		(XFS_USER_QUOTA | XFS_GROUP_QUOTA));
-	ASSERT((flags & (XFS_PROJ_QUOTA|XFS_USER_QUOTA|XFS_GROUP_QUOTA)) != 0);
+	ASSERT((flags & (FS_PROJ_QUOTA | FS_USER_QUOTA)) !=
+		(FS_PROJ_QUOTA | FS_USER_QUOTA));
+	ASSERT((flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)) !=
+		(FS_PROJ_QUOTA | FS_GROUP_QUOTA));
+	ASSERT((flags & (FS_USER_QUOTA | FS_GROUP_QUOTA)) !=
+		(FS_USER_QUOTA | FS_GROUP_QUOTA));
+	ASSERT((flags & (FS_PROJ_QUOTA|FS_USER_QUOTA|FS_GROUP_QUOTA)) != 0);
 
 	return (flags & XFS_DQ_USER) ?
-		XFS_USER_QUOTA : (flags & XFS_DQ_PROJ) ?
-			XFS_PROJ_QUOTA : XFS_GROUP_QUOTA;
+		FS_USER_QUOTA : (flags & XFS_DQ_PROJ) ?
+			FS_PROJ_QUOTA : FS_GROUP_QUOTA;
 }
 
 STATIC uint
@@ -830,16 +854,16 @@ xfs_qm_export_flags(
 
 	uflags = 0;
 	if (flags & XFS_UQUOTA_ACCT)
-		uflags |= XFS_QUOTA_UDQ_ACCT;
+		uflags |= FS_QUOTA_UDQ_ACCT;
 	if (flags & XFS_PQUOTA_ACCT)
-		uflags |= XFS_QUOTA_PDQ_ACCT;
+		uflags |= FS_QUOTA_PDQ_ACCT;
 	if (flags & XFS_GQUOTA_ACCT)
-		uflags |= XFS_QUOTA_GDQ_ACCT;
+		uflags |= FS_QUOTA_GDQ_ACCT;
 	if (flags & XFS_UQUOTA_ENFD)
-		uflags |= XFS_QUOTA_UDQ_ENFD;
+		uflags |= FS_QUOTA_UDQ_ENFD;
 	if (flags & (XFS_OQUOTA_ENFD)) {
 		uflags |= (flags & XFS_GQUOTA_ACCT) ?
-			XFS_QUOTA_GDQ_ENFD : XFS_QUOTA_PDQ_ENFD;
+			FS_QUOTA_GDQ_ENFD : FS_QUOTA_PDQ_ENFD;
 	}
 	return (uflags);
 }
@@ -851,21 +875,14 @@ xfs_dqrele_inode(
 	struct xfs_perag	*pag,
 	int			flags)
 {
-	int			error;
-
 	/* skip quota inodes */
 	if (ip == ip->i_mount->m_quotainfo->qi_uquotaip ||
 	    ip == ip->i_mount->m_quotainfo->qi_gquotaip) {
 		ASSERT(ip->i_udquot == NULL);
 		ASSERT(ip->i_gdquot == NULL);
-		read_unlock(&pag->pag_ici_lock);
 		return 0;
 	}
 
-	error = xfs_sync_inode_valid(ip, pag);
-	if (error)
-		return error;
-
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
 	if ((flags & XFS_UQUOTA_ACCT) && ip->i_udquot) {
 		xfs_qm_dqrele(ip->i_udquot);
@@ -875,8 +892,7 @@ xfs_dqrele_inode(
 		xfs_qm_dqrele(ip->i_gdquot);
 		ip->i_gdquot = NULL;
 	}
-	xfs_iput(ip, XFS_ILOCK_EXCL);
-
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	return 0;
 }
 
@@ -893,8 +909,7 @@ xfs_qm_dqrele_all_inodes(
 	uint		 flags)
 {
 	ASSERT(mp->m_quotainfo);
-	xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags,
-				XFS_ICI_NO_TAG, 0, NULL);
+	xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags);
 }
 
 /*------------------------------------------------------------------------*/
@@ -1143,13 +1158,14 @@ xfs_qm_internalqcheck_adjust(
 	 * of those now.
 	 */
 	if (! ipreleased) {
-		xfs_iput(ip, lock_flags);
+		xfs_iunlock(ip, lock_flags);
+		IRELE(ip);
 		ipreleased = B_TRUE;
 		goto again;
 	}
 	xfs_qm_internalqcheck_get_dquots(mp,
 					(xfs_dqid_t) ip->i_d.di_uid,
-					(xfs_dqid_t) ip->i_d.di_projid,
+					(xfs_dqid_t) xfs_get_projid(ip),
 					(xfs_dqid_t) ip->i_d.di_gid,
 					&ud, &gd);
 	if (XFS_IS_UQUOTA_ON(mp)) {
@@ -1160,7 +1176,8 @@ xfs_qm_internalqcheck_adjust(
 		ASSERT(gd);
 		xfs_qm_internalqcheck_dqadjust(ip, gd);
 	}
-	xfs_iput(ip, lock_flags);
+	xfs_iunlock(ip, lock_flags);
+	IRELE(ip);
 	*res = BULKSTAT_RV_DIDONE;
 	return (0);
 }
diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c
index 061d827da33..7de91d1b75c 100644
--- a/fs/xfs/quota/xfs_trans_dquot.c
+++ b/fs/xfs/quota/xfs_trans_dquot.c
@@ -23,25 +23,15 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
 #include "xfs_alloc.h"
-#include "xfs_dmapi.h"
 #include "xfs_quota.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_dinode.h"
 #include "xfs_inode.h"
-#include "xfs_ialloc.h"
 #include "xfs_itable.h"
-#include "xfs_btree.h"
 #include "xfs_bmap.h"
 #include "xfs_rtalloc.h"
 #include "xfs_error.h"
-#include "xfs_rw.h"
 #include "xfs_attr.h"
 #include "xfs_buf_item.h"
 #include "xfs_trans_priv.h"
@@ -59,16 +49,14 @@ xfs_trans_dqjoin(
 	xfs_trans_t	*tp,
 	xfs_dquot_t	*dqp)
 {
-	xfs_dq_logitem_t    *lp = &dqp->q_logitem;
-
 	ASSERT(dqp->q_transp != tp);
 	ASSERT(XFS_DQ_IS_LOCKED(dqp));
-	ASSERT(lp->qli_dquot == dqp);
+	ASSERT(dqp->q_logitem.qli_dquot == dqp);
 
 	/*
 	 * Get a log_item_desc to point at the new item.
 	 */
-	(void) xfs_trans_add_item(tp, (xfs_log_item_t*)(lp));
+	xfs_trans_add_item(tp, &dqp->q_logitem.qli_item);
 
 	/*
 	 * Initialize i_transp so we can later determine if this dquot is
@@ -93,16 +81,11 @@ xfs_trans_log_dquot(
 	xfs_trans_t	*tp,
 	xfs_dquot_t	*dqp)
 {
-	xfs_log_item_desc_t	*lidp;
-
 	ASSERT(dqp->q_transp == tp);
 	ASSERT(XFS_DQ_IS_LOCKED(dqp));
 
-	lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)(&dqp->q_logitem));
-	ASSERT(lidp != NULL);
-
 	tp->t_flags |= XFS_TRANS_DIRTY;
-	lidp->lid_flags |= XFS_LID_DIRTY;
+	dqp->q_logitem.qli_item.li_desc->lid_flags |= XFS_LID_DIRTY;
 }
 
 /*
@@ -874,9 +857,8 @@ xfs_trans_get_qoff_item(
 	/*
 	 * Get a log_item_desc to point at the new item.
 	 */
-	(void) xfs_trans_add_item(tp, (xfs_log_item_t*)q);
-
-	return (q);
+	xfs_trans_add_item(tp, &q->qql_item);
+	return q;
 }
 
 
@@ -890,13 +872,8 @@ xfs_trans_log_quotaoff_item(
 	xfs_trans_t		*tp,
 	xfs_qoff_logitem_t	*qlp)
 {
-	xfs_log_item_desc_t	*lidp;
-
-	lidp = xfs_trans_find_item(tp, (xfs_log_item_t *)qlp);
-	ASSERT(lidp != NULL);
-
 	tp->t_flags |= XFS_TRANS_DIRTY;
-	lidp->lid_flags |= XFS_LID_DIRTY;
+	qlp->qql_item.li_desc->lid_flags |= XFS_LID_DIRTY;
 }
 
 STATIC void
diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c
index 3f3610a7ee0..975aa10e1a4 100644
--- a/fs/xfs/support/debug.c
+++ b/fs/xfs/support/debug.c
@@ -22,7 +22,6 @@
 #include "xfs_sb.h"
 #include "xfs_inum.h"
 #include "xfs_ag.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_error.h"
 
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 4917d4eed4e..63c7a1a6c02 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -230,6 +230,15 @@ typedef struct xfs_perag {
 	rwlock_t	pag_ici_lock;	/* incore inode lock */
 	struct radix_tree_root pag_ici_root;	/* incore inode cache root */
 	int		pag_ici_reclaimable;	/* reclaimable inodes */
+	struct mutex	pag_ici_reclaim_lock;	/* serialisation point */
+	unsigned long	pag_ici_reclaim_cursor;	/* reclaim restart point */
+
+	/* buffer cache index */
+	spinlock_t	pag_buf_lock;	/* lock for pag_buf_tree */
+	struct rb_root	pag_buf_tree;	/* ordered tree of active buffers */
+
+	/* for rcu-safe freeing */
+	struct rcu_head	rcu_head;
 #endif
 	int		pagb_count;	/* pagb slots in use */
 } xfs_perag_t;
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index a7fbe8a99b1..112abc439ca 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -24,18 +24,13 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_btree.h"
-#include "xfs_ialloc.h"
 #include "xfs_alloc.h"
 #include "xfs_error.h"
 #include "xfs_trace.h"
@@ -680,7 +675,7 @@ xfs_alloc_ag_vextent_near(
 	xfs_agblock_t	gtbnoa;		/* aligned ... */
 	xfs_extlen_t	gtdiff;		/* difference to right side entry */
 	xfs_extlen_t	gtlen;		/* length of right side entry */
-	xfs_extlen_t	gtlena;		/* aligned ... */
+	xfs_extlen_t	gtlena = 0;	/* aligned ... */
 	xfs_agblock_t	gtnew;		/* useful start bno of right side */
 	int		error;		/* error code */
 	int		i;		/* result code, temporary */
@@ -688,10 +683,8 @@ xfs_alloc_ag_vextent_near(
 	xfs_agblock_t	ltbno;		/* start bno of left side entry */
 	xfs_agblock_t	ltbnoa;		/* aligned ... */
 	xfs_extlen_t	ltdiff;		/* difference to left side entry */
-	/*REFERENCED*/
-	xfs_agblock_t	ltend;		/* end bno of left side entry */
 	xfs_extlen_t	ltlen;		/* length of left side entry */
-	xfs_extlen_t	ltlena;		/* aligned ... */
+	xfs_extlen_t	ltlena = 0;	/* aligned ... */
 	xfs_agblock_t	ltnew;		/* useful start bno of left side */
 	xfs_extlen_t	rlen;		/* length of returned extent */
 #if defined(DEBUG) && defined(__KERNEL__)
@@ -814,8 +807,7 @@ xfs_alloc_ag_vextent_near(
 		if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
 			goto error0;
 		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-		ltend = ltbno + ltlen;
-		ASSERT(ltend <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
+		ASSERT(ltbno + ltlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
 		args->len = blen;
 		if (!xfs_alloc_fix_minleft(args)) {
 			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
@@ -828,7 +820,7 @@ xfs_alloc_ag_vextent_near(
 		 */
 		args->agbno = bnew;
 		ASSERT(bnew >= ltbno);
-		ASSERT(bnew + blen <= ltend);
+		ASSERT(bnew + blen <= ltbno + ltlen);
 		/*
 		 * Set up a cursor for the by-bno tree.
 		 */
@@ -1157,7 +1149,6 @@ xfs_alloc_ag_vextent_near(
 	/*
 	 * Fix up the length and compute the useful address.
 	 */
-	ltend = ltbno + ltlen;
 	args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
 	xfs_alloc_fix_len(args);
 	if (!xfs_alloc_fix_minleft(args)) {
@@ -1170,7 +1161,7 @@ xfs_alloc_ag_vextent_near(
 	(void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment, ltbno,
 		ltlen, &ltnew);
 	ASSERT(ltnew >= ltbno);
-	ASSERT(ltnew + rlen <= ltend);
+	ASSERT(ltnew + rlen <= ltbno + ltlen);
 	ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
 	args->agbno = ltnew;
 	if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen,
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 6d05199b667..895009a9727 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -27,16 +27,16 @@ struct xfs_busy_extent;
 /*
  * Freespace allocation types.  Argument to xfs_alloc_[v]extent.
  */
-typedef enum xfs_alloctype
-{
-	XFS_ALLOCTYPE_ANY_AG,		/* allocate anywhere, use rotor */
-	XFS_ALLOCTYPE_FIRST_AG,		/* ... start at ag 0 */
-	XFS_ALLOCTYPE_START_AG,		/* anywhere, start in this a.g. */
-	XFS_ALLOCTYPE_THIS_AG,		/* anywhere in this a.g. */
-	XFS_ALLOCTYPE_START_BNO,	/* near this block else anywhere */
-	XFS_ALLOCTYPE_NEAR_BNO,		/* in this a.g. and near this block */
-	XFS_ALLOCTYPE_THIS_BNO		/* at exactly this block */
-} xfs_alloctype_t;
+#define XFS_ALLOCTYPE_ANY_AG	0x01	/* allocate anywhere, use rotor */
+#define XFS_ALLOCTYPE_FIRST_AG	0x02	/* ... start at ag 0 */
+#define XFS_ALLOCTYPE_START_AG	0x04	/* anywhere, start in this a.g. */
+#define XFS_ALLOCTYPE_THIS_AG	0x08	/* anywhere in this a.g. */
+#define XFS_ALLOCTYPE_START_BNO	0x10	/* near this block else anywhere */
+#define XFS_ALLOCTYPE_NEAR_BNO	0x20	/* in this a.g. and near this block */
+#define XFS_ALLOCTYPE_THIS_BNO	0x40	/* at exactly this block */
+
+/* this should become an enum again when the tracing code is fixed */
+typedef unsigned int xfs_alloctype_t;
 
 #define XFS_ALLOC_TYPES \
 	{ XFS_ALLOCTYPE_ANY_AG,		"ANY_AG" }, \
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index 83f49421875..3916925e258 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -24,19 +24,14 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_btree.h"
 #include "xfs_btree_trace.h"
-#include "xfs_ialloc.h"
 #include "xfs_alloc.h"
 #include "xfs_error.h"
 #include "xfs_trace.h"
@@ -285,38 +280,6 @@ xfs_allocbt_key_diff(
 	return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock;
 }
 
-STATIC int
-xfs_allocbt_kill_root(
-	struct xfs_btree_cur	*cur,
-	struct xfs_buf		*bp,
-	int			level,
-	union xfs_btree_ptr	*newroot)
-{
-	int			error;
-
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
-	XFS_BTREE_STATS_INC(cur, killroot);
-
-	/*
-	 * Update the root pointer, decreasing the level by 1 and then
-	 * free the old root.
-	 */
-	xfs_allocbt_set_root(cur, newroot, -1);
-	error = xfs_allocbt_free_block(cur, bp);
-	if (error) {
-		XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
-		return error;
-	}
-
-	XFS_BTREE_STATS_INC(cur, free);
-
-	xfs_btree_setbuf(cur, level, NULL);
-	cur->bc_nlevels--;
-
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-	return 0;
-}
-
 #ifdef DEBUG
 STATIC int
 xfs_allocbt_keys_inorder(
@@ -428,7 +391,6 @@ static const struct xfs_btree_ops xfs_allocbt_ops = {
 
 	.dup_cursor		= xfs_allocbt_dup_cursor,
 	.set_root		= xfs_allocbt_set_root,
-	.kill_root		= xfs_allocbt_kill_root,
 	.alloc_block		= xfs_allocbt_alloc_block,
 	.free_block		= xfs_allocbt_free_block,
 	.update_lastrec		= xfs_allocbt_update_lastrec,
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index b9c196a53c4..c8637537881 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -25,19 +25,13 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_alloc.h"
-#include "xfs_btree.h"
 #include "xfs_inode_item.h"
 #include "xfs_bmap.h"
 #include "xfs_attr.h"
@@ -325,8 +319,7 @@ xfs_attr_set_int(
 		return (error);
 	}
 
-	xfs_trans_ijoin(args.trans, dp, XFS_ILOCK_EXCL);
-	xfs_trans_ihold(args.trans, dp);
+	xfs_trans_ijoin(args.trans, dp);
 
 	/*
 	 * If the attribute list is non-existent or a shortform list,
@@ -362,16 +355,15 @@ xfs_attr_set_int(
 			if (mp->m_flags & XFS_MOUNT_WSYNC) {
 				xfs_trans_set_sync(args.trans);
 			}
+
+			if (!error && (flags & ATTR_KERNOTIME) == 0) {
+				xfs_trans_ichgtime(args.trans, dp,
+							XFS_ICHGTIME_CHG);
+			}
 			err2 = xfs_trans_commit(args.trans,
 						 XFS_TRANS_RELEASE_LOG_RES);
 			xfs_iunlock(dp, XFS_ILOCK_EXCL);
 
-			/*
-			 * Hit the inode change time.
-			 */
-			if (!error && (flags & ATTR_KERNOTIME) == 0) {
-				xfs_ichgtime(dp, XFS_ICHGTIME_CHG);
-			}
 			return(error == 0 ? err2 : error);
 		}
 
@@ -396,10 +388,8 @@ xfs_attr_set_int(
 		 * bmap_finish() may have committed the last trans and started
 		 * a new one.  We need the inode to be in all transactions.
 		 */
-		if (committed) {
-			xfs_trans_ijoin(args.trans, dp, XFS_ILOCK_EXCL);
-			xfs_trans_ihold(args.trans, dp);
-		}
+		if (committed)
+			xfs_trans_ijoin(args.trans, dp);
 
 		/*
 		 * Commit the leaf transformation.  We'll need another (linked)
@@ -429,6 +419,9 @@ xfs_attr_set_int(
 		xfs_trans_set_sync(args.trans);
 	}
 
+	if ((flags & ATTR_KERNOTIME) == 0)
+		xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG);
+
 	/*
 	 * Commit the last in the sequence of transactions.
 	 */
@@ -436,13 +429,6 @@ xfs_attr_set_int(
 	error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES);
 	xfs_iunlock(dp, XFS_ILOCK_EXCL);
 
-	/*
-	 * Hit the inode change time.
-	 */
-	if (!error && (flags & ATTR_KERNOTIME) == 0) {
-		xfs_ichgtime(dp, XFS_ICHGTIME_CHG);
-	}
-
 	return(error);
 
 out:
@@ -544,8 +530,7 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags)
 	 * No need to make quota reservations here. We expect to release some
 	 * blocks not allocate in the common case.
 	 */
-	xfs_trans_ijoin(args.trans, dp, XFS_ILOCK_EXCL);
-	xfs_trans_ihold(args.trans, dp);
+	xfs_trans_ijoin(args.trans, dp);
 
 	/*
 	 * Decide on what work routines to call based on the inode size.
@@ -577,6 +562,9 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags)
 		xfs_trans_set_sync(args.trans);
 	}
 
+	if ((flags & ATTR_KERNOTIME) == 0)
+		xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG);
+
 	/*
 	 * Commit the last in the sequence of transactions.
 	 */
@@ -584,13 +572,6 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags)
 	error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES);
 	xfs_iunlock(dp, XFS_ILOCK_EXCL);
 
-	/*
-	 * Hit the inode change time.
-	 */
-	if (!error && (flags & ATTR_KERNOTIME) == 0) {
-		xfs_ichgtime(dp, XFS_ICHGTIME_CHG);
-	}
-
 	return(error);
 
 out:
@@ -821,8 +802,7 @@ xfs_attr_inactive(xfs_inode_t *dp)
 	 * No need to make quota reservations here. We expect to release some
 	 * blocks, not allocate, in the common case.
 	 */
-	xfs_trans_ijoin(trans, dp, XFS_ILOCK_EXCL);
-	xfs_trans_ihold(trans, dp);
+	xfs_trans_ijoin(trans, dp);
 
 	/*
 	 * Decide on what work routines to call based on the inode size.
@@ -981,10 +961,8 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
 		 * bmap_finish() may have committed the last trans and started
 		 * a new one.  We need the inode to be in all transactions.
 		 */
-		if (committed) {
-			xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
-			xfs_trans_ihold(args->trans, dp);
-		}
+		if (committed)
+			xfs_trans_ijoin(args->trans, dp);
 
 		/*
 		 * Commit the current trans (including the inode) and start
@@ -1085,10 +1063,8 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
 			 * and started a new one.  We need the inode to be
 			 * in all transactions.
 			 */
-			if (committed) {
-				xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
-				xfs_trans_ihold(args->trans, dp);
-			}
+			if (committed)
+				xfs_trans_ijoin(args->trans, dp);
 		} else
 			xfs_da_buf_done(bp);
 
@@ -1161,10 +1137,8 @@ xfs_attr_leaf_removename(xfs_da_args_t *args)
 		 * bmap_finish() may have committed the last trans and started
 		 * a new one.  We need the inode to be in all transactions.
 		 */
-		if (committed) {
-			xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
-			xfs_trans_ihold(args->trans, dp);
-		}
+		if (committed)
+			xfs_trans_ijoin(args->trans, dp);
 	} else
 		xfs_da_buf_done(bp);
 	return(0);
@@ -1317,10 +1291,8 @@ restart:
 			 * and started a new one.  We need the inode to be
 			 * in all transactions.
 			 */
-			if (committed) {
-				xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
-				xfs_trans_ihold(args->trans, dp);
-			}
+			if (committed)
+				xfs_trans_ijoin(args->trans, dp);
 
 			/*
 			 * Commit the node conversion and start the next
@@ -1356,10 +1328,8 @@ restart:
 		 * bmap_finish() may have committed the last trans and started
 		 * a new one.  We need the inode to be in all transactions.
 		 */
-		if (committed) {
-			xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
-			xfs_trans_ihold(args->trans, dp);
-		}
+		if (committed)
+			xfs_trans_ijoin(args->trans, dp);
 	} else {
 		/*
 		 * Addition succeeded, update Btree hashvals.
@@ -1470,10 +1440,8 @@ restart:
 			 * and started a new one.  We need the inode to be
 			 * in all transactions.
 			 */
-			if (committed) {
-				xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
-				xfs_trans_ihold(args->trans, dp);
-			}
+			if (committed)
+				xfs_trans_ijoin(args->trans, dp);
 		}
 
 		/*
@@ -1604,10 +1572,8 @@ xfs_attr_node_removename(xfs_da_args_t *args)
 		 * bmap_finish() may have committed the last trans and started
 		 * a new one.  We need the inode to be in all transactions.
 		 */
-		if (committed) {
-			xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
-			xfs_trans_ihold(args->trans, dp);
-		}
+		if (committed)
+			xfs_trans_ijoin(args->trans, dp);
 
 		/*
 		 * Commit the Btree join operation and start a new trans.
@@ -1658,10 +1624,8 @@ xfs_attr_node_removename(xfs_da_args_t *args)
 			 * and started a new one.  We need the inode to be
 			 * in all transactions.
 			 */
-			if (committed) {
-				xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
-				xfs_trans_ihold(args->trans, dp);
-			}
+			if (committed)
+				xfs_trans_ijoin(args->trans, dp);
 		} else
 			xfs_da_brelse(args->trans, bp);
 	}
@@ -2004,7 +1968,7 @@ xfs_attr_rmtval_get(xfs_da_args_t *args)
 		error = xfs_bmapi(args->trans, args->dp, (xfs_fileoff_t)lblkno,
 				  args->rmtblkcnt,
 				  XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
-				  NULL, 0, map, &nmap, NULL, NULL);
+				  NULL, 0, map, &nmap, NULL);
 		if (error)
 			return(error);
 		ASSERT(nmap >= 1);
@@ -2022,7 +1986,7 @@ xfs_attr_rmtval_get(xfs_da_args_t *args)
 
 			tmp = (valuelen < XFS_BUF_SIZE(bp))
 				? valuelen : XFS_BUF_SIZE(bp);
-			xfs_biomove(bp, 0, tmp, dst, XBF_READ);
+			xfs_buf_iomove(bp, 0, tmp, dst, XBRW_READ);
 			xfs_buf_relse(bp);
 			dst += tmp;
 			valuelen -= tmp;
@@ -2083,7 +2047,7 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
 				  XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA |
 							XFS_BMAPI_WRITE,
 				  args->firstblock, args->total, &map, &nmap,
-				  args->flist, NULL);
+				  args->flist);
 		if (!error) {
 			error = xfs_bmap_finish(&args->trans, args->flist,
 						&committed);
@@ -2099,10 +2063,8 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
 		 * bmap_finish() may have committed the last trans and started
 		 * a new one.  We need the inode to be in all transactions.
 		 */
-		if (committed) {
-			xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
-			xfs_trans_ihold(args->trans, dp);
-		}
+		if (committed)
+			xfs_trans_ijoin(args->trans, dp);
 
 		ASSERT(nmap == 1);
 		ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
@@ -2136,7 +2098,7 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
 				  args->rmtblkcnt,
 				  XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
 				  args->firstblock, 0, &map, &nmap,
-				  NULL, NULL);
+				  NULL);
 		if (error) {
 			return(error);
 		}
@@ -2154,9 +2116,9 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
 
 		tmp = (valuelen < XFS_BUF_SIZE(bp)) ? valuelen :
 							XFS_BUF_SIZE(bp);
-		xfs_biomove(bp, 0, tmp, src, XBF_WRITE);
+		xfs_buf_iomove(bp, 0, tmp, src, XBRW_WRITE);
 		if (tmp < XFS_BUF_SIZE(bp))
-			xfs_biozero(bp, tmp, XFS_BUF_SIZE(bp) - tmp);
+			xfs_buf_zero(bp, tmp, XFS_BUF_SIZE(bp) - tmp);
 		if ((error = xfs_bwrite(mp, bp))) {/* GROT: NOTE: synchronous write */
 			return (error);
 		}
@@ -2201,7 +2163,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
 					args->rmtblkcnt,
 					XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
 					args->firstblock, 0, &map, &nmap,
-					args->flist, NULL);
+					args->flist);
 		if (error) {
 			return(error);
 		}
@@ -2239,7 +2201,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
 		error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt,
 				    XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
 				    1, args->firstblock, args->flist,
-				    NULL, &done);
+				    &done);
 		if (!error) {
 			error = xfs_bmap_finish(&args->trans, args->flist,
 						&committed);
@@ -2255,10 +2217,8 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
 		 * bmap_finish() may have committed the last trans and started
 		 * a new one.  We need the inode to be in all transactions.
 		 */
-		if (committed) {
-			xfs_trans_ijoin(args->trans, args->dp, XFS_ILOCK_EXCL);
-			xfs_trans_ihold(args->trans, args->dp);
-		}
+		if (committed)
+			xfs_trans_ijoin(args->trans, args->dp);
 
 		/*
 		 * Close out trans and start the next one in the chain.
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index a90ce74fc25..a6cff8edcdb 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -24,8 +24,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
@@ -33,7 +31,6 @@
 #include "xfs_ialloc_btree.h"
 #include "xfs_alloc.h"
 #include "xfs_btree.h"
-#include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
@@ -2931,7 +2928,7 @@ xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp,
 		nmap = 1;
 		error = xfs_bmapi(*trans, dp, (xfs_fileoff_t)tblkno, tblkcnt,
 					XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
-					NULL, 0, &map, &nmap, NULL, NULL);
+					NULL, 0, &map, &nmap, NULL);
 		if (error) {
 			return(error);
 		}
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 99587ded043..8abd12e32e1 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -30,13 +30,10 @@
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
 #include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_btree.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
-#include "xfs_ialloc.h"
 #include "xfs_itable.h"
 #include "xfs_dir2_data.h"
 #include "xfs_dir2_leaf.h"
@@ -104,7 +101,6 @@ xfs_bmap_add_extent(
 	xfs_fsblock_t		*first,	/* pointer to firstblock variable */
 	xfs_bmap_free_t		*flist,	/* list of extents to be freed */
 	int			*logflagsp, /* inode logging flags */
-	xfs_extdelta_t		*delta, /* Change made to incore extents */
 	int			whichfork, /* data or attr fork */
 	int			rsvd);	/* OK to allocate reserved blocks */
 
@@ -122,7 +118,6 @@ xfs_bmap_add_extent_delay_real(
 	xfs_fsblock_t		*first,	/* pointer to firstblock variable */
 	xfs_bmap_free_t		*flist,	/* list of extents to be freed */
 	int			*logflagsp, /* inode logging flags */
-	xfs_extdelta_t		*delta, /* Change made to incore extents */
 	int			rsvd);	/* OK to allocate reserved blocks */
 
 /*
@@ -135,7 +130,6 @@ xfs_bmap_add_extent_hole_delay(
 	xfs_extnum_t		idx,	/* extent number to update/insert */
 	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
 	int			*logflagsp,/* inode logging flags */
-	xfs_extdelta_t		*delta, /* Change made to incore extents */
 	int			rsvd);	/* OK to allocate reserved blocks */
 
 /*
@@ -149,7 +143,6 @@ xfs_bmap_add_extent_hole_real(
 	xfs_btree_cur_t		*cur,	/* if null, not a btree */
 	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
 	int			*logflagsp, /* inode logging flags */
-	xfs_extdelta_t		*delta, /* Change made to incore extents */
 	int			whichfork); /* data or attr fork */
 
 /*
@@ -162,8 +155,7 @@ xfs_bmap_add_extent_unwritten_real(
 	xfs_extnum_t		idx,	/* extent number to update/insert */
 	xfs_btree_cur_t		**curp,	/* if *curp is null, not a btree */
 	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
-	int			*logflagsp, /* inode logging flags */
-	xfs_extdelta_t		*delta); /* Change made to incore extents */
+	int			*logflagsp); /* inode logging flags */
 
 /*
  * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file.
@@ -200,7 +192,6 @@ xfs_bmap_del_extent(
 	xfs_btree_cur_t		*cur,	/* if null, not a btree */
 	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
 	int			*logflagsp,/* inode logging flags */
-	xfs_extdelta_t		*delta, /* Change made to incore extents */
 	int			whichfork, /* data or attr fork */
 	int			rsvd);	 /* OK to allocate reserved blocks */
 
@@ -489,7 +480,6 @@ xfs_bmap_add_extent(
 	xfs_fsblock_t		*first,	/* pointer to firstblock variable */
 	xfs_bmap_free_t		*flist,	/* list of extents to be freed */
 	int			*logflagsp, /* inode logging flags */
-	xfs_extdelta_t		*delta, /* Change made to incore extents */
 	int			whichfork, /* data or attr fork */
 	int			rsvd)	/* OK to use reserved data blocks */
 {
@@ -524,15 +514,6 @@ xfs_bmap_add_extent(
 			logflags = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
 		} else
 			logflags = 0;
-		/* DELTA: single new extent */
-		if (delta) {
-			if (delta->xed_startoff > new->br_startoff)
-				delta->xed_startoff = new->br_startoff;
-			if (delta->xed_blockcount <
-					new->br_startoff + new->br_blockcount)
-				delta->xed_blockcount = new->br_startoff +
-						new->br_blockcount;
-		}
 	}
 	/*
 	 * Any kind of new delayed allocation goes here.
@@ -542,7 +523,7 @@ xfs_bmap_add_extent(
 			ASSERT((cur->bc_private.b.flags &
 				XFS_BTCUR_BPRV_WASDEL) == 0);
 		if ((error = xfs_bmap_add_extent_hole_delay(ip, idx, new,
-				&logflags, delta, rsvd)))
+				&logflags, rsvd)))
 			goto done;
 	}
 	/*
@@ -553,7 +534,7 @@ xfs_bmap_add_extent(
 			ASSERT((cur->bc_private.b.flags &
 				XFS_BTCUR_BPRV_WASDEL) == 0);
 		if ((error = xfs_bmap_add_extent_hole_real(ip, idx, cur, new,
-				&logflags, delta, whichfork)))
+				&logflags, whichfork)))
 			goto done;
 	} else {
 		xfs_bmbt_irec_t	prev;	/* old extent at offset idx */
@@ -578,17 +559,17 @@ xfs_bmap_add_extent(
 						XFS_BTCUR_BPRV_WASDEL);
 				if ((error = xfs_bmap_add_extent_delay_real(ip,
 					idx, &cur, new, &da_new, first, flist,
-					&logflags, delta, rsvd)))
+					&logflags, rsvd)))
 					goto done;
 			} else if (new->br_state == XFS_EXT_NORM) {
 				ASSERT(new->br_state == XFS_EXT_NORM);
 				if ((error = xfs_bmap_add_extent_unwritten_real(
-					ip, idx, &cur, new, &logflags, delta)))
+					ip, idx, &cur, new, &logflags)))
 					goto done;
 			} else {
 				ASSERT(new->br_state == XFS_EXT_UNWRITTEN);
 				if ((error = xfs_bmap_add_extent_unwritten_real(
-					ip, idx, &cur, new, &logflags, delta)))
+					ip, idx, &cur, new, &logflags)))
 					goto done;
 			}
 			ASSERT(*curp == cur || *curp == NULL);
@@ -601,7 +582,7 @@ xfs_bmap_add_extent(
 				ASSERT((cur->bc_private.b.flags &
 					XFS_BTCUR_BPRV_WASDEL) == 0);
 			if ((error = xfs_bmap_add_extent_hole_real(ip, idx, cur,
-					new, &logflags, delta, whichfork)))
+					new, &logflags, whichfork)))
 				goto done;
 		}
 	}
@@ -633,7 +614,7 @@ xfs_bmap_add_extent(
 			nblks += cur->bc_private.b.allocated;
 		ASSERT(nblks <= da_old);
 		if (nblks < da_old)
-			xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS,
+			xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
 				(int64_t)(da_old - nblks), rsvd);
 	}
 	/*
@@ -666,7 +647,6 @@ xfs_bmap_add_extent_delay_real(
 	xfs_fsblock_t		*first,	/* pointer to firstblock variable */
 	xfs_bmap_free_t		*flist,	/* list of extents to be freed */
 	int			*logflagsp, /* inode logging flags */
-	xfs_extdelta_t		*delta, /* Change made to incore extents */
 	int			rsvd)	/* OK to use reserved data block allocation */
 {
 	xfs_btree_cur_t		*cur;	/* btree cursor */
@@ -797,11 +777,6 @@ xfs_bmap_add_extent_delay_real(
 				goto done;
 		}
 		*dnew = 0;
-		/* DELTA: Three in-core extents are replaced by one. */
-		temp = LEFT.br_startoff;
-		temp2 = LEFT.br_blockcount +
-			PREV.br_blockcount +
-			RIGHT.br_blockcount;
 		break;
 
 	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
@@ -832,10 +807,6 @@ xfs_bmap_add_extent_delay_real(
 				goto done;
 		}
 		*dnew = 0;
-		/* DELTA: Two in-core extents are replaced by one. */
-		temp = LEFT.br_startoff;
-		temp2 = LEFT.br_blockcount +
-			PREV.br_blockcount;
 		break;
 
 	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
@@ -867,10 +838,6 @@ xfs_bmap_add_extent_delay_real(
 				goto done;
 		}
 		*dnew = 0;
-		/* DELTA: Two in-core extents are replaced by one. */
-		temp = PREV.br_startoff;
-		temp2 = PREV.br_blockcount +
-			RIGHT.br_blockcount;
 		break;
 
 	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING:
@@ -900,9 +867,6 @@ xfs_bmap_add_extent_delay_real(
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 		}
 		*dnew = 0;
-		/* DELTA: The in-core extent described by new changed type. */
-		temp = new->br_startoff;
-		temp2 = new->br_blockcount;
 		break;
 
 	case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG:
@@ -942,10 +906,6 @@ xfs_bmap_add_extent_delay_real(
 		xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
 		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
 		*dnew = temp;
-		/* DELTA: The boundary between two in-core extents moved. */
-		temp = LEFT.br_startoff;
-		temp2 = LEFT.br_blockcount +
-			PREV.br_blockcount;
 		break;
 
 	case BMAP_LEFT_FILLING:
@@ -990,9 +950,6 @@ xfs_bmap_add_extent_delay_real(
 		xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
 		trace_xfs_bmap_post_update(ip, idx + 1, state, _THIS_IP_);
 		*dnew = temp;
-		/* DELTA: One in-core extent is split in two. */
-		temp = PREV.br_startoff;
-		temp2 = PREV.br_blockcount;
 		break;
 
 	case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
@@ -1031,10 +988,6 @@ xfs_bmap_add_extent_delay_real(
 		xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
 		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
 		*dnew = temp;
-		/* DELTA: The boundary between two in-core extents moved. */
-		temp = PREV.br_startoff;
-		temp2 = PREV.br_blockcount +
-			RIGHT.br_blockcount;
 		break;
 
 	case BMAP_RIGHT_FILLING:
@@ -1078,9 +1031,6 @@ xfs_bmap_add_extent_delay_real(
 		xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
 		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
 		*dnew = temp;
-		/* DELTA: One in-core extent is split in two. */
-		temp = PREV.br_startoff;
-		temp2 = PREV.br_blockcount;
 		break;
 
 	case 0:
@@ -1129,7 +1079,8 @@ xfs_bmap_add_extent_delay_real(
 		diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) -
 			(cur ? cur->bc_private.b.allocated : 0));
 		if (diff > 0 &&
-		    xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS, -((int64_t)diff), rsvd)) {
+		    xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
+					     -((int64_t)diff), rsvd)) {
 			/*
 			 * Ick gross gag me with a spoon.
 			 */
@@ -1139,16 +1090,18 @@ xfs_bmap_add_extent_delay_real(
 					temp--;
 					diff--;
 					if (!diff ||
-					    !xfs_mod_incore_sb(ip->i_mount,
-						    XFS_SBS_FDBLOCKS, -((int64_t)diff), rsvd))
+					    !xfs_icsb_modify_counters(ip->i_mount,
+						    XFS_SBS_FDBLOCKS,
+						    -((int64_t)diff), rsvd))
 						break;
 				}
 				if (temp2) {
 					temp2--;
 					diff--;
 					if (!diff ||
-					    !xfs_mod_incore_sb(ip->i_mount,
-						    XFS_SBS_FDBLOCKS, -((int64_t)diff), rsvd))
+					    !xfs_icsb_modify_counters(ip->i_mount,
+						    XFS_SBS_FDBLOCKS,
+						    -((int64_t)diff), rsvd))
 						break;
 				}
 			}
@@ -1161,9 +1114,6 @@ xfs_bmap_add_extent_delay_real(
 			nullstartblock((int)temp2));
 		trace_xfs_bmap_post_update(ip, idx + 2, state, _THIS_IP_);
 		*dnew = temp + temp2;
-		/* DELTA: One in-core extent is split in three. */
-		temp = PREV.br_startoff;
-		temp2 = PREV.br_blockcount;
 		break;
 
 	case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
@@ -1179,13 +1129,6 @@ xfs_bmap_add_extent_delay_real(
 		ASSERT(0);
 	}
 	*curp = cur;
-	if (delta) {
-		temp2 += temp;
-		if (delta->xed_startoff > temp)
-			delta->xed_startoff = temp;
-		if (delta->xed_blockcount < temp2)
-			delta->xed_blockcount = temp2;
-	}
 done:
 	*logflagsp = rval;
 	return error;
@@ -1204,8 +1147,7 @@ xfs_bmap_add_extent_unwritten_real(
 	xfs_extnum_t		idx,	/* extent number to update/insert */
 	xfs_btree_cur_t		**curp,	/* if *curp is null, not a btree */
 	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
-	int			*logflagsp, /* inode logging flags */
-	xfs_extdelta_t		*delta) /* Change made to incore extents */
+	int			*logflagsp) /* inode logging flags */
 {
 	xfs_btree_cur_t		*cur;	/* btree cursor */
 	xfs_bmbt_rec_host_t	*ep;	/* extent entry for idx */
@@ -1219,8 +1161,6 @@ xfs_bmap_add_extent_unwritten_real(
 					/* left is 0, right is 1, prev is 2 */
 	int			rval=0;	/* return value (logging flags) */
 	int			state = 0;/* state bits, accessed thru macros */
-	xfs_filblks_t		temp=0;
-	xfs_filblks_t		temp2=0;
 
 #define	LEFT		r[0]
 #define	RIGHT		r[1]
@@ -1341,11 +1281,6 @@ xfs_bmap_add_extent_unwritten_real(
 				RIGHT.br_blockcount, LEFT.br_state)))
 				goto done;
 		}
-		/* DELTA: Three in-core extents are replaced by one. */
-		temp = LEFT.br_startoff;
-		temp2 = LEFT.br_blockcount +
-			PREV.br_blockcount +
-			RIGHT.br_blockcount;
 		break;
 
 	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
@@ -1382,10 +1317,6 @@ xfs_bmap_add_extent_unwritten_real(
 				LEFT.br_state)))
 				goto done;
 		}
-		/* DELTA: Two in-core extents are replaced by one. */
-		temp = LEFT.br_startoff;
-		temp2 = LEFT.br_blockcount +
-			PREV.br_blockcount;
 		break;
 
 	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
@@ -1422,10 +1353,6 @@ xfs_bmap_add_extent_unwritten_real(
 				newext)))
 				goto done;
 		}
-		/* DELTA: Two in-core extents are replaced by one. */
-		temp = PREV.br_startoff;
-		temp2 = PREV.br_blockcount +
-			RIGHT.br_blockcount;
 		break;
 
 	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING:
@@ -1453,9 +1380,6 @@ xfs_bmap_add_extent_unwritten_real(
 				newext)))
 				goto done;
 		}
-		/* DELTA: The in-core extent described by new changed type. */
-		temp = new->br_startoff;
-		temp2 = new->br_blockcount;
 		break;
 
 	case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG:
@@ -1501,10 +1425,6 @@ xfs_bmap_add_extent_unwritten_real(
 				LEFT.br_state))
 				goto done;
 		}
-		/* DELTA: The boundary between two in-core extents moved. */
-		temp = LEFT.br_startoff;
-		temp2 = LEFT.br_blockcount +
-			PREV.br_blockcount;
 		break;
 
 	case BMAP_LEFT_FILLING:
@@ -1544,9 +1464,6 @@ xfs_bmap_add_extent_unwritten_real(
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 		}
-		/* DELTA: One in-core extent is split in two. */
-		temp = PREV.br_startoff;
-		temp2 = PREV.br_blockcount;
 		break;
 
 	case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
@@ -1587,10 +1504,6 @@ xfs_bmap_add_extent_unwritten_real(
 				newext)))
 				goto done;
 		}
-		/* DELTA: The boundary between two in-core extents moved. */
-		temp = PREV.br_startoff;
-		temp2 = PREV.br_blockcount +
-			RIGHT.br_blockcount;
 		break;
 
 	case BMAP_RIGHT_FILLING:
@@ -1630,9 +1543,6 @@ xfs_bmap_add_extent_unwritten_real(
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 		}
-		/* DELTA: One in-core extent is split in two. */
-		temp = PREV.br_startoff;
-		temp2 = PREV.br_blockcount;
 		break;
 
 	case 0:
@@ -1692,9 +1602,6 @@ xfs_bmap_add_extent_unwritten_real(
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 		}
-		/* DELTA: One in-core extent is split in three. */
-		temp = PREV.br_startoff;
-		temp2 = PREV.br_blockcount;
 		break;
 
 	case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
@@ -1710,13 +1617,6 @@ xfs_bmap_add_extent_unwritten_real(
 		ASSERT(0);
 	}
 	*curp = cur;
-	if (delta) {
-		temp2 += temp;
-		if (delta->xed_startoff > temp)
-			delta->xed_startoff = temp;
-		if (delta->xed_blockcount < temp2)
-			delta->xed_blockcount = temp2;
-	}
 done:
 	*logflagsp = rval;
 	return error;
@@ -1736,7 +1636,6 @@ xfs_bmap_add_extent_hole_delay(
 	xfs_extnum_t		idx,	/* extent number to update/insert */
 	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
 	int			*logflagsp, /* inode logging flags */
-	xfs_extdelta_t		*delta, /* Change made to incore extents */
 	int			rsvd)		/* OK to allocate reserved blocks */
 {
 	xfs_bmbt_rec_host_t	*ep;	/* extent record for idx */
@@ -1747,7 +1646,6 @@ xfs_bmap_add_extent_hole_delay(
 	xfs_bmbt_irec_t		right;	/* right neighbor extent entry */
 	int			state;  /* state bits, accessed thru macros */
 	xfs_filblks_t		temp=0;	/* temp for indirect calculations */
-	xfs_filblks_t		temp2=0;
 
 	ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
 	ep = xfs_iext_get_ext(ifp, idx);
@@ -1819,9 +1717,6 @@ xfs_bmap_add_extent_hole_delay(
 
 		xfs_iext_remove(ip, idx, 1, state);
 		ip->i_df.if_lastex = idx - 1;
-		/* DELTA: Two in-core extents were replaced by one. */
-		temp2 = temp;
-		temp = left.br_startoff;
 		break;
 
 	case BMAP_LEFT_CONTIG:
@@ -1841,9 +1736,6 @@ xfs_bmap_add_extent_hole_delay(
 		trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
 
 		ip->i_df.if_lastex = idx - 1;
-		/* DELTA: One in-core extent grew into a hole. */
-		temp2 = temp;
-		temp = left.br_startoff;
 		break;
 
 	case BMAP_RIGHT_CONTIG:
@@ -1862,9 +1754,6 @@ xfs_bmap_add_extent_hole_delay(
 		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
 
 		ip->i_df.if_lastex = idx;
-		/* DELTA: One in-core extent grew into a hole. */
-		temp2 = temp;
-		temp = new->br_startoff;
 		break;
 
 	case 0:
@@ -1876,26 +1765,16 @@ xfs_bmap_add_extent_hole_delay(
 		oldlen = newlen = 0;
 		xfs_iext_insert(ip, idx, 1, new, state);
 		ip->i_df.if_lastex = idx;
-		/* DELTA: A new in-core extent was added in a hole. */
-		temp2 = new->br_blockcount;
-		temp = new->br_startoff;
 		break;
 	}
 	if (oldlen != newlen) {
 		ASSERT(oldlen > newlen);
-		xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS,
+		xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
 			(int64_t)(oldlen - newlen), rsvd);
 		/*
 		 * Nothing to do for disk quota accounting here.
 		 */
 	}
-	if (delta) {
-		temp2 += temp;
-		if (delta->xed_startoff > temp)
-			delta->xed_startoff = temp;
-		if (delta->xed_blockcount < temp2)
-			delta->xed_blockcount = temp2;
-	}
 	*logflagsp = 0;
 	return 0;
 }
@@ -1911,7 +1790,6 @@ xfs_bmap_add_extent_hole_real(
 	xfs_btree_cur_t		*cur,	/* if null, not a btree */
 	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
 	int			*logflagsp, /* inode logging flags */
-	xfs_extdelta_t		*delta, /* Change made to incore extents */
 	int			whichfork) /* data or attr fork */
 {
 	xfs_bmbt_rec_host_t	*ep;	/* pointer to extent entry ins. point */
@@ -1922,8 +1800,6 @@ xfs_bmap_add_extent_hole_real(
 	xfs_bmbt_irec_t		right;	/* right neighbor extent entry */
 	int			rval=0;	/* return value (logging flags) */
 	int			state;	/* state bits, accessed thru macros */
-	xfs_filblks_t		temp=0;
-	xfs_filblks_t		temp2=0;
 
 	ifp = XFS_IFORK_PTR(ip, whichfork);
 	ASSERT(idx <= ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t));
@@ -2020,11 +1896,6 @@ xfs_bmap_add_extent_hole_real(
 					left.br_state)))
 				goto done;
 		}
-		/* DELTA: Two in-core extents were replaced by one. */
-		temp = left.br_startoff;
-		temp2 = left.br_blockcount +
-			new->br_blockcount +
-			right.br_blockcount;
 		break;
 
 	case BMAP_LEFT_CONTIG:
@@ -2056,10 +1927,6 @@ xfs_bmap_add_extent_hole_real(
 					left.br_state)))
 				goto done;
 		}
-		/* DELTA: One in-core extent grew. */
-		temp = left.br_startoff;
-		temp2 = left.br_blockcount +
-			new->br_blockcount;
 		break;
 
 	case BMAP_RIGHT_CONTIG:
@@ -2092,10 +1959,6 @@ xfs_bmap_add_extent_hole_real(
 					right.br_state)))
 				goto done;
 		}
-		/* DELTA: One in-core extent grew. */
-		temp = new->br_startoff;
-		temp2 = new->br_blockcount +
-			right.br_blockcount;
 		break;
 
 	case 0:
@@ -2123,18 +1986,8 @@ xfs_bmap_add_extent_hole_real(
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 		}
-		/* DELTA: A new extent was added in a hole. */
-		temp = new->br_startoff;
-		temp2 = new->br_blockcount;
 		break;
 	}
-	if (delta) {
-		temp2 += temp;
-		if (delta->xed_startoff > temp)
-			delta->xed_startoff = temp;
-		if (delta->xed_blockcount < temp2)
-			delta->xed_blockcount = temp2;
-	}
 done:
 	*logflagsp = rval;
 	return error;
@@ -2959,7 +2812,6 @@ xfs_bmap_del_extent(
 	xfs_btree_cur_t		*cur,	/* if null, not a btree */
 	xfs_bmbt_irec_t		*del,	/* data to remove from extents */
 	int			*logflagsp, /* inode logging flags */
-	xfs_extdelta_t		*delta, /* Change made to incore extents */
 	int			whichfork, /* data or attr fork */
 	int			rsvd)	/* OK to allocate reserved blocks */
 {
@@ -3262,16 +3114,9 @@ xfs_bmap_del_extent(
 	 * Nothing to do for disk quota accounting here.
 	 */
 	ASSERT(da_old >= da_new);
-	if (da_old > da_new)
-		xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, (int64_t)(da_old - da_new),
-			rsvd);
-	if (delta) {
-		/* DELTA: report the original extent. */
-		if (delta->xed_startoff > got.br_startoff)
-			delta->xed_startoff = got.br_startoff;
-		if (delta->xed_blockcount < got.br_startoff+got.br_blockcount)
-			delta->xed_blockcount = got.br_startoff +
-							got.br_blockcount;
+	if (da_old > da_new) {
+		xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
+			(int64_t)(da_old - da_new), rsvd);
 	}
 done:
 	*logflagsp = flags;
@@ -3754,9 +3599,10 @@ xfs_bmap_add_attrfork(
 		ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
 	}
 	ASSERT(ip->i_d.di_anextents == 0);
-	IHOLD(ip);
-	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+
+	xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
 	switch (ip->i_d.di_format) {
 	case XFS_DINODE_FMT_DEV:
 		ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3;
@@ -4483,8 +4329,7 @@ xfs_bmapi(
 	xfs_extlen_t	total,		/* total blocks needed */
 	xfs_bmbt_irec_t	*mval,		/* output: map values */
 	int		*nmap,		/* i/o: mval size/count */
-	xfs_bmap_free_t	*flist,		/* i/o: list extents to free */
-	xfs_extdelta_t	*delta)		/* o: change made to incore extents */
+	xfs_bmap_free_t	*flist)		/* i/o: list extents to free */
 {
 	xfs_fsblock_t	abno;		/* allocated block number */
 	xfs_extlen_t	alen;		/* allocated extent length */
@@ -4596,10 +4441,7 @@ xfs_bmapi(
 	end = bno + len;
 	obno = bno;
 	bma.ip = NULL;
-	if (delta) {
-		delta->xed_startoff = NULLFILEOFF;
-		delta->xed_blockcount = 0;
-	}
+
 	while (bno < end && n < *nmap) {
 		/*
 		 * Reading past eof, act as though there's a hole
@@ -4620,19 +4462,13 @@ xfs_bmapi(
 			 * allocate the stuff asked for in this bmap call
 			 * but that wouldn't be as good.
 			 */
-			if (wasdelay && !(flags & XFS_BMAPI_EXACT)) {
+			if (wasdelay) {
 				alen = (xfs_extlen_t)got.br_blockcount;
 				aoff = got.br_startoff;
 				if (lastx != NULLEXTNUM && lastx) {
 					ep = xfs_iext_get_ext(ifp, lastx - 1);
 					xfs_bmbt_get_all(ep, &prev);
 				}
-			} else if (wasdelay) {
-				alen = (xfs_extlen_t)
-					XFS_FILBLKS_MIN(len,
-						(got.br_startoff +
-						 got.br_blockcount) - bno);
-				aoff = bno;
 			} else {
 				alen = (xfs_extlen_t)
 					XFS_FILBLKS_MIN(len, MAXEXTLEN);
@@ -4694,13 +4530,13 @@ xfs_bmapi(
 							-((int64_t)extsz), (flags &
 							XFS_BMAPI_RSVBLOCKS));
 				} else {
-					error = xfs_mod_incore_sb(mp,
+					error = xfs_icsb_modify_counters(mp,
 							XFS_SBS_FDBLOCKS,
 							-((int64_t)alen), (flags &
 							XFS_BMAPI_RSVBLOCKS));
 				}
 				if (!error) {
-					error = xfs_mod_incore_sb(mp,
+					error = xfs_icsb_modify_counters(mp,
 							XFS_SBS_FDBLOCKS,
 							-((int64_t)indlen), (flags &
 							XFS_BMAPI_RSVBLOCKS));
@@ -4710,7 +4546,7 @@ xfs_bmapi(
 							(int64_t)extsz, (flags &
 							XFS_BMAPI_RSVBLOCKS));
 					else if (error)
-						xfs_mod_incore_sb(mp,
+						xfs_icsb_modify_counters(mp,
 							XFS_SBS_FDBLOCKS,
 							(int64_t)alen, (flags &
 							XFS_BMAPI_RSVBLOCKS));
@@ -4831,7 +4667,7 @@ xfs_bmapi(
 					got.br_state = XFS_EXT_UNWRITTEN;
 			}
 			error = xfs_bmap_add_extent(ip, lastx, &cur, &got,
-				firstblock, flist, &tmp_logflags, delta,
+				firstblock, flist, &tmp_logflags,
 				whichfork, (flags & XFS_BMAPI_RSVBLOCKS));
 			logflags |= tmp_logflags;
 			if (error)
@@ -4912,8 +4748,12 @@ xfs_bmapi(
 		 * Check if writing previously allocated but
 		 * unwritten extents.
 		 */
-		if (wr && mval->br_state == XFS_EXT_UNWRITTEN &&
-		    ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_DELAY)) == 0)) {
+		if (wr &&
+		    ((mval->br_state == XFS_EXT_UNWRITTEN &&
+		      ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_DELAY)) == 0)) ||
+		     (mval->br_state == XFS_EXT_NORM &&
+		      ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_CONVERT)) ==
+				(XFS_BMAPI_PREALLOC|XFS_BMAPI_CONVERT))))) {
 			/*
 			 * Modify (by adding) the state flag, if writing.
 			 */
@@ -4925,9 +4765,11 @@ xfs_bmapi(
 					*firstblock;
 				cur->bc_private.b.flist = flist;
 			}
-			mval->br_state = XFS_EXT_NORM;
+			mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN)
+						? XFS_EXT_NORM
+						: XFS_EXT_UNWRITTEN;
 			error = xfs_bmap_add_extent(ip, lastx, &cur, mval,
-				firstblock, flist, &tmp_logflags, delta,
+				firstblock, flist, &tmp_logflags,
 				whichfork, (flags & XFS_BMAPI_RSVBLOCKS));
 			logflags |= tmp_logflags;
 			if (error)
@@ -5017,14 +4859,6 @@ xfs_bmapi(
 	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE ||
 	       XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max);
 	error = 0;
-	if (delta && delta->xed_startoff != NULLFILEOFF) {
-		/* A change was actually made.
-		 * Note that delta->xed_blockount is an offset at this
-		 * point and needs to be converted to a block count.
-		 */
-		ASSERT(delta->xed_blockcount > delta->xed_startoff);
-		delta->xed_blockcount -= delta->xed_startoff;
-	}
 error0:
 	/*
 	 * Log everything.  Do this after conversion, there's no point in
@@ -5136,8 +4970,6 @@ xfs_bunmapi(
 	xfs_fsblock_t		*firstblock,	/* first allocated block
 						   controls a.g. for allocs */
 	xfs_bmap_free_t		*flist,		/* i/o: list extents to free */
-	xfs_extdelta_t		*delta,		/* o: change made to incore
-						   extents */
 	int			*done)		/* set if not done yet */
 {
 	xfs_btree_cur_t		*cur;		/* bmap btree cursor */
@@ -5196,10 +5028,7 @@ xfs_bunmapi(
 	bno = start + len - 1;
 	ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
 		&prev);
-	if (delta) {
-		delta->xed_startoff = NULLFILEOFF;
-		delta->xed_blockcount = 0;
-	}
+
 	/*
 	 * Check to see if the given block number is past the end of the
 	 * file, back up to the last block if so...
@@ -5297,7 +5126,7 @@ xfs_bunmapi(
 			}
 			del.br_state = XFS_EXT_UNWRITTEN;
 			error = xfs_bmap_add_extent(ip, lastx, &cur, &del,
-				firstblock, flist, &logflags, delta,
+				firstblock, flist, &logflags,
 				XFS_DATA_FORK, 0);
 			if (error)
 				goto error0;
@@ -5352,7 +5181,7 @@ xfs_bunmapi(
 				prev.br_state = XFS_EXT_UNWRITTEN;
 				error = xfs_bmap_add_extent(ip, lastx - 1, &cur,
 					&prev, firstblock, flist, &logflags,
-					delta, XFS_DATA_FORK, 0);
+					XFS_DATA_FORK, 0);
 				if (error)
 					goto error0;
 				goto nodelete;
@@ -5361,7 +5190,7 @@ xfs_bunmapi(
 				del.br_state = XFS_EXT_UNWRITTEN;
 				error = xfs_bmap_add_extent(ip, lastx, &cur,
 					&del, firstblock, flist, &logflags,
-					delta, XFS_DATA_FORK, 0);
+					XFS_DATA_FORK, 0);
 				if (error)
 					goto error0;
 				goto nodelete;
@@ -5381,7 +5210,7 @@ xfs_bunmapi(
 					ip, -((long)del.br_blockcount), 0,
 					XFS_QMOPT_RES_RTBLKS);
 			} else {
-				xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS,
+				xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
 						(int64_t)del.br_blockcount, rsvd);
 				(void)xfs_trans_reserve_quota_nblks(NULL,
 					ip, -((long)del.br_blockcount), 0,
@@ -5414,7 +5243,7 @@ xfs_bunmapi(
 			goto error0;
 		}
 		error = xfs_bmap_del_extent(ip, tp, lastx, flist, cur, &del,
-				&tmp_logflags, delta, whichfork, rsvd);
+				&tmp_logflags, whichfork, rsvd);
 		logflags |= tmp_logflags;
 		if (error)
 			goto error0;
@@ -5471,14 +5300,6 @@ nodelete:
 	ASSERT(ifp->if_ext_max ==
 	       XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
 	error = 0;
-	if (delta && delta->xed_startoff != NULLFILEOFF) {
-		/* A change was actually made.
-		 * Note that delta->xed_blockount is an offset at this
-		 * point and needs to be converted to a block count.
-		 */
-		ASSERT(delta->xed_blockcount > delta->xed_startoff);
-		delta->xed_blockcount -= delta->xed_startoff;
-	}
 error0:
 	/*
 	 * Log everything.  Do this after conversion, there's no point in
@@ -5605,28 +5426,6 @@ xfs_getbmap(
 		prealloced = 0;
 		fixlen = 1LL << 32;
 	} else {
-		/*
-		 * If the BMV_IF_NO_DMAPI_READ interface bit specified, do
-		 * not generate a DMAPI read event.  Otherwise, if the
-		 * DM_EVENT_READ bit is set for the file, generate a read
-		 * event in order that the DMAPI application may do its thing
-		 * before we return the extents.  Usually this means restoring
-		 * user file data to regions of the file that look like holes.
-		 *
-		 * The "old behavior" (from XFS_IOC_GETBMAP) is to not specify
-		 * BMV_IF_NO_DMAPI_READ so that read events are generated.
-		 * If this were not true, callers of ioctl(XFS_IOC_GETBMAP)
-		 * could misinterpret holes in a DMAPI file as true holes,
-		 * when in fact they may represent offline user data.
-		 */
-		if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) &&
-		    !(iflags & BMV_IF_NO_DMAPI_READ)) {
-			error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip,
-					      0, 0, 0, NULL);
-			if (error)
-				return XFS_ERROR(error);
-		}
-
 		if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS &&
 		    ip->i_d.di_format != XFS_DINODE_FMT_BTREE &&
 		    ip->i_d.di_format != XFS_DINODE_FMT_LOCAL)
@@ -5713,7 +5512,7 @@ xfs_getbmap(
 		error = xfs_bmapi(NULL, ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset),
 				  XFS_BB_TO_FSB(mp, bmv->bmv_length),
 				  bmapi_flags, NULL, 0, map, &nmap,
-				  NULL, NULL);
+				  NULL);
 		if (error)
 			goto out_free_map;
 		ASSERT(nmap <= subnex);
@@ -5744,12 +5543,24 @@ xfs_getbmap(
 					map[i].br_startblock))
 				goto out_free_map;
 
-			nexleft--;
 			bmv->bmv_offset =
 				out[cur_ext].bmv_offset +
 				out[cur_ext].bmv_length;
 			bmv->bmv_length =
 				max_t(__int64_t, 0, bmvend - bmv->bmv_offset);
+
+			/*
+			 * In case we don't want to return the hole,
+			 * don't increase cur_ext so that we can reuse
+			 * it in the next loop.
+			 */
+			if ((iflags & BMV_IF_NO_HOLES) &&
+			    map[i].br_startblock == HOLESTARTBLOCK) {
+				memset(&out[cur_ext], 0, sizeof(out[cur_ext]));
+				continue;
+			}
+
+			nexleft--;
 			bmv->bmv_entries++;
 			cur_ext++;
 		}
@@ -5859,66 +5670,34 @@ xfs_bmap_eof(
 }
 
 #ifdef DEBUG
-STATIC
-xfs_buf_t *
+STATIC struct xfs_buf *
 xfs_bmap_get_bp(
-	xfs_btree_cur_t         *cur,
+	struct xfs_btree_cur	*cur,
 	xfs_fsblock_t		bno)
 {
-	int i;
-	xfs_buf_t *bp;
+	struct xfs_log_item_desc *lidp;
+	int			i;
 
 	if (!cur)
-		return(NULL);
-
-	bp = NULL;
-	for(i = 0; i < XFS_BTREE_MAXLEVELS; i++) {
-		bp = cur->bc_bufs[i];
-		if (!bp) break;
-		if (XFS_BUF_ADDR(bp) == bno)
-			break;	/* Found it */
-	}
-	if (i == XFS_BTREE_MAXLEVELS)
-		bp = NULL;
-
-	if (!bp) { /* Chase down all the log items to see if the bp is there */
-		xfs_log_item_chunk_t    *licp;
-		xfs_trans_t		*tp;
-
-		tp = cur->bc_tp;
-		licp = &tp->t_items;
-		while (!bp && licp != NULL) {
-			if (xfs_lic_are_all_free(licp)) {
-				licp = licp->lic_next;
-				continue;
-			}
-			for (i = 0; i < licp->lic_unused; i++) {
-				xfs_log_item_desc_t	*lidp;
-				xfs_log_item_t		*lip;
-				xfs_buf_log_item_t	*bip;
-				xfs_buf_t		*lbp;
-
-				if (xfs_lic_isfree(licp, i)) {
-					continue;
-				}
-
-				lidp = xfs_lic_slot(licp, i);
-				lip = lidp->lid_item;
-				if (lip->li_type != XFS_LI_BUF)
-					continue;
+		return NULL;
 
-				bip = (xfs_buf_log_item_t *)lip;
-				lbp = bip->bli_buf;
+	for (i = 0; i < XFS_BTREE_MAXLEVELS; i++) {
+		if (!cur->bc_bufs[i])
+			break;
+		if (XFS_BUF_ADDR(cur->bc_bufs[i]) == bno)
+			return cur->bc_bufs[i];
+	}
 
-				if (XFS_BUF_ADDR(lbp) == bno) {
-					bp = lbp;
-					break; /* Found it */
-				}
-			}
-			licp = licp->lic_next;
-		}
+	/* Chase down all the log items to see if the bp is there */
+	list_for_each_entry(lidp, &cur->bc_tp->t_items, lid_trans) {
+		struct xfs_buf_log_item	*bip;
+		bip = (struct xfs_buf_log_item *)lidp->lid_item;
+		if (bip->bli_item.li_type == XFS_LI_BUF &&
+		    XFS_BUF_ADDR(bip->bli_buf) == bno)
+			return bip->bli_buf;
 	}
-	return(bp);
+
+	return NULL;
 }
 
 STATIC void
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 419dafb9d87..71ec9b6ecdf 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -28,20 +28,6 @@ struct xfs_trans;
 extern kmem_zone_t	*xfs_bmap_free_item_zone;
 
 /*
- * DELTA: describe a change to the in-core extent list.
- *
- * Internally the use of xed_blockount is somewhat funky.
- * xed_blockcount contains an offset much of the time because this
- * makes merging changes easier.  (xfs_fileoff_t and xfs_filblks_t are
- * the same underlying type).
- */
-typedef struct xfs_extdelta
-{
-	xfs_fileoff_t		xed_startoff;	/* offset of range */
-	xfs_filblks_t		xed_blockcount;	/* blocks in range */
-} xfs_extdelta_t;
-
-/*
  * List of extents to be free "later".
  * The list is kept sorted on xbf_startblock.
  */
@@ -82,27 +68,25 @@ typedef	struct xfs_bmap_free
 #define XFS_BMAPI_DELAY		0x002	/* delayed write operation */
 #define XFS_BMAPI_ENTIRE	0x004	/* return entire extent, not trimmed */
 #define XFS_BMAPI_METADATA	0x008	/* mapping metadata not user data */
-#define XFS_BMAPI_EXACT		0x010	/* allocate only to spec'd bounds */
-#define XFS_BMAPI_ATTRFORK	0x020	/* use attribute fork not data */
-#define XFS_BMAPI_ASYNC		0x040	/* bunmapi xactions can be async */
-#define XFS_BMAPI_RSVBLOCKS	0x080	/* OK to alloc. reserved data blocks */
-#define	XFS_BMAPI_PREALLOC	0x100	/* preallocation op: unwritten space */
-#define	XFS_BMAPI_IGSTATE	0x200	/* Ignore state - */
+#define XFS_BMAPI_ATTRFORK	0x010	/* use attribute fork not data */
+#define XFS_BMAPI_RSVBLOCKS	0x020	/* OK to alloc. reserved data blocks */
+#define	XFS_BMAPI_PREALLOC	0x040	/* preallocation op: unwritten space */
+#define	XFS_BMAPI_IGSTATE	0x080	/* Ignore state - */
 					/* combine contig. space */
-#define	XFS_BMAPI_CONTIG	0x400	/* must allocate only one extent */
-/*	XFS_BMAPI_DIRECT_IO	0x800	*/
-#define XFS_BMAPI_CONVERT	0x1000	/* unwritten extent conversion - */
-					/* need write cache flushing and no */
-					/* additional allocation alignments */
+#define	XFS_BMAPI_CONTIG	0x100	/* must allocate only one extent */
+/*
+ * unwritten extent conversion - this needs write cache flushing and no additional
+ * allocation alignments. When specified with XFS_BMAPI_PREALLOC it converts
+ * from written to unwritten, otherwise convert from unwritten to written.
+ */
+#define XFS_BMAPI_CONVERT	0x200
 
 #define XFS_BMAPI_FLAGS \
 	{ XFS_BMAPI_WRITE,	"WRITE" }, \
 	{ XFS_BMAPI_DELAY,	"DELAY" }, \
 	{ XFS_BMAPI_ENTIRE,	"ENTIRE" }, \
 	{ XFS_BMAPI_METADATA,	"METADATA" }, \
-	{ XFS_BMAPI_EXACT,	"EXACT" }, \
 	{ XFS_BMAPI_ATTRFORK,	"ATTRFORK" }, \
-	{ XFS_BMAPI_ASYNC,	"ASYNC" }, \
 	{ XFS_BMAPI_RSVBLOCKS,	"RSVBLOCKS" }, \
 	{ XFS_BMAPI_PREALLOC,	"PREALLOC" }, \
 	{ XFS_BMAPI_IGSTATE,	"IGSTATE" }, \
@@ -310,9 +294,7 @@ xfs_bmapi(
 	xfs_extlen_t		total,		/* total blocks needed */
 	struct xfs_bmbt_irec	*mval,		/* output: map values */
 	int			*nmap,		/* i/o: mval size/count */
-	xfs_bmap_free_t		*flist,		/* i/o: list extents to free */
-	xfs_extdelta_t		*delta);	/* o: change made to incore
-						   extents */
+	xfs_bmap_free_t		*flist);	/* i/o: list extents to free */
 
 /*
  * Map file blocks to filesystem blocks, simple version.
@@ -346,8 +328,6 @@ xfs_bunmapi(
 	xfs_fsblock_t		*firstblock,	/* first allocated block
 						   controls a.g. for allocs */
 	xfs_bmap_free_t		*flist,		/* i/o: list extents to free */
-	xfs_extdelta_t		*delta,		/* o: change made to incore
-						   extents */
 	int			*done);		/* set if not done yet */
 
 /*
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 416e47e54b8..87d3c10b695 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -24,21 +24,16 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
 #include "xfs_alloc.h"
 #include "xfs_btree.h"
 #include "xfs_btree_trace.h"
-#include "xfs_ialloc.h"
 #include "xfs_itable.h"
 #include "xfs_bmap.h"
 #include "xfs_error.h"
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 96be4b0f249..04f9cca8da7 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -24,20 +24,15 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
 #include "xfs_btree.h"
 #include "xfs_btree_trace.h"
-#include "xfs_ialloc.h"
 #include "xfs_error.h"
 #include "xfs_trace.h"
 
@@ -222,7 +217,7 @@ xfs_btree_del_cursor(
 	 */
 	for (i = 0; i < cur->bc_nlevels; i++) {
 		if (cur->bc_bufs[i])
-			xfs_btree_setbuf(cur, i, NULL);
+			xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]);
 		else if (!error)
 			break;
 	}
@@ -661,7 +656,7 @@ xfs_btree_reada_bufl(
 
 	ASSERT(fsbno != NULLFSBLOCK);
 	d = XFS_FSB_TO_DADDR(mp, fsbno);
-	xfs_baread(mp->m_ddev_targp, d, mp->m_bsize * count);
+	xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count);
 }
 
 /*
@@ -681,7 +676,7 @@ xfs_btree_reada_bufs(
 	ASSERT(agno != NULLAGNUMBER);
 	ASSERT(agbno != NULLAGBLOCK);
 	d = XFS_AGB_TO_DADDR(mp, agno, agbno);
-	xfs_baread(mp->m_ddev_targp, d, mp->m_bsize * count);
+	xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count);
 }
 
 STATIC int
@@ -768,22 +763,19 @@ xfs_btree_readahead(
  * Set the buffer for level "lev" in the cursor to bp, releasing
  * any previous buffer.
  */
-void
+STATIC void
 xfs_btree_setbuf(
 	xfs_btree_cur_t		*cur,	/* btree cursor */
 	int			lev,	/* level in btree */
 	xfs_buf_t		*bp)	/* new buffer to set */
 {
 	struct xfs_btree_block	*b;	/* btree block */
-	xfs_buf_t		*obp;	/* old buffer pointer */
 
-	obp = cur->bc_bufs[lev];
-	if (obp)
-		xfs_trans_brelse(cur->bc_tp, obp);
+	if (cur->bc_bufs[lev])
+		xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[lev]);
 	cur->bc_bufs[lev] = bp;
 	cur->bc_ra[lev] = 0;
-	if (!bp)
-		return;
+
 	b = XFS_BUF_TO_BLOCK(bp);
 	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
 		if (be64_to_cpu(b->bb_u.l.bb_leftsib) == NULLDFSBNO)
@@ -3016,6 +3008,43 @@ out0:
 	return 0;
 }
 
+/*
+ * Kill the current root node, and replace it with it's only child node.
+ */
+STATIC int
+xfs_btree_kill_root(
+	struct xfs_btree_cur	*cur,
+	struct xfs_buf		*bp,
+	int			level,
+	union xfs_btree_ptr	*newroot)
+{
+	int			error;
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+	XFS_BTREE_STATS_INC(cur, killroot);
+
+	/*
+	 * Update the root pointer, decreasing the level by 1 and then
+	 * free the old root.
+	 */
+	cur->bc_ops->set_root(cur, newroot, -1);
+
+	error = cur->bc_ops->free_block(cur, bp);
+	if (error) {
+		XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+		return error;
+	}
+
+	XFS_BTREE_STATS_INC(cur, free);
+
+	cur->bc_bufs[level] = NULL;
+	cur->bc_ra[level] = 0;
+	cur->bc_nlevels--;
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+	return 0;
+}
+
 STATIC int
 xfs_btree_dec_cursor(
 	struct xfs_btree_cur	*cur,
@@ -3200,7 +3229,7 @@ xfs_btree_delrec(
 			 * Make it the new root of the btree.
 			 */
 			pp = xfs_btree_ptr_addr(cur, 1, block);
-			error = cur->bc_ops->kill_root(cur, bp, level, pp);
+			error = xfs_btree_kill_root(cur, bp, level, pp);
 			if (error)
 				goto error0;
 		} else if (level > 0) {
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index 7fa07062bdd..82fafc66bd1 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -152,9 +152,7 @@ struct xfs_btree_ops {
 
 	/* update btree root pointer */
 	void	(*set_root)(struct xfs_btree_cur *cur,
-				union xfs_btree_ptr *nptr, int level_change);
-	int	(*kill_root)(struct xfs_btree_cur *cur, struct xfs_buf *bp,
-				int level, union xfs_btree_ptr *newroot);
+			    union xfs_btree_ptr *nptr, int level_change);
 
 	/* block allocation / freeing */
 	int	(*alloc_block)(struct xfs_btree_cur *cur,
@@ -399,16 +397,6 @@ xfs_btree_reada_bufs(
 	xfs_agblock_t		agbno,	/* allocation group block number */
 	xfs_extlen_t		count);	/* count of filesystem blocks */
 
-/*
- * Set the buffer for level "lev" in the cursor to bp, releasing
- * any previous buffer.
- */
-void
-xfs_btree_setbuf(
-	xfs_btree_cur_t		*cur,	/* btree cursor */
-	int			lev,	/* level in btree */
-	struct xfs_buf		*bp);	/* new buffer to set */
-
 
 /*
  * Common btree core entry points.
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 02a80984aa0..2686d0d54c5 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -24,7 +24,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_buf_item.h"
 #include "xfs_trans_priv.h"
@@ -34,6 +33,12 @@
 
 kmem_zone_t	*xfs_buf_item_zone;
 
+static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip)
+{
+	return container_of(lip, struct xfs_buf_log_item, bli_item);
+}
+
+
 #ifdef XFS_TRANS_DEBUG
 /*
  * This function uses an alternate strategy for tracking the bytes
@@ -151,12 +156,13 @@ STATIC void	xfs_buf_do_callbacks(xfs_buf_t *bp, xfs_log_item_t *lip);
  */
 STATIC uint
 xfs_buf_item_size(
-	xfs_buf_log_item_t	*bip)
+	struct xfs_log_item	*lip)
 {
-	uint		nvecs;
-	int		next_bit;
-	int		last_bit;
-	xfs_buf_t	*bp;
+	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
+	struct xfs_buf		*bp = bip->bli_buf;
+	uint			nvecs;
+	int			next_bit;
+	int			last_bit;
 
 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
 	if (bip->bli_flags & XFS_BLI_STALE) {
@@ -170,7 +176,6 @@ xfs_buf_item_size(
 		return 1;
 	}
 
-	bp = bip->bli_buf;
 	ASSERT(bip->bli_flags & XFS_BLI_LOGGED);
 	nvecs = 1;
 	last_bit = xfs_next_bit(bip->bli_format.blf_data_map,
@@ -219,13 +224,13 @@ xfs_buf_item_size(
  */
 STATIC void
 xfs_buf_item_format(
-	xfs_buf_log_item_t	*bip,
-	xfs_log_iovec_t		*log_vector)
+	struct xfs_log_item	*lip,
+	struct xfs_log_iovec	*vecp)
 {
+	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
+	struct xfs_buf	*bp = bip->bli_buf;
 	uint		base_size;
 	uint		nvecs;
-	xfs_log_iovec_t	*vecp;
-	xfs_buf_t	*bp;
 	int		first_bit;
 	int		last_bit;
 	int		next_bit;
@@ -235,8 +240,6 @@ xfs_buf_item_format(
 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
 	ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
 	       (bip->bli_flags & XFS_BLI_STALE));
-	bp = bip->bli_buf;
-	vecp = log_vector;
 
 	/*
 	 * The size of the base structure is the size of the
@@ -248,7 +251,7 @@ xfs_buf_item_format(
 	base_size =
 		(uint)(sizeof(xfs_buf_log_format_t) +
 		       ((bip->bli_format.blf_map_size - 1) * sizeof(uint)));
-	vecp->i_addr = (xfs_caddr_t)&bip->bli_format;
+	vecp->i_addr = &bip->bli_format;
 	vecp->i_len = base_size;
 	vecp->i_type = XLOG_REG_TYPE_BFORMAT;
 	vecp++;
@@ -263,7 +266,7 @@ xfs_buf_item_format(
 	 */
 	if (bip->bli_flags & XFS_BLI_INODE_BUF) {
 		if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
-		      xfs_log_item_in_current_chkpt(&bip->bli_item)))
+		      xfs_log_item_in_current_chkpt(lip)))
 			bip->bli_format.blf_flags |= XFS_BLF_INODE_BUF;
 		bip->bli_flags &= ~XFS_BLI_INODE_BUF;
 	}
@@ -356,66 +359,90 @@ xfs_buf_item_format(
 
 /*
  * This is called to pin the buffer associated with the buf log item in memory
- * so it cannot be written out.  Simply call bpin() on the buffer to do this.
+ * so it cannot be written out.
  *
  * We also always take a reference to the buffer log item here so that the bli
  * is held while the item is pinned in memory. This means that we can
  * unconditionally drop the reference count a transaction holds when the
  * transaction is completed.
  */
-
 STATIC void
 xfs_buf_item_pin(
-	xfs_buf_log_item_t	*bip)
+	struct xfs_log_item	*lip)
 {
-	xfs_buf_t	*bp;
+	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
 
-	bp = bip->bli_buf;
-	ASSERT(XFS_BUF_ISBUSY(bp));
+	ASSERT(XFS_BUF_ISBUSY(bip->bli_buf));
 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
 	ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
 	       (bip->bli_flags & XFS_BLI_STALE));
-	atomic_inc(&bip->bli_refcount);
+
 	trace_xfs_buf_item_pin(bip);
-	xfs_bpin(bp);
-}
 
+	atomic_inc(&bip->bli_refcount);
+	atomic_inc(&bip->bli_buf->b_pin_count);
+}
 
 /*
  * This is called to unpin the buffer associated with the buf log
  * item which was previously pinned with a call to xfs_buf_item_pin().
- * Just call bunpin() on the buffer to do this.
  *
  * Also drop the reference to the buf item for the current transaction.
  * If the XFS_BLI_STALE flag is set and we are the last reference,
  * then free up the buf log item and unlock the buffer.
+ *
+ * If the remove flag is set we are called from uncommit in the
+ * forced-shutdown path.  If that is true and the reference count on
+ * the log item is going to drop to zero we need to free the item's
+ * descriptor in the transaction.
  */
 STATIC void
 xfs_buf_item_unpin(
-	xfs_buf_log_item_t	*bip)
+	struct xfs_log_item	*lip,
+	int			remove)
 {
-	struct xfs_ail	*ailp;
-	xfs_buf_t	*bp;
-	int		freed;
+	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
+	xfs_buf_t	*bp = bip->bli_buf;
+	struct xfs_ail	*ailp = lip->li_ailp;
 	int		stale = bip->bli_flags & XFS_BLI_STALE;
+	int		freed;
 
-	bp = bip->bli_buf;
-	ASSERT(bp != NULL);
 	ASSERT(XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *) == bip);
 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
+
 	trace_xfs_buf_item_unpin(bip);
 
 	freed = atomic_dec_and_test(&bip->bli_refcount);
-	ailp = bip->bli_item.li_ailp;
-	xfs_bunpin(bp);
+
+	if (atomic_dec_and_test(&bp->b_pin_count))
+		wake_up_all(&bp->b_waiters);
+
 	if (freed && stale) {
 		ASSERT(bip->bli_flags & XFS_BLI_STALE);
 		ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
 		ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
 		ASSERT(XFS_BUF_ISSTALE(bp));
 		ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
+
 		trace_xfs_buf_item_unpin_stale(bip);
 
+		if (remove) {
+			/*
+			 * We have to remove the log item from the transaction
+			 * as we are about to release our reference to the
+			 * buffer.  If we don't, the unlock that occurs later
+			 * in xfs_trans_uncommit() will ry to reference the
+			 * buffer which we no longer have a hold on.
+			 */
+			xfs_trans_del_item(lip);
+
+			/*
+			 * Since the transaction no longer refers to the buffer,
+			 * the buffer should no longer refer to the transaction.
+			 */
+			XFS_BUF_SET_FSPRIVATE2(bp, NULL);
+		}
+
 		/*
 		 * If we get called here because of an IO error, we may
 		 * or may not have the item on the AIL. xfs_trans_ail_delete()
@@ -437,48 +464,6 @@ xfs_buf_item_unpin(
 }
 
 /*
- * this is called from uncommit in the forced-shutdown path.
- * we need to check to see if the reference count on the log item
- * is going to drop to zero.  If so, unpin will free the log item
- * so we need to free the item's descriptor (that points to the item)
- * in the transaction.
- */
-STATIC void
-xfs_buf_item_unpin_remove(
-	xfs_buf_log_item_t	*bip,
-	xfs_trans_t		*tp)
-{
-	/* will xfs_buf_item_unpin() call xfs_buf_item_relse()? */
-	if ((atomic_read(&bip->bli_refcount) == 1) &&
-	    (bip->bli_flags & XFS_BLI_STALE)) {
-		/*
-		 * yes -- We can safely do some work here and then call
-		 * buf_item_unpin to do the rest because we are
-		 * are holding the buffer locked so no one else will be
-		 * able to bump up the refcount. We have to remove the
-		 * log item from the transaction as we are about to release
-		 * our reference to the buffer. If we don't, the unlock that
-		 * occurs later in the xfs_trans_uncommit() will try to
-		 * reference the buffer which we no longer have a hold on.
-		 */
-		struct xfs_log_item_desc *lidp;
-
-		ASSERT(XFS_BUF_VALUSEMA(bip->bli_buf) <= 0);
-		trace_xfs_buf_item_unpin_stale(bip);
-
-		lidp = xfs_trans_find_item(tp, (xfs_log_item_t *)bip);
-		xfs_trans_free_item(tp, lidp);
-
-		/*
-		 * Since the transaction no longer refers to the buffer, the
-		 * buffer should no longer refer to the transaction.
-		 */
-		XFS_BUF_SET_FSPRIVATE2(bip->bli_buf, NULL);
-	}
-	xfs_buf_item_unpin(bip);
-}
-
-/*
  * This is called to attempt to lock the buffer associated with this
  * buf log item.  Don't sleep on the buffer lock.  If we can't get
  * the lock right away, return 0.  If we can get the lock, take a
@@ -488,11 +473,11 @@ xfs_buf_item_unpin_remove(
  */
 STATIC uint
 xfs_buf_item_trylock(
-	xfs_buf_log_item_t	*bip)
+	struct xfs_log_item	*lip)
 {
-	xfs_buf_t	*bp;
+	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
+	struct xfs_buf		*bp = bip->bli_buf;
 
-	bp = bip->bli_buf;
 	if (XFS_BUF_ISPINNED(bp))
 		return XFS_ITEM_PINNED;
 	if (!XFS_BUF_CPSEMA(bp))
@@ -529,13 +514,12 @@ xfs_buf_item_trylock(
  */
 STATIC void
 xfs_buf_item_unlock(
-	xfs_buf_log_item_t	*bip)
+	struct xfs_log_item	*lip)
 {
-	int		aborted;
-	xfs_buf_t	*bp;
-	uint		hold;
-
-	bp = bip->bli_buf;
+	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
+	struct xfs_buf		*bp = bip->bli_buf;
+	int			aborted;
+	uint			hold;
 
 	/* Clear the buffer's association with this transaction. */
 	XFS_BUF_SET_FSPRIVATE2(bp, NULL);
@@ -546,7 +530,7 @@ xfs_buf_item_unlock(
 	 * (cancelled) buffers at unpin time, but we'll never go through the
 	 * pin/unpin cycle if we abort inside commit.
 	 */
-	aborted = (bip->bli_item.li_flags & XFS_LI_ABORTED) != 0;
+	aborted = (lip->li_flags & XFS_LI_ABORTED) != 0;
 
 	/*
 	 * Before possibly freeing the buf item, determine if we should
@@ -607,16 +591,16 @@ xfs_buf_item_unlock(
  */
 STATIC xfs_lsn_t
 xfs_buf_item_committed(
-	xfs_buf_log_item_t	*bip,
+	struct xfs_log_item	*lip,
 	xfs_lsn_t		lsn)
 {
+	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
+
 	trace_xfs_buf_item_committed(bip);
 
-	if ((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
-	    (bip->bli_item.li_lsn != 0)) {
-		return bip->bli_item.li_lsn;
-	}
-	return (lsn);
+	if ((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && lip->li_lsn != 0)
+		return lip->li_lsn;
+	return lsn;
 }
 
 /*
@@ -626,15 +610,16 @@ xfs_buf_item_committed(
  */
 STATIC void
 xfs_buf_item_push(
-	xfs_buf_log_item_t	*bip)
+	struct xfs_log_item	*lip)
 {
-	xfs_buf_t	*bp;
+	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
+	struct xfs_buf		*bp = bip->bli_buf;
 
 	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
+	ASSERT(!XFS_BUF_ISDELAYWRITE(bp));
+
 	trace_xfs_buf_item_push(bip);
 
-	bp = bip->bli_buf;
-	ASSERT(!XFS_BUF_ISDELAYWRITE(bp));
 	xfs_buf_relse(bp);
 }
 
@@ -646,22 +631,24 @@ xfs_buf_item_push(
  */
 STATIC void
 xfs_buf_item_pushbuf(
-	xfs_buf_log_item_t	*bip)
+	struct xfs_log_item	*lip)
 {
-	xfs_buf_t	*bp;
+	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
+	struct xfs_buf		*bp = bip->bli_buf;
 
 	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
+	ASSERT(XFS_BUF_ISDELAYWRITE(bp));
+
 	trace_xfs_buf_item_pushbuf(bip);
 
-	bp = bip->bli_buf;
-	ASSERT(XFS_BUF_ISDELAYWRITE(bp));
 	xfs_buf_delwri_promote(bp);
 	xfs_buf_relse(bp);
 }
 
-/* ARGSUSED */
 STATIC void
-xfs_buf_item_committing(xfs_buf_log_item_t *bip, xfs_lsn_t commit_lsn)
+xfs_buf_item_committing(
+	struct xfs_log_item	*lip,
+	xfs_lsn_t		commit_lsn)
 {
 }
 
@@ -669,21 +656,16 @@ xfs_buf_item_committing(xfs_buf_log_item_t *bip, xfs_lsn_t commit_lsn)
  * This is the ops vector shared by all buf log items.
  */
 static struct xfs_item_ops xfs_buf_item_ops = {
-	.iop_size	= (uint(*)(xfs_log_item_t*))xfs_buf_item_size,
-	.iop_format	= (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
-					xfs_buf_item_format,
-	.iop_pin	= (void(*)(xfs_log_item_t*))xfs_buf_item_pin,
-	.iop_unpin	= (void(*)(xfs_log_item_t*))xfs_buf_item_unpin,
-	.iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *))
-					xfs_buf_item_unpin_remove,
-	.iop_trylock	= (uint(*)(xfs_log_item_t*))xfs_buf_item_trylock,
-	.iop_unlock	= (void(*)(xfs_log_item_t*))xfs_buf_item_unlock,
-	.iop_committed	= (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
-					xfs_buf_item_committed,
-	.iop_push	= (void(*)(xfs_log_item_t*))xfs_buf_item_push,
-	.iop_pushbuf	= (void(*)(xfs_log_item_t*))xfs_buf_item_pushbuf,
-	.iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
-					xfs_buf_item_committing
+	.iop_size	= xfs_buf_item_size,
+	.iop_format	= xfs_buf_item_format,
+	.iop_pin	= xfs_buf_item_pin,
+	.iop_unpin	= xfs_buf_item_unpin,
+	.iop_trylock	= xfs_buf_item_trylock,
+	.iop_unlock	= xfs_buf_item_unlock,
+	.iop_committed	= xfs_buf_item_committed,
+	.iop_push	= xfs_buf_item_push,
+	.iop_pushbuf	= xfs_buf_item_pushbuf,
+	.iop_committing = xfs_buf_item_committing
 };
 
 
@@ -710,9 +692,7 @@ xfs_buf_item_init(
 	 * the first.  If we do already have one, there is
 	 * nothing to do here so return.
 	 */
-	if (bp->b_mount != mp)
-		bp->b_mount = mp;
-	XFS_BUF_SET_BDSTRAT_FUNC(bp, xfs_bdstrat_cb);
+	ASSERT(bp->b_target->bt_mount == mp);
 	if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) {
 		lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
 		if (lip->li_type == XFS_LI_BUF) {
@@ -993,7 +973,7 @@ xfs_buf_iodone_callbacks(
 			xfs_buf_do_callbacks(bp, lip);
 			XFS_BUF_SET_FSPRIVATE(bp, NULL);
 			XFS_BUF_CLR_IODONE_FUNC(bp);
-			xfs_biodone(bp);
+			xfs_buf_ioend(bp, 0);
 			return;
 		}
 
@@ -1052,7 +1032,7 @@ xfs_buf_iodone_callbacks(
 	xfs_buf_do_callbacks(bp, lip);
 	XFS_BUF_SET_FSPRIVATE(bp, NULL);
 	XFS_BUF_CLR_IODONE_FUNC(bp);
-	xfs_biodone(bp);
+	xfs_buf_ioend(bp, 0);
 }
 
 /*
@@ -1098,15 +1078,14 @@ xfs_buf_error_relse(
  * It is called by xfs_buf_iodone_callbacks() above which will take
  * care of cleaning up the buffer itself.
  */
-/* ARGSUSED */
 void
 xfs_buf_iodone(
-	xfs_buf_t		*bp,
-	xfs_buf_log_item_t	*bip)
+	struct xfs_buf		*bp,
+	struct xfs_log_item	*lip)
 {
-	struct xfs_ail		*ailp = bip->bli_item.li_ailp;
+	struct xfs_ail		*ailp = lip->li_ailp;
 
-	ASSERT(bip->bli_buf == bp);
+	ASSERT(BUF_ITEM(lip)->bli_buf == bp);
 
 	xfs_buf_rele(bp);
 
@@ -1120,6 +1099,6 @@ xfs_buf_iodone(
 	 * Either way, AIL is useless if we're forcing a shutdown.
 	 */
 	spin_lock(&ailp->xa_lock);
-	xfs_trans_ail_delete(ailp, (xfs_log_item_t *)bip);
-	xfs_buf_item_free(bip);
+	xfs_trans_ail_delete(ailp, lip);
+	xfs_buf_item_free(BUF_ITEM(lip));
 }
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index f20bb472d58..0e2ed43f16c 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -124,7 +124,7 @@ void	xfs_buf_attach_iodone(struct xfs_buf *,
 			      void(*)(struct xfs_buf *, xfs_log_item_t *),
 			      xfs_log_item_t *);
 void	xfs_buf_iodone_callbacks(struct xfs_buf *);
-void	xfs_buf_iodone(struct xfs_buf *, xfs_buf_log_item_t *);
+void	xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *);
 
 #ifdef XFS_TRANS_DEBUG
 void
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 0ca556b4bf3..1c00bedb317 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -25,19 +25,14 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
 #include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
 #include "xfs_alloc.h"
-#include "xfs_btree.h"
 #include "xfs_bmap.h"
 #include "xfs_attr.h"
 #include "xfs_attr_leaf.h"
@@ -581,16 +576,14 @@ xfs_da_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
 	xfs_da_intnode_t *node;
 	xfs_da_node_entry_t *btree;
 	int tmp;
-	xfs_mount_t *mp;
 
 	node = oldblk->bp->data;
-	mp = state->mp;
 	ASSERT(be16_to_cpu(node->hdr.info.magic) == XFS_DA_NODE_MAGIC);
 	ASSERT((oldblk->index >= 0) && (oldblk->index <= be16_to_cpu(node->hdr.count)));
 	ASSERT(newblk->blkno != 0);
 	if (state->args->whichfork == XFS_DATA_FORK)
-		ASSERT(newblk->blkno >= mp->m_dirleafblk &&
-		       newblk->blkno < mp->m_dirfreeblk);
+		ASSERT(newblk->blkno >= state->mp->m_dirleafblk &&
+		       newblk->blkno < state->mp->m_dirfreeblk);
 
 	/*
 	 * We may need to make some room before we insert the new node.
@@ -1601,7 +1594,7 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno)
 			xfs_bmapi_aflag(w)|XFS_BMAPI_WRITE|XFS_BMAPI_METADATA|
 			XFS_BMAPI_CONTIG,
 			args->firstblock, args->total, &map, &nmap,
-			args->flist, NULL))) {
+			args->flist))) {
 		return error;
 	}
 	ASSERT(nmap <= 1);
@@ -1622,8 +1615,7 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno)
 					xfs_bmapi_aflag(w)|XFS_BMAPI_WRITE|
 					XFS_BMAPI_METADATA,
 					args->firstblock, args->total,
-					&mapp[mapi], &nmap, args->flist,
-					NULL))) {
+					&mapp[mapi], &nmap, args->flist))) {
 				kmem_free(mapp);
 				return error;
 			}
@@ -1884,7 +1876,7 @@ xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
 		 */
 		if ((error = xfs_bunmapi(tp, dp, dead_blkno, count,
 				xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
-				0, args->firstblock, args->flist, NULL,
+				0, args->firstblock, args->flist,
 				&done)) == ENOSPC) {
 			if (w != XFS_DATA_FORK)
 				break;
@@ -1989,7 +1981,7 @@ xfs_da_do_buf(
 					nfsb,
 					XFS_BMAPI_METADATA |
 						xfs_bmapi_aflag(whichfork),
-					NULL, 0, mapp, &nmap, NULL, NULL)))
+					NULL, 0, mapp, &nmap, NULL)))
 				goto exit0;
 		}
 	} else {
@@ -2050,7 +2042,7 @@ xfs_da_do_buf(
 				mappedbno, nmapped, 0, &bp);
 			break;
 		case 3:
-			xfs_baread(mp->m_ddev_targp, mappedbno, nmapped);
+			xfs_buf_readahead(mp->m_ddev_targp, mappedbno, nmapped);
 			error = 0;
 			bp = NULL;
 			break;
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index 7f159d2a429..3b9582c60a2 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -24,24 +24,15 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
 #include "xfs_bmap.h"
-#include "xfs_btree.h"
-#include "xfs_ialloc.h"
 #include "xfs_itable.h"
 #include "xfs_dfrag.h"
 #include "xfs_error.h"
-#include "xfs_rw.h"
 #include "xfs_vnodeops.h"
 #include "xfs_trace.h"
 
@@ -425,11 +416,8 @@ xfs_swap_extents(
 	}
 
 
-	IHOLD(ip);
-	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-
-	IHOLD(tip);
-	xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+	xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+	xfs_trans_ijoin_ref(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
 
 	xfs_trans_log_inode(tp, ip,  ilf_fields);
 	xfs_trans_log_inode(tp, tip, tilf_fields);
diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h
index e5b153b2e6a..dffba9ba0db 100644
--- a/fs/xfs/xfs_dinode.h
+++ b/fs/xfs/xfs_dinode.h
@@ -49,8 +49,9 @@ typedef struct xfs_dinode {
 	__be32		di_uid;		/* owner's user id */
 	__be32		di_gid;		/* owner's group id */
 	__be32		di_nlink;	/* number of links to file */
-	__be16		di_projid;	/* owner's project id */
-	__u8		di_pad[8];	/* unused, zeroed space */
+	__be16		di_projid_lo;	/* lower part of owner's project id */
+	__be16		di_projid_hi;	/* higher part owner's project id */
+	__u8		di_pad[6];	/* unused, zeroed space */
 	__be16		di_flushiter;	/* incremented on flush */
 	xfs_timestamp_t	di_atime;	/* time last accessed */
 	xfs_timestamp_t	di_mtime;	/* time last modified */
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index 42520f04126..a1321bc7f19 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -25,13 +25,11 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
@@ -382,7 +380,7 @@ xfs_readdir(
 	int		rval;		/* return value */
 	int		v;		/* type-checking value */
 
-	xfs_itrace_entry(dp);
+	trace_xfs_readdir(dp);
 
 	if (XFS_FORCED_SHUTDOWN(dp->i_mount))
 		return XFS_ERROR(EIO);
@@ -549,7 +547,7 @@ xfs_dir2_grow_inode(
 	if ((error = xfs_bmapi(tp, dp, bno, count,
 			XFS_BMAPI_WRITE|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG,
 			args->firstblock, args->total, &map, &nmap,
-			args->flist, NULL)))
+			args->flist)))
 		return error;
 	ASSERT(nmap <= 1);
 	if (nmap == 1) {
@@ -581,8 +579,7 @@ xfs_dir2_grow_inode(
 			if ((error = xfs_bmapi(tp, dp, b, c,
 					XFS_BMAPI_WRITE|XFS_BMAPI_METADATA,
 					args->firstblock, args->total,
-					&mapp[mapi], &nmap, args->flist,
-					NULL))) {
+					&mapp[mapi], &nmap, args->flist))) {
 				kmem_free(mapp);
 				return error;
 			}
@@ -715,7 +712,7 @@ xfs_dir2_shrink_inode(
 	 */
 	if ((error = xfs_bunmapi(tp, dp, da, mp->m_dirblkfsbs,
 			XFS_BMAPI_METADATA, 0, args->firstblock, args->flist,
-			NULL, &done))) {
+			&done))) {
 		/*
 		 * ENOSPC actually can happen if we're in a removename with
 		 * no space reservation, and the resulting block removal
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index 779a267b0a8..580d99cef9e 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -24,12 +24,10 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
@@ -1073,10 +1071,10 @@ xfs_dir2_sf_to_block(
 	 */
 
 	buf_len = dp->i_df.if_bytes;
-	buf = kmem_alloc(dp->i_df.if_bytes, KM_SLEEP);
+	buf = kmem_alloc(buf_len, KM_SLEEP);
 
-	memcpy(buf, sfp, dp->i_df.if_bytes);
-	xfs_idata_realloc(dp, -dp->i_df.if_bytes, XFS_DATA_FORK);
+	memcpy(buf, sfp, buf_len);
+	xfs_idata_realloc(dp, -buf_len, XFS_DATA_FORK);
 	dp->i_d.di_size = 0;
 	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
 	/*
diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c
index 498f8d69433..921595b84f5 100644
--- a/fs/xfs/xfs_dir2_data.c
+++ b/fs/xfs/xfs_dir2_data.c
@@ -24,12 +24,10 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_dir2_data.h"
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index e2d89854ec9..ae891223be9 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -25,11 +25,9 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
@@ -875,7 +873,7 @@ xfs_dir2_leaf_getdents(
 					xfs_dir2_byte_to_da(mp,
 						XFS_DIR2_LEAF_OFFSET) - map_off,
 					XFS_BMAPI_METADATA, NULL, 0,
-					&map[map_valid], &nmap, NULL, NULL);
+					&map[map_valid], &nmap, NULL);
 				/*
 				 * Don't know if we should ignore this or
 				 * try to return an error.
@@ -963,7 +961,7 @@ xfs_dir2_leaf_getdents(
 				if (i > ra_current &&
 				    map[ra_index].br_blockcount >=
 				    mp->m_dirblkfsbs) {
-					xfs_baread(mp->m_ddev_targp,
+					xfs_buf_readahead(mp->m_ddev_targp,
 						XFS_FSB_TO_DADDR(mp,
 						   map[ra_index].br_startblock +
 						   ra_offset),
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index 78fc4d9ae75..f9a0864b696 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -24,12 +24,10 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_bmap.h"
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
index c1a5945d463..b1bae6b1eed 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -24,12 +24,10 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
diff --git a/fs/xfs/xfs_dmapi.h b/fs/xfs/xfs_dmapi.h
deleted file mode 100644
index 2813cdd7237..00000000000
--- a/fs/xfs/xfs_dmapi.h
+++ /dev/null
@@ -1,170 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_DMAPI_H__
-#define __XFS_DMAPI_H__
-
-/*	Values used to define the on-disk version of dm_attrname_t. All
- *	on-disk attribute names start with the 8-byte string "SGI_DMI_".
- *
- *      In the on-disk inode, DMAPI attribute names consist of the user-provided
- *      name with the DMATTR_PREFIXSTRING pre-pended.  This string must NEVER be
- *      changed.
- */
-
-#define DMATTR_PREFIXLEN	8
-#define DMATTR_PREFIXSTRING	"SGI_DMI_"
-
-typedef enum {
-	DM_EVENT_INVALID	= -1,
-	DM_EVENT_CANCEL		= 0,		/* not supported */
-	DM_EVENT_MOUNT		= 1,
-	DM_EVENT_PREUNMOUNT	= 2,
-	DM_EVENT_UNMOUNT	= 3,
-	DM_EVENT_DEBUT		= 4,		/* not supported */
-	DM_EVENT_CREATE		= 5,
-	DM_EVENT_CLOSE		= 6,		/* not supported */
-	DM_EVENT_POSTCREATE	= 7,
-	DM_EVENT_REMOVE		= 8,
-	DM_EVENT_POSTREMOVE	= 9,
-	DM_EVENT_RENAME		= 10,
-	DM_EVENT_POSTRENAME	= 11,
-	DM_EVENT_LINK		= 12,
-	DM_EVENT_POSTLINK	= 13,
-	DM_EVENT_SYMLINK	= 14,
-	DM_EVENT_POSTSYMLINK	= 15,
-	DM_EVENT_READ		= 16,
-	DM_EVENT_WRITE		= 17,
-	DM_EVENT_TRUNCATE	= 18,
-	DM_EVENT_ATTRIBUTE	= 19,
-	DM_EVENT_DESTROY	= 20,
-	DM_EVENT_NOSPACE	= 21,
-	DM_EVENT_USER		= 22,
-	DM_EVENT_MAX		= 23
-} dm_eventtype_t;
-#define HAVE_DM_EVENTTYPE_T
-
-typedef enum {
-	DM_RIGHT_NULL,
-	DM_RIGHT_SHARED,
-	DM_RIGHT_EXCL
-} dm_right_t;
-#define HAVE_DM_RIGHT_T
-
-/* Defines for determining if an event message should be sent. */
-#ifdef HAVE_DMAPI
-#define	DM_EVENT_ENABLED(ip, event) ( \
-	unlikely ((ip)->i_mount->m_flags & XFS_MOUNT_DMAPI) && \
-		( ((ip)->i_d.di_dmevmask & (1 << event)) || \
-		  ((ip)->i_mount->m_dmevmask & (1 << event)) ) \
-	)
-#else
-#define DM_EVENT_ENABLED(ip, event)	(0)
-#endif
-
-#define DM_XFS_VALID_FS_EVENTS		( \
-	(1 << DM_EVENT_PREUNMOUNT)	| \
-	(1 << DM_EVENT_UNMOUNT)		| \
-	(1 << DM_EVENT_NOSPACE)		| \
-	(1 << DM_EVENT_DEBUT)		| \
-	(1 << DM_EVENT_CREATE)		| \
-	(1 << DM_EVENT_POSTCREATE)	| \
-	(1 << DM_EVENT_REMOVE)		| \
-	(1 << DM_EVENT_POSTREMOVE)	| \
-	(1 << DM_EVENT_RENAME)		| \
-	(1 << DM_EVENT_POSTRENAME)	| \
-	(1 << DM_EVENT_LINK)		| \
-	(1 << DM_EVENT_POSTLINK)	| \
-	(1 << DM_EVENT_SYMLINK)		| \
-	(1 << DM_EVENT_POSTSYMLINK)	| \
-	(1 << DM_EVENT_ATTRIBUTE)	| \
-	(1 << DM_EVENT_DESTROY)		)
-
-/* Events valid in dm_set_eventlist() when called with a file handle for
-   a regular file or a symlink.  These events are persistent.
-*/
-
-#define	DM_XFS_VALID_FILE_EVENTS	( \
-	(1 << DM_EVENT_ATTRIBUTE)	| \
-	(1 << DM_EVENT_DESTROY)		)
-
-/* Events valid in dm_set_eventlist() when called with a file handle for
-   a directory.  These events are persistent.
-*/
-
-#define	DM_XFS_VALID_DIRECTORY_EVENTS	( \
-	(1 << DM_EVENT_CREATE)		| \
-	(1 << DM_EVENT_POSTCREATE)	| \
-	(1 << DM_EVENT_REMOVE)		| \
-	(1 << DM_EVENT_POSTREMOVE)	| \
-	(1 << DM_EVENT_RENAME)		| \
-	(1 << DM_EVENT_POSTRENAME)	| \
-	(1 << DM_EVENT_LINK)		| \
-	(1 << DM_EVENT_POSTLINK)	| \
-	(1 << DM_EVENT_SYMLINK)		| \
-	(1 << DM_EVENT_POSTSYMLINK)	| \
-	(1 << DM_EVENT_ATTRIBUTE)	| \
-	(1 << DM_EVENT_DESTROY)		)
-
-/* Events supported by the XFS filesystem. */
-#define	DM_XFS_SUPPORTED_EVENTS		( \
-	(1 << DM_EVENT_MOUNT)		| \
-	(1 << DM_EVENT_PREUNMOUNT)	| \
-	(1 << DM_EVENT_UNMOUNT)		| \
-	(1 << DM_EVENT_NOSPACE)		| \
-	(1 << DM_EVENT_CREATE)		| \
-	(1 << DM_EVENT_POSTCREATE)	| \
-	(1 << DM_EVENT_REMOVE)		| \
-	(1 << DM_EVENT_POSTREMOVE)	| \
-	(1 << DM_EVENT_RENAME)		| \
-	(1 << DM_EVENT_POSTRENAME)	| \
-	(1 << DM_EVENT_LINK)		| \
-	(1 << DM_EVENT_POSTLINK)	| \
-	(1 << DM_EVENT_SYMLINK)		| \
-	(1 << DM_EVENT_POSTSYMLINK)	| \
-	(1 << DM_EVENT_READ)		| \
-	(1 << DM_EVENT_WRITE)		| \
-	(1 << DM_EVENT_TRUNCATE)	| \
-	(1 << DM_EVENT_ATTRIBUTE)	| \
-	(1 << DM_EVENT_DESTROY)		)
-
-
-/*
- *	Definitions used for the flags field on dm_send_*_event().
- */
-
-#define DM_FLAGS_NDELAY		0x001	/* return EAGAIN after dm_pending() */
-#define DM_FLAGS_UNWANTED	0x002	/* event not in fsys dm_eventset_t */
-#define DM_FLAGS_IMUX		0x004	/* thread holds i_mutex */
-#define DM_FLAGS_IALLOCSEM_RD	0x010	/* thread holds i_alloc_sem rd */
-#define DM_FLAGS_IALLOCSEM_WR	0x020	/* thread holds i_alloc_sem wr */
-
-/*
- *	Pull in platform specific event flags defines
- */
-#include "xfs_dmapi_priv.h"
-
-/*
- *	Macros to turn caller specified delay/block flags into
- *	dm_send_xxxx_event flag DM_FLAGS_NDELAY.
- */
-
-#define FILP_DELAY_FLAG(filp) ((filp->f_flags&(O_NDELAY|O_NONBLOCK)) ? \
-			DM_FLAGS_NDELAY : 0)
-#define AT_DELAY_FLAG(f) ((f & XFS_ATTR_NONBLOCK) ? DM_FLAGS_NDELAY : 0)
-
-#endif  /* __XFS_DMAPI_H__ */
diff --git a/fs/xfs/xfs_dmops.c b/fs/xfs/xfs_dmops.c
deleted file mode 100644
index e71e2581c0c..00000000000
--- a/fs/xfs/xfs_dmops.c
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_dmapi.h"
-#include "xfs_inum.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-
-
-static struct xfs_dmops xfs_dmcore_stub = {
-	.xfs_send_data		= (xfs_send_data_t)fs_nosys,
-	.xfs_send_mmap		= (xfs_send_mmap_t)fs_noerr,
-	.xfs_send_destroy	= (xfs_send_destroy_t)fs_nosys,
-	.xfs_send_namesp	= (xfs_send_namesp_t)fs_nosys,
-	.xfs_send_mount		= (xfs_send_mount_t)fs_nosys,
-	.xfs_send_unmount	= (xfs_send_unmount_t)fs_noerr,
-};
-
-int
-xfs_dmops_get(struct xfs_mount *mp)
-{
-	if (mp->m_flags & XFS_MOUNT_DMAPI) {
-		cmn_err(CE_WARN,
-			"XFS: dmapi support not available in this kernel.");
-		return EINVAL;
-	}
-
-	mp->m_dm_ops = &xfs_dmcore_stub;
-	return 0;
-}
-
-void
-xfs_dmops_put(struct xfs_mount *mp)
-{
-}
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 047b8a8e5c2..ed999026766 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -23,12 +23,8 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_utils.h"
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 409fe81585f..a55e687bf56 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -24,7 +24,6 @@
 #include "xfs_buf_item.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_trans_priv.h"
 #include "xfs_extfree_item.h"
@@ -33,18 +32,19 @@
 kmem_zone_t	*xfs_efi_zone;
 kmem_zone_t	*xfs_efd_zone;
 
-STATIC void	xfs_efi_item_unlock(xfs_efi_log_item_t *);
+static inline struct xfs_efi_log_item *EFI_ITEM(struct xfs_log_item *lip)
+{
+	return container_of(lip, struct xfs_efi_log_item, efi_item);
+}
 
 void
-xfs_efi_item_free(xfs_efi_log_item_t *efip)
+xfs_efi_item_free(
+	struct xfs_efi_log_item	*efip)
 {
-	int nexts = efip->efi_format.efi_nextents;
-
-	if (nexts > XFS_EFI_MAX_FAST_EXTENTS) {
+	if (efip->efi_format.efi_nextents > XFS_EFI_MAX_FAST_EXTENTS)
 		kmem_free(efip);
-	} else {
+	else
 		kmem_zone_free(xfs_efi_zone, efip);
-	}
 }
 
 /*
@@ -52,9 +52,9 @@ xfs_efi_item_free(xfs_efi_log_item_t *efip)
  * We only need 1 iovec for an efi item.  It just logs the efi_log_format
  * structure.
  */
-/*ARGSUSED*/
 STATIC uint
-xfs_efi_item_size(xfs_efi_log_item_t *efip)
+xfs_efi_item_size(
+	struct xfs_log_item	*lip)
 {
 	return 1;
 }
@@ -67,10 +67,12 @@ xfs_efi_item_size(xfs_efi_log_item_t *efip)
  * slots in the efi item have been filled.
  */
 STATIC void
-xfs_efi_item_format(xfs_efi_log_item_t	*efip,
-		    xfs_log_iovec_t	*log_vector)
+xfs_efi_item_format(
+	struct xfs_log_item	*lip,
+	struct xfs_log_iovec	*log_vector)
 {
-	uint	size;
+	struct xfs_efi_log_item	*efip = EFI_ITEM(lip);
+	uint			size;
 
 	ASSERT(efip->efi_next_extent == efip->efi_format.efi_nextents);
 
@@ -80,7 +82,7 @@ xfs_efi_item_format(xfs_efi_log_item_t	*efip,
 	size += (efip->efi_format.efi_nextents - 1) * sizeof(xfs_extent_t);
 	efip->efi_format.efi_size = 1;
 
-	log_vector->i_addr = (xfs_caddr_t)&(efip->efi_format);
+	log_vector->i_addr = &efip->efi_format;
 	log_vector->i_len = size;
 	log_vector->i_type = XLOG_REG_TYPE_EFI_FORMAT;
 	ASSERT(size >= sizeof(xfs_efi_log_format_t));
@@ -90,60 +92,33 @@ xfs_efi_item_format(xfs_efi_log_item_t	*efip,
 /*
  * Pinning has no meaning for an efi item, so just return.
  */
-/*ARGSUSED*/
 STATIC void
-xfs_efi_item_pin(xfs_efi_log_item_t *efip)
+xfs_efi_item_pin(
+	struct xfs_log_item	*lip)
 {
-	return;
 }
 
-
 /*
  * While EFIs cannot really be pinned, the unpin operation is the
  * last place at which the EFI is manipulated during a transaction.
  * Here we coordinate with xfs_efi_cancel() to determine who gets to
  * free the EFI.
  */
-/*ARGSUSED*/
-STATIC void
-xfs_efi_item_unpin(xfs_efi_log_item_t *efip)
-{
-	struct xfs_ail		*ailp = efip->efi_item.li_ailp;
-
-	spin_lock(&ailp->xa_lock);
-	if (efip->efi_flags & XFS_EFI_CANCELED) {
-		/* xfs_trans_ail_delete() drops the AIL lock. */
-		xfs_trans_ail_delete(ailp, (xfs_log_item_t *)efip);
-		xfs_efi_item_free(efip);
-	} else {
-		efip->efi_flags |= XFS_EFI_COMMITTED;
-		spin_unlock(&ailp->xa_lock);
-	}
-}
-
-/*
- * like unpin only we have to also clear the xaction descriptor
- * pointing the log item if we free the item.  This routine duplicates
- * unpin because efi_flags is protected by the AIL lock.  Freeing
- * the descriptor and then calling unpin would force us to drop the AIL
- * lock which would open up a race condition.
- */
 STATIC void
-xfs_efi_item_unpin_remove(xfs_efi_log_item_t *efip, xfs_trans_t *tp)
+xfs_efi_item_unpin(
+	struct xfs_log_item	*lip,
+	int			remove)
 {
-	struct xfs_ail		*ailp = efip->efi_item.li_ailp;
-	xfs_log_item_desc_t	*lidp;
+	struct xfs_efi_log_item	*efip = EFI_ITEM(lip);
+	struct xfs_ail		*ailp = lip->li_ailp;
 
 	spin_lock(&ailp->xa_lock);
 	if (efip->efi_flags & XFS_EFI_CANCELED) {
-		/*
-		 * free the xaction descriptor pointing to this item
-		 */
-		lidp = xfs_trans_find_item(tp, (xfs_log_item_t *) efip);
-		xfs_trans_free_item(tp, lidp);
+		if (remove)
+			xfs_trans_del_item(lip);
 
 		/* xfs_trans_ail_delete() drops the AIL lock. */
-		xfs_trans_ail_delete(ailp, (xfs_log_item_t *)efip);
+		xfs_trans_ail_delete(ailp, lip);
 		xfs_efi_item_free(efip);
 	} else {
 		efip->efi_flags |= XFS_EFI_COMMITTED;
@@ -158,9 +133,9 @@ xfs_efi_item_unpin_remove(xfs_efi_log_item_t *efip, xfs_trans_t *tp)
  * XFS_ITEM_PINNED so that the caller will eventually flush the log.
  * This should help in getting the EFI out of the AIL.
  */
-/*ARGSUSED*/
 STATIC uint
-xfs_efi_item_trylock(xfs_efi_log_item_t *efip)
+xfs_efi_item_trylock(
+	struct xfs_log_item	*lip)
 {
 	return XFS_ITEM_PINNED;
 }
@@ -168,13 +143,12 @@ xfs_efi_item_trylock(xfs_efi_log_item_t *efip)
 /*
  * Efi items have no locking, so just return.
  */
-/*ARGSUSED*/
 STATIC void
-xfs_efi_item_unlock(xfs_efi_log_item_t *efip)
+xfs_efi_item_unlock(
+	struct xfs_log_item	*lip)
 {
-	if (efip->efi_item.li_flags & XFS_LI_ABORTED)
-		xfs_efi_item_free(efip);
-	return;
+	if (lip->li_flags & XFS_LI_ABORTED)
+		xfs_efi_item_free(EFI_ITEM(lip));
 }
 
 /*
@@ -183,9 +157,10 @@ xfs_efi_item_unlock(xfs_efi_log_item_t *efip)
  * flag is not paid any attention here.  Checking for that is delayed
  * until the EFI is unpinned.
  */
-/*ARGSUSED*/
 STATIC xfs_lsn_t
-xfs_efi_item_committed(xfs_efi_log_item_t *efip, xfs_lsn_t lsn)
+xfs_efi_item_committed(
+	struct xfs_log_item	*lip,
+	xfs_lsn_t		lsn)
 {
 	return lsn;
 }
@@ -195,11 +170,10 @@ xfs_efi_item_committed(xfs_efi_log_item_t *efip, xfs_lsn_t lsn)
  * stuck waiting for all of its corresponding efd items to be
  * committed to disk.
  */
-/*ARGSUSED*/
 STATIC void
-xfs_efi_item_push(xfs_efi_log_item_t *efip)
+xfs_efi_item_push(
+	struct xfs_log_item	*lip)
 {
-	return;
 }
 
 /*
@@ -209,61 +183,55 @@ xfs_efi_item_push(xfs_efi_log_item_t *efip)
  * example, for inodes, the inode is locked throughout the extent freeing
  * so the dependency should be recorded there.
  */
-/*ARGSUSED*/
 STATIC void
-xfs_efi_item_committing(xfs_efi_log_item_t *efip, xfs_lsn_t lsn)
+xfs_efi_item_committing(
+	struct xfs_log_item	*lip,
+	xfs_lsn_t		lsn)
 {
-	return;
 }
 
 /*
  * This is the ops vector shared by all efi log items.
  */
 static struct xfs_item_ops xfs_efi_item_ops = {
-	.iop_size	= (uint(*)(xfs_log_item_t*))xfs_efi_item_size,
-	.iop_format	= (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
-					xfs_efi_item_format,
-	.iop_pin	= (void(*)(xfs_log_item_t*))xfs_efi_item_pin,
-	.iop_unpin	= (void(*)(xfs_log_item_t*))xfs_efi_item_unpin,
-	.iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *))
-					xfs_efi_item_unpin_remove,
-	.iop_trylock	= (uint(*)(xfs_log_item_t*))xfs_efi_item_trylock,
-	.iop_unlock	= (void(*)(xfs_log_item_t*))xfs_efi_item_unlock,
-	.iop_committed	= (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
-					xfs_efi_item_committed,
-	.iop_push	= (void(*)(xfs_log_item_t*))xfs_efi_item_push,
-	.iop_pushbuf	= NULL,
-	.iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
-					xfs_efi_item_committing
+	.iop_size	= xfs_efi_item_size,
+	.iop_format	= xfs_efi_item_format,
+	.iop_pin	= xfs_efi_item_pin,
+	.iop_unpin	= xfs_efi_item_unpin,
+	.iop_trylock	= xfs_efi_item_trylock,
+	.iop_unlock	= xfs_efi_item_unlock,
+	.iop_committed	= xfs_efi_item_committed,
+	.iop_push	= xfs_efi_item_push,
+	.iop_committing = xfs_efi_item_committing
 };
 
 
 /*
  * Allocate and initialize an efi item with the given number of extents.
  */
-xfs_efi_log_item_t *
-xfs_efi_init(xfs_mount_t	*mp,
-	     uint		nextents)
+struct xfs_efi_log_item *
+xfs_efi_init(
+	struct xfs_mount	*mp,
+	uint			nextents)
 
 {
-	xfs_efi_log_item_t	*efip;
+	struct xfs_efi_log_item	*efip;
 	uint			size;
 
 	ASSERT(nextents > 0);
 	if (nextents > XFS_EFI_MAX_FAST_EXTENTS) {
 		size = (uint)(sizeof(xfs_efi_log_item_t) +
 			((nextents - 1) * sizeof(xfs_extent_t)));
-		efip = (xfs_efi_log_item_t*)kmem_zalloc(size, KM_SLEEP);
+		efip = kmem_zalloc(size, KM_SLEEP);
 	} else {
-		efip = (xfs_efi_log_item_t*)kmem_zone_zalloc(xfs_efi_zone,
-							     KM_SLEEP);
+		efip = kmem_zone_zalloc(xfs_efi_zone, KM_SLEEP);
 	}
 
 	xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops);
 	efip->efi_format.efi_nextents = nextents;
 	efip->efi_format.efi_id = (__psint_t)(void*)efip;
 
-	return (efip);
+	return efip;
 }
 
 /*
@@ -276,7 +244,7 @@ xfs_efi_init(xfs_mount_t	*mp,
 int
 xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
 {
-	xfs_efi_log_format_t *src_efi_fmt = (xfs_efi_log_format_t *)buf->i_addr;
+	xfs_efi_log_format_t *src_efi_fmt = buf->i_addr;
 	uint i;
 	uint len = sizeof(xfs_efi_log_format_t) + 
 		(src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_t);  
@@ -289,8 +257,7 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
 		memcpy((char *)dst_efi_fmt, (char*)src_efi_fmt, len);
 		return 0;
 	} else if (buf->i_len == len32) {
-		xfs_efi_log_format_32_t *src_efi_fmt_32 =
-			(xfs_efi_log_format_32_t *)buf->i_addr;
+		xfs_efi_log_format_32_t *src_efi_fmt_32 = buf->i_addr;
 
 		dst_efi_fmt->efi_type     = src_efi_fmt_32->efi_type;
 		dst_efi_fmt->efi_size     = src_efi_fmt_32->efi_size;
@@ -304,8 +271,7 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
 		}
 		return 0;
 	} else if (buf->i_len == len64) {
-		xfs_efi_log_format_64_t *src_efi_fmt_64 =
-			(xfs_efi_log_format_64_t *)buf->i_addr;
+		xfs_efi_log_format_64_t *src_efi_fmt_64 = buf->i_addr;
 
 		dst_efi_fmt->efi_type     = src_efi_fmt_64->efi_type;
 		dst_efi_fmt->efi_size     = src_efi_fmt_64->efi_size;
@@ -356,16 +322,18 @@ xfs_efi_release(xfs_efi_log_item_t	*efip,
 	}
 }
 
-STATIC void
-xfs_efd_item_free(xfs_efd_log_item_t *efdp)
+static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip)
 {
-	int nexts = efdp->efd_format.efd_nextents;
+	return container_of(lip, struct xfs_efd_log_item, efd_item);
+}
 
-	if (nexts > XFS_EFD_MAX_FAST_EXTENTS) {
+STATIC void
+xfs_efd_item_free(struct xfs_efd_log_item *efdp)
+{
+	if (efdp->efd_format.efd_nextents > XFS_EFD_MAX_FAST_EXTENTS)
 		kmem_free(efdp);
-	} else {
+	else
 		kmem_zone_free(xfs_efd_zone, efdp);
-	}
 }
 
 /*
@@ -373,9 +341,9 @@ xfs_efd_item_free(xfs_efd_log_item_t *efdp)
  * We only need 1 iovec for an efd item.  It just logs the efd_log_format
  * structure.
  */
-/*ARGSUSED*/
 STATIC uint
-xfs_efd_item_size(xfs_efd_log_item_t *efdp)
+xfs_efd_item_size(
+	struct xfs_log_item	*lip)
 {
 	return 1;
 }
@@ -388,10 +356,12 @@ xfs_efd_item_size(xfs_efd_log_item_t *efdp)
  * slots in the efd item have been filled.
  */
 STATIC void
-xfs_efd_item_format(xfs_efd_log_item_t	*efdp,
-		    xfs_log_iovec_t	*log_vector)
+xfs_efd_item_format(
+	struct xfs_log_item	*lip,
+	struct xfs_log_iovec	*log_vector)
 {
-	uint	size;
+	struct xfs_efd_log_item	*efdp = EFD_ITEM(lip);
+	uint			size;
 
 	ASSERT(efdp->efd_next_extent == efdp->efd_format.efd_nextents);
 
@@ -401,48 +371,38 @@ xfs_efd_item_format(xfs_efd_log_item_t	*efdp,
 	size += (efdp->efd_format.efd_nextents - 1) * sizeof(xfs_extent_t);
 	efdp->efd_format.efd_size = 1;
 
-	log_vector->i_addr = (xfs_caddr_t)&(efdp->efd_format);
+	log_vector->i_addr = &efdp->efd_format;
 	log_vector->i_len = size;
 	log_vector->i_type = XLOG_REG_TYPE_EFD_FORMAT;
 	ASSERT(size >= sizeof(xfs_efd_log_format_t));
 }
 
-
 /*
  * Pinning has no meaning for an efd item, so just return.
  */
-/*ARGSUSED*/
 STATIC void
-xfs_efd_item_pin(xfs_efd_log_item_t *efdp)
+xfs_efd_item_pin(
+	struct xfs_log_item	*lip)
 {
-	return;
 }
 
-
 /*
  * Since pinning has no meaning for an efd item, unpinning does
  * not either.
  */
-/*ARGSUSED*/
-STATIC void
-xfs_efd_item_unpin(xfs_efd_log_item_t *efdp)
-{
-	return;
-}
-
-/*ARGSUSED*/
 STATIC void
-xfs_efd_item_unpin_remove(xfs_efd_log_item_t *efdp, xfs_trans_t *tp)
+xfs_efd_item_unpin(
+	struct xfs_log_item	*lip,
+	int			remove)
 {
-	return;
 }
 
 /*
  * Efd items have no locking, so just return success.
  */
-/*ARGSUSED*/
 STATIC uint
-xfs_efd_item_trylock(xfs_efd_log_item_t *efdp)
+xfs_efd_item_trylock(
+	struct xfs_log_item	*lip)
 {
 	return XFS_ITEM_LOCKED;
 }
@@ -451,13 +411,12 @@ xfs_efd_item_trylock(xfs_efd_log_item_t *efdp)
  * Efd items have no locking or pushing, so return failure
  * so that the caller doesn't bother with us.
  */
-/*ARGSUSED*/
 STATIC void
-xfs_efd_item_unlock(xfs_efd_log_item_t *efdp)
+xfs_efd_item_unlock(
+	struct xfs_log_item	*lip)
 {
-	if (efdp->efd_item.li_flags & XFS_LI_ABORTED)
-		xfs_efd_item_free(efdp);
-	return;
+	if (lip->li_flags & XFS_LI_ABORTED)
+		xfs_efd_item_free(EFD_ITEM(lip));
 }
 
 /*
@@ -467,15 +426,18 @@ xfs_efd_item_unlock(xfs_efd_log_item_t *efdp)
  * return -1 to keep the transaction code from further referencing
  * this item.
  */
-/*ARGSUSED*/
 STATIC xfs_lsn_t
-xfs_efd_item_committed(xfs_efd_log_item_t *efdp, xfs_lsn_t lsn)
+xfs_efd_item_committed(
+	struct xfs_log_item	*lip,
+	xfs_lsn_t		lsn)
 {
+	struct xfs_efd_log_item	*efdp = EFD_ITEM(lip);
+
 	/*
 	 * If we got a log I/O error, it's always the case that the LR with the
 	 * EFI got unpinned and freed before the EFD got aborted.
 	 */
-	if ((efdp->efd_item.li_flags & XFS_LI_ABORTED) == 0)
+	if (!(lip->li_flags & XFS_LI_ABORTED))
 		xfs_efi_release(efdp->efd_efip, efdp->efd_format.efd_nextents);
 
 	xfs_efd_item_free(efdp);
@@ -486,11 +448,10 @@ xfs_efd_item_committed(xfs_efd_log_item_t *efdp, xfs_lsn_t lsn)
  * There isn't much you can do to push on an efd item.  It is simply
  * stuck waiting for the log to be flushed to disk.
  */
-/*ARGSUSED*/
 STATIC void
-xfs_efd_item_push(xfs_efd_log_item_t *efdp)
+xfs_efd_item_push(
+	struct xfs_log_item	*lip)
 {
-	return;
 }
 
 /*
@@ -500,55 +461,48 @@ xfs_efd_item_push(xfs_efd_log_item_t *efdp)
  * example, for inodes, the inode is locked throughout the extent freeing
  * so the dependency should be recorded there.
  */
-/*ARGSUSED*/
 STATIC void
-xfs_efd_item_committing(xfs_efd_log_item_t *efip, xfs_lsn_t lsn)
+xfs_efd_item_committing(
+	struct xfs_log_item	*lip,
+	xfs_lsn_t		lsn)
 {
-	return;
 }
 
 /*
  * This is the ops vector shared by all efd log items.
  */
 static struct xfs_item_ops xfs_efd_item_ops = {
-	.iop_size	= (uint(*)(xfs_log_item_t*))xfs_efd_item_size,
-	.iop_format	= (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
-					xfs_efd_item_format,
-	.iop_pin	= (void(*)(xfs_log_item_t*))xfs_efd_item_pin,
-	.iop_unpin	= (void(*)(xfs_log_item_t*))xfs_efd_item_unpin,
-	.iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*))
-					xfs_efd_item_unpin_remove,
-	.iop_trylock	= (uint(*)(xfs_log_item_t*))xfs_efd_item_trylock,
-	.iop_unlock	= (void(*)(xfs_log_item_t*))xfs_efd_item_unlock,
-	.iop_committed	= (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
-					xfs_efd_item_committed,
-	.iop_push	= (void(*)(xfs_log_item_t*))xfs_efd_item_push,
-	.iop_pushbuf	= NULL,
-	.iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
-					xfs_efd_item_committing
+	.iop_size	= xfs_efd_item_size,
+	.iop_format	= xfs_efd_item_format,
+	.iop_pin	= xfs_efd_item_pin,
+	.iop_unpin	= xfs_efd_item_unpin,
+	.iop_trylock	= xfs_efd_item_trylock,
+	.iop_unlock	= xfs_efd_item_unlock,
+	.iop_committed	= xfs_efd_item_committed,
+	.iop_push	= xfs_efd_item_push,
+	.iop_committing = xfs_efd_item_committing
 };
 
-
 /*
  * Allocate and initialize an efd item with the given number of extents.
  */
-xfs_efd_log_item_t *
-xfs_efd_init(xfs_mount_t	*mp,
-	     xfs_efi_log_item_t	*efip,
-	     uint		nextents)
+struct xfs_efd_log_item *
+xfs_efd_init(
+	struct xfs_mount	*mp,
+	struct xfs_efi_log_item	*efip,
+	uint			nextents)
 
 {
-	xfs_efd_log_item_t	*efdp;
+	struct xfs_efd_log_item	*efdp;
 	uint			size;
 
 	ASSERT(nextents > 0);
 	if (nextents > XFS_EFD_MAX_FAST_EXTENTS) {
 		size = (uint)(sizeof(xfs_efd_log_item_t) +
 			((nextents - 1) * sizeof(xfs_extent_t)));
-		efdp = (xfs_efd_log_item_t*)kmem_zalloc(size, KM_SLEEP);
+		efdp = kmem_zalloc(size, KM_SLEEP);
 	} else {
-		efdp = (xfs_efd_log_item_t*)kmem_zone_zalloc(xfs_efd_zone,
-							     KM_SLEEP);
+		efdp = kmem_zone_zalloc(xfs_efd_zone, KM_SLEEP);
 	}
 
 	xfs_log_item_init(mp, &efdp->efd_item, XFS_LI_EFD, &xfs_efd_item_ops);
@@ -556,5 +510,5 @@ xfs_efd_init(xfs_mount_t	*mp,
 	efdp->efd_format.efd_nextents = nextents;
 	efdp->efd_format.efd_efi_id = efip->efi_format.efi_id;
 
-	return (efdp);
+	return efdp;
 }
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 390850ee660..9b715dce569 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -18,13 +18,9 @@
 #include "xfs.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_inum.h"
-#include "xfs_dir2.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_ag.h"
-#include "xfs_dmapi.h"
 #include "xfs_log.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
@@ -127,6 +123,82 @@ typedef struct fstrm_item
 	xfs_inode_t	*pip;	/* Parent directory inode pointer. */
 } fstrm_item_t;
 
+/*
+ * Allocation group filestream associations are tracked with per-ag atomic
+ * counters.  These counters allow _xfs_filestream_pick_ag() to tell whether a
+ * particular AG already has active filestreams associated with it. The mount
+ * point's m_peraglock is used to protect these counters from per-ag array
+ * re-allocation during a growfs operation.  When xfs_growfs_data_private() is
+ * about to reallocate the array, it calls xfs_filestream_flush() with the
+ * m_peraglock held in write mode.
+ *
+ * Since xfs_mru_cache_flush() guarantees that all the free functions for all
+ * the cache elements have finished executing before it returns, it's safe for
+ * the free functions to use the atomic counters without m_peraglock protection.
+ * This allows the implementation of xfs_fstrm_free_func() to be agnostic about
+ * whether it was called with the m_peraglock held in read mode, write mode or
+ * not held at all.  The race condition this addresses is the following:
+ *
+ *  - The work queue scheduler fires and pulls a filestream directory cache
+ *    element off the LRU end of the cache for deletion, then gets pre-empted.
+ *  - A growfs operation grabs the m_peraglock in write mode, flushes all the
+ *    remaining items from the cache and reallocates the mount point's per-ag
+ *    array, resetting all the counters to zero.
+ *  - The work queue thread resumes and calls the free function for the element
+ *    it started cleaning up earlier.  In the process it decrements the
+ *    filestreams counter for an AG that now has no references.
+ *
+ * With a shrinkfs feature, the above scenario could panic the system.
+ *
+ * All other uses of the following macros should be protected by either the
+ * m_peraglock held in read mode, or the cache's internal locking exposed by the
+ * interval between a call to xfs_mru_cache_lookup() and a call to
+ * xfs_mru_cache_done().  In addition, the m_peraglock must be held in read mode
+ * when new elements are added to the cache.
+ *
+ * Combined, these locking rules ensure that no associations will ever exist in
+ * the cache that reference per-ag array elements that have since been
+ * reallocated.
+ */
+static int
+xfs_filestream_peek_ag(
+	xfs_mount_t	*mp,
+	xfs_agnumber_t	agno)
+{
+	struct xfs_perag *pag;
+	int		ret;
+
+	pag = xfs_perag_get(mp, agno);
+	ret = atomic_read(&pag->pagf_fstrms);
+	xfs_perag_put(pag);
+	return ret;
+}
+
+static int
+xfs_filestream_get_ag(
+	xfs_mount_t	*mp,
+	xfs_agnumber_t	agno)
+{
+	struct xfs_perag *pag;
+	int		ret;
+
+	pag = xfs_perag_get(mp, agno);
+	ret = atomic_inc_return(&pag->pagf_fstrms);
+	xfs_perag_put(pag);
+	return ret;
+}
+
+static void
+xfs_filestream_put_ag(
+	xfs_mount_t	*mp,
+	xfs_agnumber_t	agno)
+{
+	struct xfs_perag *pag;
+
+	pag = xfs_perag_get(mp, agno);
+	atomic_dec(&pag->pagf_fstrms);
+	xfs_perag_put(pag);
+}
 
 /*
  * Scan the AGs starting at startag looking for an AG that isn't in use and has
@@ -355,16 +427,14 @@ xfs_fstrm_free_func(
 {
 	fstrm_item_t	*item  = (fstrm_item_t *)data;
 	xfs_inode_t	*ip = item->ip;
-	int ref;
 
 	ASSERT(ip->i_ino == ino);
 
 	xfs_iflags_clear(ip, XFS_IFILESTREAM);
 
 	/* Drop the reference taken on the AG when the item was added. */
-	ref = xfs_filestream_put_ag(ip->i_mount, item->ag);
+	xfs_filestream_put_ag(ip->i_mount, item->ag);
 
-	ASSERT(ref >= 0);
 	TRACE_FREE(ip->i_mount, ip, item->pip, item->ag,
 		xfs_filestream_peek_ag(ip->i_mount, item->ag));
 
diff --git a/fs/xfs/xfs_filestream.h b/fs/xfs/xfs_filestream.h
index 260f757bbc5..09dd9af4543 100644
--- a/fs/xfs/xfs_filestream.h
+++ b/fs/xfs/xfs_filestream.h
@@ -42,88 +42,6 @@ extern ktrace_t *xfs_filestreams_trace_buf;
 
 #endif
 
-/*
- * Allocation group filestream associations are tracked with per-ag atomic
- * counters.  These counters allow _xfs_filestream_pick_ag() to tell whether a
- * particular AG already has active filestreams associated with it. The mount
- * point's m_peraglock is used to protect these counters from per-ag array
- * re-allocation during a growfs operation.  When xfs_growfs_data_private() is
- * about to reallocate the array, it calls xfs_filestream_flush() with the
- * m_peraglock held in write mode.
- *
- * Since xfs_mru_cache_flush() guarantees that all the free functions for all
- * the cache elements have finished executing before it returns, it's safe for
- * the free functions to use the atomic counters without m_peraglock protection.
- * This allows the implementation of xfs_fstrm_free_func() to be agnostic about
- * whether it was called with the m_peraglock held in read mode, write mode or
- * not held at all.  The race condition this addresses is the following:
- *
- *  - The work queue scheduler fires and pulls a filestream directory cache
- *    element off the LRU end of the cache for deletion, then gets pre-empted.
- *  - A growfs operation grabs the m_peraglock in write mode, flushes all the
- *    remaining items from the cache and reallocates the mount point's per-ag
- *    array, resetting all the counters to zero.
- *  - The work queue thread resumes and calls the free function for the element
- *    it started cleaning up earlier.  In the process it decrements the
- *    filestreams counter for an AG that now has no references.
- *
- * With a shrinkfs feature, the above scenario could panic the system.
- *
- * All other uses of the following macros should be protected by either the
- * m_peraglock held in read mode, or the cache's internal locking exposed by the
- * interval between a call to xfs_mru_cache_lookup() and a call to
- * xfs_mru_cache_done().  In addition, the m_peraglock must be held in read mode
- * when new elements are added to the cache.
- *
- * Combined, these locking rules ensure that no associations will ever exist in
- * the cache that reference per-ag array elements that have since been
- * reallocated.
- */
-/*
- * xfs_filestream_peek_ag is only used in tracing code
- */
-static inline int
-xfs_filestream_peek_ag(
-	xfs_mount_t	*mp,
-	xfs_agnumber_t	agno)
-{
-	struct xfs_perag *pag;
-	int		ret;
-
-	pag = xfs_perag_get(mp, agno);
-	ret = atomic_read(&pag->pagf_fstrms);
-	xfs_perag_put(pag);
-	return ret;
-}
-
-static inline int
-xfs_filestream_get_ag(
-	xfs_mount_t	*mp,
-	xfs_agnumber_t	agno)
-{
-	struct xfs_perag *pag;
-	int		ret;
-
-	pag = xfs_perag_get(mp, agno);
-	ret = atomic_inc_return(&pag->pagf_fstrms);
-	xfs_perag_put(pag);
-	return ret;
-}
-
-static inline int
-xfs_filestream_put_ag(
-	xfs_mount_t	*mp,
-	xfs_agnumber_t	agno)
-{
-	struct xfs_perag *pag;
-	int		ret;
-
-	pag = xfs_perag_get(mp, agno);
-	ret = atomic_dec_return(&pag->pagf_fstrms);
-	xfs_perag_put(pag);
-	return ret;
-}
-
 /* allocation selection flags */
 typedef enum xfs_fstrm_alloc {
 	XFS_PICK_USERDATA = 1,
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index 7cf7220e7d5..8f6fc1a9638 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -114,8 +114,10 @@ struct getbmapx {
 #define BMV_IF_NO_DMAPI_READ	0x2	/* Do not generate DMAPI read event  */
 #define BMV_IF_PREALLOC		0x4	/* rtn status BMV_OF_PREALLOC if req */
 #define BMV_IF_DELALLOC		0x8	/* rtn status BMV_OF_DELALLOC if req */
+#define BMV_IF_NO_HOLES		0x10	/* Do not return holes */
 #define BMV_IF_VALID	\
-	(BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC|BMV_IF_DELALLOC)
+	(BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC|	\
+	 BMV_IF_DELALLOC|BMV_IF_NO_HOLES)
 
 /*	bmv_oflags values - returned for each non-header segment */
 #define BMV_OF_PREALLOC		0x1	/* segment = unwritten pre-allocation */
@@ -291,9 +293,11 @@ typedef struct xfs_bstat {
 	__s32		bs_extsize;	/* extent size			*/
 	__s32		bs_extents;	/* number of extents		*/
 	__u32		bs_gen;		/* generation count		*/
-	__u16		bs_projid;	/* project id			*/
+	__u16		bs_projid_lo;	/* lower part of project id	*/
+#define	bs_projid	bs_projid_lo	/* (previously just bs_projid)	*/
 	__u16		bs_forkoff;	/* inode fork offset in bytes	*/
-	unsigned char	bs_pad[12];	/* pad space, unused		*/
+	__u16		bs_projid_hi;	/* higher part of project id	*/
+	unsigned char	bs_pad[10];	/* pad space, unused		*/
 	__u32		bs_dmevmask;	/* DMIG event mask		*/
 	__u16		bs_dmstate;	/* DMIG state info		*/
 	__u16		bs_aextents;	/* attribute number of extents	*/
@@ -446,6 +450,7 @@ typedef struct xfs_handle {
 /*	XFS_IOC_SETBIOSIZE ---- deprecated 46	   */
 /*	XFS_IOC_GETBIOSIZE ---- deprecated 47	   */
 #define XFS_IOC_GETBMAPX	_IOWR('X', 56, struct getbmap)
+#define XFS_IOC_ZERO_RANGE	_IOW ('X', 57, struct xfs_flock64)
 
 /*
  * ioctl commands that replace IRIX syssgi()'s
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 37a6f62c57b..a7c116e814a 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -24,14 +24,10 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
@@ -148,12 +144,11 @@ xfs_growfs_data_private(
 	if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb)))
 		return error;
 	dpct = pct - mp->m_sb.sb_imax_pct;
-	error = xfs_read_buf(mp, mp->m_ddev_targp,
-			XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
-			XFS_FSS_TO_BB(mp, 1), 0, &bp);
-	if (error)
-		return error;
-	ASSERT(bp);
+	bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp,
+				XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
+				BBTOB(XFS_FSS_TO_BB(mp, 1)), 0);
+	if (!bp)
+		return EIO;
 	xfs_buf_relse(bp);
 
 	new = nb;	/* use new as a temporary here */
@@ -601,39 +596,44 @@ out:
 		 * the extra reserve blocks from the reserve.....
 		 */
 		int error;
-		error = xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, fdblks_delta, 0);
+		error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
+						 fdblks_delta, 0);
 		if (error == ENOSPC)
 			goto retry;
 	}
 	return 0;
 }
 
+/*
+ * Dump a transaction into the log that contains no real change. This is needed
+ * to be able to make the log dirty or stamp the current tail LSN into the log
+ * during the covering operation.
+ *
+ * We cannot use an inode here for this - that will push dirty state back up
+ * into the VFS and then periodic inode flushing will prevent log covering from
+ * making progress. Hence we log a field in the superblock instead.
+ */
 int
 xfs_fs_log_dummy(
-	xfs_mount_t	*mp)
+	xfs_mount_t	*mp,
+	int		flags)
 {
 	xfs_trans_t	*tp;
-	xfs_inode_t	*ip;
 	int		error;
 
 	tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1, KM_SLEEP);
-	error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
+	error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
+					XFS_DEFAULT_LOG_COUNT);
 	if (error) {
 		xfs_trans_cancel(tp, 0);
 		return error;
 	}
 
-	ip = mp->m_rootip;
-	xfs_ilock(ip, XFS_ILOCK_EXCL);
-
-	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-	xfs_trans_ihold(tp, ip);
-	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-	xfs_trans_set_sync(tp);
-	error = xfs_trans_commit(tp, 0);
-
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	return error;
+	/* log the UUID because it is an unchanging field */
+	xfs_mod_sb(tp, XFS_SB_UUID);
+	if (flags & SYNC_WAIT)
+		xfs_trans_set_sync(tp);
+	return xfs_trans_commit(tp, 0);
 }
 
 int
diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h
index 88435e0a77c..a786c5212c1 100644
--- a/fs/xfs/xfs_fsops.h
+++ b/fs/xfs/xfs_fsops.h
@@ -25,6 +25,6 @@ extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt);
 extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval,
 				xfs_fsop_resblks_t *outval);
 extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags);
-extern int xfs_fs_log_dummy(xfs_mount_t *mp);
+extern int xfs_fs_log_dummy(xfs_mount_t *mp, int flags);
 
 #endif	/* __XFS_FSOPS_H__ */
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index c7142a064c4..0626a32c344 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -24,14 +24,10 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_btree.h"
@@ -216,7 +212,7 @@ xfs_ialloc_inode_init(
 		 *	to log a whole cluster of inodes instead of all the
 		 *	individual transactions causing a lot of log traffic.
 		 */
-		xfs_biozero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog);
+		xfs_buf_zero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog);
 		for (i = 0; i < ninodes; i++) {
 			int	ioffset = i << mp->m_sb.sb_inodelog;
 			uint	isize = sizeof(struct xfs_dinode);
@@ -1217,7 +1213,6 @@ xfs_imap_lookup(
 	struct xfs_inobt_rec_incore rec;
 	struct xfs_btree_cur	*cur;
 	struct xfs_buf		*agbp;
-	xfs_agino_t		startino;
 	int			error;
 	int			i;
 
@@ -1231,13 +1226,13 @@ xfs_imap_lookup(
 	}
 
 	/*
-	 * derive and lookup the exact inode record for the given agino. If the
-	 * record cannot be found, then it's an invalid inode number and we
-	 * should abort.
+	 * Lookup the inode record for the given agino. If the record cannot be
+	 * found, then it's an invalid inode number and we should abort. Once
+	 * we have a record, we need to ensure it contains the inode number
+	 * we are looking up.
 	 */
 	cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
-	startino = agino & ~(XFS_IALLOC_INODES(mp) - 1);
-	error = xfs_inobt_lookup(cur, startino, XFS_LOOKUP_EQ, &i);
+	error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i);
 	if (!error) {
 		if (i)
 			error = xfs_inobt_get_rec(cur, &rec, &i);
@@ -1250,6 +1245,11 @@ xfs_imap_lookup(
 	if (error)
 		return error;
 
+	/* check that the returned record contains the required inode */
+	if (rec.ir_startino > agino ||
+	    rec.ir_startino + XFS_IALLOC_INODES(mp) <= agino)
+		return EINVAL;
+
 	/* for untrusted inodes check it is allocated first */
 	if ((flags & XFS_IGET_UNTRUSTED) &&
 	    (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino)))
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
index c282a9af539..16921f55c54 100644
--- a/fs/xfs/xfs_ialloc_btree.c
+++ b/fs/xfs/xfs_ialloc_btree.c
@@ -24,14 +24,10 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_btree.h"
@@ -187,38 +183,6 @@ xfs_inobt_key_diff(
 			  cur->bc_rec.i.ir_startino;
 }
 
-STATIC int
-xfs_inobt_kill_root(
-	struct xfs_btree_cur	*cur,
-	struct xfs_buf		*bp,
-	int			level,
-	union xfs_btree_ptr	*newroot)
-{
-	int			error;
-
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
-	XFS_BTREE_STATS_INC(cur, killroot);
-
-	/*
-	 * Update the root pointer, decreasing the level by 1 and then
-	 * free the old root.
-	 */
-	xfs_inobt_set_root(cur, newroot, -1);
-	error = xfs_inobt_free_block(cur, bp);
-	if (error) {
-		XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
-		return error;
-	}
-
-	XFS_BTREE_STATS_INC(cur, free);
-
-	cur->bc_bufs[level] = NULL;
-	cur->bc_nlevels--;
-
-	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-	return 0;
-}
-
 #ifdef DEBUG
 STATIC int
 xfs_inobt_keys_inorder(
@@ -313,7 +277,6 @@ static const struct xfs_btree_ops xfs_inobt_ops = {
 
 	.dup_cursor		= xfs_inobt_dup_cursor,
 	.set_root		= xfs_inobt_set_root,
-	.kill_root		= xfs_inobt_kill_root,
 	.alloc_block		= xfs_inobt_alloc_block,
 	.free_block		= xfs_inobt_free_block,
 	.get_minrecs		= xfs_inobt_get_minrecs,
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 8f8b91be2c9..0cdd26932d8 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -25,14 +25,10 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_btree.h"
@@ -95,7 +91,7 @@ xfs_inode_alloc(
 	return ip;
 }
 
-STATIC void
+void
 xfs_inode_free(
 	struct xfs_inode	*ip)
 {
@@ -212,7 +208,7 @@ xfs_iget_cache_hit(
 			ip->i_flags &= ~XFS_INEW;
 			ip->i_flags |= XFS_IRECLAIMABLE;
 			__xfs_inode_set_reclaim_tag(pag, ip);
-			trace_xfs_iget_reclaim(ip);
+			trace_xfs_iget_reclaim_fail(ip);
 			goto out_error;
 		}
 
@@ -227,6 +223,7 @@ xfs_iget_cache_hit(
 	} else {
 		/* If the VFS inode is being torn down, pause and try again. */
 		if (!igrab(inode)) {
+			trace_xfs_iget_skip(ip);
 			error = EAGAIN;
 			goto out_error;
 		}
@@ -234,6 +231,7 @@ xfs_iget_cache_hit(
 		/* We've got a live one. */
 		spin_unlock(&ip->i_flags_lock);
 		read_unlock(&pag->pag_ici_lock);
+		trace_xfs_iget_hit(ip);
 	}
 
 	if (lock_flags != 0)
@@ -242,7 +240,6 @@ xfs_iget_cache_hit(
 	xfs_iflags_clear(ip, XFS_ISTALE);
 	XFS_STATS_INC(xs_ig_found);
 
-	trace_xfs_iget_found(ip);
 	return 0;
 
 out_error:
@@ -264,7 +261,6 @@ xfs_iget_cache_miss(
 {
 	struct xfs_inode	*ip;
 	int			error;
-	unsigned long		first_index, mask;
 	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ino);
 
 	ip = xfs_inode_alloc(mp, ino);
@@ -275,7 +271,7 @@ xfs_iget_cache_miss(
 	if (error)
 		goto out_destroy;
 
-	xfs_itrace_entry(ip);
+	trace_xfs_iget_miss(ip);
 
 	if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
 		error = ENOENT;
@@ -301,8 +297,6 @@ xfs_iget_cache_miss(
 			BUG();
 	}
 
-	mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
-	first_index = agino & mask;
 	write_lock(&pag->pag_ici_lock);
 
 	/* insert the new inode */
@@ -321,7 +315,6 @@ xfs_iget_cache_miss(
 	write_unlock(&pag->pag_ici_lock);
 	radix_tree_preload_end();
 
-	trace_xfs_iget_alloc(ip);
 	*ipp = ip;
 	return 0;
 
@@ -372,8 +365,8 @@ xfs_iget(
 	xfs_perag_t	*pag;
 	xfs_agino_t	agino;
 
-	/* the radix tree exists only in inode capable AGs */
-	if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_maxagi)
+	/* reject inode numbers outside existing AGs */
+	if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
 		return EINVAL;
 
 	/* get the perag structure and ensure that it's inode capable */
@@ -422,97 +415,6 @@ out_error_or_again:
 }
 
 /*
- * Decrement reference count of an inode structure and unlock it.
- *
- * ip -- the inode being released
- * lock_flags -- this parameter indicates the inode's locks to be
- *       to be released.  See the comment on xfs_iunlock() for a list
- *	 of valid values.
- */
-void
-xfs_iput(xfs_inode_t	*ip,
-	 uint		lock_flags)
-{
-	xfs_itrace_entry(ip);
-	xfs_iunlock(ip, lock_flags);
-	IRELE(ip);
-}
-
-/*
- * Special iput for brand-new inodes that are still locked
- */
-void
-xfs_iput_new(
-	xfs_inode_t	*ip,
-	uint		lock_flags)
-{
-	struct inode	*inode = VFS_I(ip);
-
-	xfs_itrace_entry(ip);
-
-	if ((ip->i_d.di_mode == 0)) {
-		ASSERT(!xfs_iflags_test(ip, XFS_IRECLAIMABLE));
-		make_bad_inode(inode);
-	}
-	if (inode->i_state & I_NEW)
-		unlock_new_inode(inode);
-	if (lock_flags)
-		xfs_iunlock(ip, lock_flags);
-	IRELE(ip);
-}
-
-/*
- * This is called free all the memory associated with an inode.
- * It must free the inode itself and any buffers allocated for
- * if_extents/if_data and if_broot.  It must also free the lock
- * associated with the inode.
- *
- * Note: because we don't initialise everything on reallocation out
- * of the zone, we must ensure we nullify everything correctly before
- * freeing the structure.
- */
-void
-xfs_ireclaim(
-	struct xfs_inode	*ip)
-{
-	struct xfs_mount	*mp = ip->i_mount;
-	struct xfs_perag	*pag;
-	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
-
-	XFS_STATS_INC(xs_ig_reclaims);
-
-	/*
-	 * Remove the inode from the per-AG radix tree.
-	 *
-	 * Because radix_tree_delete won't complain even if the item was never
-	 * added to the tree assert that it's been there before to catch
-	 * problems with the inode life time early on.
-	 */
-	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
-	write_lock(&pag->pag_ici_lock);
-	if (!radix_tree_delete(&pag->pag_ici_root, agino))
-		ASSERT(0);
-	write_unlock(&pag->pag_ici_lock);
-	xfs_perag_put(pag);
-
-	/*
-	 * Here we do an (almost) spurious inode lock in order to coordinate
-	 * with inode cache radix tree lookups.  This is because the lookup
-	 * can reference the inodes in the cache without taking references.
-	 *
-	 * We make that OK here by ensuring that we wait until the inode is
-	 * unlocked after the lookup before we go ahead and free it.  We get
-	 * both the ilock and the iolock because the code may need to drop the
-	 * ilock one but will still hold the iolock.
-	 */
-	xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-	xfs_qm_dqdetach(ip);
-	xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-
-	xfs_inode_free(ip);
-}
-
-/*
  * This is a wrapper routine around the xfs_ilock() routine
  * used to centralize some grungy code.  It is used in places
  * that wish to lock the inode solely for reading the extents.
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index b76a829d7e2..108c7a085f9 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -27,13 +27,10 @@
 #include "xfs_trans_priv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
@@ -44,7 +41,6 @@
 #include "xfs_alloc.h"
 #include "xfs_ialloc.h"
 #include "xfs_bmap.h"
-#include "xfs_rw.h"
 #include "xfs_error.h"
 #include "xfs_utils.h"
 #include "xfs_quota.h"
@@ -426,7 +422,7 @@ xfs_iformat(
 	if (!XFS_DFORK_Q(dip))
 		return 0;
 	ASSERT(ip->i_afp == NULL);
-	ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
+	ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS);
 	ip->i_afp->if_ext_max =
 		XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
 	switch (dip->di_aformat) {
@@ -509,7 +505,7 @@ xfs_iformat_local(
 		ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
 	else {
 		real_size = roundup(size, 4);
-		ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP);
+		ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS);
 	}
 	ifp->if_bytes = size;
 	ifp->if_real_bytes = real_size;
@@ -636,7 +632,7 @@ xfs_iformat_btree(
 	}
 
 	ifp->if_broot_bytes = size;
-	ifp->if_broot = kmem_alloc(size, KM_SLEEP);
+	ifp->if_broot = kmem_alloc(size, KM_SLEEP | KM_NOFS);
 	ASSERT(ifp->if_broot != NULL);
 	/*
 	 * Copy and convert from the on-disk structure
@@ -664,7 +660,8 @@ xfs_dinode_from_disk(
 	to->di_uid = be32_to_cpu(from->di_uid);
 	to->di_gid = be32_to_cpu(from->di_gid);
 	to->di_nlink = be32_to_cpu(from->di_nlink);
-	to->di_projid = be16_to_cpu(from->di_projid);
+	to->di_projid_lo = be16_to_cpu(from->di_projid_lo);
+	to->di_projid_hi = be16_to_cpu(from->di_projid_hi);
 	memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
 	to->di_flushiter = be16_to_cpu(from->di_flushiter);
 	to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec);
@@ -699,7 +696,8 @@ xfs_dinode_to_disk(
 	to->di_uid = cpu_to_be32(from->di_uid);
 	to->di_gid = cpu_to_be32(from->di_gid);
 	to->di_nlink = cpu_to_be32(from->di_nlink);
-	to->di_projid = cpu_to_be16(from->di_projid);
+	to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
+	to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
 	memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
 	to->di_flushiter = cpu_to_be16(from->di_flushiter);
 	to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec);
@@ -878,7 +876,7 @@ xfs_iread(
 	if (ip->i_d.di_version == 1) {
 		ip->i_d.di_nlink = ip->i_d.di_onlink;
 		ip->i_d.di_onlink = 0;
-		ip->i_d.di_projid = 0;
+		xfs_set_projid(ip, 0);
 	}
 
 	ip->i_delayed_blks = 0;
@@ -922,7 +920,6 @@ xfs_iread_extents(
 	int		error;
 	xfs_ifork_t	*ifp;
 	xfs_extnum_t	nextents;
-	size_t		size;
 
 	if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
 		XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW,
@@ -930,7 +927,6 @@ xfs_iread_extents(
 		return XFS_ERROR(EFSCORRUPTED);
 	}
 	nextents = XFS_IFORK_NEXTENTS(ip, whichfork);
-	size = nextents * sizeof(xfs_bmbt_rec_t);
 	ifp = XFS_IFORK_PTR(ip, whichfork);
 
 	/*
@@ -988,8 +984,7 @@ xfs_ialloc(
 	mode_t		mode,
 	xfs_nlink_t	nlink,
 	xfs_dev_t	rdev,
-	cred_t		*cr,
-	xfs_prid_t	prid,
+	prid_t		prid,
 	int		okalloc,
 	xfs_buf_t	**ialloc_context,
 	boolean_t	*call_again,
@@ -1033,7 +1028,7 @@ xfs_ialloc(
 	ASSERT(ip->i_d.di_nlink == nlink);
 	ip->i_d.di_uid = current_fsuid();
 	ip->i_d.di_gid = current_fsgid();
-	ip->i_d.di_projid = prid;
+	xfs_set_projid(ip, prid);
 	memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
 
 	/*
@@ -1226,7 +1221,7 @@ xfs_isize_check(
 				       (xfs_ufsize_t)XFS_MAXIOFFSET(mp)) -
 			  map_first),
 			 XFS_BMAPI_ENTIRE, NULL, 0, imaps, &nimaps,
-			 NULL, NULL))
+			 NULL))
 	    return;
 	ASSERT(nimaps == 1);
 	ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK);
@@ -1460,7 +1455,7 @@ xfs_itruncate_finish(
 	ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
 	ASSERT(ip->i_transp == *tp);
 	ASSERT(ip->i_itemp != NULL);
-	ASSERT(ip->i_itemp->ili_flags & XFS_ILI_HOLD);
+	ASSERT(ip->i_itemp->ili_lock_flags == 0);
 
 
 	ntp = *tp;
@@ -1589,11 +1584,10 @@ xfs_itruncate_finish(
 		xfs_bmap_init(&free_list, &first_block);
 		error = xfs_bunmapi(ntp, ip,
 				    first_unmap_block, unmap_len,
-				    xfs_bmapi_aflag(fork) |
-				      (sync ? 0 : XFS_BMAPI_ASYNC),
+				    xfs_bmapi_aflag(fork),
 				    XFS_ITRUNC_MAX_EXTENTS,
 				    &first_block, &free_list,
-				    NULL, &done);
+				    &done);
 		if (error) {
 			/*
 			 * If the bunmapi call encounters an error,
@@ -1612,12 +1606,8 @@ xfs_itruncate_finish(
 		 */
 		error = xfs_bmap_finish(tp, &free_list, &committed);
 		ntp = *tp;
-		if (committed) {
-			/* link the inode into the next xact in the chain */
-			xfs_trans_ijoin(ntp, ip,
-					XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-			xfs_trans_ihold(ntp, ip);
-		}
+		if (committed)
+			xfs_trans_ijoin(ntp, ip);
 
 		if (error) {
 			/*
@@ -1646,9 +1636,7 @@ xfs_itruncate_finish(
 		error = xfs_trans_commit(*tp, 0);
 		*tp = ntp;
 
-		/* link the inode into the next transaction in the chain */
-		xfs_trans_ijoin(ntp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-		xfs_trans_ihold(ntp, ip);
+		xfs_trans_ijoin(ntp, ip);
 
 		if (error)
 			return error;
@@ -1927,6 +1915,11 @@ xfs_iunlink_remove(
 	return 0;
 }
 
+/*
+ * A big issue when freeing the inode cluster is is that we _cannot_ skip any
+ * inodes that are in memory - they all must be marked stale and attached to
+ * the cluster buffer.
+ */
 STATIC void
 xfs_ifree_cluster(
 	xfs_inode_t	*free_ip,
@@ -1958,8 +1951,6 @@ xfs_ifree_cluster(
 	}
 
 	for (j = 0; j < nbufs; j++, inum += ninodes) {
-		int	found = 0;
-
 		blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
 					 XFS_INO_TO_AGBNO(mp, inum));
 
@@ -1978,23 +1969,25 @@ xfs_ifree_cluster(
 		/*
 		 * Walk the inodes already attached to the buffer and mark them
 		 * stale. These will all have the flush locks held, so an
-		 * in-memory inode walk can't lock them.
+		 * in-memory inode walk can't lock them. By marking them all
+		 * stale first, we will not attempt to lock them in the loop
+		 * below as the XFS_ISTALE flag will be set.
 		 */
 		lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
 		while (lip) {
 			if (lip->li_type == XFS_LI_INODE) {
 				iip = (xfs_inode_log_item_t *)lip;
 				ASSERT(iip->ili_logged == 1);
-				lip->li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) xfs_istale_done;
+				lip->li_cb = xfs_istale_done;
 				xfs_trans_ail_copy_lsn(mp->m_ail,
 							&iip->ili_flush_lsn,
 							&iip->ili_item.li_lsn);
 				xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
-				found++;
 			}
 			lip = lip->li_bio_list;
 		}
 
+
 		/*
 		 * For each inode in memory attempt to add it to the inode
 		 * buffer and set it up for being staled on buffer IO
@@ -2006,6 +1999,7 @@ xfs_ifree_cluster(
 		 * even trying to lock them.
 		 */
 		for (i = 0; i < ninodes; i++) {
+retry:
 			read_lock(&pag->pag_ici_lock);
 			ip = radix_tree_lookup(&pag->pag_ici_root,
 					XFS_INO_TO_AGINO(mp, (inum + i)));
@@ -2016,38 +2010,36 @@ xfs_ifree_cluster(
 				continue;
 			}
 
-			/* don't try to lock/unlock the current inode */
+			/*
+			 * Don't try to lock/unlock the current inode, but we
+			 * _cannot_ skip the other inodes that we did not find
+			 * in the list attached to the buffer and are not
+			 * already marked stale. If we can't lock it, back off
+			 * and retry.
+			 */
 			if (ip != free_ip &&
 			    !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
 				read_unlock(&pag->pag_ici_lock);
-				continue;
+				delay(1);
+				goto retry;
 			}
 			read_unlock(&pag->pag_ici_lock);
 
-			if (!xfs_iflock_nowait(ip)) {
-				if (ip != free_ip)
-					xfs_iunlock(ip, XFS_ILOCK_EXCL);
-				continue;
-			}
-
+			xfs_iflock(ip);
 			xfs_iflags_set(ip, XFS_ISTALE);
-			if (xfs_inode_clean(ip)) {
-				ASSERT(ip != free_ip);
-				xfs_ifunlock(ip);
-				xfs_iunlock(ip, XFS_ILOCK_EXCL);
-				continue;
-			}
 
+			/*
+			 * we don't need to attach clean inodes or those only
+			 * with unlogged changes (which we throw away, anyway).
+			 */
 			iip = ip->i_itemp;
-			if (!iip) {
-				/* inode with unlogged changes only */
+			if (!iip || xfs_inode_clean(ip)) {
 				ASSERT(ip != free_ip);
 				ip->i_update_core = 0;
 				xfs_ifunlock(ip);
 				xfs_iunlock(ip, XFS_ILOCK_EXCL);
 				continue;
 			}
-			found++;
 
 			iip->ili_last_fields = iip->ili_format.ilf_fields;
 			iip->ili_format.ilf_fields = 0;
@@ -2055,16 +2047,14 @@ xfs_ifree_cluster(
 			xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
 						&iip->ili_item.li_lsn);
 
-			xfs_buf_attach_iodone(bp,
-				(void(*)(xfs_buf_t*,xfs_log_item_t*))
-				xfs_istale_done, (xfs_log_item_t *)iip);
+			xfs_buf_attach_iodone(bp, xfs_istale_done,
+						  &iip->ili_item);
 
 			if (ip != free_ip)
 				xfs_iunlock(ip, XFS_ILOCK_EXCL);
 		}
 
-		if (found)
-			xfs_trans_stale_inode_buf(tp, bp);
+		xfs_trans_stale_inode_buf(tp, bp);
 		xfs_trans_binval(tp, bp);
 	}
 
@@ -2203,7 +2193,7 @@ xfs_iroot_realloc(
 		 */
 		if (ifp->if_broot_bytes == 0) {
 			new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(rec_diff);
-			ifp->if_broot = kmem_alloc(new_size, KM_SLEEP);
+			ifp->if_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
 			ifp->if_broot_bytes = (int)new_size;
 			return;
 		}
@@ -2219,7 +2209,7 @@ xfs_iroot_realloc(
 		new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max);
 		ifp->if_broot = kmem_realloc(ifp->if_broot, new_size,
 				(size_t)XFS_BMAP_BROOT_SPACE_CALC(cur_max), /* old size */
-				KM_SLEEP);
+				KM_SLEEP | KM_NOFS);
 		op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
 						     ifp->if_broot_bytes);
 		np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
@@ -2245,7 +2235,7 @@ xfs_iroot_realloc(
 	else
 		new_size = 0;
 	if (new_size > 0) {
-		new_broot = kmem_alloc(new_size, KM_SLEEP);
+		new_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
 		/*
 		 * First copy over the btree block header.
 		 */
@@ -2349,7 +2339,8 @@ xfs_idata_realloc(
 		real_size = roundup(new_size, 4);
 		if (ifp->if_u1.if_data == NULL) {
 			ASSERT(ifp->if_real_bytes == 0);
-			ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP);
+			ifp->if_u1.if_data = kmem_alloc(real_size,
+							KM_SLEEP | KM_NOFS);
 		} else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
 			/*
 			 * Only do the realloc if the underlying size
@@ -2360,11 +2351,12 @@ xfs_idata_realloc(
 					kmem_realloc(ifp->if_u1.if_data,
 							real_size,
 							ifp->if_real_bytes,
-							KM_SLEEP);
+							KM_SLEEP | KM_NOFS);
 			}
 		} else {
 			ASSERT(ifp->if_real_bytes == 0);
-			ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP);
+			ifp->if_u1.if_data = kmem_alloc(real_size,
+							KM_SLEEP | KM_NOFS);
 			memcpy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data,
 				ifp->if_bytes);
 		}
@@ -2731,11 +2723,10 @@ cluster_corrupt_out:
 		 * mark it as stale and brelse.
 		 */
 		if (XFS_BUF_IODONE_FUNC(bp)) {
-			XFS_BUF_CLR_BDSTRAT_FUNC(bp);
 			XFS_BUF_UNDONE(bp);
 			XFS_BUF_STALE(bp);
 			XFS_BUF_ERROR(bp,EIO);
-			xfs_biodone(bp);
+			xfs_buf_ioend(bp, 0);
 		} else {
 			XFS_BUF_STALE(bp);
 			xfs_buf_relse(bp);
@@ -3018,7 +3009,7 @@ xfs_iflush_int(
 			memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
 			memset(&(dip->di_pad[0]), 0,
 			      sizeof(dip->di_pad));
-			ASSERT(ip->i_d.di_projid == 0);
+			ASSERT(xfs_get_projid(ip) == 0);
 		}
 	}
 
@@ -3069,8 +3060,7 @@ xfs_iflush_int(
 		 * and unlock the inode's flush lock when the inode is
 		 * completely written to disk.
 		 */
-		xfs_buf_attach_iodone(bp, (void(*)(xfs_buf_t*,xfs_log_item_t*))
-				      xfs_iflush_done, (xfs_log_item_t *)iip);
+		xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item);
 
 		ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
 		ASSERT(XFS_BUF_IODONE_FUNC(bp) != NULL);
@@ -3514,13 +3504,11 @@ xfs_iext_remove_indirect(
 	xfs_extnum_t	ext_diff;	/* extents to remove in current list */
 	xfs_extnum_t	nex1;		/* number of extents before idx */
 	xfs_extnum_t	nex2;		/* extents after idx + count */
-	int		nlists;		/* entries in indirection array */
 	int		page_idx = idx;	/* index in target extent list */
 
 	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
 	erp = xfs_iext_idx_to_irec(ifp,  &page_idx, &erp_idx, 0);
 	ASSERT(erp != NULL);
-	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
 	nex1 = page_idx;
 	ext_cnt = count;
 	while (ext_cnt) {
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 78550df13cd..fb2ca2e4cdc 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -134,8 +134,9 @@ typedef struct xfs_icdinode {
 	__uint32_t	di_uid;		/* owner's user id */
 	__uint32_t	di_gid;		/* owner's group id */
 	__uint32_t	di_nlink;	/* number of links to file */
-	__uint16_t	di_projid;	/* owner's project id */
-	__uint8_t	di_pad[8];	/* unused, zeroed space */
+	__uint16_t	di_projid_lo;	/* lower part of owner's project id */
+	__uint16_t	di_projid_hi;	/* higher part of owner's project id */
+	__uint8_t	di_pad[6];	/* unused, zeroed space */
 	__uint16_t	di_flushiter;	/* incremented on flush */
 	xfs_ictimestamp_t di_atime;	/* time last accessed */
 	xfs_ictimestamp_t di_mtime;	/* time last modified */
@@ -212,7 +213,6 @@ typedef struct xfs_icdinode {
 #ifdef __KERNEL__
 
 struct bhv_desc;
-struct cred;
 struct xfs_buf;
 struct xfs_bmap_free;
 struct xfs_bmbt_irec;
@@ -335,6 +335,25 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags)
 }
 
 /*
+ * Project quota id helpers (previously projid was 16bit only
+ * and using two 16bit values to hold new 32bit projid was choosen
+ * to retain compatibility with "old" filesystems).
+ */
+static inline prid_t
+xfs_get_projid(struct xfs_inode *ip)
+{
+	return (prid_t)ip->i_d.di_projid_hi << 16 | ip->i_d.di_projid_lo;
+}
+
+static inline void
+xfs_set_projid(struct xfs_inode *ip,
+		prid_t projid)
+{
+	ip->i_d.di_projid_hi = (__uint16_t) (projid >> 16);
+	ip->i_d.di_projid_lo = (__uint16_t) (projid & 0xffff);
+}
+
+/*
  * Manage the i_flush queue embedded in the inode.  This completion
  * queue synchronizes processes attempting to flush the in-core
  * inode back to disk.
@@ -443,8 +462,6 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
  */
 int		xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
 			 uint, uint, xfs_inode_t **);
-void		xfs_iput(xfs_inode_t *, uint);
-void		xfs_iput_new(xfs_inode_t *, uint);
 void		xfs_ilock(xfs_inode_t *, uint);
 int		xfs_ilock_nowait(xfs_inode_t *, uint);
 void		xfs_iunlock(xfs_inode_t *, uint);
@@ -452,14 +469,14 @@ void		xfs_ilock_demote(xfs_inode_t *, uint);
 int		xfs_isilocked(xfs_inode_t *, uint);
 uint		xfs_ilock_map_shared(xfs_inode_t *);
 void		xfs_iunlock_map_shared(xfs_inode_t *, uint);
-void		xfs_ireclaim(xfs_inode_t *);
+void		xfs_inode_free(struct xfs_inode *ip);
 
 /*
  * xfs_inode.c prototypes.
  */
 int		xfs_ialloc(struct xfs_trans *, xfs_inode_t *, mode_t,
-			   xfs_nlink_t, xfs_dev_t, cred_t *, xfs_prid_t,
-			   int, struct xfs_buf **, boolean_t *, xfs_inode_t **);
+			   xfs_nlink_t, xfs_dev_t, prid_t, int,
+			   struct xfs_buf **, boolean_t *, xfs_inode_t **);
 
 uint		xfs_ip2xflags(struct xfs_inode *);
 uint		xfs_dic2xflags(struct xfs_dinode *);
@@ -473,7 +490,6 @@ int		xfs_iunlink(struct xfs_trans *, xfs_inode_t *);
 void		xfs_iext_realloc(xfs_inode_t *, int, int);
 void		xfs_iunpin_wait(xfs_inode_t *);
 int		xfs_iflush(xfs_inode_t *, uint);
-void		xfs_ichgtime(xfs_inode_t *, int);
 void		xfs_lock_inodes(xfs_inode_t **, int, uint);
 void		xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
 
@@ -484,7 +500,7 @@ void		xfs_mark_inode_dirty_sync(xfs_inode_t *);
 #define IHOLD(ip) \
 do { \
 	ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \
-	atomic_inc(&(VFS_I(ip)->i_count)); \
+	ihold(VFS_I(ip)); \
 	trace_xfs_ihold(ip, _THIS_IP_); \
 } while (0)
 
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index cf8249a6000..c7ac020705d 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -22,30 +22,26 @@
 #include "xfs_log.h"
 #include "xfs_inum.h"
 #include "xfs_trans.h"
-#include "xfs_buf_item.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_trans_priv.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
-#include "xfs_btree.h"
-#include "xfs_ialloc.h"
-#include "xfs_rw.h"
 #include "xfs_error.h"
 #include "xfs_trace.h"
 
 
 kmem_zone_t	*xfs_ili_zone;		/* inode log item zone */
 
+static inline struct xfs_inode_log_item *INODE_ITEM(struct xfs_log_item *lip)
+{
+	return container_of(lip, struct xfs_inode_log_item, ili_item);
+}
+
+
 /*
  * This returns the number of iovecs needed to log the given inode item.
  *
@@ -55,13 +51,11 @@ kmem_zone_t	*xfs_ili_zone;		/* inode log item zone */
  */
 STATIC uint
 xfs_inode_item_size(
-	xfs_inode_log_item_t	*iip)
+	struct xfs_log_item	*lip)
 {
-	uint		nvecs;
-	xfs_inode_t	*ip;
-
-	ip = iip->ili_inode;
-	nvecs = 2;
+	struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+	struct xfs_inode	*ip = iip->ili_inode;
+	uint			nvecs = 2;
 
 	/*
 	 * Only log the data/extents/b-tree root if there is something
@@ -212,36 +206,23 @@ xfs_inode_item_size(
  */
 STATIC void
 xfs_inode_item_format(
-	xfs_inode_log_item_t	*iip,
-	xfs_log_iovec_t		*log_vector)
+	struct xfs_log_item	*lip,
+	struct xfs_log_iovec	*vecp)
 {
+	struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+	struct xfs_inode	*ip = iip->ili_inode;
 	uint			nvecs;
-	xfs_log_iovec_t		*vecp;
-	xfs_inode_t		*ip;
 	size_t			data_bytes;
 	xfs_bmbt_rec_t		*ext_buffer;
-	int			nrecs;
 	xfs_mount_t		*mp;
 
-	ip = iip->ili_inode;
-	vecp = log_vector;
-
-	vecp->i_addr = (xfs_caddr_t)&iip->ili_format;
+	vecp->i_addr = &iip->ili_format;
 	vecp->i_len  = sizeof(xfs_inode_log_format_t);
 	vecp->i_type = XLOG_REG_TYPE_IFORMAT;
 	vecp++;
 	nvecs	     = 1;
 
 	/*
-	 * Make sure the linux inode is dirty. We do this before
-	 * clearing i_update_core as the VFS will call back into
-	 * XFS here and set i_update_core, so we need to dirty the
-	 * inode first so that the ordering of i_update_core and
-	 * unlogged modifications still works as described below.
-	 */
-	xfs_mark_inode_dirty_sync(ip);
-
-	/*
 	 * Clear i_update_core if the timestamps (or any other
 	 * non-transactional modification) need flushing/logging
 	 * and we're about to log them with the rest of the core.
@@ -277,7 +258,7 @@ xfs_inode_item_format(
 	 */
 	xfs_synchronize_times(ip);
 
-	vecp->i_addr = (xfs_caddr_t)&ip->i_d;
+	vecp->i_addr = &ip->i_d;
 	vecp->i_len  = sizeof(struct xfs_icdinode);
 	vecp->i_type = XLOG_REG_TYPE_ICORE;
 	vecp++;
@@ -323,18 +304,17 @@ xfs_inode_item_format(
 			ASSERT(ip->i_df.if_u1.if_extents != NULL);
 			ASSERT(ip->i_d.di_nextents > 0);
 			ASSERT(iip->ili_extents_buf == NULL);
-			nrecs = ip->i_df.if_bytes /
-				(uint)sizeof(xfs_bmbt_rec_t);
-			ASSERT(nrecs > 0);
+			ASSERT((ip->i_df.if_bytes /
+				(uint)sizeof(xfs_bmbt_rec_t)) > 0);
 #ifdef XFS_NATIVE_HOST
-			if (nrecs == ip->i_d.di_nextents) {
+                       if (ip->i_d.di_nextents == ip->i_df.if_bytes /
+                                               (uint)sizeof(xfs_bmbt_rec_t)) {
 				/*
 				 * There are no delayed allocation
 				 * extents, so just point to the
 				 * real extents array.
 				 */
-				vecp->i_addr =
-					(char *)(ip->i_df.if_u1.if_extents);
+				vecp->i_addr = ip->i_df.if_u1.if_extents;
 				vecp->i_len = ip->i_df.if_bytes;
 				vecp->i_type = XLOG_REG_TYPE_IEXT;
 			} else
@@ -352,7 +332,7 @@ xfs_inode_item_format(
 				ext_buffer = kmem_alloc(ip->i_df.if_bytes,
 					KM_SLEEP);
 				iip->ili_extents_buf = ext_buffer;
-				vecp->i_addr = (xfs_caddr_t)ext_buffer;
+				vecp->i_addr = ext_buffer;
 				vecp->i_len = xfs_iextents_copy(ip, ext_buffer,
 						XFS_DATA_FORK);
 				vecp->i_type = XLOG_REG_TYPE_IEXT;
@@ -371,7 +351,7 @@ xfs_inode_item_format(
 		if (iip->ili_format.ilf_fields & XFS_ILOG_DBROOT) {
 			ASSERT(ip->i_df.if_broot_bytes > 0);
 			ASSERT(ip->i_df.if_broot != NULL);
-			vecp->i_addr = (xfs_caddr_t)ip->i_df.if_broot;
+			vecp->i_addr = ip->i_df.if_broot;
 			vecp->i_len = ip->i_df.if_broot_bytes;
 			vecp->i_type = XLOG_REG_TYPE_IBROOT;
 			vecp++;
@@ -389,7 +369,7 @@ xfs_inode_item_format(
 			ASSERT(ip->i_df.if_u1.if_data != NULL);
 			ASSERT(ip->i_d.di_size > 0);
 
-			vecp->i_addr = (xfs_caddr_t)ip->i_df.if_u1.if_data;
+			vecp->i_addr = ip->i_df.if_u1.if_data;
 			/*
 			 * Round i_bytes up to a word boundary.
 			 * The underlying memory is guaranteed to
@@ -437,7 +417,7 @@ xfs_inode_item_format(
 	 * Assert that no attribute-related log flags are set.
 	 */
 	if (!XFS_IFORK_Q(ip)) {
-		ASSERT(nvecs == iip->ili_item.li_desc->lid_size);
+		ASSERT(nvecs == lip->li_desc->lid_size);
 		iip->ili_format.ilf_size = nvecs;
 		ASSERT(!(iip->ili_format.ilf_fields &
 			 (XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT)));
@@ -449,21 +429,21 @@ xfs_inode_item_format(
 		ASSERT(!(iip->ili_format.ilf_fields &
 			 (XFS_ILOG_ADATA | XFS_ILOG_ABROOT)));
 		if (iip->ili_format.ilf_fields & XFS_ILOG_AEXT) {
-			ASSERT(ip->i_afp->if_bytes > 0);
-			ASSERT(ip->i_afp->if_u1.if_extents != NULL);
-			ASSERT(ip->i_d.di_anextents > 0);
 #ifdef DEBUG
-			nrecs = ip->i_afp->if_bytes /
+			int nrecs = ip->i_afp->if_bytes /
 				(uint)sizeof(xfs_bmbt_rec_t);
-#endif
 			ASSERT(nrecs > 0);
 			ASSERT(nrecs == ip->i_d.di_anextents);
+			ASSERT(ip->i_afp->if_bytes > 0);
+			ASSERT(ip->i_afp->if_u1.if_extents != NULL);
+			ASSERT(ip->i_d.di_anextents > 0);
+#endif
 #ifdef XFS_NATIVE_HOST
 			/*
 			 * There are not delayed allocation extents
 			 * for attributes, so just point at the array.
 			 */
-			vecp->i_addr = (char *)(ip->i_afp->if_u1.if_extents);
+			vecp->i_addr = ip->i_afp->if_u1.if_extents;
 			vecp->i_len = ip->i_afp->if_bytes;
 #else
 			ASSERT(iip->ili_aextents_buf == NULL);
@@ -473,7 +453,7 @@ xfs_inode_item_format(
 			ext_buffer = kmem_alloc(ip->i_afp->if_bytes,
 				KM_SLEEP);
 			iip->ili_aextents_buf = ext_buffer;
-			vecp->i_addr = (xfs_caddr_t)ext_buffer;
+			vecp->i_addr = ext_buffer;
 			vecp->i_len = xfs_iextents_copy(ip, ext_buffer,
 					XFS_ATTR_FORK);
 #endif
@@ -490,7 +470,7 @@ xfs_inode_item_format(
 		if (iip->ili_format.ilf_fields & XFS_ILOG_ABROOT) {
 			ASSERT(ip->i_afp->if_broot_bytes > 0);
 			ASSERT(ip->i_afp->if_broot != NULL);
-			vecp->i_addr = (xfs_caddr_t)ip->i_afp->if_broot;
+			vecp->i_addr = ip->i_afp->if_broot;
 			vecp->i_len = ip->i_afp->if_broot_bytes;
 			vecp->i_type = XLOG_REG_TYPE_IATTR_BROOT;
 			vecp++;
@@ -506,7 +486,7 @@ xfs_inode_item_format(
 			ASSERT(ip->i_afp->if_bytes > 0);
 			ASSERT(ip->i_afp->if_u1.if_data != NULL);
 
-			vecp->i_addr = (xfs_caddr_t)ip->i_afp->if_u1.if_data;
+			vecp->i_addr = ip->i_afp->if_u1.if_data;
 			/*
 			 * Round i_bytes up to a word boundary.
 			 * The underlying memory is guaranteed to
@@ -528,7 +508,7 @@ xfs_inode_item_format(
 		break;
 	}
 
-	ASSERT(nvecs == iip->ili_item.li_desc->lid_size);
+	ASSERT(nvecs == lip->li_desc->lid_size);
 	iip->ili_format.ilf_size = nvecs;
 }
 
@@ -539,12 +519,14 @@ xfs_inode_item_format(
  */
 STATIC void
 xfs_inode_item_pin(
-	xfs_inode_log_item_t	*iip)
+	struct xfs_log_item	*lip)
 {
-	ASSERT(xfs_isilocked(iip->ili_inode, XFS_ILOCK_EXCL));
+	struct xfs_inode	*ip = INODE_ITEM(lip)->ili_inode;
+
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 
-	trace_xfs_inode_pin(iip->ili_inode, _RET_IP_);
-	atomic_inc(&iip->ili_inode->i_pincount);
+	trace_xfs_inode_pin(ip, _RET_IP_);
+	atomic_inc(&ip->i_pincount);
 }
 
 
@@ -554,12 +536,12 @@ xfs_inode_item_pin(
  *
  * Also wake up anyone in xfs_iunpin_wait() if the count goes to 0.
  */
-/* ARGSUSED */
 STATIC void
 xfs_inode_item_unpin(
-	xfs_inode_log_item_t	*iip)
+	struct xfs_log_item	*lip,
+	int			remove)
 {
-	struct xfs_inode	*ip = iip->ili_inode;
+	struct xfs_inode	*ip = INODE_ITEM(lip)->ili_inode;
 
 	trace_xfs_inode_unpin(ip, _RET_IP_);
 	ASSERT(atomic_read(&ip->i_pincount) > 0);
@@ -567,15 +549,6 @@ xfs_inode_item_unpin(
 		wake_up(&ip->i_ipin_wait);
 }
 
-/* ARGSUSED */
-STATIC void
-xfs_inode_item_unpin_remove(
-	xfs_inode_log_item_t	*iip,
-	xfs_trans_t		*tp)
-{
-	xfs_inode_item_unpin(iip);
-}
-
 /*
  * This is called to attempt to lock the inode associated with this
  * inode log item, in preparation for the push routine which does the actual
@@ -591,19 +564,16 @@ xfs_inode_item_unpin_remove(
  */
 STATIC uint
 xfs_inode_item_trylock(
-	xfs_inode_log_item_t	*iip)
+	struct xfs_log_item	*lip)
 {
-	register xfs_inode_t	*ip;
-
-	ip = iip->ili_inode;
+	struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+	struct xfs_inode	*ip = iip->ili_inode;
 
-	if (xfs_ipincount(ip) > 0) {
+	if (xfs_ipincount(ip) > 0)
 		return XFS_ITEM_PINNED;
-	}
 
-	if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
+	if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
 		return XFS_ITEM_LOCKED;
-	}
 
 	if (!xfs_iflock_nowait(ip)) {
 		/*
@@ -629,7 +599,7 @@ xfs_inode_item_trylock(
 	if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
 		ASSERT(iip->ili_format.ilf_fields != 0);
 		ASSERT(iip->ili_logged == 0);
-		ASSERT(iip->ili_item.li_flags & XFS_LI_IN_AIL);
+		ASSERT(lip->li_flags & XFS_LI_IN_AIL);
 	}
 #endif
 	return XFS_ITEM_SUCCESS;
@@ -643,26 +613,18 @@ xfs_inode_item_trylock(
  */
 STATIC void
 xfs_inode_item_unlock(
-	xfs_inode_log_item_t	*iip)
+	struct xfs_log_item	*lip)
 {
-	uint		hold;
-	uint		iolocked;
-	uint		lock_flags;
-	xfs_inode_t	*ip;
+	struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+	struct xfs_inode	*ip = iip->ili_inode;
+	unsigned short		lock_flags;
 
-	ASSERT(iip != NULL);
 	ASSERT(iip->ili_inode->i_itemp != NULL);
 	ASSERT(xfs_isilocked(iip->ili_inode, XFS_ILOCK_EXCL));
-	ASSERT((!(iip->ili_inode->i_itemp->ili_flags &
-		  XFS_ILI_IOLOCKED_EXCL)) ||
-	       xfs_isilocked(iip->ili_inode, XFS_IOLOCK_EXCL));
-	ASSERT((!(iip->ili_inode->i_itemp->ili_flags &
-		  XFS_ILI_IOLOCKED_SHARED)) ||
-	       xfs_isilocked(iip->ili_inode, XFS_IOLOCK_SHARED));
+
 	/*
 	 * Clear the transaction pointer in the inode.
 	 */
-	ip = iip->ili_inode;
 	ip->i_transp = NULL;
 
 	/*
@@ -686,34 +648,11 @@ xfs_inode_item_unlock(
 		iip->ili_aextents_buf = NULL;
 	}
 
-	/*
-	 * Figure out if we should unlock the inode or not.
-	 */
-	hold = iip->ili_flags & XFS_ILI_HOLD;
-
-	/*
-	 * Before clearing out the flags, remember whether we
-	 * are holding the inode's IO lock.
-	 */
-	iolocked = iip->ili_flags & XFS_ILI_IOLOCKED_ANY;
-
-	/*
-	 * Clear out the fields of the inode log item particular
-	 * to the current transaction.
-	 */
-	iip->ili_flags = 0;
-
-	/*
-	 * Unlock the inode if XFS_ILI_HOLD was not set.
-	 */
-	if (!hold) {
-		lock_flags = XFS_ILOCK_EXCL;
-		if (iolocked & XFS_ILI_IOLOCKED_EXCL) {
-			lock_flags |= XFS_IOLOCK_EXCL;
-		} else if (iolocked & XFS_ILI_IOLOCKED_SHARED) {
-			lock_flags |= XFS_IOLOCK_SHARED;
-		}
-		xfs_iput(iip->ili_inode, lock_flags);
+	lock_flags = iip->ili_lock_flags;
+	iip->ili_lock_flags = 0;
+	if (lock_flags) {
+		xfs_iunlock(iip->ili_inode, lock_flags);
+		IRELE(iip->ili_inode);
 	}
 }
 
@@ -725,13 +664,12 @@ xfs_inode_item_unlock(
  * is the only one that matters.  Therefore, simply return the
  * given lsn.
  */
-/*ARGSUSED*/
 STATIC xfs_lsn_t
 xfs_inode_item_committed(
-	xfs_inode_log_item_t	*iip,
+	struct xfs_log_item	*lip,
 	xfs_lsn_t		lsn)
 {
-	return (lsn);
+	return lsn;
 }
 
 /*
@@ -743,13 +681,12 @@ xfs_inode_item_committed(
  */
 STATIC void
 xfs_inode_item_pushbuf(
-	xfs_inode_log_item_t	*iip)
+	struct xfs_log_item	*lip)
 {
-	xfs_inode_t	*ip;
-	xfs_mount_t	*mp;
-	xfs_buf_t	*bp;
+	struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+	struct xfs_inode	*ip = iip->ili_inode;
+	struct xfs_buf		*bp;
 
-	ip = iip->ili_inode;
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED));
 
 	/*
@@ -757,14 +694,13 @@ xfs_inode_item_pushbuf(
 	 * inode was taken off the AIL. So, just get out.
 	 */
 	if (completion_done(&ip->i_flush) ||
-	    ((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0)) {
+	    !(lip->li_flags & XFS_LI_IN_AIL)) {
 		xfs_iunlock(ip, XFS_ILOCK_SHARED);
 		return;
 	}
 
-	mp = ip->i_mount;
-	bp = xfs_incore(mp->m_ddev_targp, iip->ili_format.ilf_blkno,
-		    iip->ili_format.ilf_len, XBF_TRYLOCK);
+	bp = xfs_incore(ip->i_mount->m_ddev_targp, iip->ili_format.ilf_blkno,
+			iip->ili_format.ilf_len, XBF_TRYLOCK);
 
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
 	if (!bp)
@@ -772,10 +708,8 @@ xfs_inode_item_pushbuf(
 	if (XFS_BUF_ISDELAYWRITE(bp))
 		xfs_buf_delwri_promote(bp);
 	xfs_buf_relse(bp);
-	return;
 }
 
-
 /*
  * This is called to asynchronously write the inode associated with this
  * inode log item out to disk. The inode will already have been locked by
@@ -783,14 +717,14 @@ xfs_inode_item_pushbuf(
  */
 STATIC void
 xfs_inode_item_push(
-	xfs_inode_log_item_t	*iip)
+	struct xfs_log_item	*lip)
 {
-	xfs_inode_t	*ip;
-
-	ip = iip->ili_inode;
+	struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+	struct xfs_inode	*ip = iip->ili_inode;
 
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED));
 	ASSERT(!completion_done(&ip->i_flush));
+
 	/*
 	 * Since we were able to lock the inode's flush lock and
 	 * we found it on the AIL, the inode must be dirty.  This
@@ -813,43 +747,34 @@ xfs_inode_item_push(
 	 */
 	(void) xfs_iflush(ip, 0);
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
-
-	return;
 }
 
 /*
  * XXX rcc - this one really has to do something.  Probably needs
  * to stamp in a new field in the incore inode.
  */
-/* ARGSUSED */
 STATIC void
 xfs_inode_item_committing(
-	xfs_inode_log_item_t	*iip,
+	struct xfs_log_item	*lip,
 	xfs_lsn_t		lsn)
 {
-	iip->ili_last_lsn = lsn;
-	return;
+	INODE_ITEM(lip)->ili_last_lsn = lsn;
 }
 
 /*
  * This is the ops vector shared by all buf log items.
  */
 static struct xfs_item_ops xfs_inode_item_ops = {
-	.iop_size	= (uint(*)(xfs_log_item_t*))xfs_inode_item_size,
-	.iop_format	= (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
-					xfs_inode_item_format,
-	.iop_pin	= (void(*)(xfs_log_item_t*))xfs_inode_item_pin,
-	.iop_unpin	= (void(*)(xfs_log_item_t*))xfs_inode_item_unpin,
-	.iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*))
-					xfs_inode_item_unpin_remove,
-	.iop_trylock	= (uint(*)(xfs_log_item_t*))xfs_inode_item_trylock,
-	.iop_unlock	= (void(*)(xfs_log_item_t*))xfs_inode_item_unlock,
-	.iop_committed	= (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
-					xfs_inode_item_committed,
-	.iop_push	= (void(*)(xfs_log_item_t*))xfs_inode_item_push,
-	.iop_pushbuf	= (void(*)(xfs_log_item_t*))xfs_inode_item_pushbuf,
-	.iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
-					xfs_inode_item_committing
+	.iop_size	= xfs_inode_item_size,
+	.iop_format	= xfs_inode_item_format,
+	.iop_pin	= xfs_inode_item_pin,
+	.iop_unpin	= xfs_inode_item_unpin,
+	.iop_trylock	= xfs_inode_item_trylock,
+	.iop_unlock	= xfs_inode_item_unlock,
+	.iop_committed	= xfs_inode_item_committed,
+	.iop_push	= xfs_inode_item_push,
+	.iop_pushbuf	= xfs_inode_item_pushbuf,
+	.iop_committing = xfs_inode_item_committing
 };
 
 
@@ -858,10 +783,10 @@ static struct xfs_item_ops xfs_inode_item_ops = {
  */
 void
 xfs_inode_item_init(
-	xfs_inode_t	*ip,
-	xfs_mount_t	*mp)
+	struct xfs_inode	*ip,
+	struct xfs_mount	*mp)
 {
-	xfs_inode_log_item_t	*iip;
+	struct xfs_inode_log_item *iip;
 
 	ASSERT(ip->i_itemp == NULL);
 	iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, KM_SLEEP);
@@ -899,14 +824,14 @@ xfs_inode_item_destroy(
  * from the AIL if it has not been re-logged, and unlocking the inode's
  * flush lock.
  */
-/*ARGSUSED*/
 void
 xfs_iflush_done(
-	xfs_buf_t		*bp,
-	xfs_inode_log_item_t	*iip)
+	struct xfs_buf		*bp,
+	struct xfs_log_item	*lip)
 {
+	struct xfs_inode_log_item *iip = INODE_ITEM(lip);
 	xfs_inode_t		*ip = iip->ili_inode;
-	struct xfs_ail		*ailp = iip->ili_item.li_ailp;
+	struct xfs_ail		*ailp = lip->li_ailp;
 
 	/*
 	 * We only want to pull the item from the AIL if it is
@@ -917,12 +842,11 @@ xfs_iflush_done(
 	 * the lock since it's cheaper, and then we recheck while
 	 * holding the lock before removing the inode from the AIL.
 	 */
-	if (iip->ili_logged &&
-	    (iip->ili_item.li_lsn == iip->ili_flush_lsn)) {
+	if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) {
 		spin_lock(&ailp->xa_lock);
-		if (iip->ili_item.li_lsn == iip->ili_flush_lsn) {
+		if (lip->li_lsn == iip->ili_flush_lsn) {
 			/* xfs_trans_ail_delete() drops the AIL lock. */
-			xfs_trans_ail_delete(ailp, (xfs_log_item_t*)iip);
+			xfs_trans_ail_delete(ailp, lip);
 		} else {
 			spin_unlock(&ailp->xa_lock);
 		}
@@ -940,8 +864,6 @@ xfs_iflush_done(
 	 * Release the inode's flush lock since we're done with it.
 	 */
 	xfs_ifunlock(ip);
-
-	return;
 }
 
 /*
@@ -957,10 +879,8 @@ xfs_iflush_abort(
 	xfs_inode_t		*ip)
 {
 	xfs_inode_log_item_t	*iip = ip->i_itemp;
-	xfs_mount_t		*mp;
 
 	iip = ip->i_itemp;
-	mp = ip->i_mount;
 	if (iip) {
 		struct xfs_ail	*ailp = iip->ili_item.li_ailp;
 		if (iip->ili_item.li_flags & XFS_LI_IN_AIL) {
@@ -991,10 +911,10 @@ xfs_iflush_abort(
 
 void
 xfs_istale_done(
-	xfs_buf_t		*bp,
-	xfs_inode_log_item_t	*iip)
+	struct xfs_buf		*bp,
+	struct xfs_log_item	*lip)
 {
-	xfs_iflush_abort(iip->ili_inode);
+	xfs_iflush_abort(INODE_ITEM(lip)->ili_inode);
 }
 
 /*
@@ -1007,9 +927,8 @@ xfs_inode_item_format_convert(
 	xfs_inode_log_format_t	*in_f)
 {
 	if (buf->i_len == sizeof(xfs_inode_log_format_32_t)) {
-		xfs_inode_log_format_32_t *in_f32;
+		xfs_inode_log_format_32_t *in_f32 = buf->i_addr;
 
-		in_f32 = (xfs_inode_log_format_32_t *)buf->i_addr;
 		in_f->ilf_type = in_f32->ilf_type;
 		in_f->ilf_size = in_f32->ilf_size;
 		in_f->ilf_fields = in_f32->ilf_fields;
@@ -1025,9 +944,8 @@ xfs_inode_item_format_convert(
 		in_f->ilf_boffset = in_f32->ilf_boffset;
 		return 0;
 	} else if (buf->i_len == sizeof(xfs_inode_log_format_64_t)){
-		xfs_inode_log_format_64_t *in_f64;
+		xfs_inode_log_format_64_t *in_f64 = buf->i_addr;
 
-		in_f64 = (xfs_inode_log_format_64_t *)buf->i_addr;
 		in_f->ilf_type = in_f64->ilf_type;
 		in_f->ilf_size = in_f64->ilf_size;
 		in_f->ilf_fields = in_f64->ilf_fields;
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index 9a467958ecd..d3dee61e6d9 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -103,12 +103,6 @@ typedef struct xfs_inode_log_format_64 {
 				 XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
 				 XFS_ILOG_ABROOT)
 
-#define	XFS_ILI_HOLD		0x1
-#define	XFS_ILI_IOLOCKED_EXCL	0x2
-#define	XFS_ILI_IOLOCKED_SHARED	0x4
-
-#define	XFS_ILI_IOLOCKED_ANY   (XFS_ILI_IOLOCKED_EXCL | XFS_ILI_IOLOCKED_SHARED)
-
 static inline int xfs_ilog_fbroot(int w)
 {
 	return (w == XFS_DATA_FORK ? XFS_ILOG_DBROOT : XFS_ILOG_ABROOT);
@@ -137,7 +131,7 @@ typedef struct xfs_inode_log_item {
 	struct xfs_inode	*ili_inode;	   /* inode ptr */
 	xfs_lsn_t		ili_flush_lsn;	   /* lsn at last flush */
 	xfs_lsn_t		ili_last_lsn;	   /* lsn at last transaction */
-	unsigned short		ili_flags;	   /* misc flags */
+	unsigned short		ili_lock_flags;	   /* lock flags */
 	unsigned short		ili_logged;	   /* flushed logged data */
 	unsigned int		ili_last_fields;   /* fields when flushed */
 	struct xfs_bmbt_rec	*ili_extents_buf;  /* array of logged
@@ -161,8 +155,8 @@ static inline int xfs_inode_clean(xfs_inode_t *ip)
 
 extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *);
 extern void xfs_inode_item_destroy(struct xfs_inode *);
-extern void xfs_iflush_done(struct xfs_buf *, xfs_inode_log_item_t *);
-extern void xfs_istale_done(struct xfs_buf *, xfs_inode_log_item_t *);
+extern void xfs_iflush_done(struct xfs_buf *, struct xfs_log_item *);
+extern void xfs_istale_done(struct xfs_buf *, struct xfs_log_item *);
 extern void xfs_iflush_abort(struct xfs_inode *);
 extern int xfs_inode_item_format_convert(xfs_log_iovec_t *,
 					 xfs_inode_log_format_t *);
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index ef14943829d..20576146369 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -23,19 +23,14 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
 #include "xfs_alloc.h"
-#include "xfs_dmapi.h"
 #include "xfs_quota.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
-#include "xfs_ialloc.h"
 #include "xfs_btree.h"
 #include "xfs_bmap.h"
 #include "xfs_rtalloc.h"
@@ -123,7 +118,7 @@ xfs_iomap(
 	error = xfs_bmapi(NULL, ip, offset_fsb,
 			(xfs_filblks_t)(end_fsb - offset_fsb),
 			bmapi_flags,  NULL, 0, imap,
-			nimaps, NULL, NULL);
+			nimaps, NULL);
 
 	if (error)
 		goto out;
@@ -138,7 +133,7 @@ xfs_iomap(
 			break;
 		}
 
-		if (flags & (BMAPI_DIRECT|BMAPI_MMAP)) {
+		if (flags & BMAPI_DIRECT) {
 			error = xfs_iomap_write_direct(ip, offset, count, flags,
 						       imap, nimaps);
 		} else {
@@ -247,7 +242,7 @@ xfs_iomap_write_direct(
 	xfs_off_t	offset,
 	size_t		count,
 	int		flags,
-	xfs_bmbt_irec_t *ret_imap,
+	xfs_bmbt_irec_t *imap,
 	int		*nmaps)
 {
 	xfs_mount_t	*mp = ip->i_mount;
@@ -261,7 +256,6 @@ xfs_iomap_write_direct(
 	int		quota_flag;
 	int		rt;
 	xfs_trans_t	*tp;
-	xfs_bmbt_irec_t imap;
 	xfs_bmap_free_t free_list;
 	uint		qblocks, resblks, resrtextents;
 	int		committed;
@@ -285,10 +279,10 @@ xfs_iomap_write_direct(
 		if (error)
 			goto error_out;
 	} else {
-		if (*nmaps && (ret_imap->br_startblock == HOLESTARTBLOCK))
+		if (*nmaps && (imap->br_startblock == HOLESTARTBLOCK))
 			last_fsb = MIN(last_fsb, (xfs_fileoff_t)
-					ret_imap->br_blockcount +
-					ret_imap->br_startoff);
+					imap->br_blockcount +
+					imap->br_startoff);
 	}
 	count_fsb = last_fsb - offset_fsb;
 	ASSERT(count_fsb > 0);
@@ -334,20 +328,22 @@ xfs_iomap_write_direct(
 	if (error)
 		goto error1;
 
-	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-	xfs_trans_ihold(tp, ip);
+	xfs_trans_ijoin(tp, ip);
 
 	bmapi_flag = XFS_BMAPI_WRITE;
 	if ((flags & BMAPI_DIRECT) && (offset < ip->i_size || extsz))
 		bmapi_flag |= XFS_BMAPI_PREALLOC;
 
 	/*
-	 * Issue the xfs_bmapi() call to allocate the blocks
+	 * Issue the xfs_bmapi() call to allocate the blocks.
+	 *
+	 * From this point onwards we overwrite the imap pointer that the
+	 * caller gave to us.
 	 */
 	xfs_bmap_init(&free_list, &firstfsb);
 	nimaps = 1;
 	error = xfs_bmapi(tp, ip, offset_fsb, count_fsb, bmapi_flag,
-		&firstfsb, 0, &imap, &nimaps, &free_list, NULL);
+		&firstfsb, 0, imap, &nimaps, &free_list);
 	if (error)
 		goto error0;
 
@@ -369,12 +365,11 @@ xfs_iomap_write_direct(
 		goto error_out;
 	}
 
-	if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip))) {
-		error = xfs_cmn_err_fsblock_zero(ip, &imap);
+	if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) {
+		error = xfs_cmn_err_fsblock_zero(ip, imap);
 		goto error_out;
 	}
 
-	*ret_imap = imap;
 	*nmaps = 1;
 	return 0;
 
@@ -425,7 +420,7 @@ xfs_iomap_eof_want_preallocate(
 		imaps = nimaps;
 		firstblock = NULLFSBLOCK;
 		error = xfs_bmapi(NULL, ip, start_fsb, count_fsb, 0,
-				  &firstblock, 0, imap, &imaps, NULL, NULL);
+				  &firstblock, 0, imap, &imaps, NULL);
 		if (error)
 			return error;
 		for (n = 0; n < imaps; n++) {
@@ -500,7 +495,7 @@ retry:
 			  (xfs_filblks_t)(last_fsb - offset_fsb),
 			  XFS_BMAPI_DELAY | XFS_BMAPI_WRITE |
 			  XFS_BMAPI_ENTIRE, &firstblock, 1, imap,
-			  &nimaps, NULL, NULL);
+			  &nimaps, NULL);
 	if (error && (error != ENOSPC))
 		return XFS_ERROR(error);
 
@@ -548,7 +543,7 @@ xfs_iomap_write_allocate(
 	xfs_inode_t	*ip,
 	xfs_off_t	offset,
 	size_t		count,
-	xfs_bmbt_irec_t *map,
+	xfs_bmbt_irec_t *imap,
 	int		*retmap)
 {
 	xfs_mount_t	*mp = ip->i_mount;
@@ -557,7 +552,6 @@ xfs_iomap_write_allocate(
 	xfs_fsblock_t	first_block;
 	xfs_bmap_free_t	free_list;
 	xfs_filblks_t	count_fsb;
-	xfs_bmbt_irec_t	imap;
 	xfs_trans_t	*tp;
 	int		nimaps, committed;
 	int		error = 0;
@@ -573,8 +567,8 @@ xfs_iomap_write_allocate(
 		return XFS_ERROR(error);
 
 	offset_fsb = XFS_B_TO_FSBT(mp, offset);
-	count_fsb = map->br_blockcount;
-	map_start_fsb = map->br_startoff;
+	count_fsb = imap->br_blockcount;
+	map_start_fsb = imap->br_startoff;
 
 	XFS_STATS_ADD(xs_xstrat_bytes, XFS_FSB_TO_B(mp, count_fsb));
 
@@ -602,8 +596,7 @@ xfs_iomap_write_allocate(
 				return XFS_ERROR(error);
 			}
 			xfs_ilock(ip, XFS_ILOCK_EXCL);
-			xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-			xfs_trans_ihold(tp, ip);
+			xfs_trans_ijoin(tp, ip);
 
 			xfs_bmap_init(&free_list, &first_block);
 
@@ -654,10 +647,15 @@ xfs_iomap_write_allocate(
 				}
 			}
 
-			/* Go get the actual blocks */
+			/*
+			 * Go get the actual blocks.
+	 	 	 *
+			 * From this point onwards we overwrite the imap
+			 * pointer that the caller gave to us.
+			 */
 			error = xfs_bmapi(tp, ip, map_start_fsb, count_fsb,
 					XFS_BMAPI_WRITE, &first_block, 1,
-					&imap, &nimaps, &free_list, NULL);
+					imap, &nimaps, &free_list);
 			if (error)
 				goto trans_cancel;
 
@@ -676,13 +674,12 @@ xfs_iomap_write_allocate(
 		 * See if we were able to allocate an extent that
 		 * covers at least part of the callers request
 		 */
-		if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip)))
-			return xfs_cmn_err_fsblock_zero(ip, &imap);
+		if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip)))
+			return xfs_cmn_err_fsblock_zero(ip, imap);
 
-		if ((offset_fsb >= imap.br_startoff) &&
-		    (offset_fsb < (imap.br_startoff +
-				   imap.br_blockcount))) {
-			*map = imap;
+		if ((offset_fsb >= imap->br_startoff) &&
+		    (offset_fsb < (imap->br_startoff +
+				   imap->br_blockcount))) {
 			*retmap = 1;
 			XFS_STATS_INC(xs_xstrat_quick);
 			return 0;
@@ -692,8 +689,8 @@ xfs_iomap_write_allocate(
 		 * So far we have not mapped the requested part of the
 		 * file, just surrounding data, try again.
 		 */
-		count_fsb -= imap.br_blockcount;
-		map_start_fsb = imap.br_startoff + imap.br_blockcount;
+		count_fsb -= imap->br_blockcount;
+		map_start_fsb = imap->br_startoff + imap->br_blockcount;
 	}
 
 trans_cancel:
@@ -766,8 +763,7 @@ xfs_iomap_write_unwritten(
 		}
 
 		xfs_ilock(ip, XFS_ILOCK_EXCL);
-		xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-		xfs_trans_ihold(tp, ip);
+		xfs_trans_ijoin(tp, ip);
 
 		/*
 		 * Modify the unwritten extent state of the buffer.
@@ -776,7 +772,7 @@ xfs_iomap_write_unwritten(
 		nimaps = 1;
 		error = xfs_bmapi(tp, ip, offset_fsb, count_fsb,
 				  XFS_BMAPI_WRITE|XFS_BMAPI_CONVERT, &firstfsb,
-				  1, &imap, &nimaps, &free_list, NULL);
+				  1, &imap, &nimaps, &free_list);
 		if (error)
 			goto error_on_bmapi_transaction;
 
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 81ac4afd45b..7748a430f50 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -18,17 +18,16 @@
 #ifndef __XFS_IOMAP_H__
 #define __XFS_IOMAP_H__
 
-typedef enum {
-	/* base extent manipulation calls */
-	BMAPI_READ = (1 << 0),		/* read extents */
-	BMAPI_WRITE = (1 << 1),		/* create extents */
-	BMAPI_ALLOCATE = (1 << 2),	/* delayed allocate to real extents */
-	/* modifiers */
-	BMAPI_IGNSTATE = (1 << 4),	/* ignore unwritten state on read */
-	BMAPI_DIRECT = (1 << 5),	/* direct instead of buffered write */
-	BMAPI_MMAP = (1 << 6),		/* allocate for mmap write */
-	BMAPI_TRYLOCK = (1 << 7),	/* non-blocking request */
-} bmapi_flags_t;
+/* base extent manipulation calls */
+#define BMAPI_READ	(1 << 0)	/* read extents */
+#define BMAPI_WRITE	(1 << 1)	/* create extents */
+#define BMAPI_ALLOCATE	(1 << 2)	/* delayed allocate to real extents */
+
+/* modifiers */
+#define BMAPI_IGNSTATE	(1 << 4)	/* ignore unwritten state on read */
+#define BMAPI_DIRECT	(1 << 5)	/* direct instead of buffered write */
+#define BMAPI_MMA	(1 << 6)	/* allocate for mmap write */
+#define BMAPI_TRYLOCK	(1 << 7)	/* non-blocking request */
 
 #define BMAPI_FLAGS \
 	{ BMAPI_READ,		"READ" }, \
@@ -36,7 +35,6 @@ typedef enum {
 	{ BMAPI_ALLOCATE,	"ALLOCATE" }, \
 	{ BMAPI_IGNSTATE,	"IGNSTATE" }, \
 	{ BMAPI_DIRECT,		"DIRECT" }, \
-	{ BMAPI_MMAP,		"MMAP" }, \
 	{ BMAPI_TRYLOCK,	"TRYLOCK" }
 
 struct xfs_inode;
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 2b86f861051..dc1882adaf5 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -24,20 +24,17 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_ialloc.h"
 #include "xfs_itable.h"
 #include "xfs_error.h"
 #include "xfs_btree.h"
+#include "xfs_trace.h"
 
 STATIC int
 xfs_internal_inum(
@@ -95,7 +92,8 @@ xfs_bulkstat_one_int(
 	 * further change.
 	 */
 	buf->bs_nlink = dic->di_nlink;
-	buf->bs_projid = dic->di_projid;
+	buf->bs_projid_lo = dic->di_projid_lo;
+	buf->bs_projid_hi = dic->di_projid_hi;
 	buf->bs_ino = ino;
 	buf->bs_mode = dic->di_mode;
 	buf->bs_uid = dic->di_uid;
@@ -143,7 +141,8 @@ xfs_bulkstat_one_int(
 		buf->bs_blocks = dic->di_nblocks + ip->i_delayed_blks;
 		break;
 	}
-	xfs_iput(ip, XFS_ILOCK_SHARED);
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+	IRELE(ip);
 
 	error = formatter(buffer, ubsize, ubused, buf);
 
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 5215abc8023..cee4ab9f8a9 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -24,8 +24,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_error.h"
 #include "xfs_log_priv.h"
@@ -35,8 +33,6 @@
 #include "xfs_ialloc_btree.h"
 #include "xfs_log_recover.h"
 #include "xfs_trans_priv.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_rw.h"
@@ -337,7 +333,6 @@ xfs_log_reserve(
 	int			retval = 0;
 
 	ASSERT(client == XFS_TRANSACTION || client == XFS_LOG);
-	ASSERT((flags & XFS_LOG_NOSLEEP) == 0);
 
 	if (XLOG_FORCED_SHUTDOWN(log))
 		return XFS_ERROR(EIO);
@@ -552,7 +547,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
 				.magic = XLOG_UNMOUNT_TYPE,
 			};
 			struct xfs_log_iovec reg = {
-				.i_addr = (void *)&magic,
+				.i_addr = &magic,
 				.i_len = sizeof(magic),
 				.i_type = XLOG_REG_TYPE_UNMOUNT,
 			};
@@ -922,19 +917,6 @@ xlog_iodone(xfs_buf_t *bp)
 	l = iclog->ic_log;
 
 	/*
-	 * If the _XFS_BARRIER_FAILED flag was set by a lower
-	 * layer, it means the underlying device no longer supports
-	 * barrier I/O. Warn loudly and turn off barriers.
-	 */
-	if (bp->b_flags & _XFS_BARRIER_FAILED) {
-		bp->b_flags &= ~_XFS_BARRIER_FAILED;
-		l->l_mp->m_flags &= ~XFS_MOUNT_BARRIER;
-		xfs_fs_cmn_err(CE_WARN, l->l_mp,
-				"xlog_iodone: Barriers are no longer supported"
-				" by device. Disabling barriers\n");
-	}
-
-	/*
 	 * Race to shutdown the filesystem if we see an error.
 	 */
 	if (XFS_TEST_ERROR((XFS_BUF_GETERROR(bp)), l->l_mp,
@@ -1047,7 +1029,6 @@ xlog_alloc_log(xfs_mount_t	*mp,
 	xlog_in_core_t		*iclog, *prev_iclog=NULL;
 	xfs_buf_t		*bp;
 	int			i;
-	int			iclogsize;
 	int			error = ENOMEM;
 	uint			log2_size = 0;
 
@@ -1127,7 +1108,6 @@ xlog_alloc_log(xfs_mount_t	*mp,
 	 * with different amounts of memory.  See the definition of
 	 * xlog_in_core_t in xfs_log_priv.h for details.
 	 */
-	iclogsize = log->l_iclog_size;
 	ASSERT(log->l_iclog_size >= 4096);
 	for (i=0; i < log->l_iclog_bufs; i++) {
 		*iclogp = kmem_zalloc(sizeof(xlog_in_core_t), KM_MAYFAIL);
@@ -1138,7 +1118,8 @@ xlog_alloc_log(xfs_mount_t	*mp,
 		iclog->ic_prev = prev_iclog;
 		prev_iclog = iclog;
 
-		bp = xfs_buf_get_noaddr(log->l_iclog_size, mp->m_logdev_targp);
+		bp = xfs_buf_get_uncached(mp->m_logdev_targp,
+						log->l_iclog_size, 0);
 		if (!bp)
 			goto out_free_iclog;
 		if (!XFS_BUF_CPSEMA(bp))
@@ -1316,7 +1297,7 @@ xlog_bdstrat(
 	if (iclog->ic_state & XLOG_STATE_IOERROR) {
 		XFS_BUF_ERROR(bp, EIO);
 		XFS_BUF_STALE(bp);
-		xfs_biodone(bp);
+		xfs_buf_ioend(bp, 0);
 		/*
 		 * It would seem logical to return EIO here, but we rely on
 		 * the log state machine to propagate I/O errors instead of
@@ -1428,11 +1409,8 @@ xlog_sync(xlog_t		*log,
 	XFS_BUF_BUSY(bp);
 	XFS_BUF_ASYNC(bp);
 	bp->b_flags |= XBF_LOG_BUFFER;
-	/*
-	 * Do an ordered write for the log block.
-	 * Its unnecessary to flush the first split block in the log wrap case.
-	 */
-	if (!split && (log->l_mp->m_flags & XFS_MOUNT_BARRIER))
+
+	if (log->l_mp->m_flags & XFS_MOUNT_BARRIER)
 		XFS_BUF_ORDERED(bp);
 
 	ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
@@ -3025,7 +3003,8 @@ _xfs_log_force(
 
 	XFS_STATS_INC(xs_log_force);
 
-	xlog_cil_push(log, 1);
+	if (log->l_cilp)
+		xlog_cil_force(log);
 
 	spin_lock(&log->l_icloglock);
 
@@ -3177,7 +3156,7 @@ _xfs_log_force_lsn(
 	XFS_STATS_INC(xs_log_force);
 
 	if (log->l_cilp) {
-		lsn = xlog_cil_push_lsn(log, lsn);
+		lsn = xlog_cil_force_lsn(log, lsn);
 		if (lsn == NULLCOMMITLSN)
 			return 0;
 	}
@@ -3734,7 +3713,7 @@ xfs_log_force_umount(
 	 * call below.
 	 */
 	if (!logerror && (mp->m_flags & XFS_MOUNT_DELAYLOG))
-		xlog_cil_push(log, 1);
+		xlog_cil_force(log);
 
 	/*
 	 * We must hold both the GRANT lock and the LOG lock,
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 04c78e642cc..916eb7db14d 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -55,14 +55,10 @@ static inline xfs_lsn_t	_lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
 /*
  * Flags to xfs_log_reserve()
  *
- *	XFS_LOG_SLEEP:	 If space is not available, sleep (default)
- *	XFS_LOG_NOSLEEP: If space is not available, return error
  *	XFS_LOG_PERM_RESERV: Permanent reservation.  When writes are
  *		performed against this type of reservation, the reservation
  *		is not decreased.  Long running transactions should use this.
  */
-#define XFS_LOG_SLEEP		0x0
-#define XFS_LOG_NOSLEEP		0x1
 #define XFS_LOG_PERM_RESERV	0x2
 
 /*
@@ -104,7 +100,7 @@ static inline xfs_lsn_t	_lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
 #define XLOG_REG_TYPE_MAX		19
 
 typedef struct xfs_log_iovec {
-	xfs_caddr_t	i_addr;		/* beginning address of region */
+	void		*i_addr;	/* beginning address of region */
 	int		i_len;		/* length in bytes of region */
 	uint		i_type;		/* type of region */
 } xfs_log_iovec_t;
@@ -201,9 +197,4 @@ int	xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
 bool	xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
 
 #endif
-
-
-extern int xlog_debug;		/* set to 1 to enable real log */
-
-
 #endif	/* __XFS_LOG_H__ */
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index bb17cc044bf..23d6ceb5e97 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -26,8 +26,6 @@
 #include "xfs_log_priv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_error.h"
 #include "xfs_alloc.h"
@@ -70,6 +68,7 @@ xlog_cil_init(
 	ctx->sequence = 1;
 	ctx->cil = cil;
 	cil->xc_ctx = ctx;
+	cil->xc_current_sequence = ctx->sequence;
 
 	cil->xc_log = log;
 	log->l_cilp = cil;
@@ -147,102 +146,6 @@ xlog_cil_init_post_recovery(
 }
 
 /*
- * Insert the log item into the CIL and calculate the difference in space
- * consumed by the item. Add the space to the checkpoint ticket and calculate
- * if the change requires additional log metadata. If it does, take that space
- * as well. Remove the amount of space we addded to the checkpoint ticket from
- * the current transaction ticket so that the accounting works out correctly.
- *
- * If this is the first time the item is being placed into the CIL in this
- * context, pin it so it can't be written to disk until the CIL is flushed to
- * the iclog and the iclog written to disk.
- */
-static void
-xlog_cil_insert(
-	struct log		*log,
-	struct xlog_ticket	*ticket,
-	struct xfs_log_item	*item,
-	struct xfs_log_vec	*lv)
-{
-	struct xfs_cil		*cil = log->l_cilp;
-	struct xfs_log_vec	*old = lv->lv_item->li_lv;
-	struct xfs_cil_ctx	*ctx = cil->xc_ctx;
-	int			len;
-	int			diff_iovecs;
-	int			iclog_space;
-
-	if (old) {
-		/* existing lv on log item, space used is a delta */
-		ASSERT(!list_empty(&item->li_cil));
-		ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs);
-
-		len = lv->lv_buf_len - old->lv_buf_len;
-		diff_iovecs = lv->lv_niovecs - old->lv_niovecs;
-		kmem_free(old->lv_buf);
-		kmem_free(old);
-	} else {
-		/* new lv, must pin the log item */
-		ASSERT(!lv->lv_item->li_lv);
-		ASSERT(list_empty(&item->li_cil));
-
-		len = lv->lv_buf_len;
-		diff_iovecs = lv->lv_niovecs;
-		IOP_PIN(lv->lv_item);
-
-	}
-	len += diff_iovecs * sizeof(xlog_op_header_t);
-
-	/* attach new log vector to log item */
-	lv->lv_item->li_lv = lv;
-
-	spin_lock(&cil->xc_cil_lock);
-	list_move_tail(&item->li_cil, &cil->xc_cil);
-	ctx->nvecs += diff_iovecs;
-
-	/*
-	 * If this is the first time the item is being committed to the CIL,
-	 * store the sequence number on the log item so we can tell
-	 * in future commits whether this is the first checkpoint the item is
-	 * being committed into.
-	 */
-	if (!item->li_seq)
-		item->li_seq = ctx->sequence;
-
-	/*
-	 * Now transfer enough transaction reservation to the context ticket
-	 * for the checkpoint. The context ticket is special - the unit
-	 * reservation has to grow as well as the current reservation as we
-	 * steal from tickets so we can correctly determine the space used
-	 * during the transaction commit.
-	 */
-	if (ctx->ticket->t_curr_res == 0) {
-		/* first commit in checkpoint, steal the header reservation */
-		ASSERT(ticket->t_curr_res >= ctx->ticket->t_unit_res + len);
-		ctx->ticket->t_curr_res = ctx->ticket->t_unit_res;
-		ticket->t_curr_res -= ctx->ticket->t_unit_res;
-	}
-
-	/* do we need space for more log record headers? */
-	iclog_space = log->l_iclog_size - log->l_iclog_hsize;
-	if (len > 0 && (ctx->space_used / iclog_space !=
-				(ctx->space_used + len) / iclog_space)) {
-		int hdrs;
-
-		hdrs = (len + iclog_space - 1) / iclog_space;
-		/* need to take into account split region headers, too */
-		hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header);
-		ctx->ticket->t_unit_res += hdrs;
-		ctx->ticket->t_curr_res += hdrs;
-		ticket->t_curr_res -= hdrs;
-		ASSERT(ticket->t_curr_res >= len);
-	}
-	ticket->t_curr_res -= len;
-	ctx->space_used += len;
-
-	spin_unlock(&cil->xc_cil_lock);
-}
-
-/*
  * Format log item into a flat buffers
  *
  * For delayed logging, we need to hold a formatted buffer containing all the
@@ -271,15 +174,10 @@ xlog_cil_insert(
 static void
 xlog_cil_format_items(
 	struct log		*log,
-	struct xfs_log_vec	*log_vector,
-	struct xlog_ticket	*ticket,
-	xfs_lsn_t		*start_lsn)
+	struct xfs_log_vec	*log_vector)
 {
 	struct xfs_log_vec *lv;
 
-	if (start_lsn)
-		*start_lsn = log->l_cilp->xc_ctx->sequence;
-
 	ASSERT(log_vector);
 	for (lv = log_vector; lv; lv = lv->lv_next) {
 		void	*ptr;
@@ -292,7 +190,7 @@ xlog_cil_format_items(
 			len += lv->lv_iovecp[index].i_len;
 
 		lv->lv_buf_len = len;
-		lv->lv_buf = kmem_zalloc(lv->lv_buf_len, KM_SLEEP|KM_NOFS);
+		lv->lv_buf = kmem_alloc(lv->lv_buf_len, KM_SLEEP|KM_NOFS);
 		ptr = lv->lv_buf;
 
 		for (index = 0; index < lv->lv_niovecs; index++) {
@@ -303,97 +201,153 @@ xlog_cil_format_items(
 			ptr += vec->i_len;
 		}
 		ASSERT(ptr == lv->lv_buf + lv->lv_buf_len);
-
-		xlog_cil_insert(log, ticket, lv->lv_item, lv);
 	}
 }
 
-static void
-xlog_cil_free_logvec(
-	struct xfs_log_vec	*log_vector)
+/*
+ * Prepare the log item for insertion into the CIL. Calculate the difference in
+ * log space and vectors it will consume, and if it is a new item pin it as
+ * well.
+ */
+STATIC void
+xfs_cil_prepare_item(
+	struct log		*log,
+	struct xfs_log_vec	*lv,
+	int			*len,
+	int			*diff_iovecs)
 {
-	struct xfs_log_vec	*lv;
+	struct xfs_log_vec	*old = lv->lv_item->li_lv;
+
+	if (old) {
+		/* existing lv on log item, space used is a delta */
+		ASSERT(!list_empty(&lv->lv_item->li_cil));
+		ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs);
+
+		*len += lv->lv_buf_len - old->lv_buf_len;
+		*diff_iovecs += lv->lv_niovecs - old->lv_niovecs;
+		kmem_free(old->lv_buf);
+		kmem_free(old);
+	} else {
+		/* new lv, must pin the log item */
+		ASSERT(!lv->lv_item->li_lv);
+		ASSERT(list_empty(&lv->lv_item->li_cil));
+
+		*len += lv->lv_buf_len;
+		*diff_iovecs += lv->lv_niovecs;
+		IOP_PIN(lv->lv_item);
 
-	for (lv = log_vector; lv; ) {
-		struct xfs_log_vec *next = lv->lv_next;
-		kmem_free(lv->lv_buf);
-		kmem_free(lv);
-		lv = next;
 	}
+
+	/* attach new log vector to log item */
+	lv->lv_item->li_lv = lv;
+
+	/*
+	 * If this is the first time the item is being committed to the
+	 * CIL, store the sequence number on the log item so we can
+	 * tell in future commits whether this is the first checkpoint
+	 * the item is being committed into.
+	 */
+	if (!lv->lv_item->li_seq)
+		lv->lv_item->li_seq = log->l_cilp->xc_ctx->sequence;
 }
 
 /*
- * Commit a transaction with the given vector to the Committed Item List.
- *
- * To do this, we need to format the item, pin it in memory if required and
- * account for the space used by the transaction. Once we have done that we
- * need to release the unused reservation for the transaction, attach the
- * transaction to the checkpoint context so we carry the busy extents through
- * to checkpoint completion, and then unlock all the items in the transaction.
- *
- * For more specific information about the order of operations in
- * xfs_log_commit_cil() please refer to the comments in
- * xfs_trans_commit_iclog().
- *
- * Called with the context lock already held in read mode to lock out
- * background commit, returns without it held once background commits are
- * allowed again.
+ * Insert the log items into the CIL and calculate the difference in space
+ * consumed by the item. Add the space to the checkpoint ticket and calculate
+ * if the change requires additional log metadata. If it does, take that space
+ * as well. Remove the amount of space we addded to the checkpoint ticket from
+ * the current transaction ticket so that the accounting works out correctly.
  */
-int
-xfs_log_commit_cil(
-	struct xfs_mount	*mp,
-	struct xfs_trans	*tp,
+static void
+xlog_cil_insert_items(
+	struct log		*log,
 	struct xfs_log_vec	*log_vector,
-	xfs_lsn_t		*commit_lsn,
-	int			flags)
+	struct xlog_ticket	*ticket)
 {
-	struct log		*log = mp->m_log;
-	int			log_flags = 0;
-	int			push = 0;
-
-	if (flags & XFS_TRANS_RELEASE_LOG_RES)
-		log_flags = XFS_LOG_REL_PERM_RESERV;
+	struct xfs_cil		*cil = log->l_cilp;
+	struct xfs_cil_ctx	*ctx = cil->xc_ctx;
+	struct xfs_log_vec	*lv;
+	int			len = 0;
+	int			diff_iovecs = 0;
+	int			iclog_space;
 
-	if (XLOG_FORCED_SHUTDOWN(log)) {
-		xlog_cil_free_logvec(log_vector);
-		return XFS_ERROR(EIO);
-	}
+	ASSERT(log_vector);
 
-	/* lock out background commit */
-	down_read(&log->l_cilp->xc_ctx_lock);
-	xlog_cil_format_items(log, log_vector, tp->t_ticket, commit_lsn);
+	/*
+	 * Do all the accounting aggregation and switching of log vectors
+	 * around in a separate loop to the insertion of items into the CIL.
+	 * Then we can do a separate loop to update the CIL within a single
+	 * lock/unlock pair. This reduces the number of round trips on the CIL
+	 * lock from O(nr_logvectors) to O(1) and greatly reduces the overall
+	 * hold time for the transaction commit.
+	 *
+	 * If this is the first time the item is being placed into the CIL in
+	 * this context, pin it so it can't be written to disk until the CIL is
+	 * flushed to the iclog and the iclog written to disk.
+	 *
+	 * We can do this safely because the context can't checkpoint until we
+	 * are done so it doesn't matter exactly how we update the CIL.
+	 */
+	for (lv = log_vector; lv; lv = lv->lv_next)
+		xfs_cil_prepare_item(log, lv, &len, &diff_iovecs);
 
-	/* check we didn't blow the reservation */
-	if (tp->t_ticket->t_curr_res < 0)
-		xlog_print_tic_res(log->l_mp, tp->t_ticket);
+	/* account for space used by new iovec headers  */
+	len += diff_iovecs * sizeof(xlog_op_header_t);
 
-	/* attach the transaction to the CIL if it has any busy extents */
-	if (!list_empty(&tp->t_busy)) {
-		spin_lock(&log->l_cilp->xc_cil_lock);
-		list_splice_init(&tp->t_busy,
-					&log->l_cilp->xc_ctx->busy_extents);
-		spin_unlock(&log->l_cilp->xc_cil_lock);
-	}
+	spin_lock(&cil->xc_cil_lock);
 
-	tp->t_commit_lsn = *commit_lsn;
-	xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
-	xfs_trans_unreserve_and_mod_sb(tp);
+	/* move the items to the tail of the CIL */
+	for (lv = log_vector; lv; lv = lv->lv_next)
+		list_move_tail(&lv->lv_item->li_cil, &cil->xc_cil);
 
-	/* check for background commit before unlock */
-	if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log))
-		push = 1;
-	up_read(&log->l_cilp->xc_ctx_lock);
+	ctx->nvecs += diff_iovecs;
 
 	/*
-	 * We need to push CIL every so often so we don't cache more than we
-	 * can fit in the log. The limit really is that a checkpoint can't be
-	 * more than half the log (the current checkpoint is not allowed to
-	 * overwrite the previous checkpoint), but commit latency and memory
-	 * usage limit this to a smaller size in most cases.
+	 * Now transfer enough transaction reservation to the context ticket
+	 * for the checkpoint. The context ticket is special - the unit
+	 * reservation has to grow as well as the current reservation as we
+	 * steal from tickets so we can correctly determine the space used
+	 * during the transaction commit.
 	 */
-	if (push)
-		xlog_cil_push(log, 0);
-	return 0;
+	if (ctx->ticket->t_curr_res == 0) {
+		/* first commit in checkpoint, steal the header reservation */
+		ASSERT(ticket->t_curr_res >= ctx->ticket->t_unit_res + len);
+		ctx->ticket->t_curr_res = ctx->ticket->t_unit_res;
+		ticket->t_curr_res -= ctx->ticket->t_unit_res;
+	}
+
+	/* do we need space for more log record headers? */
+	iclog_space = log->l_iclog_size - log->l_iclog_hsize;
+	if (len > 0 && (ctx->space_used / iclog_space !=
+				(ctx->space_used + len) / iclog_space)) {
+		int hdrs;
+
+		hdrs = (len + iclog_space - 1) / iclog_space;
+		/* need to take into account split region headers, too */
+		hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header);
+		ctx->ticket->t_unit_res += hdrs;
+		ctx->ticket->t_curr_res += hdrs;
+		ticket->t_curr_res -= hdrs;
+		ASSERT(ticket->t_curr_res >= len);
+	}
+	ticket->t_curr_res -= len;
+	ctx->space_used += len;
+
+	spin_unlock(&cil->xc_cil_lock);
+}
+
+static void
+xlog_cil_free_logvec(
+	struct xfs_log_vec	*log_vector)
+{
+	struct xfs_log_vec	*lv;
+
+	for (lv = log_vector; lv; ) {
+		struct xfs_log_vec *next = lv->lv_next;
+		kmem_free(lv->lv_buf);
+		kmem_free(lv);
+		lv = next;
+	}
 }
 
 /*
@@ -429,13 +383,23 @@ xlog_cil_committed(
 }
 
 /*
- * Push the Committed Item List to the log. If the push_now flag is not set,
- * then it is a background flush and so we can chose to ignore it.
+ * Push the Committed Item List to the log. If @push_seq flag is zero, then it
+ * is a background flush and so we can chose to ignore it. Otherwise, if the
+ * current sequence is the same as @push_seq we need to do a flush. If
+ * @push_seq is less than the current sequence, then it has already been
+ * flushed and we don't need to do anything - the caller will wait for it to
+ * complete if necessary.
+ *
+ * @push_seq is a value rather than a flag because that allows us to do an
+ * unlocked check of the sequence number for a match. Hence we can allows log
+ * forces to run racily and not issue pushes for the same sequence twice. If we
+ * get a race between multiple pushes for the same sequence they will block on
+ * the first one and then abort, hence avoiding needless pushes.
  */
-int
+STATIC int
 xlog_cil_push(
 	struct log		*log,
-	int			push_now)
+	xfs_lsn_t		push_seq)
 {
 	struct xfs_cil		*cil = log->l_cilp;
 	struct xfs_log_vec	*lv;
@@ -455,12 +419,20 @@ xlog_cil_push(
 	if (!cil)
 		return 0;
 
+	ASSERT(!push_seq || push_seq <= cil->xc_ctx->sequence);
+
 	new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS);
 	new_ctx->ticket = xlog_cil_ticket_alloc(log);
 
-	/* lock out transaction commit, but don't block on background push */
+	/*
+	 * Lock out transaction commit, but don't block for background pushes
+	 * unless we are well over the CIL space limit. See the definition of
+	 * XLOG_CIL_HARD_SPACE_LIMIT() for the full explanation of the logic
+	 * used here.
+	 */
 	if (!down_write_trylock(&cil->xc_ctx_lock)) {
-		if (!push_now)
+		if (!push_seq &&
+		    cil->xc_ctx->space_used < XLOG_CIL_HARD_SPACE_LIMIT(log))
 			goto out_free_ticket;
 		down_write(&cil->xc_ctx_lock);
 	}
@@ -471,7 +443,11 @@ xlog_cil_push(
 		goto out_skip;
 
 	/* check for spurious background flush */
-	if (!push_now && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
+	if (!push_seq && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
+		goto out_skip;
+
+	/* check for a previously pushed seqeunce */
+	if (push_seq && push_seq < cil->xc_ctx->sequence)
 		goto out_skip;
 
 	/*
@@ -517,6 +493,13 @@ xlog_cil_push(
 	cil->xc_ctx = new_ctx;
 
 	/*
+	 * mirror the new sequence into the cil structure so that we can do
+	 * unlocked checks against the current sequence in log forces without
+	 * risking deferencing a freed context pointer.
+	 */
+	cil->xc_current_sequence = new_ctx->sequence;
+
+	/*
 	 * The switch is now done, so we can drop the context lock and move out
 	 * of a shared context. We can't just go straight to the commit record,
 	 * though - we need to synchronise with previous and future commits so
@@ -554,7 +537,7 @@ xlog_cil_push(
 	thdr.th_type = XFS_TRANS_CHECKPOINT;
 	thdr.th_tid = tic->t_tid;
 	thdr.th_num_items = num_iovecs;
-	lhdr.i_addr = (xfs_caddr_t)&thdr;
+	lhdr.i_addr = &thdr;
 	lhdr.i_len = sizeof(xfs_trans_header_t);
 	lhdr.i_type = XLOG_REG_TYPE_TRANSHDR;
 	tic->t_curr_res -= lhdr.i_len + sizeof(xlog_op_header_t);
@@ -628,6 +611,105 @@ out_abort:
 }
 
 /*
+ * Commit a transaction with the given vector to the Committed Item List.
+ *
+ * To do this, we need to format the item, pin it in memory if required and
+ * account for the space used by the transaction. Once we have done that we
+ * need to release the unused reservation for the transaction, attach the
+ * transaction to the checkpoint context so we carry the busy extents through
+ * to checkpoint completion, and then unlock all the items in the transaction.
+ *
+ * For more specific information about the order of operations in
+ * xfs_log_commit_cil() please refer to the comments in
+ * xfs_trans_commit_iclog().
+ *
+ * Called with the context lock already held in read mode to lock out
+ * background commit, returns without it held once background commits are
+ * allowed again.
+ */
+int
+xfs_log_commit_cil(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	struct xfs_log_vec	*log_vector,
+	xfs_lsn_t		*commit_lsn,
+	int			flags)
+{
+	struct log		*log = mp->m_log;
+	int			log_flags = 0;
+	int			push = 0;
+
+	if (flags & XFS_TRANS_RELEASE_LOG_RES)
+		log_flags = XFS_LOG_REL_PERM_RESERV;
+
+	if (XLOG_FORCED_SHUTDOWN(log)) {
+		xlog_cil_free_logvec(log_vector);
+		return XFS_ERROR(EIO);
+	}
+
+	/*
+	 * do all the hard work of formatting items (including memory
+	 * allocation) outside the CIL context lock. This prevents stalling CIL
+	 * pushes when we are low on memory and a transaction commit spends a
+	 * lot of time in memory reclaim.
+	 */
+	xlog_cil_format_items(log, log_vector);
+
+	/* lock out background commit */
+	down_read(&log->l_cilp->xc_ctx_lock);
+	if (commit_lsn)
+		*commit_lsn = log->l_cilp->xc_ctx->sequence;
+
+	xlog_cil_insert_items(log, log_vector, tp->t_ticket);
+
+	/* check we didn't blow the reservation */
+	if (tp->t_ticket->t_curr_res < 0)
+		xlog_print_tic_res(log->l_mp, tp->t_ticket);
+
+	/* attach the transaction to the CIL if it has any busy extents */
+	if (!list_empty(&tp->t_busy)) {
+		spin_lock(&log->l_cilp->xc_cil_lock);
+		list_splice_init(&tp->t_busy,
+					&log->l_cilp->xc_ctx->busy_extents);
+		spin_unlock(&log->l_cilp->xc_cil_lock);
+	}
+
+	tp->t_commit_lsn = *commit_lsn;
+	xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
+	xfs_trans_unreserve_and_mod_sb(tp);
+
+	/*
+	 * Once all the items of the transaction have been copied to the CIL,
+	 * the items can be unlocked and freed.
+	 *
+	 * This needs to be done before we drop the CIL context lock because we
+	 * have to update state in the log items and unlock them before they go
+	 * to disk. If we don't, then the CIL checkpoint can race with us and
+	 * we can run checkpoint completion before we've updated and unlocked
+	 * the log items. This affects (at least) processing of stale buffers,
+	 * inodes and EFIs.
+	 */
+	xfs_trans_free_items(tp, *commit_lsn, 0);
+
+	/* check for background commit before unlock */
+	if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log))
+		push = 1;
+
+	up_read(&log->l_cilp->xc_ctx_lock);
+
+	/*
+	 * We need to push CIL every so often so we don't cache more than we
+	 * can fit in the log. The limit really is that a checkpoint can't be
+	 * more than half the log (the current checkpoint is not allowed to
+	 * overwrite the previous checkpoint), but commit latency and memory
+	 * usage limit this to a smaller size in most cases.
+	 */
+	if (push)
+		xlog_cil_push(log, 0);
+	return 0;
+}
+
+/*
  * Conditionally push the CIL based on the sequence passed in.
  *
  * We only need to push if we haven't already pushed the sequence
@@ -641,39 +723,34 @@ out_abort:
  * commit lsn is there. It'll be empty, so this is broken for now.
  */
 xfs_lsn_t
-xlog_cil_push_lsn(
+xlog_cil_force_lsn(
 	struct log	*log,
-	xfs_lsn_t	push_seq)
+	xfs_lsn_t	sequence)
 {
 	struct xfs_cil		*cil = log->l_cilp;
 	struct xfs_cil_ctx	*ctx;
 	xfs_lsn_t		commit_lsn = NULLCOMMITLSN;
 
-restart:
-	down_write(&cil->xc_ctx_lock);
-	ASSERT(push_seq <= cil->xc_ctx->sequence);
-
-	/* check to see if we need to force out the current context */
-	if (push_seq == cil->xc_ctx->sequence) {
-		up_write(&cil->xc_ctx_lock);
-		xlog_cil_push(log, 1);
-		goto restart;
-	}
+	ASSERT(sequence <= cil->xc_current_sequence);
+
+	/*
+	 * check to see if we need to force out the current context.
+	 * xlog_cil_push() handles racing pushes for the same sequence,
+	 * so no need to deal with it here.
+	 */
+	if (sequence == cil->xc_current_sequence)
+		xlog_cil_push(log, sequence);
 
 	/*
 	 * See if we can find a previous sequence still committing.
-	 * We can drop the flush lock as soon as we have the cil lock
-	 * because we are now only comparing contexts protected by
-	 * the cil lock.
-	 *
 	 * We need to wait for all previous sequence commits to complete
 	 * before allowing the force of push_seq to go ahead. Hence block
 	 * on commits for those as well.
 	 */
+restart:
 	spin_lock(&cil->xc_cil_lock);
-	up_write(&cil->xc_ctx_lock);
 	list_for_each_entry(ctx, &cil->xc_committing, committing) {
-		if (ctx->sequence > push_seq)
+		if (ctx->sequence > sequence)
 			continue;
 		if (!ctx->commit_lsn) {
 			/*
@@ -683,7 +760,7 @@ restart:
 			sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0);
 			goto restart;
 		}
-		if (ctx->sequence != push_seq)
+		if (ctx->sequence != sequence)
 			continue;
 		/* found it! */
 		commit_lsn = ctx->commit_lsn;
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 8c072618965..edcdfe01617 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -422,16 +422,17 @@ struct xfs_cil {
 	struct rw_semaphore	xc_ctx_lock;
 	struct list_head	xc_committing;
 	sv_t			xc_commit_wait;
+	xfs_lsn_t		xc_current_sequence;
 };
 
 /*
- * The amount of log space we should the CIL to aggregate is difficult to size.
- * Whatever we chose we have to make we can get a reservation for the log space
- * effectively, that it is large enough to capture sufficient relogging to
- * reduce log buffer IO significantly, but it is not too large for the log or
- * induces too much latency when writing out through the iclogs. We track both
- * space consumed and the number of vectors in the checkpoint context, so we
- * need to decide which to use for limiting.
+ * The amount of log space we allow the CIL to aggregate is difficult to size.
+ * Whatever we choose, we have to make sure we can get a reservation for the
+ * log space effectively, that it is large enough to capture sufficient
+ * relogging to reduce log buffer IO significantly, but it is not too large for
+ * the log or induces too much latency when writing out through the iclogs. We
+ * track both space consumed and the number of vectors in the checkpoint
+ * context, so we need to decide which to use for limiting.
  *
  * Every log buffer we write out during a push needs a header reserved, which
  * is at least one sector and more for v2 logs. Hence we need a reservation of
@@ -458,16 +459,21 @@ struct xfs_cil {
  * checkpoint transaction ticket is specific to the checkpoint context, rather
  * than the CIL itself.
  *
- * With dynamic reservations, we can basically make up arbitrary limits for the
- * checkpoint size so long as they don't violate any other size rules.  Hence
- * the initial maximum size for the checkpoint transaction will be set to a
- * quarter of the log or 8MB, which ever is smaller. 8MB is an arbitrary limit
- * right now based on the latency of writing out a large amount of data through
- * the circular iclog buffers.
+ * With dynamic reservations, we can effectively make up arbitrary limits for
+ * the checkpoint size so long as they don't violate any other size rules.
+ * Recovery imposes a rule that no transaction exceed half the log, so we are
+ * limited by that.  Furthermore, the log transaction reservation subsystem
+ * tries to keep 25% of the log free, so we need to keep below that limit or we
+ * risk running out of free log space to start any new transactions.
+ *
+ * In order to keep background CIL push efficient, we will set a lower
+ * threshold at which background pushing is attempted without blocking current
+ * transaction commits.  A separate, higher bound defines when CIL pushes are
+ * enforced to ensure we stay within our maximum checkpoint size bounds.
+ * threshold, yet give us plenty of space for aggregation on large logs.
  */
-
-#define XLOG_CIL_SPACE_LIMIT(log)	\
-	(min((log->l_logsize >> 2), (8 * 1024 * 1024)))
+#define XLOG_CIL_SPACE_LIMIT(log)	(log->l_logsize >> 3)
+#define XLOG_CIL_HARD_SPACE_LIMIT(log)	(3 * (log->l_logsize >> 4))
 
 /*
  * The reservation head lsn is not made up of a cycle number and block number.
@@ -562,8 +568,16 @@ int	xlog_cil_init(struct log *log);
 void	xlog_cil_init_post_recovery(struct log *log);
 void	xlog_cil_destroy(struct log *log);
 
-int	xlog_cil_push(struct log *log, int push_now);
-xfs_lsn_t xlog_cil_push_lsn(struct log *log, xfs_lsn_t push_sequence);
+/*
+ * CIL force routines
+ */
+xfs_lsn_t xlog_cil_force_lsn(struct log *log, xfs_lsn_t sequence);
+
+static inline void
+xlog_cil_force(struct log *log)
+{
+	xlog_cil_force_lsn(log, log->l_cilp->xc_current_sequence);
+}
 
 /*
  * Unmount record type is used as a pseudo transaction type for the ticket.
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 9ac5cfab27b..966d3f97458 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -24,15 +24,11 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_error.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
@@ -111,7 +107,8 @@ xlog_get_bp(
 		nbblks += log->l_sectBBsize;
 	nbblks = round_up(nbblks, log->l_sectBBsize);
 
-	return xfs_buf_get_noaddr(BBTOB(nbblks), log->l_mp->m_logdev_targp);
+	return xfs_buf_get_uncached(log->l_mp->m_logdev_targp,
+					BBTOB(nbblks), 0);
 }
 
 STATIC void
@@ -171,7 +168,7 @@ xlog_bread_noalign(
 	XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp);
 
 	xfsbdstrat(log->l_mp, bp);
-	error = xfs_iowait(bp);
+	error = xfs_buf_iowait(bp);
 	if (error)
 		xfs_ioerror_alert("xlog_bread", log->l_mp,
 				  bp, XFS_BUF_ADDR(bp));
@@ -325,12 +322,13 @@ xlog_recover_iodone(
 		 * this during recovery. One strike!
 		 */
 		xfs_ioerror_alert("xlog_recover_iodone",
-				  bp->b_mount, bp, XFS_BUF_ADDR(bp));
-		xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR);
+					bp->b_target->bt_mount, bp,
+					XFS_BUF_ADDR(bp));
+		xfs_force_shutdown(bp->b_target->bt_mount,
+					SHUTDOWN_META_IO_ERROR);
 	}
-	bp->b_mount = NULL;
 	XFS_BUF_CLR_IODONE_FUNC(bp);
-	xfs_biodone(bp);
+	xfs_buf_ioend(bp, 0);
 }
 
 /*
@@ -1565,9 +1563,7 @@ xlog_recover_reorder_trans(
 
 	list_splice_init(&trans->r_itemq, &sort_list);
 	list_for_each_entry_safe(item, n, &sort_list, ri_list) {
-		xfs_buf_log_format_t	*buf_f;
-
-		buf_f = (xfs_buf_log_format_t *)item->ri_buf[0].i_addr;
+		xfs_buf_log_format_t	*buf_f = item->ri_buf[0].i_addr;
 
 		switch (ITEM_TYPE(item)) {
 		case XFS_LI_BUF:
@@ -1892,9 +1888,8 @@ xlog_recover_do_inode_buffer(
 		 * current di_next_unlinked field.  Extract its value
 		 * and copy it to the buffer copy.
 		 */
-		logged_nextp = (xfs_agino_t *)
-			       ((char *)(item->ri_buf[item_index].i_addr) +
-				(next_unlinked_offset - reg_buf_offset));
+		logged_nextp = item->ri_buf[item_index].i_addr +
+				next_unlinked_offset - reg_buf_offset;
 		if (unlikely(*logged_nextp == 0)) {
 			xfs_fs_cmn_err(CE_ALERT, mp,
 				"bad inode buffer log record (ptr = 0x%p, bp = 0x%p).  XFS trying to replay bad (0) inode di_next_unlinked field",
@@ -1973,8 +1968,7 @@ xlog_recover_do_reg_buffer(
 					item->ri_buf[i].i_len, __func__);
 				goto next;
 			}
-			error = xfs_qm_dqcheck((xfs_disk_dquot_t *)
-					       item->ri_buf[i].i_addr,
+			error = xfs_qm_dqcheck(item->ri_buf[i].i_addr,
 					       -1, 0, XFS_QMOPT_DOWARN,
 					       "dquot_buf_recover");
 			if (error)
@@ -2187,7 +2181,7 @@ xlog_recover_do_buffer_trans(
 	xlog_recover_item_t	*item,
 	int			pass)
 {
-	xfs_buf_log_format_t	*buf_f;
+	xfs_buf_log_format_t	*buf_f = item->ri_buf[0].i_addr;
 	xfs_mount_t		*mp;
 	xfs_buf_t		*bp;
 	int			error;
@@ -2197,8 +2191,6 @@ xlog_recover_do_buffer_trans(
 	ushort			flags;
 	uint			buf_flags;
 
-	buf_f = (xfs_buf_log_format_t *)item->ri_buf[0].i_addr;
-
 	if (pass == XLOG_RECOVER_PASS1) {
 		/*
 		 * In this pass we're only looking for buf items
@@ -2285,8 +2277,7 @@ xlog_recover_do_buffer_trans(
 		XFS_BUF_STALE(bp);
 		error = xfs_bwrite(mp, bp);
 	} else {
-		ASSERT(bp->b_mount == NULL || bp->b_mount == mp);
-		bp->b_mount = mp;
+		ASSERT(bp->b_target->bt_mount == mp);
 		XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
 		xfs_bdwrite(mp, bp);
 	}
@@ -2319,10 +2310,9 @@ xlog_recover_do_inode_trans(
 	}
 
 	if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) {
-		in_f = (xfs_inode_log_format_t *)item->ri_buf[0].i_addr;
+		in_f = item->ri_buf[0].i_addr;
 	} else {
-		in_f = (xfs_inode_log_format_t *)kmem_alloc(
-			sizeof(xfs_inode_log_format_t), KM_SLEEP);
+		in_f = kmem_alloc(sizeof(xfs_inode_log_format_t), KM_SLEEP);
 		need_free = 1;
 		error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f);
 		if (error)
@@ -2370,7 +2360,7 @@ xlog_recover_do_inode_trans(
 		error = EFSCORRUPTED;
 		goto error;
 	}
-	dicp = (xfs_icdinode_t *)(item->ri_buf[1].i_addr);
+	dicp = item->ri_buf[1].i_addr;
 	if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) {
 		xfs_buf_relse(bp);
 		xfs_fs_cmn_err(CE_ALERT, mp,
@@ -2461,7 +2451,7 @@ xlog_recover_do_inode_trans(
 	}
 
 	/* The core is in in-core format */
-	xfs_dinode_to_disk(dip, (xfs_icdinode_t *)item->ri_buf[1].i_addr);
+	xfs_dinode_to_disk(dip, item->ri_buf[1].i_addr);
 
 	/* the rest is in on-disk format */
 	if (item->ri_buf[1].i_len > sizeof(struct xfs_icdinode)) {
@@ -2551,8 +2541,7 @@ xlog_recover_do_inode_trans(
 	}
 
 write_inode_buffer:
-	ASSERT(bp->b_mount == NULL || bp->b_mount == mp);
-	bp->b_mount = mp;
+	ASSERT(bp->b_target->bt_mount == mp);
 	XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
 	xfs_bdwrite(mp, bp);
 error:
@@ -2578,7 +2567,7 @@ xlog_recover_do_quotaoff_trans(
 		return (0);
 	}
 
-	qoff_f = (xfs_qoff_logformat_t *)item->ri_buf[0].i_addr;
+	qoff_f = item->ri_buf[0].i_addr;
 	ASSERT(qoff_f);
 
 	/*
@@ -2622,9 +2611,8 @@ xlog_recover_do_dquot_trans(
 	if (mp->m_qflags == 0)
 		return (0);
 
-	recddq = (xfs_disk_dquot_t *)item->ri_buf[1].i_addr;
-
-	if (item->ri_buf[1].i_addr == NULL) {
+	recddq = item->ri_buf[1].i_addr;
+	if (recddq == NULL) {
 		cmn_err(CE_ALERT,
 			"XFS: NULL dquot in %s.", __func__);
 		return XFS_ERROR(EIO);
@@ -2654,7 +2642,7 @@ xlog_recover_do_dquot_trans(
 	 * The other possibility, of course, is that the quota subsystem was
 	 * removed since the last mount - ENOSYS.
 	 */
-	dq_f = (xfs_dq_logformat_t *)item->ri_buf[0].i_addr;
+	dq_f = item->ri_buf[0].i_addr;
 	ASSERT(dq_f);
 	if ((error = xfs_qm_dqcheck(recddq,
 			   dq_f->qlf_id,
@@ -2690,8 +2678,7 @@ xlog_recover_do_dquot_trans(
 	memcpy(ddq, recddq, item->ri_buf[1].i_len);
 
 	ASSERT(dq_f->qlf_size == 2);
-	ASSERT(bp->b_mount == NULL || bp->b_mount == mp);
-	bp->b_mount = mp;
+	ASSERT(bp->b_target->bt_mount == mp);
 	XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
 	xfs_bdwrite(mp, bp);
 
@@ -2721,7 +2708,7 @@ xlog_recover_do_efi_trans(
 		return 0;
 	}
 
-	efi_formatp = (xfs_efi_log_format_t *)item->ri_buf[0].i_addr;
+	efi_formatp = item->ri_buf[0].i_addr;
 
 	mp = log->l_mp;
 	efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
@@ -2767,7 +2754,7 @@ xlog_recover_do_efd_trans(
 		return;
 	}
 
-	efd_formatp = (xfs_efd_log_format_t *)item->ri_buf[0].i_addr;
+	efd_formatp = item->ri_buf[0].i_addr;
 	ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) +
 		((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) ||
 	       (item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_64_t) +
@@ -3829,7 +3816,7 @@ xlog_do_recover(
 	XFS_BUF_READ(bp);
 	XFS_BUF_UNASYNC(bp);
 	xfsbdstrat(log->l_mp, bp);
-	error = xfs_iowait(bp);
+	error = xfs_buf_iowait(bp);
 	if (error) {
 		xfs_ioerror_alert("xlog_do_recover",
 				  log->l_mp, bp, XFS_BUF_ADDR(bp));
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 69f62d8b281..b1498ab5a39 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -25,13 +25,10 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_btree.h"
@@ -55,16 +52,11 @@ STATIC void	xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t,
 						int);
 STATIC void	xfs_icsb_balance_counter_locked(xfs_mount_t *, xfs_sb_field_t,
 						int);
-STATIC int	xfs_icsb_modify_counters(xfs_mount_t *, xfs_sb_field_t,
-						int64_t, int);
 STATIC void	xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t);
-
 #else
 
 #define xfs_icsb_balance_counter(mp, a, b)		do { } while (0)
 #define xfs_icsb_balance_counter_locked(mp, a, b)	do { } while (0)
-#define xfs_icsb_modify_counters(mp, a, b, c)		do { } while (0)
-
 #endif
 
 static const struct {
@@ -202,6 +194,8 @@ xfs_uuid_unmount(
 
 /*
  * Reference counting access wrappers to the perag structures.
+ * Because we never free per-ag structures, the only thing we
+ * have to protect against changes is the tree structure itself.
  */
 struct xfs_perag *
 xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno)
@@ -209,19 +203,43 @@ xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno)
 	struct xfs_perag	*pag;
 	int			ref = 0;
 
-	spin_lock(&mp->m_perag_lock);
+	rcu_read_lock();
 	pag = radix_tree_lookup(&mp->m_perag_tree, agno);
 	if (pag) {
 		ASSERT(atomic_read(&pag->pag_ref) >= 0);
-		/* catch leaks in the positive direction during testing */
-		ASSERT(atomic_read(&pag->pag_ref) < 1000);
 		ref = atomic_inc_return(&pag->pag_ref);
 	}
-	spin_unlock(&mp->m_perag_lock);
+	rcu_read_unlock();
 	trace_xfs_perag_get(mp, agno, ref, _RET_IP_);
 	return pag;
 }
 
+/*
+ * search from @first to find the next perag with the given tag set.
+ */
+struct xfs_perag *
+xfs_perag_get_tag(
+	struct xfs_mount	*mp,
+	xfs_agnumber_t		first,
+	int			tag)
+{
+	struct xfs_perag	*pag;
+	int			found;
+	int			ref;
+
+	rcu_read_lock();
+	found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
+					(void **)&pag, first, 1, tag);
+	if (found <= 0) {
+		rcu_read_unlock();
+		return NULL;
+	}
+	ref = atomic_inc_return(&pag->pag_ref);
+	rcu_read_unlock();
+	trace_xfs_perag_get_tag(mp, pag->pag_agno, ref, _RET_IP_);
+	return pag;
+}
+
 void
 xfs_perag_put(struct xfs_perag *pag)
 {
@@ -232,10 +250,18 @@ xfs_perag_put(struct xfs_perag *pag)
 	trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_);
 }
 
+STATIC void
+__xfs_free_perag(
+	struct rcu_head	*head)
+{
+	struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head);
+
+	ASSERT(atomic_read(&pag->pag_ref) == 0);
+	kmem_free(pag);
+}
+
 /*
- * Free up the resources associated with a mount structure.  Assume that
- * the structure was initially zeroed, so we can tell which fields got
- * initialized.
+ * Free up the per-ag resources associated with the mount structure.
  */
 STATIC void
 xfs_free_perag(
@@ -247,10 +273,9 @@ xfs_free_perag(
 	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
 		spin_lock(&mp->m_perag_lock);
 		pag = radix_tree_delete(&mp->m_perag_tree, agno);
-		ASSERT(pag);
-		ASSERT(atomic_read(&pag->pag_ref) == 0);
 		spin_unlock(&mp->m_perag_lock);
-		kmem_free(pag);
+		ASSERT(pag);
+		call_rcu(&pag->rcu_head, __xfs_free_perag);
 	}
 }
 
@@ -447,7 +472,10 @@ xfs_initialize_perag(
 		pag->pag_agno = index;
 		pag->pag_mount = mp;
 		rwlock_init(&pag->pag_ici_lock);
+		mutex_init(&pag->pag_ici_reclaim_lock);
 		INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
+		spin_lock_init(&pag->pag_buf_lock);
+		pag->pag_buf_tree = RB_ROOT;
 
 		if (radix_tree_preload(GFP_NOFS))
 			goto out_unwind;
@@ -642,7 +670,6 @@ int
 xfs_readsb(xfs_mount_t *mp, int flags)
 {
 	unsigned int	sector_size;
-	unsigned int	extra_flags;
 	xfs_buf_t	*bp;
 	int		error;
 
@@ -655,28 +682,24 @@ xfs_readsb(xfs_mount_t *mp, int flags)
 	 * access to the superblock.
 	 */
 	sector_size = xfs_getsize_buftarg(mp->m_ddev_targp);
-	extra_flags = XBF_LOCK | XBF_FS_MANAGED | XBF_MAPPED;
 
-	bp = xfs_buf_read(mp->m_ddev_targp, XFS_SB_DADDR, BTOBB(sector_size),
-			  extra_flags);
-	if (!bp || XFS_BUF_ISERROR(bp)) {
-		xfs_fs_mount_cmn_err(flags, "SB read failed");
-		error = bp ? XFS_BUF_GETERROR(bp) : ENOMEM;
-		goto fail;
+reread:
+	bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp,
+					XFS_SB_DADDR, sector_size, 0);
+	if (!bp) {
+		xfs_fs_mount_cmn_err(flags, "SB buffer read failed");
+		return EIO;
 	}
-	ASSERT(XFS_BUF_ISBUSY(bp));
-	ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
 
 	/*
 	 * Initialize the mount structure from the superblock.
 	 * But first do some basic consistency checking.
 	 */
 	xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp));
-
 	error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags);
 	if (error) {
 		xfs_fs_mount_cmn_err(flags, "SB validate failed");
-		goto fail;
+		goto release_buf;
 	}
 
 	/*
@@ -687,7 +710,7 @@ xfs_readsb(xfs_mount_t *mp, int flags)
 			"device supports only %u byte sectors (not %u)",
 			sector_size, mp->m_sb.sb_sectsize);
 		error = ENOSYS;
-		goto fail;
+		goto release_buf;
 	}
 
 	/*
@@ -695,33 +718,20 @@ xfs_readsb(xfs_mount_t *mp, int flags)
 	 * re-read the superblock so the buffer is correctly sized.
 	 */
 	if (sector_size < mp->m_sb.sb_sectsize) {
-		XFS_BUF_UNMANAGE(bp);
 		xfs_buf_relse(bp);
 		sector_size = mp->m_sb.sb_sectsize;
-		bp = xfs_buf_read(mp->m_ddev_targp, XFS_SB_DADDR,
-				  BTOBB(sector_size), extra_flags);
-		if (!bp || XFS_BUF_ISERROR(bp)) {
-			xfs_fs_mount_cmn_err(flags, "SB re-read failed");
-			error = bp ? XFS_BUF_GETERROR(bp) : ENOMEM;
-			goto fail;
-		}
-		ASSERT(XFS_BUF_ISBUSY(bp));
-		ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
+		goto reread;
 	}
 
 	/* Initialize per-cpu counters */
 	xfs_icsb_reinit_counters(mp);
 
 	mp->m_sb_bp = bp;
-	xfs_buf_relse(bp);
-	ASSERT(XFS_BUF_VALUSEMA(bp) > 0);
+	xfs_buf_unlock(bp);
 	return 0;
 
- fail:
-	if (bp) {
-		XFS_BUF_UNMANAGE(bp);
-		xfs_buf_relse(bp);
-	}
+release_buf:
+	xfs_buf_relse(bp);
 	return error;
 }
 
@@ -994,42 +1004,35 @@ xfs_check_sizes(xfs_mount_t *mp)
 {
 	xfs_buf_t	*bp;
 	xfs_daddr_t	d;
-	int		error;
 
 	d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
 	if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) {
-		cmn_err(CE_WARN, "XFS: size check 1 failed");
+		cmn_err(CE_WARN, "XFS: filesystem size mismatch detected");
 		return XFS_ERROR(EFBIG);
 	}
-	error = xfs_read_buf(mp, mp->m_ddev_targp,
-			     d - XFS_FSS_TO_BB(mp, 1),
-			     XFS_FSS_TO_BB(mp, 1), 0, &bp);
-	if (!error) {
-		xfs_buf_relse(bp);
-	} else {
-		cmn_err(CE_WARN, "XFS: size check 2 failed");
-		if (error == ENOSPC)
-			error = XFS_ERROR(EFBIG);
-		return error;
+	bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp,
+					d - XFS_FSS_TO_BB(mp, 1),
+					BBTOB(XFS_FSS_TO_BB(mp, 1)), 0);
+	if (!bp) {
+		cmn_err(CE_WARN, "XFS: last sector read failed");
+		return EIO;
 	}
+	xfs_buf_relse(bp);
 
 	if (mp->m_logdev_targp != mp->m_ddev_targp) {
 		d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
 		if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) {
-			cmn_err(CE_WARN, "XFS: size check 3 failed");
+			cmn_err(CE_WARN, "XFS: log size mismatch detected");
 			return XFS_ERROR(EFBIG);
 		}
-		error = xfs_read_buf(mp, mp->m_logdev_targp,
-				     d - XFS_FSB_TO_BB(mp, 1),
-				     XFS_FSB_TO_BB(mp, 1), 0, &bp);
-		if (!error) {
-			xfs_buf_relse(bp);
-		} else {
-			cmn_err(CE_WARN, "XFS: size check 3 failed");
-			if (error == ENOSPC)
-				error = XFS_ERROR(EFBIG);
-			return error;
+		bp = xfs_buf_read_uncached(mp, mp->m_logdev_targp,
+					d - XFS_FSB_TO_BB(mp, 1),
+					XFS_FSB_TO_B(mp, 1), 0);
+		if (!bp) {
+			cmn_err(CE_WARN, "XFS: log device read failed");
+			return EIO;
 		}
+		xfs_buf_relse(bp);
 	}
 	return 0;
 }
@@ -1604,7 +1607,7 @@ xfs_unmountfs_writesb(xfs_mount_t *mp)
 		XFS_BUF_UNASYNC(sbp);
 		ASSERT(XFS_BUF_TARGET(sbp) == mp->m_ddev_targp);
 		xfsbdstrat(mp, sbp);
-		error = xfs_iowait(sbp);
+		error = xfs_buf_iowait(sbp);
 		if (error)
 			xfs_ioerror_alert("xfs_unmountfs_writesb",
 					  mp, sbp, XFS_BUF_ADDR(sbp));
@@ -1835,135 +1838,72 @@ xfs_mod_incore_sb_unlocked(
  */
 int
 xfs_mod_incore_sb(
-	xfs_mount_t	*mp,
-	xfs_sb_field_t	field,
-	int64_t		delta,
-	int		rsvd)
+	struct xfs_mount	*mp,
+	xfs_sb_field_t		field,
+	int64_t			delta,
+	int			rsvd)
 {
-	int	status;
+	int			status;
 
-	/* check for per-cpu counters */
-	switch (field) {
 #ifdef HAVE_PERCPU_SB
-	case XFS_SBS_ICOUNT:
-	case XFS_SBS_IFREE:
-	case XFS_SBS_FDBLOCKS:
-		if (!(mp->m_flags & XFS_MOUNT_NO_PERCPU_SB)) {
-			status = xfs_icsb_modify_counters(mp, field,
-							delta, rsvd);
-			break;
-		}
-		/* FALLTHROUGH */
+	ASSERT(field < XFS_SBS_ICOUNT || field > XFS_SBS_FDBLOCKS);
 #endif
-	default:
-		spin_lock(&mp->m_sb_lock);
-		status = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd);
-		spin_unlock(&mp->m_sb_lock);
-		break;
-	}
+	spin_lock(&mp->m_sb_lock);
+	status = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd);
+	spin_unlock(&mp->m_sb_lock);
 
 	return status;
 }
 
 /*
- * xfs_mod_incore_sb_batch() is used to change more than one field
- * in the in-core superblock structure at a time.  This modification
- * is protected by a lock internal to this module.  The fields and
- * changes to those fields are specified in the array of xfs_mod_sb
- * structures passed in.
+ * Change more than one field in the in-core superblock structure at a time.
  *
- * Either all of the specified deltas will be applied or none of
- * them will.  If any modified field dips below 0, then all modifications
- * will be backed out and EINVAL will be returned.
+ * The fields and changes to those fields are specified in the array of
+ * xfs_mod_sb structures passed in.  Either all of the specified deltas
+ * will be applied or none of them will.  If any modified field dips below 0,
+ * then all modifications will be backed out and EINVAL will be returned.
+ *
+ * Note that this function may not be used for the superblock values that
+ * are tracked with the in-memory per-cpu counters - a direct call to
+ * xfs_icsb_modify_counters is required for these.
  */
 int
-xfs_mod_incore_sb_batch(xfs_mount_t *mp, xfs_mod_sb_t *msb, uint nmsb, int rsvd)
+xfs_mod_incore_sb_batch(
+	struct xfs_mount	*mp,
+	xfs_mod_sb_t		*msb,
+	uint			nmsb,
+	int			rsvd)
 {
-	int		status=0;
-	xfs_mod_sb_t	*msbp;
+	xfs_mod_sb_t		*msbp = &msb[0];
+	int			error = 0;
 
 	/*
-	 * Loop through the array of mod structures and apply each
-	 * individually.  If any fail, then back out all those
-	 * which have already been applied.  Do all of this within
-	 * the scope of the m_sb_lock so that all of the changes will
-	 * be atomic.
+	 * Loop through the array of mod structures and apply each individually.
+	 * If any fail, then back out all those which have already been applied.
+	 * Do all of this within the scope of the m_sb_lock so that all of the
+	 * changes will be atomic.
 	 */
 	spin_lock(&mp->m_sb_lock);
-	msbp = &msb[0];
 	for (msbp = &msbp[0]; msbp < (msb + nmsb); msbp++) {
-		/*
-		 * Apply the delta at index n.  If it fails, break
-		 * from the loop so we'll fall into the undo loop
-		 * below.
-		 */
-		switch (msbp->msb_field) {
-#ifdef HAVE_PERCPU_SB
-		case XFS_SBS_ICOUNT:
-		case XFS_SBS_IFREE:
-		case XFS_SBS_FDBLOCKS:
-			if (!(mp->m_flags & XFS_MOUNT_NO_PERCPU_SB)) {
-				spin_unlock(&mp->m_sb_lock);
-				status = xfs_icsb_modify_counters(mp,
-							msbp->msb_field,
-							msbp->msb_delta, rsvd);
-				spin_lock(&mp->m_sb_lock);
-				break;
-			}
-			/* FALLTHROUGH */
-#endif
-		default:
-			status = xfs_mod_incore_sb_unlocked(mp,
-						msbp->msb_field,
-						msbp->msb_delta, rsvd);
-			break;
-		}
+		ASSERT(msbp->msb_field < XFS_SBS_ICOUNT ||
+		       msbp->msb_field > XFS_SBS_FDBLOCKS);
 
-		if (status != 0) {
-			break;
-		}
+		error = xfs_mod_incore_sb_unlocked(mp, msbp->msb_field,
+						   msbp->msb_delta, rsvd);
+		if (error)
+			goto unwind;
 	}
+	spin_unlock(&mp->m_sb_lock);
+	return 0;
 
-	/*
-	 * If we didn't complete the loop above, then back out
-	 * any changes made to the superblock.  If you add code
-	 * between the loop above and here, make sure that you
-	 * preserve the value of status. Loop back until
-	 * we step below the beginning of the array.  Make sure
-	 * we don't touch anything back there.
-	 */
-	if (status != 0) {
-		msbp--;
-		while (msbp >= msb) {
-			switch (msbp->msb_field) {
-#ifdef HAVE_PERCPU_SB
-			case XFS_SBS_ICOUNT:
-			case XFS_SBS_IFREE:
-			case XFS_SBS_FDBLOCKS:
-				if (!(mp->m_flags & XFS_MOUNT_NO_PERCPU_SB)) {
-					spin_unlock(&mp->m_sb_lock);
-					status = xfs_icsb_modify_counters(mp,
-							msbp->msb_field,
-							-(msbp->msb_delta),
-							rsvd);
-					spin_lock(&mp->m_sb_lock);
-					break;
-				}
-				/* FALLTHROUGH */
-#endif
-			default:
-				status = xfs_mod_incore_sb_unlocked(mp,
-							msbp->msb_field,
-							-(msbp->msb_delta),
-							rsvd);
-				break;
-			}
-			ASSERT(status == 0);
-			msbp--;
-		}
+unwind:
+	while (--msbp >= msb) {
+		error = xfs_mod_incore_sb_unlocked(mp, msbp->msb_field,
+						   -msbp->msb_delta, rsvd);
+		ASSERT(error == 0);
 	}
 	spin_unlock(&mp->m_sb_lock);
-	return status;
+	return error;
 }
 
 /*
@@ -2001,18 +1941,13 @@ xfs_getsb(
  */
 void
 xfs_freesb(
-	xfs_mount_t	*mp)
+	struct xfs_mount	*mp)
 {
-	xfs_buf_t	*bp;
+	struct xfs_buf		*bp = mp->m_sb_bp;
 
-	/*
-	 * Use xfs_getsb() so that the buffer will be locked
-	 * when we call xfs_buf_relse().
-	 */
-	bp = xfs_getsb(mp, 0);
-	XFS_BUF_UNMANAGE(bp);
-	xfs_buf_relse(bp);
+	xfs_buf_lock(bp);
 	mp->m_sb_bp = NULL;
+	xfs_buf_relse(bp);
 }
 
 /*
@@ -2499,7 +2434,7 @@ xfs_icsb_balance_counter(
 	spin_unlock(&mp->m_sb_lock);
 }
 
-STATIC int
+int
 xfs_icsb_modify_counters(
 	xfs_mount_t	*mp,
 	xfs_sb_field_t	field,
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 5761087ee8e..5861b498074 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -53,7 +53,6 @@ typedef struct xfs_trans_reservations {
 
 #include "xfs_sync.h"
 
-struct cred;
 struct log;
 struct xfs_mount_args;
 struct xfs_inode;
@@ -66,65 +65,6 @@ struct xfs_nameops;
 struct xfs_ail;
 struct xfs_quotainfo;
 
-
-/*
- * Prototypes and functions for the Data Migration subsystem.
- */
-
-typedef int	(*xfs_send_data_t)(int, struct xfs_inode *,
-			xfs_off_t, size_t, int, int *);
-typedef int	(*xfs_send_mmap_t)(struct vm_area_struct *, uint);
-typedef int	(*xfs_send_destroy_t)(struct xfs_inode *, dm_right_t);
-typedef int	(*xfs_send_namesp_t)(dm_eventtype_t, struct xfs_mount *,
-			struct xfs_inode *, dm_right_t,
-			struct xfs_inode *, dm_right_t,
-			const unsigned char *, const unsigned char *,
-			mode_t, int, int);
-typedef int	(*xfs_send_mount_t)(struct xfs_mount *, dm_right_t,
-			char *, char *);
-typedef void	(*xfs_send_unmount_t)(struct xfs_mount *, struct xfs_inode *,
-			dm_right_t, mode_t, int, int);
-
-typedef struct xfs_dmops {
-	xfs_send_data_t		xfs_send_data;
-	xfs_send_mmap_t		xfs_send_mmap;
-	xfs_send_destroy_t	xfs_send_destroy;
-	xfs_send_namesp_t	xfs_send_namesp;
-	xfs_send_mount_t	xfs_send_mount;
-	xfs_send_unmount_t	xfs_send_unmount;
-} xfs_dmops_t;
-
-#define XFS_DMAPI_UNMOUNT_FLAGS(mp) \
-	(((mp)->m_dmevmask & (1 << DM_EVENT_UNMOUNT)) ? 0 : DM_FLAGS_UNWANTED)
-
-#define XFS_SEND_DATA(mp, ev,ip,off,len,fl,lock) \
-	(*(mp)->m_dm_ops->xfs_send_data)(ev,ip,off,len,fl,lock)
-#define XFS_SEND_MMAP(mp, vma,fl) \
-	(*(mp)->m_dm_ops->xfs_send_mmap)(vma,fl)
-#define XFS_SEND_DESTROY(mp, ip,right) \
-	(*(mp)->m_dm_ops->xfs_send_destroy)(ip,right)
-#define XFS_SEND_NAMESP(mp, ev,b1,r1,b2,r2,n1,n2,mode,rval,fl) \
-	(*(mp)->m_dm_ops->xfs_send_namesp)(ev,NULL,b1,r1,b2,r2,n1,n2,mode,rval,fl)
-#define XFS_SEND_MOUNT(mp,right,path,name) \
-	(*(mp)->m_dm_ops->xfs_send_mount)(mp,right,path,name)
-#define XFS_SEND_PREUNMOUNT(mp) \
-do { \
-	if (mp->m_flags & XFS_MOUNT_DMAPI) { \
-		(*(mp)->m_dm_ops->xfs_send_namesp)(DM_EVENT_PREUNMOUNT, mp, \
-			(mp)->m_rootip, DM_RIGHT_NULL, \
-			(mp)->m_rootip, DM_RIGHT_NULL, \
-			NULL, NULL, 0, 0, XFS_DMAPI_UNMOUNT_FLAGS(mp)); \
-	} \
-} while (0)
-#define XFS_SEND_UNMOUNT(mp) \
-do { \
-	if (mp->m_flags & XFS_MOUNT_DMAPI) { \
-		(*(mp)->m_dm_ops->xfs_send_unmount)(mp, (mp)->m_rootip, \
-			DM_RIGHT_NULL, 0, 0, XFS_DMAPI_UNMOUNT_FLAGS(mp)); \
-	} \
-} while (0)
-
-
 #ifdef HAVE_PERCPU_SB
 
 /*
@@ -150,6 +90,8 @@ extern void	xfs_icsb_reinit_counters(struct xfs_mount *);
 extern void	xfs_icsb_destroy_counters(struct xfs_mount *);
 extern void	xfs_icsb_sync_counters(struct xfs_mount *, int);
 extern void	xfs_icsb_sync_counters_locked(struct xfs_mount *, int);
+extern int	xfs_icsb_modify_counters(struct xfs_mount *, xfs_sb_field_t,
+						int64_t, int);
 
 #else
 #define xfs_icsb_init_counters(mp)		(0)
@@ -157,6 +99,8 @@ extern void	xfs_icsb_sync_counters_locked(struct xfs_mount *, int);
 #define xfs_icsb_reinit_counters(mp)		do { } while (0)
 #define xfs_icsb_sync_counters(mp, flags)	do { } while (0)
 #define xfs_icsb_sync_counters_locked(mp, flags) do { } while (0)
+#define xfs_icsb_modify_counters(mp, field, delta, rsvd) \
+	xfs_mod_incore_sb(mp, field, delta, rsvd)
 #endif
 
 typedef struct xfs_mount {
@@ -241,8 +185,6 @@ typedef struct xfs_mount {
 	uint			m_chsize;	/* size of next field */
 	struct xfs_chash	*m_chash;	/* fs private inode per-cluster
 						 * hash table */
-	struct xfs_dmops	*m_dm_ops;	/* vector of DMI ops */
-	struct xfs_qmops	*m_qm_ops;	/* vector of XQM ops */
 	atomic_t		m_active_trans;	/* number trans frozen */
 #ifdef HAVE_PERCPU_SB
 	xfs_icsb_cnts_t __percpu *m_sb_cnts;	/* per-cpu superblock counters */
@@ -269,7 +211,6 @@ typedef struct xfs_mount {
 						   must be synchronous except
 						   for space allocations */
 #define XFS_MOUNT_DELAYLOG	(1ULL << 1)	/* delayed logging is enabled */
-#define XFS_MOUNT_DMAPI		(1ULL << 2)	/* dmapi is enabled */
 #define XFS_MOUNT_WAS_CLEAN	(1ULL << 3)
 #define XFS_MOUNT_FS_SHUTDOWN	(1ULL << 4)	/* atomic stop of all filesystem
 						   operations, typically for
@@ -282,8 +223,6 @@ typedef struct xfs_mount {
 #define XFS_MOUNT_GRPID		(1ULL << 9)	/* group-ID assigned from directory */
 #define XFS_MOUNT_NORECOVERY	(1ULL << 10)	/* no recovery - dirty fs */
 #define XFS_MOUNT_DFLT_IOSIZE	(1ULL << 12)	/* set default i/o size */
-#define XFS_MOUNT_OSYNCISOSYNC	(1ULL << 13)	/* o_sync is REALLY o_sync */
-						/* osyncisdsync is now default*/
 #define XFS_MOUNT_32BITINODES	(1ULL << 14)	/* do not create inodes above
 						 * 32 bits in size */
 #define XFS_MOUNT_SMALL_INUMS	(1ULL << 15)	/* users wants 32bit inodes */
@@ -296,8 +235,6 @@ typedef struct xfs_mount {
 #define XFS_MOUNT_DIRSYNC	(1ULL << 21)	/* synchronous directory ops */
 #define XFS_MOUNT_COMPAT_IOSIZE	(1ULL << 22)	/* don't report large preferred
 						 * I/O size in stat() */
-#define XFS_MOUNT_NO_PERCPU_SB	(1ULL << 23)	/* don't use per-cpu superblock
-						   counters */
 #define XFS_MOUNT_FILESTREAMS	(1ULL << 24)	/* enable the filestreams
 						   allocator */
 #define XFS_MOUNT_NOATTR2	(1ULL << 25)	/* disable use of attr2 format */
@@ -391,6 +328,8 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
  * perag get/put wrappers for ref counting
  */
 struct xfs_perag *xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno);
+struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *mp, xfs_agnumber_t agno,
+					int tag);
 void	xfs_perag_put(struct xfs_perag *pag);
 
 /*
@@ -440,11 +379,6 @@ extern int	xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t);
 
 extern int	xfs_dev_is_read_only(struct xfs_mount *, char *);
 
-extern int	xfs_dmops_get(struct xfs_mount *);
-extern void	xfs_dmops_put(struct xfs_mount *);
-
-extern struct xfs_dmops xfs_dmcore_xfs;
-
 #endif	/* __KERNEL__ */
 
 extern void	xfs_mod_sb(struct xfs_trans *, __int64_t);
diff --git a/fs/xfs/xfs_refcache.h b/fs/xfs/xfs_refcache.h
deleted file mode 100644
index 2dec79edb51..00000000000
--- a/fs/xfs/xfs_refcache.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_REFCACHE_H__
-#define __XFS_REFCACHE_H__
-
-#ifdef HAVE_REFCACHE
-/*
- * Maximum size (in inodes) for the NFS reference cache
- */
-#define XFS_REFCACHE_SIZE_MAX	512
-
-struct xfs_inode;
-struct xfs_mount;
-
-extern void xfs_refcache_insert(struct xfs_inode *);
-extern void xfs_refcache_purge_ip(struct xfs_inode *);
-extern void xfs_refcache_purge_mp(struct xfs_mount *);
-extern void xfs_refcache_purge_some(struct xfs_mount *);
-extern void xfs_refcache_resize(int);
-extern void xfs_refcache_destroy(void);
-
-extern void xfs_refcache_iunlock(struct xfs_inode *, uint);
-
-#else
-
-#define xfs_refcache_insert(ip)		do { } while (0)
-#define xfs_refcache_purge_ip(ip)	do { } while (0)
-#define xfs_refcache_purge_mp(mp)	do { } while (0)
-#define xfs_refcache_purge_some(mp)	do { } while (0)
-#define xfs_refcache_resize(size)	do { } while (0)
-#define xfs_refcache_destroy()		do { } while (0)
-
-#define xfs_refcache_iunlock(ip, flags)	xfs_iunlock(ip, flags)
-
-#endif
-
-#endif	/* __XFS_REFCACHE_H__ */
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index fc1cda23b81..d2af0a8381a 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -24,12 +24,9 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
@@ -116,20 +113,7 @@ xfs_rename(
 	int		spaceres;
 	int		num_inodes;
 
-	xfs_itrace_entry(src_dp);
-	xfs_itrace_entry(target_dp);
-
-	if (DM_EVENT_ENABLED(src_dp, DM_EVENT_RENAME) ||
-	    DM_EVENT_ENABLED(target_dp, DM_EVENT_RENAME)) {
-		error = XFS_SEND_NAMESP(mp, DM_EVENT_RENAME,
-					src_dp, DM_RIGHT_NULL,
-					target_dp, DM_RIGHT_NULL,
-					src_name->name, target_name->name,
-					0, 0, 0);
-		if (error)
-			return error;
-	}
-	/* Return through std_return after this point. */
+	trace_xfs_rename(src_dp, target_dp, src_name, target_name);
 
 	new_parent = (src_dp != target_dp);
 	src_is_directory = ((src_ip->i_d.di_mode & S_IFMT) == S_IFDIR);
@@ -184,26 +168,14 @@ xfs_rename(
 	/*
 	 * Join all the inodes to the transaction. From this point on,
 	 * we can rely on either trans_commit or trans_cancel to unlock
-	 * them.  Note that we need to add a vnode reference to the
-	 * directories since trans_commit & trans_cancel will decrement
-	 * them when they unlock the inodes.  Also, we need to be careful
-	 * not to add an inode to the transaction more than once.
+	 * them.
 	 */
-	IHOLD(src_dp);
-	xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL);
-
-	if (new_parent) {
-		IHOLD(target_dp);
-		xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL);
-	}
-
-	IHOLD(src_ip);
-	xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL);
-
-	if (target_ip) {
-		IHOLD(target_ip);
-		xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL);
-	}
+	xfs_trans_ijoin_ref(tp, src_dp, XFS_ILOCK_EXCL);
+	if (new_parent)
+		xfs_trans_ijoin_ref(tp, target_dp, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin_ref(tp, src_ip, XFS_ILOCK_EXCL);
+	if (target_ip)
+		xfs_trans_ijoin_ref(tp, target_ip, XFS_ILOCK_EXCL);
 
 	/*
 	 * If we are using project inheritance, we only allow renames
@@ -211,7 +183,7 @@ xfs_rename(
 	 * tree quota mechanism would be circumvented.
 	 */
 	if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
-		     (target_dp->i_d.di_projid != src_ip->i_d.di_projid))) {
+		     (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) {
 		error = XFS_ERROR(EXDEV);
 		goto error_return;
 	}
@@ -239,7 +211,9 @@ xfs_rename(
 			goto error_return;
 		if (error)
 			goto abort_return;
-		xfs_ichgtime(target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+
+		xfs_trans_ichgtime(tp, target_dp,
+					XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 
 		if (new_parent && src_is_directory) {
 			error = xfs_bumplink(tp, target_dp);
@@ -277,7 +251,9 @@ xfs_rename(
 					&first_block, &free_list, spaceres);
 		if (error)
 			goto abort_return;
-		xfs_ichgtime(target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+
+		xfs_trans_ichgtime(tp, target_dp,
+					XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 
 		/*
 		 * Decrement the link count on the target since the target
@@ -320,7 +296,7 @@ xfs_rename(
 	 * inode isn't really being changed, but old unix file systems did
 	 * it and some incremental backup programs won't work without it.
 	 */
-	xfs_ichgtime(src_ip, XFS_ICHGTIME_CHG);
+	xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG);
 
 	/*
 	 * Adjust the link count on src_dp.  This is necessary when
@@ -343,7 +319,7 @@ xfs_rename(
 	if (error)
 		goto abort_return;
 
-	xfs_ichgtime(src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+	xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 	xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
 	if (new_parent)
 		xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
@@ -369,26 +345,13 @@ xfs_rename(
 	 * trans_commit will unlock src_ip, target_ip & decrement
 	 * the vnode references.
 	 */
-	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-
-	/* Fall through to std_return with error = 0 or errno from
-	 * xfs_trans_commit	 */
-std_return:
-	if (DM_EVENT_ENABLED(src_dp, DM_EVENT_POSTRENAME) ||
-	    DM_EVENT_ENABLED(target_dp, DM_EVENT_POSTRENAME)) {
-		(void) XFS_SEND_NAMESP (mp, DM_EVENT_POSTRENAME,
-					src_dp, DM_RIGHT_NULL,
-					target_dp, DM_RIGHT_NULL,
-					src_name->name, target_name->name,
-					0, error, 0);
-	}
-	return error;
+	return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
 
  abort_return:
 	cancel_flags |= XFS_TRANS_ABORT;
-	/* FALLTHROUGH */
  error_return:
 	xfs_bmap_cancel(&free_list);
 	xfs_trans_cancel(tp, cancel_flags);
-	goto std_return;
+ std_return:
+	return error;
 }
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index a2d32ce335a..12a19138531 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -25,17 +25,10 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
-#include "xfs_btree.h"
-#include "xfs_ialloc.h"
 #include "xfs_alloc.h"
 #include "xfs_bmap.h"
 #include "xfs_rtalloc.h"
@@ -46,6 +39,7 @@
 #include "xfs_trans_space.h"
 #include "xfs_utils.h"
 #include "xfs_trace.h"
+#include "xfs_buf.h"
 
 
 /*
@@ -129,7 +123,7 @@ xfs_growfs_rt_alloc(
 		cancelflags |= XFS_TRANS_ABORT;
 		error = xfs_bmapi(tp, ip, oblocks, nblocks - oblocks,
 			XFS_BMAPI_WRITE | XFS_BMAPI_METADATA, &firstblock,
-			resblks, &map, &nmap, &flist, NULL);
+			resblks, &map, &nmap, &flist);
 		if (!error && nmap < 1)
 			error = XFS_ERROR(ENOSPC);
 		if (error)
@@ -1890,13 +1884,13 @@ xfs_growfs_rt(
 	/*
 	 * Read in the last block of the device, make sure it exists.
 	 */
-	error = xfs_read_buf(mp, mp->m_rtdev_targp,
-			XFS_FSB_TO_BB(mp, nrblocks - 1),
-			XFS_FSB_TO_BB(mp, 1), 0, &bp);
-	if (error)
-		return error;
-	ASSERT(bp);
+	bp = xfs_buf_read_uncached(mp, mp->m_rtdev_targp,
+				XFS_FSB_TO_BB(mp, nrblocks - 1),
+				XFS_FSB_TO_B(mp, 1), 0);
+	if (!bp)
+		return EIO;
 	xfs_buf_relse(bp);
+
 	/*
 	 * Calculate new parameters.  These are the final values to be reached.
 	 */
@@ -2222,7 +2216,6 @@ xfs_rtmount_init(
 {
 	xfs_buf_t	*bp;	/* buffer for last block of subvolume */
 	xfs_daddr_t	d;	/* address of last block of subvolume */
-	int		error;	/* error return value */
 	xfs_sb_t	*sbp;	/* filesystem superblock copy in mount */
 
 	sbp = &mp->m_sb;
@@ -2249,15 +2242,12 @@ xfs_rtmount_init(
 			(unsigned long long) mp->m_sb.sb_rblocks);
 		return XFS_ERROR(EFBIG);
 	}
-	error = xfs_read_buf(mp, mp->m_rtdev_targp,
-				d - XFS_FSB_TO_BB(mp, 1),
-				XFS_FSB_TO_BB(mp, 1), 0, &bp);
-	if (error) {
-		cmn_err(CE_WARN,
-	"XFS: realtime mount -- xfs_read_buf failed, returned %d", error);
-		if (error == ENOSPC)
-			return XFS_ERROR(EFBIG);
-		return error;
+	bp = xfs_buf_read_uncached(mp, mp->m_rtdev_targp,
+					d - XFS_FSB_TO_BB(mp, 1),
+					XFS_FSB_TO_B(mp, 1), 0);
+	if (!bp) {
+		cmn_err(CE_WARN, "XFS: realtime device size check failed");
+		return EIO;
 	}
 	xfs_buf_relse(bp);
 	return 0;
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index e336742a58a..56861d5daae 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -24,27 +24,12 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
-#include "xfs_inode_item.h"
-#include "xfs_itable.h"
-#include "xfs_btree.h"
-#include "xfs_alloc.h"
-#include "xfs_ialloc.h"
-#include "xfs_attr.h"
-#include "xfs_bmap.h"
 #include "xfs_error.h"
-#include "xfs_buf_item.h"
 #include "xfs_rw.h"
-#include "xfs_trace.h"
 
 /*
  * Force a shutdown of the filesystem instantly while keeping
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h
index 1b017c65749..1eb2ba58681 100644
--- a/fs/xfs/xfs_sb.h
+++ b/fs/xfs/xfs_sb.h
@@ -80,10 +80,12 @@ struct xfs_mount;
 #define XFS_SB_VERSION2_RESERVED4BIT	0x00000004
 #define XFS_SB_VERSION2_ATTR2BIT	0x00000008	/* Inline attr rework */
 #define XFS_SB_VERSION2_PARENTBIT	0x00000010	/* parent pointers */
+#define XFS_SB_VERSION2_PROJID32BIT	0x00000080	/* 32 bit project id */
 
 #define	XFS_SB_VERSION2_OKREALFBITS	\
 	(XFS_SB_VERSION2_LAZYSBCOUNTBIT	| \
-	 XFS_SB_VERSION2_ATTR2BIT)
+	 XFS_SB_VERSION2_ATTR2BIT	| \
+	 XFS_SB_VERSION2_PROJID32BIT)
 #define	XFS_SB_VERSION2_OKSASHFBITS	\
 	(0)
 #define XFS_SB_VERSION2_OKREALBITS	\
@@ -495,6 +497,12 @@ static inline void xfs_sb_version_removeattr2(xfs_sb_t *sbp)
 		sbp->sb_versionnum &= ~XFS_SB_VERSION_MOREBITSBIT;
 }
 
+static inline int xfs_sb_version_hasprojid32bit(xfs_sb_t *sbp)
+{
+	return xfs_sb_version_hasmorebits(sbp) &&
+		(sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT);
+}
+
 /*
  * end of superblock version macros
  */
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 28547dfce03..f6d956b7711 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * Copyright (C) 2010 Red Hat, Inc.
  * All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or
@@ -24,16 +25,12 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_error.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_btree.h"
@@ -47,6 +44,7 @@
 #include "xfs_trace.h"
 
 kmem_zone_t	*xfs_trans_zone;
+kmem_zone_t	*xfs_log_item_desc_zone;
 
 
 /*
@@ -597,8 +595,7 @@ _xfs_trans_alloc(
 	tp->t_magic = XFS_TRANS_MAGIC;
 	tp->t_type = type;
 	tp->t_mountp = mp;
-	tp->t_items_free = XFS_LIC_NUM_SLOTS;
-	xfs_lic_init(&(tp->t_items));
+	INIT_LIST_HEAD(&tp->t_items);
 	INIT_LIST_HEAD(&tp->t_busy);
 	return tp;
 }
@@ -643,8 +640,7 @@ xfs_trans_dup(
 	ntp->t_magic = XFS_TRANS_MAGIC;
 	ntp->t_type = tp->t_type;
 	ntp->t_mountp = tp->t_mountp;
-	ntp->t_items_free = XFS_LIC_NUM_SLOTS;
-	xfs_lic_init(&(ntp->t_items));
+	INIT_LIST_HEAD(&ntp->t_items);
 	INIT_LIST_HEAD(&ntp->t_busy);
 
 	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
@@ -700,7 +696,7 @@ xfs_trans_reserve(
 	 * fail if the count would go below zero.
 	 */
 	if (blocks > 0) {
-		error = xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FDBLOCKS,
+		error = xfs_icsb_modify_counters(tp->t_mountp, XFS_SBS_FDBLOCKS,
 					  -((int64_t)blocks), rsvd);
 		if (error != 0) {
 			current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
@@ -771,7 +767,7 @@ undo_log:
 
 undo_blocks:
 	if (blocks > 0) {
-		(void) xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FDBLOCKS,
+		xfs_icsb_modify_counters(tp->t_mountp, XFS_SBS_FDBLOCKS,
 					 (int64_t)blocks, rsvd);
 		tp->t_blk_res = 0;
 	}
@@ -1013,7 +1009,7 @@ void
 xfs_trans_unreserve_and_mod_sb(
 	xfs_trans_t	*tp)
 {
-	xfs_mod_sb_t	msb[14];	/* If you add cases, add entries */
+	xfs_mod_sb_t	msb[9];	/* If you add cases, add entries */
 	xfs_mod_sb_t	*msbp;
 	xfs_mount_t	*mp = tp->t_mountp;
 	/* REFERENCED */
@@ -1021,55 +1017,61 @@ xfs_trans_unreserve_and_mod_sb(
 	int		rsvd;
 	int64_t		blkdelta = 0;
 	int64_t		rtxdelta = 0;
+	int64_t		idelta = 0;
+	int64_t		ifreedelta = 0;
 
 	msbp = msb;
 	rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
 
-	/* calculate free blocks delta */
+	/* calculate deltas */
 	if (tp->t_blk_res > 0)
 		blkdelta = tp->t_blk_res;
-
 	if ((tp->t_fdblocks_delta != 0) &&
 	    (xfs_sb_version_haslazysbcount(&mp->m_sb) ||
 	     (tp->t_flags & XFS_TRANS_SB_DIRTY)))
 	        blkdelta += tp->t_fdblocks_delta;
 
-	if (blkdelta != 0) {
-		msbp->msb_field = XFS_SBS_FDBLOCKS;
-		msbp->msb_delta = blkdelta;
-		msbp++;
-	}
-
-	/* calculate free realtime extents delta */
 	if (tp->t_rtx_res > 0)
 		rtxdelta = tp->t_rtx_res;
-
 	if ((tp->t_frextents_delta != 0) &&
 	    (tp->t_flags & XFS_TRANS_SB_DIRTY))
 		rtxdelta += tp->t_frextents_delta;
 
+	if (xfs_sb_version_haslazysbcount(&mp->m_sb) ||
+	     (tp->t_flags & XFS_TRANS_SB_DIRTY)) {
+		idelta = tp->t_icount_delta;
+		ifreedelta = tp->t_ifree_delta;
+	}
+
+	/* apply the per-cpu counters */
+	if (blkdelta) {
+		error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
+						 blkdelta, rsvd);
+		if (error)
+			goto out;
+	}
+
+	if (idelta) {
+		error = xfs_icsb_modify_counters(mp, XFS_SBS_ICOUNT,
+						 idelta, rsvd);
+		if (error)
+			goto out_undo_fdblocks;
+	}
+
+	if (ifreedelta) {
+		error = xfs_icsb_modify_counters(mp, XFS_SBS_IFREE,
+						 ifreedelta, rsvd);
+		if (error)
+			goto out_undo_icount;
+	}
+
+	/* apply remaining deltas */
 	if (rtxdelta != 0) {
 		msbp->msb_field = XFS_SBS_FREXTENTS;
 		msbp->msb_delta = rtxdelta;
 		msbp++;
 	}
 
-	/* apply remaining deltas */
-
-	if (xfs_sb_version_haslazysbcount(&mp->m_sb) ||
-	     (tp->t_flags & XFS_TRANS_SB_DIRTY)) {
-		if (tp->t_icount_delta != 0) {
-			msbp->msb_field = XFS_SBS_ICOUNT;
-			msbp->msb_delta = tp->t_icount_delta;
-			msbp++;
-		}
-		if (tp->t_ifree_delta != 0) {
-			msbp->msb_field = XFS_SBS_IFREE;
-			msbp->msb_delta = tp->t_ifree_delta;
-			msbp++;
-		}
-	}
-
 	if (tp->t_flags & XFS_TRANS_SB_DIRTY) {
 		if (tp->t_dblocks_delta != 0) {
 			msbp->msb_field = XFS_SBS_DBLOCKS;
@@ -1119,7 +1121,125 @@ xfs_trans_unreserve_and_mod_sb(
 	if (msbp > msb) {
 		error = xfs_mod_incore_sb_batch(tp->t_mountp, msb,
 			(uint)(msbp - msb), rsvd);
-		ASSERT(error == 0);
+		if (error)
+			goto out_undo_ifreecount;
+	}
+
+	return;
+
+out_undo_ifreecount:
+	if (ifreedelta)
+		xfs_icsb_modify_counters(mp, XFS_SBS_IFREE, -ifreedelta, rsvd);
+out_undo_icount:
+	if (idelta)
+		xfs_icsb_modify_counters(mp, XFS_SBS_ICOUNT, -idelta, rsvd);
+out_undo_fdblocks:
+	if (blkdelta)
+		xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, -blkdelta, rsvd);
+out:
+	ASSERT(error = 0);
+	return;
+}
+
+/*
+ * Add the given log item to the transaction's list of log items.
+ *
+ * The log item will now point to its new descriptor with its li_desc field.
+ */
+void
+xfs_trans_add_item(
+	struct xfs_trans	*tp,
+	struct xfs_log_item	*lip)
+{
+	struct xfs_log_item_desc *lidp;
+
+	ASSERT(lip->li_mountp = tp->t_mountp);
+	ASSERT(lip->li_ailp = tp->t_mountp->m_ail);
+
+	lidp = kmem_zone_zalloc(xfs_log_item_desc_zone, KM_SLEEP | KM_NOFS);
+
+	lidp->lid_item = lip;
+	lidp->lid_flags = 0;
+	lidp->lid_size = 0;
+	list_add_tail(&lidp->lid_trans, &tp->t_items);
+
+	lip->li_desc = lidp;
+}
+
+STATIC void
+xfs_trans_free_item_desc(
+	struct xfs_log_item_desc *lidp)
+{
+	list_del_init(&lidp->lid_trans);
+	kmem_zone_free(xfs_log_item_desc_zone, lidp);
+}
+
+/*
+ * Unlink and free the given descriptor.
+ */
+void
+xfs_trans_del_item(
+	struct xfs_log_item	*lip)
+{
+	xfs_trans_free_item_desc(lip->li_desc);
+	lip->li_desc = NULL;
+}
+
+/*
+ * Unlock all of the items of a transaction and free all the descriptors
+ * of that transaction.
+ */
+void
+xfs_trans_free_items(
+	struct xfs_trans	*tp,
+	xfs_lsn_t		commit_lsn,
+	int			flags)
+{
+	struct xfs_log_item_desc *lidp, *next;
+
+	list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) {
+		struct xfs_log_item	*lip = lidp->lid_item;
+
+		lip->li_desc = NULL;
+
+		if (commit_lsn != NULLCOMMITLSN)
+			IOP_COMMITTING(lip, commit_lsn);
+		if (flags & XFS_TRANS_ABORT)
+			lip->li_flags |= XFS_LI_ABORTED;
+		IOP_UNLOCK(lip);
+
+		xfs_trans_free_item_desc(lidp);
+	}
+}
+
+/*
+ * Unlock the items associated with a transaction.
+ *
+ * Items which were not logged should be freed.  Those which were logged must
+ * still be tracked so they can be unpinned when the transaction commits.
+ */
+STATIC void
+xfs_trans_unlock_items(
+	struct xfs_trans	*tp,
+	xfs_lsn_t		commit_lsn)
+{
+	struct xfs_log_item_desc *lidp, *next;
+
+	list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) {
+		struct xfs_log_item	*lip = lidp->lid_item;
+
+		lip->li_desc = NULL;
+
+		if (commit_lsn != NULLCOMMITLSN)
+			IOP_COMMITTING(lip, commit_lsn);
+		IOP_UNLOCK(lip);
+
+		/*
+		 * Free the descriptor if the item is not dirty
+		 * within this transaction.
+		 */
+		if (!(lidp->lid_flags & XFS_LID_DIRTY))
+			xfs_trans_free_item_desc(lidp);
 	}
 }
 
@@ -1134,30 +1254,27 @@ xfs_trans_count_vecs(
 	struct xfs_trans	*tp)
 {
 	int			nvecs;
-	xfs_log_item_desc_t	*lidp;
+	struct xfs_log_item_desc *lidp;
 
 	nvecs = 1;
-	lidp = xfs_trans_first_item(tp);
-	ASSERT(lidp != NULL);
 
 	/* In the non-debug case we need to start bailing out if we
 	 * didn't find a log_item here, return zero and let trans_commit
 	 * deal with it.
 	 */
-	if (lidp == NULL)
+	if (list_empty(&tp->t_items)) {
+		ASSERT(0);
 		return 0;
+	}
 
-	while (lidp != NULL) {
+	list_for_each_entry(lidp, &tp->t_items, lid_trans) {
 		/*
 		 * Skip items which aren't dirty in this transaction.
 		 */
-		if (!(lidp->lid_flags & XFS_LID_DIRTY)) {
-			lidp = xfs_trans_next_item(tp, lidp);
+		if (!(lidp->lid_flags & XFS_LID_DIRTY))
 			continue;
-		}
 		lidp->lid_size = IOP_SIZE(lidp->lid_item);
 		nvecs += lidp->lid_size;
-		lidp = xfs_trans_next_item(tp, lidp);
 	}
 
 	return nvecs;
@@ -1177,7 +1294,7 @@ xfs_trans_fill_vecs(
 	struct xfs_trans	*tp,
 	struct xfs_log_iovec	*log_vector)
 {
-	xfs_log_item_desc_t	*lidp;
+	struct xfs_log_item_desc *lidp;
 	struct xfs_log_iovec	*vecp;
 	uint			nitems;
 
@@ -1188,14 +1305,11 @@ xfs_trans_fill_vecs(
 	vecp = log_vector + 1;
 
 	nitems = 0;
-	lidp = xfs_trans_first_item(tp);
-	ASSERT(lidp);
-	while (lidp) {
+	ASSERT(!list_empty(&tp->t_items));
+	list_for_each_entry(lidp, &tp->t_items, lid_trans) {
 		/* Skip items which aren't dirty in this transaction. */
-		if (!(lidp->lid_flags & XFS_LID_DIRTY)) {
-			lidp = xfs_trans_next_item(tp, lidp);
+		if (!(lidp->lid_flags & XFS_LID_DIRTY))
 			continue;
-		}
 
 		/*
 		 * The item may be marked dirty but not log anything.  This can
@@ -1206,7 +1320,6 @@ xfs_trans_fill_vecs(
 		IOP_FORMAT(lidp->lid_item, vecp);
 		vecp += lidp->lid_size;
 		IOP_PIN(lidp->lid_item);
-		lidp = xfs_trans_next_item(tp, lidp);
 	}
 
 	/*
@@ -1284,7 +1397,7 @@ xfs_trans_item_committed(
 	 * log item flags, if anyone else stales the buffer we do not want to
 	 * pay any attention to it.
 	 */
-	IOP_UNPIN(lip);
+	IOP_UNPIN(lip, 0);
 }
 
 /*
@@ -1298,27 +1411,15 @@ xfs_trans_item_committed(
  */
 STATIC void
 xfs_trans_committed(
-	struct xfs_trans	*tp,
+	void			*arg,
 	int			abortflag)
 {
-	xfs_log_item_desc_t	*lidp;
-	xfs_log_item_chunk_t	*licp;
-	xfs_log_item_chunk_t	*next_licp;
-
-	/* Call the transaction's completion callback if there is one. */
-	if (tp->t_callback != NULL)
-		tp->t_callback(tp, tp->t_callarg);
+	struct xfs_trans	*tp = arg;
+	struct xfs_log_item_desc *lidp, *next;
 
-	for (lidp = xfs_trans_first_item(tp);
-	     lidp != NULL;
-	     lidp = xfs_trans_next_item(tp, lidp)) {
+	list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) {
 		xfs_trans_item_committed(lidp->lid_item, tp->t_lsn, abortflag);
-	}
-
-	/* free the item chunks, ignoring the embedded chunk */
-	for (licp = tp->t_items.lic_next; licp != NULL; licp = next_licp) {
-		next_licp = licp->lic_next;
-		kmem_free(licp);
+		xfs_trans_free_item_desc(lidp);
 	}
 
 	xfs_trans_free(tp);
@@ -1333,16 +1434,14 @@ xfs_trans_uncommit(
 	struct xfs_trans	*tp,
 	uint			flags)
 {
-	xfs_log_item_desc_t	*lidp;
+	struct xfs_log_item_desc *lidp;
 
-	for (lidp = xfs_trans_first_item(tp);
-	     lidp != NULL;
-	     lidp = xfs_trans_next_item(tp, lidp)) {
+	list_for_each_entry(lidp, &tp->t_items, lid_trans) {
 		/*
 		 * Unpin all but those that aren't dirty.
 		 */
 		if (lidp->lid_flags & XFS_LID_DIRTY)
-			IOP_UNPIN_REMOVE(lidp->lid_item, tp);
+			IOP_UNPIN(lidp->lid_item, 1);
 	}
 
 	xfs_trans_unreserve_and_mod_sb(tp);
@@ -1445,7 +1544,7 @@ xfs_trans_commit_iclog(
 	 * running in simulation mode (the log is explicitly turned
 	 * off).
 	 */
-	tp->t_logcb.cb_func = (void(*)(void*, int))xfs_trans_committed;
+	tp->t_logcb.cb_func = xfs_trans_committed;
 	tp->t_logcb.cb_arg = tp;
 
 	/*
@@ -1508,33 +1607,28 @@ STATIC struct xfs_log_vec *
 xfs_trans_alloc_log_vecs(
 	xfs_trans_t	*tp)
 {
-	xfs_log_item_desc_t	*lidp;
+	struct xfs_log_item_desc *lidp;
 	struct xfs_log_vec	*lv = NULL;
 	struct xfs_log_vec	*ret_lv = NULL;
 
-	lidp = xfs_trans_first_item(tp);
 
 	/* Bail out if we didn't find a log item.  */
-	if (!lidp) {
+	if (list_empty(&tp->t_items)) {
 		ASSERT(0);
 		return NULL;
 	}
 
-	while (lidp != NULL) {
+	list_for_each_entry(lidp, &tp->t_items, lid_trans) {
 		struct xfs_log_vec *new_lv;
 
 		/* Skip items which aren't dirty in this transaction. */
-		if (!(lidp->lid_flags & XFS_LID_DIRTY)) {
-			lidp = xfs_trans_next_item(tp, lidp);
+		if (!(lidp->lid_flags & XFS_LID_DIRTY))
 			continue;
-		}
 
 		/* Skip items that do not have any vectors for writing */
 		lidp->lid_size = IOP_SIZE(lidp->lid_item);
-		if (!lidp->lid_size) {
-			lidp = xfs_trans_next_item(tp, lidp);
+		if (!lidp->lid_size)
 			continue;
-		}
 
 		new_lv = kmem_zalloc(sizeof(*new_lv) +
 				lidp->lid_size * sizeof(struct xfs_log_iovec),
@@ -1549,7 +1643,6 @@ xfs_trans_alloc_log_vecs(
 		else
 			lv->lv_next = new_lv;
 		lv = new_lv;
-		lidp = xfs_trans_next_item(tp, lidp);
 	}
 
 	return ret_lv;
@@ -1579,9 +1672,6 @@ xfs_trans_commit_cil(
 		return error;
 
 	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
-
-	/* xfs_trans_free_items() unlocks them first */
-	xfs_trans_free_items(tp, *commit_lsn, 0);
 	xfs_trans_free(tp);
 	return 0;
 }
@@ -1708,12 +1798,6 @@ xfs_trans_cancel(
 	int			flags)
 {
 	int			log_flags;
-#ifdef DEBUG
-	xfs_log_item_chunk_t	*licp;
-	xfs_log_item_desc_t	*lidp;
-	xfs_log_item_t		*lip;
-	int			i;
-#endif
 	xfs_mount_t		*mp = tp->t_mountp;
 
 	/*
@@ -1732,21 +1816,11 @@ xfs_trans_cancel(
 		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
 	}
 #ifdef DEBUG
-	if (!(flags & XFS_TRANS_ABORT)) {
-		licp = &(tp->t_items);
-		while (licp != NULL) {
-			lidp = licp->lic_descs;
-			for (i = 0; i < licp->lic_unused; i++, lidp++) {
-				if (xfs_lic_isfree(licp, i)) {
-					continue;
-				}
-
-				lip = lidp->lid_item;
-				if (!XFS_FORCED_SHUTDOWN(mp))
-					ASSERT(!(lip->li_type == XFS_LI_EFD));
-			}
-			licp = licp->lic_next;
-		}
+	if (!(flags & XFS_TRANS_ABORT) && !XFS_FORCED_SHUTDOWN(mp)) {
+		struct xfs_log_item_desc *lidp;
+
+		list_for_each_entry(lidp, &tp->t_items, lid_trans)
+			ASSERT(!(lidp->lid_item->li_type == XFS_LI_EFD));
 	}
 #endif
 	xfs_trans_unreserve_and_mod_sb(tp);
@@ -1834,7 +1908,6 @@ xfs_trans_roll(
 	if (error)
 		return error;
 
-	xfs_trans_ijoin(trans, dp, XFS_ILOCK_EXCL);
-	xfs_trans_ihold(trans, dp);
+	xfs_trans_ijoin(trans, dp);
 	return 0;
 }
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index e639e8e9a2a..246286b77a8 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -161,105 +161,14 @@ typedef struct xfs_trans_header {
  * the amount of space needed to log the item it describes
  * once we get to commit processing (see xfs_trans_commit()).
  */
-typedef struct xfs_log_item_desc {
+struct xfs_log_item_desc {
 	struct xfs_log_item	*lid_item;
-	ushort		lid_size;
-	unsigned char	lid_flags;
-	unsigned char	lid_index;
-} xfs_log_item_desc_t;
+	ushort			lid_size;
+	unsigned char		lid_flags;
+	struct list_head	lid_trans;
+};
 
 #define XFS_LID_DIRTY		0x1
-#define XFS_LID_PINNED		0x2
-
-/*
- * This structure is used to maintain a chunk list of log_item_desc
- * structures. The free field is a bitmask indicating which descriptors
- * in this chunk's array are free.  The unused field is the first value
- * not used since this chunk was allocated.
- */
-#define	XFS_LIC_NUM_SLOTS	15
-typedef struct xfs_log_item_chunk {
-	struct xfs_log_item_chunk	*lic_next;
-	ushort				lic_free;
-	ushort				lic_unused;
-	xfs_log_item_desc_t		lic_descs[XFS_LIC_NUM_SLOTS];
-} xfs_log_item_chunk_t;
-
-#define	XFS_LIC_MAX_SLOT	(XFS_LIC_NUM_SLOTS - 1)
-#define	XFS_LIC_FREEMASK	((1 << XFS_LIC_NUM_SLOTS) - 1)
-
-
-/*
- * Initialize the given chunk.  Set the chunk's free descriptor mask
- * to indicate that all descriptors are free.  The caller gets to set
- * lic_unused to the right value (0 matches all free).  The
- * lic_descs.lid_index values are set up as each desc is allocated.
- */
-static inline void xfs_lic_init(xfs_log_item_chunk_t *cp)
-{
-	cp->lic_free = XFS_LIC_FREEMASK;
-}
-
-static inline void xfs_lic_init_slot(xfs_log_item_chunk_t *cp, int slot)
-{
-	cp->lic_descs[slot].lid_index = (unsigned char)(slot);
-}
-
-static inline int xfs_lic_vacancy(xfs_log_item_chunk_t *cp)
-{
-	return cp->lic_free & XFS_LIC_FREEMASK;
-}
-
-static inline void xfs_lic_all_free(xfs_log_item_chunk_t *cp)
-{
-	cp->lic_free = XFS_LIC_FREEMASK;
-}
-
-static inline int xfs_lic_are_all_free(xfs_log_item_chunk_t *cp)
-{
-	return ((cp->lic_free & XFS_LIC_FREEMASK) == XFS_LIC_FREEMASK);
-}
-
-static inline int xfs_lic_isfree(xfs_log_item_chunk_t *cp, int slot)
-{
-	return (cp->lic_free & (1 << slot));
-}
-
-static inline void xfs_lic_claim(xfs_log_item_chunk_t *cp, int slot)
-{
-	cp->lic_free &= ~(1 << slot);
-}
-
-static inline void xfs_lic_relse(xfs_log_item_chunk_t *cp, int slot)
-{
-	cp->lic_free |= 1 << slot;
-}
-
-static inline xfs_log_item_desc_t *
-xfs_lic_slot(xfs_log_item_chunk_t *cp, int slot)
-{
-	return &(cp->lic_descs[slot]);
-}
-
-static inline int xfs_lic_desc_to_slot(xfs_log_item_desc_t *dp)
-{
-	return (uint)dp->lid_index;
-}
-
-/*
- * Calculate the address of a chunk given a descriptor pointer:
- * dp - dp->lid_index give the address of the start of the lic_descs array.
- * From this we subtract the offset of the lic_descs field in a chunk.
- * All of this yields the address of the chunk, which is
- * cast to a chunk pointer.
- */
-static inline xfs_log_item_chunk_t *
-xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
-{
-	return (xfs_log_item_chunk_t*) \
-		(((xfs_caddr_t)((dp) - (dp)->lid_index)) - \
-		(xfs_caddr_t)(((xfs_log_item_chunk_t*)0)->lic_descs));
-}
 
 #define	XFS_TRANS_MAGIC		0x5452414E	/* 'TRAN' */
 /*
@@ -275,8 +184,6 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
 /*
  * Values for call flags parameter.
  */
-#define	XFS_TRANS_NOSLEEP		0x1
-#define	XFS_TRANS_WAIT			0x2
 #define	XFS_TRANS_RELEASE_LOG_RES	0x4
 #define	XFS_TRANS_ABORT			0x8
 
@@ -438,8 +345,7 @@ typedef struct xfs_item_ops {
 	uint (*iop_size)(xfs_log_item_t *);
 	void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *);
 	void (*iop_pin)(xfs_log_item_t *);
-	void (*iop_unpin)(xfs_log_item_t *);
-	void (*iop_unpin_remove)(xfs_log_item_t *, struct xfs_trans *);
+	void (*iop_unpin)(xfs_log_item_t *, int remove);
 	uint (*iop_trylock)(xfs_log_item_t *);
 	void (*iop_unlock)(xfs_log_item_t *);
 	xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t);
@@ -451,8 +357,7 @@ typedef struct xfs_item_ops {
 #define IOP_SIZE(ip)		(*(ip)->li_ops->iop_size)(ip)
 #define IOP_FORMAT(ip,vp)	(*(ip)->li_ops->iop_format)(ip, vp)
 #define IOP_PIN(ip)		(*(ip)->li_ops->iop_pin)(ip)
-#define IOP_UNPIN(ip)		(*(ip)->li_ops->iop_unpin)(ip)
-#define IOP_UNPIN_REMOVE(ip,tp) (*(ip)->li_ops->iop_unpin_remove)(ip, tp)
+#define IOP_UNPIN(ip, remove)	(*(ip)->li_ops->iop_unpin)(ip, remove)
 #define IOP_TRYLOCK(ip)		(*(ip)->li_ops->iop_trylock)(ip)
 #define IOP_UNLOCK(ip)		(*(ip)->li_ops->iop_unlock)(ip)
 #define IOP_COMMITTED(ip, lsn)	(*(ip)->li_ops->iop_committed)(ip, lsn)
@@ -494,8 +399,6 @@ typedef struct xfs_trans {
 						 * transaction. */
 	struct xfs_mount	*t_mountp;	/* ptr to fs mount struct */
 	struct xfs_dquot_acct   *t_dqinfo;	/* acctg info for dquots */
-	xfs_trans_callback_t	t_callback;	/* transaction callback */
-	void			*t_callarg;	/* callback arg */
 	unsigned int		t_flags;	/* misc flags */
 	int64_t			t_icount_delta;	/* superblock icount change */
 	int64_t			t_ifree_delta;	/* superblock ifree change */
@@ -516,8 +419,7 @@ typedef struct xfs_trans {
 	int64_t			t_rblocks_delta;/* superblock rblocks change */
 	int64_t			t_rextents_delta;/* superblocks rextents chg */
 	int64_t			t_rextslog_delta;/* superblocks rextslog chg */
-	unsigned int		t_items_free;	/* log item descs free */
-	xfs_log_item_chunk_t	t_items;	/* first log item desc chunk */
+	struct list_head	t_items;	/* log item descriptors */
 	xfs_trans_header_t	t_header;	/* header for in-log trans */
 	struct list_head	t_busy;		/* list of busy extents */
 	unsigned long		t_pflags;	/* saved process flags state */
@@ -569,8 +471,9 @@ void		xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint);
 void		xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *);
 int		xfs_trans_iget(struct xfs_mount *, xfs_trans_t *,
 			       xfs_ino_t , uint, uint, struct xfs_inode **);
-void		xfs_trans_ijoin(xfs_trans_t *, struct xfs_inode *, uint);
-void		xfs_trans_ihold(xfs_trans_t *, struct xfs_inode *);
+void		xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int);
+void		xfs_trans_ijoin_ref(struct xfs_trans *, struct xfs_inode *, uint);
+void		xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *);
 void		xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint);
 void		xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint);
 struct xfs_efi_log_item	*xfs_trans_get_efi(xfs_trans_t *, uint);
@@ -595,6 +498,7 @@ int		xfs_trans_ail_init(struct xfs_mount *);
 void		xfs_trans_ail_destroy(struct xfs_mount *);
 
 extern kmem_zone_t	*xfs_trans_zone;
+extern kmem_zone_t	*xfs_log_item_desc_zone;
 
 #endif	/* __KERNEL__ */
 
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index e799824f724..dc9069568ff 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -24,7 +24,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_trans_priv.h"
 #include "xfs_error.h"
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 63d81a22f4f..c47918c302a 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -24,14 +24,10 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_buf_item.h"
@@ -51,36 +47,17 @@ xfs_trans_buf_item_match(
 	xfs_daddr_t		blkno,
 	int			len)
 {
-	xfs_log_item_chunk_t	*licp;
-	xfs_log_item_desc_t	*lidp;
-	xfs_buf_log_item_t	*blip;
-	int			i;
+	struct xfs_log_item_desc *lidp;
+	struct xfs_buf_log_item	*blip;
 
 	len = BBTOB(len);
-	for (licp = &tp->t_items; licp != NULL; licp = licp->lic_next) {
-		if (xfs_lic_are_all_free(licp)) {
-			ASSERT(licp == &tp->t_items);
-			ASSERT(licp->lic_next == NULL);
-			return NULL;
-		}
-
-		for (i = 0; i < licp->lic_unused; i++) {
-			/*
-			 * Skip unoccupied slots.
-			 */
-			if (xfs_lic_isfree(licp, i))
-				continue;
-
-			lidp = xfs_lic_slot(licp, i);
-			blip = (xfs_buf_log_item_t *)lidp->lid_item;
-			if (blip->bli_item.li_type != XFS_LI_BUF)
-				continue;
-
-			if (XFS_BUF_TARGET(blip->bli_buf) == target &&
-			    XFS_BUF_ADDR(blip->bli_buf) == blkno &&
-			    XFS_BUF_COUNT(blip->bli_buf) == len)
-				return blip->bli_buf;
-		}
+	list_for_each_entry(lidp, &tp->t_items, lid_trans) {
+		blip = (struct xfs_buf_log_item *)lidp->lid_item;
+		if (blip->bli_item.li_type == XFS_LI_BUF &&
+		    XFS_BUF_TARGET(blip->bli_buf) == target &&
+		    XFS_BUF_ADDR(blip->bli_buf) == blkno &&
+		    XFS_BUF_COUNT(blip->bli_buf) == len)
+			return blip->bli_buf;
 	}
 
 	return NULL;
@@ -127,7 +104,7 @@ _xfs_trans_bjoin(
 	/*
 	 * Get a log_item_desc to point at the new item.
 	 */
-	(void) xfs_trans_add_item(tp, (xfs_log_item_t *)bip);
+	xfs_trans_add_item(tp, &bip->bli_item);
 
 	/*
 	 * Initialize b_fsprivate2 so we can find it with incore_match()
@@ -359,7 +336,7 @@ xfs_trans_read_buf(
 			ASSERT(!XFS_BUF_ISASYNC(bp));
 			XFS_BUF_READ(bp);
 			xfsbdstrat(tp->t_mountp, bp);
-			error = xfs_iowait(bp);
+			error = xfs_buf_iowait(bp);
 			if (error) {
 				xfs_ioerror_alert("xfs_trans_read_buf", mp,
 						  bp, blkno);
@@ -483,7 +460,6 @@ xfs_trans_brelse(xfs_trans_t	*tp,
 {
 	xfs_buf_log_item_t	*bip;
 	xfs_log_item_t		*lip;
-	xfs_log_item_desc_t	*lidp;
 
 	/*
 	 * Default to a normal brelse() call if the tp is NULL.
@@ -514,13 +490,6 @@ xfs_trans_brelse(xfs_trans_t	*tp,
 	ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
 
-	/*
-	 * Find the item descriptor pointing to this buffer's
-	 * log item.  It must be there.
-	 */
-	lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip);
-	ASSERT(lidp != NULL);
-
 	trace_xfs_trans_brelse(bip);
 
 	/*
@@ -536,7 +505,7 @@ xfs_trans_brelse(xfs_trans_t	*tp,
 	 * If the buffer is dirty within this transaction, we can't
 	 * release it until we commit.
 	 */
-	if (lidp->lid_flags & XFS_LID_DIRTY)
+	if (bip->bli_item.li_desc->lid_flags & XFS_LID_DIRTY)
 		return;
 
 	/*
@@ -553,7 +522,7 @@ xfs_trans_brelse(xfs_trans_t	*tp,
 	/*
 	 * Free up the log item descriptor tracking the released item.
 	 */
-	xfs_trans_free_item(tp, lidp);
+	xfs_trans_del_item(&bip->bli_item);
 
 	/*
 	 * Clear the hold flag in the buf log item if it is set.
@@ -665,7 +634,6 @@ xfs_trans_log_buf(xfs_trans_t	*tp,
 		  uint		last)
 {
 	xfs_buf_log_item_t	*bip;
-	xfs_log_item_desc_t	*lidp;
 
 	ASSERT(XFS_BUF_ISBUSY(bp));
 	ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
@@ -690,7 +658,7 @@ xfs_trans_log_buf(xfs_trans_t	*tp,
 	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
 	XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks);
-	bip->bli_item.li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*))xfs_buf_iodone;
+	bip->bli_item.li_cb = xfs_buf_iodone;
 
 	trace_xfs_trans_log_buf(bip);
 
@@ -707,11 +675,8 @@ xfs_trans_log_buf(xfs_trans_t	*tp,
 		bip->bli_format.blf_flags &= ~XFS_BLF_CANCEL;
 	}
 
-	lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip);
-	ASSERT(lidp != NULL);
-
 	tp->t_flags |= XFS_TRANS_DIRTY;
-	lidp->lid_flags |= XFS_LID_DIRTY;
+	bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY;
 	bip->bli_flags |= XFS_BLI_LOGGED;
 	xfs_buf_item_log(bip, first, last);
 }
@@ -740,7 +705,6 @@ xfs_trans_binval(
 	xfs_trans_t	*tp,
 	xfs_buf_t	*bp)
 {
-	xfs_log_item_desc_t	*lidp;
 	xfs_buf_log_item_t	*bip;
 
 	ASSERT(XFS_BUF_ISBUSY(bp));
@@ -748,8 +712,6 @@ xfs_trans_binval(
 	ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
 
 	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
-	lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip);
-	ASSERT(lidp != NULL);
 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
 
 	trace_xfs_trans_binval(bip);
@@ -764,7 +726,7 @@ xfs_trans_binval(
 		ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY)));
 		ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_INODE_BUF));
 		ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
-		ASSERT(lidp->lid_flags & XFS_LID_DIRTY);
+		ASSERT(bip->bli_item.li_desc->lid_flags & XFS_LID_DIRTY);
 		ASSERT(tp->t_flags & XFS_TRANS_DIRTY);
 		return;
 	}
@@ -797,7 +759,7 @@ xfs_trans_binval(
 	bip->bli_format.blf_flags |= XFS_BLF_CANCEL;
 	memset((char *)(bip->bli_format.blf_data_map), 0,
 	      (bip->bli_format.blf_map_size * sizeof(uint)));
-	lidp->lid_flags |= XFS_LID_DIRTY;
+	bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY;
 	tp->t_flags |= XFS_TRANS_DIRTY;
 }
 
@@ -853,12 +815,9 @@ xfs_trans_stale_inode_buf(
 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
 
 	bip->bli_flags |= XFS_BLI_STALE_INODE;
-	bip->bli_item.li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*))
-		xfs_buf_iodone;
+	bip->bli_item.li_cb = xfs_buf_iodone;
 }
 
-
-
 /*
  * Mark the buffer as being one which contains newly allocated
  * inodes.  We need to make sure that even if this buffer is
diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c
index 27cce2a9c7e..f783d5e9fa7 100644
--- a/fs/xfs/xfs_trans_extfree.c
+++ b/fs/xfs/xfs_trans_extfree.c
@@ -23,7 +23,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_trans_priv.h"
 #include "xfs_extfree_item.h"
@@ -49,9 +48,8 @@ xfs_trans_get_efi(xfs_trans_t	*tp,
 	/*
 	 * Get a log_item_desc to point at the new item.
 	 */
-	(void) xfs_trans_add_item(tp, (xfs_log_item_t*)efip);
-
-	return (efip);
+	xfs_trans_add_item(tp, &efip->efi_item);
+	return efip;
 }
 
 /*
@@ -65,15 +63,11 @@ xfs_trans_log_efi_extent(xfs_trans_t		*tp,
 			 xfs_fsblock_t		start_block,
 			 xfs_extlen_t		ext_len)
 {
-	xfs_log_item_desc_t	*lidp;
 	uint			next_extent;
 	xfs_extent_t		*extp;
 
-	lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)efip);
-	ASSERT(lidp != NULL);
-
 	tp->t_flags |= XFS_TRANS_DIRTY;
-	lidp->lid_flags |= XFS_LID_DIRTY;
+	efip->efi_item.li_desc->lid_flags |= XFS_LID_DIRTY;
 
 	next_extent = efip->efi_next_extent;
 	ASSERT(next_extent < efip->efi_format.efi_nextents);
@@ -106,9 +100,8 @@ xfs_trans_get_efd(xfs_trans_t		*tp,
 	/*
 	 * Get a log_item_desc to point at the new item.
 	 */
-	(void) xfs_trans_add_item(tp, (xfs_log_item_t*)efdp);
-
-	return (efdp);
+	xfs_trans_add_item(tp, &efdp->efd_item);
+	return efdp;
 }
 
 /*
@@ -122,15 +115,11 @@ xfs_trans_log_efd_extent(xfs_trans_t		*tp,
 			 xfs_fsblock_t		start_block,
 			 xfs_extlen_t		ext_len)
 {
-	xfs_log_item_desc_t	*lidp;
 	uint			next_extent;
 	xfs_extent_t		*extp;
 
-	lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)efdp);
-	ASSERT(lidp != NULL);
-
 	tp->t_flags |= XFS_TRANS_DIRTY;
-	lidp->lid_flags |= XFS_LID_DIRTY;
+	efdp->efd_item.li_desc->lid_flags |= XFS_LID_DIRTY;
 
 	next_extent = efdp->efd_next_extent;
 	ASSERT(next_extent < efdp->efd_format.efd_nextents);
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index 2559dfec946..ccb34532768 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -24,20 +24,16 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_btree.h"
-#include "xfs_ialloc.h"
 #include "xfs_trans_priv.h"
 #include "xfs_inode_item.h"
+#include "xfs_trace.h"
 
 #ifdef XFS_TRANS_DEBUG
 STATIC void
@@ -47,7 +43,6 @@ xfs_trans_inode_broot_debug(
 #define	xfs_trans_inode_broot_debug(ip)
 #endif
 
-
 /*
  * Get an inode and join it to the transaction.
  */
@@ -63,76 +58,94 @@ xfs_trans_iget(
 	int			error;
 
 	error = xfs_iget(mp, tp, ino, flags, lock_flags, ipp);
-	if (!error && tp)
-		xfs_trans_ijoin(tp, *ipp, lock_flags);
+	if (!error && tp) {
+		xfs_trans_ijoin(tp, *ipp);
+		(*ipp)->i_itemp->ili_lock_flags = lock_flags;
+	}
 	return error;
 }
 
 /*
- * Add the locked inode to the transaction.
- * The inode must be locked, and it cannot be associated with any
- * transaction.  The caller must specify the locks already held
- * on the inode.
+ * Add a locked inode to the transaction.
+ *
+ * The inode must be locked, and it cannot be associated with any transaction.
  */
 void
 xfs_trans_ijoin(
-	xfs_trans_t	*tp,
-	xfs_inode_t	*ip,
-	uint		lock_flags)
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip)
 {
 	xfs_inode_log_item_t	*iip;
 
 	ASSERT(ip->i_transp == NULL);
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-	ASSERT(lock_flags & XFS_ILOCK_EXCL);
 	if (ip->i_itemp == NULL)
 		xfs_inode_item_init(ip, ip->i_mount);
 	iip = ip->i_itemp;
-	ASSERT(iip->ili_flags == 0);
+	ASSERT(iip->ili_lock_flags == 0);
 
 	/*
 	 * Get a log_item_desc to point at the new item.
 	 */
-	(void) xfs_trans_add_item(tp, (xfs_log_item_t*)(iip));
+	xfs_trans_add_item(tp, &iip->ili_item);
 
 	xfs_trans_inode_broot_debug(ip);
 
 	/*
-	 * If the IO lock is already held, mark that in the inode log item.
-	 */
-	if (lock_flags & XFS_IOLOCK_EXCL) {
-		iip->ili_flags |= XFS_ILI_IOLOCKED_EXCL;
-	} else if (lock_flags & XFS_IOLOCK_SHARED) {
-		iip->ili_flags |= XFS_ILI_IOLOCKED_SHARED;
-	}
-
-	/*
 	 * Initialize i_transp so we can find it with xfs_inode_incore()
 	 * in xfs_trans_iget() above.
 	 */
 	ip->i_transp = tp;
 }
 
-
+/*
+ * Add a locked inode to the transaction.
+ *
+ *
+ * Grabs a reference to the inode which will be dropped when the transaction
+ * is commited.  The inode will also be unlocked at that point.  The inode
+ * must be locked, and it cannot be associated with any transaction.
+ */
+void
+xfs_trans_ijoin_ref(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip,
+	uint			lock_flags)
+{
+	xfs_trans_ijoin(tp, ip);
+	IHOLD(ip);
+	ip->i_itemp->ili_lock_flags = lock_flags;
+}
 
 /*
- * Mark the inode as not needing to be unlocked when the inode item's
- * IOP_UNLOCK() routine is called.  The inode must already be locked
- * and associated with the given transaction.
+ * Transactional inode timestamp update. Requires the inode to be locked and
+ * joined to the transaction supplied. Relies on the transaction subsystem to
+ * track dirty state and update/writeback the inode accordingly.
  */
-/*ARGSUSED*/
 void
-xfs_trans_ihold(
-	xfs_trans_t	*tp,
-	xfs_inode_t	*ip)
+xfs_trans_ichgtime(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip,
+	int			flags)
 {
-	ASSERT(ip->i_transp == tp);
-	ASSERT(ip->i_itemp != NULL);
+	struct inode		*inode = VFS_I(ip);
+	timespec_t		tv;
+
+	ASSERT(tp);
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+	ASSERT(ip->i_transp == tp);
 
-	ip->i_itemp->ili_flags |= XFS_ILI_HOLD;
-}
+	tv = current_fs_time(inode->i_sb);
 
+	if ((flags & XFS_ICHGTIME_MOD) &&
+	    !timespec_equal(&inode->i_mtime, &tv)) {
+		inode->i_mtime = tv;
+	}
+	if ((flags & XFS_ICHGTIME_CHG) &&
+	    !timespec_equal(&inode->i_ctime, &tv)) {
+		inode->i_ctime = tv;
+	}
+}
 
 /*
  * This is called to mark the fields indicated in fieldmask as needing
@@ -149,17 +162,12 @@ xfs_trans_log_inode(
 	xfs_inode_t	*ip,
 	uint		flags)
 {
-	xfs_log_item_desc_t	*lidp;
-
 	ASSERT(ip->i_transp == tp);
 	ASSERT(ip->i_itemp != NULL);
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 
-	lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)(ip->i_itemp));
-	ASSERT(lidp != NULL);
-
 	tp->t_flags |= XFS_TRANS_DIRTY;
-	lidp->lid_flags |= XFS_LID_DIRTY;
+	ip->i_itemp->ili_item.li_desc->lid_flags |= XFS_LID_DIRTY;
 
 	/*
 	 * Always OR in the bits from the ili_last_fields field.
diff --git a/fs/xfs/xfs_trans_item.c b/fs/xfs/xfs_trans_item.c
deleted file mode 100644
index f11d37d06dc..00000000000
--- a/fs/xfs/xfs_trans_item.c
+++ /dev/null
@@ -1,441 +0,0 @@
-/*
- * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_trans_priv.h"
-/* XXX: from here down needed until struct xfs_trans has its own ailp */
-#include "xfs_bit.h"
-#include "xfs_buf_item.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
-#include "xfs_mount.h"
-
-STATIC int	xfs_trans_unlock_chunk(xfs_log_item_chunk_t *,
-					int, int, xfs_lsn_t);
-
-/*
- * This is called to add the given log item to the transaction's
- * list of log items.  It must find a free log item descriptor
- * or allocate a new one and add the item to that descriptor.
- * The function returns a pointer to item descriptor used to point
- * to the new item.  The log item will now point to its new descriptor
- * with its li_desc field.
- */
-xfs_log_item_desc_t *
-xfs_trans_add_item(xfs_trans_t *tp, xfs_log_item_t *lip)
-{
-	xfs_log_item_desc_t	*lidp;
-	xfs_log_item_chunk_t	*licp;
-	int			i=0;
-
-	/*
-	 * If there are no free descriptors, allocate a new chunk
-	 * of them and put it at the front of the chunk list.
-	 */
-	if (tp->t_items_free == 0) {
-		licp = (xfs_log_item_chunk_t*)
-		       kmem_alloc(sizeof(xfs_log_item_chunk_t), KM_SLEEP);
-		ASSERT(licp != NULL);
-		/*
-		 * Initialize the chunk, and then
-		 * claim the first slot in the newly allocated chunk.
-		 */
-		xfs_lic_init(licp);
-		xfs_lic_claim(licp, 0);
-		licp->lic_unused = 1;
-		xfs_lic_init_slot(licp, 0);
-		lidp = xfs_lic_slot(licp, 0);
-
-		/*
-		 * Link in the new chunk and update the free count.
-		 */
-		licp->lic_next = tp->t_items.lic_next;
-		tp->t_items.lic_next = licp;
-		tp->t_items_free = XFS_LIC_NUM_SLOTS - 1;
-
-		/*
-		 * Initialize the descriptor and the generic portion
-		 * of the log item.
-		 *
-		 * Point the new slot at this item and return it.
-		 * Also point the log item at its currently active
-		 * descriptor and set the item's mount pointer.
-		 */
-		lidp->lid_item = lip;
-		lidp->lid_flags = 0;
-		lidp->lid_size = 0;
-		lip->li_desc = lidp;
-		lip->li_mountp = tp->t_mountp;
-		lip->li_ailp = tp->t_mountp->m_ail;
-		return lidp;
-	}
-
-	/*
-	 * Find the free descriptor. It is somewhere in the chunklist
-	 * of descriptors.
-	 */
-	licp = &tp->t_items;
-	while (licp != NULL) {
-		if (xfs_lic_vacancy(licp)) {
-			if (licp->lic_unused <= XFS_LIC_MAX_SLOT) {
-				i = licp->lic_unused;
-				ASSERT(xfs_lic_isfree(licp, i));
-				break;
-			}
-			for (i = 0; i <= XFS_LIC_MAX_SLOT; i++) {
-				if (xfs_lic_isfree(licp, i))
-					break;
-			}
-			ASSERT(i <= XFS_LIC_MAX_SLOT);
-			break;
-		}
-		licp = licp->lic_next;
-	}
-	ASSERT(licp != NULL);
-	/*
-	 * If we find a free descriptor, claim it,
-	 * initialize it, and return it.
-	 */
-	xfs_lic_claim(licp, i);
-	if (licp->lic_unused <= i) {
-		licp->lic_unused = i + 1;
-		xfs_lic_init_slot(licp, i);
-	}
-	lidp = xfs_lic_slot(licp, i);
-	tp->t_items_free--;
-	lidp->lid_item = lip;
-	lidp->lid_flags = 0;
-	lidp->lid_size = 0;
-	lip->li_desc = lidp;
-	lip->li_mountp = tp->t_mountp;
-	lip->li_ailp = tp->t_mountp->m_ail;
-	return lidp;
-}
-
-/*
- * Free the given descriptor.
- *
- * This requires setting the bit in the chunk's free mask corresponding
- * to the given slot.
- */
-void
-xfs_trans_free_item(xfs_trans_t	*tp, xfs_log_item_desc_t *lidp)
-{
-	uint			slot;
-	xfs_log_item_chunk_t	*licp;
-	xfs_log_item_chunk_t	**licpp;
-
-	slot = xfs_lic_desc_to_slot(lidp);
-	licp = xfs_lic_desc_to_chunk(lidp);
-	xfs_lic_relse(licp, slot);
-	lidp->lid_item->li_desc = NULL;
-	tp->t_items_free++;
-
-	/*
-	 * If there are no more used items in the chunk and this is not
-	 * the chunk embedded in the transaction structure, then free
-	 * the chunk. First pull it from the chunk list and then
-	 * free it back to the heap.  We didn't bother with a doubly
-	 * linked list here because the lists should be very short
-	 * and this is not a performance path.  It's better to save
-	 * the memory of the extra pointer.
-	 *
-	 * Also decrement the transaction structure's count of free items
-	 * by the number in a chunk since we are freeing an empty chunk.
-	 */
-	if (xfs_lic_are_all_free(licp) && (licp != &(tp->t_items))) {
-		licpp = &(tp->t_items.lic_next);
-		while (*licpp != licp) {
-			ASSERT(*licpp != NULL);
-			licpp = &((*licpp)->lic_next);
-		}
-		*licpp = licp->lic_next;
-		kmem_free(licp);
-		tp->t_items_free -= XFS_LIC_NUM_SLOTS;
-	}
-}
-
-/*
- * This is called to find the descriptor corresponding to the given
- * log item.  It returns a pointer to the descriptor.
- * The log item MUST have a corresponding descriptor in the given
- * transaction.  This routine does not return NULL, it panics.
- *
- * The descriptor pointer is kept in the log item's li_desc field.
- * Just return it.
- */
-/*ARGSUSED*/
-xfs_log_item_desc_t *
-xfs_trans_find_item(xfs_trans_t	*tp, xfs_log_item_t *lip)
-{
-	ASSERT(lip->li_desc != NULL);
-
-	return lip->li_desc;
-}
-
-
-/*
- * Return a pointer to the first descriptor in the chunk list.
- * This does not return NULL if there are none, it panics.
- *
- * The first descriptor must be in either the first or second chunk.
- * This is because the only chunk allowed to be empty is the first.
- * All others are freed when they become empty.
- *
- * At some point this and xfs_trans_next_item() should be optimized
- * to quickly look at the mask to determine if there is anything to
- * look at.
- */
-xfs_log_item_desc_t *
-xfs_trans_first_item(xfs_trans_t *tp)
-{
-	xfs_log_item_chunk_t	*licp;
-	int			i;
-
-	licp = &tp->t_items;
-	/*
-	 * If it's not in the first chunk, skip to the second.
-	 */
-	if (xfs_lic_are_all_free(licp)) {
-		licp = licp->lic_next;
-	}
-
-	/*
-	 * Return the first non-free descriptor in the chunk.
-	 */
-	ASSERT(!xfs_lic_are_all_free(licp));
-	for (i = 0; i < licp->lic_unused; i++) {
-		if (xfs_lic_isfree(licp, i)) {
-			continue;
-		}
-
-		return xfs_lic_slot(licp, i);
-	}
-	cmn_err(CE_WARN, "xfs_trans_first_item() -- no first item");
-	return NULL;
-}
-
-
-/*
- * Given a descriptor, return the next descriptor in the chunk list.
- * This returns NULL if there are no more used descriptors in the list.
- *
- * We do this by first locating the chunk in which the descriptor resides,
- * and then scanning forward in the chunk and the list for the next
- * used descriptor.
- */
-/*ARGSUSED*/
-xfs_log_item_desc_t *
-xfs_trans_next_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp)
-{
-	xfs_log_item_chunk_t	*licp;
-	int			i;
-
-	licp = xfs_lic_desc_to_chunk(lidp);
-
-	/*
-	 * First search the rest of the chunk. The for loop keeps us
-	 * from referencing things beyond the end of the chunk.
-	 */
-	for (i = (int)xfs_lic_desc_to_slot(lidp) + 1; i < licp->lic_unused; i++) {
-		if (xfs_lic_isfree(licp, i)) {
-			continue;
-		}
-
-		return xfs_lic_slot(licp, i);
-	}
-
-	/*
-	 * Now search the next chunk.  It must be there, because the
-	 * next chunk would have been freed if it were empty.
-	 * If there is no next chunk, return NULL.
-	 */
-	if (licp->lic_next == NULL) {
-		return NULL;
-	}
-
-	licp = licp->lic_next;
-	ASSERT(!xfs_lic_are_all_free(licp));
-	for (i = 0; i < licp->lic_unused; i++) {
-		if (xfs_lic_isfree(licp, i)) {
-			continue;
-		}
-
-		return xfs_lic_slot(licp, i);
-	}
-	ASSERT(0);
-	/* NOTREACHED */
-	return NULL; /* keep gcc quite */
-}
-
-/*
- * This is called to unlock all of the items of a transaction and to free
- * all the descriptors of that transaction.
- *
- * It walks the list of descriptors and unlocks each item.  It frees
- * each chunk except that embedded in the transaction as it goes along.
- */
-void
-xfs_trans_free_items(
-	xfs_trans_t	*tp,
-	xfs_lsn_t	commit_lsn,
-	int		flags)
-{
-	xfs_log_item_chunk_t	*licp;
-	xfs_log_item_chunk_t	*next_licp;
-	int			abort;
-
-	abort = flags & XFS_TRANS_ABORT;
-	licp = &tp->t_items;
-	/*
-	 * Special case the embedded chunk so we don't free it below.
-	 */
-	if (!xfs_lic_are_all_free(licp)) {
-		(void) xfs_trans_unlock_chunk(licp, 1, abort, commit_lsn);
-		xfs_lic_all_free(licp);
-		licp->lic_unused = 0;
-	}
-	licp = licp->lic_next;
-
-	/*
-	 * Unlock each item in each chunk and free the chunks.
-	 */
-	while (licp != NULL) {
-		ASSERT(!xfs_lic_are_all_free(licp));
-		(void) xfs_trans_unlock_chunk(licp, 1, abort, commit_lsn);
-		next_licp = licp->lic_next;
-		kmem_free(licp);
-		licp = next_licp;
-	}
-
-	/*
-	 * Reset the transaction structure's free item count.
-	 */
-	tp->t_items_free = XFS_LIC_NUM_SLOTS;
-	tp->t_items.lic_next = NULL;
-}
-
-
-
-/*
- * This is called to unlock the items associated with a transaction.
- * Items which were not logged should be freed.
- * Those which were logged must still be tracked so they can be unpinned
- * when the transaction commits.
- */
-void
-xfs_trans_unlock_items(xfs_trans_t *tp, xfs_lsn_t commit_lsn)
-{
-	xfs_log_item_chunk_t	*licp;
-	xfs_log_item_chunk_t	*next_licp;
-	xfs_log_item_chunk_t	**licpp;
-	int			freed;
-
-	freed = 0;
-	licp = &tp->t_items;
-
-	/*
-	 * Special case the embedded chunk so we don't free.
-	 */
-	if (!xfs_lic_are_all_free(licp)) {
-		freed = xfs_trans_unlock_chunk(licp, 0, 0, commit_lsn);
-	}
-	licpp = &(tp->t_items.lic_next);
-	licp = licp->lic_next;
-
-	/*
-	 * Unlock each item in each chunk, free non-dirty descriptors,
-	 * and free empty chunks.
-	 */
-	while (licp != NULL) {
-		ASSERT(!xfs_lic_are_all_free(licp));
-		freed += xfs_trans_unlock_chunk(licp, 0, 0, commit_lsn);
-		next_licp = licp->lic_next;
-		if (xfs_lic_are_all_free(licp)) {
-			*licpp = next_licp;
-			kmem_free(licp);
-			freed -= XFS_LIC_NUM_SLOTS;
-		} else {
-			licpp = &(licp->lic_next);
-		}
-		ASSERT(*licpp == next_licp);
-		licp = next_licp;
-	}
-
-	/*
-	 * Fix the free descriptor count in the transaction.
-	 */
-	tp->t_items_free += freed;
-}
-
-/*
- * Unlock each item pointed to by a descriptor in the given chunk.
- * Stamp the commit lsn into each item if necessary.
- * Free descriptors pointing to items which are not dirty if freeing_chunk
- * is zero. If freeing_chunk is non-zero, then we need to unlock all
- * items in the chunk.
- * 
- * Return the number of descriptors freed.
- */
-STATIC int
-xfs_trans_unlock_chunk(
-	xfs_log_item_chunk_t	*licp,
-	int			freeing_chunk,
-	int			abort,
-	xfs_lsn_t		commit_lsn)
-{
-	xfs_log_item_desc_t	*lidp;
-	xfs_log_item_t		*lip;
-	int			i;
-	int			freed;
-
-	freed = 0;
-	lidp = licp->lic_descs;
-	for (i = 0; i < licp->lic_unused; i++, lidp++) {
-		if (xfs_lic_isfree(licp, i)) {
-			continue;
-		}
-		lip = lidp->lid_item;
-		lip->li_desc = NULL;
-
-		if (commit_lsn != NULLCOMMITLSN)
-			IOP_COMMITTING(lip, commit_lsn);
-		if (abort)
-			lip->li_flags |= XFS_LI_ABORTED;
-		IOP_UNLOCK(lip);
-
-		/*
-		 * Free the descriptor if the item is not dirty
-		 * within this transaction and the caller is not
-		 * going to just free the entire thing regardless.
-		 */
-		if (!(freeing_chunk) &&
-		    (!(lidp->lid_flags & XFS_LID_DIRTY) || abort)) {
-			xfs_lic_relse(licp, i);
-			freed++;
-		}
-	}
-
-	return freed;
-}
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index c6e4f2c8de6..62da86c90de 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -23,23 +23,10 @@ struct xfs_log_item_desc;
 struct xfs_mount;
 struct xfs_trans;
 
-/*
- * From xfs_trans_item.c
- */
-struct xfs_log_item_desc	*xfs_trans_add_item(struct xfs_trans *,
-					    struct xfs_log_item *);
-void				xfs_trans_free_item(struct xfs_trans *,
-					    struct xfs_log_item_desc *);
-struct xfs_log_item_desc	*xfs_trans_find_item(struct xfs_trans *,
-					     struct xfs_log_item *);
-struct xfs_log_item_desc	*xfs_trans_first_item(struct xfs_trans *);
-struct xfs_log_item_desc	*xfs_trans_next_item(struct xfs_trans *,
-					     struct xfs_log_item_desc *);
-
-void	xfs_trans_unlock_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn);
+void	xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *);
+void	xfs_trans_del_item(struct xfs_log_item *);
 void	xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn,
 				int flags);
-
 void	xfs_trans_item_committed(struct xfs_log_item *lip,
 				xfs_lsn_t commit_lsn, int aborted);
 void	xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp);
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
index 320775295e3..26d1867d815 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/xfs_types.h
@@ -73,8 +73,6 @@ typedef	__int32_t	xfs_tid_t;	/* transaction identifier */
 typedef	__uint32_t	xfs_dablk_t;	/* dir/attr block number (in file) */
 typedef	__uint32_t	xfs_dahash_t;	/* dir/attr hash value */
 
-typedef __uint16_t	xfs_prid_t;	/* prid_t truncated to 16bits in XFS */
-
 typedef __uint32_t	xlog_tid_t;	/* transaction ID type */
 
 /*
diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c
index 4d88616bde9..8b32d1a4c5a 100644
--- a/fs/xfs/xfs_utils.c
+++ b/fs/xfs/xfs_utils.c
@@ -25,18 +25,14 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
 #include "xfs_bmap.h"
 #include "xfs_error.h"
 #include "xfs_quota.h"
-#include "xfs_rw.h"
 #include "xfs_itable.h"
 #include "xfs_utils.h"
 
@@ -60,7 +56,6 @@ xfs_dir_ialloc(
 	mode_t		mode,
 	xfs_nlink_t	nlink,
 	xfs_dev_t	rdev,
-	cred_t		*credp,
 	prid_t		prid,		/* project id */
 	int		okalloc,	/* ok to allocate new space */
 	xfs_inode_t	**ipp,		/* pointer to inode; it will be
@@ -97,7 +92,7 @@ xfs_dir_ialloc(
 	 * transaction commit so that no other process can steal
 	 * the inode(s) that we've just allocated.
 	 */
-	code = xfs_ialloc(tp, dp, mode, nlink, rdev, credp, prid, okalloc,
+	code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, okalloc,
 			  &ialloc_context, &call_again, &ip);
 
 	/*
@@ -201,7 +196,7 @@ xfs_dir_ialloc(
 		 * other allocations in this allocation group,
 		 * this call should always succeed.
 		 */
-		code = xfs_ialloc(tp, dp, mode, nlink, rdev, credp, prid,
+		code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid,
 				  okalloc, &ialloc_context, &call_again, &ip);
 
 		/*
@@ -239,7 +234,7 @@ xfs_droplink(
 {
 	int	error;
 
-	xfs_ichgtime(ip, XFS_ICHGTIME_CHG);
+	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
 
 	ASSERT (ip->i_d.di_nlink > 0);
 	ip->i_d.di_nlink--;
@@ -303,7 +298,7 @@ xfs_bumplink(
 {
 	if (ip->i_d.di_nlink >= XFS_MAXLINK)
 		return XFS_ERROR(EMLINK);
-	xfs_ichgtime(ip, XFS_ICHGTIME_CHG);
+	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
 
 	ASSERT(ip->i_d.di_nlink > 0);
 	ip->i_d.di_nlink++;
@@ -324,86 +319,3 @@ xfs_bumplink(
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 	return 0;
 }
-
-/*
- * Try to truncate the given file to 0 length.  Currently called
- * only out of xfs_remove when it has to truncate a file to free
- * up space for the remove to proceed.
- */
-int
-xfs_truncate_file(
-	xfs_mount_t	*mp,
-	xfs_inode_t	*ip)
-{
-	xfs_trans_t	*tp;
-	int		error;
-
-#ifdef QUOTADEBUG
-	/*
-	 * This is called to truncate the quotainodes too.
-	 */
-	if (XFS_IS_UQUOTA_ON(mp)) {
-		if (ip->i_ino != mp->m_sb.sb_uquotino)
-			ASSERT(ip->i_udquot);
-	}
-	if (XFS_IS_OQUOTA_ON(mp)) {
-		if (ip->i_ino != mp->m_sb.sb_gquotino)
-			ASSERT(ip->i_gdquot);
-	}
-#endif
-	/*
-	 * Make the call to xfs_itruncate_start before starting the
-	 * transaction, because we cannot make the call while we're
-	 * in a transaction.
-	 */
-	xfs_ilock(ip, XFS_IOLOCK_EXCL);
-	error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, (xfs_fsize_t)0);
-	if (error) {
-		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-		return error;
-	}
-
-	tp = xfs_trans_alloc(mp, XFS_TRANS_TRUNCATE_FILE);
-	if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
-				      XFS_TRANS_PERM_LOG_RES,
-				      XFS_ITRUNCATE_LOG_COUNT))) {
-		xfs_trans_cancel(tp, 0);
-		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-		return error;
-	}
-
-	/*
-	 * Follow the normal truncate locking protocol.  Since we
-	 * hold the inode in the transaction, we know that its number
-	 * of references will stay constant.
-	 */
-	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-	xfs_trans_ihold(tp, ip);
-	/*
-	 * Signal a sync xaction.  The only case where that isn't
-	 * the case is if we're truncating an already unlinked file
-	 * on a wsync fs.  In that case, we know the blocks can't
-	 * reappear in the file because the links to file are
-	 * permanently toast.  Currently, we're always going to
-	 * want a sync transaction because this code is being
-	 * called from places where nlink is guaranteed to be 1
-	 * but I'm leaving the tests in to protect against future
-	 * changes -- rcc.
-	 */
-	error = xfs_itruncate_finish(&tp, ip, (xfs_fsize_t)0,
-				     XFS_DATA_FORK,
-				     ((ip->i_d.di_nlink != 0 ||
-				       !(mp->m_flags & XFS_MOUNT_WSYNC))
-				      ? 1 : 0));
-	if (error) {
-		xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
-				 XFS_TRANS_ABORT);
-	} else {
-		xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-	}
-	xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-
-	return error;
-}
diff --git a/fs/xfs/xfs_utils.h b/fs/xfs/xfs_utils.h
index ef321225d26..456fca31493 100644
--- a/fs/xfs/xfs_utils.h
+++ b/fs/xfs/xfs_utils.h
@@ -18,10 +18,8 @@
 #ifndef __XFS_UTILS_H__
 #define __XFS_UTILS_H__
 
-extern int xfs_truncate_file(xfs_mount_t *, xfs_inode_t *);
 extern int xfs_dir_ialloc(xfs_trans_t **, xfs_inode_t *, mode_t, xfs_nlink_t,
-				xfs_dev_t, cred_t *, prid_t, int,
-				xfs_inode_t **, int *);
+				xfs_dev_t, prid_t, int, xfs_inode_t **, int *);
 extern int xfs_droplink(xfs_trans_t *, xfs_inode_t *);
 extern int xfs_bumplink(xfs_trans_t *, xfs_inode_t *);
 extern void xfs_bump_ino_vers2(xfs_trans_t *, xfs_inode_t *);
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index c1646838898..8e4a63c4151 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -26,19 +26,14 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
 #include "xfs_itable.h"
-#include "xfs_btree.h"
 #include "xfs_ialloc.h"
 #include "xfs_alloc.h"
 #include "xfs_bmap.h"
@@ -73,7 +68,7 @@ xfs_setattr(
 	struct xfs_dquot	*udqp, *gdqp, *olddquot1, *olddquot2;
 	int			need_iolock = 1;
 
-	xfs_itrace_entry(ip);
+	trace_xfs_setattr(ip);
 
 	if (mp->m_flags & XFS_MOUNT_RDONLY)
 		return XFS_ERROR(EROFS);
@@ -119,7 +114,7 @@ xfs_setattr(
 		 */
 		ASSERT(udqp == NULL);
 		ASSERT(gdqp == NULL);
-		code = xfs_qm_vop_dqalloc(ip, uid, gid, ip->i_d.di_projid,
+		code = xfs_qm_vop_dqalloc(ip, uid, gid, xfs_get_projid(ip),
 					 qflags, &udqp, &gdqp);
 		if (code)
 			return code;
@@ -143,16 +138,6 @@ xfs_setattr(
 			goto error_return;
 		}
 	} else {
-		if (DM_EVENT_ENABLED(ip, DM_EVENT_TRUNCATE) &&
-		    !(flags & XFS_ATTR_DMI)) {
-			int dmflags = AT_DELAY_FLAG(flags) | DM_SEM_FLAG_WR;
-			code = XFS_SEND_DATA(mp, DM_EVENT_TRUNCATE, ip,
-				iattr->ia_size, 0, dmflags, NULL);
-			if (code) {
-				lock_flags = 0;
-				goto error_return;
-			}
-		}
 		if (need_iolock)
 			lock_flags |= XFS_IOLOCK_EXCL;
 	}
@@ -199,8 +184,11 @@ xfs_setattr(
 		    ip->i_size == 0 && ip->i_d.di_nextents == 0) {
 			xfs_iunlock(ip, XFS_ILOCK_EXCL);
 			lock_flags &= ~XFS_ILOCK_EXCL;
-			if (mask & ATTR_CTIME)
-				xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+			if (mask & ATTR_CTIME) {
+				inode->i_mtime = inode->i_ctime =
+						current_fs_time(inode->i_sb);
+				xfs_mark_inode_dirty_sync(ip);
+			}
 			code = 0;
 			goto error_return;
 		}
@@ -236,8 +224,11 @@ xfs_setattr(
 			 * transaction to modify the i_size.
 			 */
 			code = xfs_zero_eof(ip, iattr->ia_size, ip->i_size);
+			if (code)
+				goto error_return;
 		}
 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		lock_flags &= ~XFS_ILOCK_EXCL;
 
 		/*
 		 * We are going to log the inode size change in this
@@ -251,40 +242,38 @@ xfs_setattr(
 		 * really care about here and prevents waiting for other data
 		 * not within the range we care about here.
 		 */
-		if (!code &&
-		    ip->i_size != ip->i_d.di_size &&
+		if (ip->i_size != ip->i_d.di_size &&
 		    iattr->ia_size > ip->i_d.di_size) {
 			code = xfs_flush_pages(ip,
 					ip->i_d.di_size, iattr->ia_size,
 					XBF_ASYNC, FI_NONE);
+			if (code)
+				goto error_return;
 		}
 
 		/* wait for all I/O to complete */
 		xfs_ioend_wait(ip);
 
-		if (!code)
-			code = xfs_itruncate_data(ip, iattr->ia_size);
-		if (code) {
-			ASSERT(tp == NULL);
-			lock_flags &= ~XFS_ILOCK_EXCL;
-			ASSERT(lock_flags == XFS_IOLOCK_EXCL || !need_iolock);
+		code = -block_truncate_page(inode->i_mapping, iattr->ia_size,
+					    xfs_get_blocks);
+		if (code)
 			goto error_return;
-		}
+
 		tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
-		if ((code = xfs_trans_reserve(tp, 0,
-					     XFS_ITRUNCATE_LOG_RES(mp), 0,
-					     XFS_TRANS_PERM_LOG_RES,
-					     XFS_ITRUNCATE_LOG_COUNT))) {
-			xfs_trans_cancel(tp, 0);
-			if (need_iolock)
-				xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-			return code;
-		}
+		code = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
+					 XFS_TRANS_PERM_LOG_RES,
+					 XFS_ITRUNCATE_LOG_COUNT);
+		if (code)
+			goto error_return;
+
+		truncate_setsize(inode, iattr->ia_size);
+
 		commit_flags = XFS_TRANS_RELEASE_LOG_RES;
+		lock_flags |= XFS_ILOCK_EXCL;
+
 		xfs_ilock(ip, XFS_ILOCK_EXCL);
 
-		xfs_trans_ijoin(tp, ip, lock_flags);
-		xfs_trans_ihold(tp, ip);
+		xfs_trans_ijoin(tp, ip);
 
 		/*
 		 * Only change the c/mtime if we are changing the size
@@ -334,8 +323,7 @@ xfs_setattr(
 			xfs_iflags_set(ip, XFS_ITRUNCATED);
 		}
 	} else if (tp) {
-		xfs_trans_ijoin(tp, ip, lock_flags);
-		xfs_trans_ihold(tp, ip);
+		xfs_trans_ijoin(tp, ip);
 	}
 
 	/*
@@ -470,17 +458,10 @@ xfs_setattr(
 			return XFS_ERROR(code);
 	}
 
-	if (DM_EVENT_ENABLED(ip, DM_EVENT_ATTRIBUTE) &&
-	    !(flags & XFS_ATTR_DMI)) {
-		(void) XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, ip, DM_RIGHT_NULL,
-					NULL, DM_RIGHT_NULL, NULL, NULL,
-					0, 0, AT_DELAY_FLAG(flags));
-	}
 	return 0;
 
  abort_return:
 	commit_flags |= XFS_TRANS_ABORT;
-	/* FALLTHROUGH */
  error_return:
 	xfs_qm_dqrele(udqp);
 	xfs_qm_dqrele(gdqp);
@@ -516,7 +497,7 @@ xfs_readlink_bmap(
 	int		error = 0;
 
 	error = xfs_bmapi(NULL, ip, 0, XFS_B_TO_FSB(mp, pathlen), 0, NULL, 0,
-			mval, &nmaps, NULL, NULL);
+			mval, &nmaps, NULL);
 	if (error)
 		goto out;
 
@@ -557,7 +538,7 @@ xfs_readlink(
 	int		pathlen;
 	int		error = 0;
 
-	xfs_itrace_entry(ip);
+	trace_xfs_readlink(ip);
 
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return XFS_ERROR(EIO);
@@ -613,14 +594,14 @@ xfs_free_eofblocks(
 	 */
 	end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)ip->i_size));
 	last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
-	map_len = last_fsb - end_fsb;
-	if (map_len <= 0)
+	if (last_fsb <= end_fsb)
 		return 0;
+	map_len = last_fsb - end_fsb;
 
 	nimaps = 1;
 	xfs_ilock(ip, XFS_ILOCK_SHARED);
 	error = xfs_bmapi(NULL, ip, end_fsb, map_len, 0,
-			  NULL, 0, &imap, &nimaps, NULL, NULL);
+			  NULL, 0, &imap, &nimaps, NULL);
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
 
 	if (!error && (nimaps != 0) &&
@@ -675,10 +656,7 @@ xfs_free_eofblocks(
 		}
 
 		xfs_ilock(ip, XFS_ILOCK_EXCL);
-		xfs_trans_ijoin(tp, ip,
-				XFS_IOLOCK_EXCL |
-				XFS_ILOCK_EXCL);
-		xfs_trans_ihold(tp, ip);
+		xfs_trans_ijoin(tp, ip);
 
 		error = xfs_itruncate_finish(&tp, ip,
 					     ip->i_size,
@@ -750,8 +728,7 @@ xfs_inactive_symlink_rmt(
 	xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
 	size = (int)ip->i_d.di_size;
 	ip->i_d.di_size = 0;
-	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-	xfs_trans_ihold(tp, ip);
+	xfs_trans_ijoin(tp, ip);
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 	/*
 	 * Find the block(s) so we can inval and unmap them.
@@ -761,7 +738,7 @@ xfs_inactive_symlink_rmt(
 	nmaps = ARRAY_SIZE(mval);
 	if ((error = xfs_bmapi(tp, ip, 0, XFS_B_TO_FSB(mp, size),
 			XFS_BMAPI_METADATA, &first_block, 0, mval, &nmaps,
-			&free_list, NULL)))
+			&free_list)))
 		goto error0;
 	/*
 	 * Invalidate the block(s).
@@ -776,7 +753,7 @@ xfs_inactive_symlink_rmt(
 	 * Unmap the dead block(s) to the free_list.
 	 */
 	if ((error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps,
-			&first_block, &free_list, NULL, &done)))
+			&first_block, &free_list, &done)))
 		goto error1;
 	ASSERT(done);
 	/*
@@ -795,8 +772,7 @@ xfs_inactive_symlink_rmt(
 	 * Mark it dirty so it will be logged and moved forward in the log as
 	 * part of every commit.
 	 */
-	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-	xfs_trans_ihold(tp, ip);
+	xfs_trans_ijoin(tp, ip);
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 	/*
 	 * Get a new, empty transaction to return to our caller.
@@ -929,8 +905,7 @@ xfs_inactive_attrs(
 		goto error_cancel;
 
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
-	xfs_trans_ihold(tp, ip);
+	xfs_trans_ijoin(tp, ip);
 	xfs_idestroy_fork(ip, XFS_ATTR_FORK);
 
 	ASSERT(ip->i_d.di_anextents == 0);
@@ -1035,8 +1010,6 @@ xfs_inactive(
 	int		error;
 	int		truncate;
 
-	xfs_itrace_entry(ip);
-
 	/*
 	 * If the inode is already free, then there can be nothing
 	 * to clean up here.
@@ -1060,9 +1033,6 @@ xfs_inactive(
 
 	mp = ip->i_mount;
 
-	if (ip->i_d.di_nlink == 0 && DM_EVENT_ENABLED(ip, DM_EVENT_DESTROY))
-		XFS_SEND_DESTROY(mp, ip, DM_RIGHT_NULL);
-
 	error = 0;
 
 	/* If this is a read-only mount, don't do this (would generate I/O) */
@@ -1120,8 +1090,7 @@ xfs_inactive(
 		}
 
 		xfs_ilock(ip, XFS_ILOCK_EXCL);
-		xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
-		xfs_trans_ihold(tp, ip);
+		xfs_trans_ijoin(tp, ip);
 
 		/*
 		 * normally, we have to run xfs_itruncate_finish sync.
@@ -1154,8 +1123,7 @@ xfs_inactive(
 			return VN_INACTIVE_CACHE;
 		}
 
-		xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
-		xfs_trans_ihold(tp, ip);
+		xfs_trans_ijoin(tp, ip);
 	} else {
 		error = xfs_trans_reserve(tp, 0,
 					  XFS_IFREE_LOG_RES(mp),
@@ -1168,8 +1136,7 @@ xfs_inactive(
 		}
 
 		xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-		xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
-		xfs_trans_ihold(tp, ip);
+		xfs_trans_ijoin(tp, ip);
 	}
 
 	/*
@@ -1257,7 +1224,7 @@ xfs_lookup(
 	int			error;
 	uint			lock_mode;
 
-	xfs_itrace_entry(dp);
+	trace_xfs_lookup(dp, name);
 
 	if (XFS_FORCED_SHUTDOWN(dp->i_mount))
 		return XFS_ERROR(EIO);
@@ -1289,8 +1256,7 @@ xfs_create(
 	struct xfs_name		*name,
 	mode_t			mode,
 	xfs_dev_t		rdev,
-	xfs_inode_t		**ipp,
-	cred_t			*credp)
+	xfs_inode_t		**ipp)
 {
 	int			is_dir = S_ISDIR(mode);
 	struct xfs_mount	*mp = dp->i_mount;
@@ -1302,32 +1268,22 @@ xfs_create(
 	boolean_t		unlock_dp_on_error = B_FALSE;
 	uint			cancel_flags;
 	int			committed;
-	xfs_prid_t		prid;
+	prid_t			prid;
 	struct xfs_dquot	*udqp = NULL;
 	struct xfs_dquot	*gdqp = NULL;
 	uint			resblks;
 	uint			log_res;
 	uint			log_count;
 
-	xfs_itrace_entry(dp);
+	trace_xfs_create(dp, name);
 
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return XFS_ERROR(EIO);
 
-	if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) {
-		error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
-				dp, DM_RIGHT_NULL, NULL,
-				DM_RIGHT_NULL, name->name, NULL,
-				mode, 0, 0);
-
-		if (error)
-			return error;
-	}
-
 	if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
-		prid = dp->i_d.di_projid;
+		prid = xfs_get_projid(dp);
 	else
-		prid = dfltprid;
+		prid = XFS_PROJID_DEFAULT;
 
 	/*
 	 * Make sure that we have allocated dquot(s) on disk.
@@ -1406,7 +1362,7 @@ xfs_create(
 	 * entry pointing to them, but a directory also the "." entry
 	 * pointing to itself.
 	 */
-	error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, credp,
+	error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev,
 			       prid, resblks > 0, &ip, &committed);
 	if (error) {
 		if (error == ENOSPC)
@@ -1427,8 +1383,7 @@ xfs_create(
 	 * the transaction cancel unlocking dp so don't do it explicitly in the
 	 * error path.
 	 */
-	IHOLD(dp);
-	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin_ref(tp, dp, XFS_ILOCK_EXCL);
 	unlock_dp_on_error = B_FALSE;
 
 	error = xfs_dir_createname(tp, dp, name, ip->i_ino,
@@ -1438,7 +1393,7 @@ xfs_create(
 		ASSERT(error != ENOSPC);
 		goto out_trans_abort;
 	}
-	xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
 
 	if (is_dir) {
@@ -1487,16 +1442,7 @@ xfs_create(
 	xfs_qm_dqrele(gdqp);
 
 	*ipp = ip;
-
-	/* Fallthrough to std_return with error = 0  */
- std_return:
-	if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) {
-		XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE, dp, DM_RIGHT_NULL,
-				ip, DM_RIGHT_NULL, name->name, NULL, mode,
-				error, 0);
-	}
-
-	return error;
+	return 0;
 
  out_bmap_cancel:
 	xfs_bmap_cancel(&free_list);
@@ -1510,8 +1456,8 @@ xfs_create(
 
 	if (unlock_dp_on_error)
 		xfs_iunlock(dp, XFS_ILOCK_EXCL);
-
-	goto std_return;
+ std_return:
+	return error;
 
  out_abort_rele:
 	/*
@@ -1726,20 +1672,11 @@ xfs_remove(
 	uint			resblks;
 	uint			log_count;
 
-	xfs_itrace_entry(dp);
-	xfs_itrace_entry(ip);
+	trace_xfs_remove(dp, name);
 
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return XFS_ERROR(EIO);
 
-	if (DM_EVENT_ENABLED(dp, DM_EVENT_REMOVE)) {
-		error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, dp, DM_RIGHT_NULL,
-					NULL, DM_RIGHT_NULL, name->name, NULL,
-					ip->i_d.di_mode, 0, 0);
-		if (error)
-			return error;
-	}
-
 	error = xfs_qm_dqattach(dp, 0);
 	if (error)
 		goto std_return;
@@ -1782,15 +1719,8 @@ xfs_remove(
 
 	xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL);
 
-	/*
-	 * At this point, we've gotten both the directory and the entry
-	 * inodes locked.
-	 */
-	IHOLD(ip);
-	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
-
-	IHOLD(dp);
-	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin_ref(tp, dp, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
 
 	/*
 	 * If we're removing a directory perform some additional validation.
@@ -1814,7 +1744,7 @@ xfs_remove(
 		ASSERT(error != ENOENT);
 		goto out_bmap_cancel;
 	}
-	xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 
 	if (is_dir) {
 		/*
@@ -1877,21 +1807,15 @@ xfs_remove(
 	if (!is_dir && link_zero && xfs_inode_is_filestream(ip))
 		xfs_filestream_deassociate(ip);
 
- std_return:
-	if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTREMOVE)) {
-		XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE, dp, DM_RIGHT_NULL,
-				NULL, DM_RIGHT_NULL, name->name, NULL,
-				ip->i_d.di_mode, error, 0);
-	}
-
-	return error;
+	return 0;
 
  out_bmap_cancel:
 	xfs_bmap_cancel(&free_list);
 	cancel_flags |= XFS_TRANS_ABORT;
  out_trans_cancel:
 	xfs_trans_cancel(tp, cancel_flags);
-	goto std_return;
+ std_return:
+	return error;
 }
 
 int
@@ -1909,25 +1833,13 @@ xfs_link(
 	int			committed;
 	int			resblks;
 
-	xfs_itrace_entry(tdp);
-	xfs_itrace_entry(sip);
+	trace_xfs_link(tdp, target_name);
 
 	ASSERT(!S_ISDIR(sip->i_d.di_mode));
 
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return XFS_ERROR(EIO);
 
-	if (DM_EVENT_ENABLED(tdp, DM_EVENT_LINK)) {
-		error = XFS_SEND_NAMESP(mp, DM_EVENT_LINK,
-					tdp, DM_RIGHT_NULL,
-					sip, DM_RIGHT_NULL,
-					target_name->name, NULL, 0, 0, 0);
-		if (error)
-			return error;
-	}
-
-	/* Return through std_return after this point. */
-
 	error = xfs_qm_dqattach(sip, 0);
 	if (error)
 		goto std_return;
@@ -1953,15 +1865,8 @@ xfs_link(
 
 	xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL);
 
-	/*
-	 * Increment vnode ref counts since xfs_trans_commit &
-	 * xfs_trans_cancel will both unlock the inodes and
-	 * decrement the associated ref counts.
-	 */
-	IHOLD(sip);
-	IHOLD(tdp);
-	xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
-	xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin_ref(tp, sip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin_ref(tp, tdp, XFS_ILOCK_EXCL);
 
 	/*
 	 * If the source has too many links, we can't make any more to it.
@@ -1977,7 +1882,7 @@ xfs_link(
 	 * the tree quota mechanism could be circumvented.
 	 */
 	if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
-		     (tdp->i_d.di_projid != sip->i_d.di_projid))) {
+		     (xfs_get_projid(tdp) != xfs_get_projid(sip)))) {
 		error = XFS_ERROR(EXDEV);
 		goto error_return;
 	}
@@ -1992,7 +1897,7 @@ xfs_link(
 					&first_block, &free_list, resblks);
 	if (error)
 		goto abort_return;
-	xfs_ichgtime(tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+	xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 	xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
 
 	error = xfs_bumplink(tp, sip);
@@ -2014,27 +1919,14 @@ xfs_link(
 		goto abort_return;
 	}
 
-	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-	if (error)
-		goto std_return;
-
-	/* Fall through to std_return with error = 0. */
-std_return:
-	if (DM_EVENT_ENABLED(sip, DM_EVENT_POSTLINK)) {
-		(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTLINK,
-				tdp, DM_RIGHT_NULL,
-				sip, DM_RIGHT_NULL,
-				target_name->name, NULL, 0, error, 0);
-	}
-	return error;
+	return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
 
  abort_return:
 	cancel_flags |= XFS_TRANS_ABORT;
-	/* FALLTHROUGH */
-
  error_return:
 	xfs_trans_cancel(tp, cancel_flags);
-	goto std_return;
+ std_return:
+	return error;
 }
 
 int
@@ -2043,8 +1935,7 @@ xfs_symlink(
 	struct xfs_name		*link_name,
 	const char		*target_path,
 	mode_t			mode,
-	xfs_inode_t		**ipp,
-	cred_t			*credp)
+	xfs_inode_t		**ipp)
 {
 	xfs_mount_t		*mp = dp->i_mount;
 	xfs_trans_t		*tp;
@@ -2065,7 +1956,7 @@ xfs_symlink(
 	int			byte_cnt;
 	int			n;
 	xfs_buf_t		*bp;
-	xfs_prid_t		prid;
+	prid_t			prid;
 	struct xfs_dquot	*udqp, *gdqp;
 	uint			resblks;
 
@@ -2074,7 +1965,7 @@ xfs_symlink(
 	ip = NULL;
 	tp = NULL;
 
-	xfs_itrace_entry(dp);
+	trace_xfs_symlink(dp, link_name);
 
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return XFS_ERROR(EIO);
@@ -2086,22 +1977,11 @@ xfs_symlink(
 	if (pathlen >= MAXPATHLEN)      /* total string too long */
 		return XFS_ERROR(ENAMETOOLONG);
 
-	if (DM_EVENT_ENABLED(dp, DM_EVENT_SYMLINK)) {
-		error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dp,
-					DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
-					link_name->name,
-					(unsigned char *)target_path, 0, 0, 0);
-		if (error)
-			return error;
-	}
-
-	/* Return through std_return after this point. */
-
 	udqp = gdqp = NULL;
 	if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
-		prid = dp->i_d.di_projid;
+		prid = xfs_get_projid(dp);
 	else
-		prid = (xfs_prid_t)dfltprid;
+		prid = XFS_PROJID_DEFAULT;
 
 	/*
 	 * Make sure that we have allocated dquot(s) on disk.
@@ -2167,8 +2047,8 @@ xfs_symlink(
 	/*
 	 * Allocate an inode for the symlink.
 	 */
-	error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT),
-			       1, 0, credp, prid, resblks > 0, &ip, NULL);
+	error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0,
+			       prid, resblks > 0, &ip, NULL);
 	if (error) {
 		if (error == ENOSPC)
 			goto error_return;
@@ -2180,8 +2060,7 @@ xfs_symlink(
 	 * transaction cancel unlocking dp so don't do it explicitly in the
 	 * error path.
 	 */
-	IHOLD(dp);
-	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin_ref(tp, dp, XFS_ILOCK_EXCL);
 	unlock_dp_on_error = B_FALSE;
 
 	/*
@@ -2215,7 +2094,7 @@ xfs_symlink(
 		error = xfs_bmapi(tp, ip, first_fsb, fs_blocks,
 				  XFS_BMAPI_WRITE | XFS_BMAPI_METADATA,
 				  &first_block, resblks, mval, &nmaps,
-				  &free_list, NULL);
+				  &free_list);
 		if (error) {
 			goto error1;
 		}
@@ -2251,7 +2130,7 @@ xfs_symlink(
 					&first_block, &free_list, resblks);
 	if (error)
 		goto error1;
-	xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
 
 	/*
@@ -2278,21 +2157,8 @@ xfs_symlink(
 	xfs_qm_dqrele(udqp);
 	xfs_qm_dqrele(gdqp);
 
-	/* Fall through to std_return with error = 0 or errno from
-	 * xfs_trans_commit	*/
-std_return:
-	if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTSYMLINK)) {
-		(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTSYMLINK,
-					dp, DM_RIGHT_NULL,
-					error ? NULL : ip,
-					DM_RIGHT_NULL, link_name->name,
-					(unsigned char *)target_path,
-					0, error, 0);
-	}
-
-	if (!error)
-		*ipp = ip;
-	return error;
+	*ipp = ip;
+	return 0;
 
  error2:
 	IRELE(ip);
@@ -2306,8 +2172,8 @@ std_return:
 
 	if (unlock_dp_on_error)
 		xfs_iunlock(dp, XFS_ILOCK_EXCL);
-
-	goto std_return;
+ std_return:
+	return error;
 }
 
 int
@@ -2333,13 +2199,12 @@ xfs_set_dmattrs(
 		return error;
 	}
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
 
 	ip->i_d.di_dmevmask = evmask;
 	ip->i_d.di_dmstate  = state;
 
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-	IHOLD(ip);
 	error = xfs_trans_commit(tp, 0);
 
 	return error;
@@ -2390,7 +2255,7 @@ xfs_alloc_file_space(
 	int			committed;
 	int			error;
 
-	xfs_itrace_entry(ip);
+	trace_xfs_alloc_file_space(ip);
 
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return XFS_ERROR(EIO);
@@ -2408,29 +2273,13 @@ xfs_alloc_file_space(
 	count = len;
 	imapp = &imaps[0];
 	nimaps = 1;
-	bmapi_flag = XFS_BMAPI_WRITE | (alloc_type ? XFS_BMAPI_PREALLOC : 0);
+	bmapi_flag = XFS_BMAPI_WRITE | alloc_type;
 	startoffset_fsb	= XFS_B_TO_FSBT(mp, offset);
 	allocatesize_fsb = XFS_B_TO_FSB(mp, count);
 
-	/*	Generate a DMAPI event if needed.	*/
-	if (alloc_type != 0 && offset < ip->i_size &&
-			(attr_flags & XFS_ATTR_DMI) == 0  &&
-			DM_EVENT_ENABLED(ip, DM_EVENT_WRITE)) {
-		xfs_off_t           end_dmi_offset;
-
-		end_dmi_offset = offset+len;
-		if (end_dmi_offset > ip->i_size)
-			end_dmi_offset = ip->i_size;
-		error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, offset,
-				      end_dmi_offset - offset, 0, NULL);
-		if (error)
-			return error;
-	}
-
 	/*
 	 * Allocate file space until done or until there is an error
 	 */
-retry:
 	while (allocatesize_fsb && !error) {
 		xfs_fileoff_t	s, e;
 
@@ -2451,15 +2300,22 @@ retry:
 			e = allocatesize_fsb;
 		}
 
+		/*
+		 * The transaction reservation is limited to a 32-bit block
+		 * count, hence we need to limit the number of blocks we are
+		 * trying to reserve to avoid an overflow. We can't allocate
+		 * more than @nimaps extents, and an extent is limited on disk
+		 * to MAXEXTLEN (21 bits), so use that to enforce the limit.
+		 */
+		resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps));
 		if (unlikely(rt)) {
-			resrtextents = qblocks = (uint)(e - s);
+			resrtextents = qblocks = resblks;
 			resrtextents /= mp->m_sb.sb_rextsize;
 			resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
 			quota_flag = XFS_QMOPT_RES_RTBLKS;
 		} else {
 			resrtextents = 0;
-			resblks = qblocks = \
-				XFS_DIOSTRAT_SPACE_RES(mp, (uint)(e - s));
+			resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks);
 			quota_flag = XFS_QMOPT_RES_REGBLKS;
 		}
 
@@ -2488,8 +2344,7 @@ retry:
 		if (error)
 			goto error1;
 
-		xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-		xfs_trans_ihold(tp, ip);
+		xfs_trans_ijoin(tp, ip);
 
 		/*
 		 * Issue the xfs_bmapi() call to allocate the blocks
@@ -2498,7 +2353,7 @@ retry:
 		error = xfs_bmapi(tp, ip, startoffset_fsb,
 				  allocatesize_fsb, bmapi_flag,
 				  &firstfsb, 0, imapp, &nimaps,
-				  &free_list, NULL);
+				  &free_list);
 		if (error) {
 			goto error0;
 		}
@@ -2527,17 +2382,6 @@ retry:
 		startoffset_fsb += allocated_fsb;
 		allocatesize_fsb -= allocated_fsb;
 	}
-dmapi_enospc_check:
-	if (error == ENOSPC && (attr_flags & XFS_ATTR_DMI) == 0 &&
-	    DM_EVENT_ENABLED(ip, DM_EVENT_NOSPACE)) {
-		error = XFS_SEND_NAMESP(mp, DM_EVENT_NOSPACE,
-				ip, DM_RIGHT_NULL,
-				ip, DM_RIGHT_NULL,
-				NULL, NULL, 0, 0, 0); /* Delay flag intentionally unused */
-		if (error == 0)
-			goto retry;	/* Maybe DMAPI app. has made space */
-		/* else fall through with error from XFS_SEND_DATA */
-	}
 
 	return error;
 
@@ -2548,7 +2392,7 @@ error0:	/* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
 error1:	/* Just cancel transaction */
 	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	goto dmapi_enospc_check;
+	return error;
 }
 
 /*
@@ -2588,9 +2432,9 @@ xfs_zero_remaining_bytes(
 	if (endoff > ip->i_size)
 		endoff = ip->i_size;
 
-	bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize,
-				XFS_IS_REALTIME_INODE(ip) ?
-				mp->m_rtdev_targp : mp->m_ddev_targp);
+	bp = xfs_buf_get_uncached(XFS_IS_REALTIME_INODE(ip) ?
+					mp->m_rtdev_targp : mp->m_ddev_targp,
+				mp->m_sb.sb_blocksize, XBF_DONT_BLOCK);
 	if (!bp)
 		return XFS_ERROR(ENOMEM);
 
@@ -2598,7 +2442,7 @@ xfs_zero_remaining_bytes(
 		offset_fsb = XFS_B_TO_FSBT(mp, offset);
 		nimap = 1;
 		error = xfs_bmapi(NULL, ip, offset_fsb, 1, 0,
-			NULL, 0, &imap, &nimap, NULL, NULL);
+			NULL, 0, &imap, &nimap, NULL);
 		if (error || nimap < 1)
 			break;
 		ASSERT(imap.br_blockcount >= 1);
@@ -2616,7 +2460,7 @@ xfs_zero_remaining_bytes(
 		XFS_BUF_READ(bp);
 		XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock));
 		xfsbdstrat(mp, bp);
-		error = xfs_iowait(bp);
+		error = xfs_buf_iowait(bp);
 		if (error) {
 			xfs_ioerror_alert("xfs_zero_remaining_bytes(read)",
 					  mp, bp, XFS_BUF_ADDR(bp));
@@ -2629,7 +2473,7 @@ xfs_zero_remaining_bytes(
 		XFS_BUF_UNREAD(bp);
 		XFS_BUF_WRITE(bp);
 		xfsbdstrat(mp, bp);
-		error = xfs_iowait(bp);
+		error = xfs_buf_iowait(bp);
 		if (error) {
 			xfs_ioerror_alert("xfs_zero_remaining_bytes(write)",
 					  mp, bp, XFS_BUF_ADDR(bp));
@@ -2661,7 +2505,6 @@ xfs_free_file_space(
 {
 	int			committed;
 	int			done;
-	xfs_off_t		end_dmi_offset;
 	xfs_fileoff_t		endoffset_fsb;
 	int			error;
 	xfs_fsblock_t		firstfsb;
@@ -2680,7 +2523,7 @@ xfs_free_file_space(
 
 	mp = ip->i_mount;
 
-	xfs_itrace_entry(ip);
+	trace_xfs_free_file_space(ip);
 
 	error = xfs_qm_dqattach(ip, 0);
 	if (error)
@@ -2691,19 +2534,7 @@ xfs_free_file_space(
 		return error;
 	rt = XFS_IS_REALTIME_INODE(ip);
 	startoffset_fsb	= XFS_B_TO_FSB(mp, offset);
-	end_dmi_offset = offset + len;
-	endoffset_fsb = XFS_B_TO_FSBT(mp, end_dmi_offset);
-
-	if (offset < ip->i_size && (attr_flags & XFS_ATTR_DMI) == 0 &&
-	    DM_EVENT_ENABLED(ip, DM_EVENT_WRITE)) {
-		if (end_dmi_offset > ip->i_size)
-			end_dmi_offset = ip->i_size;
-		error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip,
-				offset, end_dmi_offset - offset,
-				AT_DELAY_FLAG(attr_flags), NULL);
-		if (error)
-			return error;
-	}
+	endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
 
 	if (attr_flags & XFS_ATTR_NOLOCK)
 		need_iolock = 0;
@@ -2731,7 +2562,7 @@ xfs_free_file_space(
 	if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
 		nimap = 1;
 		error = xfs_bmapi(NULL, ip, startoffset_fsb,
-			1, 0, NULL, 0, &imap, &nimap, NULL, NULL);
+			1, 0, NULL, 0, &imap, &nimap, NULL);
 		if (error)
 			goto out_unlock_iolock;
 		ASSERT(nimap == 0 || nimap == 1);
@@ -2746,7 +2577,7 @@ xfs_free_file_space(
 		}
 		nimap = 1;
 		error = xfs_bmapi(NULL, ip, endoffset_fsb - 1,
-			1, 0, NULL, 0, &imap, &nimap, NULL, NULL);
+			1, 0, NULL, 0, &imap, &nimap, NULL);
 		if (error)
 			goto out_unlock_iolock;
 		ASSERT(nimap == 0 || nimap == 1);
@@ -2814,8 +2645,7 @@ xfs_free_file_space(
 		if (error)
 			goto error1;
 
-		xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-		xfs_trans_ihold(tp, ip);
+		xfs_trans_ijoin(tp, ip);
 
 		/*
 		 * issue the bunmapi() call to free the blocks
@@ -2823,7 +2653,7 @@ xfs_free_file_space(
 		xfs_bmap_init(&free_list, &firstfsb);
 		error = xfs_bunmapi(tp, ip, startoffset_fsb,
 				  endoffset_fsb - startoffset_fsb,
-				  0, 2, &firstfsb, &free_list, NULL, &done);
+				  0, 2, &firstfsb, &free_list, &done);
 		if (error) {
 			goto error0;
 		}
@@ -2882,8 +2712,7 @@ xfs_change_file_space(
 	xfs_off_t	llen;
 	xfs_trans_t	*tp;
 	struct iattr	iattr;
-
-	xfs_itrace_entry(ip);
+	int		prealloc_type;
 
 	if (!S_ISREG(ip->i_d.di_mode))
 		return XFS_ERROR(EINVAL);
@@ -2926,12 +2755,17 @@ xfs_change_file_space(
 	 * size to be changed.
 	 */
 	setprealloc = clrprealloc = 0;
+	prealloc_type = XFS_BMAPI_PREALLOC;
 
 	switch (cmd) {
+	case XFS_IOC_ZERO_RANGE:
+		prealloc_type |= XFS_BMAPI_CONVERT;
+		xfs_tosspages(ip, startoffset, startoffset + bf->l_len, 0);
+		/* FALLTHRU */
 	case XFS_IOC_RESVSP:
 	case XFS_IOC_RESVSP64:
 		error = xfs_alloc_file_space(ip, startoffset, bf->l_len,
-								1, attr_flags);
+						prealloc_type, attr_flags);
 		if (error)
 			return error;
 		setprealloc = 1;
@@ -2985,8 +2819,7 @@ xfs_change_file_space(
 
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
 
-	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-	xfs_trans_ihold(tp, ip);
+	xfs_trans_ijoin(tp, ip);
 
 	if ((attr_flags & XFS_ATTR_DMI) == 0) {
 		ip->i_d.di_mode &= ~S_ISUID;
@@ -3001,7 +2834,7 @@ xfs_change_file_space(
 		if (ip->i_d.di_mode & S_IXGRP)
 			ip->i_d.di_mode &= ~S_ISGID;
 
-		xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+		xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 	}
 	if (setprealloc)
 		ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index d8dfa8d0dad..f6702927eee 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -2,7 +2,6 @@
 #define _XFS_VNODEOPS_H 1
 
 struct attrlist_cursor_kern;
-struct cred;
 struct file;
 struct iattr;
 struct inode;
@@ -26,7 +25,7 @@ int xfs_inactive(struct xfs_inode *ip);
 int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
 		struct xfs_inode **ipp, struct xfs_name *ci_name);
 int xfs_create(struct xfs_inode *dp, struct xfs_name *name, mode_t mode,
-		xfs_dev_t rdev, struct xfs_inode **ipp, cred_t *credp);
+		xfs_dev_t rdev, struct xfs_inode **ipp);
 int xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
 		struct xfs_inode *ip);
 int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
@@ -34,8 +33,7 @@ int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
 int xfs_readdir(struct xfs_inode	*dp, void *dirent, size_t bufsize,
 		       xfs_off_t *offset, filldir_t filldir);
 int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
-		const char *target_path, mode_t mode, struct xfs_inode **ipp,
-		cred_t *credp);
+		const char *target_path, mode_t mode, struct xfs_inode **ipp);
 int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state);
 int xfs_change_file_space(struct xfs_inode *ip, int cmd,
 		xfs_flock64_t *bf, xfs_off_t offset, int attr_flags);
author	Michal Marek <mmarek@suse.cz>	2010-12-14 22:01:55 +0100
committer	Michal Marek <mmarek@suse.cz>	2010-12-14 22:01:55 +0100
commit	8990c1bc4be46473ad19bf2fa612ca57286f3df4 (patch)
tree	3cea60576903a1d26c67e6ec62891b524d390e95 /fs
parent	2979076fbf17a0947d6eba367b0cac19c907c160 (diff)
parent	c8ddb2713c624f432fa5fe3c7ecffcdda46ea0d4 (diff)