From 5d0af85cd0964bb845b63d5059bb20e8f7731e65 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Thu, 28 Oct 2010 21:37:10 +0000
Subject: xfs: remove experimental tag from the delaylog option

We promised to do this for 2.6.37, and the code looks stable enough to
keep that promise.

Signed-off-by: Christoph Hellwig
Reviewed-by: Dave Chinner
Signed-off-by: Alex Elder
---
 fs/xfs/linux-2.6/xfs_super.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 9f3a78fe6ae..064f964d4f3 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -353,9 +353,6 @@ xfs_parseargs(
             mp->m_qflags &= ~XFS_OQUOTA_ENFD;
         } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) {
             mp->m_flags |= XFS_MOUNT_DELAYLOG;
-            cmn_err(CE_WARN,
-                "Enabling EXPERIMENTAL delayed logging feature "
-                "- use at your own risk.\n");
         } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) {
             mp->m_flags &= ~XFS_MOUNT_DELAYLOG;
         } else if (!strcmp(this_char, "ihashsize")) {
--
cgit v1.2.3

From 6762b938eac878a30a90e770ac655874c36bc642 Mon Sep 17 00:00:00 2001
From: Kulikov Vasiliy
Date: Sat, 30 Oct 2010 14:26:17 +0000
Subject: xfs: xfs_ioctl: fix information leak to userland

al_hreq is copied from userland. If al_hreq.buflen is not properly
aligned then xfs_attr_list will ignore the last bytes of kbuf. These
bytes are uninitialized. This leads to leaking the contents of kernel
memory to userland.

Signed-off-by: Vasiliy Kulikov
Signed-off-by: Alex Elder
---
 fs/xfs/linux-2.6/xfs_ioctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 2ea238f6d38..ad442d9e392 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -416,7 +416,7 @@ xfs_attrlist_by_handle(
     if (IS_ERR(dentry))
         return PTR_ERR(dentry);

-    kbuf = kmalloc(al_hreq.buflen, GFP_KERNEL);
+    kbuf = kzalloc(al_hreq.buflen, GFP_KERNEL);
     if (!kbuf)
         goto out_dput;
--
cgit v1.2.3
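
For reference, here is a minimal sketch of the leak pattern the kzalloc
conversion above closes. The handler and the fill_records() helper are
hypothetical, not the actual xfs_attrlist_by_handle() code:

    /*
     * If buflen is not a multiple of the record size, the fill loop
     * leaves a tail of the heap buffer untouched.  copy_to_user()
     * then ships that stale kernel memory to userspace.
     */
    static long example_list_ioctl(void __user *ubuf, size_t buflen)
    {
        char    *kbuf;
        size_t  used;

        kbuf = kmalloc(buflen, GFP_KERNEL);     /* tail stays uninitialized */
        if (!kbuf)
            return -ENOMEM;

        used = fill_records(kbuf, buflen);      /* may fill < buflen bytes */

        if (copy_to_user(ubuf, kbuf, buflen)) { /* copies the stale tail too */
            kfree(kbuf);
            return -EFAULT;
        }
        kfree(kbuf);
        return used;
    }

    /* with kzalloc(buflen, GFP_KERNEL) the unused tail reads back as zeroes */
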
From f83282a8ef799c0bdcb0c32971487087da1bc216 Mon Sep 17 00:00:00 2001
From: Dave Chinner
Date: Mon, 8 Nov 2010 08:55:04 +0000
Subject: xfs: fix per-ag reference counting in inode reclaim tree walking

The walk fails to decrement the per-ag reference count when the
non-blocking walk fails to obtain the per-ag reclaim lock, leading to an
assert failure on debug kernels when unmounting a filesystem.

Signed-off-by: Dave Chinner
Reviewed-by: Christoph Hellwig
Signed-off-by: Alex Elder
---
 fs/xfs/linux-2.6/xfs_sync.c | 1 +
 fs/xfs/xfs_mount.c          | 1 +
 2 files changed, 2 insertions(+)

(limited to 'fs/xfs')

diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 37d33254981..afb0d7cfad1 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -853,6 +853,7 @@ restart:
         if (trylock) {
             if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) {
                 skipped++;
+                xfs_perag_put(pag);
                 continue;
             }
             first_index = pag->pag_ici_reclaim_cursor;
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index b1498ab5a39..19e9dfa1c25 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -275,6 +275,7 @@ xfs_free_perag(
         pag = radix_tree_delete(&mp->m_perag_tree, agno);
         spin_unlock(&mp->m_perag_lock);
         ASSERT(pag);
+        ASSERT(atomic_read(&pag->pag_ref) == 0);
         call_rcu(&pag->rcu_head, __xfs_free_perag);
     }
 }
--
cgit v1.2.3
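
The rule this fix restores is that every xfs_perag_get() must be
balanced by an xfs_perag_put() on every exit path, including the
non-blocking skip path. A simplified sketch of the walk (condensed from
the code above, with walk_ag() standing in for the real per-AG work):

    for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
        struct xfs_perag *pag = xfs_perag_get(mp, ag);

        if (trylock) {
            if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) {
                skipped++;
                xfs_perag_put(pag);    /* the put the fix adds */
                continue;
            }
        } else {
            mutex_lock(&pag->pag_ici_reclaim_lock);
        }

        walk_ag(pag);                  /* hypothetical per-AG work */

        mutex_unlock(&pag->pag_ici_reclaim_lock);
        xfs_perag_put(pag);
    }

The new ASSERT in xfs_free_perag() makes a leaked reference trip at
unmount time on debug kernels instead of going unnoticed.
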
From bfe2741967eaa3434fa9b3d8f24b1422d4540e7d Mon Sep 17 00:00:00 2001
From: Dave Chinner
Date: Mon, 8 Nov 2010 08:55:05 +0000
Subject: xfs: move delayed write buffer trace

The delayed write buffer split trace currently issues a trace for every
buffer it scans. These buffers are not necessarily queued for delayed
write. Indeed, when buffers are pinned, there can be thousands of traces
of buffers that aren't actually queued for delayed write, and the ones
that are queued are lost in the noise. Move the trace point to record
only buffers that are split out for IO to be issued on.

Signed-off-by: Dave Chinner
Reviewed-by: Christoph Hellwig
Signed-off-by: Alex Elder
---
 fs/xfs/linux-2.6/xfs_buf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 63fd2c07cb5..aa1d353def2 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -1781,7 +1781,6 @@ xfs_buf_delwri_split(
     INIT_LIST_HEAD(list);
     spin_lock(dwlk);
     list_for_each_entry_safe(bp, n, dwq, b_list) {
-        trace_xfs_buf_delwri_split(bp, _RET_IP_);
         ASSERT(bp->b_flags & XBF_DELWRI);

         if (!XFS_BUF_ISPINNED(bp) && !xfs_buf_cond_lock(bp)) {
@@ -1795,6 +1794,7 @@ xfs_buf_delwri_split(
                      _XBF_RUN_QUEUES);
             bp->b_flags |= XBF_WRITE;
             list_move_tail(&bp->b_list, list);
+            trace_xfs_buf_delwri_split(bp, _RET_IP_);
         } else
             skipped++;
     }
--
cgit v1.2.3

From 785ce41805ea7b6a9b2775ed9f4cf10cd7a90c03 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Sat, 6 Nov 2010 11:42:44 +0000
Subject: xfs: tell lockdep about parent iolock usage in filestreams

The filestreams code may take the iolock on the parent inode while
holding it on a child. This is the only place in XFS where we take both
the child and parent iolock, so just telling lockdep about it is enough.
The lock flag required for that was already added as part of the ilock
lockdep annotations and has been unused so far.

Signed-off-by: Christoph Hellwig
Signed-off-by: Alex Elder
---
 fs/xfs/xfs_filestream.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 9b715dce569..9124425b7f2 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -744,9 +744,15 @@ xfs_filestream_new_ag(
      * If the file's parent directory is known, take its iolock in exclusive
      * mode to prevent two sibling files from racing each other to migrate
      * themselves and their parent to different AGs.
+     *
+     * Note that we lock the parent directory iolock inside the child
+     * iolock here.  That's fine as we never hold both parent and child
+     * iolock in any other place.  This is different from the ilock,
+     * which requires locking of the child after the parent for namespace
+     * operations.
      */
     if (pip)
-        xfs_ilock(pip, XFS_IOLOCK_EXCL);
+        xfs_ilock(pip, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);

     /*
      * A new AG needs to be found for the file. If the file's parent
--
cgit v1.2.3

From 5d2bf8a55e03b0e59ed5a4ac2ff7f9ee3ba7e40d Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Sat, 6 Nov 2010 11:42:56 +0000
Subject: xfs: fix a few compiler warnings with CONFIG_XFS_QUOTA=n

Andi Kleen reported that gcc-4.5 gives lots of warnings for him inside
the XFS code. It turned out that most of them are due to the quota stubs
being macros, and gcc now complaining about macros evaluating to 0 that
are not assigned to variables.

Signed-off-by: Christoph Hellwig
Signed-off-by: Alex Elder
---
 fs/xfs/xfs_quota.h | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index e0e64b113bd..9bb6eda4cd2 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -346,8 +346,17 @@ xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid,
 #define xfs_trans_mod_dquot_byino(tp, ip, fields, delta)
 #define xfs_trans_apply_dquot_deltas(tp)
 #define xfs_trans_unreserve_and_mod_dquots(tp)
-#define xfs_trans_reserve_quota_nblks(tp, ip, nblks, ninos, flags) (0)
-#define xfs_trans_reserve_quota_bydquots(tp, mp, u, g, nb, ni, fl) (0)
+static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp,
+        struct xfs_inode *ip, long nblks, long ninos, uint flags)
+{
+    return 0;
+}
+static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp,
+        struct xfs_mount *mp, struct xfs_dquot *udqp,
+        struct xfs_dquot *gdqp, long nblks, long nions, uint flags)
+{
+    return 0;
+}
 #define xfs_qm_vop_create_dqattach(tp, ip, u, g)
 #define xfs_qm_vop_rename_dqattach(it) (0)
 #define xfs_qm_vop_chown(tp, ip, old, new) (NULL)
@@ -357,11 +366,14 @@ xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid,
 #define xfs_qm_dqdetach(ip)
 #define xfs_qm_dqrele(d)
 #define xfs_qm_statvfs(ip, s)
-#define xfs_qm_sync(mp, fl) (0)
+static inline int xfs_qm_sync(struct xfs_mount *mp, int flags)
+{
+    return 0;
+}
 #define xfs_qm_newmount(mp, a, b) (0)
 #define xfs_qm_mount_quotas(mp)
 #define xfs_qm_unmount(mp)
-#define xfs_qm_unmount_quotas(mp) (0)
+#define xfs_qm_unmount_quotas(mp)
 #endif /* CONFIG_XFS_QUOTA */

 #define xfs_trans_unreserve_quota_nblks(tp, ip, nblks, ninos, flags) \
--
cgit v1.2.3
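
The warning the static inline conversion silences comes from statement
contexts. A hedged illustration — the two alternative stub definitions
below are fragments shown side by side, not meant to compile in one
file:

    /* old style: the stub is a macro that expands to a bare value */
    #define xfs_qm_sync(mp, fl)    (0)

    void example(struct xfs_mount *mp)
    {
        xfs_qm_sync(mp, 0);    /* expands to "(0);" - gcc 4.5 warns
                                * "statement with no effect" */
    }

    /* new style: a static inline evaluates to a real call, keeps full
     * type checking of the arguments, and still compiles to nothing */
    static inline int xfs_qm_sync(struct xfs_mount *mp, int flags)
    {
        return 0;
    }
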
From c6f6cd0608b1826ee1797cf57a808416e4bdb806 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Sat, 6 Nov 2010 11:43:08 +0000
Subject: xfs: use hlist_add_fake

XFS does not need its inodes to actually be hashed in the VFS inode
cache, but we require the inode to be marked hashed for the writeback
code to work. Instead of using insert_inode_hash, which requires a
second inode_lock roundtrip after the partial merge of the inode
scalability patches in 2.6.37-rc, simply use the new hlist_add_fake
helper to mark the inode hashed without requiring a lock or touching a
global cache line.

Signed-off-by: Christoph Hellwig
Signed-off-by: Alex Elder
---
 fs/xfs/linux-2.6/xfs_iops.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 96107efc0c6..94d5fd6a297 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -762,7 +762,8 @@ xfs_setup_inode(
     inode->i_state = I_NEW;

     inode_sb_list_add(inode);
-    insert_inode_hash(inode);
+    /* make the inode look hashed for the writeback code */
+    hlist_add_fake(&inode->i_hash);

     inode->i_mode = ip->i_d.di_mode;
     inode->i_nlink = ip->i_d.di_nlink;
--
cgit v1.2.3
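
hlist_add_fake() is a one-liner in include/linux/list.h (as of 2.6.37)
that makes a node look hashed without putting it on any list, by
pointing pprev back at the node itself:

    static inline void hlist_add_fake(struct hlist_node *n)
    {
        n->pprev = &n->next;
    }

    /*
     * hlist_unhashed() only tests pprev, so a fake-added node reports
     * itself as hashed even though no hash bucket - and therefore no
     * shared lock or cache line - was touched:
     */
    static inline int hlist_unhashed(const struct hlist_node *h)
    {
        return !h->pprev;
    }

That hashed-or-not test is all the writeback code checks, which is why
the trick is sufficient here.
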
From ece413f59f257682de4a2e2e42af33b016af53f3 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Wed, 10 Nov 2010 21:39:11 +0000
Subject: xfs: remove incorrect assert in xfs_vm_writepage

In commit 20cb52ebd1b5ca6fa8a5d9b6b1392292f5ca8a45, titled "xfs:
simplify xfs_vm_writepage", I added an assert that any !mapped and
uptodate buffers are not dirty. That assert turns out to trigger a lot
when running fsx on filesystems with small block sizes. The reason is
that the assert is simply incorrect: !mapped and uptodate just mean this
buffer covers a hole, and whenever we do a set_page_dirty we mark all
blocks in the page dirty, no matter if they have data or not. So remove
the assert, and update the comment above the condition to match reality.

Signed-off-by: Christoph Hellwig
Signed-off-by: Alex Elder
---
 fs/xfs/linux-2.6/xfs_aops.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index c9af48fffcd..7d287afccde 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1111,11 +1111,12 @@ xfs_vm_writepage(
             uptodate = 0;

         /*
-         * A hole may still be marked uptodate because discard_buffer
-         * leaves the flag set.
+         * set_page_dirty dirties all buffers in a page, independent
+         * of their state.  The dirty state however is entirely
+         * meaningless for holes (!mapped && uptodate), so skip
+         * buffers covering holes here.
          */
         if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
-            ASSERT(!buffer_dirty(bh));
             imap_valid = 0;
             continue;
         }
--
cgit v1.2.3

From c726de4409a8d3a03877b1ef4342bfe8a15f5e5e Mon Sep 17 00:00:00 2001
From: Dave Chinner
Date: Tue, 30 Nov 2010 15:14:39 +1100
Subject: xfs: fix failed write truncation handling.

Since the move to the new truncate sequence we call xfs_setattr to
truncate down excessively instantiated blocks. As shown by the testcase
in kernel.org BZ #22452, that doesn't work too well. Due to the
confusion between the internal inode size and the VFS inode i_size, it
zeroes data that it shouldn't.

But full-blown truncate seems like overkill here. We only instantiate
delayed allocations in the write path, and given that we never released
the iolock we can't have converted them to real allocations yet either.
The only nasty case is pre-existing preallocation, which we need to
skip.

We already do this for page discard during writeback, so make the
delayed allocation block punching a generic function and call it from
the failed write path as well as xfs_aops_discard_page. The callers are
responsible for ensuring that partial blocks are not truncated away,
and that they hold the ilock.

Based on a fix originally from Christoph Hellwig. This version uses
filesystem blocks as the range unit.

Signed-off-by: Dave Chinner
Reviewed-by: Christoph Hellwig
---
 fs/xfs/linux-2.6/xfs_aops.c | 94 +++++++++++++++++++--------------------------
 fs/xfs/xfs_bmap.c           | 76 ++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_bmap.h           |  5 +++
 3 files changed, 121 insertions(+), 54 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 7d287afccde..691f61223ed 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -934,7 +934,6 @@ xfs_aops_discard_page(
     struct xfs_inode *ip = XFS_I(inode);
     struct buffer_head *bh, *head;
     loff_t offset = page_offset(page);
-    ssize_t len = 1 << inode->i_blkbits;

     if (!xfs_is_delayed_page(page, IO_DELAY))
         goto out_invalidate;
@@ -949,58 +948,14 @@ xfs_aops_discard_page(
     xfs_ilock(ip, XFS_ILOCK_EXCL);
     bh = head = page_buffers(page);
     do {
-        int done;
-        xfs_fileoff_t offset_fsb;
-        xfs_bmbt_irec_t imap;
-        int nimaps = 1;
         int error;
-        xfs_fsblock_t firstblock;
-        xfs_bmap_free_t flist;
+        xfs_fileoff_t start_fsb;

         if (!buffer_delay(bh))
             goto next_buffer;

-        offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
-
-        /*
-         * Map the range first and check that it is a delalloc extent
-         * before trying to unmap the range. Otherwise we will be
-         * trying to remove a real extent (which requires a
-         * transaction) or a hole, which is probably a bad idea...
-         */
-        error = xfs_bmapi(NULL, ip, offset_fsb, 1,
-                XFS_BMAPI_ENTIRE, NULL, 0, &imap,
-                &nimaps, NULL);
-
-        if (error) {
-            /* something screwed, just bail */
-            if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
-                xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
-                "page discard failed delalloc mapping lookup.");
-            }
-            break;
-        }
-        if (!nimaps) {
-            /* nothing there */
-            goto next_buffer;
-        }
-        if (imap.br_startblock != DELAYSTARTBLOCK) {
-            /* been converted, ignore */
-            goto next_buffer;
-        }
-        WARN_ON(imap.br_blockcount == 0);
-
-        /*
-         * Note: while we initialise the firstblock/flist pair, they
-         * should never be used because blocks should never be
-         * allocated or freed for a delalloc extent and hence we need
-         * don't cancel or finish them after the xfs_bunmapi() call.
-         */
-        xfs_bmap_init(&flist, &firstblock);
-        error = xfs_bunmapi(NULL, ip, offset_fsb, 1, 0, 1, &firstblock,
-                    &flist, &done);
-
-        ASSERT(!flist.xbf_count && !flist.xbf_first);
+        start_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
+        error = xfs_bmap_punch_delalloc_range(ip, start_fsb, 1);
         if (error) {
             /* something screwed, just bail */
             if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
@@ -1010,7 +965,7 @@ xfs_aops_discard_page(
             break;
         }
 next_buffer:
-        offset += len;
+        offset += 1 << inode->i_blkbits;

     } while ((bh = bh->b_this_page) != head);

@@ -1505,11 +1460,42 @@ xfs_vm_write_failed(
     struct inode *inode = mapping->host;

     if (to > inode->i_size) {
-        struct iattr ia = {
-            .ia_valid = ATTR_SIZE | ATTR_FORCE,
-            .ia_size = inode->i_size,
-        };
-        xfs_setattr(XFS_I(inode), &ia, XFS_ATTR_NOLOCK);
+        /*
+         * punch out the delalloc blocks we have already allocated. We
+         * don't call xfs_setattr() to do this as we may be in the
+         * middle of a multi-iovec write and so the vfs inode->i_size
+         * will not match the xfs ip->i_size and so it will zero too
+         * much. Hence we just truncate the page cache to zero what is
+         * necessary and punch the delalloc blocks directly.
+         */
+        struct xfs_inode *ip = XFS_I(inode);
+        xfs_fileoff_t start_fsb;
+        xfs_fileoff_t end_fsb;
+        int error;
+
+        truncate_pagecache(inode, to, inode->i_size);
+
+        /*
+         * Check if there are any blocks that are outside of i_size
+         * that need to be trimmed back.
+         */
+        start_fsb = XFS_B_TO_FSB(ip->i_mount, inode->i_size) + 1;
+        end_fsb = XFS_B_TO_FSB(ip->i_mount, to);
+        if (end_fsb <= start_fsb)
+            return;
+
+        xfs_ilock(ip, XFS_ILOCK_EXCL);
+        error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
+                            end_fsb - start_fsb);
+        if (error) {
+            /* something screwed, just bail */
+            if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+                xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
+                    "xfs_vm_write_failed: unable to clean up ino %lld",
+                    ip->i_ino);
+            }
+        }
+        xfs_iunlock(ip, XFS_ILOCK_EXCL);
     }
 }
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 8abd12e32e1..08b179fa9e8 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -6070,3 +6070,79 @@ xfs_bmap_disk_count_leaves(
         *count += xfs_bmbt_disk_get_blockcount(frp);
     }
 }
+
+/*
+ * dead simple method of punching delayed allocation blocks from a range in
+ * the inode. Walks a block at a time so will be slow, but is only executed
+ * in rare error cases so the overhead is not critical. This will always
+ * punch out both the start and end blocks, even if the ranges only
+ * partially overlap them, so it is up to the caller to ensure that partial
+ * blocks are not passed in.
+ */
+int
+xfs_bmap_punch_delalloc_range(
+    struct xfs_inode    *ip,
+    xfs_fileoff_t       start_fsb,
+    xfs_fileoff_t       length)
+{
+    xfs_fileoff_t       remaining = length;
+    int                 error = 0;
+
+    ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+    do {
+        int             done;
+        xfs_bmbt_irec_t imap;
+        int             nimaps = 1;
+        xfs_fsblock_t   firstblock;
+        xfs_bmap_free_t flist;
+
+        /*
+         * Map the range first and check that it is a delalloc extent
+         * before trying to unmap the range. Otherwise we will be
+         * trying to remove a real extent (which requires a
+         * transaction) or a hole, which is probably a bad idea...
+         */
+        error = xfs_bmapi(NULL, ip, start_fsb, 1,
+                XFS_BMAPI_ENTIRE, NULL, 0, &imap,
+                &nimaps, NULL);
+
+        if (error) {
+            /* something screwed, just bail */
+            if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+                xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
+            "Failed delalloc mapping lookup ino %lld fsb %lld.",
+                        ip->i_ino, start_fsb);
+            }
+            break;
+        }
+        if (!nimaps) {
+            /* nothing there */
+            goto next_block;
+        }
+        if (imap.br_startblock != DELAYSTARTBLOCK) {
+            /* been converted, ignore */
+            goto next_block;
+        }
+        WARN_ON(imap.br_blockcount == 0);
+
+        /*
+         * Note: while we initialise the firstblock/flist pair, they
+         * should never be used because blocks should never be
+         * allocated or freed for a delalloc extent and hence we don't
+         * need to cancel or finish them after the xfs_bunmapi() call.
+         */
+        xfs_bmap_init(&flist, &firstblock);
+        error = xfs_bunmapi(NULL, ip, start_fsb, 1, 0, 1, &firstblock,
+                    &flist, &done);
+        if (error)
+            break;
+
+        ASSERT(!flist.xbf_count && !flist.xbf_first);
+next_block:
+        start_fsb++;
+        remaining--;
+    } while (remaining > 0);
+
+    return error;
+}
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 71ec9b6ecdf..3651191daea 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -394,6 +394,11 @@ xfs_bmap_count_blocks(
     int         whichfork,
     int         *count);

+int
+xfs_bmap_punch_delalloc_range(
+    struct xfs_inode    *ip,
+    xfs_fileoff_t       start_fsb,
+    xfs_fileoff_t       length);
 #endif /* __KERNEL__ */

 #endif /* __XFS_BMAP_H__ */
--
cgit v1.2.3
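
The two byte-to-block conversion macros used above behave differently,
and the fix depends on that. Roughly (simplified, not the real macro
definitions):

    /* XFS_B_TO_FSBT(mp, b): truncating - the block that contains byte b */
    start_block = b >> mp->m_sb.sb_blocklog;

    /* XFS_B_TO_FSB(mp, b): rounding up - blocks needed to cover b bytes */
    blocks = (b + mp->m_blockmask) >> mp->m_sb.sb_blocklog;

    /*
     * xfs_bmap_punch_delalloc_range() always punches whole blocks, so
     * xfs_vm_write_failed() rounds i_size up and then adds one more
     * block, guaranteeing it never punches a block that may still hold
     * data at or before i_size; only clearly-beyond-EOF blocks go:
     */
    start_fsb = XFS_B_TO_FSB(ip->i_mount, inode->i_size) + 1;
    end_fsb = XFS_B_TO_FSB(ip->i_mount, to);
    if (end_fsb > start_fsb)
        error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
                            end_fsb - start_fsb);
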
From 90810b9e82a36c3c57c1aeb8b2918b242a130b26 Mon Sep 17 00:00:00 2001
From: Dave Chinner
Date: Tue, 30 Nov 2010 15:16:16 +1100
Subject: xfs: push stale, pinned buffers on trylock failures

As reported by Nick Piggin, XFS is suffering from long pauses under
highly concurrent workloads when hosted on ramdisks.
The problem is that an inode buffer is stuck in the pinned state in
memory and as a result either the inode buffer or one of the inodes
within the buffer is stopping the tail of the log from being moved
forward. The system remains in this state until a periodic log force
issued by xfssyncd causes the buffer to be unpinned. The main problem is
that these are stale buffers, and are hence held locked until the
transaction/checkpoint that marked them stale has been committed to
disk. When the filesystem gets into this state, only the xfssyncd can
cause the async transactions to be committed to disk and hence unpin the
inode buffer.

This problem was encountered when scaling the busy extent list, but only
the blocking lock interface was fixed to solve the problem. Extend the
same fix to the buffer trylock operations - if we fail to lock a pinned,
stale buffer, then force the log immediately so that when the next
attempt to lock it comes around, it will have been unpinned.

Signed-off-by: Dave Chinner
Reviewed-by: Christoph Hellwig
---
 fs/xfs/linux-2.6/xfs_buf.c | 35 ++++++++++++++++-------------------
 1 file changed, 16 insertions(+), 19 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index aa1d353def2..4c5deb6e9e3 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -488,29 +488,16 @@ found:
     spin_unlock(&pag->pag_buf_lock);
     xfs_perag_put(pag);

-    /* Attempt to get the semaphore without sleeping,
-     * if this does not work then we need to drop the
-     * spinlock and do a hard attempt on the semaphore.
-     */
-    if (down_trylock(&bp->b_sema)) {
+    if (xfs_buf_cond_lock(bp)) {
+        /* failed, so wait for the lock if requested. */
         if (!(flags & XBF_TRYLOCK)) {
-            /* wait for buffer ownership */
             xfs_buf_lock(bp);
             XFS_STATS_INC(xb_get_locked_waited);
         } else {
-            /* We asked for a trylock and failed, no need
-             * to look at file offset and length here, we
-             * know that this buffer at least overlaps our
-             * buffer and is locked, therefore our buffer
-             * either does not exist, or is this buffer.
-             */
             xfs_buf_rele(bp);
             XFS_STATS_INC(xb_busy_locked);
             return NULL;
         }
-    } else {
-        /* trylock worked */
-        XB_SET_OWNER(bp);
     }

     if (bp->b_flags & XBF_STALE) {
@@ -876,10 +863,18 @@ xfs_buf_rele(
  */

 /*
- *  Locks a buffer object, if it is not already locked.
- *  Note that this in no way locks the underlying pages, so it is only
- *  useful for synchronizing concurrent use of buffer objects, not for
- *  synchronizing independent access to the underlying pages.
+ *  Locks a buffer object, if it is not already locked.  Note that this in
+ *  no way locks the underlying pages, so it is only useful for
+ *  synchronizing concurrent use of buffer objects, not for synchronizing
+ *  independent access to the underlying pages.
+ *
+ *  If we come across a stale, pinned, locked buffer, we know that we are
+ *  being asked to lock a buffer that has been reallocated. Because it is
+ *  pinned, we know that the log has not been pushed to disk and hence it
+ *  will still be locked.  Rather than continuing to have trylock attempts
+ *  fail until someone else pushes the log, push it ourselves before
+ *  returning.  This means that the xfsaild will not get stuck trying
+ *  to push on stale inode buffers.
 */
 int
 xfs_buf_cond_lock(
     xfs_buf_t       *bp)
 {
     int             locked;

     locked = down_trylock(&bp->b_sema) == 0;
     if (locked)
         XB_SET_OWNER(bp);
+    else if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
+        xfs_log_force(bp->b_target->bt_mount, 0);
     trace_xfs_buf_cond_lock(bp, _RET_IP_);
     return locked ? 0 : -EBUSY;
--
cgit v1.2.3
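
The user-visible contract of the reworked helper: xfs_buf_cond_lock()
returns 0 on success and -EBUSY when the buffer is already locked, and
it now also kicks the log for pinned stale buffers. A hedged sketch of a
retrying caller (the back-off policy is illustrative, not XFS code):

    if (xfs_buf_cond_lock(bp)) {
        /*
         * Lock unavailable. If the buffer was pinned and stale, the
         * log force has already been issued inside
         * xfs_buf_cond_lock(), so simply backing off and retrying on
         * the next pass is enough - by then the checkpoint should
         * have committed and unpinned the buffer.
         */
        return -EAGAIN;
    }

    /* ... locked: safe to inspect and queue the buffer ... */
    xfs_buf_unlock(bp);
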
From 309c848002052edbec650075a1eb098b17c17f35 Mon Sep 17 00:00:00 2001
From: Dave Chinner
Date: Tue, 30 Nov 2010 15:16:02 +1100
Subject: xfs: delayed alloc blocks beyond EOF are valid after writeback

There is an assumption in parts of XFS that flushing a dirty file will
make all the delayed allocation blocks disappear from an inode. That is,
that after calling xfs_flush_pages() then ip->i_delayed_blks will be
zero.

This is an invalid assumption as we may have speculative preallocation
beyond EOF, and those blocks are recorded in ip->i_delayed_blks. A flush
of the dirty pages of an inode will not change the state of these blocks
beyond EOF, so a non-zero delalloc block count after a flush is valid.

The bmap code has an invalid ASSERT() that needs to be removed, and the
swapext code has a bug in that while it swaps the data forks around, it
fails to swap the i_delayed_blks counter associated with the fork and
hence can get the block accounting wrong.

Signed-off-by: Dave Chinner
Reviewed-by: Christoph Hellwig
---
 fs/xfs/xfs_bmap.c  |  9 +++++++--
 fs/xfs/xfs_dfrag.c | 13 +++++++++++++
 2 files changed, 20 insertions(+), 2 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 08b179fa9e8..4111cd3966c 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -5471,8 +5471,13 @@ xfs_getbmap(
             if (error)
                 goto out_unlock_iolock;
         }
-
-        ASSERT(ip->i_delayed_blks == 0);
+        /*
+         * even after flushing the inode, there can still be delalloc
+         * blocks on the inode beyond EOF due to speculative
+         * preallocation. These are not removed until the release
+         * function is called or the inode is inactivated. Hence we
+         * cannot assert here that ip->i_delayed_blks == 0.
+         */
     }

     lock = xfs_ilock_map_shared(ip);
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index 3b9582c60a2..e60490bc00a 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -377,6 +377,19 @@ xfs_swap_extents(
     ip->i_d.di_format = tip->i_d.di_format;
     tip->i_d.di_format = tmp;

+    /*
+     * The extents in the source inode could still contain speculative
+     * preallocation beyond EOF (e.g. the file is open but not modified
+     * while defrag is in progress). In that case, we need to copy over
+     * the number of delalloc blocks the data fork in the source inode is
+     * tracking beyond EOF so that when the fork is truncated away when
+     * the temporary inode is unlinked we don't underrun the
+     * i_delayed_blks counter on that inode.
+     */
+    ASSERT(tip->i_delayed_blks == 0);
+    tip->i_delayed_blks = ip->i_delayed_blks;
+    ip->i_delayed_blks = 0;
+
     ilf_fields = XFS_ILOG_CORE;

     switch(ip->i_d.di_format) {
--
cgit v1.2.3

From de25c1818c44f580ff556cb9e0f7a1c687ed870b Mon Sep 17 00:00:00 2001
From: Dave Chinner
Date: Tue, 30 Nov 2010 15:15:46 +1100
Subject: xfs: avoid moving stale inodes in the AIL

When an inode has been marked stale because the cluster is being freed,
we don't want to (re-)insert this inode into the AIL. There is a race
condition where the cluster buffer may be unpinned before the inode is
inserted into the AIL during transaction committed processing.
If the buffer is unpinned before the inode item has been committed and
inserted, then it is possible for the buffer to be released and hence
process the stale inode callbacks before the inode is inserted into the
AIL. In this case, we then insert a clean, stale inode into the AIL
which will never get removed by an IO completion. It will, however, get
reclaimed, and that triggers an assert in xfs_inode_free() complaining
about freeing an inode still in the AIL.

This race can be avoided by not moving stale inodes forward in the AIL
during transaction commit completion processing. This closes the race
condition by ensuring we never insert clean stale inodes into the AIL.
It is safe to do this because a dirty stale inode, by definition, must
already be in the AIL.

Signed-off-by: Dave Chinner
Reviewed-by: Christoph Hellwig
---
 fs/xfs/xfs_inode_item.c | 31 +++++++++++++++++++++++++------
 1 file changed, 25 insertions(+), 6 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index c7ac020705d..7c8d30c453c 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -657,18 +657,37 @@ xfs_inode_item_unlock(
 }

 /*
- * This is called to find out where the oldest active copy of the
- * inode log item in the on disk log resides now that the last log
- * write of it completed at the given lsn. Since we always re-log
- * all dirty data in an inode, the latest copy in the on disk log
- * is the only one that matters. Therefore, simply return the
- * given lsn.
+ * This is called to find out where the oldest active copy of the inode log
+ * item in the on disk log resides now that the last log write of it completed
+ * at the given lsn. Since we always re-log all dirty data in an inode, the
+ * latest copy in the on disk log is the only one that matters. Therefore,
+ * simply return the given lsn.
+ *
+ * If the inode has been marked stale because the cluster is being freed, we
+ * don't want to (re-)insert this inode into the AIL. There is a race condition
+ * where the cluster buffer may be unpinned before the inode is inserted into
+ * the AIL during transaction committed processing. If the buffer is unpinned
+ * before the inode item has been committed and inserted, then it is possible
+ * for the buffer to be written and IO completions to run before the inode is
+ * inserted into the AIL. In that case, we'd be inserting a clean, stale inode
+ * into the AIL which will never get removed. It will, however, get reclaimed,
+ * which triggers an assert in xfs_inode_free() complaining about freeing an
+ * inode still in the AIL.
+ *
+ * To avoid this, return a lower LSN than the one passed in so that the
+ * transaction committed code will not move the inode forward in the AIL but
+ * will still unpin it properly.
  */
 STATIC xfs_lsn_t
 xfs_inode_item_committed(
     struct xfs_log_item *lip,
     xfs_lsn_t           lsn)
 {
+    struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+    struct xfs_inode    *ip = iip->ili_inode;
+
+    if (xfs_iflags_test(ip, XFS_ISTALE))
+        return lsn - 1;
     return lsn;
 }
--
cgit v1.2.3

From c76febef574fd86566bbdf1a73a547a439115c25 Mon Sep 17 00:00:00 2001
From: Dave Chinner
Date: Tue, 30 Nov 2010 15:15:31 +1100
Subject: xfs: only run xfs_error_test if error injection is active

Recent tests writing lots of small files showed the flusher thread being
CPU bound and taking a long time to do allocations on a debug kernel.
perf showed this as the prime reason:

     samples  pcnt function                    DSO
     _______ _____ ___________________________ _________________

   224648.00 36.8% xfs_error_test              [kernel.kallsyms]
    86045.00 14.1% xfs_btree_check_sblock      [kernel.kallsyms]
    39778.00  6.5% prandom32                   [kernel.kallsyms]
    37436.00  6.1% xfs_btree_increment         [kernel.kallsyms]
    29278.00  4.8% xfs_btree_get_rec           [kernel.kallsyms]
    27717.00  4.5% random32                    [kernel.kallsyms]

Walking the btree blocks during allocation and checking them requires
each block (a cache hit, so no I/O) to call xfs_error_test(), which then
does a random32() call as the first operation. IOWs, ~50% of the CPU is
being consumed just testing whether we need to inject an error, even
though error injection is not active.

Kill this overhead when error injection is not active by adding a global
counter of active error traps and only calling into xfs_error_test when
fault injection is active.

Signed-off-by: Dave Chinner
Reviewed-by: Christoph Hellwig
---
 fs/xfs/xfs_error.c | 3 +++
 fs/xfs/xfs_error.h | 5 +++--
 2 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index ed999026766..c78cc6a3d87 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -58,6 +58,7 @@ xfs_error_trap(int e)
 int     xfs_etest[XFS_NUM_INJECT_ERROR];
 int64_t xfs_etest_fsid[XFS_NUM_INJECT_ERROR];
 char *  xfs_etest_fsname[XFS_NUM_INJECT_ERROR];
+int     xfs_error_test_active;

 int
 xfs_error_test(int error_tag, int *fsidp, char *expression,
@@ -108,6 +109,7 @@ xfs_errortag_add(int error_tag, xfs_mount_t *mp)
             len = strlen(mp->m_fsname);
             xfs_etest_fsname[i] = kmem_alloc(len + 1, KM_SLEEP);
             strcpy(xfs_etest_fsname[i], mp->m_fsname);
+            xfs_error_test_active++;
             return 0;
         }
     }
@@ -137,6 +139,7 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud)
             xfs_etest_fsid[i] = 0LL;
             kmem_free(xfs_etest_fsname[i]);
             xfs_etest_fsname[i] = NULL;
+            xfs_error_test_active--;
         }
     }
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index c2c1a072bb8..f338847f80b 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -127,13 +127,14 @@ extern void xfs_corruption_error(const char *tag, int level,
 #define XFS_RANDOM_BMAPIFORMAT      XFS_RANDOM_DEFAULT

 #ifdef DEBUG
+extern int xfs_error_test_active;
 extern int xfs_error_test(int, int *, char *, int, char *, unsigned long);

 #define XFS_NUM_INJECT_ERROR    10
 #define XFS_TEST_ERROR(expr, mp, tag, rf)       \
-    ((expr) || \
+    ((expr) || (xfs_error_test_active && \
      xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \
-     (rf)))
+     (rf))))

 extern int xfs_errortag_add(int error_tag, xfs_mount_t *mp);
 extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud);
--
cgit v1.2.3
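
A typical call site for the macro, adapted from the inode flush
verification code (the shape is representative; exact checks and the
error message vary per call site). With no traps armed,
xfs_error_test_active is 0 and the whole right-hand side - including the
random32() call inside xfs_error_test() - is now skipped:

    if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC),
               mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) {
        /* taken on a real corruption or an injected error */
        return XFS_ERROR(EFSCORRUPTED);
    }
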
From 05340d4ab2ec2b6b4962c1c41c6ea8fb550f947b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Tue, 7 Dec 2010 10:16:41 +0000
Subject: xfs: log timestamp changes to the source inode in rename

Now that we don't mark VFS inodes dirty anymore for internal timestamp
changes, but rely on the transaction subsystem to push them out, we need
to explicitly log the source inode in rename after updating its
timestamps to make sure the changes actually get forced out by
sync/fsync or an AIL push. We already account for the fourth inode in
the log reservation, as a rename of directories needs to update the
nlink field, so just adding the xfs_trans_log_inode call is enough.

This fixes the xfsqa 065 regression introduced by:

  "xfs: don't use vfs writeback for pure metadata modifications"

Signed-off-by: Christoph Hellwig
Reviewed-by: Dave Chinner
Signed-off-by: Alex Elder
---
 fs/xfs/xfs_rename.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs/xfs')

diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index d2af0a8381a..77a59891734 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -297,6 +297,7 @@ xfs_rename(
      * it and some incremental backup programs won't work without it.
      */
     xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG);
+    xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE);

     /*
      * Adjust the link count on src_dp. This is necessary when
--
cgit v1.2.3
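
The underlying rule this change follows, sketched under the assumption
of a transaction tp to which the inode has already been joined: once
timestamps are updated through the transaction subsystem rather than the
VFS dirty tracking, every xfs_trans_ichgtime() must be paired with an
explicit xfs_trans_log_inode(), or the change is never pushed to disk:

    /* bump the timestamps, then log the inode core so the change
     * is captured by the transaction and forced out via the AIL */
    xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
    xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

    error = xfs_trans_commit(tp, 0);
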