ceph: set i_head_snapc when getting CEPH_CAP_FILE_WR reference
In most cases that snap context is needed, we are holding
reference of CEPH_CAP_FILE_WR. So we can set ceph inode's
i_head_snapc when getting the CEPH_CAP_FILE_WR reference,
and make codes get snap context from i_head_snapc. This makes
the code simpler.
Another benefit of this change is that we can handle snap
notification more elegantly. Especially when snap context
is updated while someone else is doing write. The old queue
cap_snap code may set cap_snap's context to ether the old
context or the new snap context, depending on if i_head_snapc
is set. The new queue capp_snap code always set cap_snap's
context to the old snap context.
Signed-off-by: Yan, Zheng <zyan@redhat.com>
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 7c8e93a..feb8ec9 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2073,7 +2073,8 @@
*
* Protected by i_ceph_lock.
*/
-static void __take_cap_refs(struct ceph_inode_info *ci, int got)
+static void __take_cap_refs(struct ceph_inode_info *ci, int got,
+ bool snap_rwsem_locked)
{
if (got & CEPH_CAP_PIN)
ci->i_pin_ref++;
@@ -2081,8 +2082,14 @@
ci->i_rd_ref++;
if (got & CEPH_CAP_FILE_CACHE)
ci->i_rdcache_ref++;
- if (got & CEPH_CAP_FILE_WR)
+ if (got & CEPH_CAP_FILE_WR) {
+ if (ci->i_wr_ref == 0 && !ci->i_head_snapc) {
+ BUG_ON(!snap_rwsem_locked);
+ ci->i_head_snapc = ceph_get_snap_context(
+ ci->i_snap_realm->cached_context);
+ }
ci->i_wr_ref++;
+ }
if (got & CEPH_CAP_FILE_BUFFER) {
if (ci->i_wb_ref == 0)
ihold(&ci->vfs_inode);
@@ -2100,16 +2107,19 @@
* requested from the MDS.
*/
static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
- loff_t endoff, int *got, int *check_max, int *err)
+ loff_t endoff, bool nonblock, int *got, int *err)
{
struct inode *inode = &ci->vfs_inode;
+ struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
int ret = 0;
int have, implemented;
int file_wanted;
+ bool snap_rwsem_locked = false;
dout("get_cap_refs %p need %s want %s\n", inode,
ceph_cap_string(need), ceph_cap_string(want));
+again:
spin_lock(&ci->i_ceph_lock);
/* make sure file is actually open */
@@ -2125,6 +2135,10 @@
/* finish pending truncate */
while (ci->i_truncate_pending) {
spin_unlock(&ci->i_ceph_lock);
+ if (snap_rwsem_locked) {
+ up_read(&mdsc->snap_rwsem);
+ snap_rwsem_locked = false;
+ }
__ceph_do_pending_vmtruncate(inode);
spin_lock(&ci->i_ceph_lock);
}
@@ -2136,7 +2150,7 @@
dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
inode, endoff, ci->i_max_size);
if (endoff > ci->i_requested_max_size) {
- *check_max = 1;
+ *err = -EAGAIN;
ret = 1;
}
goto out_unlock;
@@ -2164,8 +2178,29 @@
inode, ceph_cap_string(have), ceph_cap_string(not),
ceph_cap_string(revoking));
if ((revoking & not) == 0) {
+ if (!snap_rwsem_locked &&
+ !ci->i_head_snapc &&
+ (need & CEPH_CAP_FILE_WR)) {
+ if (!down_read_trylock(&mdsc->snap_rwsem)) {
+ /*
+ * we can not call down_read() when
+ * task isn't in TASK_RUNNING state
+ */
+ if (nonblock) {
+ *err = -EAGAIN;
+ ret = 1;
+ goto out_unlock;
+ }
+
+ spin_unlock(&ci->i_ceph_lock);
+ down_read(&mdsc->snap_rwsem);
+ snap_rwsem_locked = true;
+ goto again;
+ }
+ snap_rwsem_locked = true;
+ }
*got = need | (have & want);
- __take_cap_refs(ci, *got);
+ __take_cap_refs(ci, *got, true);
ret = 1;
}
} else {
@@ -2189,6 +2224,8 @@
}
out_unlock:
spin_unlock(&ci->i_ceph_lock);
+ if (snap_rwsem_locked)
+ up_read(&mdsc->snap_rwsem);
dout("get_cap_refs %p ret %d got %s\n", inode,
ret, ceph_cap_string(*got));
@@ -2231,54 +2268,70 @@
int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
loff_t endoff, int *got, struct page **pinned_page)
{
- int _got, check_max, ret, err = 0;
+ int _got, ret, err = 0;
ret = ceph_pool_perm_check(ci, need);
if (ret < 0)
return ret;
-retry:
- if (endoff > 0)
- check_max_size(&ci->vfs_inode, endoff);
- _got = 0;
- check_max = 0;
- ret = wait_event_interruptible(ci->i_cap_wq,
- try_get_cap_refs(ci, need, want, endoff,
- &_got, &check_max, &err));
- if (err)
- ret = err;
- if (ret < 0)
- return ret;
+ while (true) {
+ if (endoff > 0)
+ check_max_size(&ci->vfs_inode, endoff);
- if (check_max)
- goto retry;
-
- if (ci->i_inline_version != CEPH_INLINE_NONE &&
- (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
- i_size_read(&ci->vfs_inode) > 0) {
- struct page *page = find_get_page(ci->vfs_inode.i_mapping, 0);
- if (page) {
- if (PageUptodate(page)) {
- *pinned_page = page;
- goto out;
- }
- page_cache_release(page);
- }
- /*
- * drop cap refs first because getattr while holding
- * caps refs can cause deadlock.
- */
- ceph_put_cap_refs(ci, _got);
+ err = 0;
_got = 0;
+ ret = try_get_cap_refs(ci, need, want, endoff,
+ false, &_got, &err);
+ if (ret) {
+ if (err == -EAGAIN)
+ continue;
+ if (err < 0)
+ return err;
+ } else {
+ ret = wait_event_interruptible(ci->i_cap_wq,
+ try_get_cap_refs(ci, need, want, endoff,
+ true, &_got, &err));
+ if (err == -EAGAIN)
+ continue;
+ if (err < 0)
+ ret = err;
+ if (ret < 0)
+ return ret;
+ }
- /* getattr request will bring inline data into page cache */
- ret = __ceph_do_getattr(&ci->vfs_inode, NULL,
- CEPH_STAT_CAP_INLINE_DATA, true);
- if (ret < 0)
- return ret;
- goto retry;
+ if (ci->i_inline_version != CEPH_INLINE_NONE &&
+ (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
+ i_size_read(&ci->vfs_inode) > 0) {
+ struct page *page =
+ find_get_page(ci->vfs_inode.i_mapping, 0);
+ if (page) {
+ if (PageUptodate(page)) {
+ *pinned_page = page;
+ break;
+ }
+ page_cache_release(page);
+ }
+ /*
+ * drop cap refs first because getattr while
+ * holding * caps refs can cause deadlock.
+ */
+ ceph_put_cap_refs(ci, _got);
+ _got = 0;
+
+ /*
+ * getattr request will bring inline data into
+ * page cache
+ */
+ ret = __ceph_do_getattr(&ci->vfs_inode, NULL,
+ CEPH_STAT_CAP_INLINE_DATA,
+ true);
+ if (ret < 0)
+ return ret;
+ continue;
+ }
+ break;
}
-out:
+
*got = _got;
return 0;
}
@@ -2290,7 +2343,7 @@
void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
{
spin_lock(&ci->i_ceph_lock);
- __take_cap_refs(ci, caps);
+ __take_cap_refs(ci, caps, false);
spin_unlock(&ci->i_ceph_lock);
}
@@ -2341,6 +2394,13 @@
wake = 1;
}
}
+ if (ci->i_wrbuffer_ref_head == 0 &&
+ ci->i_dirty_caps == 0 &&
+ ci->i_flushing_caps == 0) {
+ BUG_ON(!ci->i_head_snapc);
+ ceph_put_snap_context(ci->i_head_snapc);
+ ci->i_head_snapc = NULL;
+ }
/* see comment in __ceph_remove_cap() */
if (!__ceph_is_any_caps(ci) && ci->i_snap_realm)
drop_inode_snap_realm(ci);
@@ -2384,7 +2444,9 @@
if (ci->i_head_snapc == snapc) {
ci->i_wrbuffer_ref_head -= nr;
if (ci->i_wrbuffer_ref_head == 0 &&
- ci->i_dirty_caps == 0 && ci->i_flushing_caps == 0) {
+ ci->i_wr_ref == 0 &&
+ ci->i_dirty_caps == 0 &&
+ ci->i_flushing_caps == 0) {
BUG_ON(!ci->i_head_snapc);
ceph_put_snap_context(ci->i_head_snapc);
ci->i_head_snapc = NULL;
@@ -2775,7 +2837,8 @@
dout(" inode %p now clean\n", inode);
BUG_ON(!list_empty(&ci->i_dirty_item));
drop = 1;
- if (ci->i_wrbuffer_ref_head == 0) {
+ if (ci->i_wr_ref == 0 &&
+ ci->i_wrbuffer_ref_head == 0) {
BUG_ON(!ci->i_head_snapc);
ceph_put_snap_context(ci->i_head_snapc);
ci->i_head_snapc = NULL;